
rewrite: twitter and mastodon

pull/15/head
JayVii 7 months ago
parent commit 6d6bf322f6
1 changed file with 161 additions and 253 deletions
collecto.R  (+161, -253)

@@ -5,301 +5,209 @@
# SPDX-License-Identifier: GPL-3.0
################################################################################

### Loading Packages {{{ ----
#### Twitter
## Loading Packages ----

# Twitter
if(!require("rtweet")){ install.packages("rtweet"); library("rtweet") }
# had to install "httr" via the system package manager

#### Fediverse (eg: mastodon)
# Fediverse (eg: mastodon)
if(!require("curl")){ install.packages("curl"); library("curl") }
if(!require("rjson")){ install.packages("rjson"); library("rjson") }

### Reddit
# Reddit
if(!require("RedditExtractoR")){
install.packages("RedditExtractoR")
library("RedditExtractoR")
}
# }}}

## Twitter Collector {{{ ----
## Helper Functions ----
list2vec <- function(x){
sapply(X = x, FUN = function(y) paste(unlist(y), collapse = ","))
}
valifexst <- function(x) ifelse(test = length(x) > 0, yes = x, no = NA)
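
## Illustrative examples (not part of the original script): what the helpers do.
## list2vec() collapses each list element into one comma-separated string,
## valifexst() substitutes NA for fields missing from an API response.
list2vec(list(c("a", "b"), "c"))  # -> c("a,b", "c")
valifexst(character(0))           # -> NA
valifexst("de")                   # -> "de"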

## Twitter Collector ----

# Twitter Auth.

## Manual input (uncomment if needed)
#tw_cred <- data.frame(
# consumer_key = readline("[Twitter] Enter your consumer API key."),
# consumer_private = readline("[Twitter] Enter your consumer API secret."))

### Authenticate to Twitter
#### Manual input (uncomment if needed)
#twitter_consumerkey <- readline("[Twitter] Enter your consumer API key.")
#twitter_consumerpri <- readline("[Twitter] Enter your consumer API secret.")
#twitter_tokenaccess <- readline("[Twitter] Enter your Access Token.")
#twitter_tokensecret <- readline("[Twitter] Enter your Token Secret.")
#### Saved credentials
twitter_api_cred <- read.table(file = "./twitter_api.txt", header = TRUE, sep = ";")
twitter_consumerkey <- as.character(twitter_api_cred$consumer_key)
twitter_consumerpri <- as.character(twitter_api_cred$consumer_private)
twitter_appname <- as.character(twitter_api_cred$appname)
## Saved credentials
tw_cred <- read.table(file = "./twitter_api.txt", header = TRUE, sep = ";",
colClasses = "character")

twitter_token <- create_token(app = twitter_appname,
consumer_key = twitter_consumerkey,
consumer_secret = twitter_consumerpri)
## Create Twitter Token
twitter_token <- create_token(app = tw_cred$appname,
consumer_key = tw_cred$consumer_key,
consumer_secret = tw_cred$consumer_private)
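
## Optional sanity check (illustrative, not in the original script): querying
## the current rate limits confirms that the token authenticates correctly.
# rate_limit(token = twitter_token)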

# Note -------------------------------------------------------------------------
# Please refer to the documentation on how to obtain your API credentials.
# ------------------------------------------------------------------------------
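
## The credential file is read with header = TRUE and sep = ";", so
## twitter_api.txt is assumed to look roughly like this (placeholder values):
##
##   appname;consumer_key;consumer_private
##   my_ilovefs_app;XXXXXXXXXXXXXXXX;YYYYYYYYYYYYYYYY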

### Collecting Tweets
twitter_tw <- search_tweets(q = "#ilovefs",
n = 9999,
include_rts = FALSE)

twitter_number <- length(twitter_tw$text)
text <- twitter_tw$text
user <- twitter_tw$screen_name
clnt <- twitter_tw$source
favs <- twitter_tw$favorite_count
retw <- twitter_tw$retweet_count
lang <- twitter_tw$lang
fdat <- twitter_tw$created_at

link <- vector(mode = "character", length = twitter_number)
murl <- vector(mode = "character", length = twitter_number)
for(i in 1:twitter_number){
link[i] <- twitter_tw$urls_expanded_url[[i]][1]
murl[i] <- twitter_tw$media_expanded_url[[i]][1]
}

### Forming variables for dataframe
time <- sub(pattern = ".* ", x = fdat, replace = "")
time <- gsub(pattern = ":", x = time, replace = "")
date <- sub(pattern = " .*", x = fdat, replace = "")
date <- gsub(pattern = "-", x = date, replace = "")

###
twitter_exclude <- which(as.numeric(date) > 20180216 | as.numeric(date) < 20180210)
date <- date[-twitter_exclude]
time <- time[-twitter_exclude]
fdat <- fdat[-twitter_exclude]
retw <- retw[-twitter_exclude]
favs <- favs[-twitter_exclude]
text <- text[-twitter_exclude]
lang <- lang[-twitter_exclude]
murl <- murl[-twitter_exclude]
link <- link[-twitter_exclude]
clnt <- clnt[-twitter_exclude]
user <- user[-twitter_exclude]

### Creating dataframe
twitter <- data.frame(cbind(date, time, fdat, retw, favs, text, lang, murl, link, clnt, user))

#### Clean-Up
rm(list = c("date", "time", "fdat", "retw", "favs", "text", "link", "murl", "lang", "clnt", "user"))

twitter <- within(data = twitter, expr = {
date <- as.character(date);
time <- as.character(time);
fdat <- as.character(fdat);
retw <- as.character(retw);
favs <- as.character(favs);
text <- as.character(text);
link <- as.character(link);
murl <- as.character(murl);
lang <- as.character(lang);
clnt <- as.character(clnt);
user <- as.character(user);
})
# }}}
## Collecting Tweets
tweets <- search_tweets(q = "#ilovefs",
n = 9999,
include_rts = FALSE)[,
# include only relevant information
c("user_id", "created_at", "text", "source", "favorite_count",
"retweet_count", "hashtags", "urls_expanded_url", "media_expanded_url",
"ext_media_expanded_url", "lang", "location", "status_url", "protected")]

## Some recoding, simplistic(!) anonymization
tweets <- within(data = tweets, expr = {
# replace global user ID by index only unique to this dataset
user <- as.numeric(as.factor(user_id))
rm("user_id")

# extract date and time
time <- sub(pattern = ".*\\s", x = created_at, replace = "")
date <- sub(pattern = "\\s.*", x = created_at, replace = "")
rm("created_at")

# extract "clean" text (without URLs or line breaks)
ctxt <- gsub(pattern = "http.?://.+($|\\s)", x = text, replace = "")
ctxt <- gsub(pattern = "\n", x = ctxt, replace = "")

# Client data
clnt <- as.factor(source)
rm("source")

# Favorites and Retweets
favs <- favorite_count
retw <- retweet_count
rm(list = c("favorite_count", "retweet_count"))

# List Hashtags in single Variable
htag <- sapply(X = hashtags, FUN = function(x){
paste(unlist(x), collapse = ",")
})
rm("hashtags")

# URLs and Media
link <- status_url
urls <- list2vec(urls_expanded_url)
murl <- list2vec(media_expanded_url)
mext <- list2vec(ext_media_expanded_url)
rm(list = c("urls_expanded_url", "media_expanded_url",
"ext_media_expanded_url", "status_url"))

# Location
posi <- location
rm("location")
})
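
## Minimal sketch (illustrative values) of the anonymization idea used above:
## a global ID is replaced by a factor code that is only meaningful inside this
## particular data frame, so the same account keeps the same index here.
as.numeric(as.factor(c("10001", "20002", "10001")))  # -> 1 2 1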

## Exclusion: before 2019-01-01, after 2019-02-17, protected tweets
tweets <- tweets[(as.Date(tweets$date) > as.Date("2019-01-01") &
as.Date(tweets$date) < as.Date("2019-02-17")),]
tweets <- tweets[!tweets$protected,]

## Mastodon Collector {{{ ----

mastodon.fetchdata <- function(data){

  tmp_datetime <- c()
  tmp_lang <- c()
  tmp_inst <- c()
  tmp_link <- c()
  tmp_text <- c()
  tmp_reto <- c()
  tmp_favs <- c()
  tmp_murl <- c()
  tmp_acct <- c()
  for(i in 1:length(data)){

    #### Time and Date of Toot
    if(length(data[[i]]$created_at) > 0){
      tmp_datetime[i] <- data[[i]]$created_at
    } else {
      # insert empty value, if it does not exist
      tmp_datetime[i] <- NA
    }

    #### Language of Toot
    if(length(data[[i]]$language) > 0){
      tmp_lang[i] <- data[[i]]$language
    } else {
      # insert empty value, if it does not exist
      tmp_lang[i] <- NA
    }

    #### Instance of Toot
    if(length(data[[i]]$uri) > 0){
      tmp_inst[i] <- data[[i]]$uri
    } else {
      # insert empty value, if it does not exist
      tmp_inst[i] <- NA
    }

    #### URL of Toot
    if(length(data[[i]]$url) > 0){
      tmp_link[i] <- data[[i]]$url
    } else {
      # insert empty value, if it does not exist
      tmp_link[i] <- NA
    }

    #### Text/Content of Toot
    if(length(data[[i]]$content) > 0){
      tmp_text[i] <- data[[i]]$content
    } else {
      # insert empty value, if it does not exist
      tmp_text[i] <- NA
    }

    #### Number of Retoots
    if(length(data[[i]]$reblogs_count) > 0){
      tmp_reto[i] <- data[[i]]$reblogs_count
    } else {
      # insert empty value, if it does not exist
      tmp_reto[i] <- NA
    }

    #### Number of Favorites
    if(length(data[[i]]$favourites_count) > 0){
      tmp_favs[i] <- data[[i]]$favourites_count
    } else {
      # insert empty value, if it does not exist
      tmp_favs[i] <- NA
    }

    #### Media Attachments of Toot
    if(length(data[[i]]$media_attachments) > 0){
      tmp_murl[i] <- data[[i]]$media_attachments[[1]]$url
    } else {
      # insert empty value, if it does not exist
      tmp_murl[i] <- NA
    }

    #### Account of Tooter
    if(length(data[[i]]$account) > 0){
      tmp_acct[i] <- data[[i]]$account$acct
    } else {
      # insert empty value, if it does not exist
      tmp_acct[i] <- NA
    }

  }
  return(data.frame(cbind(tmp_datetime,
                          tmp_lang,
                          tmp_inst,
                          tmp_text,
                          tmp_link,
                          tmp_reto,
                          tmp_favs,
                          tmp_murl,
                          tmp_acct)))
}

mastodon.extract <- function(data){

  # Within each post
  data <- sapply(X = data, FUN = function(x){

    # time and date
    time <- gsub(x = x$created_at, pattern = ".*T|\\..*", replacement = "")
    date <- sub(x = x$created_at, pattern = "T.*", replacement = "")

    # simple extraction, return NA if value does not exist
    lang <- valifexst(x$language)          # language
    inst <- valifexst(x$uri)               # instance name
    link <- valifexst(x$url)               # post URL
    rebl <- valifexst(x$reblogs_count)     # number of reblogs
    favs <- valifexst(x$favourites_count)  # number of favorites
    acct <- valifexst(x$account$url)       # account url (unique)

    # sanitizing text (removing HTML tags and whitespace)
    text <- gsub(pattern = "<.*?>|\\s{2,}", x = x$content, replacement = "")

    # media URL (multiple possible)
    murl <- valifexst(
      sapply(X = x$media_attachments, FUN = function(y){
        list2vec(y$url)
      })
    )

    # return extracted data only
    return(data.frame(
      rbind(time, date, lang, inst, link, text, rebl, favs, acct, murl)
    ))
  })

  data <- as.data.frame(
    t(matrix(data = unlist(data), nrow = length(data[[1]])))
  )

  return(data)
}
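
## Rough shape of a single parsed post as assumed by the two extractors above
## (field names follow the Mastodon statuses API; values are placeholders):
# list(
#   created_at        = "2019-02-14T12:34:56.000Z",
#   language          = "en",
#   uri               = "https://mastodon.social/users/alice/statuses/1",
#   url               = "https://mastodon.social/@alice/1",
#   reblogs_count     = 2,
#   favourites_count  = 5,
#   content           = "<p>I love Free Software!</p>",
#   account           = list(acct = "alice", url = "https://mastodon.social/@alice"),
#   media_attachments = list(list(url = "https://files.mastodon.social/media.png"))
# )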

datetime <- c()
lang <- c()
inst <- c()
link <- c()
text <- c()
reto <- c()
favs <- c()
murl <- c()
acct <- c()
## Set search parameters
mastodon_instance <- "https://mastodon.social"
mastodon_hashtag <- "ilovefs"
mastodon_url <- paste0(mastodon_instance,
"/api/v1/timelines/tag/",
mastodon_hashtag,
"?limit=40")
for(i in 1:999){
  mastodon_reqres <- curl_fetch_memory(mastodon_url)

  mastodon_rawjson <- rawToChar(mastodon_reqres$content)
  toots <- fromJSON(mastodon_rawjson)

  if(length(toots) > 0){
    tmp_mastodon_df <- mastodon.fetchdata(data = toots)
    datetime <- c(datetime, as.character(tmp_mastodon_df$tmp_datetime))
    lang <- c(lang, as.character(tmp_mastodon_df$tmp_lang))
    inst <- c(inst, as.character(tmp_mastodon_df$tmp_inst))
    link <- c(link, as.character(tmp_mastodon_df$tmp_link))
    text <- c(text, as.character(tmp_mastodon_df$tmp_text))
    reto <- c(reto, as.character(tmp_mastodon_df$tmp_reto))
    favs <- c(favs, as.character(tmp_mastodon_df$tmp_favs))
    murl <- c(murl, as.character(tmp_mastodon_df$tmp_murl))
    acct <- c(acct, as.character(tmp_mastodon_df$tmp_acct))
  } else {
    break
  }

  # Update the URL for the next iteration of the for loop so we can download
  # the next toots.
  mastodon_lheader <- parse_headers(mastodon_reqres$headers)[11]
  mastodon_next <- sub(x = mastodon_lheader, pattern = ".*link: <", replace = "")
  mastodon_url <- sub(x = mastodon_next, pattern = ">; rel=\"next\".*", replace = "")
}

mastodon_iterations <- 999
toots <- c()

## Scrape Mastodon
for(i in 1:mastodon_iterations){

  # Download and extract Posts
  mastodon_reqres <- curl_fetch_memory(mastodon_url)
  mastodon_rawjson <- rawToChar(mastodon_reqres$content)
  raw_toots <- fromJSON(mastodon_rawjson)

  # If Post-Data is present, extract it. Else break the loop
  if(length(raw_toots) > 0){
    tmp_toots <- mastodon.extract(data = raw_toots)
    toots <- rbind(toots, tmp_toots)
  } else {
    break
  }

  # Update the URL for the next iteration of the for loop so we can download
  # the next toots.
  mastodon_lheader <- parse_headers(mastodon_reqres$headers)[11]
  mastodon_next <- sub(x = mastodon_lheader, pattern = ".*link: <",
                       replace = "")
  mastodon_url <- sub(x = mastodon_next, pattern = ">; rel=\"next\".*",
                      replace = "")
}
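
## Pagination sketch (illustrative header value, not fetched from the API):
## the instance answers with a "link" header whose rel="next" URL is extracted
## by the two sub() calls above and used as mastodon_url in the next iteration.
example_lheader <- 'link: <https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40&max_id=101>; rel="next", <https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40&min_id=140>; rel="prev"'
example_next <- sub(x = example_lheader, pattern = ".*link: <", replace = "")
sub(x = example_next, pattern = ">; rel=\"next\".*", replace = "")
## -> "https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40&max_id=101"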

### Time of post
#### date
date <- sub(pattern = "T.*", x = datetime, replacement = "")
date <- gsub(pattern = "-", x = date, replacement = "")
#### time
time <- sub(pattern = ".*T", x = datetime, replacement = "")
time <- sub(pattern = "\\..*", x = time, replacement = "")
time <- gsub(pattern = ":", x = time, replacement = "")
#### full time
fdat <- strptime(x = paste0(date, time), format = "%Y%m%d%H%M%S", tz = "CET")
fdat <- as.character(fdat)

### Removing HTML-Tags from Toots
text <- gsub(pattern = "<.*?>", x = text, replacement = "")
text <- gsub(pattern = " ", x = text, replacement = "")

### Cleaning Instance-String
#### GNUsocial
inst <- sub(pattern = "tag:", x = inst, replacement = "")
inst <- sub(pattern = ",\\d+.*", x = inst, replacement = "")
#### Mastodon
inst <- sub(pattern = "https:\\/\\/", x = inst, replacement = "")
inst <- sub(pattern = "\\/.*", x = inst, replacement = "")

### Only include Toots from this year
mastodon_exclude <- which(as.numeric(date) < 20180210 | as.numeric(date) > 20180216)
date <- date[-mastodon_exclude]
time <- time[-mastodon_exclude]
fdat <- fdat[-mastodon_exclude]
lang <- lang[-mastodon_exclude]
inst <- inst[-mastodon_exclude]
text <- text[-mastodon_exclude]
link <- link[-mastodon_exclude]
reto <- reto[-mastodon_exclude]
favs <- favs[-mastodon_exclude]
murl <- murl[-mastodon_exclude]
acct <- acct[-mastodon_exclude]

### Creating dataframe
mastodon <- data.frame(cbind(date, time, fdat, lang, inst, text, link, reto, favs, murl, acct))

#### Clean-Up
rm(list = c("date", "time", "fdat", "lang", "inst", "text", "link", "favs", "reto", "murl", "datetime", "acct"))

mastodon <- within(data = mastodon, expr = {
date <- as.character(date);
time <- as.character(time);
fdat <- as.character(fdat);
text <- as.character(text);
link <- as.character(link);
murl <- as.character(murl);
})
# }}}
names(toots) <- c("time", "date", "lang", "inst", "link", "text",
"rebl", "favs", "acct", "murl")

## Simple(!) anonymization
toots$acct <- as.numeric(as.factor(toots$acct)) # index unique only to this dataframe
toots$link <- as.numeric(as.factor(toots$link)) # index unique only to this dataframe

## Cleanup
toots <- within(data = toots, expr = {

# Time Variables
time <- as.character(time)
date <- as.character(date)
fdat <- strptime(x = paste(date, time), format = "%Y-%m-%d %H:%M:%S",
tz = "CET")

# Instances
inst <- gsub(pattern = "(tag:)|(,\\d+.*)|(https:\\/\\/)|(\\/.*)",
x = inst, replacement = "")
})
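
## Illustrative input/output for the instance clean-up above: Mastodon posts
## carry an https URI, GNU social posts a "tag:<domain>,<date>:..." URI, and
## both are reduced to the bare instance domain.
gsub(pattern = "(tag:)|(,\\d+.*)|(https:\\/\\/)|(\\/.*)",
     x = c("https://mastodon.social/users/alice/statuses/1",
           "tag:quitter.no,2019-02-14:noticeId=123"),
     replacement = "")
## -> "mastodon.social" "quitter.no"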

## Only include Toots from this year
mst_exclude <- which(as.Date(toots$date) < as.Date("2019-01-01") |
                     as.Date(toots$date) > as.Date("2019-02-17"))
if(length(mst_exclude) > 0){ toots <- toots[-mst_exclude,] }
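
## Illustrative sanity check: all remaining posts should fall into the window
## selected above.
# range(as.Date(toots$date))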

## Reddit Collector {{{ ----

