Collecting, analyzing and presenting data about participation in the #ilovefs day

collecto.R 10KB

################################################################################
# Copyright (c) 2018 Free Software Foundation Europe e.V. <contact@fsfe.org>
# Author 2018 Jan Weymeirsch <janwey@fsfe.org>
# Author 2018 Vincent Lequertier <vincent@fsfe.org>
# SPDX-License-Identifier: GPL-3.0
################################################################################

## Loading Packages ----
# Twitter
if(!require("rtweet")){ install.packages("rtweet"); library("rtweet") }
# note: "httr" may have to be installed via the system package manager
# Fediverse (eg: Mastodon)
if(!require("curl")){ install.packages("curl"); library("curl") }
if(!require("rjson")){ install.packages("rjson"); library("rjson") }
# Reddit
if(!require("RedditExtractoR")){
  install.packages("RedditExtractoR")
  library("RedditExtractoR")
}

## Helper Functions ----
# collapse each list element into a single comma-separated string
list2vec <- function(x){
  sapply(X = x, FUN = function(y) paste(unlist(y), collapse = ","))
}
# return the value itself if it exists, NA otherwise
valifexst <- function(x) ifelse(test = length(x) > 0, yes = x, no = NA)
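# Illustrative behaviour of the helpers (made-up example values):
#   list2vec(list(c("a", "b"), "c"))  returns c("a,b", "c")
#   valifexst(character(0))           returns NA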
## Twitter Collector ----
# Twitter Auth.
## Manual input (uncomment if needed)
#tw_cred <- data.frame(
#  consumer_key = readline("[Twitter] Enter your consumer API key."),
#  consumer_private = readline("[Twitter] Enter your consumer API secret."))
## Saved credentials
tw_cred <- read.table(file = "./twitter_api.txt", header = TRUE, sep = ";",
                      colClasses = "character")

## Create Twitter Token
twitter_token <- create_token(app = tw_cred$appname,
                              consumer_key = tw_cred$consumer_key,
                              consumer_secret = tw_cred$consumer_private)
# Note -------------------------------------------------------------------------
# Please refer to the Documentation on where to receive your API credentials.
# ------------------------------------------------------------------------------
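# The credentials file "./twitter_api.txt" read above is assumed (not verified
# here) to be a semicolon-separated table with a header row, e.g. with
# placeholder values only:
#   appname;consumer_key;consumer_private
#   my-ilovefs-app;XXXXXXXXXXXXXXXXXXXXXXXXX;YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY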
## Collecting Tweets
tweets <- search_tweets(q = "#ilovefs",
                        n = 9999,
                        include_rts = FALSE)[,
  # include only relevant information
  c("user_id", "created_at", "text", "source", "favorite_count",
    "retweet_count", "hashtags", "urls_expanded_url", "media_expanded_url",
    "ext_media_expanded_url", "lang", "location", "status_url", "protected")]

## Some recoding, simplistic(!) anonymization
tweets <- within(data = tweets, expr = {
  # replace global user ID by an index unique only to this dataset
  user <- as.numeric(as.factor(user_id))
  rm("user_id")
  # extract date and time
  time <- sub(pattern = ".*\\s", x = created_at, replace = "")
  date <- sub(pattern = "\\s.*", x = created_at, replace = "")
  rm("created_at")
  # extract "clean" text (without URLs or linebreaks)
  ctxt <- gsub(pattern = "\n", replace = "",
               x = gsub(pattern = "http.?://.+($|\\s)", x = text, replace = ""))
  # Client data
  clnt <- as.factor(source)
  rm("source")
  # Favorites and Retweets
  favs <- favorite_count
  retw <- retweet_count
  rm(list = c("favorite_count", "retweet_count"))
  # List Hashtags in single Variable
  htag <- sapply(X = hashtags, FUN = function(x){
    paste(unlist(x), collapse = ",")
  })
  rm("hashtags")
  # URLs and Media
  link <- status_url
  urls <- list2vec(urls_expanded_url)
  murl <- list2vec(media_expanded_url)
  mext <- list2vec(ext_media_expanded_url)
  rm(list = c("urls_expanded_url", "media_expanded_url",
              "ext_media_expanded_url", "status_url"))
  # Location
  posi <- location
  rm("location")
})

## Exclusion: before 2019-01-01, after 2019-02-17, protected tweets
tweets <- tweets[(as.Date(tweets$date) > as.Date("2019-01-01") &
                  as.Date(tweets$date) < as.Date("2019-02-17")),]
tweets <- tweets[!tweets$protected,]
## Mastodon Collector {{{ ----
mastodon.extract <- function(data){
  # Within each post
  data <- sapply(X = data, FUN = function(x){
    # time and date
    time <- gsub(x = x$created_at, pattern = ".*T|\\..*", replacement = "")
    date <- sub(x = x$created_at, pattern = "T.*", replacement = "")
    # simple extraction, return NA if value does not exist
    lang <- valifexst(x$language)         # language
    inst <- valifexst(x$uri)              # instance name
    link <- valifexst(x$url)              # post URL
    rebl <- valifexst(x$reblogs_count)    # number of reblogs
    favs <- valifexst(x$favourites_count) # number of favorites
    acct <- valifexst(x$account$url)      # account URL (unique)
    # sanitizing text (removing HTML tags and whitespace)
    text <- gsub(pattern = "<.*?>|\\s{2,}", x = x$content, replacement = "")
    # media URLs (multiple possible)
    murl <- valifexst(
      sapply(X = x$media_attachments, FUN = function(y){
        list2vec(y$url)
      })
    )
    # return extracted data only
    return(data.frame(
      rbind(time, date, lang, inst, link, text, rebl, favs, acct, murl)
    ))
  })
  data <- as.data.frame(
    t(matrix(data = unlist(data), nrow = length(data[[1]])))
  )
  return(data)
}
## Set search parameters
mastodon_instance <- "https://mastodon.social"
mastodon_hashtag <- "ilovefs"
mastodon_url <- paste0(mastodon_instance,
                       "/api/v1/timelines/tag/",
                       mastodon_hashtag,
                       "?limit=40")
mastodon_iterations <- 999
toots <- c()

## Scrape Mastodon
for(i in 1:mastodon_iterations){
  # Download and extract Posts
  mastodon_reqres <- curl_fetch_memory(mastodon_url)
  mastodon_rawjson <- rawToChar(mastodon_reqres$content)
  raw_toots <- fromJSON(mastodon_rawjson)
  # If Post-Data is present, extract it. Else break the loop
  if(length(raw_toots) > 0){
    tmp_toots <- mastodon.extract(data = raw_toots)
    toots <- rbind(toots, tmp_toots)
  } else {
    break
  }
  # Update the URL for the next iteration of the for loop so we can download
  # the next toots.
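  # The Mastodon API paginates via an HTTP "Link" header, assumed here to look
  # roughly like this (URL and max_id value are illustrative only):
  #   link: <https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40&max_id=123456>; rel="next", <...>; rel="prev"
  # The code below additionally assumes this header is the 11th entry returned
  # by parse_headers(), which may not hold on every instance.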
  mastodon_lheader <- parse_headers(mastodon_reqres$headers)[11]
  mastodon_next <- sub(x = mastodon_lheader, pattern = ".*link: <",
                       replace = "")
  mastodon_url <- sub(x = mastodon_next, pattern = ">; rel=\"next\".*",
                      replace = "")
}
names(toots) <- c("time", "date", "lang", "inst", "link", "text",
                  "rebl", "favs", "acct", "murl")

## Simple(!) anonymization
toots$acct <- as.numeric(toots$acct) # unique only to this dataframe
toots$link <- as.numeric(toots$link) # unique only to this dataframe

## Cleanup
toots <- within(data = toots, expr = {
  # Time Variables
  time <- as.character(time)
  date <- as.character(date)
  fdat <- strptime(x = paste(date, time), format = "%Y-%m-%d %H:%M:%S",
                   tz = "CET")
  # Instances
  inst <- gsub(pattern = "(tag:)|(,\\d+.*)|(https:\\/\\/)|(\\/.*)",
               x = inst, replacement = "")
})

## Exclusion: only keep toots between 2019-01-01 and 2019-02-17
mst_exclude <- which(as.Date(toots$date) < as.Date("2019-01-01") |
                     as.Date(toots$date) > as.Date("2019-02-17"))
if(length(mst_exclude) > 0){
  toots <- toots[-mst_exclude,]
}
## Reddit Collector {{{ ----
### Authentication at Reddit
# no authentication necessary, hence we can directly start scraping

### Get posts on Reddit
reddit_post_dirty <- reddit_urls(search_terms = "ilovefs",
                                 #subreddit = "freesoftware linux opensource",
                                 cn_threshold = 0,
                                 page_threshold = 99999,
                                 sort_by = "new",
                                 wait_time = 5)
### Only use posts from the current year
reddit_searchinyear <- 19 # has to have format "YY", eg "19" for "2019"
reddit_post_year <- gsub(x = reddit_post_dirty$date,
                         pattern = "\\d.-\\d.-",
                         replace = "")
reddit_post <- reddit_post_dirty[which(reddit_post_year == reddit_searchinyear),]
### Extracting relevant variables
comt <- c() # Comments / Replies
subr <- c() # Subreddit
ptns <- c() # Points / Score
ttle <- c() # Title
text <- c() # Text / Content
link <- c() # Linked to Website
date <- c() # Date
rurl <- c() # Reddit-URL of post
acct <- c() # Author of Post
for(i in c(1:length(reddit_post$URL))){
  comt[i] <- reddit_post$num_comments[i]
  ttle[i] <- reddit_post$title[i]
  rurl[i] <- reddit_post$URL[i]
  date[i] <- gsub(x = reddit_post$date[i], pattern = "-", replace = "")
  subr[i] <- reddit_post$subreddit[i]
  # wait briefly between requests, then fetch the post's content
  Sys.sleep(2)
  post_content <- reddit_content(URL = reddit_post$URL[i], wait_time = 0)
  ptns[i] <- post_content$post_score[1]
  text[i] <- post_content$post_text[1]
  link[i] <- post_content$link[1]
  acct[i] <- post_content$author[1]
}
### Creating dataframe
reddit <- data.frame(cbind(date, rurl, link, text, ttle, ptns, subr, comt, acct))

#### Clean-Up
rm(list = c("date", "rurl", "link", "text", "ttle", "ptns", "subr", "comt", "acct"))
reddit <- within(data = reddit, expr = {
  date <- as.character(date)
  rurl <- as.character(rurl)
  link <- as.character(link)
  text <- as.character(text)
  ttle <- as.character(ttle)
  ptns <- as.numeric(as.character(ptns))
  subr <- as.character(subr)
  comt <- as.numeric(as.character(comt))
})
# }}}
### Exporting data {{{ ----
time_of_saving <- sub(x = Sys.time(), pattern = " CET", replace = "")
time_of_saving <- sub(x = time_of_saving, pattern = " ", replace = "_")
time_of_saving <- gsub(x = time_of_saving, pattern = ":", replace = "-")

#### RData
save_path <- paste0("./data/ilovefs-all_", time_of_saving, ".RData")
save(list = c("tweets", "toots", "reddit"), file = save_path)

#### Text
##### Fediverse
save_path_fed_t <- paste0("./data/ilovefs-fediverse_", time_of_saving, ".txt")
write.table(toots, file = save_path_fed_t)
##### Twitter
save_path_twitter_t <- paste0("./data/ilovefs-twitter_", time_of_saving, ".txt")
write.table(tweets, file = save_path_twitter_t)
##### Reddit
save_path_reddit_t <- paste0("./data/ilovefs-reddit_", time_of_saving, ".txt")
write.table(reddit, file = save_path_reddit_t)

#### CSV
##### Fediverse
save_path_fed_c <- paste0("./data/ilovefs-fediverse_", time_of_saving, ".csv")
write.csv(toots, file = save_path_fed_c)
##### Twitter
save_path_twitter_c <- paste0("./data/ilovefs-twitter_", time_of_saving, ".csv")
write.csv(tweets, file = save_path_twitter_c)
##### Reddit
save_path_reddit_c <- paste0("./data/ilovefs-reddit_", time_of_saving, ".csv")
write.csv(reddit, file = save_path_reddit_c)
# }}}
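# For a later analysis session, the combined RData file written above can be
# restored with load(); e.g. within this session, using the path variable
# defined above:
#   load(file = save_path)  # restores the tweets, toots and reddit data frames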