Collecting, Analyzing and Presenting data about the participation in #ilovefs day

collecto.R (8.4 KB)

################################################################################
# Copyright (c) 2018 Free Software Foundation Europe e.V. <contact@fsfe.org>
# Author 2018 Jan Weymeirsch <janwey@fsfe.org>
# Author 2018 Vincent Lequertier <vincent@fsfe.org>
# SPDX-License-Identifier: GPL-3.0
################################################################################

# Loading Packages and Functions -----------------------------------------------
# Twitter ------------------------------
if(!require("rtweet")){ install.packages("rtweet"); library("rtweet") }
# had to install "httr" via the package manager
# Fediverse (eg: Mastodon) -------------
if(!require("curl")){ install.packages("curl"); library("curl") }
if(!require("rjson")){ install.packages("rjson"); library("rjson") }
# Reddit -------------------------------
if(!require("RedditExtractoR")){
  install.packages("RedditExtractoR")
  library("RedditExtractoR")
}
# Export as ODS ------------------------
if(!require("readODS")){ install.packages("readODS"); library("readODS") }
# Piping (the %>% operator used below) -
if(!require("magrittr")){ install.packages("magrittr"); library("magrittr") }
# Read helper functions ----------------
source("./functions.R")

# Twitter Collector ------------------------------------------------------------
# Reading stored API credentials -------
tw_cred <- read.table(file = "../twitter_api.txt", header = TRUE, sep = ";",
                      colClasses = "character")
# Create Twitter Token -----------------
twitter_token <- create_token(app = tw_cred$appname,
                              consumer_key = tw_cred$consumer_key,
                              consumer_secret = tw_cred$consumer_private)
# Note:
# Please refer to the documentation on where to receive your API credentials.
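# For reference, a rough sketch of what "../twitter_api.txt" is expected to look
# like, inferred only from the read.table() call above (header row, ";"-separated,
# character columns) and the three fields used in create_token(); the values are
# placeholders, not real credentials:
#
#   appname;consumer_key;consumer_private
#   my_ilovefs_app;XXXXXXXXXXXXXXXXXXXXXXXXX;YYYYYYYYYYYYYYYYYYYYYYYYY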

# Collecting Tweets --------------------------------------------------------------
# Scrape Tweets ------------------------
tweets <- search_tweets(q = "#ilovefs",
                        n = 9999,
                        include_rts = FALSE)[,
  # include only relevant information
  c("user_id", "created_at", "text", "source", "favorite_count",
    "retweet_count", "hashtags", "urls_expanded_url", "media_expanded_url",
    "ext_media_expanded_url", "lang", "location", "status_url", "protected")]
# Recoding -----------------------------
tweets <- within(data = tweets, expr = {
  # replace global user ID by an index only unique to this dataset
  user <- as.numeric(as.factor(user_id))
  rm("user_id")
  # extract date and time
  time <- sub(pattern = ".*\\s", x = created_at, replace = "")
  date <- sub(pattern = "\\s.*", x = created_at, replace = "")
  rm("created_at")
  # extract "clean" text (without URLs or linebreaks)
  ctxt <- gsub(pattern = "http.?://.+($|\\s)", x = text, replace = "") %>%
          gsub(pattern = "\n", replace = "")
  # Client data
  clnt <- as.factor(source)
  rm("source")
  # Favorites and Retweets
  favs <- favorite_count
  retw <- retweet_count
  rm(list = c("favorite_count", "retweet_count"))
  # List Hashtags in a single Variable
  htag <- list2vec(x = hashtags)
  rm("hashtags")
  # URLs and Media
  link <- status_url
  urls <- list2vec(urls_expanded_url)
  murl <- list2vec(media_expanded_url)
  mext <- list2vec(ext_media_expanded_url)
  rm(list = c("urls_expanded_url", "media_expanded_url",
              "ext_media_expanded_url", "status_url"))
  # Location
  posi <- location
  rm("location")
})
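# The list2vec() helper used above is defined in functions.R (sourced at the top)
# and is not shown here. Judging purely from its use on list-columns such as
# "hashtags" and "urls_expanded_url", a minimal sketch of such a helper might
# collapse every list element into a single string, for example:
#
#   list2vec <- function(x){
#     sapply(X = x, FUN = function(y) paste(unlist(y), collapse = " "))
#   }
#
# The actual implementation in functions.R may differ (separator, handling of
# empty elements).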
# Exclusion ----------------------------
# exclude tweets posted before 2019-01-01 or after 2019-02-17, as well as
# protected tweets
tweets <- tweets[(as.Date(tweets$date) > as.Date("2019-01-01") &
                  as.Date(tweets$date) < as.Date("2019-02-17")),]
tweets <- tweets[!tweets$protected,]

# Mastodon Collector -----------------------------------------------------------
# Set search parameters ----------------
toots <- c()
mastodon_iterations <- 999
mastodon_instance <- "https://mastodon.social"
mastodon_hashtag <- "ilovefs"
mastodon_url <- paste0(mastodon_instance, "/api/v1/timelines/tag/",
                       mastodon_hashtag, "?limit=40")
# Scrape Mastodon ----------------------
for(i in 1:mastodon_iterations){
  # Download and extract Posts
  mastodon_reqres <- curl_fetch_memory(mastodon_url)
  mastodon_rawjson <- rawToChar(mastodon_reqres$content)
  raw_toots <- fromJSON(mastodon_rawjson)
  # If Post-Data is present, extract it. Else break the loop
  if(length(raw_toots) > 0){
    tmp_toots <- mastodon.extract(data = raw_toots)
    toots <- rbind(toots, tmp_toots)
  } else {
    break
  }
  # Update the URL for the next iteration of the for-loop, so we can download
  # the next batch of toots. The "Link" header (assumed here to be the 11th
  # header line of the response) holds the URL tagged rel="next".
  mastodon_lheader <- parse_headers(mastodon_reqres$headers)[11]
  mastodon_url <- gsub(x = mastodon_lheader,
                       pattern = "(.*link: <)|(>; rel=\"next\".*)",
                       replace = "")
}
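# The pagination step above relies on the HTTP "Link" header that Mastodon
# attaches to timeline responses. Assuming the usual format, that header is a
# single line looking roughly like the following (wrapped and shortened here
# for illustration), and the gsub() call keeps only the URL marked rel="next":
#
#   link: <https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40&max_id=101529>; rel="next",
#         <https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40&min_id=101570>; rel="prev"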
# adding variable-names (again)
names(toots) <- c("time", "date", "lang", "inst", "link", "text",
                  "rebl", "favs", "acct", "murl")
# Simple(!) anonymization --------------
toots$acct <- as.numeric(toots$acct)   # unique only to this dataframe
toots$link <- as.numeric(toots$link)   # unique only to this dataframe
# Cleanup ------------------------------
toots <- within(data = toots, expr = {
  # Time Variables
  time <- as.character(time)
  date <- as.character(date)
  fdat <- strptime(x = paste(date, time), format = "%Y-%m-%d %H:%M:%S",
                   tz = "CET")
  # Instances
  inst <- gsub(pattern = "(tag:)|(,\\d+.*)|(https:\\/\\/)|(\\/.*)",
               x = inst, replacement = "")
})
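# The gsub() above reduces the "inst" field to a bare instance name. Judging
# from the pattern alone, the raw values are assumed to be either tag-style
# URIs or plain status URLs; hypothetical examples of what it strips:
#
#   "tag:quitter.no,2019-02-14:noticeId=123456"           ->  "quitter.no"
#   "https://mastodon.social/users/fsfe/statuses/123456"  ->  "mastodon.social"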
# Exclusion ----------------------------
# Only include Toots from this year
mst_exclude <- which(as.Date(toots$date) < as.Date("2019-01-01") |
                     as.Date(toots$date) > as.Date("2019-01-17"))
if(length(mst_exclude) > 0){ toots <- toots[-mst_exclude,] }

# Reddit Collector -------------------------------------------------------------
# Get posts on Reddit ------------------
reddit_post_dirty <- reddit_urls(search_terms = "ilovefs",
                                 #subreddit = "freesoftware linux opensource",
                                 cn_threshold = 0,
                                 page_threshold = 99999,
                                 sort_by = "new",
                                 wait_time = 5)
# Extract relevant information ---------
reddit <- within(data = reddit_post_dirty, expr = {
  # extract the year: "20" + the two-digit year at the end of the date string
  year <- paste0(20, gsub(x = date, pattern = ".*-", replacement = ""))
  # rename relevant variables
  cmts <- num_comments
  name <- title
  subr <- subreddit
  link <- URL
  # Cleanup
  rm(list = c("num_comments", "title", "subreddit", "URL"))
})
# Exclude ------------------------------
# Limit to this year only
reddit_exclude <- which(as.numeric(reddit$year) < 2019)
if(length(reddit_exclude) > 0){ reddit <- reddit[-reddit_exclude,] }
# Additional Information ---------------
# for all remaining posts, additional information may be gathered
rdcnt <- lapply(X = reddit$link, FUN = function(x){
  reddit_content(URL = x, wait_time = 30)
})
# merge additional information into the main dataset
reddit$ptns <- reddit$text <- reddit$user <- NA
for(i in seq_along(rdcnt)){
  reddit$ptns[i] <- rdcnt[[i]]$post_score[1]
  reddit$text[i] <- rdcnt[[i]]$post_text[1]
  reddit$user[i] <- rdcnt[[i]]$author[1]
}

# Exporting data ---------------------------------------------------------------
# Create timestamp ---------------------
time_of_saving <- sub(x = Sys.time(), pattern = " CET", replace = "") %>%
                  sub(pattern = " ", replace = "_") %>%
                  gsub(pattern = ":", replace = "-")
# Save as RData ------------------------
save_path <- paste0("../data/ilovefs-all_", time_of_saving, ".RData")
save(list = c("tweets", "toots", "reddit"), file = save_path)
# Save as Text -------------------------
save_path <- paste0("../data/ilovefs-", c("fediverse_", "twitter_", "reddit_"),
                    time_of_saving, ".txt")
write.table(x = toots, file = save_path[1])
write.table(x = tweets, file = save_path[2])
write.table(x = reddit, file = save_path[3])
# Save as CSV --------------------------
save_path <- paste0("../data/ilovefs-", c("fediverse_", "twitter_", "reddit_"),
                    time_of_saving, ".csv")
write.csv(x = toots, file = save_path[1])
write.csv(x = tweets, file = save_path[2])
write.csv(x = reddit, file = save_path[3])
# Save as ODS --------------------------
save_path <- paste0("../data/ilovefs-", c("fediverse_", "twitter_", "reddit_"),
                    time_of_saving, ".ods")
write_ods(x = toots, path = save_path[1])
write_ods(x = tweets, path = save_path[2])
write_ods(x = reddit, path = save_path[3])
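# To pick the data back up for the analysis step, the .RData dump can be
# reloaded in a later session; the exact file name depends on the timestamp
# created above, for example (hypothetical timestamp):
#
#   load("../data/ilovefs-all_2019-02-18_12-00-00.RData")
#   # afterwards the objects "tweets", "toots" and "reddit" are available again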