Collecting, analyzing and presenting data about participation in #ilovefs day

collecto.R

################################################################################
# Copyright (c) 2018 Free Software Foundation Europe e.V. <contact@fsfe.org>
# Author 2018 Jan Weymeirsch <janwey@fsfe.org>
# Author 2018 Vincent Lequertier <vincent@fsfe.org>
# SPDX-License-Identifier: GPL-3.0
################################################################################
### Loading Packages {{{ ----
#### Twitter
install.packages("rtweet")
library("rtweet")
# Note: "httr" had to be installed via the system package manager
#### Fediverse (e.g. Mastodon)
install.packages("curl")
library("curl")
install.packages("rjson")
library("rjson")
#### Reddit
install.packages("RedditExtractoR")
library("RedditExtractoR")
# }}}
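# Optional variant (a sketch, not part of the original script): the
# unconditional install.packages() calls above re-download the packages on
# every run. If that is not wanted, each install can be guarded so it only
# happens when the package is missing:
#for(pkg in c("rtweet", "curl", "rjson", "RedditExtractoR")){
#  if(!requireNamespace(pkg, quietly = TRUE)){
#    install.packages(pkg)                 # install only if not yet available
#  }
#  library(pkg, character.only = TRUE)     # then load it as usual
#}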
## Twitter Collector {{{ ----
### Authenticate to Twitter
#### Manual input (uncomment if needed)
#twitter_consumerkey <- readline("[Twitter] Enter your consumer API key.")
#twitter_consumerpri <- readline("[Twitter] Enter your consumer API secret.")
#twitter_tokenaccess <- readline("[Twitter] Enter your Access Token.")
#twitter_tokensecret <- readline("[Twitter] Enter your Token Secret.")
#### Saved credentials
twitter_api_cred <- read.table(file = "./twitter_api.txt", header = TRUE, sep = ";")
twitter_consumerkey <- as.character(twitter_api_cred$consumer_key)
twitter_consumerpri <- as.character(twitter_api_cred$consumer_private)
twitter_appname <- as.character(twitter_api_cred$appname)
twitter_token <- create_token(app = twitter_appname,
                              consumer_key = twitter_consumerkey,
                              consumer_secret = twitter_consumerpri)
# Note -------------------------------------------------------------------------
# Please refer to the documentation on how to obtain your API credentials.
# ------------------------------------------------------------------------------
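# The credentials file read above is expected to be a single-row, semicolon-
# separated table with the columns "consumer_key", "consumer_private" and
# "appname" (these names are taken from the read-out above). A hypothetical
# template with placeholder values could be generated like this:
twitter_api_template <- data.frame(consumer_key = "YOUR_CONSUMER_KEY",
                                   consumer_private = "YOUR_CONSUMER_SECRET",
                                   appname = "YOUR_APP_NAME")
# uncomment to write the template; do not overwrite an existing credentials file
#write.table(twitter_api_template, file = "./twitter_api.txt", sep = ";", row.names = FALSE)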
### Collecting Tweets
twitter_tw <- search_tweets(q = "#ilovefs",
                            n = 9999,
                            include_rts = FALSE)
twitter_number <- length(twitter_tw$text)
text <- twitter_tw$text
user <- twitter_tw$screen_name
clnt <- twitter_tw$source
favs <- twitter_tw$favorite_count
retw <- twitter_tw$retweet_count
lang <- twitter_tw$lang
fdat <- twitter_tw$created_at
link <- vector(mode = "character", length = twitter_number)
murl <- vector(mode = "character", length = twitter_number)
for(i in 1:twitter_number){
  link[i] <- twitter_tw$urls_expanded_url[[i]][1]
  murl[i] <- twitter_tw$media_expanded_url[[i]][1]
}
### Forming variables for dataframe
time <- sub(pattern = ".* ", x = fdat, replace = "")
time <- gsub(pattern = ":", x = time, replace = "")
date <- sub(pattern = " .*", x = fdat, replace = "")
date <- gsub(pattern = "-", x = date, replace = "")
### Only include Tweets from the #ilovefs week (2018-02-10 to 2018-02-16)
twitter_exclude <- which(as.numeric(date) > 20180216 | as.numeric(date) < 20180210)
date <- date[-twitter_exclude]
time <- time[-twitter_exclude]
fdat <- fdat[-twitter_exclude]
retw <- retw[-twitter_exclude]
favs <- favs[-twitter_exclude]
text <- text[-twitter_exclude]
lang <- lang[-twitter_exclude]
murl <- murl[-twitter_exclude]
link <- link[-twitter_exclude]
clnt <- clnt[-twitter_exclude]
user <- user[-twitter_exclude]
### Creating dataframe
twitter <- data.frame(cbind(date, time, fdat, retw, favs, text, lang, murl, link, clnt, user))
#### Clean-Up
rm(list = c("date", "time", "fdat", "retw", "favs", "text", "link", "murl", "lang", "clnt", "user"))
twitter <- within(data = twitter, expr = {
  date <- as.character(date);
  time <- as.character(time);
  fdat <- as.character(fdat);
  retw <- as.character(retw);
  favs <- as.character(favs);
  text <- as.character(text);
  link <- as.character(link);
  murl <- as.character(murl);
  lang <- as.character(lang);
  clnt <- as.character(clnt);
  user <- as.character(user);
})
# }}}
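# Optional sanity check (a sketch, not part of the original script): show how
# many tweets were collected per day and which clients were used most, to
# verify that the date filter above behaved as expected.
print(table(twitter$date))
print(head(sort(table(twitter$clnt), decreasing = TRUE)))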
## Mastodon Collector {{{ ----
mastodon.fetchdata <- function(data){
  tmp_datetime <- c()
  tmp_lang <- c()
  tmp_inst <- c()
  tmp_link <- c()
  tmp_text <- c()
  tmp_reto <- c()
  tmp_favs <- c()
  tmp_murl <- c()
  tmp_acct <- c()
  for(i in 1:length(data)){
    #### Time and Date of Toot
    if(length(data[[i]]$created_at) > 0){
      tmp_datetime[i] <- data[[i]]$created_at
    } else {
      # insert empty value, if it does not exist
      tmp_datetime[i] <- NA
    }
    #### Language of Toot
    if(length(data[[i]]$language) > 0){
      tmp_lang[i] <- data[[i]]$language
    } else {
      # insert empty value, if it does not exist
      tmp_lang[i] <- NA
    }
    #### Instance of Toot
    if(length(data[[i]]$uri) > 0){
      tmp_inst[i] <- data[[i]]$uri
    } else {
      # insert empty value, if it does not exist
      tmp_inst[i] <- NA
    }
    #### URL of Toot
    if(length(data[[i]]$url) > 0){
      tmp_link[i] <- data[[i]]$url
    } else {
      # insert empty value, if it does not exist
      tmp_link[i] <- NA
    }
    #### Text/Content of Toot
    if(length(data[[i]]$content) > 0){
      tmp_text[i] <- data[[i]]$content
    } else {
      # insert empty value, if it does not exist
      tmp_text[i] <- NA
    }
    #### Number of Retoots
    if(length(data[[i]]$reblogs_count) > 0){
      tmp_reto[i] <- data[[i]]$reblogs_count
    } else {
      # insert empty value, if it does not exist
      tmp_reto[i] <- NA
    }
    #### Number of Favorites
    if(length(data[[i]]$favourites_count) > 0){
      tmp_favs[i] <- data[[i]]$favourites_count
    } else {
      # insert empty value, if it does not exist
      tmp_favs[i] <- NA
    }
    #### URL of attached Media
    if(length(data[[i]]$media_attachments) > 0){
      tmp_murl[i] <- data[[i]]$media_attachments[[1]]$url
    } else {
      # insert empty value, if it does not exist
      tmp_murl[i] <- NA
    }
    #### Account of Tooter
    if(length(data[[i]]$account) > 0){
      tmp_acct[i] <- data[[i]]$account$acct
    } else {
      # insert empty value, if it does not exist
      tmp_acct[i] <- NA
    }
  }
  return(data.frame(cbind(tmp_datetime,
                          tmp_lang,
                          tmp_inst,
                          tmp_text,
                          tmp_link,
                          tmp_reto,
                          tmp_favs,
                          tmp_murl,
                          tmp_acct)))
}
datetime <- c()
lang <- c()
inst <- c()
link <- c()
text <- c()
reto <- c()
favs <- c()
murl <- c()
acct <- c()
mastodon_instance <- "https://mastodon.social"
mastodon_hashtag <- "ilovefs"
mastodon_url <- paste0(mastodon_instance,
                       "/api/v1/timelines/tag/",
                       mastodon_hashtag,
                       "?limit=40")
for(i in 1:999){
  mastodon_reqres <- curl_fetch_memory(mastodon_url)
  mastodon_rawjson <- rawToChar(mastodon_reqres$content)
  toots <- fromJSON(mastodon_rawjson)
  if(length(toots) > 0){
    tmp_mastodon_df <- mastodon.fetchdata(data = toots)
    datetime <- c(datetime, as.character(tmp_mastodon_df$tmp_datetime))
    lang <- c(lang, as.character(tmp_mastodon_df$tmp_lang))
    inst <- c(inst, as.character(tmp_mastodon_df$tmp_inst))
    link <- c(link, as.character(tmp_mastodon_df$tmp_link))
    text <- c(text, as.character(tmp_mastodon_df$tmp_text))
    reto <- c(reto, as.character(tmp_mastodon_df$tmp_reto))
    favs <- c(favs, as.character(tmp_mastodon_df$tmp_favs))
    murl <- c(murl, as.character(tmp_mastodon_df$tmp_murl))
    acct <- c(acct, as.character(tmp_mastodon_df$tmp_acct))
  } else {
    break
  }
  # Update the URL for the next iteration of the loop, so we can download
  # the next page of toots.
  mastodon_lheader <- parse_headers(mastodon_reqres$headers)[11]
  mastodon_next <- sub(x = mastodon_lheader, pattern = ".*link: <", replace = "")
  mastodon_url <- sub(x = mastodon_next, pattern = ">; rel=\"next\".*", replace = "")
}
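# Note (an added sketch, not part of the original script): parse_headers(...)[11]
# above assumes the "Link" header is always the 11th header line of the
# response, which depends on the instance and HTTP version. A more defensive
# helper could locate the header by name and extract the "next" URL from it:
mastodon.nexturl <- function(headers){
  hdr <- parse_headers(headers)
  # pick the "Link" header line, regardless of its position or case
  link_line <- hdr[grepl(pattern = "^link:", x = hdr, ignore.case = TRUE)][1]
  # the "next" link is listed first, between "link: <" and ">; rel=\"next\""
  sub(pattern = ">; rel=\"next\".*", replacement = "",
      x = sub(pattern = "^link: <", replacement = "", x = link_line,
              ignore.case = TRUE))
}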
### Time of post
#### date
date <- sub(pattern = "T.*", x = datetime, replacement = "")
date <- gsub(pattern = "-", x = date, replacement = "")
#### time
time <- sub(pattern = ".*T", x = datetime, replacement = "")
time <- sub(pattern = "\\..*", x = time, replacement = "")
time <- gsub(pattern = ":", x = time, replacement = "")
#### full time
fdat <- strptime(x = paste0(date, time), format = "%Y%m%d%H%M%S", tz = "CET")
fdat <- as.character(fdat)
### Removing HTML-Tags and non-breaking spaces from Toots
text <- gsub(pattern = "<.*?>", x = text, replacement = "")
text <- gsub(pattern = "\u00a0", x = text, replacement = "")
### Cleaning Instance-String
#### GNUsocial
inst <- sub(pattern = "tag:", x = inst, replacement = "")
inst <- sub(pattern = ",\\d+.*", x = inst, replacement = "")
#### Mastodon
inst <- sub(pattern = "https:\\/\\/", x = inst, replacement = "")
inst <- sub(pattern = "\\/.*", x = inst, replacement = "")
### Only include Toots from the #ilovefs week (2018-02-10 to 2018-02-16)
mastodon_exclude <- which(as.numeric(date) < 20180210 | as.numeric(date) > 20180216)
date <- date[-mastodon_exclude]
time <- time[-mastodon_exclude]
fdat <- fdat[-mastodon_exclude]
lang <- lang[-mastodon_exclude]
inst <- inst[-mastodon_exclude]
text <- text[-mastodon_exclude]
link <- link[-mastodon_exclude]
reto <- reto[-mastodon_exclude]
favs <- favs[-mastodon_exclude]
murl <- murl[-mastodon_exclude]
acct <- acct[-mastodon_exclude]
### Creating dataframe
mastodon <- data.frame(cbind(date, time, fdat, lang, inst, text, link, reto, favs, murl, acct))
#### Clean-Up
rm(list = c("date", "time", "fdat", "lang", "inst", "text", "link", "favs", "reto", "murl", "datetime", "acct"))
mastodon <- within(data = mastodon, expr = {
  date <- as.character(date);
  time <- as.character(time);
  fdat <- as.character(fdat);
  text <- as.character(text);
  link <- as.character(link);
  murl <- as.character(murl);
})
# }}}
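# Optional check (a sketch, not part of the original script): count the
# collected toots per instance, to see how far the #ilovefs tag spread across
# the Fediverse.
print(sort(table(mastodon$inst), decreasing = TRUE))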
## Reddit Collector {{{ ----
### Authentication at Reddit
# No authentication is necessary, hence we can start scraping directly.
### Get posts on Reddit
reddit_post_dirty <- reddit_urls(search_terms = "ilovefs",
                                 #subreddit = "freesoftware linux opensource",
                                 cn_threshold = 0,
                                 page_threshold = 99999,
                                 sort_by = "new",
                                 wait_time = 5)
### Only use posts from the current year
reddit_searchinyear <- 18 # has to have the format "YY", e.g. "18" for "2018"
reddit_post_year <- gsub(x = reddit_post_dirty$date,
                         pattern = "\\d.-\\d.-",
                         replace = "")
reddit_post <- reddit_post_dirty[which(reddit_post_year == reddit_searchinyear),]
### Extracting relevant variables
comt <- c() # Comments / Replies
subr <- c() # Subreddit
ptns <- c() # Points / Score
ttle <- c() # Title
text <- c() # Text / Content
link <- c() # Linked to Website
date <- c() # Date
rurl <- c() # Reddit-URL of post
acct <- c() # Author of Post
for(i in c(1:length(reddit_post$URL))){
  comt[i] <- reddit_post$num_comments[i]
  ttle[i] <- reddit_post$title[i]
  rurl[i] <- reddit_post$URL[i]
  date[i] <- gsub(x = reddit_post$date[i], pattern = "-", replace = "")
  subr[i] <- reddit_post$subreddit[i]
  Sys.sleep(2)
  # fetch the full thread; a separate variable name is used so the
  # reddit_content() function is not shadowed by its own result
  reddit_thread <- reddit_content(URL = reddit_post$URL[i], wait_time = 0)
  ptns[i] <- reddit_thread$post_score[1]
  text[i] <- reddit_thread$post_text[1]
  link[i] <- reddit_thread$link[1]
  acct[i] <- reddit_thread$author[1]
}
### Creating dataframe
reddit <- data.frame(cbind(date, rurl, link, text, ttle, ptns, subr, comt, acct))
#### Clean-Up
rm(list = c("date", "rurl", "link", "text", "ttle", "ptns", "subr", "comt", "acct"))
reddit <- within(data = reddit, expr = {
  date <- as.character(date);
  rurl <- as.character(rurl);
  link <- as.character(link);
  text <- as.character(text);
  ttle <- as.character(ttle);
  ptns <- as.numeric(as.character(ptns));
  subr <- as.character(subr);
  comt <- as.numeric(as.character(comt));
})
# }}}
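# Optional summary (a sketch, not part of the original script): number of
# collected posts plus their total score and comment count, as a quick
# plausibility check of the Reddit data.
print(c(posts = nrow(reddit),
        points = sum(reddit$ptns, na.rm = TRUE),
        comments = sum(reddit$comt, na.rm = TRUE)))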
## Exporting data {{{ ----
time_of_saving <- sub(x = Sys.time(), pattern = " CET", replace = "")
time_of_saving <- sub(x = time_of_saving, pattern = " ", replace = "_")
time_of_saving <- gsub(x = time_of_saving, pattern = ":", replace = "-")
#### RData
save_path <- paste0("./data/ilovefs-all_", time_of_saving, ".RData")
save(list = c("twitter", "mastodon", "reddit"), file = save_path)
#### Text
##### Fediverse
save_path_fed_t <- paste0("./data/ilovefs-fediverse_", time_of_saving, ".txt")
write.table(mastodon, file = save_path_fed_t)
##### Twitter
save_path_twitter_t <- paste0("./data/ilovefs-twitter_", time_of_saving, ".txt")
write.table(twitter, file = save_path_twitter_t)
##### Reddit
save_path_reddit_t <- paste0("./data/ilovefs-reddit_", time_of_saving, ".txt")
write.table(reddit, file = save_path_reddit_t)
#### CSV
##### Fediverse
save_path_fed_c <- paste0("./data/ilovefs-fediverse_", time_of_saving, ".csv")
write.csv(mastodon, file = save_path_fed_c)
##### Twitter
save_path_twitter_c <- paste0("./data/ilovefs-twitter_", time_of_saving, ".csv")
write.csv(twitter, file = save_path_twitter_c)
##### Reddit
save_path_reddit_c <- paste0("./data/ilovefs-reddit_", time_of_saving, ".csv")
write.csv(reddit, file = save_path_reddit_c)
# }}}
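# To pick up the analysis later, the .RData file saved above can simply be
# loaded again, restoring the "twitter", "mastodon" and "reddit" dataframes
# (a usage sketch, not part of the original script; uncomment if needed):
#load(file = save_path)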