Collecting, Analyzing and Presenting data about participation in #ilovefs day

collecto.R 13KB

################################################################################
# collecto.R #
# Collecting data from different social media sources with a specified #
# searchterm/criteria #
# Jan Weymeirsch janwey@fsfe.org #
################################################################################

### Loading Packages {{{ ----
#### Twitter
install.packages("rtweet")
library("rtweet")
# had to install "httr" via packagemanager

#### Fediverse (eg: mastodon)
install.packages("curl")
library("curl")
install.packages("rjson")
library("rjson")

### Reddit
install.packages("RedditExtractoR")
library("RedditExtractoR")
# }}}
## Twitter Collector {{{ ----

### Authenticate to Twitter
#### Manual input (uncomment if needed)
#twitter_consumerkey <- readline("[Twitter] Enter your consumer API key.")
#twitter_consumerpri <- readline("[Twitter] Enter your consumer API secret.")
#twitter_tokenaccess <- readline("[Twitter] Enter your Access Token.")
#twitter_tokensecret <- readline("[Twitter] Enter your Token Secret.")

#### Saved credentials
twitter_api_cred <- read.table(file = "./twitter_api.txt", header = TRUE, sep = ";")
twitter_consumerkey <- as.character(twitter_api_cred$consumer_key)
twitter_consumerpri <- as.character(twitter_api_cred$consumer_private)
twitter_appname <- as.character(twitter_api_cred$appname)
twitter_token <- create_token(app = twitter_appname,
                              consumer_key = twitter_consumerkey,
                              consumer_secret = twitter_consumerpri)

# Note -------------------------------------------------------------------------
# Please refer to the Documentation on where to receive your API credentials.
# ------------------------------------------------------------------------------
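# The credentials file read above is assumed to be a semicolon-separated text
# file with a header row naming the columns "appname", "consumer_key" and
# "consumer_private", followed by one line holding the actual values, e.g.
# (values are placeholders):
#
#   appname;consumer_key;consumer_private
#   my_ilovefs_app;xxxxxxxx;xxxxxxxx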
### Collecting Tweets
twitter_tw <- search_tweets("#ilovefs", n = 9999, include_rts = FALSE)
twitter_number <- length(twitter_tw$text)

text <- twitter_tw$text
user <- twitter_tw$screen_name
clnt <- twitter_tw$source
favs <- twitter_tw$favorite_count
retw <- twitter_tw$retweet_count
lang <- twitter_tw$lang
fdat <- twitter_tw$created_at

link <- vector(mode = "character", length = twitter_number)
murl <- vector(mode = "character", length = twitter_number)
for(i in 1:twitter_number){
  link[i] <- twitter_tw$urls_expanded_url[[i]][1]
  murl[i] <- twitter_tw$media_expanded_url[[i]][1]
}
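# urls_expanded_url and media_expanded_url are list-columns (a single tweet
# can carry several links or media attachments); the loop above keeps only the
# first entry per tweet and yields NA for tweets without any.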
### Forming variables for dataframe
time <- sub(pattern = ".* ", x = fdat, replace = "")
time <- gsub(pattern = ":", x = time, replace = "")
date <- sub(pattern = " .*", x = fdat, replace = "")
date <- gsub(pattern = "-", x = date, replace = "")
### Only include Tweets from the campaign timeframe (2018-02-10 to 2018-02-16)
twitter_exclude <- which(as.numeric(date) > 20180216 | as.numeric(date) < 20180210)
date <- date[-twitter_exclude]
time <- time[-twitter_exclude]
fdat <- fdat[-twitter_exclude]
retw <- retw[-twitter_exclude]
favs <- favs[-twitter_exclude]
text <- text[-twitter_exclude]
lang <- lang[-twitter_exclude]
murl <- murl[-twitter_exclude]
link <- link[-twitter_exclude]
clnt <- clnt[-twitter_exclude]
user <- user[-twitter_exclude]

### Creating dataframe
twitter <- data.frame(cbind(date, time, fdat, retw, favs, text, lang, murl, link, clnt, user))

#### Clean-Up
rm(list = c("date", "time", "fdat", "retw", "favs", "text", "link", "murl", "lang", "clnt", "user"))
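# cbind() first coerces everything into a character matrix and data.frame()
# (with the pre-R-4.0 default stringsAsFactors = TRUE) then stores the columns
# as factors, so the block below casts them back to plain character vectors.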
twitter <- within(data = twitter, expr = {
  date <- as.character(date);
  time <- as.character(time);
  fdat <- as.character(fdat);
  retw <- as.character(retw);
  favs <- as.character(favs);
  text <- as.character(text);
  link <- as.character(link);
  murl <- as.character(murl);
  lang <- as.character(lang);
  clnt <- as.character(clnt);
  user <- as.character(user);
})
# }}}

## Mastodon Collector with curl {{{ ----
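# The Fediverse data is collected over the plain HTTP API with curl; the
# public tag timeline queried below is readable without authentication, which
# is why this section does not need any token handling.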
mastodon.fetchdata <- function(data){

  tmp_datetime <- c()
  tmp_lang <- c()
  tmp_inst <- c()
  tmp_link <- c()
  tmp_text <- c()
  tmp_reto <- c()
  tmp_favs <- c()
  tmp_murl <- c()
  tmp_acct <- c()

  for(i in 1:length(data)){

    #### Time and Date of Toot
    if(length(data[[i]]$created_at) > 0){
      tmp_datetime[i] <- data[[i]]$created_at
    } else {
      # insert empty value, if it does not exist
      tmp_datetime[i] <- NA
    }

    #### Language of Toot
    if(length(data[[i]]$language) > 0){
      tmp_lang[i] <- data[[i]]$language
    } else {
      # insert empty value, if it does not exist
      tmp_lang[i] <- NA
    }

    #### Instance of Toot
    if(length(data[[i]]$uri) > 0){
      tmp_inst[i] <- data[[i]]$uri
    } else {
      # insert empty value, if it does not exist
      tmp_inst[i] <- NA
    }

    #### URL of Toot
    if(length(data[[i]]$url) > 0){
      tmp_link[i] <- data[[i]]$url
    } else {
      # insert empty value, if it does not exist
      tmp_link[i] <- NA
    }

    #### Text/Content of Toot
    if(length(data[[i]]$content) > 0){
      tmp_text[i] <- data[[i]]$content
    } else {
      # insert empty value, if it does not exist
      tmp_text[i] <- NA
    }

    #### Number of Retoots
    if(length(data[[i]]$reblogs_count) > 0){
      tmp_reto[i] <- data[[i]]$reblogs_count
    } else {
      # insert empty value, if it does not exist
      tmp_reto[i] <- NA
    }

    #### Number of Favorites
    if(length(data[[i]]$favourites_count) > 0){
      tmp_favs[i] <- data[[i]]$favourites_count
    } else {
      # insert empty value, if it does not exist
      tmp_favs[i] <- NA
    }
    #### Media-URL of Toot (first attachment, if any)
    if(length(data[[i]]$media_attachments) > 0){
      tmp_murl[i] <- data[[i]]$media_attachments[[1]]$url
    } else {
      # insert empty value, if it does not exist
      tmp_murl[i] <- NA
    }

    #### Account of Tooter
    if(length(data[[i]]$account) > 0){
      tmp_acct[i] <- data[[i]]$account$acct
    } else {
      # insert empty value, if it does not exist
      tmp_acct[i] <- NA
    }
  }

  return(data.frame(cbind(tmp_datetime,
                          tmp_lang,
                          tmp_inst,
                          tmp_text,
                          tmp_link,
                          tmp_reto,
                          tmp_favs,
                          tmp_murl,
                          tmp_acct)))
}
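# mastodon.fetchdata() flattens one page of the parsed JSON (a list with one
# element per toot) into a data.frame with one row per toot, inserting NA
# wherever a field is missing from the API response.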
datetime <- c()
lang <- c()
inst <- c()
link <- c()
text <- c()
reto <- c()
favs <- c()
murl <- c()
acct <- c()

mastodon_instance <- "https://mastodon.social"
mastodon_hashtag <- "ilovefs"
mastodon_url <- paste0(mastodon_instance,
                       "/api/v1/timelines/tag/",
                       mastodon_hashtag,
                       "?limit=40")
for(i in 1:999){
  mastodon_reqres <- curl_fetch_memory(mastodon_url)
  mastodon_rawjson <- rawToChar(mastodon_reqres$content)
  toots <- fromJSON(mastodon_rawjson)
  if(length(toots) > 0){
    tmp_mastodon_df <- mastodon.fetchdata(data = toots)
    datetime <- c(datetime, as.character(tmp_mastodon_df$tmp_datetime))
    lang <- c(lang, as.character(tmp_mastodon_df$tmp_lang))
    inst <- c(inst, as.character(tmp_mastodon_df$tmp_inst))
    link <- c(link, as.character(tmp_mastodon_df$tmp_link))
    text <- c(text, as.character(tmp_mastodon_df$tmp_text))
    reto <- c(reto, as.character(tmp_mastodon_df$tmp_reto))
    favs <- c(favs, as.character(tmp_mastodon_df$tmp_favs))
    murl <- c(murl, as.character(tmp_mastodon_df$tmp_murl))
    acct <- c(acct, as.character(tmp_mastodon_df$tmp_acct))
  } else {
    break
  }

  # Update the URL for the next iteration of the for loop so we can download
  # the next toots.
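  # The API paginates its results and announces the URL of the following page
  # in the "Link" HTTP response header; the two sub() calls below cut that URL
  # out of the header line. Picking element [11] of the parsed headers assumes
  # a fixed header order in the response and may need adjusting if the
  # instance sends its headers differently.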
  mastodon_lheader <- parse_headers(mastodon_reqres$headers)[11]
  mastodon_next <- sub(x = mastodon_lheader, pattern = ".*link: <", replace = "")
  mastodon_url <- sub(x = mastodon_next, pattern = ">; rel=\"next\".*", replace = "")
}

### Time of post
#### date
date <- sub(pattern = "T.*", x = datetime, replacement = "")
date <- gsub(pattern = "-", x = date, replacement = "")
#### time
time <- sub(pattern = ".*T", x = datetime, replacement = "")
time <- sub(pattern = "\\..*", x = time, replacement = "")
time <- gsub(pattern = ":", x = time, replacement = "")
#### full time
fdat <- strptime(x = paste0(date, time), format = "%Y%m%d%H%M%S", tz = "CET")
fdat <- as.character(fdat)
### Removing HTML-Tags and non-breaking-space entities from Toots
text <- gsub(pattern = "<.*?>", x = text, replacement = "")
text <- gsub(pattern = "&nbsp;", x = text, replacement = "")
### Cleaning Instance-String
#### GNUsocial
inst <- sub(pattern = "tag:", x = inst, replacement = "")
inst <- sub(pattern = ",\\d+.*", x = inst, replacement = "")
#### Mastodon
inst <- sub(pattern = "https:\\/\\/", x = inst, replacement = "")
inst <- sub(pattern = "\\/.*", x = inst, replacement = "")
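# GNU Social statuses identify themselves with a "tag:<domain>,<date>:..." URI,
# while Mastodon statuses use a plain https URL; the substitutions above reduce
# both forms to the bare domain name of the originating instance.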
### Only include Toots from the campaign timeframe (2018-02-10 to 2018-02-16)
mastodon_exclude <- which(as.numeric(date) < 20180210 | as.numeric(date) > 20180216)
date <- date[-mastodon_exclude]
time <- time[-mastodon_exclude]
fdat <- fdat[-mastodon_exclude]
lang <- lang[-mastodon_exclude]
inst <- inst[-mastodon_exclude]
text <- text[-mastodon_exclude]
link <- link[-mastodon_exclude]
reto <- reto[-mastodon_exclude]
favs <- favs[-mastodon_exclude]
murl <- murl[-mastodon_exclude]
acct <- acct[-mastodon_exclude]

### Creating dataframe
mastodon <- data.frame(cbind(date, time, fdat, lang, inst, text, link, reto, favs, murl, acct))

#### Clean-Up
rm(list = c("date", "time", "fdat", "lang", "inst", "text", "link", "favs", "reto", "murl", "datetime", "acct"))

mastodon <- within(data = mastodon, expr = {
  date <- as.character(date);
  time <- as.character(time);
  fdat <- as.character(fdat);
  text <- as.character(text);
  link <- as.character(link);
  murl <- as.character(murl);
})
# }}}
## Reddit Collector {{{ ----

### Authentication at Reddit
# no authentication necessary, hence we can directly start scraping

### Get posts on Reddit
reddit_post_dirty <- reddit_urls(search_terms = "ilovefs",
                                 #subreddit = "freesoftware linux opensource",
                                 cn_threshold = 0,
                                 page_threshold = 99999,
                                 sort_by = "new",
                                 wait_time = 5)
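# reddit_urls() from RedditExtractoR searches Reddit for the given term and
# returns one row per matching post (URL, title, date, subreddit, number of
# comments). cn_threshold = 0 keeps posts regardless of their comment count,
# page_threshold is set high enough to fetch all result pages, and wait_time
# pauses between requests to stay within Reddit's rate limits.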
### Only use posts from the current year
reddit_searchinyear <- 18 # has to have format "YY", eg "18" for "2018"
reddit_post_year <- gsub(x = reddit_post_dirty$date,
                         pattern = "\\d.-\\d.-",
                         replace = "")
reddit_post <- reddit_post_dirty[which(reddit_post_year == reddit_searchinyear),]
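# This assumes the dates returned by reddit_urls() come as "DD-MM-YY" strings,
# so stripping the leading day and month with the regex leaves only the
# two-digit year, which is then compared against reddit_searchinyear.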
### Extracting relevant variables
comt <- c() # Comments / Replies
subr <- c() # Subreddit
ptns <- c() # Points / Score
ttle <- c() # Title
text <- c() # Text / Content
link <- c() # Linked to Website
date <- c() # Date
rurl <- c() # Reddit-URL of post
acct <- c() # Author of Post

for(i in c(1:length(reddit_post$URL))){
  comt[i] <- reddit_post$num_comments[i]
  ttle[i] <- reddit_post$title[i]
  rurl[i] <- reddit_post$URL[i]
  date[i] <- gsub(x = reddit_post$date[i], pattern = "-", replace = "")
  subr[i] <- reddit_post$subreddit[i]
  Sys.sleep(2)
  reddit_content <- reddit_content(URL = reddit_post$URL[i], wait_time = 0)
  ptns[i] <- reddit_content$post_score[1]
  text[i] <- reddit_content$post_text[1]
  link[i] <- reddit_content$link[1]
  acct[i] <- reddit_content$author[1]
}
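# For every post, reddit_content() downloads the full comment thread; since
# the post-level fields (score, text, link, author) are repeated on each row
# of that result, only the first row is used here. Sys.sleep(2) adds an extra
# pause per post so the scraper stays polite towards Reddit.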
### Creating dataframe
reddit <- data.frame(cbind(date, rurl, link, text, ttle, ptns, subr, comt, acct))

#### Clean-Up
rm(list = c("date", "rurl", "link", "text", "ttle", "ptns", "subr", "comt", "acct"))

reddit <- within(data = reddit, expr = {
  date <- as.character(date);
  rurl <- as.character(rurl);
  link <- as.character(link);
  text <- as.character(text);
  ttle <- as.character(ttle);
  ptns <- as.numeric(as.character(ptns));
  subr <- as.character(subr);
  comt <- as.numeric(as.character(comt));
})
# }}}
### Exporting data {{{ ----
time_of_saving <- sub(x = Sys.time(), pattern = " CET", replace = "")
time_of_saving <- sub(x = time_of_saving, pattern = " ", replace = "_")
time_of_saving <- gsub(x = time_of_saving, pattern = ":", replace = "-")

#### RData
save_path <- paste0("./data/ilovefs-all_", time_of_saving, ".RData")
save(list = c("twitter", "mastodon", "reddit"), file = save_path)

#### Text
##### Fediverse
save_path_fed_t <- paste0("./data/ilovefs-fediverse_", time_of_saving, ".txt")
write.table(mastodon, file = save_path_fed_t)
##### Twitter
save_path_twitter_t <- paste0("./data/ilovefs-twitter_", time_of_saving, ".txt")
write.table(twitter, file = save_path_twitter_t)
##### Reddit
save_path_reddit_t <- paste0("./data/ilovefs-reddit_", time_of_saving, ".txt")
write.table(reddit, file = save_path_reddit_t)

#### CSV
##### Fediverse
save_path_fed_c <- paste0("./data/ilovefs-fediverse_", time_of_saving, ".csv")
write.csv(mastodon, file = save_path_fed_c)
##### Twitter
save_path_twitter_c <- paste0("./data/ilovefs-twitter_", time_of_saving, ".csv")
write.csv(twitter, file = save_path_twitter_c)
##### Reddit
save_path_reddit_c <- paste0("./data/ilovefs-reddit_", time_of_saving, ".csv")
write.csv(reddit, file = save_path_reddit_c)
# }}}
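# The ./data directory is assumed to exist before saving. For the later
# analysis/plotting step, the combined dataset can simply be re-loaded, e.g.
# (file name is illustrative, the timestamp depends on when the collector ran):
#   load("./data/ilovefs-all_2018-02-20_12-00-00.RData")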