Collecting, analyzing and presenting data about participation in the #ilovefs day

collecto.R 14KB

################################################################################
# collecto.R                                                                   #
# Collecting data from different social media sources with a specified        #
# search term / criteria                                                       #
# Jan Weymeirsch janwey@fsfe.org                                               #
################################################################################
### Loading Packages {{{ ----
#### Twitter
install.packages("twitteR")
library("twitteR")
# had to install "httr" via the package manager
#### Fediverse (e.g. Mastodon)
install.packages("curl")
library("curl")
install.packages("rjson")
library("rjson")
#### Reddit
install.packages("RedditExtractoR")
library("RedditExtractoR")
# }}}
## Twitter Collector {{{ ----
### Authenticate to Twitter
#### Manual input (uncomment if needed)
#twitter_consumerkey <- readline("[Twitter] Enter your consumer API key.")
#twitter_consumerpri <- readline("[Twitter] Enter your consumer API secret.")
#twitter_tokenaccess <- readline("[Twitter] Enter your Access Token.")
#twitter_tokensecret <- readline("[Twitter] Enter your Token Secret.")
#### Saved credentials
twitter_api_cred <- read.table(file = "./twitter_api.txt", header = TRUE, sep = ";")
twitter_consumerkey <- as.character(twitter_api_cred$consumer_key)
twitter_consumerpri <- as.character(twitter_api_cred$consumer_private)
twitter_tokenaccess <- as.character(twitter_api_cred$access_token)
twitter_tokensecret <- as.character(twitter_api_cred$token_secret)
setup_twitter_oauth(consumer_key = twitter_consumerkey,
                    consumer_secret = twitter_consumerpri,
                    access_token = twitter_tokenaccess,
                    access_secret = twitter_tokensecret)
# Note -------------------------------------------------------------------------
# Please refer to the Documentation on where to receive your API credentials.
# ------------------------------------------------------------------------------
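# Example ------------------------------------------------------------------
# The layout of "twitter_api.txt" is not included here; judging from the
# read.table() call above (header = TRUE, sep = ";") and the columns accessed
# afterwards, it is assumed to be a semicolon-separated file with one header
# row and one data row, roughly like this (placeholder values, not real keys):
#
#   consumer_key;consumer_private;access_token;token_secret
#   XXXXXXXX;XXXXXXXX;XXXXXXXX;XXXXXXXX
# ------------------------------------------------------------------------------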
### Collecting Tweets
twitter_tw_dirty <- searchTwitter(searchString = "ilovefs",
                                  since = "2018-02-10",
                                  until = "2018-12-31",
                                  n = 9999,
                                  resultType = "recent")
### strip off retweets
twitter_tw <- strip_retweets(tweets = twitter_tw_dirty,
                             strip_manual = FALSE,
                             strip_mt = FALSE)
### Extract relevant data from dataset
twitter_timedate <- c()
twitter_client <- c()
twitter_name <- c()
twitter_rts <- c()
twitter_fav <- c()
twitter_url <- c()
twitter_txt <- c()
twitter_usr <- c()
for(i in 1:length(twitter_tw)){
  #### Time of tweet
  if(length(twitter_tw[[i]]$created) > 0){
    twitter_timedate[i] <- as.character(twitter_tw[[i]]$created)
  } else {
    # insert empty value, if it does not exist
    twitter_timedate[i] <- NA
  }
  #### Client used
  if(length(twitter_tw[[i]]$statusSource) > 0){
    twitter_client[i] <- as.character(twitter_tw[[i]]$statusSource)
  } else {
    # insert empty value, if it does not exist
    twitter_client[i] <- NA
  }
  #### Screen names / Twitter Handles
  if(length(twitter_tw[[i]]$screenName) > 0){
    twitter_name[i] <- as.character(twitter_tw[[i]]$screenName)
  } else {
    # insert empty value, if it does not exist
    twitter_name[i] <- NA
  }
  #### Number of retweets
  if(length(twitter_tw[[i]]$retweetCount) > 0){
    twitter_rts[i] <- as.character(twitter_tw[[i]]$retweetCount)
  } else {
    # insert empty value, if it does not exist
    twitter_rts[i] <- NA
  }
  #### Number of favorites
  if(length(twitter_tw[[i]]$favoriteCount) > 0){
    twitter_fav[i] <- as.character(twitter_tw[[i]]$favoriteCount)
  } else {
    # insert empty value, if it does not exist
    twitter_fav[i] <- NA
  }
  #### URLs posted about
  if(length(twitter_tw[[i]]$urls$expanded_url) > 0){
    twitter_url[i] <- as.character(twitter_tw[[i]]$urls$expanded_url)
  } else {
    # insert empty value, if it does not exist
    twitter_url[i] <- NA
  }
  #### actual tweet/text
  if(length(twitter_tw[[i]]$text) > 0){
    twitter_txt[i] <- as.character(twitter_tw[[i]]$text)
  } else {
    # insert empty value, if it does not exist
    twitter_txt[i] <- NA
  }
  #### poster of tweet
  if(length(twitter_tw[[i]]$screenName) > 0){
    twitter_usr[i] <- as.character(twitter_tw[[i]]$screenName)
  } else {
    # insert empty value, if it does not exist
    twitter_usr[i] <- NA
  }
}
### Removing HTML-Tags from Client-info
twitter_client <- sub(pattern = ".*\">", replace = "", x = twitter_client)
twitter_client <- sub(pattern = "</a>", replace = "", x = twitter_client)
### Forming variables for dataframe
time <- sub(pattern = ".* ", x = twitter_timedate, replace = "")
time <- gsub(pattern = ":", x = time, replace = "")
date <- sub(pattern = " .*", x = twitter_timedate, replace = "")
date <- gsub(pattern = "-", x = date, replace = "")
fdat <- strptime(x = paste0(date, time), format = "%Y%m%d%H%M%S", tz = "CET")
fdat <- as.character(fdat)
retw <- as.factor(twitter_rts)
favs <- as.factor(twitter_fav)
link <- as.character(twitter_url)
text <- as.character(twitter_txt)
clnt <- as.character(twitter_client)
user <- as.character(twitter_usr)
### Creating dataframe
twitter <- data.frame(cbind(date, time, fdat, retw, favs, text, link, clnt, user))
#### Clean-Up
rm(list = c("date", "time", "fdat", "retw", "favs", "text", "link", "clnt", "user"))
twitter <- within(data = twitter, expr = {
  date <- as.character(date);
  time <- as.character(time);
  fdat <- as.character(fdat);
  text <- as.character(text);
  link <- as.character(link);
})
# }}}
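# Optional: a quick look at the collected tweets, e.g. str(twitter) or
# head(twitter$text), helps verify that the search actually returned data
# before continuing with the Mastodon and Reddit collectors.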
## Mastodon Collector with curl {{{ ----
mastodon.fetchdata <- function(data){
  tmp_datetime <- c()
  tmp_lang <- c()
  tmp_inst <- c()
  tmp_link <- c()
  tmp_text <- c()
  tmp_reto <- c()
  tmp_favs <- c()
  tmp_murl <- c()
  tmp_acct <- c()
  for(i in 1:length(data)){
    #### Time and Date of Toot
    if(length(data[[i]]$created_at) > 0){
      tmp_datetime[i] <- data[[i]]$created_at
    } else {
      # insert empty value, if it does not exist
      tmp_datetime[i] <- NA
    }
    #### Language of Toot
    if(length(data[[i]]$language) > 0){
      tmp_lang[i] <- data[[i]]$language
    } else {
      # insert empty value, if it does not exist
      tmp_lang[i] <- NA
    }
    #### Instance of Toot
    if(length(data[[i]]$uri) > 0){
      tmp_inst[i] <- data[[i]]$uri
    } else {
      # insert empty value, if it does not exist
      tmp_inst[i] <- NA
    }
    #### URL of Toot
    if(length(data[[i]]$url) > 0){
      tmp_link[i] <- data[[i]]$url
    } else {
      # insert empty value, if it does not exist
      tmp_link[i] <- NA
    }
    #### Text/Content of Toot
    if(length(data[[i]]$content) > 0){
      tmp_text[i] <- data[[i]]$content
    } else {
      # insert empty value, if it does not exist
      tmp_text[i] <- NA
    }
    #### Number of Retoots
    if(length(data[[i]]$reblogs_count) > 0){
      tmp_reto[i] <- data[[i]]$reblogs_count
    } else {
      # insert empty value, if it does not exist
      tmp_reto[i] <- NA
    }
    #### Number of Favorites
    if(length(data[[i]]$favourites_count) > 0){
      tmp_favs[i] <- data[[i]]$favourites_count
    } else {
      # insert empty value, if it does not exist
      tmp_favs[i] <- NA
    }
    #### URL of first Media Attachment
    if(length(data[[i]]$media_attachments) > 0){
      tmp_murl[i] <- data[[i]]$media_attachments[[1]]$url
    } else {
      # insert empty value, if it does not exist
      tmp_murl[i] <- NA
    }
    #### Account of Tooter
    if(length(data[[i]]$account) > 0){
      tmp_acct[i] <- data[[i]]$account$acct
    } else {
      # insert empty value, if it does not exist
      tmp_acct[i] <- NA
    }
  }
  return(data.frame(cbind(tmp_datetime,
                          tmp_lang,
                          tmp_inst,
                          tmp_text,
                          tmp_link,
                          tmp_reto,
                          tmp_favs,
                          tmp_murl,
                          tmp_acct)))
}
datetime <- c()
lang <- c()
inst <- c()
link <- c()
text <- c()
reto <- c()
favs <- c()
murl <- c()
acct <- c()
mastodon_instance <- "https://mastodon.social"
mastodon_hashtag <- "ilovefs"
mastodon_url <- paste0(mastodon_instance,
                       "/api/v1/timelines/tag/",
                       mastodon_hashtag,
                       "?limit=40")
for(i in 1:999){
  mastodon_reqres <- curl_fetch_memory(mastodon_url)
  mastodon_rawjson <- rawToChar(mastodon_reqres$content)
  toots <- fromJSON(mastodon_rawjson)
  if(length(toots) > 0){
    tmp_mastodon_df <- mastodon.fetchdata(data = toots)
    datetime <- c(datetime, as.character(tmp_mastodon_df$tmp_datetime))
    lang <- c(lang, as.character(tmp_mastodon_df$tmp_lang))
    inst <- c(inst, as.character(tmp_mastodon_df$tmp_inst))
    link <- c(link, as.character(tmp_mastodon_df$tmp_link))
    text <- c(text, as.character(tmp_mastodon_df$tmp_text))
    reto <- c(reto, as.character(tmp_mastodon_df$tmp_reto))
    favs <- c(favs, as.character(tmp_mastodon_df$tmp_favs))
    murl <- c(murl, as.character(tmp_mastodon_df$tmp_murl))
    acct <- c(acct, as.character(tmp_mastodon_df$tmp_acct))
  } else {
    break
  }
  # Update the URL for the next iteration of the for loop so we can download
  # the next toots (see the note on the header index below this loop).
  mastodon_lheader <- parse_headers(mastodon_reqres$headers)[11]
  mastodon_next <- sub(x = mastodon_lheader, pattern = ".*link: <", replace = "")
  mastodon_url <- sub(x = mastodon_next, pattern = ">; rel=\"next\".*", replace = "")
}
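# Note on the hard-coded index: parse_headers(...)[11] above assumes the "link"
# header is always the 11th header line the instance returns, which is fragile.
# An untested alternative sketch that looks the header up by name instead:
#   mastodon_headers <- parse_headers(mastodon_reqres$headers)
#   mastodon_lheader <- mastodon_headers[grep("^link:", mastodon_headers,
#                                             ignore.case = TRUE)]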
### Time of post
#### date
date <- sub(pattern = "T.*", x = datetime, replacement = "")
date <- gsub(pattern = "-", x = date, replacement = "")
#### time
time <- sub(pattern = ".*T", x = datetime, replacement = "")
time <- sub(pattern = "\\..*", x = time, replacement = "")
time <- gsub(pattern = ":", x = time, replacement = "")
#### full time
fdat <- strptime(x = paste0(date, time), format = "%Y%m%d%H%M%S", tz = "CET")
fdat <- as.character(fdat)
### Removing HTML-Tags and non-breaking spaces from Toots
text <- gsub(pattern = "<.*?>", x = text, replacement = "")
text <- gsub(pattern = "\u00a0", x = text, replacement = "")
### Cleaning Instance-String
#### GNU social
inst <- sub(pattern = "tag:", x = inst, replacement = "")
inst <- sub(pattern = ",\\d+.*", x = inst, replacement = "")
#### Mastodon
inst <- sub(pattern = "https:\\/\\/", x = inst, replacement = "")
inst <- sub(pattern = "\\/.*", x = inst, replacement = "")
### Only include toots posted on or after the event date (2018-02-10)
mastodon_exclude <- which(as.numeric(date) < 20180210)
# only subset if there is anything to exclude; x[-integer(0)] would drop every element
if(length(mastodon_exclude) > 0){
  date <- date[-mastodon_exclude]
  time <- time[-mastodon_exclude]
  fdat <- fdat[-mastodon_exclude]
  lang <- lang[-mastodon_exclude]
  inst <- inst[-mastodon_exclude]
  text <- text[-mastodon_exclude]
  link <- link[-mastodon_exclude]
  reto <- reto[-mastodon_exclude]
  favs <- favs[-mastodon_exclude]
  murl <- murl[-mastodon_exclude]
  acct <- acct[-mastodon_exclude]
}
### Creating dataframe
mastodon <- data.frame(cbind(date, time, fdat, lang, inst, text, link, reto, favs, murl, acct))
#### Clean-Up
rm(list = c("date", "time", "fdat", "lang", "inst", "text", "link", "favs", "reto", "murl", "datetime", "acct"))
mastodon <- within(data = mastodon, expr = {
  date <- as.character(date);
  time <- as.character(time);
  fdat <- as.character(fdat);
  text <- as.character(text);
  link <- as.character(link);
  murl <- as.character(murl);
})
# }}}
## Reddit Collector {{{ ----
### Authentication at Reddit
# no authentication necessary, hence we can directly start scraping
### Get posts on Reddit
reddit_post_dirty <- reddit_urls(search_terms = "ilovefs",
                                 #subreddit = "freesoftware linux opensource",
                                 cn_threshold = 0,
                                 page_threshold = 99999,
                                 sort_by = "new",
                                 wait_time = 5)
### Only use posts from the current year
reddit_searchinyear <- 18 # has to have format "YY", e.g. "18" for "2018"
reddit_post_year <- gsub(x = reddit_post_dirty$date,
                         pattern = "\\d.-\\d.-",
                         replace = "")
reddit_post <- reddit_post_dirty[which(reddit_post_year == reddit_searchinyear),]
### Extracting relevant variables
comt <- c() # Comments / Replies
subr <- c() # Subreddit
ptns <- c() # Points / Score
ttle <- c() # Title
text <- c() # Text / Content
link <- c() # Linked to Website
date <- c() # Date
rurl <- c() # Reddit-URL of post
acct <- c() # Author of Post
for(i in c(1:length(reddit_post$URL))){
  comt[i] <- reddit_post$num_comments[i]
  ttle[i] <- reddit_post$title[i]
  rurl[i] <- reddit_post$URL[i]
  date[i] <- gsub(x = reddit_post$date[i], pattern = "-", replace = "")
  subr[i] <- reddit_post$subreddit[i]
  Sys.sleep(2)
  # store the result under a name that does not shadow reddit_content()
  reddit_cont <- reddit_content(URL = reddit_post$URL[i], wait_time = 0)
  ptns[i] <- reddit_cont$post_score[1]
  text[i] <- reddit_cont$post_text[1]
  link[i] <- reddit_cont$link[1]
  acct[i] <- reddit_cont$author[1]
}
### Creating dataframe
reddit <- data.frame(cbind(date, rurl, link, text, ttle, ptns, subr, comt, acct))
#### Clean-Up
rm(list = c("date", "rurl", "link", "text", "ttle", "ptns", "subr", "comt", "acct"))
reddit <- within(data = reddit, expr = {
  date <- as.character(date);
  rurl <- as.character(rurl);
  link <- as.character(link);
  text <- as.character(text);
  ttle <- as.character(ttle);
  ptns <- as.numeric(as.character(ptns));
  subr <- as.character(subr);
  comt <- as.numeric(as.character(comt));
})
# }}}
### Exporting data {{{ ----
time_of_saving <- sub(x = Sys.time(), pattern = " CET", replace = "")
time_of_saving <- sub(x = time_of_saving, pattern = " ", replace = "_")
time_of_saving <- gsub(x = time_of_saving, pattern = ":", replace = "-")
#### RData
save_path <- paste0("./data/ilovefs-all_", time_of_saving, ".RData")
save(list = c("twitter", "mastodon", "reddit"), file = save_path)
#### Text
##### Fediverse
save_path_fed_t <- paste0("./data/ilovefs-fediverse_", time_of_saving, ".txt")
write.table(mastodon, file = save_path_fed_t)
##### Twitter
save_path_twitter_t <- paste0("./data/ilovefs-twitter_", time_of_saving, ".txt")
write.table(twitter, file = save_path_twitter_t)
##### Reddit
save_path_reddit_t <- paste0("./data/ilovefs-reddit_", time_of_saving, ".txt")
write.table(reddit, file = save_path_reddit_t)
#### CSV
##### Fediverse
save_path_fed_c <- paste0("./data/ilovefs-fediverse_", time_of_saving, ".csv")
write.csv(mastodon, file = save_path_fed_c)
##### Twitter
save_path_twitter_c <- paste0("./data/ilovefs-twitter_", time_of_saving, ".csv")
write.csv(twitter, file = save_path_twitter_c)
##### Reddit
save_path_reddit_c <- paste0("./data/ilovefs-reddit_", time_of_saving, ".csv")
write.csv(reddit, file = save_path_reddit_c)
# }}}
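# Usage note (sketch with a hypothetical file name): the .RData snapshot written
# above can be loaded back into a later analysis session with
#   load("./data/ilovefs-all_2018-02-14_12-00-00.RData")
# which restores the "twitter", "mastodon" and "reddit" data.frames.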