Collecting, analyzing and presenting data about participation in the #ilovefs day

collecto.R

################################################################################
# collecto.R                                                                   #
# Collecting data from different social media sources with a specified        #
# searchterm/criteria                                                          #
# Jan Weymeirsch janwey@fsfe.org                                               #
################################################################################
## Loading Packages {{{ ----
#### Twitter
install.packages("twitteR")
library("twitteR")
# had to install "httr" via the package manager
#### Fediverse (eg: Mastodon)
install.packages("devtools")
# requires libssl-dev
devtools::install_github("ThomasChln/mastodon", force = TRUE)
library("mastodon")
#### Reddit
install.packages("RedditExtractoR")
library("RedditExtractoR")
# }}}
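# Note --------------------------------------------------------------------------
# The "twitteR" package is deprecated; its maintainers recommend "rtweet" as
# the successor. A minimal sketch of the same search with rtweet (an
# alternative only, not used below; rtweet needs its own app credentials):
#   install.packages("rtweet")
#   library("rtweet")
#   tweets <- search_tweets(q = "ilovefs", n = 100, include_rts = FALSE)
# ------------------------------------------------------------------------------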
## Twitter Collector {{{ ----
### Authenticate to Twitter
#### Manual input (uncomment if needed)
#twitter_consumerkey <- readline("[Twitter] Enter your consumer API key.")
#twitter_consumerpri <- readline("[Twitter] Enter your consumer API secret.")
#twitter_tokenaccess <- readline("[Twitter] Enter your Access Token.")
#twitter_tokensecret <- readline("[Twitter] Enter your Token Secret.")
#### Saved credentials
twitter_api_cred <- read.table(file = "./twitter_api.txt", header = TRUE, sep = ";")
twitter_consumerkey <- as.character(twitter_api_cred$consumer_key)
twitter_consumerpri <- as.character(twitter_api_cred$consumer_private)
twitter_tokenaccess <- as.character(twitter_api_cred$access_token)
twitter_tokensecret <- as.character(twitter_api_cred$token_secret)
setup_twitter_oauth(consumer_key    = twitter_consumerkey,
                    consumer_secret = twitter_consumerpri,
                    access_token    = twitter_tokenaccess,
                    access_secret   = twitter_tokensecret)
# Note --------------------------------------------------------------------------
# Please refer to the documentation on where to receive your API credentials.
# ------------------------------------------------------------------------------
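# The credential file "./twitter_api.txt" is expected to contain a header row
# and a single semicolon-separated data row matching the column names used
# above (values below are placeholders):
#   consumer_key;consumer_private;access_token;token_secret
#   XXXXXXXX;XXXXXXXX;XXXXXXXX;XXXXXXXX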
### Collecting Tweets
twitter_tw_dirty <- searchTwitter(searchString = "ilovefs",
                                  since        = "2018-01-01",
                                  until        = "2018-12-31",
                                  n            = 100,
                                  resultType   = "recent")
### Strip off retweets
twitter_tw <- strip_retweets(tweets       = twitter_tw_dirty,
                             strip_manual = FALSE,
                             strip_mt     = FALSE)
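# Note: with strip_manual and strip_mt set to FALSE, only API-native retweets
# are removed; setting them to TRUE would also drop manual "RT @..." / "MT"
# style retweets, at the risk of catching genuine quotes.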
### Extract relevant data from dataset
twitter_timedate <- c()
twitter_client <- c()
twitter_name <- c()
twitter_rts <- c()
twitter_fav <- c()
twitter_url <- c()
twitter_txt <- c()
for(i in 1:length(twitter_tw)){
  #### Time of tweet
  if(length(twitter_tw[[i]]$created) > 0){
    twitter_timedate[i] <- as.character(twitter_tw[[i]]$created)
  } else {
    # insert empty value, if it does not exist
    twitter_timedate[i] <- NA
  }
  #### Client used
  if(length(twitter_tw[[i]]$statusSource) > 0){
    twitter_client[i] <- as.character(twitter_tw[[i]]$statusSource)
  } else {
    # insert empty value, if it does not exist
    twitter_client[i] <- NA
  }
  #### Screen names / Twitter handles
  if(length(twitter_tw[[i]]$screenName) > 0){
    twitter_name[i] <- as.character(twitter_tw[[i]]$screenName)
  } else {
    # insert empty value, if it does not exist
    twitter_name[i] <- NA
  }
  #### Number of retweets
  if(length(twitter_tw[[i]]$retweetCount) > 0){
    twitter_rts[i] <- as.character(twitter_tw[[i]]$retweetCount)
  } else {
    # insert empty value, if it does not exist
    twitter_rts[i] <- NA
  }
  #### Number of favorites
  if(length(twitter_tw[[i]]$favoriteCount) > 0){
    twitter_fav[i] <- as.character(twitter_tw[[i]]$favoriteCount)
  } else {
    # insert empty value, if it does not exist
    twitter_fav[i] <- NA
  }
  #### URLs posted about (keep only the first URL, if several are attached)
  if(length(twitter_tw[[i]]$urls$expanded_url) > 0){
    twitter_url[i] <- as.character(twitter_tw[[i]]$urls$expanded_url[1])
  } else {
    # insert empty value, if it does not exist
    twitter_url[i] <- NA
  }
  #### Actual tweet/text
  if(length(twitter_tw[[i]]$text) > 0){
    twitter_txt[i] <- as.character(twitter_tw[[i]]$text)
  } else {
    # insert empty value, if it does not exist
    twitter_txt[i] <- NA
  }
}
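# Note --------------------------------------------------------------------------
# twitteR also ships twListToDF(), which flattens a list of status objects
# into a data frame in one call. A sketch of that shortcut (column names then
# follow the status fields, so the cleanup below would need adjusting):
#   twitter_df <- twListToDF(twitter_tw)
# ------------------------------------------------------------------------------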
### Removing HTML tags from client info
twitter_client <- sub(pattern = ".*\">", replacement = "", x = twitter_client)
twitter_client <- sub(pattern = "</a>", replacement = "", x = twitter_client)
### Forming variables for dataframe
time <- sub(pattern = ".* ", x = twitter_timedate, replacement = "")
time <- as.numeric(gsub(pattern = ":", x = time, replacement = ""))
date <- sub(pattern = " .*", x = twitter_timedate, replacement = "")
date <- as.numeric(gsub(pattern = "-", x = date, replacement = ""))
retw <- as.factor(twitter_rts)
favs <- as.factor(twitter_fav)
link <- as.character(twitter_url)
text <- as.character(twitter_txt)
clnt <- as.character(twitter_client)
### Creating dataframe
twitter <- data.frame(cbind(date, time, retw, favs, text, link, clnt))
#### Clean-up
rm(list = c("date", "time", "retw", "favs", "text", "link", "clnt"))
twitter <- within(data = twitter, expr = {
  date <- as.numeric(as.character(date));
  time <- as.numeric(as.character(time));
  retw <- as.numeric(as.character(retw));  # counts back to numbers
  favs <- as.numeric(as.character(favs));  # (cbind() coerced every column to text)
  text <- as.character(text);
  link <- as.character(link);
})
# }}}
## Mastodon Collector {{{ ----
### Authenticate to the Fediverse (here: Mastodon)
#### Manual input (uncomment if needed)
#mastodon_auth_insta <- readline("[Mastodon] Enter your instance URL.")
#mastodon_auth_login <- readline("[Mastodon] Enter your registered mail.")
#mastodon_auth_passw <- readline("[Mastodon] Enter your password.")
#### Saved credentials
mastodon_api_cred <- read.table(file = "./fediverse_mastodon_api.txt", header = TRUE, sep = ";")
mastodon_auth_insta <- as.character(mastodon_api_cred$instance)
mastodon_auth_login <- as.character(mastodon_api_cred$mail)
mastodon_auth_passw <- as.character(mastodon_api_cred$password)
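# The credential file "./fediverse_mastodon_api.txt" follows the same layout
# as the Twitter one: a header row and one semicolon-separated data row
# (values below are placeholders):
#   instance;mail;password
#   https://mastodon.example;user@example.org;XXXXXXXX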
#### Authentication process
mastodon_auth <- mastodon::login(instance = mastodon_auth_insta,
                                 user     = mastodon_auth_login,
                                 pass     = mastodon_auth_passw)
### Get posts from Mastodon
mastodon_toot <- mastodon::get_hashtag(token   = mastodon_auth,
                                       hashtag = "ilovefs",
                                       local   = FALSE,
                                       n       = 20)
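# Note --------------------------------------------------------------------------
# The positional indices below ([[2]], [[7]], ...) depend on the column order
# of the list returned by mastodon::get_hashtag(), which appears to mirror
# the Mastodon API status object (id, created_at, ..., visibility, language,
# uri, content, url, ...). If the returned list carries names, indexing by
# name would be more robust, eg (assumption, untested):
#   mastodon_priv <- mastodon_toot[["visibility"]]
# ------------------------------------------------------------------------------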
### Public and non-public posts
mastodon_priv <- mastodon_toot[[7]]
### Time of post
#### date (as numeric value)
mastodon_date <- sub(pattern = "T.*", x = mastodon_toot[[2]], replacement = "")
mastodon_date <- gsub(pattern = "-", x = mastodon_date, replacement = "")
mastodon_date <- as.numeric(mastodon_date)
#### time (as numeric value)
mastodon_time <- sub(pattern = ".*T", x = mastodon_toot[[2]], replacement = "")
mastodon_time <- sub(pattern = "\\..*", x = mastodon_time, replacement = "")
mastodon_time <- gsub(pattern = ":", x = mastodon_time, replacement = "")
mastodon_time <- as.numeric(mastodon_time)
### Language of post
mastodon_lang <- mastodon_toot[[8]]
### Instance of post
mastodon_insta <- sub(pattern = "tag:", x = mastodon_toot[[9]], replacement = "")
mastodon_insta <- sub(pattern = ",\\d+.*", x = mastodon_insta, replacement = "")
#### in case the instance name is a full URL
mastodon_insta <- sub(pattern = ".*://", x = mastodon_insta, replacement = "")
mastodon_insta <- sub(pattern = "/.*", x = mastodon_insta, replacement = "")
### Text of post
#### strip HTML tags and decode residual "&nbsp;" entities
mastodon_txt <- gsub(pattern = "<.*?>", x = mastodon_toot[[10]], replacement = "")
mastodon_txt <- gsub(pattern = "&nbsp;", x = mastodon_txt, replacement = " ")
### URL of post
mastodon_url <- mastodon_toot[[11]]
### Favorites of posts
mastodon_fav <- mastodon_toot[[13]]
### Information about posters
mastodon_pers <- mastodon_toot[[19]]
mastodon_bot <- c()
for(i in 1:length(mastodon_pers)){
  if(mastodon_pers[[i]]$username == "TrendingBot"){
    mastodon_bot[i] <- TRUE
  } else {
    mastodon_bot[i] <- FALSE
  }
}
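# A sketch for filtering several known bot accounts at once (the vector of
# names is an assumption, extend it as needed):
#   mastodon_botnames <- c("TrendingBot")
#   mastodon_bot <- sapply(mastodon_pers,
#                          function(p) p$username %in% mastodon_botnames)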
### Images of post
mastodon_img <- c()
for(i in 1:length(mastodon_toot[[20]])){
  mastodon_img[i] <- length(mastodon_toot[[20]][[i]])
}
### Cleaning data (removal of excluded posts)
mastodon_exclude <- unique(c(which(mastodon_bot),
                             which(mastodon_date < 20180101),
                             which(mastodon_priv != "public")))
#### logical indexing; plain "x[-mastodon_exclude]" would drop everything
#### whenever the exclusion list happens to be empty
mastodon_keep <- !(seq_along(mastodon_date) %in% mastodon_exclude)
date <- mastodon_date[mastodon_keep]
time <- mastodon_time[mastodon_keep]
lang <- mastodon_lang[mastodon_keep]
inst <- mastodon_insta[mastodon_keep]
text <- mastodon_txt[mastodon_keep]
link <- mastodon_url[mastodon_keep]
favs <- mastodon_fav[mastodon_keep]
imag <- mastodon_img[mastodon_keep]
### Creating dataframe
mastodon <- data.frame(cbind(date, time, lang, inst, text, link, favs, imag))
#### Clean-up
rm(list = c("date", "time", "lang", "inst", "text", "link", "favs", "imag",
            "mastodon_keep"))
mastodon <- within(data = mastodon, expr = {
  date <- as.numeric(as.character(date));
  time <- as.numeric(as.character(time));
  favs <- as.numeric(as.character(favs));  # counts back to numbers after cbind()
  imag <- as.numeric(as.character(imag));
  text <- as.character(text);
  link <- as.character(link);
})
# }}}
## Reddit Collector {{{ ----
### Authentication at Reddit
# no authentication necessary, hence we can directly start scraping
### Get posts on Reddit
reddit_post_dirty <- reddit_urls(search_terms   = "ilovefs",
                                 #subreddit     = "freesoftware linux opensource",
                                 cn_threshold   = 0,
                                 page_threshold = 99999,
                                 sort_by        = "new",
                                 wait_time      = 5)
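# Note: depending on the package version, RedditExtractoR also offers
# get_reddit(), which combines the URL search and the per-post content
# retrieval below in a single call; the explicit loop is kept here for finer
# control over request throttling.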
### Only use posts from the current year
reddit_searchinyear <- 18 # has to have format "YY", eg "18" for "2018"
reddit_post_year <- gsub(x = reddit_post_dirty$date,
                         pattern = "\\d.-\\d.-",
                         replacement = "")
reddit_post <- reddit_post_dirty[which(reddit_post_year == reddit_searchinyear),]
### Extracting relevant variables
comt <- c() # Comments / Replies
subr <- c() # Subreddit
ptns <- c() # Points / Score
ttle <- c() # Title
text <- c() # Text / Content
link <- c() # Linked to website
date <- c() # Date
rurl <- c() # Reddit-URL of post
for(i in seq_along(reddit_post$URL)){
  comt[i] <- reddit_post$num_comments[i]
  ttle[i] <- reddit_post$title[i]
  rurl[i] <- reddit_post$URL[i]
  date[i] <- gsub(x = reddit_post$date[i], pattern = "-", replacement = "")
  subr[i] <- reddit_post$subreddit[i]
  Sys.sleep(2)
  # avoid re-binding the name of the reddit_content() function itself
  reddit_thread <- reddit_content(URL = reddit_post$URL[i], wait_time = 0)
  ptns[i] <- reddit_thread$post_score[1]
  text[i] <- reddit_thread$post_text[1]
  link[i] <- reddit_thread$link[1]
}
### Creating dataframe
reddit <- data.frame(cbind(date, rurl, link, text, ttle, ptns, subr, comt))
#### Clean-up
rm(list = c("date", "rurl", "link", "text", "ttle", "ptns", "subr", "comt"))
reddit <- within(data = reddit, expr = {
  date <- as.numeric(as.character(date));
  rurl <- as.character(rurl);
  link <- as.character(link);
  text <- as.character(text);
  ttle <- as.character(ttle);
  ptns <- as.numeric(as.character(ptns));
  subr <- as.character(subr);
  comt <- as.numeric(as.character(comt));
})
# }}}
## Exporting data {{{ ----
#### timestamp for the filenames, eg "2018-02-14_12-00-00" (works in any timezone)
time_of_saving <- format(Sys.time(), format = "%Y-%m-%d_%H-%M-%S")
#### RData
save_path <- paste0("./data/ilovefs-all_", time_of_saving, ".RData")
save(list = c("twitter", "mastodon", "reddit"), file = save_path)
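# The .RData archive can later be restored into a fresh session, eg:
#   load("./data/ilovefs-all_2018-02-14_12-00-00.RData")  # example filename
# which brings back the "twitter", "mastodon" and "reddit" dataframes.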
#### Text
##### Fediverse
save_path_fed_t <- paste0("./data/ilovefs-fediverse_", time_of_saving, ".txt")
write.table(mastodon, file = save_path_fed_t)
##### Twitter
save_path_twitter_t <- paste0("./data/ilovefs-twitter_", time_of_saving, ".txt")
write.table(twitter, file = save_path_twitter_t)
##### Reddit
save_path_reddit_t <- paste0("./data/ilovefs-reddit_", time_of_saving, ".txt")
write.table(reddit, file = save_path_reddit_t)
#### CSV
##### Fediverse
save_path_fed_c <- paste0("./data/ilovefs-fediverse_", time_of_saving, ".csv")
write.csv(mastodon, file = save_path_fed_c)
##### Twitter
save_path_twitter_c <- paste0("./data/ilovefs-twitter_", time_of_saving, ".csv")
write.csv(twitter, file = save_path_twitter_c)
##### Reddit
save_path_reddit_c <- paste0("./data/ilovefs-reddit_", time_of_saving, ".csv")
write.csv(reddit, file = save_path_reddit_c)
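# Note: write.csv() includes row names by default; add row.names = FALSE if
# the CSV files are meant for spreadsheet import.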
# }}}