Collecting, Analyzing and Presenting data about the participation in #ilovefs day
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

collecto.R 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332
  1. ################################################################################
  2. # collecto.R #
  3. # Collecting data from different social media sources with a specified #
  4. # searchterm/criteria #
  5. # Jan Weymeirsch janwey@fsfe.org #
  6. ################################################################################
  ### Loading Packages {{{ ----
  # Each package is installed only when it is not already available, so that
  # re-running the script does not re-download and re-install everything.
  #### Twitter
  if (!requireNamespace("twitteR", quietly = TRUE)) {
    install.packages("twitteR")
  }
  library("twitteR")
  # had to install "httr" via packagemanager
  #### Facebook
  if (!requireNamespace("Rfacebook", quietly = TRUE)) {
    install.packages("Rfacebook")
  }
  library("Rfacebook")
  #### Fediverse (eg: mastodon)
  # The mastodon package is installed from GitHub, which needs devtools.
  if (!requireNamespace("mastodon", quietly = TRUE)) {
    if (!requireNamespace("devtools", quietly = TRUE)) {
      install.packages("devtools")
    }
    # requires libssl-dev
    devtools::install_github("ThomasChln/mastodon")
  }
  library("mastodon")
  # }}}
  ## Twitter Collector {{{ ----
  ### Authenticate to Twitter
  # Note -------------------------------------------------------------------------
  # Twitter API access credentials can be obtained at
  # https://apps.twitter.com/
  # ------------------------------------------------------------------------------
  #### Manual input (uncomment if needed)
  #twitter_consumerkey <- readline("[Twitter] Enter your consumer API key.")
  #twitter_consumerpri <- readline("[Twitter] Enter your consumer API secret.")
  #twitter_tokenaccess <- readline("[Twitter] Enter your Access Token.")
  #twitter_tokensecret <- readline("[Twitter] Enter your Token Secret.")
  #### Saved credentials
  # "./twitter_api.txt" is parsed as a ";"-separated table with a header row.
  # The columns consumer_key, consumer_private, access_token and token_secret
  # are read from it and converted to character.
  twitter_api_cred <- read.table("./twitter_api.txt", sep = ";", header = TRUE)
  twitter_tokensecret <- as.character(twitter_api_cred$token_secret)
  twitter_tokenaccess <- as.character(twitter_api_cred$access_token)
  twitter_consumerpri <- as.character(twitter_api_cred$consumer_private)
  twitter_consumerkey <- as.character(twitter_api_cred$consumer_key)
  # Hand the four credentials to twitteR.
  setup_twitter_oauth(
    access_secret = twitter_tokensecret,
    access_token = twitter_tokenaccess,
    consumer_secret = twitter_consumerpri,
    consumer_key = twitter_consumerkey
  )
  ### Collecting Tweets
  # Up to 100 recent tweets matching "ilovefs" within the 2018 date window.
  # The first argument of searchTwitter() is `searchString`; it is spelled out
  # here instead of relying on partial matching of `search`.
  twitter_tw_dirty <- searchTwitter(
    searchString = "ilovefs",
    since = "2018-01-01",
    until = "2018-12-31",
    n = 100,
    resultType = "recent"
  )
  ### strip off retweets
  # strip_manual = TRUE, strip_mt = FALSE: same flags as before.
  twitter_tw <- strip_retweets(
    tweets = twitter_tw_dirty,
    strip_manual = TRUE,
    strip_mt = FALSE
  )
  ### Extract one character value per tweet
  # twitter_field() applies `getter` to every element of `tweets` and returns a
  # character vector of the same length. A tweet for which the getter yields
  # nothing (length 0) contributes NA; if it yields several values, only the
  # first one is kept. An empty `tweets` list gives character(0) instead of
  # failing, which the previous `1:length(...)` loops did.
  twitter_field <- function(tweets, getter) {
    vapply(
      tweets,
      function(tweet) {
        value <- getter(tweet)
        if (length(value) > 0) {
          as.character(value)[1]
        } else {
          # insert empty value, if it does not exist
          NA_character_
        }
      },
      character(1),
      USE.NAMES = FALSE
    )
  }
  ### Time of tweet
  twitter_timedate <- twitter_field(twitter_tw, function(tw) tw$created)
  ### Client used
  twitter_client <- twitter_field(twitter_tw, function(tw) tw$statusSource)
  # drop everything up to and including the last `">`, then the closing `</a>`
  twitter_client <- sub(pattern = ".*\">", replacement = "", x = twitter_client)
  twitter_client <- sub(pattern = "</a>", replacement = "", x = twitter_client)
  ### Screen names / Twitter Handles
  twitter_name <- twitter_field(twitter_tw, function(tw) tw$screenName)
  ### Number of retweets
  twitter_rts <- twitter_field(twitter_tw, function(tw) tw$retweetCount)
  ### Number of favorites
  twitter_fav <- twitter_field(twitter_tw, function(tw) tw$favoriteCount)
  ### URLs posted about
  twitter_url <- twitter_field(twitter_tw, function(tw) tw$urls$expanded_url)
  ### actual tweet/text
  twitter_txt <- twitter_field(twitter_tw, function(tw) tw$text)
  # Control output, uncomment if needed
  #twitter_timedate
  #twitter_client
  #twitter_name
  #twitter_rts
  #twitter_fav
  #twitter_url
  #twitter_txt
  ### Creating dataframe
  # The columns are built directly in data.frame(). The previous
  # data.frame(cbind(...)) construction passed the factors for `retw` and
  # `favs` through cbind(), which stores their internal level codes rather
  # than the actual counts.
  #   date: text before the first space of the timestamp, "-" removed, numeric
  #   time: text after the last space of the timestamp, ":" removed, numeric
  #   retw, favs: retweet / favorite counts as factors
  #   text, link: tweet text and posted URL as character
  twitter <- data.frame(
    date = as.numeric(gsub(
      pattern = "-", replacement = "",
      x = sub(pattern = " .*", replacement = "", x = twitter_timedate)
    )),
    time = as.numeric(gsub(
      pattern = ":", replacement = "",
      x = sub(pattern = ".* ", replacement = "", x = twitter_timedate)
    )),
    retw = as.factor(twitter_rts),
    favs = as.factor(twitter_fav),
    text = as.character(twitter_txt),
    link = as.character(twitter_url),
    stringsAsFactors = FALSE
  )
  # }}}
  ## Facebook Collector [WIP] {{{ ----
  ### Authenticate to Facebook
  #### Manual input (uncomment if needed)
  #facebook_app_id <- readline("[Facebook] Enter your App ID key.")
  #facebook_secret <- readline("[Facebook] Enter your App Secret.")
  #### Saved credentials
  # "./facebook_api.txt" is parsed as a ";"-separated table with a header row;
  # the columns app_id and app_secret are used.
  facebook_api_cred <- read.table(file = "./facebook_api.txt", header = TRUE, sep = ";")
  facebook_app_id <- as.character(facebook_api_cred$app_id)
  facebook_secret <- as.character(facebook_api_cred$app_secret)
  # Uses facebook_app_id as defined above (the former `facebook_api_id` was
  # never assigned anywhere in this script).
  facebook_auth <- fbOAuth(
    app_id = facebook_app_id,
    app_secret = facebook_secret
  )
  ### Get posts from FSFE
  # Both dates are written as YYYY-MM-DD; the former "2018-31-12" had day and
  # month swapped.
  facebook_fsfe_posts <- Rfacebook::getPage(
    page = "thefsfe",
    token = facebook_auth,
    since = "2018-01-01",
    until = "2018-12-31"
  )
  # }}}
  ## Mastodon Collector {{{ ----
  ### Authenticate to the Fediverse (here: Mastodon)
  # Note -------------------------------------------------------------------------
  # It is sub-optimal to use clear-text credentials for the authentication
  # process, but the mastodon-package does not (yet) support OAuth
  # ------------------------------------------------------------------------------
  #### Manual input (uncomment if needed)
  #mastodon_auth_insta <- readline("[Mastodon] Enter your Instance-URL.")
  #mastodon_auth_login <- readline("[Mastodon] Enter your registered mail.")
  #mastodon_auth_passw <- readline("[Mastodon] Enter your password.")
  #### Saved credentials
  # "./fediverse_mastodon_api.txt" is parsed as a ";"-separated table with a
  # header row; the columns instance, mail and password are used.
  mastodon_api_cred <- read.table(file = "./fediverse_mastodon_api.txt", header = TRUE, sep = ";")
  mastodon_auth_insta <- as.character(mastodon_api_cred$instance)
  mastodon_auth_login <- as.character(mastodon_api_cred$mail)
  mastodon_auth_passw <- as.character(mastodon_api_cred$password)
  #### Authentication process
  # Pass the mastodon_auth_* values read above. The names used before
  # (mastodon_insta, mastodon_login, mastodon_passw) are not defined at this
  # point of the script.
  mastodon_auth <- mastodon::login(
    instance = mastodon_auth_insta,
    user = mastodon_auth_login,
    pass = mastodon_auth_passw
  )
  ### Get posts from mastodon
  mastodon_toot <- mastodon::get_hashtag(
    token = mastodon_auth,
    hashtag = "ilovefs",
    local = FALSE,
    n = 100
  )
  # Note -------------------------------------------------------------------------
  # The package documentation is sparse, so the meaning of the positions in the
  # returned list() is a guess. Positions not listed here are unidentified:
  #    1. id
  #    2. time
  #    7. public/private
  #    8. language
  #    9. user-agent
  #   10. post-text (html)
  #   11. url of post
  #   13. favorites
  #   19. poster-information
  #   20. image in post
  #   22. information about searched hashtag
  # (the list has 23 positions in total)
  # ------------------------------------------------------------------------------
  ### Sort out non-public posts
  # Positions of all posts whose visibility (element 7) is not "public".
  mastodon_priv <- which(mastodon_toot[[7]] != "public")
  # Only when there is something to remove: drop those positions from every
  # element of the list, one element after the other.
  if (length(mastodon_priv) > 0) {
    for (field_idx in seq_along(mastodon_toot)) {
      mastodon_toot[[field_idx]] <- mastodon_toot[[field_idx]][-mastodon_priv]
    }
  }
  ### Time of post
  # Element 2 holds the timestamp; the part before "T" is used as date and the
  # part after "T" (without anything following a ".") as time.
  #### date (as numeric value)
  mastodon_date <- sub(pattern = "T.*", replacement = "", x = mastodon_toot[[2]])
  mastodon_date <- gsub(pattern = "-", replacement = "", x = mastodon_date)
  mastodon_date <- as.numeric(mastodon_date)
  #### time (as numeric value)
  mastodon_time <- sub(pattern = ".*T", replacement = "", x = mastodon_toot[[2]])
  mastodon_time <- sub(pattern = "\\..*", replacement = "", x = mastodon_time)
  mastodon_time <- gsub(pattern = ":", replacement = "", x = mastodon_time)
  mastodon_time <- as.numeric(mastodon_time)
  ### Language of post
  mastodon_lang <- mastodon_toot[[8]]
  ### Instance of post
  # Strip the leading "tag:" and everything from the first ",<digits>" onwards.
  mastodon_insta <- sub(pattern = "tag:", replacement = "", x = mastodon_toot[[9]])
  mastodon_insta <- sub(pattern = ",\\d+.*", replacement = "", x = mastodon_insta)
  #### in case the instance name is a full url
  mastodon_insta <- sub(pattern = ".*://", replacement = "", x = mastodon_insta)
  mastodon_insta <- sub(pattern = "/.*", replacement = "", x = mastodon_insta)
  ### Text of post
  #### exclude all HTML
  mastodon_txt <- gsub(pattern = "<.*?>", replacement = "", x = mastodon_toot[[10]])
  # NOTE(review): this removes every space from the post text, which glues all
  # words together. The pattern may have been altered when the file was
  # copied -- confirm the intended pattern against the upstream script.
  mastodon_txt <- gsub(pattern = " ", replacement = "", x = mastodon_txt)
  ### URL of post
  mastodon_url <- mastodon_toot[[11]]
  ### Favorites of posts
  mastodon_fav <- mastodon_toot[[13]]
  ### Information about posters
  mastodon_pers <- mastodon_toot[[19]]
  # TRUE for every poster whose username is exactly "TrendingBot". A missing
  # or NA username counts as FALSE, and an empty poster list gives logical(0);
  # the former `1:length(...)` loop failed in both situations.
  mastodon_bot <- vapply(
    mastodon_pers,
    function(person) identical(as.character(person$username), "TrendingBot"),
    logical(1),
    USE.NAMES = FALSE
  )
  ### images of post
  # Length of each entry of element 20, one value per entry.
  mastodon_img <- vapply(
    seq_along(mastodon_toot[[20]]),
    function(i) length(mastodon_toot[[20]][[i]]),
    integer(1)
  )
  ### Cleaning data (removal of excluded posts)
  # Posts to drop: those flagged in mastodon_bot and those dated before
  # 2018-01-01.
  mastodon_exclude <- c(which(mastodon_bot),
                        which(mastodon_date < 20180101))
  # A logical keep-mask is used instead of `x[-mastodon_exclude]`: with an
  # empty exclude vector the negative index selects nothing, so every post
  # would be dropped exactly when there is nothing to exclude.
  mastodon_keep <- !(seq_along(mastodon_date) %in% mastodon_exclude)
  ### Creating dataframe
  # cbind() yields one common-type matrix, as before; date, time, text and link
  # are converted back to their intended types below.
  mastodon <- data.frame(cbind(
    date = mastodon_date[mastodon_keep],
    time = mastodon_time[mastodon_keep],
    lang = mastodon_lang[mastodon_keep],
    inst = mastodon_insta[mastodon_keep],
    text = mastodon_txt[mastodon_keep],
    link = mastodon_url[mastodon_keep],
    favs = mastodon_fav[mastodon_keep],
    imag = mastodon_img[mastodon_keep]
  ))
  #### Clean-Up
  mastodon$date <- as.numeric(as.character(mastodon$date))
  mastodon$time <- as.numeric(as.character(mastodon$time))
  mastodon$text <- as.character(mastodon$text)
  mastodon$link <- as.character(mastodon$link)
  # }}}