Browse Source

Edit: implement changes by Vincent in a similar style to the rest of the code

janwey 1 year ago
parent
commit
0c5b89c921
1 changed files with 118 additions and 95 deletions
  1. 118
    95
      collecto.R

+ 118
- 95
collecto.R View File

@@ -12,10 +12,10 @@ library("twitteR")
12 12
 # had to install "httr" via packagemanager
13 13
 
14 14
 #### Fediverse (eg: mastodon)
15
-install.packages("devtools")
16
-# requires libssl-dev
17
-devtools::install_github("ThomasChln/mastodon", force = TRUE)
18
-library("mastodon")
15
+install.packages("curl")
16
+library("curl")
17
+install.packages("rjson")
18
+library(rjson)
19 19
 
20 20
 ### Reddit
21 21
 install.packages("RedditExtractoR")
@@ -156,121 +156,144 @@ twitter <- within(data = twitter, expr = {
156 156
 
157 157
 ## Mastodon Collector with curl {{{ ----
158 158
 
159
-library(curl)
160
-library(rjson)
161
-toot_raw_json <- rawToChar(curl_fetch_memory("https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40")$content)
162
-toots = fromJSON(toot_raw_json)
163
-toots_mat = matrix(ncol = 4, nrow = length(toots))
159
+mastodon_instance <- "https://mastodon.social"
160
+mastodon_hashtag <- "ilovefs"
161
+mastodon_apiurl <- paste0(mastodon_instance,
162
+			  "/api/v1/timelines/tag/",
163
+			  mastodon_hashtag,
164
+			  "?limit=40")
165
+mastodon_reqres <- curl_fetch_memory(mastodon_apiurl)
166
+toot_raw_json <- rawToChar(mastodon_reqres$content)
167
+toots <- fromJSON(toot_raw_json)
168
+
169
+mastodon_lheader <- parse_headers(mastodon_reqres$headers)[11]
170
+mastodon_next <- sub(x = mastodon_lheader, pattern = ".*rel=\"next\",\ <", replace = "")
171
+mastodon_next <- sub(x = mastodon_next, pattern = ">;\ rel=\"prev\"", replace = "")
172
+
173
+datetime <- c()
174
+lang <- c()
175
+inst <- c()
176
+link <- c()
177
+text <- c()
178
+reto <- c()
179
+favs <- c()
180
+murl <- c()
164 181
 for(i in 1:length(toots)){
165
-    toots_mat[i,] <- c(toots[[i]]$created_at, toots[[i]]$url, toots[[i]]$content, toots[[i]]$language)
166
-}
167
-toots_df = data.frame(toots_mat)
168
-names(toots_df) = c ('created_at', 'url', 'content', 'language')
169
-toots_df$content <- gsub(pattern = "<.*?>", x = toots_df$content, replacement = "")
170
-toots_df$content <- gsub(pattern = "  ", x = toots_df$content, replacement = "")
171 182
 
172
-## Mastodon Collector {{{ ----
183
+  #### Time and Date of Toot
184
+  if(length(toots[[i]]$created_at) > 0){
185
+    datetime[i] <- toots[[i]]$created_at
186
+  } else {
187
+  # insert empty value, if it does not exist
188
+    datetime[i] <- NA
189
+  }
173 190
 
174
-### Authenticate to the Fediverse (here: Mastodon)
191
+  #### Language of Toot
192
+  if(length(toots[[i]]$language) > 0){
193
+    lang[i] <- toots[[i]]$language
194
+  } else {
195
+  # insert empty value, if it does not exist
196
+    lang[i] <- NA
197
+  }
175 198
 
176
-#### Manual input (uncomment if needed)
177
-#mastodon_auth_insta <- readline("[Mastodon] Enter your Instance-URL."
178
-#mastodon_auth_login <- readline("[Mastodon] Enter your registered mail.")
179
-#mastodon_auth_passw <- readline("[Mastodon] Enter your password.")
180
-#### Saved credentials
181
-mastodon_api_cred <- read.table(file = "./fediverse_mastodon_api.txt", header = TRUE, sep = ";")
182
-mastodon_auth_insta <- as.character(mastodon_api_cred$instance)
183
-mastodon_auth_login <- as.character(mastodon_api_cred$mail)
184
-mastodon_auth_passw <- as.character(mastodon_api_cred$password)
199
+  #### Instance of Toot
200
+  if(length(toots[[i]]$uri) > 0){
201
+    inst[i] <- toots[[i]]$uri
202
+  } else {
203
+  # insert empty value, if it does not exist
204
+    inst[i] <- NA
205
+  }
185 206
 
186
-#### Authentication process
187
-mastodon_auth <- mastodon::login(instance = mastodon_auth_insta,
188
-				 user = mastodon_auth_login,
189
-				 pass = mastodon_auth_passw)
207
+  #### URL of Toot
208
+  if(length(toots[[i]]$url) > 0){
209
+    link[i] <- toots[[i]]$url
210
+  } else {
211
+  # insert empty value, if it does not exist
212
+    link[i] <- NA
213
+  }
190 214
 
191
-### Get posts from mastodon
192
-mastodon_toot <- mastodon::get_hashtag(token = mastodon_auth,
193
-				       hashtag = "ilovefs",
194
-				       local = FALSE,
195
-				       n = 20)
215
+  #### Text/Content of Toot
216
+  if(length(toots[[i]]$content) > 0){
217
+    text[i] <- toots[[i]]$content
218
+  } else {
219
+  # insert empty value, if it does not exist
220
+    text[i] <- NA
221
+  }
196 222
 
197
-### public and non-public posts
198
-mastodon_priv <- mastodon_toot[[7]]
223
+  #### Number of Retoots
224
+  if(length(toots[[i]]$reblogs_count) > 0){
225
+    reto[i] <- toots[[i]]$reblogs_count
226
+  } else {
227
+  # insert empty value, if it does not exist
228
+    reto[i] <- NA
229
+  }
199 230
 
200
-### Time of post
201
-#### date (as numeric value)
202
-mastodon_date <- sub(pattern = "T.*", x = mastodon_toot[[2]], replacement = "")
203
-mastodon_date <- gsub(pattern = "-", x = mastodon_date, replacement = "")
204
-mastodon_date <- as.numeric(mastodon_date)
205
-#### time (as numeric value)
206
-mastodon_time <- sub(pattern = ".*T", x = mastodon_toot[[2]], replacement = "")
207
-mastodon_time <- sub(pattern = "\\..*", x = mastodon_time, replacement = "")
208
-mastodon_time <- gsub(pattern = ":", x = mastodon_time, replacement = "")
209
-mastodon_time <- as.numeric(mastodon_time)
210
-
211
-### Language of post
212
-mastodon_lang <- mastodon_toot[[8]]
213
-
214
-### Instance of post
215
-mastodon_insta <- sub(pattern = "tag:", x = mastodon_toot[[9]], replacement = "")
216
-mastodon_insta <- sub(pattern = ",\\d+.*", x = mastodon_insta, replacement = "")
217
-#### in case the instance name is a full url
218
-mastodon_insta <- sub(pattern = ".*://", x = mastodon_insta, replacement = "")
219
-mastodon_insta <- sub(pattern = "/.*", x = mastodon_insta, replacement = "")
220
-
221
-### Text of post
222
-#### exclude all HTML
223
-mastodon_txt <- gsub(pattern = "<.*?>", x = mastodon_toot[[10]], replacement = "")
224
-mastodon_txt <- gsub(pattern = "  ", x = mastodon_txt, replacement = "")
225
-
226
-### URL of post
227
-mastodon_url <- mastodon_toot[[11]]
228
-
229
-### Favorites of posts
230
-mastodon_fav <- mastodon_toot[[13]]
231
-
232
-### Information about posters
233
-mastodon_pers <- mastodon_toot[[19]]
234
-mastodon_bot <- c()
235
-for(i in 1:length(mastodon_pers)){
236
-  if(mastodon_pers[[i]]$username == "TrendingBot"){
237
-    mastodon_bot[i] <- TRUE
231
+  #### Number of Favorites
232
+  if(length(toots[[i]]$favourites_count) > 0){
233
+    favs[i] <- toots[[i]]$favourites_count
238 234
   } else {
239
-    mastodon_bot[i] <- FALSE
235
+  # insert empty value, if it does not exist
236
+    favs[i] <- NA
240 237
   }
241
-}
242 238
 
243
-### images of post
244
-mastodon_img <- c()
245
-for(i in 1:length(mastodon_toot[[20]])){
246
-  mastodon_img[i] <- length(mastodon_toot[[20]][[i]])
239
+  #### URL of first media attachment
240
+  if(length(toots[[i]]$media_attachments) > 0){
241
+    murl[i] <- toots[[i]]$media_attachments[[1]]$url
242
+  } else {
243
+  # insert empty value, if it does not exist
244
+    murl[i] <- NA
245
+  }
247 246
 }
248 247
 
249
-### Cleaning data (removal of excluded posts)
250
-mastodon_exclude <- c(which(mastodon_bot),
251
-		      which(mastodon_date < 20180101),
252
-		      which(mastodon_priv != "public"))
248
+### Time of post
249
+#### date (as numeric value)
250
+date <- sub(pattern = "T.*", x = datetime, replacement = "")
251
+date <- gsub(pattern = "-", x = date, replacement = "")
252
+date <- as.numeric(date)
253
+#### time (as numeric value)
254
+time <- sub(pattern = ".*T", x = datetime, replacement = "")
255
+time <- sub(pattern = "\\..*", x = time, replacement = "")
256
+time <- gsub(pattern = ":", x = time, replacement = "")
257
+time <- as.numeric(time)
258
+
259
+### Removing HTML-Tags from Toots
260
+text <- gsub(pattern = "<.*?>", x = text, replacement = "")
261
+text <- gsub(pattern = "  ", x = text, replacement = "")
262
+
263
+### Cleaning Instance-String
264
+#### GNUsocial
265
+inst <- sub(pattern = "tag:", x = inst, replacement = "")
266
+inst <- sub(pattern = ",\\d+.*", x = inst, replacement = "")
267
+#### Mastodon
268
+inst <- sub(pattern = "https:\\/\\/", x = inst, replacement = "")
269
+inst <- sub(pattern = "\\/.*", x = inst, replacement = "")
270
+
271
+### Only include Toots from 2018 onwards (exclude anything dated before 2018-01-01)
272
+mastodon_exclude <- which(date < 20180101)
273
+date <- date[-mastodon_exclude]
274
+time <- time[-mastodon_exclude]
275
+lang <- lang[-mastodon_exclude]
276
+inst <- inst[-mastodon_exclude]
277
+text <- text[-mastodon_exclude]
278
+link <- link[-mastodon_exclude]
279
+reto <- reto[-mastodon_exclude]
280
+favs <- favs[-mastodon_exclude]
281
+murl <- murl[-mastodon_exclude]
253 282
 
254
-date <- mastodon_date[-mastodon_exclude]
255
-time <- mastodon_time[-mastodon_exclude]
256
-lang <- mastodon_lang[-mastodon_exclude]
257
-inst <- mastodon_insta[-mastodon_exclude]
258
-text <- mastodon_txt[-mastodon_exclude]
259
-link <- mastodon_url[-mastodon_exclude]
260
-favs <- mastodon_fav[-mastodon_exclude]
261
-imag <- mastodon_img[-mastodon_exclude]
283
+## Mastodon Collector {{{ ----
262 284
 
263 285
 ### Creating dataframe
264
-mastodon <- data.frame(cbind(date, time, lang, inst, text, link, favs, imag))
286
+mastodon <- data.frame(cbind(date, time, lang, inst, text, link, reto, favs, murl))
265 287
 
266 288
 #### Clean-Up
267
-rm(list = c("date", "time", "lang", "inst", "text", "link", "favs", "imag"))
289
+rm(list = c("date", "time", "lang", "inst", "text", "link", "favs", "reto", "murl", "datetime"))
268 290
 
269 291
 mastodon <- within(data = mastodon, expr = {
270 292
 		     date <- as.numeric(as.character(date));
271 293
 		     time <- as.numeric(as.character(time));
272 294
 		     text <- as.character(text);
273 295
 		     link <- as.character(link);
296
+		     murl <- as.character(murl);
274 297
 		  })
275 298
 # }}}
276 299
 

Loading…
Cancel
Save