Browse Source

Merge branch 'extract_all_toots' of vincent/ilfs-data into master

pull/5/head
janwey 1 year ago
parent
commit
e783ab79cd
1 changed file with 34 additions and 2 deletions
  1. 34
    2
      collecto.R

+ 34
- 2
collecto.R View File

@@ -158,13 +158,45 @@ twitter <- within(data = twitter, expr = {

library(curl)
library(rjson)
library(stringr)  # provides str_extract(), used to parse the HTTP Link header

# Collect all toots tagged #ilovefs from mastodon.social, following the
# API's Link-header pagination, into a 4-column data frame:
# created_at, url, content, language.

# Pull the "next page" URL out of a raw HTTP header blob.
# Looking the Link header up by name is robust against header reordering
# (indexing parse_headers(...)[11] by position breaks if the server adds
# or reorders headers).
extract_next_url <- function(headers_raw) {
  headers <- parse_headers(headers_raw)
  link_header <- headers[grepl("^link:", headers, ignore.case = TRUE)][1]
  if (is.na(link_header)) {
    return(NA_character_)
  }
  # The first URL in the Link header is the URL for the next page.
  next_url <- str_extract(link_header, "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
  # Trim the '>;' that the permissive regex captures after the URL proper.
  substr(next_url, 1, nchar(next_url) - 2)
}

# Append one page of toots to the matrix starting at row `start_row`,
# growing the matrix when the preallocation runs out. Returns the matrix.
fill_toots <- function(mat, toots, start_row) {
  last_row <- start_row + length(toots) - 1
  if (last_row > nrow(mat)) {
    # Preallocation exhausted: grow in chunks rather than failing.
    mat <- rbind(mat, matrix(ncol = 4, nrow = 1000))
  }
  for (q in seq_along(toots)) {
    mat[start_row + q - 1, ] <- c(toots[[q]]$created_at, toots[[q]]$url,
                                  toots[[q]]$content, toots[[q]]$language)
  }
  mat
}

# First page: at most 40 toots per request (API limit).
req_result <- curl_fetch_memory("https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40")
toot_raw_json <- rawToChar(req_result$content)
toots <- fromJSON(toot_raw_json)

# We can't know in advance how many rows we need because we don't know the
# number of toots we'll get at the end, so preallocate and drop the unused
# (all-NA) rows afterwards.
toots_mat <- matrix(ncol = 4, nrow = 1000)
toots_mat <- fill_toots(toots_mat, toots, 1)

next_url <- extract_next_url(req_result$headers)

# n tracks the first unpopulated row of the matrix. It must start AFTER the
# rows already filled by page 1 (starting at 1 would overwrite the first
# page — this was a bug in the original loop).
n <- length(toots) + 1

# Use the Mastodon paging mechanism to fetch up to 100 further pages.
for (j in seq_len(100)) {
  if (is.na(next_url)) {
    break  # no Link header / no further pages advertised
  }
  req_result <- curl_fetch_memory(next_url)
  toots <- fromJSON(rawToChar(req_result$content))
  if (length(toots) == 0) {
    break  # empty page: we've reached the end of the timeline
  }
  toots_mat <- fill_toots(toots_mat, toots, n)
  next_url <- extract_next_url(req_result$headers)
  # Advance past the rows just filled. The original advanced by
  # length(toots) - 1, which overwrote the last row of every page.
  n <- n + length(toots)
}

toots_df <- data.frame(toots_mat)
# Remove unpopulated (all-NA) rows left over from preallocation.
toots_df <- na.omit(toots_df)
names(toots_df) <- c("created_at", "url", "content", "language")
# Strip HTML tags and non-breaking-space entities from the toot bodies.
toots_df$content <- gsub(pattern = "<.*?>", x = toots_df$content, replacement = "")
toots_df$content <- gsub(pattern = " ", x = toots_df$content, replacement = "")

Loading…
Cancel
Save