
Use the Mastodon paging mechanism to get all the toots

vincent committed 1 year ago
commit b944998530
1 changed file with 34 additions and 2 deletions
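
Note: the Mastodon REST API pages timeline results through the HTTP Link header. Each response names the next and the previous page, and a client follows rel="next" until a page comes back empty. A response to the tag-timeline request used below carries a header roughly of this shape (NNN stands for a status id; the exact parameters are illustrative, not taken from a real response):

link: <https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40&max_id=NNN>; rel="next", <https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40&min_id=NNN>; rel="prev"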

collecto.R  (+34, -2)

@@ -158,13 +158,45 @@ twitter <- within(data = twitter, expr = {
 
 library(curl)
 library(rjson)
-toot_raw_json <- rawToChar(curl_fetch_memory("https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40")$content)
+req_result = curl_fetch_memory("https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40")
+toot_raw_json <- rawToChar(req_result$content)
 toots = fromJSON(toot_raw_json)
-toots_mat = matrix(ncol = 4, nrow = length(toots))
+# We can't know in advance how many rows we need, because we don't know the
+# number of toots we'll have fetched at the end; 1000 is a generous upper bound
+toots_mat = matrix(ncol = 4, nrow = 1000)
 for(i in 1:length(toots)){
     toots_mat[i,] <- c(toots[[i]]$created_at, toots[[i]]$url, toots[[i]]$content, toots[[i]]$language)
 }
+link_header = grep("^link:", parse_headers(req_result$headers), ignore.case = TRUE, value = TRUE) # locate the Link header by name, not by position
+# Extract the first URL in the Link header, which is the URL of the next page
+next_url = regmatches(link_header, regexpr("<[^>]+>", link_header))
+next_url = substr(next_url, 2, nchar(next_url) - 1) # strip the angle brackets
+
+# n tracks the first row of the matrix that is not yet populated
+n = length(toots) + 1
+# q is the counter used to iterate over the toots of the current page
+q = 1
+
+# Use the Mastodon paging mechanism to fetch the next pages (100 at most)
+for(j in 1:100) {
+    q = 1
+    req_result = curl_fetch_memory(next_url)
+    toot_raw_json <- rawToChar(req_result$content)
+    toots = fromJSON(toot_raw_json)
+    if (length(toots) == 0) break
+    next_n = n + length(toots) - 1
+    for(i in n:next_n){
+        toots_mat[i,] <- c(toots[[q]]$created_at, toots[[q]]$url, toots[[q]]$content, toots[[q]]$language)
+        q = q + 1
+    }
+    link_header = grep("^link:", parse_headers(req_result$headers), ignore.case = TRUE, value = TRUE)
+    next_url = regmatches(link_header, regexpr("<[^>]+>", link_header))
+    next_url = substr(next_url, 2, nchar(next_url) - 1)
+    n = next_n + 1
+}
 toots_df = data.frame(toots_mat)
+# Remove the data frame rows that were never populated
+toots_df = na.omit(toots_df)
 names(toots_df) = c ('created_at', 'url', 'content', 'language')
 toots_df$content <- gsub(pattern = "<.*?>", x = toots_df$content, replacement = "")
 toots_df$content <- gsub(pattern = "  ", x = toots_df$content, replacement = "")
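
For reference, the same pagination loop can also be written as a self-contained function that grows a list of rows and binds them once at the end, avoiding both the fixed 1000-row matrix and the na.omit() cleanup. This is a minimal sketch, not part of the commit: the function name fetch_all_toots is made up, and it assumes the same mastodon.social endpoint and the curl and rjson packages used above.

library(curl)
library(rjson)

# Follow rel="next" Link headers until the server returns an empty page.
fetch_all_toots <- function(url) {
    rows <- list()
    while (!is.null(url)) {
        req_result <- curl_fetch_memory(url)
        toots <- fromJSON(rawToChar(req_result$content))
        if (length(toots) == 0) break
        for (t in toots) {
            rows[[length(rows) + 1]] <- c(t$created_at, t$url, t$content, t$language)
        }
        # Locate the Link header by name and pull out the URL tagged rel="next";
        # when there is none, url becomes NULL and the loop stops.
        headers <- parse_headers(req_result$headers)
        link <- grep("^link:", headers, ignore.case = TRUE, value = TRUE)
        m <- regmatches(link, regexec("<([^>]+)>; rel=\"next\"", link))
        url <- if (length(m) == 1 && length(m[[1]]) == 2) m[[1]][2] else NULL
    }
    do.call(rbind, rows)
}

toots_mat <- fetch_all_toots("https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40")

Accumulating into a list and calling rbind once keeps memory proportional to the number of toots actually fetched, so no row cap has to be guessed in advance.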
