
too much to put in a single commit tbh

pull/15/head
JayVii · 8 months ago
parent · commit fb3f3badb8
5 files changed, 311 additions and 306 deletions
  1. collecto.R             +0  -306
  2. scripts/collecto.R   +241    -0
  3. scripts/functions.R   +70    -0
  4. scripts/plotte.R        0     0  (renamed)
  5. scripts/word_cloud.py   0     0  (renamed)

collecto.R (+0, -306)

@@ -1,306 +0,0 @@
################################################################################
# Copyright (c) 2018 Free Software Foundation Europe e.V. <contatc@fsfe.org>
# Author 2018 Jan Weymeirsch <janwey@fsfe.org>
# Author 2018 Vincent Lequertier <vincent@fsfe.org>
# SPDX-License-Identifier: GPL-3.0
################################################################################

## Loading Packages ----

# Twitter
if(!require("rtweet")){ install.packages("rtweet"); library("rtweet") }
# had to install "httr" via the system package manager

# Fediverse (eg: mastodon)
if(!require("curl")){ install.packages("curl"); library("curl") }
if(!require("rjson")){ install.packages("rjson"); library("rjson") }

# Reddit
if(!require("RedditExtractoR")){
  install.packages("RedditExtractoR")
  library("RedditExtractoR")
}

## Helper Functions ----
list2vec <- function(x){
  sapply(X = x, FUN = function(y) paste(unlist(y), collapse = ","))
}
valifexst <- function(x) ifelse(test = length(x) > 0, yes = x, no = NA)

## Twitter Collector ----

# Twitter Auth.

## Manual input (uncomment if needed)
#tw_cred <- data.frame(
# consumer_key = readline("[Twitter] Enter your consumer API key."),
# consumer_private = readline("[Twitter] Enter your consumer API secret."))

## Saved credentials
tw_cred <- read.table(file = "./twitter_api.txt", header = TRUE, sep = ";",
                      colClasses = "character")

## Create Twitter Token
twitter_token <- create_token(app = tw_cred$appname,
                              consumer_key = tw_cred$consumer_key,
                              consumer_secret = tw_cred$consumer_private)

# Note -------------------------------------------------------------------------
# Please refer to the Documentation on where to receive your API credentials.
# ------------------------------------------------------------------------------

## Collecting Tweets
tweets <- search_tweets(q = "#ilovefs",
                        n = 9999,
                        include_rts = FALSE)[,
  # include only relevant information
  c("user_id", "created_at", "text", "source", "favorite_count",
    "retweet_count", "hashtags", "urls_expanded_url", "media_expanded_url",
    "ext_media_expanded_url", "lang", "location", "status_url", "protected")]

## Some recoding, simplistic(!) anonymization
tweets <- within(data = tweets, expr = {
# replace global user ID by index only unique to this dataset
user <- as.numeric(as.factor(user_id))
rm("user_id")

# extract date and time
time <- sub(pattern = ".*\\s", x = created_at, replace = "")
date <- sub(pattern = "\\s.*", x = created_at, replace = "")
rm("created_at")

# extract "clean" text (without URLs or line breaks)
ctxt <- gsub(pattern = "http.?://.+($|\\s)", x = text, replace = "") %>%
gsub(pattern = "\n", x = text, replace = "")

# Client data
clnt <- as.factor(source)
rm("source")

# Favorites and Retweets
favs <- favorite_count
retw <- retweet_count
rm(list = c("favorite_count", "retweet_count"))

# List Hashtags in single Variable
htag <- sapply(X = hashtags, FUN = function(x){
paste(unlist(x), collapse = ",")
})
rm("hashtags")

# URLs and Media
link <- status_url
urls <- list2vec(urls_expanded_url)
murl <- list2vec(media_expanded_url)
mext <- list2vec(ext_media_expanded_url)
rm(list = c("urls_expanded_url", "media_expanded_url",
"ext_media_expanded_url", "status_url"))

# Location
posi <- location
rm("location")
})

## Exclusion: before 2019-01-01, after 2019-02-17, protected tweets
tweets <- tweets[(as.Date(tweets$date) > as.Date("2019-01-01") &
                  as.Date(tweets$date) < as.Date("2019-02-17")),]
tweets <- tweets[!tweets$protected,]

## Mastodon Collector {{{ ----

mastodon.extract <- function(data){

# Within each post
data <- sapply(X = data, FUN = function(x){

# time and date
time <- gsub(x = x$created_at, pattern = ".*T|\\..*", replacement = "")
date <- sub(x = x$created_at, pattern = "T.*", replacement = "")

# simple extraction, return NA if value does not exist
lang <- valifexst(x$language) # language
inst <- valifexst(x$uri) # instance name
link <- valifexst(x$url) # post URL
rebl <- valifexst(x$reblogs_count) # number of reblogs
favs <- valifexst(x$favourites_count) # number of favorites
acct <- valifexst(x$account$url) # account url (unique)

# sanitizing text (removing HTML tags and whitespace)
text <- gsub(pattern = "<.*?>|\\s{2,}", x = x$content, replacement = "")

# media URL (multiple possible)
murl <- valifexst(
sapply(X = x$media_attachements, FUN = function(y){
list2vec(y$url)
})
)

# return extracted data only
return(data.frame(
rbind(time, date, lang, inst, link, text, rebl, favs, acct, murl)
))
})

data <- as.data.frame(
t(matrix(data = unlist(data), nrow = length(data[[1]])))
)

return(data)
}

## Set search parameters
mastodon_instance <- "https://mastodon.social"
mastodon_hashtag <- "ilovefs"
mastodon_url <- paste0(mastodon_instance,
"/api/v1/timelines/tag/",
mastodon_hashtag,
"?limit=40")
mastodon_iterations <- 999
toots <- c()

## Scrape Mastodon
for(i in 1:mastodon_iterations){

# Download and extract Posts
mastodon_reqres <- curl_fetch_memory(mastodon_url)
mastodon_rawjson <- rawToChar(mastodon_reqres$content)
raw_toots <- fromJSON(mastodon_rawjson)

# If Post-Data is present, extract it. Else break the loop
if(length(raw_toots) > 0){
tmp_toots <- mastodon.extract(data = raw_toots)
toots <- rbind(toots, tmp_toots)
} else {
break
}

# Update the URL for the next iteration of the for loop so we can download
# the next toots.
mastodon_lheader <- parse_headers(mastodon_reqres$headers)[11]
mastodon_next <- sub(x = mastodon_lheader, pattern = ".*link:\ <",
replace = "")
mastodon_url <- sub(x = mastodon_next, pattern = ">;\ rel=\"next\".*",
replace = "")
}
names(toots) <- c("time", "date", "lang", "inst", "link", "text",
"rebl", "favs", "acct", "murl")

## Simple(!) anonymization
toots$acct <- as.numeric(toots$acct) # unique only to this dataframe
toots$link <- as.numeric(toots$link) # unique only to this dataframe

## Cleanup
toots <- within(data = toots, expr = {

# Time Variables
time <- as.character(time)
date <- as.character(date)
fdat <- strptime(x = paste(date, time), format = "%Y-%m-%d %H:%M:%S",
tz = "CET")

# Instances
inst <- gsub(pattern = "(tag:)|(,\\d+.*)|(https:\\/\\/)|(\\/.*)",
x = inst, replacement = "")
})

## Only include Toots from this year
mst_exclude <- which(as.Date(toots$date) < as.Date("2019-01-01") |
as.Date(toots$date) > as.Date("2019-01-17"))
toots <- toots[-mst_exclude,]

## Reddit Collector {{{ ----

### Authentication at Reddit
# no authentication necessary, hence we can directly start scraping

### Get posts on Reddit
reddit_post_dirty <- reddit_urls(search_terms = "ilovefs",
#subreddit = "freesoftware linux opensource",
cn_threshold = 0,
page_threshold = 99999,
sort_by = "new",
wait_time = 5)

### Only use posts from the current year
reddit_searchinyear <- 18 # has to have format "YY", eg "18" for "2018"
reddit_post_year <- gsub(x = reddit_post_dirty$date,
pattern = "\\d.-\\d.-",
replace = "")
reddit_post <- reddit_post_dirty[which(reddit_post_year == reddit_searchinyear),]

### Extracting relevant variables
comt <- c() # Comments / Replies
subr <- c() # Subreddit
ptns <- c() # Points / Score
ttle <- c() # Title
text <- c() # Text / Content
link <- c() # Linked to Website
date <- c() # Date
rurl <- c() # Reddit-URL of post
acct <- c() # Author of Post
for(i in c(1:length(reddit_post$URL))){
comt[i] <- reddit_post$num_comments[i]
ttle[i] <- reddit_post$title[i]
rurl[i] <- reddit_post$URL[i]
date[i] <- gsub(x = reddit_post$date[i], pattern = "-", replace = "")
subr[i] <- reddit_post$subreddit[i]

Sys.sleep(2)
reddit_content <- reddit_content(URL = reddit_post$URL[i], wait_time = 0)
ptns[i] <- reddit_content$post_score[1]
text[i] <- reddit_content$post_text[1]
link[i] <- reddit_content$link[1]
acct[i] <- reddit_content$author[1]
}

### Creating dataframe
reddit <- data.frame(cbind(date, rurl, link, text, ttle, ptns, subr, comt, acct))

#### Clean-Up
rm(list = c("date", "rurl", "link", "text", "ttle", "ptns", "subr", "comt", "acct"))

reddit <- within(data = reddit, expr = {
date <- as.character(date);
rurl <- as.character(rurl);
link <- as.character(link);
text <- as.character(text);
ttle <- as.character(ttle);
ptns <- as.numeric(as.character(ptns));
subr <- as.character(subr);
comt <- as.numeric(as.character(comt));
})
# }}}

### Exporting data {{{ ----

time_of_saving <- sub(x = Sys.time(), pattern = " CET", replace = "")
time_of_saving <- sub(x = time_of_saving, pattern = " ", replace = "_")
time_of_saving <- gsub(x = time_of_saving, pattern = ":", replace = "-")

#### RData
save_path <- paste0("./data/ilovefs-all_", time_of_saving, ".RData")
save(list = c("twitter", "mastodon", "reddit"), file = save_path)

#### Text
##### Fediverse
save_path_fed_t <- paste0("./data/ilovefs-fediverse_", time_of_saving, ".txt")
write.table(mastodon, file = save_path_fed_t)
##### Twitter
save_path_twitter_t <- paste0("./data/ilovefs-twitter_", time_of_saving, ".txt")
write.table(twitter, file = save_path_twitter_t)
##### Reddit
save_path_reddit_t <- paste0("./data/ilovefs-reddit_", time_of_saving, ".txt")
write.table(reddit, file = save_path_reddit_t)

#### CSV
##### Fediverse
save_path_fed_c <- paste0("./data/ilovefs-fediverse_", time_of_saving, ".csv")
write.csv(mastodon, file = save_path_fed_c)
##### Twitter
save_path_twitter_c <- paste0("./data/ilovefs-twitter_", time_of_saving, ".csv")
write.csv(twitter, file = save_path_twitter_c)
##### Reddit
save_path_reddit_c <- paste0("./data/ilovefs-reddit_", time_of_saving, ".csv")
write.csv(reddit, file = save_path_reddit_c)
# }}}

scripts/collecto.R (+241, -0)

@@ -0,0 +1,241 @@
################################################################################
# Copyright (c) 2018 Free Software Foundation Europe e.V. <contatc@fsfe.org>
# Author 2018 Jan Weymeirsch <janwey@fsfe.org>
# Author 2018 Vincent Lequertier <vincent@fsfe.org>
# SPDX-License-Identifier: GPL-3.0
################################################################################

# Loading Packages and Functions -----------------------------------------------

# Twitter ------------------------------
if(!require("rtweet")){ install.packages("rtweet"); library("rtweet") }
# had to install "httr" via the system package manager

# Fediverse (eg: mastodon) -------------
if(!require("curl")){ install.packages("curl"); library("curl") }
if(!require("rjson")){ install.packages("rjson"); library("rjson") }

# Reddit -------------------------------
if(!require("RedditExtractoR")){
  install.packages("RedditExtractoR")
  library("RedditExtractoR")
}

# Export as ODS ------------------------
if(!require("readODS")){ install.packages("readODS"); library("readODS") }

# Read helper functions ----------------
source("./functions.R")

# Twitter Collector ------------------------------------------------------------


# Reading stored API credentials -------
tw_cred <- read.table(file = "../twitter_api.txt", header = TRUE, sep = ";",
                      colClasses = "character")

# Create Twitter Token -----------------
twitter_token <- create_token(app = tw_cred$appname,
                              consumer_key = tw_cred$consumer_key,
                              consumer_secret = tw_cred$consumer_private)

# Note
# Please refer to the Documentation on where to receive your API credentials.
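# A minimal sketch of the credential file read above (an assumption, since the
# file itself is not part of the repository): a semicolon-separated table with
# one header row and one data row, all columns read as character, e.g.
#
#   appname;consumer_key;consumer_private
#   my-ilovefs-app;XXXXXXXXXXXXXXXXXXXXXXXXX;YYYYYYYYYYYYYYYYYYYYYYYYYY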


# Collecting Tweets ------------------------------------------------------------

# Scrape Tweets ------------------------
tweets <- search_tweets(q = "#ilovefs",
                        n = 9999,
                        include_rts = FALSE)[,
  # include only relevant information
  c("user_id", "created_at", "text", "source", "favorite_count",
    "retweet_count", "hashtags", "urls_expanded_url", "media_expanded_url",
    "ext_media_expanded_url", "lang", "location", "status_url", "protected")]

# Recoding -----------------------------
tweets <- within(data = tweets, expr = {
# replace global user ID by index only unique to this dataset
user <- as.numeric(as.factor(user_id))
rm("user_id")

# extract date and time
time <- sub(pattern = ".*\\s", x = created_at, replace = "")
date <- sub(pattern = "\\s.*", x = created_at, replace = "")
rm("created_at")

# extract "clean" text (without URLs or line breaks)
ctxt <- gsub(pattern = "http.?://.+($|\\s)", x = text, replace = "") %>%
  gsub(pattern = "\n", replace = "")

# Client data
clnt <- as.factor(source)
rm("source")

# Favorites and Retweets
favs <- favorite_count
retw <- retweet_count
rm(list = c("favorite_count", "retweet_count"))

# List Hashtags in single Variable
htag <- list2vec(x = hashtags)
rm("hashtags")

# URLs and Media
link <- status_url
urls <- list2vec(urls_expanded_url)
murl <- list2vec(media_expanded_url)
mext <- list2vec(ext_media_expanded_url)
rm(list = c("urls_expanded_url", "media_expanded_url",
"ext_media_expanded_url", "status_url"))

# Location
posi <- location
rm("location")
})

# Exclusion ----------------------------
# before 2019-01-01, after 2019-02-17, protected tweets
tweets <- tweets[(as.Date(tweets$date) > as.Date("2019-01-01") &
                  as.Date(tweets$date) < as.Date("2019-02-17")),]
tweets <- tweets[!tweets$protected,]

# Mastodon Collector -----------------------------------------------------------

# Set search parameters ----------------
toots <- c()
mastodon_iterations <- 999
mastodon_instance <- "https://mastodon.social"
mastodon_hashtag <- "ilovefs"
mastodon_url <- paste0(mastodon_instance, "/api/v1/timelines/tag/",
                       mastodon_hashtag, "?limit=40")
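# With the defaults above, the first request therefore goes to
#   https://mastodon.social/api/v1/timelines/tag/ilovefs?limit=40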

# Scrape Mastodon ----------------------
for(i in 1:mastodon_iterations){

  # Download and extract Posts
  mastodon_reqres <- curl_fetch_memory(mastodon_url)
  mastodon_rawjson <- rawToChar(mastodon_reqres$content)
  raw_toots <- fromJSON(mastodon_rawjson)

  # If Post-Data is present, extract it. Else break the loop
  if(length(raw_toots) > 0){
    tmp_toots <- mastodon.extract(data = raw_toots)
    toots <- rbind(toots, tmp_toots)
  } else {
    break
  }

  # Update the URL for the next iteration of the for loop so we can download
  # the next toots.
  mastodon_lheader <- parse_headers(mastodon_reqres$headers)[11]
  mastodon_url <- gsub(x = mastodon_lheader,
                       pattern = "(.*link: <)|(>; rel=\"next\".*)",
                       replace = "")
}
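# Note: the Link header is picked out above by its position (element 11 of the
# parsed headers). A more defensive sketch (an assumption, not part of this
# commit) would locate it by name, since header order is not guaranteed:
#
#   hdrs <- parse_headers(mastodon_reqres$headers)
#   link_hdr <- hdrs[grep(pattern = "^link:", x = hdrs, ignore.case = TRUE)][1]
#   mastodon_url <- gsub(x = link_hdr,
#                        pattern = "(.*<)|(>; rel=\"next\".*)",
#                        replacement = "")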
# adding variable-names (again)
names(toots) <- c("time", "date", "lang", "inst", "link", "text",
                  "rebl", "favs", "acct", "murl")

# Simple(!) anonymization --------------
toots$acct <- as.numeric(toots$acct) # unique only to this dataframe
toots$link <- as.numeric(toots$link) # unique only to this dataframe

# Cleanup ------------------------------
toots <- within(data = toots, expr = {

  # Time Variables
  time <- as.character(time)
  date <- as.character(date)
  fdat <- strptime(x = paste(date, time), format = "%Y-%m-%d %H:%M:%S",
                   tz = "CET")

  # Instances
  inst <- gsub(pattern = "(tag:)|(,\\d+.*)|(https:\\/\\/)|(\\/.*)",
               x = inst, replacement = "")
})

# Exclusion ----------------------------
# Only include Toots from this year
mst_exclude <- which(as.Date(toots$date) < as.Date("2019-01-01") |
                     as.Date(toots$date) > as.Date("2019-01-17"))
if(length(mst_exclude) > 0){ toots <- toots[-mst_exclude,] }

# Reddit Collector -------------------------------------------------------------

# Get posts on Reddit ------------------
reddit_post_dirty <- reddit_urls(search_terms = "ilovefs",
                                 #subreddit = "freesoftware linux opensource",
                                 cn_threshold = 0,
                                 page_threshold = 99999,
                                 sort_by = "new",
                                 wait_time = 5)

# Extract relevant information ---------
reddit <- within(data = reddit_post_dirty, expr = {

  # extract year
  year <- paste0(20, gsub(x = date, pattern = ".*-", replacement = ""))

  # rename relevant variables
  cmts <- num_comments
  name <- title
  subr <- subreddit
  link <- URL

  # Cleanup
  rm(list = c("num_comments", "title", "subreddit", "URL"))
})

# Exclude ------------------------------
# Limit to this year only
reddit_exclude <- which(as.numeric(reddit$year) < 2019)
if(length(reddit_exclude) > 0){ reddit <- reddit[-reddit_exclude,] }

# Additional Information ---------------
# for all remaining posts, additional information may be gathered
rdcnt <- lapply(X = reddit$link, FUN = function(x){
  reddit_content(URL = x, wait_time = 30)
})

# merge additional information into main dataset
reddit$ptns <- reddit$text <- reddit$user <- NA
for(i in 1:length(rdcnt)){
  reddit$ptns[i] <- rdcnt[[i]]$post_score[1]
  reddit$text[i] <- rdcnt[[i]]$post_text[1]
  reddit$user[i] <- rdcnt[[i]]$author[1]
}
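# The same merge could also be written without the index loop; a hedged
# alternative sketch using sapply() over the fetched content (not what this
# commit does, shown only for illustration):
#
#   reddit$ptns <- sapply(X = rdcnt, FUN = function(x) x$post_score[1])
#   reddit$text <- sapply(X = rdcnt, FUN = function(x) x$post_text[1])
#   reddit$user <- sapply(X = rdcnt, FUN = function(x) x$author[1])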

# Exporting data ---------------------------------------------------------------

# Create timestamp ---------------------
time_of_saving <- sub(x = Sys.time(), pattern = " CET", replace = "") %>%
  sub(pattern = " ", replace = "_") %>%
  gsub(pattern = ":", replace = "-")

# Save as RData ------------------------
save_path <- paste0("../data/ilovefs-all_", time_of_saving, ".RData")
save(list = c("tweets", "toots", "reddit"), file = save_path)

# Save as Text -------------------------
save_path <- paste0("../data/ilovefs-", c("fediverse_", "twitter_", "reddit_"),
                    time_of_saving, ".txt")
write.table(x = toots, file = save_path[1])
write.table(x = tweets, file = save_path[2])
write.table(x = reddit, file = save_path[3])

# Save as CSV --------------------------
save_path <- paste0("../data/ilovefs-", c("fediverse_", "twitter_", "reddit_"),
                    time_of_saving, ".csv")
write.csv(x = toots, file = save_path[1])
write.csv(x = tweets, file = save_path[2])
write.csv(x = reddit, file = save_path[3])

# Save as ODS --------------------------
save_path <- paste0("../data/ilovefs-", c("fediverse_", "twitter_", "reddit_"),
                    time_of_saving, ".ods")
write_ods(x = toots, path = save_path[1])
write_ods(x = tweets, path = save_path[2])
write_ods(x = reddit, path = save_path[3])
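# For later analysis, the RData snapshot can be restored in a fresh R session
# with base R's load(); the file name below is only an illustration, as the
# timestamp differs per run:
#
#   load("../data/ilovefs-all_2019-02-18_12-00-00.RData")
#   # restores the objects "tweets", "toots" and "reddit"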

scripts/functions.R (+70, -0)

@@ -0,0 +1,70 @@
################################################################################
# Copyright (c) 2018 Free Software Foundation Europe e.V. <contatc@fsfe.org>
# Author 2019 Jan Weymeirsch <janwey@fsfe.org>
# SPDX-License-Identifier: GPL-3.0
################################################################################

# Helper Functions -------------------------------------------------------------
# to be sourced from the main script.

# List2Vec ---------------------------------------------------------------------
# transforms a list() object x into a character vector in which each list
# element is collapsed into a single string, its parts divided by a separator
list2vec <- function(x, sep = ","){
  sapply(X = x, FUN = function(y) paste(unlist(y), collapse = sep))
}
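# Illustrative example (assumed input, not part of the scripts):
#   list2vec(x = list(c("foo", "bar"), "baz"), sep = ",")
#   ## returns c("foo,bar", "baz")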

# ValIfExst --------------------------------------------------------------------
# tests whether an object contains values. If yes, it returns that value, else
# returns NA
valifexst <- function(x) ifelse(test = length(x) > 0, yes = x, no = NA)
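# Illustrative examples (assumed inputs, not part of the scripts):
#   valifexst(character(0)) ## returns NA
#   valifexst("ilovefs")    ## returns "ilovefs"
# Because ifelse() is vectorized over its test (which has length 1 here), a
# multi-element x is truncated to its first element.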

# Mastodon Extractor -----------------------------------------------------------
# extracts a set of information from a nested list item that holds all the
# information on a single post within one outer list, as returned by the
# Mastodon v1 "tag" timeline API
mastodon.extract <- function(data){

  # Within each post
  data <- sapply(X = data, FUN = function(x){

    # time and date
    time <- gsub(x = x$created_at, pattern = ".*T|\\..*", replacement = "")
    date <- sub(x = x$created_at, pattern = "T.*", replacement = "")

    # simple extraction, return NA if value does not exist
    lang <- valifexst(x$language)         # language
    inst <- valifexst(x$uri)              # instance name
    link <- valifexst(x$url)              # post URL
    rebl <- valifexst(x$reblogs_count)    # number of reblogs
    favs <- valifexst(x$favourites_count) # number of favorites
    acct <- valifexst(x$account$url)      # account url (unique)

    # sanitizing text (removing HTML tags and whitespace)
    text <- gsub(pattern = "<.*?>|\\s{2,}", x = x$content, replacement = "")

    # media URL (multiple possible; Mastodon API field media_attachments)
    murl <- valifexst(
      list2vec(
        sapply(X = x$media_attachments, FUN = function(y){
          y$url
        })
      )
    )

    # return extracted data only
    return(data.frame(
      rbind(time, date, lang, inst, link, text, rebl, favs, acct, murl)
    ))
  })

  # transform "clean" list object into dataframe
  data <- as.data.frame(
    t(matrix(data = unlist(data), nrow = length(data[[1]])))
  )

  # return data.frame object
  return(data)
}

# EOF functions.R

plotte.R → scripts/plotte.R (renamed, no changes)


word_cloud.py → scripts/word_cloud.py (renamed, no changes)

