
Finalization of Plotting

master · JayVii, 7 months ago · commit c5768e31fc
4 changed files with 171 additions and 171 deletions
  1. scripts/collecto.R     +8    -36
  2. scripts/functions.R    +44   -3
  3. scripts/plotte.R       +119  -108
  4. scripts/word_cloud.py  +0    -24

scripts/collecto.R  +8 -36

@@ -26,6 +26,7 @@ if(!require("readODS")){ install.packages("readODS"); library("readODS") }

# Text Manipulation --------------------
if(!require("stringi")){ install.packages("stringi"); library("stringi") }
if(!require("tidyr")){ install.packages("tidyr"); library("tidyr") }

# Read helper functions ----------------
source("./functions.R")
@@ -103,7 +104,7 @@ tweets <- within(data = tweets, expr = {
rm("location")
})

# Eclusion------------------------------
# Exclusion------------------------------
# before 2019-01-01, after 2019-02-17, protected tweets
tweets <- tweets[(as.Date(tweets$date) > as.Date("2019-01-01") &
as.Date(tweets$date) < as.Date("2019-02-17")),]
@@ -111,40 +112,9 @@ tweets <- tweets[!tweets$protected,]

# Mastodon Collector -----------------------------------------------------------

# Set search parameters ----------------
toots <- c()
mastodon_iterations <- 999
mastodon_instance <- "https://mastodon.social"
mastodon_hashtag <- "ilovefs"
mastodon_url <- paste0(mastodon_instance, "/api/v1/timelines/tag/",
mastodon_hashtag, "?limit=40")

# Scrape Mastodon ----------------------
for(i in 1:mastodon_iterations){

# Download and extract Posts
mastodon_reqres <- curl_fetch_memory(mastodon_url)
mastodon_rawjson <- rawToChar(mastodon_reqres$content)
raw_toots <- fromJSON(mastodon_rawjson)

# If Post-Data is present, extract it. Else break the loop
if(length(raw_toots) > 0){
tmp_toots <- mastodon.extract(data = raw_toots)
toots <- rbind(toots, tmp_toots)
} else {
break
}

# Update the URL for the next iteration of the for loop so we can download
# the next toots.
mastodon_lheader <- parse_headers(mastodon_reqres$headers)[11]
mastodon_url <- gsub(x = mastodon_lheader,
pattern = "(.*link:\ <)|(>;\ rel=\"next\".*)",
replace = "")
}
# adding variable-names (again)
names(toots) <- c("time", "date", "lang", "inst", "link", "text",
"rebl", "favs", "acct", "murl", "htag", "ment")
toots <- mastodon.search(hashtag = "ilovefs",
instance = "https://mastodon.social")

# Simple(!) anonymization --------------
toots$acct <- as.numeric(toots$acct) # unique only to this dataframe
@@ -166,8 +136,8 @@ toots <- within(data = toots, expr = {

# Exclusion ----------------------------
# Only include Toots from this year
mst_exclude <- which(as.Date(toots$date) < as.Date("2019-01-01") &
as.Date(toots$date) > as.Date("2019-01-17"))
mst_exclude <- which(as.Date(toots$date) < as.Date("2019-01-01") |
as.Date(toots$date) > as.Date("2019-02-17"))
if(length(mst_exclude) > 0){ toots <- toots[-mst_exclude,] }

# Reddit Collector -------------------------------------------------------------
@@ -246,3 +216,5 @@ save_path <- paste0("../data/ilovefs-", c("fediverse_", "twitter_", "reddit_"),
write_ods(x = toots, path = save_path[1])
write_ods(x = tweets, path = save_path[2])
write_ods(x = reddit, path = save_path[3])

# EOF collecto.R
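
Note on the exclusion step above: no date can be both before 2019-01-01 and after 2019-02-17, so an "&" between the two comparisons flags nothing, while the "|" flags everything outside the campaign window. A quick illustration with made-up dates:

dates <- as.Date(c("2018-12-31", "2019-02-10", "2019-03-01"))             # made-up examples
which(dates < as.Date("2019-01-01") & dates > as.Date("2019-02-17"))      # integer(0): nothing flagged
which(dates < as.Date("2019-01-01") | dates > as.Date("2019-02-17"))      # 1 3: both out-of-window dates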

scripts/functions.R  +44 -3

@@ -41,7 +41,8 @@ mastodon.extract <- function(data){
acct <- valifexst(x$account$url) # account url (unique)

# sanitizing text (removing HTML tags and whitespace)
text <- gsub(pattern = "<.*?>|\\s{2,}", x = x$content, replacement = "")
text <- gsub(pattern = "<.*?>", x = x$content, replacement = " ") %>%
gsub(pattern = "\\s{2,}", replacement = " ")

# extract hashtags
htag <- list2vec(
@@ -50,6 +51,7 @@ mastodon.extract <- function(data){

# extract mentions
ment <- list2vec(stri_extract_all(regex = "@\\S+", str = text)) %>%
gsub(pattern = "http\\w{0,1}\\W{3}.*", replacement = "") %>%
gsub(pattern = "(@twitter\\.com)|[\\?!;:\\.]|(,$)|\\(|\\)",
replacement = "")

@@ -64,18 +66,57 @@ mastodon.extract <- function(data){

# return extracted data only
return(data.frame(
rbind(time, date, lang, inst, link, text, rebl, favs, acct, murl,
cbind(time, date, lang, inst, link, text, rebl, favs, acct, murl,
htag, ment)
))
})

# transform "clean" list object into dataframe
data <- as.data.frame(
t(matrix(data = unlist(data), nrow = length(data[[1]])))
t(matrix(data = unlist(data), nrow = 12))
)
names(data) <- c("time", "date", "lang", "inst", "link", "text", "rebl",
"favs", "acct", "murl", "htag", "ment")

# return data.frame object
return(data)
}

# Mastodon Scraper -------------------------------------------------------------
# makes API calls to a specified Mastodon instance in order to scrape data
# about a given hashtag, using API version v1
mastodon.search <- function(maxiter = 9999, hashtag, data = NULL,
instance = "https://mastodon.social"){

# Build initial Request URL
apicall <- paste0(instance, "/api/v1/timelines/tag/", hashtag, "?limit=40")

for(i in 1:maxiter){

# Make API call
dl_raw <- curl_fetch_memory(apicall)
dl_json <- rawToChar(dl_raw$content)
dl_toot <- fromJSON(dl_json)

# If the API call returns valid data, parse it; else break the for-loop
if(length(dl_toot) > 0){
tmp_toot <- mastodon.extract(data = dl_toot)
df_name <- names(tmp_toot)
data <- rbind(data, tmp_toot)
names(data) <- df_name
} else {
break
}

# Update API call for next iteration
apihead <- parse_headers(dl_raw$headers)[11]
apicall <- gsub(x = apihead, replacement = "",
pattern = "(.*link:\ <)|(>;\ rel=\"next\".*)")
}

# Return Extracted Toot-Data
return(data)
}

# EOF functions.R
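
A minimal usage sketch of the mastodon.search() helper defined above, assuming the packages collecto.R attaches (curl, jsonlite, stringi, tidyr) are loaded; the object name and the small maxiter are illustrative only:

source("./functions.R")
test_toots <- mastodon.search(hashtag = "ilovefs",
                              instance = "https://mastodon.social",
                              maxiter = 3)   # a few pages of 40 toots each, enough for a test run
str(test_toots)  # 12 columns as set in mastodon.extract(): time, date, ..., htag, ment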

scripts/plotte.R  +119 -108

@@ -9,14 +9,17 @@
# Plotting -----------------------------
if(!require("ggplot2")){ install.packages("ggplot2"); library("ggplot2") }
if(!require("gridExtra")){ install.packages("gridExtra"); library("gridExtra") }
if(!require("wordcloud")){ install.packages("wordcloud"); library("wordcloud") }
# Text Manipulation --------------------
if(!require("stringi")){ install.packages("stringi"); library("stringi") }

# Convenience Functions ----------------
if(!require("tidyr")){ install.packages("tidyr"); library("tidyr") }

# Loading Data -----------------------------------------------------------------

# Full Dataset -------------------------
load(file = "../data/ilovefs-all_2018-02-20_14-57-16.RData")
load(file = "../data/ilovefs-all_2019-02-07_09-15-38.RData")

# Extracted "mentioned projects" -------
if(length(grep(x = list.files("../data/"), pattern = "tags_mentions.csv")) > 0){
@@ -61,10 +64,15 @@ platform <- factor(c(twt_num, mst_num, rdt_num),

# Cleaning Instances -------------------
instances <- sub(x = toots$inst, pattern = "urn:X-dfrn:", replacement = "") %>%
sub(pattern = ":.*", replacement = "") %>%
as.factor()
sub(pattern = ":.*", replacement = "")
# Assign instances with "low" occurrence the "other" tag
inst_other <- names(table(instances)[table(instances) < 4])
instances[instances %in% inst_other] <- "other"
instances <- as.factor(instances)

# Preparing Plot -----------------------

# Create a dataframe which contains all information relevant to ggplot()
part1_df <- data.frame(count = as.numeric(table(platform)),
category = as.character(levels(platform)))
part1_df <- within(data = part1_df, expr = {
@@ -76,6 +84,7 @@ part1_df <- within(data = part1_df, expr = {
count[count == 0] <- NA
})

# Also create a dataframe for participation per instance
part2_df <- data.frame(count = as.numeric(table(instances)),
category = as.character(levels(instances)))
part2_df <- within(data = part2_df, expr = {
@@ -89,138 +98,140 @@ part2_df <- within(data = part2_df, expr = {
})

# Plotting the results -----------------
part1_plot <- ggplot(
part1_df,
aes(fill = category,
ymax = ymax,
ymin = ymin,
xmax = 4,
xmin = 3
)) +

part1_plot <-
# Initial Plot, maximum values, etc.
ggplot(part1_df, aes(fill = category, ymax = ymax, ymin = ymin, xmax = 4,
xmin = 3)) +
xlim(c(0, 4)) +
# Type of plot
geom_rect() +
coord_polar(theta = "y") + # I actually have no idea what this does.
xlim(c(0, 4)) +
# Theme / Design of Plot
theme_minimal() +
theme(
legend.position = "right",
panel.grid=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
text = element_text(size = 14),
plot.margin=unit(c(-0.5, -0.5, -0.5, -0.5),"in"),
# top, right, bottom, left
) +
theme(legend.position = "left", panel.grid = element_blank(),
axis.text = element_blank(), axis.ticks = element_blank(),
axis.title.x = element_blank(), axis.title.y = element_blank(),
text = element_text(size = 14),
# Margins: top, right, bottom, left
plot.margin = unit(c(-0.5, -0.5, -0.5, -0.5),"in")) +
# Draw Title and Text inside Plot
annotate("text", x = 0, y = 0, fontface = 2, size = 8,
label = "Participation in #iLoveFS\nper Platform") +
geom_text(
aes(
x = 3.5, # xmax is set to 4, 3.5 is barely within the circle
y = pos,
label = count
),
size = 8) +
label = "Participation in #iLoveFS\nper Platform") +
geom_text(aes(x = 3.5, y = pos, label = count), size = 6) +
labs(title="") +
scale_fill_manual(
name=NULL,
breaks= part1_df$category,
values = part1_df$colors
)

part2_plot <- ggplot(
part2_df,
aes(fill = category,
ymax = ymax,
ymin = ymin,
xmax = 4,
xmin = 3
)) +
# Coloring of Plot
scale_fill_manual(name = NULL,
labels = paste0(as.character(part1_df$category),
" (", part1_df$count, ")"),
breaks = part1_df$category, values = part1_df$colors)

part2_plot <-
# Initial Plot, maximum values, etc.
ggplot(part2_df, aes(fill = category, ymax = ymax, ymin = ymin, xmax = 4,
xmin = 3)) +
xlim(c(0, 4)) +
# Type of Plot
geom_rect() +
coord_polar(theta = "y") + # I actually have no idea what this does.
xlim(c(0, 4)) +
# Theme / Design of Plot
theme_minimal() +
theme(
legend.position = "right",
panel.grid=element_blank(),
axis.text=element_blank(),
axis.ticks=element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
text = element_text(size = 13),
plot.margin=unit(c(-1, 0, -1, 0),"in"), # top, right, bottom, left
) +
theme(legend.position = "right", panel.grid = element_blank(),
axis.text = element_blank(), axis.ticks = element_blank(),
axis.title.x = element_blank(), axis.title.y = element_blank(),
text = element_text(size = 14),
# Margins: top, right, bottom, left
plot.margin = unit(c(-1, -1, -1, -1),"in")) +
# Draw Title and Text inside Plot
annotate("text", x = 0, y = 0, fontface = 2, size = 8,
label = "Participation in #iLoveFS\nin the Fediverse") +
label = "Participation in #iLoveFS\nin the Fediverse") +
geom_text(aes(x = 3.5, y = pos, label = count), size = 6) +
labs(title="") +
scale_fill_manual(
name=NULL,
labels = paste0(as.character(part2_df$category), " (", part2_df$count,
")"),
breaks= part2_df$category,
values = part2_df$colors
)



# Plotting the results -----------------
pdf(file = "./plots/participation_platform.pdf", height = 10, width = 20)
par(mfrow = c(1,2))

##### Platform (Twitter/Fediverse)
plot(x = platform,
col = c("#a22430", "#1da1f2"),
ylim = c(0, ceiling(max(table(platform))/100) * 100),
main = "#ilovefs participation by Platform",
names.arg = c("The Fediverse", "Twitter"))
##### Instances (Fediverse)
plot(x = instances,
horiz = FALSE,
col = rainbow(n = length(unique(instances))),
ylim = c(0, ceiling(max(table(instances))/10) * 10),
main = "#ilovefs participation by Instance",
names.arg = "")
legend(x = "topleft", legend = levels(instances),
fill = rainbow(n = length(unique(instances))),
bty = "n", ncol = 2)

# Coloring of Plot
scale_fill_manual(name = NULL,
labels = paste0(as.character(part2_df$category),
" (", part2_df$count, ")"),
breaks= part2_df$category, values = part2_df$colors)

# Save plots ---------------------------
filename <- paste0("../plots/participation_",
gsub(x = Sys.time(), pattern = "-.*", replacement = ""),
"_", 1:2, ".pdf")
pdf(file = filename[1], height = 8, width = 12)
part1_plot
dev.off()
pdf(file = filename[2], height = 8, width = 12)
part2_plot
dev.off()
# }}}

### Time and Date {{{ ----
# Time and Date ----------------------------------------------------------------

#### Transform time into POSIX dates
twitter_time <- strptime(paste0(twitter$date, twitter$time),
format = "%Y%m%d%H%M%S")
mastodon_time <- strptime(paste0(mastodon$date, mastodon$time),
format = "%Y%m%d%H%M%S")
# Transform time into POSIX dates ------
twitter_time <- strptime(paste(tweets$date, tweets$time),
format = "%Y-%m-%d %H:%M:%S")

#### Participation by Time on Twitter
twitter_plot <- ggplot(data = twitter, aes(x=twitter_time)) +
# Twitter ------------------------------
twitter_plot <- ggplot(data = tweets, aes(x = as.POSIXct(twitter_time))) +
geom_histogram(aes(fill=..count..), binwidth=60*180) +
scale_x_datetime("Date") +
scale_y_continuous("Frequency") +
ggtitle("Participation on Twitter") +
scale_fill_gradient("Count", low="#002864", high="#329cc3")

#### Participation by Time in the Fediverse
mastodon_plot <- ggplot(data = mastodon, aes(x=mastodon_time)) +
# Fediverse ----------------------------
mastodon_plot <- ggplot(data = toots, aes(x = as.POSIXct(toots$fdat))) +
geom_histogram(aes(fill=..count..), binwidth=60*180) +
scale_x_datetime("Date") +
scale_y_continuous("Frequency") +
ggtitle("Participation in the Fediverse") +
scale_fill_gradient("Count", low="#640000", high="#FF0000")

#### Export / Save plots as PDF
pdf(file="./plots/ilfs-participation-by-date.pdf", width=14, height=7)
# Save plots ---------------------------
filename <- paste0("../plots/timeplot_",
gsub(x = Sys.time(), pattern = "-.*", replacement = ""),
".pdf")
pdf(file = filename, width = 14, height = 7)
grid.arrange(twitter_plot, mastodon_plot, nrow = 2)
dev.off()
# }}}

pdf(file = "test_platform.pdf", height = 7, width = 9)
part1_plot
# Wordcloud --------------------------------------------------------------------

# Gathering mentions -------------------
mentions <- c(
# Toots: Hashtags
unlist(strsplit(x = as.character(toots$htag), split = ",")),
# Tweets: Hashtags
unlist(strsplit(x = as.character(tweets$htag), split = ",")),
# Toots: Mentions
unlist(strsplit(x = as.character(toots$ment), split = ",")),
# Tweets: Mentions
unlist(strsplit(x = as.character(tweets$ment), split = ","))
) %>%
gsub(pattern = "NA", replacement = NA) %>%
na.omit() %>%
gsub(pattern = "@|#|\\!", replacement = "") %>%
tolower()

# valid mentions -----------------------
ment_acc <- c("freesoftware", "ilovefs", "foscon", "fsfe", "debian", "linux",
"android", "gnu", "fdroid", "fsf", "fosdem", "publiccode",
"framasoft", "videolan", "opensrcdesign", "kdecommunity", "kde",
"libreoffice", "official_php", "mediawiki", "opensuse",
"centosproject", "gauteh", "free", "datalabx")
mentions <- mentions[mentions %in% ment_acc]

# color assignment ---------------------
wcol <- rainbow(length(unique(mentions))) %>%
sub(pattern = "FF$", replacement = "99")

# Plot results -------------------------
set.seed(1337) # reproducibility!
filename <- paste0("../plots/wordcloud_",
gsub(x = Sys.time(), pattern = "-.*", replacement = ""),
".pdf")
pdf(file = filename, height = 4, width = 4)
wordcloud(mentions, colors = wcol, min.freq = 1, rot.per = 0.25,
random.color = TRUE, random.order = TRUE)
dev.off()

pdf(file = "test_instance.pdf", height = 7, width = 12)
part2_plot
dev.off()
# EOF plotte.R
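
On the coord_polar() step that the comments above flag as unclear: geom_rect() draws stacked rectangles between xmin = 3 and xmax = 4, coord_polar(theta = "y") wraps the y axis around a circle so each rectangle becomes a ring segment, and xlim(c(0, 4)) keeps the region from 0 to 3 empty, which produces the donut hole. A self-contained sketch with toy numbers (not the campaign data):

library(ggplot2)
toy <- data.frame(category = c("A", "B"), count = c(30, 70))  # toy counts
toy$ymax <- cumsum(toy$count)             # upper edge of each ring segment
toy$ymin <- c(0, head(toy$ymax, -1))      # lower edge of each ring segment
ggplot(toy, aes(fill = category, ymin = ymin, ymax = ymax, xmin = 3, xmax = 4)) +
  geom_rect() +                           # stacked bars from x = 3 to x = 4
  coord_polar(theta = "y") +              # map y onto the angle: bars become a ring
  xlim(c(0, 4)) +                         # x in [0, 3] stays empty: the hole
  theme_void()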

scripts/word_cloud.py  +0 -24

@@ -1,24 +0,0 @@
################################################################################
# Copyright (c) 2018 Free Software Foundation Europe e.V. <contact@fsfe.org>
# Author 2018 Vincent Lequertier <vincent@fsfe.org>
# SPDX-License-Identifier: GPL-3.0
################################################################################

import re
import random

def get_words_from_string(s):
return re.findall(re.compile('\w+'), s)

def scrambled(orig):
dest = orig[:]
random.shuffle(dest)
return dest

# This is mentions and hashtag from fediverse and twitter
text = "BudgieDesktop FreeDos AFWall AFWall WestNordOst WestNordOst OpenStreetMap OpenStreetMap OpenStreetMap Docker Ubuntu Ubuntu Ubuntu Ubuntu Ubuntu Ubuntu Ubuntu Ubuntu Virtualbox PHP PHP PHP PHP PHP PHP Perlanet Perlanet Shaarli Shaarli Framasoft Framasoft Framasoft Framasoft Framasoft Framasoft Framasoft Framasoft Framasoft Framasoft Framasoft Framasoft BSD KDE KDE KDE KDE KDE KDE KDE KDE KDE KDE KDE KDE KDE KDE KDE KDE KDE Fdroid Fdroid Linux Linux Linux Linux Linux Linux Linux Linux Linux Linux Linux Linux Linux Linux Linux Linux Linux Linux Linux Linux Kalzium GNU GNU GNU GNU GNU GNU Python Python Plone Firefox Firefox Firefox Firefox Firefox Firefox Firefox Firefox Firefox Firefox Firefox Firefox Firefox Firefox Firefox Firefox Gentoo LibreOffice LibreOffice LibreOffice LibreOffice LibreOffice LibreOffice LibreOffice LibreOffice LibreOffice LibreOffice LibreOffice LibreOffice LibreOffice LibreOffice LibreOffice LibreOffice OpenSUSE OpenSUSE OpenSUSE OpenSUSE OpenSUSE OpenSUSE OpenSUSE OpenSUSE OpenSUSE OpenSUSE OpenSUSE OpenSUSE OpenSUSE OpenSUSE OpenSUSE Mutt VLC VLC VLC VLC VLC VLC VLC VLC VLC VLC VLC Anki Unity Thunderbird Thunderbird Thunderbird Thunderbird Xtext RedHat RedHat RedHat Debian Debian Debian Debian Debian Debian Debian Debian Debian Debian Debian Debian Debian Debian Debian Debian Debian Debian Debian Rproject OpenOffice OpenOffice Audacity Audacity Audacity Blender Blender Blender Blender AstroidMail AstroidMail AstroidMail AstroidMail OJS Ring Replicant Nextcloud Nextcloud Nextcloud Nextcloud Nextcloud Nextcloud Mastodon RPM Diaspora Archlinux Archlinux Archlinux AntennaPod AntennaPod AntennaPod CentOS CentOS Scribus Scribus Scribus Inkscape GIMP GIMP GIMP GIMP GIMP SUSE Canonical Fedora Fedora Fedora Fedora Fedora ElementaryOS GNUmax Xubuntu Kubuntu Kubuntu WordPress WordPress WordPress UBPorts UBPorts jspwiki Wallabag Antergos LineageOS LineageOS Volumio Kodi MediathekView Signal Signal Signal Gvsig Gvsig Gvsig Gvsig Gvsig Emacs UbuntuMate FreeBSD SnapCraft TOR TOR Go Rust Rust GNOME GNOME GNOME GNOME GNOME Typo3 MariaDB 0AD StandardNotes GNUcash OpenBuildService Mozilla Wire KeePassX NodeJS Shutter Eclipse OSMand Syncthing Transportr OpenMensa Twidere Weechat RocketChat ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS ILoveFS"

words_scramnbled = scrambled(get_words_from_string(text))
words_scramnbled = ' '.join(words_scramnbled)

print(words_scramnbled)
