######################################################################################################
# 0. Select dataset
######################################################################################################
library(reticulate)
library(here)
library(dplyr)
library(foreach)
library(doParallel)
library(tidytext)
library(dbscan)
library(ggplot2)
library(gridExtra)
library(cuRe)
library(stm)
library(mgm)
save.image()
######################################################################################################
# 1. Natural language processing (emotions)
######################################################################################################
# Note: Python must be installed somewhere on your computer for this part.
library(reticulate)
library(here)
library(dplyr)
reticulate::conda_remove(envname = "r-reticulate") #if you installed packages incorrectly, delete your whole environment and start over
reticulate::py_list_packages(envname = "r-reticulate") #check which packages you have
reticulate::py_config() # initializes (and, if needed, creates) your r-reticulate environment
reticulate::py_available() # should return TRUE
# Import tensorflow first, then transformers
reticulate::py_install("transformers", pip = TRUE)
reticulate::py_install("torch", pip = TRUE)
reticulate::py_install("tensorflow==2.10", pip = TRUE, user = TRUE) # has to be version 2.10
transformers <- reticulate::import("transformers")
tensorflow <- reticulate::import("tensorflow")
Torch <- reticulate::import("torch")
tweetnlp <- reticulate::import("tweetnlp")
# If the installs went wrong, remove the packages (including pytorch) from the conda environment:
reticulate::conda_remove('r-reticulate', packages = c('tensorflow'))
reticulate::conda_remove('r-reticulate', packages = c('transformers'))
reticulate::conda_remove(envname = "r-reticulate", packages = "pytorch")
reticulate::conda_remove(envname = "r-reticulate", packages = "tweetnlp")
reticulate::conda_install(envname = "r-reticulate", packages = "tensorflow==2.10", user = TRUE)
reticulate::conda_install(envname = "r-reticulate", packages = "transformers", user = TRUE)
reticulate::conda_install(envname = "r-reticulate", packages = "torch", user = TRUE)
reticulate::conda_install(envname = "r-reticulate", packages = "tweetnlp", user = TRUE)
reticulate::py_install("tweetnlp", pip = TRUE, user = TRUE)
reticulate::py_install("charset-normalizer==3.1.0", pip = TRUE, user = TRUE)
tweetnlp <- reticulate::import("tweetnlp")
# Load the tokenizer and model
pipe = transformers$pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest", return_all_scores=TRUE)
classifier <- tweetnlp$load_model("topic_classification", model_name = "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest")
# You can ignore the following warning: "UserWarning: `return_all_scores` is now deprecated, use `top_k=1` if you want similar functionality"
model <- tweetnlp$load_model("topic_classification", model_name = "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest")
pipe("I bet everything will work out in the end :)") # it works
data <- read.csv("C:/Users/Brent/Documents/GitHub/bdss-project/Assignment3/example.csv") # this dataset is simplified to only the text and creation time
data <- read.csv("C:/Users/Brent/Documents/Psychology/KUL/Psychologie/June/resultsmarch2junecorona.csv")
data <- data[c("Tweet", "created_at", "text")]
df2 <- data
remove(df_clustered)
# Test code for the first row:
# get the labels (emotions) from the classifier's output list
# for the first row of the dataframe
result <- pipe(df2[1, "Tweet"])
labels <- unlist(lapply(result[[1]], function(x) x[[1]]))
score <- lapply(result[[1]], function(x) x[[2]])
df2[, labels] <- NA
df2[1, labels] <- score # df2 should now have scores for the first row
# Now see if we can iterate over rows.
# Subset so you don't have to wait more than 5 minutes to see if your trials worked:
df3 <- df2[49900:50000,]
remove(remaining_data)
df3 <- df2
# Run over all rows
library(foreach)
library(doParallel)
install.packages("tidytext")
library(tidytext)
library(dplyr)
cl <- makeCluster(detectCores() - 1, type = "SOCK")
registerDoParallel(cl)
system.time({
  df3 <- foreach(i = 1:nrow(df3), .packages = "tidytext", .inorder = FALSE) %do% {
    # note: %do% runs sequentially; the Python 'pipe' object cannot be exported to parallel workers
    tweet <- df3$Tweet[i]
    result <- pipe(tweet)
    labels <- unlist(lapply(result[[1]], function(x) x[[1]]))
    score <- unlist(lapply(result[[1]], function(x) x[[2]]))
    df3[i, labels] <- score
    df3[i, ]
  }
  df3 <- do.call(rbind, df3)
})
write.csv(df3, "df3_june.csv", row.names = FALSE)
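# Scoring one tweet per call is slow. Hugging Face pipelines also accept a list
# of texts, so scoring in chunks may be considerably faster. An untested sketch
# (the chunk size is arbitrary):
chunk_size <- 64
for (start in seq(1, nrow(df3), by = chunk_size)) {
  idx <- start:min(start + chunk_size - 1, nrow(df3))
  results <- pipe(as.list(df3$Tweet[idx])) # one result list per tweet
  for (k in seq_along(idx)) {
    labels <- unlist(lapply(results[[k]], function(x) x[[1]]))
    scores <- unlist(lapply(results[[k]], function(x) x[[2]]))
    df3[idx[k], labels] <- scores
  }
}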
######################################################################################################
# 2. Cluster analysis
######################################################################################################
install.packages("dbscan")
library(dbscan)
library(ggplot2)
# Assuming your dataset is stored in 'df1'
# Convert your dataset to a matrix (if it's not already)
df1<- data[c("surprise", "trust", "sadness", "joy","love","fear","anger","pessimism","anticipation","disgust","optimism")]
df1 <- scale(df1)
df1 <- as.data.frame(df1)
df1 <- df1[1:10000,]
df1 <- as.matrix(df1)
# Set the parameters for DBSCAN
eps <- 0.15 # Epsilon: maximum distance between points to be considered neighbors
minPts <- 25 # Minimum number of points in a cluster
# Notes from parameter trials (eps and minPts):
# 0.1 and 10: cluster 1 = anger and disgust (36000); cluster 2 = sadness and pessimism (1900); cluster 9 = positivity (5000)
# 0.15 and 15: cluster 1 = anger; cluster 2 = sadness and pessimism; cluster 3 = positivity
# 0.2 and 30 is fine
# 0.2 and 40 is also OK, but the previous is preferred -> c1: 32000 positive; c2: 73500 anger and disgust; c3: 3400 fear and pessimism; c4: 7700 with all emotions low
# 0.15 and 25 is nice -> c1: anger and disgust (50000); c2: sadness and pessimism (4000); c5: joy, trust, love, optimism (5000); c6: joy, optimism, trust, love (8300); c6 is more extreme (higher) than c5 on joy, love, and trust
# 0.4 and 0.3 is clearly not a good solution
# 0.3 and 0.25 is clearly not a good solution
# 0.2 and 25 is okay, but only c1 (42000) and c2 (89000)
# 0.2 and 20 is also very nice, but just 2 large clusters: c1 (61300) and c2 (95000)
# 0.15 and 20 gives c1 (58000), c2 (2300), c3 (18800)
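# The trials above were run by hand; a small sketch that automates the same grid
# search (the grids below are illustrative, not the exact values tried above):
grid <- expand.grid(eps = c(0.1, 0.15, 0.2), minPts = c(10, 15, 20, 25))
for (g in seq_len(nrow(grid))) {
  res <- dbscan::dbscan(df1, eps = grid$eps[g], minPts = grid$minPts[g])
  cat("eps =", grid$eps[g], "| minPts =", grid$minPts[g],
      "| clusters =", length(unique(res$cluster)) - 1, "\n")
  print(table(res$cluster)) # cluster 0 = noise
}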
# Run DBSCAN
dbscan_result <- dbscan(df1, eps = eps, minPts = minPts)
# Print the clusters found
print(dbscan_result$cluster)
# Print the noise points (not assigned to any cluster)
print(dbscan_result$cluster[dbscan_result$cluster == 0])
# Print the core points (points that have at least 'minPts' neighbors)
print(dbscan_result$cluster[dbscan_result$cluster != 0])
# Print the total number of clusters found
num_clusters <- length(unique(dbscan_result$cluster)) - 1 # Exclude noise points
print(paste("Number of clusters found:", num_clusters))
# Create a dataframe combining the original data and the cluster labels
df_clustered <- data.frame(df1, Cluster = dbscan_result$cluster)
# Filter out the noise points (cluster 0)
df_clustered <- df_clustered %>% filter(Cluster != 0)
# Visualize the cluster frequencies using a histogram
print(ggplot(df_clustered, aes(x = Cluster)) +
        geom_histogram(fill = "steelblue", color = "black") +
        labs(x = "Cluster", y = "Frequency") +
        ggtitle("Histogram of Cluster Frequencies"))
table(df_clustered$Cluster)
# Add a column with the cluster labels to the original dataset
data$Cluster <- dbscan_result$cluster
# Print 10 random tweets from cluster 1
cluster1_tweets <- data[data$Cluster == 1, "text"]
print(sample(cluster1_tweets, 10))
# Print 10 random tweets from cluster 2
cluster2_tweets <- data[data$Cluster == 2, "text"]
print(sample(cluster2_tweets, 10))
# Print 10 random tweets from cluster 5
cluster5_tweets <- data[data$Cluster == 5, "text"]
print(sample(cluster5_tweets, 10))
# Print 10 random tweets from cluster 6
cluster6_tweets <- data[data$Cluster == 6, "text"]
print(sample(cluster6_tweets, 10))
# Visualize
# Install and load necessary packages
install.packages("ggplot2")
library(ggplot2)
install.packages("gridExtra")
library(gridExtra)
# Create a dataframe combining the original data and the cluster labels
df_clustered <- data.frame(df1, Cluster = dbscan_result$cluster)
# Create density plots per cluster for the 11 variables
clusters_to_plot <- unique(df_clustered$Cluster)
# Best 6 clusters
library(gridExtra)
# Create a dataframe combining the original data and the cluster labels
df_clustered <- data.frame(df1, Cluster = dbscan_result$cluster)
# Create density plots per cluster for the 11 variables
clusters_to_plot <- unique(df_clustered$Cluster)[1:6] # Only look at the first 6 clusters
for (i in clusters_to_plot) {
  cluster_data <- df_clustered[df_clustered$Cluster == i, ]
  cluster_data <- cluster_data[, !colnames(cluster_data) %in% c("Cluster")]
  # Create a density plot for each variable in the cluster
  density_plots <- lapply(colnames(cluster_data), function(variable) {
    ggplot(cluster_data, aes(x = .data[[variable]], fill = variable)) +
      geom_density(color = "black") +
      labs(x = variable, y = "Density", fill = variable) +
      ggtitle(paste("Cluster", i, "-", variable))
  })
  # Combine density plots for all variables into one figure
  combined_plot <- grid.arrange(grobs = density_plots, ncol = 4)
  # Print the combined plot for the cluster
  print(combined_plot)
}
# Fancier plots:
color_palette <- c("surprise" = "blue", "trust" = "green", "sadness" = "red",
"joy" = "orange", "love" = "purple", "fear" = "yellow",
"anger" = "pink", "pessimism" = "cyan", "anticipation" = "magenta",
"disgust" = "brown", "optimism" = "gray")
# Define a function to calculate the color intensity based on scale values
calculate_color_intensity <- function(values) {
normalized_values <- (values - min(values)) / (max(values) - min(values))
intensity <- 0.5 + normalized_values * 0.5 # Adjust the scale of intensity as needed
return(intensity)
}
# Loop over the clusters and variables to create density plots
for (i in clusters_to_plot) {
cluster_data <- df_clustered[df_clustered$Cluster == i, ]
cluster_data <- cluster_data[, !colnames(cluster_data) %in% c("Cluster")]
  # Create a density plot for each variable in the cluster
density_plots <- lapply(colnames(cluster_data), function(variable) {
intensity <- calculate_color_intensity(cluster_data[[variable]])
color <- color_palette[variable]
cluster_data$fill_variable <- as.factor(intensity)
ggplot(cluster_data, aes(x = .data[[variable]], fill = fill_variable)) +
geom_density(color = "black") +
scale_fill_manual(values = color, guide = "none") +
labs(x = variable, y = "Density") +
ggtitle(paste("Cluster", i, "-", variable))
})
  # Combine density plots for all variables into one figure
combined_plot <- grid.arrange(grobs = density_plots, ncol = 4)
  # Print the combined plot for the cluster
print(combined_plot)
}
######################################################################################################
# 2.1 Another clustering method
######################################################################################################
# Hierarchical clustering
# Note: this code does not work -- the cuRe package on CRAN fits parametric cure
# (survival) models; it does not implement the CURE clustering algorithm.
cure(df1)
# Perform CURE clustering
cure_result <- cure(df1)
# Get the cluster assignments
cluster_assignments <- cure_result$cluster
# Print the number of clusters found
num_clusters <- length(unique(cluster_assignments))
print(paste("Number of clusters:", num_clusters))
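# Since the CURE call above does not run, a working hierarchical alternative with
# base R, as a sketch (the subsample size, Ward linkage, and k = 6 are arbitrary
# choices, not taken from the analysis above):
sub <- df1[sample(nrow(df1), 2000), ]
hc <- hclust(dist(sub), method = "ward.D2")
cluster_assignments <- cutree(hc, k = 6)
table(cluster_assignments)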
# Not hierarchical, but another clustering method
library(mclust)
# Perform model-based clustering
mclust_result <- Mclust(df1)
# Get the cluster assignments
cluster_labels <- mclust_result$classification
# Print the cluster labels
print(cluster_labels)
save(mclust_result, file = "mclust_result.RData")
summary(cluster_labels)
# Access cluster-specific information
cluster_means <- mclust_result$parameters$mean
cluster_covs <- mclust_result$parameters$variance$sigma # covariances live under $variance in mclust
# Visualize variable distributions for each cluster
for (i in unique(cluster_labels)) {
  cluster_data <- as.data.frame(df1[cluster_labels == i, ]) # df1 is a matrix at this point
  # Create density plots for each variable in the cluster
  density_plots <- lapply(colnames(cluster_data), function(variable) {
    ggplot(cluster_data, aes(x = .data[[variable]])) +
      geom_density(color = "black", fill = "steelblue") +
      labs(x = variable, y = "Density") +
      ggtitle(paste("Cluster", i, "-", variable))
  })
  # Combine density plots for all variables into one figure
  combined_plot <- gridExtra::grid.arrange(grobs = density_plots, ncol = 3)
  # Print the combined plot for the cluster
  print(combined_plot)
}
# Assess the goodness of the clustering solution
BIC <- mclust_result$bic # BIC of the selected model ($BIC holds the full table)
ICL <- mclust_result$icl
# Print BIC and ICL values
print(paste("BIC:", BIC))
print(paste("ICL:", ICL))
######################################################################################################
# 3. Topic modeling
######################################################################################################
# Topic modeling ----------------------------------------------------------
library(stm)
data <- sampleddata10
remove(sampleddata10)
# Remove hashtags and links
remove1 <- function(text) {
  # Remove links
  text <- gsub("http\\S+", "", text)
  text <- gsub("www\\.\\S+", "", text)
  # Remove mentions
  text <- gsub("@[A-Za-z0-9_]+", "", text)
  # Remove hashtags
  text <- gsub("#[A-Za-z0-9_]+", "", text)
  # Remove other patterns if needed
  text <- gsub("like", "", text)
  text <- gsub("good", "", text)
  return(text)
}
data$Tweet <- sapply(data$Tweet, remove1)
###################################################
# Week intervals
###################################################
# Now we need intervals per week for stm
library(lubridate)
# Convert 'created_at' column to POSIXct format
data$created_at1 <- ymd_hms(data$created_at, tz = "UTC")
# Define your starting date
starting_date <- ymd_hms("2020-05-09 00:00:00", tz = "UTC")
# Calculate the number of weeks from the starting date
data$weeks <- floor(as.numeric(difftime(data$created_at1, starting_date, units = "secs")) / (7 * 24 * 60 * 60)) + 1
# Calculate the number of 3-day intervals from the starting date
# (note: the variable is named 'fourdayint', but the interval length is 3 days)
data$fourdayint <- floor(as.numeric(difftime(data$created_at1, starting_date, units = "secs")) / (3 * 24 * 60 * 60)) + 1
# Calculate the number of 1-day intervals from the starting date
data$onedayint <- floor(as.numeric(difftime(data$created_at1, starting_date, units = "secs")) / (1 * 24 * 60 * 60)) + 1
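# A quick sanity check of the interval arithmetic (illustrative date): a tweet
# from exactly 7 days after the start should fall in week 2.
floor(as.numeric(difftime(ymd_hms("2020-05-16 00:00:00", tz = "UTC"),
                          starting_date, units = "secs")) / (7 * 24 * 60 * 60)) + 1 # = 2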
# New method
# Define your starting date
starting_date <- ymd_hms("2020-05-09 00:00:00", tz = "UTC")
# Create intervals of 3 days
interval_length <- 3 * 24 * 60 * 60 # 3 days in seconds
data$fourdayint <- cut(data$created_at1,
                       breaks = seq(starting_date, max(data$created_at1) + interval_length, by = interval_length),
                       labels = FALSE)
####################################################
#######################################################################################
# STM
#######################################################################################
install.packages(c("textclean", "tm", "SnowballC"))
load("C:/Users/Erik/Documents/Brent/USB-station/stats thesis/batch12notdoneyet.RData")
load("C:/Users/Brent/Documents/Psychology/KUL/thesisstatisticsdata/BLM(original-08_07).RData")
# Load the required libraries
library(textclean)
library(stm)
library(foreach)
library(doParallel)
library(tm)
library(SnowballC)
# Set the number of cores to use for parallel processing
num_cores <- 10
# Register parallel backend with the desired number of cores
registerDoParallel(cores = num_cores)
# Preprocess the text data
cleaned_data <- tolower(data$Tweet) # Convert text to lowercase
cleaned_data <- replace_contraction(cleaned_data) # Replace contractions (e.g., I'm -> I am)
cleaned_data <- replace_number(cleaned_data) # Replace numbers
cleaned_data <- replace_symbol(cleaned_data) # Replace symbols
cleaned_data <- strip_whitespace(cleaned_data) # Remove leading/trailing white spaces
# Create a data frame with the 'fourdayint' column as metadata
metadata <- data.frame(days = data$fourdayint, index = 1:nrow(data))
processed <- textProcessor(documents = cleaned_data, metadata = metadata, lowercase = TRUE, removestopwords = TRUE, language = "en", striphtml = T, wordLengths = c(3, Inf))
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)
docs <- out$documents
vocab <- out$vocab
meta <- out$meta
out <- prepDocuments(processed$documents, processed$vocab, processed$meta, lower.thresh = 100) # removes infrequent terms depending on the user-set parameter lower.thresh
system.time({poliblogPrevFit <- stm(documents = out$documents, vocab = out$vocab, K = 15, prevalence = ~days, max.em.its = 75, data = out$meta, init.type = "Spectral")})
install.packages(c("Rtsne", "rsvd", "geometry"))
library(rsvd)
library(geometry)
library(Rtsne)
system.time({storage <- searchK(out$documents, out$vocab, K = 0, prevalence =~ days, data = out$meta)}) ## it took 21 hours -> the result is 58 topics
system.time({Fit <- stm(documents = out$documents, vocab = out$vocab, K = 57, prevalence =~ days, max.em.its = 10, data = out$meta, init.type = "Spectral")})
save.image(file="BLM.RData")
install.packages("wordcloud") library(wordcloud)
plot(Fit, type = "summary", xlim = c(0, 0.2)) plot(Fit, type = "summary", xlim = c(0, 0.3),xlim = c(0, 0.3) ) par(mar = c(5.1,4.1,5.1,2.1)) plot(Fit, type = "summary", xlim = c(0, 0.15))
############## top 20 topics ######################
par(mfrow = c(1, 1))
plot(Fit, type = "summary", xlim = c(0, 0.3))
############## top 24 topics ######################
par(mfrow = c(3, 2), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0))
cloud(Fit, topic = 1, scale = c(4, 1))
mtext("Topic 12", side = 4, line = 0)
cloud(Fit, topic = 12, scale = c(4, 1))
mtext("Topic 1", side = 2, line = 2)
cloud(Fit, topic = 36, scale = c(4, 1))
mtext("Topic 41", side = 4, line = 0)
cloud(Fit, topic = 41, scale = c(4, 1))
mtext("Topic 36", side = 2, line = 2)
cloud(Fit, topic = 31, scale = c(2, 1)) # positivity about BLM (proud, love, encourage, hope, support, continue)
mtext("Topic 16", side = 4, line = 0)
cloud(Fit, topic = 16, scale = c(4, 1))
mtext("Topic 31", side = 2, line = 2)
cloud(Fit, topic = 39, scale = c(4, 1))
mtext("Topic 10", side = 4, line = 0)
cloud(Fit, topic = 10, scale = c(4, 1))
mtext("Topic 39", side = 2, line = 2)
cloud(Fit, topic = 51, scale = c(4, 1))
mtext("Topic 40", side = 4, line = 0)
cloud(Fit, topic = 40, scale = c(4, 1))
mtext("Topic 51", side = 2, line = 2)
cloud(Fit, topic = 27, scale = c(4, 1))
mtext("Topic 3", side = 4, line = 0)
cloud(Fit, topic = 3, scale = c(4, 1))
mtext("Topic 27", side = 2, line = 2)
cloud(Fit, topic = 21, scale = c(4, 1))
mtext("Topic 26", side = 4, line = 0)
cloud(Fit, topic = 26, scale = c(4, 1))
mtext("Topic 21", side = 2, line = 2)
cloud(Fit, topic = 56, scale = c(4, 1))
mtext("Topic 20", side = 4, line = 0)
cloud(Fit, topic = 20, scale = c(4, 1))
mtext("Topic 56", side = 2, line = 2)
cloud(Fit, topic = 14, scale = c(4, 1))
mtext("Topic 55", side = 4, line = 0)
cloud(Fit, topic = 55, scale = c(4, 1))
mtext("Topic 14", side = 2, line = 2)
# first topic and 20th topic
cloud(Fit, topic = 30, scale = c(4, 1)) # [stay home] -> government's campaign (stay, home, safe, people, away, everyone, work, protect)
mtext("Topic 32", side = 4, line = 0)
cloud(Fit, topic = 32, scale = c(4, 1))
mtext("Topic 30", side = 2, line = 2)
cloud(Fit, topic = 29, scale = c(4, 1))
mtext("Topic 44", side = 4, line = 0)
cloud(Fit, topic = 44, scale = c(4, 1))
mtext("Topic 29", side = 2, line = 2)
cloud(Fit, topic = 23, scale = c(4, 1))
mtext("Topic 43", side = 4, line = 0)
cloud(Fit, topic = 43, scale = c(4, 1))
mtext("Topic 23", side = 2, line = 2)
par(mfrow=c(1,1))
############################################### 11-08-2023
# Topics with most frequent words
# Examine top words for each topic
library(stm)
library(ggplot2)
# Assuming 'Fit' is your STM model
topic_number <- 12
n_top_words <- 10
# Get log probabilities for the specific topic (logbeta is a K x V matrix, so index the row)
log_word_probs <- Fit$beta$logbeta[[1]][topic_number, ]
# Convert log probabilities to actual probabilities
word_probs <- exp(log_word_probs)
# Get the top N words
top_words_indices <- order(word_probs, decreasing = TRUE)[1:n_top_words]
top_words <- Fit$vocab[top_words_indices]
top_probs <- word_probs[top_words_indices]
# Create a data frame and plot
top_words_for_topic <- data.frame(word = top_words, probability = top_probs)
ggplot(data = top_words_for_topic, aes(x = reorder(word, probability), y = probability)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle(paste("Top Words for Topic", topic_number)) +
  xlab("Words") +
  ylab("Probability")
# Install and load required packages
# (Caution: this coherence block is unverified; tmtoolkit is primarily a Python
# library, so 'calculate_coherence' may not be available in R.)
install.packages("tmtoolkit")
library(tmtoolkit)
# Get the topic-word probabilities from your 'Fit' model
topic_word_probs <- exp(Fit$beta$logbeta[[1]])
# Calculate topic coherence
coherence_scores <- calculate_coherence(topic_word_probs, method = "c_v", documents = out$documents)
# Plot topic coherence
plot(1:Fit$K, coherence_scores, type = "b", xlab = "Number of Topics", ylab = "Coherence Score", main = "Topic Coherence")
exclusivity_scores <- stm::exclusivity(Fit)
# Plot topic exclusivity
plot(exclusivity_scores, main = "Topic Exclusivity")
semco <- semanticCoherence(Fit, out$documents, M = 20)
semco5 <- semanticCoherence(Fit, out$documents, M = 5)
# Create a bar plot of semantic coherence scores
barplot(semco, xlab = "Topic Number", ylab = "Semantic Coherence", main = "Semantic Coherence for Topics")
barplot(semco5, names.arg = 1:57, xlab = "Topic Number", ylab = "Semantic Coherence", main = "Semantic Coherence for Topics", cex.names = 0.6)
topic_indices <- c(1:57)
coherence_values <- semco5[topic_indices]
# Print the extracted coherence values
print(coherence_values)
mean(coherence_values)
###########################################
# Now apply the model to the whole dataset in batches
batch_1 <- batch_11
remove(batch_11)
# Preprocess the text data
cleaned_data <- tolower(batch_1$Tweet)# Convert text to lowercase
# Define your starting date
batch_1$created_at1 <- ymd_hms(batch_1$created_at, tz = "UTC")
starting_date <- ymd_hms("2020-05-09 00:00:00", tz = "UTC")
# Create intervals of 3 days
interval_length <- 3 * 24 * 60 * 60 # 3 days in seconds
batch_1$fourdayint <- cut(batch_1$created_at1,
                          breaks = seq(starting_date, max(batch_1$created_at1) + interval_length, by = interval_length),
                          labels = FALSE)
# Create a data frame with the 'fourdayint' column as metadata
metadata2 <- data.frame(days = batch_1$fourdayint, index = 1:nrow(batch_1))
processed2 <- textProcessor(documents = cleaned_data, metadata = metadata2, lowercase = TRUE, removestopwords = TRUE, language = "en", striphtml = T)
out2 <- prepDocuments(processed2$documents, processed2$vocab, processed2$meta)
docs2 <- out2$documents
vocab2 <- out2$vocab
meta2 <- out2$meta
processed2 <- processed
out2 <- out
docs2 <- out2$documents
vocab2 <- out2$vocab
meta2 <- out2$meta
system.time({Fit <- stm(documents = out$documents, vocab = out$vocab, K = 57, prevalence =~ days, max.em.its = 10, data = out$meta, init.type = "Spectral")})
new <- prepDocuments(out2$documents, out2$vocab)
old.vocab <- out$vocab
aligned <- alignCorpus(new, old.vocab)
docs2 <- aligned$documents
# New option
aligned2 <- alignCorpus(out2, old.vocab)
# Apply the fitNewDocuments function
system.time({
  fit_new <- fitNewDocuments(model = Fit,
                             documents = aligned2$documents,
                             newData = aligned2$meta,
                             origData = meta, # metadata from the original fit
                             prevalence = ~ days,
                             betaIndex = NULL,
                             prevalencePrior = "Covariate",
                             contentPrior = "Average",
                             returnPosterior = FALSE,
                             returnPriors = FALSE,
                             designMatrix = NULL,
                             test = TRUE,
                             verbose = TRUE)
})
save.image(file="batch12.RData")
###################################################################### save the data 22/07
load("C:/Users/Erik/Documents/Brent/USB-station/stats thesis/BLM2.RData")
# Get the topic proportions
topic_proportions <- Fit$theta
# Convert the topic proportions to a data frame
topic_proportions_df <- as.data.frame(topic_proportions)
# Rename the columns to reflect the topic numbers
colnames(topic_proportions_df) <- paste0("topic", 1:ncol(topic_proportions_df))
# Get the names of the elements in the 'documents' list of 'aligned'
names_in_aligned <- names(docs)
lastname <- tail(names_in_aligned, n = 5)
print(lastname)
# Convert the names of 'aligned' to numeric
row_indices <- as.numeric(names_in_aligned)
# Add a new 'id' column to topic_proportions_df that contains these indices
topic_proportions_df$id2 <- row_indices
# Convert the 'id' column to numeric
batch_1$id2 <- as.numeric(row.names(batch_1))
# Merge batch_1 and topic_proportions_df based on the 'id' column
data_with_topics <- merge(batch_1, topic_proportions_df, by = "id2", all.x = TRUE)
# Save the data
write.csv(data_with_topics, "batch2p.csv")
# Check if it worked (column names follow the paste0("topic", ...) scheme above)
library(dplyr)
dta <- data_with_topics %>% select('text', 'topic1', 'topic12', 'topic36', 'topic41', 'topic31', 'topic16')
#########################################
# COVID
#########################################
par(mfrow = c(3, 2), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0))
cloud(Fit, topic = 8, scale = c(4, 1))
mtext("Topic 30", side = 4, line = 0)
cloud(Fit, topic = 30, scale = c(4, 1)) # [lockdown, quarantine]
mtext("Topic 8", side = 2, line = 2)
cloud(Fit, topic = 37, scale = c(4, 1)) # (shopping for) food, drinks
mtext("Topic 18", side = 4, line = 0)
cloud(Fit, topic = 18, scale = c(4, 1)) # [corona, deaths, chinese]
mtext("Topic 37", side = 2, line = 2)
cloud(Fit, topic = 51, scale = c(2, 1))
mtext("Topic 34", side = 4, line = 0)
cloud(Fit, topic = 34, scale = c(4, 1)) # topic related to [love]
mtext("Topic 51", side = 2, line = 2)
cloud(Fit, topic = 27, scale = c(4, 1)) # [leisure time] (multimedia such as: youtube, music, gym, album, radio, internet, netflix, disney)
mtext("Topic 20", side = 4, line = 0)
cloud(Fit, topic = 20, scale = c(4, 1)) # [coronavirus]
mtext("Topic 27", side = 2, line = 2)
cloud(Fit, topic = 43, scale = c(4, 1)) # [helping behavior] -> share, support, help, community, charity, donate, respect, vulnerable
mtext("Topic 33", side = 4, line = 0)
cloud(Fit, topic = 33, scale = c(4, 1)) # [pandemic]
mtext("Topic 43", side = 2, line = 2)
cloud(Fit, topic = 13, scale = c(4, 1)) # [uplifting and distracting]
mtext("Topic 16", side = 4, line = 0)
cloud(Fit, topic = 16, scale = c(4, 1)) # [Trump and the virus]
mtext("Topic 13", side = 2, line = 2)
cloud(Fit, topic = 49, scale = c(4, 1)) # [people not following COVID guidelines]
mtext("Topic 46", side = 4, line = 0)
cloud(Fit, topic = 46, scale = c(4, 1)) # [cognitive thinking] (think, believe, reason, thought, opinion, mind, seem)
mtext("Topic 49", side = 2, line = 2)
cloud(Fit, topic = 21, scale = c(4, 1)) # [cases] (case, new, number, report, confirm)
mtext("Topic 38", side = 4, line = 0)
cloud(Fit, topic = 38, scale = c(4, 1)) # [regulations] (rule, break, police, law, travel, ignore, fine, government, arrest, people, drive)
mtext("Topic 21", side = 2, line = 2)
cloud(Fit, topic = 9, scale = c(4, 1)) # [deaths] (death, coronavirus, die, rate, percent, increase, figure, thousand, suffer, ...)
mtext("Topic 28", side = 4, line = 0)
cloud(Fit, topic = 28, scale = c(4, 1))
mtext("Topic 9", side = 2, line = 2)
# first topic and 20th topic
par(mfrow = c(1, 2), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0))
cloud(Fit, topic = 29, scale = c(4, 1)) # [stay home] -> government's campaign (stay, home, safe, people, away, everyone, work, protect)
mtext("Topic 35", side = 4, line = 0)
cloud(Fit, topic = 35, scale = c(4, 1))
mtext("Topic 29", side = 2, line = 2)
par(mfrow = c(1, 1))
# More detailed (multiple metrics)
labelTopics(Fit, c(29, 8, 30,37,18,51,34,27,20,43,33,13,16,49,46,21,38,9,28,35))
topic_probabilities <- findThoughts(Fit)
thoughts6 <- findThoughts(Fit, texts = shortdoc, n = 2, topics = 6)$docs[[1]]
plotQuote(thoughts6, width = 30, main = "Topic 6")
head(shortdoc)
Topic5 <- findThoughts(Fit, out$text, topics = 5, n = 5)
Topic5
cl_data <- as.data.frame(cleaned_data)
thoughts <- findThoughts(Fit, data$Tweet, n = 1, topics = 1)
thoughts
###################### back to dataframe
doc_topics <- labelTopics(Fit, out$documents)
data_with_topic_probabilities <- cbind(cleaned_data, doc_topics)
# Label topics for each document
topic_labels <- labelTopics(processed)
# Merge the topic labels with the original 'cleaned_data' dataset
labeled_data <- cbind(cleaned_data, topic_labels)
################ covariate ##############
out$meta$rating <- as.factor(out$meta$rating)
prep <- estimateEffect(1:20 ~ dayfour, Fit, metadata = out$meta, uncertainty = "Global")
plot(prep, "day", method = "continuous", topics = 13, model = z, printlegend = FALSE, xaxt = "n", xlab = "Time (2008)")
# Create a document-term matrix
corpus <- textProcessor(documents = cleaned_data, metadata = data, lowercase = TRUE, removepunctuation = TRUE, removenumbers = TRUE, removestopwords = TRUE, stem = TRUE, wordLengths = c(3, Inf))
# Convert the preprocessed data into a document-feature matrix
doc_term_matrix <- stm::convert(corpus, to = "stm")
# Find the optimal number of topics using parallel processing
search_results <- foreach(k = seq(2, 10, 1), .combine = cbind) %dopar% {
  model <- stm(doc_term_matrix, K = k, verbose = FALSE)
  data.frame(k, searchK(model))
}
# Select the best number of topics based on the chosen metric
best_topics <- apply(search_results[2:4, ], 2, which.max)
# Perform topic modeling with the best number of topics
model <- stm(doc_term_matrix, K = best_topics)
# Print the top terms for each topic
labelTopics(model)
# Preprocessing
# Select the clusters and perform topic modelling
plotRemoved(processed$documents, lower.thresh = seq(1, 200, by = 100))
out <- prepDocuments(processed$documents, processed$vocab, processed$meta, lower.thresh = 15)
saveRDS(out, file = "STMAnger.rds")
saveRDS(processed, file = "STM2Anger.rds")
poliblogPrevFit <- stm(documents = out$documents, vocab = out$vocab, K = 20, prevalence = ~series, max.em.its = 75, data = out$meta, init.type = "Spectral")
poliblogPrevFit10topics <- stm(documents = out$documents, vocab = out$vocab, K = 10, prevalence = ~series, max.em.its = 40, data = out$meta, init.type = "Spectral")
poliblogSelect <- selectModel(out$documents, out$vocab, K = 20, prevalence = ~emo_anger, max.em.its = 75, data = out$meta, runs = 20, seed = 8458159)
labelTopics(poliblogPrevFit, c(6, 13, 18))
plot(poliblogPrevFit, type = "summary", xlim = c(0, 0.3))
cloud(poliblogPrevFit, topic = 12, scale = c(4, 1)) # politics
cloud(poliblogPrevFit, topic = 19, scale = c(5, 1))
cloud(poliblogPrevFit, topic = 9, scale = c(5, 1)) # things going on in the world & pandemic
cloud(poliblogPrevFit, topic = 1, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 8, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 1, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 16, scale = c(5, 1)) # coronavirus, nhs, selfish, food, irrespons, distance, social, isol, staff, ignor, home
cloud(poliblogPrevFit, topic = 11, scale = c(5, 1)) # random verbs
cloud(poliblogPrevFit, topic = 3, scale = c(5, 1)) # social media & conversation
cloud(poliblogPrevFit, topic = 5, scale = c(5, 1)) # leisure time & weather (possibly because walking is leisure)
cloud(poliblogPrevFit, topic = 14, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 4, scale = c(5, 1)) # some great British mockery
cloud(poliblogPrevFit, topic = 18, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 20, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 10, scale = c(5, 1)) # football
cloud(poliblogPrevFit, topic = 7, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 15, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 13, scale = c(5, 1)) # school, children, student, teacher, lesson, educ, video
cloud(poliblogPrevFit, topic = 1, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 17, scale = c(5, 1)) # politics: countries of the UK (Scotland, Northern Ireland, ...)
mod.out.corr <- topicCorr(poliblogPrevFit)
plot(mod.out.corr)
# Now with 10 topics
library(parallel)
ncores <- 10
cl <- makeCluster(ncores)
# Fit the STM model in parallel
fit <- parLapply(cl, 1:ncores, function(i) {
  stm(documents = out$documents, vocab = out$vocab, K = 10, prevalence = ~series, max.em.its = 75, data = out$meta, init.type = "Spectral")
})
# Stop the parallel processing environment
stopCluster(cl)
poliblogPrevFit2 <- searchK(documents = out$documents, vocab = out$vocab, K = c(5, 10, 15), prevalence = ~series, max.em.its = 50, data = out$meta, init.type = "Spectral", cores = 10)
poliblogPrevFit10topics <- stm(documents = out$documents, vocab = out$vocab, K = 10, prevalence = ~series, max.em.its = 40, data = out$meta, init.type = "Spectral")
plot(poliblogPrevFit10topics, type = "summary", xlim = c(0, 0.3))
cloud(poliblogPrevFit10topics, topic = 3, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 7, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 9, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 8, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 1, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 6, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 4, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 5, scale = c(4, 1))
save.image(file="Anger.RData")
# Fear
plot(poliblogPrevFit, type = "summary", xlim = c(0, 0.3))
cloud(poliblogPrevFit, topic = 18, scale = c(4, 1)) # anxiety related terms
cloud(poliblogPrevFit, topic = 12, scale = c(4, 1)) # anxiety related terms
cloud(poliblogPrevFit, topic = 13, scale = c(4, 1)) # anxiety related terms
cloud(poliblogPrevFit, topic = 15, scale = c(4, 1)) # coronavirus, covid, death, symptom, hospit, case, contract, pandem, test
cloud(poliblogPrevFit, topic = 11, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 19, scale = c(4, 1)) # also restrictions a bit
cloud(poliblogPrevFit, topic = 2, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 10, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 10, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 17, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 14, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 1, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 8, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 5, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 4, scale = c(4, 1))
mod.out.corr <- topicCorr(poliblogPrevFit)
plot(mod.out.corr)
View(mod.out.corr)
# Assuming 'Fit' is your STM model
plot(Fit, type="labels", topics=c(1,12,41,31,16,39,10,51,40,27,3,21,26,56,20,14,55,30,32,29,44,23,43,54,42,15,22,13,2,38,52,34,11,8,18,35,37,28,5,6,9,47,7,33,45,53,25,4,57,50,46,48,49,19,17,24)) # Adjust topics to the ones you want to see
plot(Fit, type="labels", topics=c(16,39,10,51,40,27,3,21,26,56,20,14,55,30)) plot(Fit, type="labels", topics=c(32,29,44,23,43,54,42,15,22,13,2,38,52,34)) plot(Fit, type="labels", topics=c(11,8,18,35,37,28,5,6,9,47,7,33,45,53)) plot(Fit, type="labels", topics=c(25,4,57,50,46,48,49,19,17,24)) plot(Fit, type="labels", topics=c())
top_words <- labelTopics(Fit, n=10)$prob
############################################################
# TV VAR
############################################################
######################## for model 1
n2020n2 <- alldataz_sum2[1191:nrow(alldataz_sum2), ]
rownames(n2020n2) <- seq(1263, 1262 + nrow(n2020n2))
whole_tseriesdata$cases[whole_tseriesdata$cases == 0] <- NA
mood <- n2020n2 %>% select(count_topic1, count_topic12, count_topic31, count_topic41,
                           avg_d_topic1, avg_d_topic12, avg_d_topic31, avg_d_topic41)
mood_labels <- colnames(mood[1:9])
mood_labels <- c("positive", "anxiety")
mood2 <- data.matrix(mood)
# 01-08-2023
time2 <- n2020n2 %>% select(Hourno, Dayno)
time2 <- data.matrix(time2)
time2 <- time %>% select(hourno, dayno, hour, day, month)
time2 <- data.matrix(time2)
sum(is.na(mood2))
sum(is.na(time2))
##########################################
# Which lags?
##########################################
ccf(mood$avg_d_topic1, mood$count_topic12, lag.max=200, plot=TRUE)
# Find where the relationship is negative
ccf_result <- ccf(mood$avg_d_topic1, mood$count_topic1, lag.max = 100, plot = FALSE)
negative_lags <- ccf_result$lag[ccf_result$acf < 0]
negative_acf <- ccf_result$acf[ccf_result$acf < 0]
# Calculate the 95% significance level (the usual +/- 1.96 / sqrt(N) band for a CCF)
significance_level <- qnorm((1 + 0.95)/2) / sqrt(length(mood$avg_d_topic1))
# Find negative and significant lags
significant_negative_lags <- negative_lags[abs(negative_acf) > significance_level]
ccf(mood$avg_d_topic1, mood$count_topic1, lag.max = 60, plot = TRUE)
abline(h = -significance_level, col = "blue", lty = 2) # Add line for -95% significance
points(significant_negative_lags, negative_acf[abs(negative_acf) > significance_level], col = "red", pch = 19)
library(lmtest)
# Test if avg_d_topic1 Granger-causes count_topic1
result <- grangertest(count_topic1 ~ avg_d_topic1, order = 1, data = mood) # You can change the order parameter
summary(result)
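# Since the lag order is a free parameter, a small sketch that tries several
# orders and collects the Granger test p-values (the 1:5 grid is arbitrary):
for (p in 1:5) {
  gt <- grangertest(count_topic1 ~ avg_d_topic1, order = p, data = mood)
  cat("order", p, ": p-value =", gt$`Pr(>F)`[2], "\n")
}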
##########################################
# Bandwidth selection
##########################################
set.seed(1)
bwSeq <- seq(0.1, 0.2, length = 10)
bw_object <- bwSelect(data = mood2,
                      type = rep("g", 9), # number = number of variables
                      level = rep(1, 9),
                      bwSeq = bwSeq,
                      bwFolds = 1,
                      bwFoldsize = 20,
                      modeltype = "mvar",
                      lags = 1,
                      scale = TRUE,
                      # when no timepoints are given, they are assumed to be equidistant
                      beepvar = time2$Hourno, # model per day -> use a variable with 1:121; model per hour -> use Hourno
                      dayvar = time2$Dayno,   # model per day -> use a variable with all 1's; model per hour -> use Dayno
                      pbar = TRUE)
bandwith <- bwSeq[which.min(bw_object$meanError)]
bandwith
bandwithdata0.01_0.1hour <- bw_object$meanError
bandwithdata0.1_1hour <- bw_object$meanError
bandwithdata0.1_1hour
bandwithdata0.01_0.1hour <- bw_object$meanError
bandwithdata0.01_0.1hour
bandwithdata0.1_0.2hour <- bw_object$meanError
bandwithdata0.1_0.2hour
bandwithdata0.01_0.1hour
bandwithdata0.1_1hour <-bw_object$meanError
bandwithdata1_10 <-bw_object$meanError
bandwithdata0.1_1 <-bw_object$meanError
bandwithdata0.01_0.1 <-bw_object$meanError
bandwithdata0.1_1hour
bandwithdata0.01_0.1hour
# Model per day
bandwithdata1_10
bandwithdata0.1_1
bandwithdata0.01_0.1
##################################
# Visualize bandwidth: plot the mean error against the bandwidth sequence
##################################
# New
# Combining all bandwidth sequences
bwSeq_combined <- c(seq(0.01, 0.1, length = 10), seq(0.1, 1, length = 10), seq(0.1, 0.2, length = 10))
# Combining all bandwidth mean error data
all_bandwidth_data <- c(bandwithdata0.01_0.1hour, bandwithdata0.1_1hour, bandwithdata0.1_0.2hour)
# Plotting the combined data
plot(bwSeq_combined, all_bandwidth_data, type = "n", xlab = "Bandwidths", ylab = "Mean Error", main = "Bandwidth Selection", xlim = c(0.01, 1), ylim = range(all_bandwidth_data), xaxt = "n")
# Custom x-axis ticks
breaks <- c(seq(0, 0.2, by = 0.02), seq(0.3, 1, by = 0.1))
axis(1, at = breaks, labels = TRUE, cex.axis = 0.6)
# Adding the individual data series
lines(seq(0.01, 0.1, length = 10), bandwithdata0.01_0.1hour, type = "b", pch = 20, col = "red")
lines(seq(0.1, 1, length = 10), bandwithdata0.1_1hour, type = "b", pch = 20, col = "blue")
lines(seq(0.1, 0.2, length = 10), bandwithdata0.1_0.2hour, type = "b", pch = 20, col = "green")
# Adding the legend
legend(x="topright", legend=c("Bandwidth 0.01-0.1", "Bandwidth 0.1-1", "Bandwidth 0.1-0.2"), col=c("red", "blue", "green"), lwd=1, pch=20, horiz=TRUE)
##############
# Visualize
##############
# Gaussian kernel function
gaussian_kernel <- function(t, te, sigma) {
  Zt <- (1 / (sqrt(2 * pi * sigma))) * exp(-((t - te)^2) / (2 * sigma^2))
  Zt_max <- max(sapply(1:100, function(t) (1 / (sqrt(2 * pi * sigma))) * exp(-((t - te)^2) / (2 * sigma^2))))
  return(Zt / Zt_max)
}
# Parameters
te <- 6 # Center of the Gaussian
b <- 1.3 # Bandwidth
time_points <- seq(1, 12, by = 0.1) # Using a finer granularity for a smoother curve
# Calculate weights for all time points
weights <- sapply(time_points, function(t) gaussian_kernel(t, te, b))
# Plot
plot(time_points, weights, type = "l", xlab = "Time Points", ylab = "Weights", main = paste("Weights for bandwidth b =", b))
# Make it prettier
# Libraries
library(ggplot2)
# Gaussian kernel function
gaussian_kernel <- function(t, te, sigma) {
  Zt <- (1 / (sqrt(2 * pi * sigma))) * exp(-((t - te)^2) / (2 * sigma^2))
  Zt_max <- max(sapply(1:100, function(t) (1 / (sqrt(2 * pi * sigma))) * exp(-((t - te)^2) / (2 * sigma^2))))
  return(Zt / Zt_max)
}
# Parameters
te <- 6 # Center of the Gaussian
b <- 1.3 # Bandwidth
time_points <- seq(1, 12)
# Calculate weights for all time points
weights <- sapply(time_points, function(t) gaussian_kernel(t, te, b))
# Create a data frame
data <- data.frame(Time = time_points, Weight = weights)
# Create a ggplot graph
p <- ggplot(data, aes(x = Time, y = Weight)) +
  geom_line(color = "steelblue", size = 1) +
  labs(x = "Time Points", y = "Weights", title = paste("Weights for bandwidth b =", b)) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
# Print the graph
print(p)
gaussian_kernel <- function(t, te, sigma) {
  Zt <- (1 / (sqrt(2 * pi) * sigma)) * exp(-((t - te)^2) / (2 * sigma^2))
  Zt_max <- max(sapply(1:100, function(t) (1 / (sqrt(2 * pi) * sigma)) * exp(-((t - te)^2) / (2 * sigma^2))))
  return(Zt / Zt_max)
}
# Parameters
te <- 6 # Center of the Gaussian
b <- 1.3 # Bandwidth
sigma <- 0.13
time_points <- seq(1, 12)
# Calculate weights for all time points
weights <- sapply(time_points, function(t) gaussian_kernel(t, te, b))
# Standard deviation value
std <- b
# Plot
plot(time_points, weights, type = "n", xlab = "Time Points", ylab = "Weights", main = paste("Weights for bandwidth b =", b), xaxt = "n")
# Fill area under the curve
polygon(c(time_points, rev(time_points)), c(weights, rep(0, length(weights))), col = "lightblue")
lines(time_points, weights, col = "steelblue", lwd = 2)
# Set x-axis tick marks
axis(1, at = seq(1, 12))
# Add vertical lines at +2 std and -2 std
abline(v = c(te - 2 * std, te + 2 * std), col = c("red", "red"), lty = 2)
# Add a legend
legend("topright", legend = paste("b =", b), col = "steelblue", lwd = 2)
# Add grid
grid()
# Gaussian kernel function
gaussian_kernel <- function(j, te, b) {
  weight <- (1 / (sqrt(2 * pi * b^2))) * exp(-((j - te)^2) / (2 * b^2))
  return(weight)
}
# Parameters
te <- 3 # Center of the Gaussian
b <- 0.2 # Bandwidth
time_points <- seq(1, 12)
# Calculate weights for all time points
weights <- sapply(1:length(time_points), function(j) gaussian_kernel(j, te, b))
# Plot
plot(time_points, weights, type = "l", xlab = "Time Points", ylab = "Weights", main = paste("Weights for bandwidth b =", b))
# Add grid
grid()
##########################################
# Model
##########################################
tvvar_obj <- tvmvar(data = mood2,
                    type = rep("g", 9), # 9 = number of variables
                    level = rep(1, 9),
                    lambdaSel = "CV",
                    lags = 1, # assumed: lag-1 VAR, matching the bwSelect call above
                    timepoints = mood$rank, # when no time points are given, they are assumed to be equidistant
                    estpoints = seq(0, 1, length = 100), # assumed: 100 estimation points, matching the plots below
                    bandwidth = bwSeq[which.min(bw_object$meanError)], # the bandwidth selected above
                    pbar = TRUE)
tvvar_obj
str(tvvar_obj)
summary(tvvar_obj)
str(tvvar_obj$tvmodels[[1]])
lambda_min_index_1 <- which.min(tvvar_obj$tvmodels[[1]]$fitobj$glmnet.fit$lambda)
coefficients_for_lambda_min_1 <- tvvar_obj$tvmodels[[1]]$fitobj$glmnet.fit$beta[, lambda_min_index_1]
print(possible_coefs)
########################################
# Reliability of parameter estimates
########################################
res_obj <- resample(object = tvvar_obj, data = mood, nB = 50, blocks = 10, seeds = 1:50, quantiles = c(0.05, 0.95))
res_obj
########################################
# Time-varying prediction error
########################################
pred_obj <- predict(object = tvvar_obj,
                    data = mood,
                    errorCon = c("R2", "RMSE"),
                    tvMethod = "weighted",
                    consec = time2$Hourno) # switch to Hourno for data per hour
pred_obj
pred_obj$tverrors
####################################################
# Time-varying Mixed Graphical Model (MGM)
####################################################
fit_mgm <- mgm(data = mood, type = rep("g",9), levels = rep(1,9), k = 3, lambdaSel = "CV", lambdaFolds = 10,ruleReg = "AND")
str(fit_mgm)
tv_coeffs <- fit_mgm$coef
summary(fit_mgm)
# Plotting time-varying coefficients
plot(1:nrow(tv_coeffs), tv_coeffs[, 1], type = "l", col = 1, ylim = range(tv_coeffs), xlab = "Time", ylab = "Coefficient Value", main = "Time-varying Coefficients")
# If you have more than one coefficient to plot
for (i in 2:ncol(tv_coeffs)) {
  lines(1:nrow(tv_coeffs), tv_coeffs[, i], col = i)
}
# Adding a legend to distinguish coefficients
legend("topright", legend = colnames(tv_coeffs), col = 1:ncol(tv_coeffs), lty = 1)
# Variable order: count_topic1, count_topic12, count_topic31, count_topic41,
#                 avg_d_topic1, avg_d_topic12, avg_d_topic31, avg_d_topic41, Count
fit_mgm$interactions$indicator
FactorGraph(object = fit_mgm, labels = mood_labels, PairwiseAsEdge = FALSE)
########################################
# Visualizing the tv VAR model
########################################
######################################################################################
# NEW visualization
######################################################################################
f_timeline_new <- function(length = .15, gap = .005, mar = c(0,0,0,0), ylim = c(-.1,.1), ytext = -.1, cex = 1) {
  par(mar = mar)
  plot.new()
  plot.window(xlim = c(0, 1), ylim = ylim)
  box()
  # arrows
  p_weeks <- c(394, 2160, 1367)
  bor_end <- c(0, cumsum(p_weeks) / sum(p_weeks))
  for (i in 1:3) {
    arrows(bor_end[i] + gap, 0, bor_end[i+1] - gap, code = 3, length = length, lwd = 1.5)
  }
  # text
  t_lengths <- p_weeks / sum(p_weeks)
  midpoints <- bor_end[-1] - t_lengths / 2
  text(midpoints, rep(ytext, 3), c("May 2020", "June 2020", "First 20 days of July 2020"), cex = 0.8)
  # lockdown
  points(0, rep(0, 1), pch = 20, cex = 1.5, col = "red") ## not sure about this!! (sum(p_weeks)*7-10) = 53, what is it?
  points(394, rep(0, 1), pch = 20, cex = 1.5, col = "orange")
  points(2554, rep(0, 3), pch = 4, cex = 1.5, col = "black") ## cross for network model 1
  points((42) / (sum(p_weeks)*7-5), pch = 4, cex = 1.5, col = "black") ## cross for network model 2
  points(c(42, 98) / (sum(p_weeks)*7-5), rep(0, 2), pch = 4, cex = 1.5, col = "black") ## cross for network model 3
}
# ----- Preprocessing ------
# Compute the mean model over time to create a decent layout
mean_wadj <- apply(tvvar_obj$wadj[, , 1, ], 1:2, mean)
par_ests <- tvvar_obj$wadj
ind_negative <- which(tvvar_obj$signs == -1, arr.ind = TRUE)
par_ests[ind_negative] <- par_ests[ind_negative] * -1
# Find parameters with the highest SD
wadj_ws <- tvvar_obj$wadj
wadj_ws[tvvar_obj$edgecolor == "red"] <- wadj_ws[tvvar_obj$edgecolor == "red"] * -1
parm_sds <- apply(wadj_ws, 1:2, sd)
parm_sds_mat <- matrix(NA, 12^2, 3)
counter <- 1
for (i in 1:9) { # change this to the number of variables
  for (j in 1:9) { # change this to the number of variables
    parm_sds_mat[counter, ] <- c(i, j, parm_sds[i, j])
    counter <- counter + 1
  }
}
parm_sds_mat_ord <- parm_sds_mat[order(parm_sds_mat[, 3], decreasing = TRUE), ]
# Variable order: count_topic1, count_topic12, count_topic31, count_topic41,
#                 avg_d_topic1, avg_d_topic12, avg_d_topic31, avg_d_topic41, Count
head(parm_sds_mat_ord) # six most time-varying parameters
parm_sds_mat
# ----- Plotting ------
library(qgraph)
figDir <- "C:/Users/Brent/Documents/Psychology/KUL/thesisstatisticsdata/mgm"
pdf(paste0(figDir, "/Fig_Application_mgm2.pdf"), width = 8, height = 7)
# 1) Define layout
lmat <- matrix(c(1, 2, 3, 4, 4, 4, 5, 5, 5), ncol = 3, byrow = TRUE)
lo <- layout(lmat, heights = c(.7, .1, .6), widths = c(1, 1, 1))
# 2) Two network plots
# Get layout of mean graph
Q <- qgraph(t(mean_wadj), DoNotPlot = TRUE)
saveRDS(Q$layout, "C:/Users/Brent/Documents/Psychology/KUL/thesisstatisticsdata/layout_mgm.RDS")
# Plot graph at selected fixed time points
tpSelect <- c(2, 50, 100)
# Switch to a colorblind-friendly scheme
tvvar_obj$edgecolor[, , , ][tvvar_obj$edgecolor[, , , ] == "darkgreen"] <- c("darkblue")
lty_array <- array(1, dim = c(9, 9, 1, 100)) # change 9 to the number of variables
lty_array[tvvar_obj$edgecolor[, , , ] != "darkblue"] <- 2
for (tp in tpSelect) {
  qgraph(t(tvvar_obj$wadj[, , 1, tp]),
         layout = Q$layout,
         edge.color = t(tvvar_obj$edgecolor[, , 1, tp]),
         labels = mood_labels,
         vsize = 13, esize = 10, asize = 10,
         mar = rep(5, 4), minimum = 0, maximum = .5,
         lty = t(lty_array[, , 1, tp]),
         pie = pred_obj$tverrors[[tp]][, 3])
}
# 4) Timeline
f_timeline_new(length = .1, mar=c(0, 4, 0, 1), ylim = c(-1.2, .2), ytext = -.9, cex = 1)
# 5) Line plots + CIs
plot.new()
par(mar = c(4, 4, 0, 1))
plot.window(xlim = c(1, 100), ylim = c(-.25, 1))
axis(1, c(1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100), labels = TRUE)
axis(2, c(-0.25, 0, .25, 0.5, 0.75, 1), las = 2)
title(xlab = "Estimation points", cex.lab = 1.1)
title(ylab = "Parameter estimate", cex.lab = 1.1)
abline(h = 0, col = "grey", lty = 2)
head(parm_sds_mat_ord) # pick three highest
# Variable order: count_topic1, count_topic12, count_topic31, count_topic41,
#                 avg_d_topic1, avg_d_topic12, avg_d_topic31, avg_d_topic41, Count
m_par_display <- matrix(c(1, 5, 2, 6, 4, 8), ncol = 2, byrow = T)
# Select colors
library(RColorBrewer) # for brewer.pal
library(scales)       # for alpha (used in the shading below)
cols <- brewer.pal(12, "Spectral")[c(5, 1, 3)] # avoid red/green because they are used for edges in the upper panel
# set 1 ### set 2
for (i in 1:nrow(m_par_display)) {
  par_row <- m_par_display[i, ]
  # Plot point estimates
  P1_pointest <- par_ests[par_row[1], par_row[2], 1, ]
  lines(1:100, P1_pointest, col = cols[i], lwd = 2, lty = i)
  # Plot uncertainty estimates [new shading]
  # Compute CIs
  CIs <- apply(res_obj$bootParameters[par_row[1], par_row[2], 1, , ], 1,
               function(x) quantile(x, probs = c(.05, .95)))
  # Plot shading
  polygon(x = c(1:100, 100:1), y = c(CIs[1, ], rev(CIs[2, ])),
          col = alpha(colour = cols[i], alpha = .3), border = FALSE)
} # end for: i
# Variable order: count_topic1, count_topic12, count_topic31, count_topic41,
#                 avg_d_topic1, avg_d_topic12, avg_d_topic31, avg_d_topic41, Count
legend_labels <- c(expression("count_topic1"["t-1"] %->% "avg_d_topic1"["t"]),
                   expression("count_topic12"["t-1"] %->% "avg_d_topic12"["t"]),
                   expression("count_topic41"["t-1"] %->% "avg_d_topic41"["t"]))
legend("bottomright", legend_labels, col = cols, lwd = 2, bty = "n", cex = 1, horiz = TRUE, lty = 1:3)
dev.off()