######################################################################################################
# 0. Select dataset
######################################################################################################
library(reticulate)
library(here)
library(dplyr)
library(foreach)
library(doParallel)
library(tidytext)
library(dbscan)
library(ggplot2)
library(gridExtra)
library(cuRe)
library(stm)
library(mgm)
save.image()
######################################################################################################
# 1. Natural language processing (emotions)
######################################################################################################
# Note: Python must be installed somewhere on your computer for this part.
library(reticulate)
library(here)
library(dplyr)
reticulate::conda_remove(envname = "r-reticulate") #if you installed packages incorrectly, delete your whole environment and start over
reticulate::py_list_packages(envname = "r-reticulate") #check which packages you have
reticulate::py_config() # initializes (and, if needed, creates) your r-reticulate environment
reticulate::py_available() # should return TRUE
# Import tensorflow first, then transformers
reticulate::py_install("transformers", pip = TRUE)
reticulate::py_install("torch", pip = TRUE)
reticulate::py_install("tensorflow==2.10", pip = TRUE, user = TRUE) # has to be version 2.10
transformers <- reticulate::import("transformers")
tensorflow <- reticulate::import("tensorflow")
Torch <- reticulate::import("torch")
tweetnlp <- reticulate::import("tweetnlp")
# If the installs went wrong, remove the packages (including pytorch) from the conda environment:
reticulate::conda_remove('r-reticulate', packages = c('tensorflow'))
reticulate::conda_remove('r-reticulate', packages = c('transformers'))
reticulate::conda_remove(envname = "r-reticulate", packages = "pytorch")
reticulate::conda_remove(envname = "r-reticulate", packages = "tweetnlp")
reticulate::conda_install(envname = "r-reticulate", packages = "tensorflow==2.10", user = TRUE)
reticulate::conda_install(envname = "r-reticulate", packages = "transformers", user = TRUE)
reticulate::conda_install(envname = "r-reticulate", packages = "torch", user = TRUE)
reticulate::conda_install(envname = "r-reticulate", packages = "tweetnlp", user = TRUE)
reticulate::py_install("tweetnlp", pip = TRUE, user = TRUE)
reticulate::py_install("charset-normalizer==3.1.0", pip = TRUE, user = TRUE)
tweetnlp <- reticulate::import("tweetnlp")
# Load the tokenizer and model
pipe = transformers$pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest", return_all_scores=TRUE)
classifier <- tweetnlp$load_model("topic_classification", model_name = "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest")
# You can ignore the following warning: "UserWarning: `return_all_scores` is now deprecated, use `top_k=1` if you want similar functionality"
model <- tweetnlp$load_model("topic_classification", model_name = "cardiffnlp/twitter-roberta-base-emotion-multilabel-latest")
pipe("I bet everything will work out in the end :)") # it works
data <- read.csv("C:/Users/Brent/Documents/GitHub/bdss-project/Assignment3/example.csv") # this dataset is simplified to only the text and creation time
data <- read.csv("C:/Users/Brent/Documents/Psychology/KUL/Psychologie/June/resultsmarch2junecorona.csv")
data <- data[c("Tweet", "created_at", "text")]
df2 <- data
remove(df_clustered)
# Test code for the first row:
# get the labels (emotions) from the classifier's output list
# for the first row of the dataframe
result <- pipe(df2[1, "Tweet"])
labels <- unlist(lapply(result[[1]], function(x) x[[1]]))
score <- lapply(result[[1]], function(x) x[[2]])
df2[, labels] <- NA
df2[1, labels] <- score # df2 should now have scores for the first row
# Now see if we can iterate over rows.
# Subset so you don't have to wait more than 5 minutes to see if your trials worked:
df3 <- df2[49900:50000,]
remove(remaining_data)
df3 <- df2
# Run over all rows
library(foreach)
library(doParallel)
install.packages("tidytext")
library(tidytext)
library(dplyr)
cl <- makeCluster(detectCores() - 1, type = "SOCK")
registerDoParallel(cl)
system.time({
  df3 <- foreach(i = 1:nrow(df3), .packages = "tidytext", .inorder = FALSE) %do% {
    # note: %do% runs sequentially; the Python 'pipe' object cannot be exported to parallel workers
    tweet <- df3$Tweet[i]
    result <- pipe(tweet)
    labels <- unlist(lapply(result[[1]], function(x) x[[1]]))
    score <- unlist(lapply(result[[1]], function(x) x[[2]]))
    df3[i, labels] <- score
    df3[i, ]
  }
  df3 <- do.call(rbind, df3)
})
write.csv(df3, "df3_june.csv", row.names = FALSE)
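# Scoring one tweet per call is slow. Hugging Face pipelines also accept a list
# of texts, so scoring in chunks may be considerably faster. An untested sketch
# (the chunk size is arbitrary):
chunk_size <- 64
for (start in seq(1, nrow(df3), by = chunk_size)) {
  idx <- start:min(start + chunk_size - 1, nrow(df3))
  results <- pipe(as.list(df3$Tweet[idx])) # one result list per tweet
  for (k in seq_along(idx)) {
    labels <- unlist(lapply(results[[k]], function(x) x[[1]]))
    scores <- unlist(lapply(results[[k]], function(x) x[[2]]))
    df3[idx[k], labels] <- scores
  }
}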
######################################################################################################
# 2. Cluster analysis
######################################################################################################
install.packages("dbscan")
library(dbscan)
library(ggplot2)
# Assuming your dataset is stored in 'df1'
# Convert your dataset to a matrix (if it's not already)
df1<- data[c("surprise", "trust", "sadness", "joy","love","fear","anger","pessimism","anticipation","disgust","optimism")]
df1 <- scale(df1)
df1 <- as.data.frame(df1)
df1 <- df1[1:10000,]
df1 <- as.matrix(df1)
# Set the parameters for DBSCAN
eps <- 0.15 # Epsilon: maximum distance between points to be considered neighbors
minPts <- 25 # Minimum number of points in a cluster
# Notes from parameter trials (eps and minPts):
# 0.1 and 10: cluster 1 = anger and disgust (36000); cluster 2 = sadness and pessimism (1900); cluster 9 = positivity (5000)
# 0.15 and 15: cluster 1 = anger; cluster 2 = sadness and pessimism; cluster 3 = positivity
# 0.2 and 30 is fine
# 0.2 and 40 is also OK, but the previous is preferred -> c1: 32000 positive; c2: 73500 anger and disgust; c3: 3400 fear and pessimism; c4: 7700 with all emotions low
# 0.15 and 25 is nice -> c1: anger and disgust (50000); c2: sadness and pessimism (4000); c5: joy, trust, love, optimism (5000); c6: joy, optimism, trust, love (8300); c6 is more extreme (higher) than c5 on joy, love, and trust
# 0.4 and 0.3 is clearly not a good solution
# 0.3 and 0.25 is clearly not a good solution
# 0.2 and 25 is okay, but only c1 (42000) and c2 (89000)
# 0.2 and 20 is also very nice, but just 2 large clusters: c1 (61300) and c2 (95000)
# 0.15 and 20 gives c1 (58000), c2 (2300), c3 (18800)
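# The trials above were run by hand; a small sketch that automates the same grid
# search (the grids below are illustrative, not the exact values tried above):
grid <- expand.grid(eps = c(0.1, 0.15, 0.2), minPts = c(10, 15, 20, 25))
for (g in seq_len(nrow(grid))) {
  res <- dbscan::dbscan(df1, eps = grid$eps[g], minPts = grid$minPts[g])
  cat("eps =", grid$eps[g], "| minPts =", grid$minPts[g],
      "| clusters =", length(unique(res$cluster)) - 1, "\n")
  print(table(res$cluster)) # cluster 0 = noise
}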
# Run DBSCAN
dbscan_result <- dbscan(df1, eps = eps, minPts = minPts)
# Print the clusters found
print(dbscan_result$cluster)
# Print the noise points (not assigned to any cluster)
print(dbscan_result$cluster[dbscan_result$cluster == 0])
# Print the core points (points that have at least 'minPts' neighbors)
print(dbscan_result$cluster[dbscan_result$cluster != 0])
# Print the total number of clusters found
num_clusters <- length(unique(dbscan_result$cluster)) - 1 # Exclude noise points
print(paste("Number of clusters found:", num_clusters))
# Create a dataframe combining the original data and the cluster labels
df_clustered <- data.frame(df1, Cluster = dbscan_result$cluster)
# Filter out the noise points (cluster 0)
df_clustered <- df_clustered %>% filter(Cluster != 0)
# Visualize the cluster frequencies using a histogram
print(ggplot(df_clustered, aes(x = Cluster)) +
        geom_histogram(fill = "steelblue", color = "black") +
        labs(x = "Cluster", y = "Frequency") +
        ggtitle("Histogram of Cluster Frequencies"))
table(df_clustered$Cluster)
# Add a column with the cluster labels to the original dataset
data$Cluster <- dbscan_result$cluster
# Print 10 random tweets from cluster 1
cluster1_tweets <- data[data$Cluster == 1, "text"]
print(sample(cluster1_tweets, 10))
# Print 10 random tweets from cluster 2
cluster2_tweets <- data[data$Cluster == 2, "text"]
print(sample(cluster2_tweets, 10))
# Print 10 random tweets from cluster 5
cluster5_tweets <- data[data$Cluster == 5, "text"]
print(sample(cluster5_tweets, 10))
# Print 10 random tweets from cluster 6
cluster6_tweets <- data[data$Cluster == 6, "text"]
print(sample(cluster6_tweets, 10))
# Visualize
# Install and load necessary packages
install.packages("ggplot2")
library(ggplot2)
install.packages("gridExtra")
library(gridExtra)
# Create a dataframe combining the original data and the cluster labels
df_clustered <- data.frame(df1, Cluster = dbscan_result$cluster)
# Create density plots per cluster for the 11 variables
clusters_to_plot <- unique(df_clustered$Cluster)
# Best 6 clusters
library(gridExtra)
# Create a dataframe combining the original data and the cluster labels
df_clustered <- data.frame(df1, Cluster = dbscan_result$cluster)
# Create density plots per cluster for the 11 variables
clusters_to_plot <- unique(df_clustered$Cluster)[1:6] # Only look at the first 6 clusters
for (i in clusters_to_plot) {
  cluster_data <- df_clustered[df_clustered$Cluster == i, ]
  cluster_data <- cluster_data[, !colnames(cluster_data) %in% c("Cluster")]
  # Create a density plot for each variable in the cluster
  density_plots <- lapply(colnames(cluster_data), function(variable) {
    ggplot(cluster_data, aes(x = .data[[variable]], fill = variable)) +
      geom_density(color = "black") +
      labs(x = variable, y = "Density", fill = variable) +
      ggtitle(paste("Cluster", i, "-", variable))
  })
  # Combine density plots for all variables into one figure
  combined_plot <- grid.arrange(grobs = density_plots, ncol = 4)
  # Print the combined plot for the cluster
  print(combined_plot)
}
# Fancier plots:
color_palette <- c("surprise" = "blue", "trust" = "green", "sadness" = "red",
"joy" = "orange", "love" = "purple", "fear" = "yellow",
"anger" = "pink", "pessimism" = "cyan", "anticipation" = "magenta",
"disgust" = "brown", "optimism" = "gray")
# Define a function to calculate the color intensity based on scale values
calculate_color_intensity <- function(values) {
normalized_values <- (values - min(values)) / (max(values) - min(values))
intensity <- 0.5 + normalized_values * 0.5 # Adjust the scale of intensity as needed
return(intensity)
}
# Loop over the clusters and variables to create density plots
for (i in clusters_to_plot) {
cluster_data <- df_clustered[df_clustered$Cluster == i, ]
cluster_data <- cluster_data[, !colnames(cluster_data) %in% c("Cluster")]
  # Create a density plot for each variable in the cluster
density_plots <- lapply(colnames(cluster_data), function(variable) {
intensity <- calculate_color_intensity(cluster_data[[variable]])
color <- color_palette[variable]
cluster_data$fill_variable <- as.factor(intensity)
ggplot(cluster_data, aes(x = .data[[variable]], fill = fill_variable)) +
geom_density(color = "black") +
scale_fill_manual(values = color, guide = "none") +
labs(x = variable, y = "Density") +
ggtitle(paste("Cluster", i, "-", variable))
})
  # Combine density plots for all variables into one figure
combined_plot <- grid.arrange(grobs = density_plots, ncol = 4)
  # Print the combined plot for the cluster
print(combined_plot)
}
######################################################################################################
# 2.1 Another clustering method
######################################################################################################
# Hierarchical clustering
# Note: this code does not work -- the cuRe package on CRAN fits parametric cure
# (survival) models; it does not implement the CURE clustering algorithm.
cure(df1)
# Perform CURE clustering
cure_result <- cure(df1)
# Get the cluster assignments
cluster_assignments <- cure_result$cluster
# Print the number of clusters found
num_clusters <- length(unique(cluster_assignments))
print(paste("Number of clusters:", num_clusters))
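# Since the CURE call above does not run, a working hierarchical alternative with
# base R, as a sketch (the subsample size, Ward linkage, and k = 6 are arbitrary
# choices, not taken from the analysis above):
sub <- df1[sample(nrow(df1), 2000), ]
hc <- hclust(dist(sub), method = "ward.D2")
cluster_assignments <- cutree(hc, k = 6)
table(cluster_assignments)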
# Not hierarchical, but another clustering method
library(mclust)
# Perform model-based clustering
mclust_result <- Mclust(df1)
# Get the cluster assignments
cluster_labels <- mclust_result$classification
# Print the cluster labels
print(cluster_labels)
save(mclust_result, file = "mclust_result.RData")
summary(cluster_labels)
# Access cluster-specific information
cluster_means <- mclust_result$parameters$mean
cluster_covs <- mclust_result$parameters$variance$sigma # covariances live under $variance in mclust
# Visualize variable distributions for each cluster
for (i in unique(cluster_labels)) {
  cluster_data <- as.data.frame(df1[cluster_labels == i, ]) # df1 is a matrix at this point
  # Create density plots for each variable in the cluster
  density_plots <- lapply(colnames(cluster_data), function(variable) {
    ggplot(cluster_data, aes(x = .data[[variable]])) +
      geom_density(color = "black", fill = "steelblue") +
      labs(x = variable, y = "Density") +
      ggtitle(paste("Cluster", i, "-", variable))
  })
  # Combine density plots for all variables into one figure
  combined_plot <- gridExtra::grid.arrange(grobs = density_plots, ncol = 3)
  # Print the combined plot for the cluster
  print(combined_plot)
}
# Assess the goodness of the clustering solution
BIC <- mclust_result$bic # BIC of the selected model ($BIC holds the full table)
ICL <- mclust_result$icl
# Print BIC and ICL values
print(paste("BIC:", BIC))
print(paste("ICL:", ICL))
######################################################################################################
# 3. Topic modeling
######################################################################################################
# Topic modeling ----------------------------------------------------------
library(stm)
data <- sampleddata10
remove(sampleddata10)
# Remove hashtags and links
remove1 <- function(text) {
  # Remove links
  text <- gsub("http\\S+", "", text)
  text <- gsub("www\\.\\S+", "", text)
  # Remove mentions
  text <- gsub("@[A-Za-z0-9_]+", "", text)
  # Remove hashtags
  text <- gsub("#[A-Za-z0-9_]+", "", text)
  # Remove other patterns if needed
  text <- gsub("like", "", text)
  text <- gsub("good", "", text)
  return(text)
}
data$Tweet <- sapply(data$Tweet, remove1)
###################################################
# Week intervals
###################################################
# Now we need intervals per week for stm
library(lubridate)
# Convert 'created_at' column to POSIXct format
data$created_at1 <- ymd_hms(data$created_at, tz = "UTC")
# Define your starting date
starting_date <- ymd_hms("2020-05-09 00:00:00", tz = "UTC")
# Calculate the number of weeks from the starting date
data$weeks <- floor(as.numeric(difftime(data$created_at1, starting_date, units = "secs")) / (7 * 24 * 60 * 60)) + 1
# Calculate the number of 3-day intervals from the starting date
# (note: the variable is named 'fourdayint', but the interval length is 3 days)
data$fourdayint <- floor(as.numeric(difftime(data$created_at1, starting_date, units = "secs")) / (3 * 24 * 60 * 60)) + 1
# Calculate the number of 1-day intervals from the starting date
data$onedayint <- floor(as.numeric(difftime(data$created_at1, starting_date, units = "secs")) / (1 * 24 * 60 * 60)) + 1
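# A quick sanity check of the interval arithmetic (illustrative date): a tweet
# from exactly 7 days after the start should fall in week 2.
floor(as.numeric(difftime(ymd_hms("2020-05-16 00:00:00", tz = "UTC"),
                          starting_date, units = "secs")) / (7 * 24 * 60 * 60)) + 1 # = 2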
# New method
# Define your starting date
starting_date <- ymd_hms("2020-05-09 00:00:00", tz = "UTC")
# Create intervals of 3 days
interval_length <- 3 * 24 * 60 * 60 # 3 days in seconds
data$fourdayint <- cut(data$created_at1,
                       breaks = seq(starting_date, max(data$created_at1) + interval_length, by = interval_length),
                       labels = FALSE)
####################################################
#######################################################################################
# STM
#######################################################################################
install.packages(c("textclean", "tm", "SnowballC"))
load("C:/Users/Erik/Documents/Brent/USB-station/stats thesis/batch12notdoneyet.RData")
load("C:/Users/Brent/Documents/Psychology/KUL/thesisstatisticsdata/BLM(original-08_07).RData")
# Load the required libraries
library(textclean)
library(stm)
library(foreach)
library(doParallel)
library(tm)
library(SnowballC)
# Set the number of cores to use for parallel processing
num_cores <- 10
# Register parallel backend with the desired number of cores
registerDoParallel(cores = num_cores)
# Preprocess the text data
cleaned_data <- tolower(data$Tweet) # Convert text to lowercase
cleaned_data <- replace_contraction(cleaned_data) # Replace contractions (e.g., I'm -> I am)
cleaned_data <- replace_number(cleaned_data) # Replace numbers
cleaned_data <- replace_symbol(cleaned_data) # Replace symbols
cleaned_data <- strip_whitespace(cleaned_data) # Remove leading/trailing white spaces
# Create a data frame with the 'fourdayint' column as metadata
metadata <- data.frame(days = data$fourdayint, index = 1:nrow(data))
processed <- textProcessor(documents = cleaned_data, metadata = metadata, lowercase = TRUE, removestopwords = TRUE, language = "en", striphtml = T, wordLengths = c(3, Inf))
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)
docs <- out$documents
vocab <- out$vocab
meta <- out$meta
out <- prepDocuments(processed$documents, processed$vocab, processed$meta, lower.thresh = 100) # removes infrequent terms depending on the user-set parameter lower.thresh
system.time({poliblogPrevFit <- stm(documents = out$documents, vocab = out$vocab, K = 15, prevalence = ~days, max.em.its = 75, data = out$meta, init.type = "Spectral")})
install.packages(c("Rtsne", "rsvd", "geometry"))
library(rsvd)
library(geometry)
library(Rtsne)
system.time({storage <- searchK(out$documents, out$vocab, K = 0, prevalence =~ days, data = out$meta)}) ## it took 21 hours -> the result is 58 topics
system.time({Fit <- stm(documents = out$documents, vocab = out$vocab, K = 57, prevalence =~ days, max.em.its = 10, data = out$meta, init.type = "Spectral")})
save.image(file="BLM.RData")
install.packages("wordcloud") library(wordcloud)
plot(Fit, type = "summary", xlim = c(0, 0.2)) plot(Fit, type = "summary", xlim = c(0, 0.3),xlim = c(0, 0.3) ) par(mar = c(5.1,4.1,5.1,2.1)) plot(Fit, type = "summary", xlim = c(0, 0.15))
############## top 20 topics ######################
par(mfrow = c(1, 1))
plot(Fit, type = "summary", xlim = c(0, 0.3))
############## top 24 topics ######################
par(mfrow = c(3, 2), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0))
cloud(Fit, topic = 1, scale = c(4, 1))
mtext("Topic 12", side = 4, line = 0)
cloud(Fit, topic = 12, scale = c(4, 1))
mtext("Topic 1", side = 2, line = 2)
cloud(Fit, topic = 36, scale = c(4, 1))
mtext("Topic 41", side = 4, line = 0)
cloud(Fit, topic = 41, scale = c(4, 1))
mtext("Topic 36", side = 2, line = 2)
cloud(Fit, topic = 31, scale = c(2, 1)) # positivity about BLM (proud, love, encourage, hope, support, continue)
mtext("Topic 16", side = 4, line = 0)
cloud(Fit, topic = 16, scale = c(4, 1))
mtext("Topic 31", side = 2, line = 2)
cloud(Fit, topic = 39, scale = c(4, 1))
mtext("Topic 10", side = 4, line = 0)
cloud(Fit, topic = 10, scale = c(4, 1))
mtext("Topic 39", side = 2, line = 2)
cloud(Fit, topic = 51, scale = c(4, 1))
mtext("Topic 40", side = 4, line = 0)
cloud(Fit, topic = 40, scale = c(4, 1))
mtext("Topic 51", side = 2, line = 2)
cloud(Fit, topic = 27, scale = c(4, 1))
mtext("Topic 3", side = 4, line = 0)
cloud(Fit, topic = 3, scale = c(4, 1))
mtext("Topic 27", side = 2, line = 2)
cloud(Fit, topic = 21, scale = c(4, 1))
mtext("Topic 26", side = 4, line = 0)
cloud(Fit, topic = 26, scale = c(4, 1))
mtext("Topic 21", side = 2, line = 2)
cloud(Fit, topic = 56, scale = c(4, 1))
mtext("Topic 20", side = 4, line = 0)
cloud(Fit, topic = 20, scale = c(4, 1))
mtext("Topic 56", side = 2, line = 2)
cloud(Fit, topic = 14, scale = c(4, 1))
mtext("Topic 55", side = 4, line = 0)
cloud(Fit, topic = 55, scale = c(4, 1))
mtext("Topic 14", side = 2, line = 2)
# first topic and 20th topic
cloud(Fit, topic = 30, scale = c(4, 1)) # [stay home] -> government's campaign (stay, home, safe, people, away, everyone, work, protect)
mtext("Topic 32", side = 4, line = 0)
cloud(Fit, topic = 32, scale = c(4, 1))
mtext("Topic 30", side = 2, line = 2)
cloud(Fit, topic = 29, scale = c(4, 1))
mtext("Topic 44", side = 4, line = 0)
cloud(Fit, topic = 44, scale = c(4, 1))
mtext("Topic 29", side = 2, line = 2)
cloud(Fit, topic = 23, scale = c(4, 1))
mtext("Topic 43", side = 4, line = 0)
cloud(Fit, topic = 43, scale = c(4, 1))
mtext("Topic 23", side = 2, line = 2)
par(mfrow=c(1,1))
############################################### 11-08-2023
# Topics with most frequent words
# Examine top words for each topic
library(stm)
library(ggplot2)
# Assuming 'Fit' is your STM model
topic_number <- 12
n_top_words <- 10
# Get log probabilities for the specific topic (logbeta is a K x V matrix, so index the row)
log_word_probs <- Fit$beta$logbeta[[1]][topic_number, ]
# Convert log probabilities to actual probabilities
word_probs <- exp(log_word_probs)
# Get the top N words
top_words_indices <- order(word_probs, decreasing = TRUE)[1:n_top_words]
top_words <- Fit$vocab[top_words_indices]
top_probs <- word_probs[top_words_indices]
# Create a data frame and plot
top_words_for_topic <- data.frame(word = top_words, probability = top_probs)
ggplot(data = top_words_for_topic, aes(x = reorder(word, probability), y = probability)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle(paste("Top Words for Topic", topic_number)) +
  xlab("Words") +
  ylab("Probability")
# Install and load required packages
# (Caution: this coherence block is unverified; tmtoolkit is primarily a Python
# library, so 'calculate_coherence' may not be available in R.)
install.packages("tmtoolkit")
library(tmtoolkit)
# Get the topic-word probabilities from your 'Fit' model
topic_word_probs <- exp(Fit$beta$logbeta[[1]])
# Calculate topic coherence
coherence_scores <- calculate_coherence(topic_word_probs, method = "c_v", documents = out$documents)
# Plot topic coherence
plot(1:Fit$K, coherence_scores, type = "b", xlab = "Number of Topics", ylab = "Coherence Score", main = "Topic Coherence")
exclusivity_scores <- stm::exclusivity(Fit)
# Plot topic exclusivity
plot(exclusivity_scores, main = "Topic Exclusivity")
semco <- semanticCoherence(Fit, out$documents, M = 20)
semco5 <- semanticCoherence(Fit, out$documents, M = 5)
# Create a bar plot of semantic coherence scores
barplot(semco, xlab = "Topic Number", ylab = "Semantic Coherence", main = "Semantic Coherence for Topics")
barplot(semco5, names.arg = 1:57, xlab = "Topic Number", ylab = "Semantic Coherence", main = "Semantic Coherence for Topics", cex.names = 0.6)
topic_indices <- c(1:57)
coherence_values <- semco5[topic_indices]
# Print the extracted coherence values
print(coherence_values)
mean(coherence_values)
###########################################
# Now apply the model to the whole dataset in batches
batch_1 <- batch_11
remove(batch_11)
# Preprocess the text data
cleaned_data <- tolower(batch_1$Tweet)# Convert text to lowercase
# Define your starting date
batch_1$created_at1 <- ymd_hms(batch_1$created_at, tz = "UTC")
starting_date <- ymd_hms("2020-05-09 00:00:00", tz = "UTC")
# Create intervals of 3 days
interval_length <- 3 * 24 * 60 * 60 # 3 days in seconds
batch_1$fourdayint <- cut(batch_1$created_at1,
                          breaks = seq(starting_date, max(batch_1$created_at1) + interval_length, by = interval_length),
                          labels = FALSE)
# Create a data frame with the 'fourdayint' column as metadata
metadata2 <- data.frame(days = batch_1$fourdayint, index = 1:nrow(batch_1))
processed2 <- textProcessor(documents = cleaned_data, metadata = metadata2, lowercase = TRUE, removestopwords = TRUE, language = "en", striphtml = T)
out2 <- prepDocuments(processed2$documents, processed2$vocab, processed2$meta)
docs2 <- out2$documents
vocab2 <- out2$vocab
meta2 <- out2$meta
processed2 <- processed
out2 <- out
docs2 <- out2$documents
vocab2 <- out2$vocab
meta2 <- out2$meta
system.time({Fit <- stm(documents = out$documents, vocab = out$vocab, K = 57, prevalence =~ days, max.em.its = 10, data = out$meta, init.type = "Spectral")})
new <- prepDocuments(out2$documents, out2$vocab)
old.vocab <- out$vocab
aligned <- alignCorpus(new, old.vocab)
docs2 <- aligned$documents
# New option
aligned2 <- alignCorpus(out2, old.vocab)
# Apply the fitNewDocuments function
system.time({
  fit_new <- fitNewDocuments(model = Fit,
                             documents = aligned2$documents,
                             newData = aligned2$meta,
                             origData = meta, # metadata from the original fit
                             prevalence = ~ days,
                             betaIndex = NULL,
                             prevalencePrior = "Covariate",
                             contentPrior = "Average",
                             returnPosterior = FALSE,
                             returnPriors = FALSE,
                             designMatrix = NULL,
                             test = TRUE,
                             verbose = TRUE)
})
save.image(file="batch12.RData")
###################################################################### save the data 22/07
load("C:/Users/Erik/Documents/Brent/USB-station/stats thesis/BLM2.RData")
# Get the topic proportions
topic_proportions <- Fit$theta
# Convert the topic proportions to a data frame
topic_proportions_df <- as.data.frame(topic_proportions)
# Rename the columns to reflect the topic numbers
colnames(topic_proportions_df) <- paste0("topic", 1:ncol(topic_proportions_df))
# Get the names of the elements in the 'documents' list of 'aligned'
names_in_aligned <- names(docs)
lastname <- tail(names_in_aligned, n = 5)
print(lastname)
# Convert the names of 'aligned' to numeric
row_indices <- as.numeric(names_in_aligned)
# Add a new 'id' column to topic_proportions_df that contains these indices
topic_proportions_df$id2 <- row_indices
# Convert the 'id' column to numeric
batch_1$id2 <- as.numeric(row.names(batch_1))
# Merge batch_1 and topic_proportions_df based on the 'id' column
data_with_topics <- merge(batch_1, topic_proportions_df, by = "id2", all.x = TRUE)
# Save the data
write.csv(data_with_topics, "batch2p.csv")
# Check if it worked (column names follow the paste0("topic", ...) scheme above)
library(dplyr)
dta <- data_with_topics %>% select('text', 'topic1', 'topic12', 'topic36', 'topic41', 'topic31', 'topic16')
#########################################
# COVID
#########################################
par(mfrow = c(3, 2), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0))
cloud(Fit, topic = 8, scale = c(4, 1))
mtext("Topic 30", side = 4, line = 0)
cloud(Fit, topic = 30, scale = c(4, 1)) # [lockdown, quarantine]
mtext("Topic 8", side = 2, line = 2)
cloud(Fit, topic = 37, scale = c(4, 1)) # (shopping for) food, drinks
mtext("Topic 18", side = 4, line = 0)
cloud(Fit, topic = 18, scale = c(4, 1)) # [corona, deaths, chinese]
mtext("Topic 37", side = 2, line = 2)
cloud(Fit, topic = 51, scale = c(2, 1))
mtext("Topic 34", side = 4, line = 0)
cloud(Fit, topic = 34, scale = c(4, 1)) # topic related to [love]
mtext("Topic 51", side = 2, line = 2)
cloud(Fit, topic = 27, scale = c(4, 1)) # [leisure time] (multimedia such as: youtube, music, gym, album, radio, internet, netflix, disney)
mtext("Topic 20", side = 4, line = 0)
cloud(Fit, topic = 20, scale = c(4, 1)) # [coronavirus]
mtext("Topic 27", side = 2, line = 2)
cloud(Fit, topic = 43, scale = c(4, 1)) # [helping behavior] -> share, support, help, community, charity, donate, respect, vulnerable
mtext("Topic 33", side = 4, line = 0)
cloud(Fit, topic = 33, scale = c(4, 1)) # [pandemic]
mtext("Topic 43", side = 2, line = 2)
cloud(Fit, topic = 13, scale = c(4, 1)) # [uplifting and distracting]
mtext("Topic 16", side = 4, line = 0)
cloud(Fit, topic = 16, scale = c(4, 1)) # [Trump and the virus]
mtext("Topic 13", side = 2, line = 2)
cloud(Fit, topic = 49, scale = c(4, 1)) # [people not following COVID guidelines]
mtext("Topic 46", side = 4, line = 0)
cloud(Fit, topic = 46, scale = c(4, 1)) # [cognitive thinking] (think, believe, reason, thought, opinion, mind, seem)
mtext("Topic 49", side = 2, line = 2)
cloud(Fit, topic = 21, scale = c(4, 1)) # [cases] (case, new, number, report, confirm)
mtext("Topic 38", side = 4, line = 0)
cloud(Fit, topic = 38, scale = c(4, 1)) # [regulations] (rule, break, police, law, travel, ignore, fine, government, arrest, people, drive)
mtext("Topic 21", side = 2, line = 2)
cloud(Fit, topic = 9, scale = c(4, 1)) # [deaths] (death, coronavirus, die, rate, percent, increase, figure, thousand, suffer, ...)
mtext("Topic 28", side = 4, line = 0)
cloud(Fit, topic = 28, scale = c(4, 1))
mtext("Topic 9", side = 2, line = 2)
# first topic and 20th topic
par(mfrow = c(1, 2), oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0))
cloud(Fit, topic = 29, scale = c(4, 1)) # [stay home] -> government's campaign (stay, home, safe, people, away, everyone, work, protect)
mtext("Topic 35", side = 4, line = 0)
cloud(Fit, topic = 35, scale = c(4, 1))
mtext("Topic 29", side = 2, line = 2)
par(mfrow = c(1, 1))
# More detailed (multiple metrics)
labelTopics(Fit, c(29, 8, 30,37,18,51,34,27,20,43,33,13,16,49,46,21,38,9,28,35))
topic_probabilities <- findThoughts(Fit)
thoughts6 <- findThoughts(Fit, texts = shortdoc, n = 2, topics = 6)$docs[[1]]
plotQuote(thoughts6, width = 30, main = "Topic 6")
head(shortdoc)
Topic5 <- findThoughts(Fit, out$text, topics = 5, n = 5)
Topic5
cl_data <- as.data.frame(cleaned_data)
thoughts <- findThoughts(Fit, data$Tweet, n = 1, topics = 1)
thoughts
###################### back to dataframe
doc_topics <- labelTopics(Fit, out$documents)
data_with_topic_probabilities <- cbind(cleaned_data, doc_topics)
# Label topics for each document
topic_labels <- labelTopics(processed)
# Merge the topic labels with the original 'cleaned_data' dataset
labeled_data <- cbind(cleaned_data, topic_labels)
################ covariate ##############
out$meta$rating <- as.factor(out$meta$rating)
prep <- estimateEffect(1:20 ~ dayfour, Fit, metadata = out$meta, uncertainty = "Global")
plot(prep, "day", method = "continuous", topics = 13, model = z, printlegend = FALSE, xaxt = "n", xlab = "Time (2008)")
# Create a document-term matrix
corpus <- textProcessor(documents = cleaned_data, metadata = data, lowercase = TRUE, removepunctuation = TRUE, removenumbers = TRUE, removestopwords = TRUE, stem = TRUE, wordLengths = c(3, Inf))
# Convert the preprocessed data into a document-feature matrix
doc_term_matrix <- stm::convert(corpus, to = "stm")
# Find the optimal number of topics using parallel processing
search_results <- foreach(k = seq(2, 10, 1), .combine = cbind) %dopar% {
  model <- stm(doc_term_matrix, K = k, verbose = FALSE)
  data.frame(k, searchK(model))
}
# Select the best number of topics based on the chosen metric
best_topics <- apply(search_results[2:4, ], 2, which.max)
# Perform topic modeling with the best number of topics
model <- stm(doc_term_matrix, K = best_topics)
# Print the top terms for each topic
labelTopics(model)
# Preprocessing
# Select the clusters and perform topic modelling
plotRemoved(processed$documents, lower.thresh = seq(1, 200, by = 100))
out <- prepDocuments(processed$documents, processed$vocab, processed$meta, lower.thresh = 15)
saveRDS(out, file = "STMAnger.rds")
saveRDS(processed, file = "STM2Anger.rds")
poliblogPrevFit <- stm(documents = out$documents, vocab = out$vocab, K = 20, prevalence = ~series, max.em.its = 75, data = out$meta, init.type = "Spectral")
poliblogPrevFit10topics <- stm(documents = out$documents, vocab = out$vocab, K = 10, prevalence = ~series, max.em.its = 40, data = out$meta, init.type = "Spectral")
poliblogSelect <- selectModel(out$documents, out$vocab, K = 20, prevalence = ~emo_anger, max.em.its = 75, data = out$meta, runs = 20, seed = 8458159)
labelTopics(poliblogPrevFit, c(6, 13, 18))
plot(poliblogPrevFit, type = "summary", xlim = c(0, 0.3))
cloud(poliblogPrevFit, topic = 12, scale = c(4, 1)) # politics
cloud(poliblogPrevFit, topic = 19, scale = c(5, 1))
cloud(poliblogPrevFit, topic = 9, scale = c(5, 1)) # things going on in the world & pandemic
cloud(poliblogPrevFit, topic = 1, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 8, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 1, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 16, scale = c(5, 1)) # coronavirus, nhs, selfish, food, irrespons, distance, social, isol, staff, ignor, home
cloud(poliblogPrevFit, topic = 11, scale = c(5, 1)) # random verbs
cloud(poliblogPrevFit, topic = 3, scale = c(5, 1)) # social media & conversation
cloud(poliblogPrevFit, topic = 5, scale = c(5, 1)) # leisure time & weather (possibly because walking is leisure)
cloud(poliblogPrevFit, topic = 14, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 4, scale = c(5, 1)) # some great British mockery
cloud(poliblogPrevFit, topic = 18, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 20, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 10, scale = c(5, 1)) # football
cloud(poliblogPrevFit, topic = 7, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 15, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 13, scale = c(5, 1)) # school, children, student, teacher, lesson, educ, video
cloud(poliblogPrevFit, topic = 1, scale = c(5, 1)) # random
cloud(poliblogPrevFit, topic = 17, scale = c(5, 1)) # politics: countries of the UK (Scotland, Northern Ireland, ...)
mod.out.corr <- topicCorr(poliblogPrevFit)
plot(mod.out.corr)
# Now with 10 topics
library(parallel)
ncores <- 10
cl <- makeCluster(ncores)
# Fit the STM model in parallel
fit <- parLapply(cl, 1:ncores, function(i) {
  stm(documents = out$documents, vocab = out$vocab, K = 10, prevalence = ~series, max.em.its = 75, data = out$meta, init.type = "Spectral")
})
# Stop the parallel processing environment
stopCluster(cl)
poliblogPrevFit2 <- searchK(documents = out$documents, vocab = out$vocab, K = c(5, 10, 15), prevalence = ~series, max.em.its = 50, data = out$meta, init.type = "Spectral", cores = 10)
poliblogPrevFit10topics <- stm(documents = out$documents, vocab = out$vocab, K = 10, prevalence = ~series, max.em.its = 40, data = out$meta, init.type = "Spectral")
plot(poliblogPrevFit10topics, type = "summary", xlim = c(0, 0.3))
cloud(poliblogPrevFit10topics, topic = 3, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 7, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 9, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 8, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 1, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 6, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 4, scale = c(4, 1))
cloud(poliblogPrevFit10topics, topic = 5, scale = c(4, 1))
save.image(file="Anger.RData")
# Fear
plot(poliblogPrevFit, type = "summary", xlim = c(0, 0.3))
cloud(poliblogPrevFit, topic = 18, scale = c(4, 1)) # anxiety related terms
cloud(poliblogPrevFit, topic = 12, scale = c(4, 1)) # anxiety related terms
cloud(poliblogPrevFit, topic = 13, scale = c(4, 1)) # anxiety related terms
cloud(poliblogPrevFit, topic = 15, scale = c(4, 1)) # coronavirus, covid, death, symptom, hospit, case, contract, pandem, test
cloud(poliblogPrevFit, topic = 11, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 19, scale = c(4, 1)) # also restrictions a bit
cloud(poliblogPrevFit, topic = 2, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 10, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 10, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 17, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 14, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 1, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 8, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 5, scale = c(4, 1))
cloud(poliblogPrevFit, topic = 4, scale = c(4, 1))
mod.out.corr <- topicCorr(poliblogPrevFit)
plot(mod.out.corr)
View(mod.out.corr)
# Assuming 'Fit' is your STM model
plot(Fit, type="labels", topics=c(1,12,41,31,16,39,10,51,40,27,3,21,26,56,20,14,55,30,32,29,44,23,43,54,42,15,22,13,2,38,52,34,11,8,18,35,37,28,5,6,9,47,7,33,45,53,25,4,57,50,46,48,49,19,17,24)) # Adjust topics to the ones you want to see
plot(Fit, type="labels", topics=c(16,39,10,51,40,27,3,21,26,56,20,14,55,30)) plot(Fit, type="labels", topics=c(32,29,44,23,43,54,42,15,22,13,2,38,52,34)) plot(Fit, type="labels", topics=c(11,8,18,35,37,28,5,6,9,47,7,33,45,53)) plot(Fit, type="labels", topics=c(25,4,57,50,46,48,49,19,17,24)) plot(Fit, type="labels", topics=c())
top_words <- labelTopics(Fit, n=10)$prob
############################################################
# TV VAR
############################################################
######################## for model 1
n2020n2 <- alldataz_sum2[1191:nrow(alldataz_sum2), ]
rownames(n2020n2) <- seq(1263, 1262 + nrow(n2020n2))
whole_tseriesdata$cases[whole_tseriesdata$cases == 0] <- NA
mood <- n2020n2 %>% select(count_topic1, count_topic12, count_topic31, count_topic41,
                           avg_d_topic1, avg_d_topic12, avg_d_topic31, avg_d_topic41)
mood_labels <- colnames(mood[1:9])
mood_labels <- c("positive", "anxiety")
mood2 <- data.matrix(mood)
# 01-08-2023
time2 <- n2020n2 %>% select(Hourno, Dayno)
time2 <- data.matrix(time2)
time2 <- time %>% select(hourno, dayno, hour, day, month)
time2 <- data.matrix(time2)
sum(is.na(mood2))
sum(is.na(time2))
##########################################
# Which lags?
##########################################
ccf(mood$avg_d_topic1, mood$count_topic12, lag.max=200, plot=TRUE)
# Find where the relationship is negative
ccf_result <- ccf(mood$avg_d_topic1, mood$count_topic1, lag.max = 100, plot = FALSE)
negative_lags <- ccf_result$lag[ccf_result$acf < 0]
negative_acf <- ccf_result$acf[ccf_result$acf < 0]
# Calculate the 95% significance level (the usual +/- 1.96 / sqrt(N) band for a CCF)
significance_level <- qnorm((1 + 0.95)/2) / sqrt(length(mood$avg_d_topic1))
# Find negative and significant lags
significant_negative_lags <- negative_lags[abs(negative_acf) > significance_level]
ccf(mood$avg_d_topic1, mood$count_topic1, lag.max = 60, plot = TRUE)
abline(h = -significance_level, col = "blue", lty = 2) # Add line for -95% significance
points(significant_negative_lags, negative_acf[abs(negative_acf) > significance_level], col = "red", pch = 19)
library(lmtest)
# Test if avg_d_topic1 Granger-causes count_topic1
result <- grangertest(count_topic1 ~ avg_d_topic1, order = 1, data = mood) # You can change the order parameter
summary(result)
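# Since the lag order is a free parameter, a small sketch that tries several
# orders and collects the Granger test p-values (the 1:5 grid is arbitrary):
for (p in 1:5) {
  gt <- grangertest(count_topic1 ~ avg_d_topic1, order = p, data = mood)
  cat("order", p, ": p-value =", gt$`Pr(>F)`[2], "\n")
}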
##########################################
# Bandwidth selection
##########################################
set.seed(1)
bwSeq <- seq(0.1, 0.2, length = 10)
bw_object <- bwSelect(data = mood2,
                      type = rep("g", 9), # number = number of variables
                      level = rep(1, 9),
                      bwSeq = bwSeq,
                      bwFolds = 1,
                      bwFoldsize = 20,
                      modeltype = "mvar",
                      lags = 1,
                      scale = TRUE,
                      # when no timepoints are given, they are assumed to be equidistant
                      beepvar = time2$Hourno, # model per day -> use a variable with 1:121; model per hour -> use Hourno
                      dayvar = time2$Dayno,   # model per day -> use a variable with all 1's; model per hour -> use Dayno
                      pbar = TRUE)
bandwith <- bwSeq[which.min(bw_object$meanError)]
bandwith
bandwithdata0.01_0.1hour <- bw_object$meanError
bandwithdata0.1_1hour <- bw_object$meanError
bandwithdata0.1_1hour
bandwithdata0.01_0.1hour <- bw_object$meanError
bandwithdata0.01_0.1hour
bandwithdata0.1_0.2hour <- bw_object$meanError
bandwithdata0.1_0.2hour
bandwithdata0.01_0.1hour
bandwithdata0.1_1hour <-bw_object$meanError
bandwithdata1_10 <-bw_object$meanError
bandwithdata0.1_1 <-bw_object$meanError
bandwithdata0.01_0.1 <-bw_object$meanError
bandwithdata0.1_1hour
bandwithdata0.01_0.1hour
# Model per day
bandwithdata1_10
bandwithdata0.1_1
bandwithdata0.01_0.1
##################################
# Visualize bandwidth: plot the mean error against the bandwidth sequence
##################################
# New
# Combining all bandwidth sequences
bwSeq_combined <- c(seq(0.01, 0.1, length = 10), seq(0.1, 1, length = 10), seq(0.1, 0.2, length = 10))
# Combining all bandwidth mean error data
all_bandwidth_data <- c(bandwithdata0.01_0.1hour, bandwithdata0.1_1hour, bandwithdata0.1_0.2hour)
# Plotting the combined data
plot(bwSeq_combined, all_bandwidth_data, type = "n", xlab = "Bandwidths", ylab = "Mean Error", main = "Bandwidth Selection", xlim = c(0.01, 1), ylim = range(all_bandwidth_data), xaxt = "n")
# Custom x-axis ticks
breaks <- c(seq(0, 0.2, by = 0.02), seq(0.3, 1, by = 0.1))
axis(1, at = breaks, labels = TRUE, cex.axis = 0.6)
# Adding the individual data series
lines(seq(0.01, 0.1, length = 10), bandwithdata0.01_0.1hour, type = "b", pch = 20, col = "red")
lines(seq(0.1, 1, length = 10), bandwithdata0.1_1hour, type = "b", pch = 20, col = "blue")
lines(seq(0.1, 0.2, length = 10), bandwithdata0.1_0.2hour, type = "b", pch = 20, col = "green")
# Adding the legend
legend(x="topright", legend=c("Bandwidth 0.01-0.1", "Bandwidth 0.1-1", "Bandwidth 0.1-0.2"), col=c("red", "blue", "green"), lwd=1, pch=20, horiz=TRUE)
##############
# Visualize
##############
# Gaussian kernel function
gaussian_kernel <- function(t, te, sigma) {
  Zt <- (1 / (sqrt(2 * pi * sigma))) * exp(-((t - te)^2) / (2 * sigma^2))
  Zt_max <- max(sapply(1:100, function(t) (1 / (sqrt(2 * pi * sigma))) * exp(-((t - te)^2) / (2 * sigma^2))))
  return(Zt / Zt_max)
}
# Parameters
te <- 6 # Center of the Gaussian
b <- 1.3 # Bandwidth
time_points <- seq(1, 12, by = 0.1) # Using a finer granularity for a smoother curve
# Calculate weights for all time points
weights <- sapply(time_points, function(t) gaussian_kernel(t, te, b))
# Plot
plot(time_points, weights, type = "l", xlab = "Time Points", ylab = "Weights", main = paste("Weights for bandwidth b =", b))
# Make it prettier
# Libraries
library(ggplot2)
# Gaussian kernel function
gaussian_kernel <- function(t, te, sigma) {
  Zt <- (1 / (sqrt(2 * pi * sigma))) * exp(-((t - te)^2) / (2 * sigma^2))
  Zt_max <- max(sapply(1:100, function(t) (1 / (sqrt(2 * pi * sigma))) * exp(-((t - te)^2) / (2 * sigma^2))))
  return(Zt / Zt_max)
}
# Parameters
te <- 6 # Center of the Gaussian
b <- 1.3 # Bandwidth
time_points <- seq(1, 12)
# Calculate weights for all time points
weights <- sapply(time_points, function(t) gaussian_kernel(t, te, b))
# Create a data frame
data <- data.frame(Time = time_points, Weight = weights)
# Create a ggplot graph
p <- ggplot(data, aes(x = Time, y = Weight)) +
  geom_line(color = "steelblue", size = 1) +
  labs(x = "Time Points", y = "Weights", title = paste("Weights for bandwidth b =", b)) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
# Print the graph
print(p)
gaussian_kernel <- function(t, te, sigma) {
  Zt <- (1 / (sqrt(2 * pi) * sigma)) * exp(-((t - te)^2) / (2 * sigma^2))
  Zt_max <- max(sapply(1:100, function(t) (1 / (sqrt(2 * pi) * sigma)) * exp(-((t - te)^2) / (2 * sigma^2))))
  return(Zt / Zt_max)
}
# Parameters
te <- 6 # Center of the Gaussian
b <- 1.3 # Bandwidth
sigma <- 0.13
time_points <- seq(1, 12)
# Calculate weights for all time points
weights <- sapply(time_points, function(t) gaussian_kernel(t, te, b))
# Standard deviation value
std <- b
# Plot
plot(time_points, weights, type = "n", xlab = "Time Points", ylab = "Weights", main = paste("Weights for bandwidth b =", b), xaxt = "n")
# Fill area under the curve
polygon(c(time_points, rev(time_points)), c(weights, rep(0, length(weights))), col = "lightblue")
lines(time_points, weights, col = "steelblue", lwd = 2)
# Set x-axis tick marks
axis(1, at = seq(1, 12))
# Add vertical lines at +2 std and -2 std
abline(v = c(te - 2 * std, te + 2 * std), col = c("red", "red"), lty = 2)
# Add a legend
legend("topright", legend = paste("b =", b), col = "steelblue", lwd = 2)
# Add grid
grid()
# Gaussian kernel function
gaussian_kernel <- function(j, te, b) {
  weight <- (1 / (sqrt(2 * pi * b^2))) * exp(-((j - te)^2) / (2 * b^2))
  return(weight)
}
# Parameters
te <- 3 # Center of the Gaussian
b <- 0.2 # Bandwidth
time_points <- seq(1, 12)
# Calculate weights for all time points
weights <- sapply(1:length(time_points), function(j) gaussian_kernel(j, te, b))
# Plot
plot(time_points, weights, type = "l", xlab = "Time Points", ylab = "Weights", main = paste("Weights for bandwidth b =", b))
# Add grid
grid()
##########################################
# Model
##########################################
tvvar_obj <- tvmvar(data = mood2,
                    type = rep("g", 9), # 9 = number of variables
                    level = rep(1, 9),
                    lambdaSel = "CV",
                    lags = 1, # assumed: lag-1 VAR, matching the bwSelect call above
                    timepoints = mood$rank, # when no time points are given, they are assumed to be equidistant
                    estpoints = seq(0, 1, length = 100), # assumed: 100 estimation points, matching the plots below
                    bandwidth = bwSeq[which.min(bw_object$meanError)], # the bandwidth selected above
                    pbar = TRUE)
tvvar_obj
str(tvvar_obj)
summary(tvvar_obj)
str(tvvar_obj$tvmodels[[1]])
lambda_min_index_1 <- which.min(tvvar_obj$tvmodels[[1]]$fitobj$glmnet.fit$lambda)
coefficients_for_lambda_min_1 <- tvvar_obj$tvmodels[[1]]$fitobj$glmnet.fit$beta[, lambda_min_index_1]
print(possible_coefs)
########################################
# Reliability of parameter estimates
########################################
res_obj <- resample(object = tvvar_obj, data = mood, nB = 50, blocks = 10, seeds = 1:50, quantiles = c(0.05, 0.95))
res_obj
########################################
# Time-varying prediction error
########################################
pred_obj <- predict(object = tvvar_obj,
                    data = mood,
                    errorCon = c("R2", "RMSE"),
                    tvMethod = "weighted",
                    consec = time2$Hourno) # switch to Hourno for data per hour
pred_obj
pred_obj$tverrors
####################################################
# Time-varying Mixed Graphical Model (MGM)
####################################################
fit_mgm <- mgm(data = mood, type = rep("g",9), levels = rep(1,9), k = 3, lambdaSel = "CV", lambdaFolds = 10,ruleReg = "AND")
str(fit_mgm)
tv_coeffs <- fit_mgm$coef
summary(fit_mgm)
# Plotting time-varying coefficients
plot(1:nrow(tv_coeffs), tv_coeffs[, 1], type = "l", col = 1, ylim = range(tv_coeffs), xlab = "Time", ylab = "Coefficient Value", main = "Time-varying Coefficients")
# If you have more than one coefficient to plot
for (i in 2:ncol(tv_coeffs)) {
  lines(1:nrow(tv_coeffs), tv_coeffs[, i], col = i)
}
# Adding a legend to distinguish coefficients
legend("topright", legend = colnames(tv_coeffs), col = 1:ncol(tv_coeffs), lty = 1)
# Variable order: count_topic1, count_topic12, count_topic31, count_topic41,
#                 avg_d_topic1, avg_d_topic12, avg_d_topic31, avg_d_topic41, Count
fit_mgm$interactions$indicator
FactorGraph(object = fit_mgm, labels = mood_labels, PairwiseAsEdge = FALSE)
########################################
# Visualizing the tv VAR model
########################################
######################################################################################
# NEW visualization
######################################################################################
f_timeline_new <- function(length = .15, gap = .005, mar = c(0,0,0,0), ylim = c(-.1,.1), ytext = -.1, cex = 1) {
  par(mar = mar)
  plot.new()
  plot.window(xlim = c(0, 1), ylim = ylim)
  box()
  # arrows
  p_weeks <- c(394, 2160, 1367)
  bor_end <- c(0, cumsum(p_weeks) / sum(p_weeks))
  for (i in 1:3) {
    arrows(bor_end[i] + gap, 0, bor_end[i+1] - gap, code = 3, length = length, lwd = 1.5)
  }
  # text
  t_lengths <- p_weeks / sum(p_weeks)
  midpoints <- bor_end[-1] - t_lengths / 2
  text(midpoints, rep(ytext, 3), c("May 2020", "June 2020", "First 20 days of July 2020"), cex = 0.8)
  # lockdown
  points(0, rep(0, 1), pch = 20, cex = 1.5, col = "red") ## not sure about this!! (sum(p_weeks)*7-10) = 53, what is it?
  points(394, rep(0, 1), pch = 20, cex = 1.5, col = "orange")
  points(2554, rep(0, 3), pch = 4, cex = 1.5, col = "black") ## cross for network model 1
  points((42) / (sum(p_weeks)*7-5), pch = 4, cex = 1.5, col = "black") ## cross for network model 2
  points(c(42, 98) / (sum(p_weeks)*7-5), rep(0, 2), pch = 4, cex = 1.5, col = "black") ## cross for network model 3
}
# ----- Preprocessing ------
# Compute the mean model over time to create a decent layout
mean_wadj <- apply(tvvar_obj$wadj[, , 1, ], 1:2, mean)
par_ests <- tvvar_obj$wadj
ind_negative <- which(tvvar_obj$signs == -1, arr.ind = TRUE)
par_ests[ind_negative] <- par_ests[ind_negative] * -1
# Find parameters with the highest SD
wadj_ws <- tvvar_obj$wadj
wadj_ws[tvvar_obj$edgecolor == "red"] <- wadj_ws[tvvar_obj$edgecolor == "red"] * -1
parm_sds <- apply(wadj_ws, 1:2, sd)
parm_sds_mat <- matrix(NA, 12^2, 3)
counter <- 1
for (i in 1:9) { # change this to the number of variables
  for (j in 1:9) { # change this to the number of variables
    parm_sds_mat[counter, ] <- c(i, j, parm_sds[i, j])
    counter <- counter + 1
  }
}
parm_sds_mat_ord <- parm_sds_mat[order(parm_sds_mat[, 3], decreasing = TRUE), ]
# Variable order: count_topic1, count_topic12, count_topic31, count_topic41,
#                 avg_d_topic1, avg_d_topic12, avg_d_topic31, avg_d_topic41, Count
head(parm_sds_mat_ord) # six most time-varying parameters
parm_sds_mat
# ----- Plotting ------
library(qgraph)
figDir <- "C:/Users/Brent/Documents/Psychology/KUL/thesisstatisticsdata/mgm"
pdf(paste0(figDir, "/Fig_Application_mgm2.pdf"), width = 8, height = 7)
# 1) Define layout
lmat <- matrix(c(1, 2, 3, 4, 4, 4, 5, 5, 5), ncol = 3, byrow = TRUE)
lo <- layout(lmat, heights = c(.7, .1, .6), widths = c(1, 1, 1))
# 2) Two network plots
# Get layout of mean graph
Q <- qgraph(t(mean_wadj), DoNotPlot = TRUE)
saveRDS(Q$layout, "C:/Users/Brent/Documents/Psychology/KUL/thesisstatisticsdata/layout_mgm.RDS")
# Plot graph at selected fixed time points
tpSelect <- c(2, 50, 100)
# Switch to a colorblind-friendly scheme
tvvar_obj$edgecolor[, , , ][tvvar_obj$edgecolor[, , , ] == "darkgreen"] <- c("darkblue")
lty_array <- array(1, dim = c(9, 9, 1, 100)) # change 9 to the number of variables
lty_array[tvvar_obj$edgecolor[, , , ] != "darkblue"] <- 2
for (tp in tpSelect) {
  qgraph(t(tvvar_obj$wadj[, , 1, tp]),
         layout = Q$layout,
         edge.color = t(tvvar_obj$edgecolor[, , 1, tp]),
         labels = mood_labels,
         vsize = 13, esize = 10, asize = 10,
         mar = rep(5, 4), minimum = 0, maximum = .5,
         lty = t(lty_array[, , 1, tp]),
         pie = pred_obj$tverrors[[tp]][, 3])
}
# 4) Timeline
f_timeline_new(length = .1, mar=c(0, 4, 0, 1), ylim = c(-1.2, .2), ytext = -.9, cex = 1)
# 5) Line plots + CIs
plot.new()
par(mar = c(4, 4, 0, 1))
plot.window(xlim = c(1, 100), ylim = c(-.25, 1))
axis(1, c(1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100), labels = TRUE)
axis(2, c(-0.25, 0, .25, 0.5, 0.75, 1), las = 2)
title(xlab = "Estimation points", cex.lab = 1.1)
title(ylab = "Parameter estimate", cex.lab = 1.1)
abline(h = 0, col = "grey", lty = 2)
head(parm_sds_mat_ord) # pick three highest
# Variable order: count_topic1, count_topic12, count_topic31, count_topic41,
#                 avg_d_topic1, avg_d_topic12, avg_d_topic31, avg_d_topic41, Count
m_par_display <- matrix(c(1, 5, 2, 6, 4, 8), ncol = 2, byrow = T)
# Select colors
library(RColorBrewer) # for brewer.pal
library(scales)       # for alpha (used in the shading below)
cols <- brewer.pal(12, "Spectral")[c(5, 1, 3)] # avoid red/green because they are used for edges in the upper panel
# set 1 ### set 2
for (i in 1:nrow(m_par_display)) {
  par_row <- m_par_display[i, ]
  # Plot point estimates
  P1_pointest <- par_ests[par_row[1], par_row[2], 1, ]
  lines(1:100, P1_pointest, col = cols[i], lwd = 2, lty = i)
  # Plot uncertainty estimates [new shading]
  # Compute CIs
  CIs <- apply(res_obj$bootParameters[par_row[1], par_row[2], 1, , ], 1,
               function(x) quantile(x, probs = c(.05, .95)))
  # Plot shading
  polygon(x = c(1:100, 100:1), y = c(CIs[1, ], rev(CIs[2, ])),
          col = alpha(colour = cols[i], alpha = .3), border = FALSE)
} # end for: i
# Variable order: count_topic1, count_topic12, count_topic31, count_topic41,
#                 avg_d_topic1, avg_d_topic12, avg_d_topic31, avg_d_topic41, Count
legend_labels <- c(expression("count_topic1"["t-1"] %->% "avg_d_topic1"["t"]),
                   expression("count_topic12"["t-1"] %->% "avg_d_topic12"["t"]),
                   expression("count_topic41"["t-1"] %->% "avg_d_topic41"["t"]))
legend("bottomright", legend_labels, col = cols, lwd = 2, bty = "n", cex = 1, horiz = TRUE, lty = 1:3)
dev.off()