-- coding: utf-8 --

""" Created on Sun Jul 23 23:36:39 2023

@author: Brent """

########################################################################################################

Clustering

######################################################################################################## import pandas as pd import os

os.chdir("C:\Users\Erik\Documents\Brent\USB-station\statsthesis")

os.chdir("C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata")

◘df = pd.read_csv(r'dataNLPTopic.csv')

dfall = pd.read_csv(r'alldataz_sum2.csv') aa = df.head(1000)

###########################################

standerdize

###########################################

from sklearn.preprocessing import StandardScaler

Create a StandardScaler instance

scaler = StandardScaler()

Specify the columns to standardize

cols_to_standardize = list(aa.columns[19:30]) + list(aa.columns[-57:]) cols_to_standardize = list(df.columns[17:28]) + list(df.columns[-57:])

Standardize the columns

aa[cols_to_standardize] = scaler.fit_transform(aa[cols_to_standardize])

df[cols_to_standardize] = scaler.fit_transform(df[cols_to_standardize])

topicsd = df['topic_1'].std topicsd = df['topic_1'].std()

###########################################

subset for clustering

###########################################

topics top 18:

1,12,36,41,

31,16,39,10,51

40,27,3,21,26

56,20,14,55

threshold = 0

subsettopic1 = df[df['topic_1'] > threshold]

subsettopic12 = df[df['topic_12'] > threshold] subsettopic36 = df[df['topic_36'] > threshold] subsettopic41 = df[df['topic_41'] > threshold] subsettopic31 = df[df['topic_31'] > threshold] subsettopic16 = df[df['topic_16'] > threshold]

subsettopic39 = df[df['topic_39'] > threshold] subsettopic10 = df[df['topic_10'] > threshold] subsettopic51 = df[df['topic_51'] > threshold] subsettopic40 = df[df['topic_40'] > threshold] subsettopic27 = df[df['topic_27'] > threshold] subsettopic3 = df[df['topic_3'] > threshold] subsettopic21 = df[df['topic_21'] > threshold] subsettopic26 = df[df['topic_26'] > threshold] subsettopic56 = df[df['topic_56'] > threshold] subsettopic20 = df[df['topic_20'] > threshold] subsettopic14 = df[df['topic_14'] > threshold] subsettopic55 = df[df['topic_55'] > threshold]

subsettopic1.to_csv('subsettopic1.csv') subsettopic12.to_csv('subsettopic12.csv') subsettopic36.to_csv('subsettopic36.csv') subsettopic41.to_csv('subsettopic41.csv') subsettopic31.to_csv('subsettopic31.csv') subsettopic16.to_csv('subsettopic16.csv')

subsettopic.to_csv('subsettopic.csv') subsettopic.to_csv('subsettopic.csv') subsettopic.to_csv('subsettopic.csv') subsettopic.to_csv('subsettopic.csv') subsettopic.to_csv('subsettopic.csv') subsettopic.to_csv('subsettopic.csv')

a2a = subsettopic1.head(30000) print(a2a)

df2 = subsettopic1 df2 = a2a

a2a.to_csv('df2.csv')

###########################################

clustering

########################################### import numpy as np import hdbscan import pandas as pd import numpy as np from sklearn.preprocessing import StandardScaler

################ old dat = subsettopic1.iloc[:, 21:32] aab = dat.head(10) dat = dat.head(30000) clust = hdbscan.HDBSCAN(min_cluster_size = 5, gen_min_span_tree=True, ) clust_lab = clust.fit_predict(dat)

################ old

average_d = [] prev_steps = 1 # 24hours * 20 minutes -> 72 timesteps per hour

for current_step in sorted(df2['Rank'].unique()):

# Select tweets in this step and the previous 71 steps about the given topic
mask = ((df2['Rank'] >= current_step - prev_steps) & (df2['Rank'] <= current_step))
relevant_tweets = df2.loc[mask].copy()

# Select tweets in this step about the given topic
mask_current_step = (df2['Rank'] == current_step)
current_step_tweets = df2.loc[mask_current_step].copy()

# Perform HDBSCAN clustering on the emotion variables of relevant_tweets
clusterer = hdbscan.HDBSCAN(min_cluster_size=int(5))

cluster_data = relevant_tweets.iloc[:, 21:32]

# Drop rows with missing data
cluster_data.dropna(inplace=True)
cluster_data.replace([np.inf, -np.inf], np.nan, inplace=True)
print(cluster_data.isnull().sum())

# Print shape and type of the data
print(f"Shape of the data: {cluster_data.shape}, Type of the data: {type(cluster_data)}")
print(cluster_data.describe())
print(cluster_data.info())
cluster_data = cluster_data.astype(np.float64)

cluster_labels = clusterer.fit_predict(cluster_data)

# Get the probabilities of belonging to a cluster.
probabilities = clusterer.probabilities_

# Add the cluster labels and probabilities to the relevant_tweets DataFrame
relevant_tweets['Cluster'] = cluster_labels
relevant_tweets['Score'] = probabilities

# Find the largest cluster
largest_cluster = relevant_tweets['Cluster'].value_counts().idxmax()

# Get the typical emotional pattern (average emotion scores) of this cluster
typical_pattern = relevant_tweets[relevant_tweets['Cluster'] == largest_cluster].iloc[:,21:32].mean()

# Select the data of the current step from the largest cluster
current_step_cluster_data = current_step_tweets[current_step_tweets['Cluster'] == largest_cluster].iloc[:,21:32]

# Compute the Euclidean distances of each point in the current_step_cluster_data to the typical_pattern
distances = np.sqrt(((current_step_cluster_data - typical_pattern)**2).sum(axis=1))

# Compute the average distance
average_distance = distances.mean()

# Store the average distance
average_d.append(average_distance)

# store average distances in df
df_avg_d = pd.DataFrame({'Rank': sorted(df2['Rank'].unique())[prev_steps:],
                        'avg_d': average_d

})

DBSCAN

import pandas as pd from sklearn.cluster import DBSCAN from scipy.spatial import distance import numpy as np import time

Load the data

df2 = pd.read_csv('df2.csv')

subsettopic1 = subsettopic1.sort_values('created_at') a2a = subsettopic1.head(300000)

print(a2a) df2 = subsettopic1 df2 = a2a

df2 = subsettopic31

average_d = [] num_clusters = []

prev_steps = 71 # 24hours 20 minutes -> 72 timesteps per hour or past 6 hours = 17 (36 -1) start_time = time.time() for current_step in sorted(df2['Rank'].unique()):

# Select tweets in this step and the previous 71 steps about the given topic
mask = ((df2['Rank'] >= current_step - prev_steps) & (df2['Rank'] <= current_step))
relevant_tweets = df2.loc[mask].copy()

# Perform DBSCAN clustering on the emotion variables of relevant_tweets
clusterer = DBSCAN(eps=0.5, min_samples=5) # You may need to adjust these parameters

cluster_data = relevant_tweets.iloc[:, 21:32]

# Drop rows with missing data
cluster_data.dropna(inplace=True)
cluster_data.replace([np.inf, -np.inf], np.nan, inplace=True)

cluster_data = cluster_data.astype(np.float64)

cluster_labels = clusterer.fit_predict(cluster_data)

# Add the cluster labels to the relevant_tweets DataFrame
relevant_tweets['Cluster'] = cluster_labels

# Select tweets in this step about the given topic
mask_current_step = (df2['Rank'] == current_step)
current_step_tweets = df2.loc[mask_current_step].copy()

# Add the 'Cluster' column to the current_step_tweets DataFrame
current_step_tweets['Cluster'] = relevant_tweets.loc[mask_current_step, 'Cluster']

# Find the largest cluster
largest_cluster = relevant_tweets['Cluster'].value_counts().idxmax()

# Get the typical emotional pattern (average emotion scores) of this cluster
typical_pattern = relevant_tweets[relevant_tweets['Cluster'] == largest_cluster].iloc[:,21:32].mean()

# Select the data of the current step from the largest cluster
current_step_cluster_data = current_step_tweets[current_step_tweets['Cluster'] == largest_cluster].iloc[:,21:32]

# Compute the Euclidean distances of each point in the current_step_cluster_data to the typical_pattern
distances = distance.cdist(current_step_cluster_data, typical_pattern.values.reshape(1,-1), 'euclidean')

# Compute the number of unique clusters (excluding noise, which is labeled as -1 by DBSCAN)
n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
# Store the number of clusters
num_clusters.append(n_clusters)

# Compute the average distance
if len(distances) > 0:
    average_distance = distances.mean()
else:
    average_distance = np.nan

# Store the average distance
average_d.append(average_distance)

Store average distances in df

df_avg_d = pd.DataFrame({'Rank': sorted(df2['Rank'].unique()), 'avg_d': average_d}) df_avg_d['num_clusters'] = num_clusters

end_time = time.time() execution_time = end_time - start_time

execution_time

df_avg_d_topic31 = df_avg_d df_avg_d.to_csv('df_avg_d_topic31.csv') cluster_data.to_csv('cluster_data_topic31.csv') cluster_labels_topic31 = cluster_labels average_d_topic31 = average_d distances_topic31 = distances

cluster_labelsDF = cluster_labels.df

############################################################

reset varibales

List all variables you want to keep

variables_to_keep = ['a2a', 'df','df2','subsettopic1']

Get a dictionary of your current global variables

globals_ = globals().copy()

Delete all variables except the ones you want to keep

for var in globals_: if var not in variables_tokeep and not var.startswith(''): del globals()[var]

##########################################################

###############################################################################•

analysis

###############################################################################•

cluster_data_topic1 = pd.read_csv(r'cluster_data_topic1.csv') cluster_data_topic12 = pd.read_csv(r'cluster_data_topic12.csv') cluster_data_topic31 = pd.read_csv(r'cluster_data_topic31.csv') cluster_data_topic36 = pd.read_csv(r'cluster_data_topic36.csv') cluster_data_topic41 = pd.read_csv(r'cluster_data_topic41.csv')

df_avg_d_topic1 = pd.read_csv(r'df_avg_d_topic1.csv') df_avg_d_topic12 = pd.read_csv(r'df_avg_d_topic12.csv') df_avg_d_topic31 = pd.read_csv(r'df_avg_d_topic31.csv') df_avg_d_topic36 = pd.read_csv(r'df_avg_d_topic36.csv') df_avg_d_topic41 = pd.read_csv(r'df_avg_d_topic41.csv')

plot

import matplotlib.pyplot as plt

plt.plot(df_avg_d['Rank'], df_avg_d['avg_d'], marker = 'o', linestyle= '-' )

###########################

this is full data

plt.figure(figsize=(10,6)) plt.plot(df_avg_d_topic31['Rank'], df_avg_d_topic31['avg_d'], marker = 'o', linestyle= '-' ) plt.ylabel("Distance to dominant emotional pattern") plt.xlabel("Time steps") plt.title('Topic 31', fontdict=None, loc='center', pad=None) plt.show()

###########################

this is smooth data

Calculate the moving average

window_size = 12 df_avg_d_topic31['avg_d_smooth'] = df_avg_d_topic31['avg_d'].rolling(window=window_size).mean()

Plot the original data and the smoothed data

plt.figure(figsize=(10,6))

plt.plot(df_avg_d_topic31['Rank'], df_avg_d['avg_d'], marker='o', linestyle='-', alpha=0.5, label='Original')

plt.plot(df_avg_d_topic31['Rank'], df_avg_d_topic31['avg_d_smooth'], marker='', linestyle='-', color='r', label='Smoothed') plt.ylabel("Distance to dominant emotional pattern") plt.xlabel("Time steps") plt.title('Topic 31', fontdict=None, loc='center', pad=None) plt.show()

Plot the original data and the smoothed data

plt.figure(figsize=(10,6)) plt.plot(df_avg_d_topic31['Rank'], df_avg_d_topic31['avg_d'], marker='o', linestyle='-', alpha=0.5, label='Original') plt.plot(df_avg_d_topic31['Rank'], df_avg_d_topic31['avg_d_smooth'], marker='', linestyle='-', color='r', label='Smoothed') plt.ylabel("Distance to dominant emotional pattern") plt.xlabel("Time steps") plt.title('Topic 31', fontdict=None, loc='center', pad=None) plt.show()

function to compute the data above for all datasets

def process_and_plot(df, topic_name, window_size=12):

plot full data

plt.figure(figsize=(10,6))
plt.plot(df['Rank'], df['avg_d'], marker = 'o', linestyle= '-' )
plt.ylabel("Distance to dominant emotional pattern")
plt.xlabel("Time steps")
plt.title(topic_name, fontdict=None, loc='center', pad=None)
plt.show()

# smooth data
df['avg_d_smooth'] = df['avg_d'].rolling(window=window_size).mean()

# plot smooth data
plt.figure(figsize=(8,6))
plt.plot(df['Rank'], df['avg_d_smooth'], marker='', linestyle='-', color='r', label='Smoothed')
plt.ylabel("Distance to dominant emotional pattern")
plt.xlabel("Time steps")
plt.title(topic_name, fontdict=None, loc='center', pad=None)
plt.legend()
plt.show()

# plot both original and smooth data
plt.figure(figsize=(10,6))
plt.plot(df['Rank'], df['avg_d'], marker='o', linestyle='-', alpha=0.5, label='Original')
plt.plot(df['Rank'], df['avg_d_smooth'], marker='', linestyle='-', color='r', label='Smoothed')
plt.ylabel("Distance to dominant emotional pattern")
plt.xlabel("Time steps")
plt.title(topic_name, fontdict=None, loc='center', pad=None)
plt.legend()
plt.show()

you can call this function on any dataframe like this:

process_and_plot(df_avg_d, 'Topic 0')

process_and_plot(df_avg_d_topic1, 'Topic 1') process_and_plot(df_avg_d_topic12, 'Topic 12') process_and_plot(df_avg_d_topic31, 'Topic 31') process_and_plot(df_avg_d_topic41, 'Topic 41')

new

def process_and_plot(df, topic_name, window_size=6):

Function to add vertical lines and annotations

def add_vertical_lines_and_annotations():
    plt.axvline(x=1213, color='k', linestyle='--')
    plt.text(1213, df['avg_d'].min(), 'George Floyd', rotation=90, verticalalignment='bottom')

    plt.axvline(x=1657, color='k', linestyle='-')
    plt.text(1657, df['avg_d'].min(), 'June', rotation=90, verticalalignment='bottom', horizontalalignment='right')

    plt.axvline(x=3817, color='k', linestyle='-')
    plt.text(3817, df['avg_d'].min(), 'July', rotation=90, verticalalignment='bottom', horizontalalignment='right')

# plot full data
plt.figure(figsize=(10,6))
plt.plot(df['Rank'], df['avg_d'], marker = 'o', linestyle= '-' )
add_vertical_lines_and_annotations()
plt.ylabel("Distance to dominant emotional pattern")
plt.xlabel("Time steps")
plt.title(topic_name, fontdict=None, loc='center', pad=None)
plt.show()

# smooth data
df['avg_d_smooth'] = df['avg_d'].rolling(window=window_size).mean()

# plot smooth data
plt.figure(figsize=(8,6))
plt.plot(df['Rank'], df['avg_d_smooth'], marker='', linestyle='-', color='r', label='Smoothed')
add_vertical_lines_and_annotations()
plt.ylabel("Distance to dominant emotional pattern")
plt.xlabel("Time steps")
plt.title(topic_name, fontdict=None, loc='center', pad=None)
plt.legend()
plt.show()

# plot both original and smooth data
plt.figure(figsize=(10,6))
plt.plot(df['Rank'], df['avg_d'], marker='o', linestyle='-', alpha=0.5, label='Original')
plt.plot(df['Rank'], df['avg_d_smooth'], marker='', linestyle='-', color='r', label='Smoothed')
add_vertical_lines_and_annotations()
plt.ylabel("Distance to dominant emotional pattern")
plt.xlabel("Time steps")
plt.title(topic_name, fontdict=None, loc='center', pad=None)
plt.legend()
plt.show()

you can call this function on any dataframe like this:

process_and_plot(df_avg_d, 'Topic 0')

process_and_plot(df_avg_d_topic1, 'Topic 1') process_and_plot(df_avg_d_topic12, 'Topic 12') process_and_plot(df_avg_d_topic31, 'Topic 31') process_and_plot(df_avg_d_topic41, 'Topic 41')

###############################

def merge_and_rename(df, topic_name):

Merge 'avg_d' column into 'awhole' based on the 'Rank' column

awhole[f'avg_d_{topic_name}'] = df.set_index('Rank')['avg_d']
# Reset the index to avoid duplicate index entries
awhole.reset_index(inplace=True)

Merge and rename the 'avg_d' columns for each dataset

merge_and_rename(df_avg_d, 'Topic0') merge_and_rename(df_avg_d_topic1, 'Topic1') merge_and_rename(df_avg_d_topic12, 'Topic12') merge_and_rename(df_avg_d_topic31, 'Topic31') merge_and_rename(df_avg_d_topic41, 'Topic41')

Assuming 'awhole' is the dataset we have been working with, and it has the 'Rank' column as the index.

Update 'awhole' with 'avg_d' columns from each dataset

awhole['avg_d_topic0'] = pd.to_numeric(df_avg_d_topic1.set_index('Rank')['avg_d'], errors='coerce') awhole['avg_d_topic1'] = pd.to_numeric(df_avg_d_topic1.set_index('Rank')['avg_d'], errors='coerce') awhole['avg_d_topic12'] = pd.to_numeric(df_avg_d_topic12.set_index('Rank')['avg_d'], errors='coerce') awhole['avg_d_topic31'] = pd.to_numeric(df_avg_d_topic31.set_index('Rank')['avg_d'], errors='coerce') awhole['avg_d_topic41'] = pd.to_numeric(df_avg_d_topic41.set_index('Rank')['avg_d'], errors='coerce')

a2awhole = awhole.head(10)

###################### now use the complete dataset:

import pandas as pd import matplotlib.pyplot as plt

def process_and_plot(awhole, topic_name, window_size=71):

plot full data

plt.figure(figsize=(10, 6))
plt.plot(awhole.index, awhole[f'avg_d_{topic_name}'], marker='o', linestyle='-')
plt.ylabel("Distance to dominant emotional pattern")
plt.xlabel("Time steps")
plt.title(topic_name, fontdict=None, loc='center', pad=None)
plt.show()

# smooth data
awhole[f'avg_d_{topic_name}_smooth'] = awhole[f'avg_d_{topic_name}'].rolling(window=window_size).mean()
awhole['Count_smooth'] = awhole['Count'].rolling(window=window_size).mean()

# plot smooth data
plt.figure(figsize=(10, 6))
plt.plot(awhole.index, awhole[f'avg_d_{topic_name}_smooth'], marker='', linestyle='-', color='r', label='Smoothed avg_d')
plt.plot(awhole.index, awhole['Count_smooth'], marker='', linestyle='-', color='g', label='Smoothed Count')
plt.ylabel("Distance to dominant emotional pattern")
plt.xlabel("Time steps")
plt.title(topic_name, fontdict=None, loc='center', pad=None)
plt.legend()
plt.show()

# plot both original and smooth data
plt.figure(figsize=(10, 6))
plt.plot(awhole.index, awhole[f'avg_d_{topic_name}'], marker='o', linestyle='-', alpha=0.5, label='Original avg_d')
plt.plot(awhole.index, awhole[f'avg_d_{topic_name}_smooth'], marker='', linestyle='-', color='r', label='Smoothed avg_d')
plt.plot(awhole.index, awhole['Count_smooth'], marker='', linestyle='-', color='g', label='Smoothed Count')
plt.ylabel("Distance to dominant emotional pattern")
plt.xlabel("Time steps")
plt.title(topic_name, fontdict=None, loc='center', pad=None)
plt.legend()
plt.show()

process_and_plot(awhole, 'topic0') # Replace 'Topic 1' with the desired topic name

######################### correlation

Select the columns of interest

columns_of_interest = ['avg_d_topic0','avg_d_topic1','avg_d_topic12','avg_d_topic41', 'avg_d_topic31', 'Count', 'public_metrics.like_count', 'public_metrics.reply_count', 'public_metrics.retweet_count']

Compute the correlation matrix

correlation_matrix = awhole[columns_of_interest].corr() import seaborn as sns import matplotlib.pyplot as plt

Plot the correlation matrix as a heatmap

plt.figure(figsize=(10, 8)) sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1) plt.show()

########################## autocorrelation

import pandas as pd

Select the columns of interest

columns_of_interest = ['avg_d_topic1','avg_d_topic12','avg_d_topic41', 'avg_d_topic31', 'Count', 'count_topic1','public_metrics.like_count', 'public_metrics.reply_count', 'public_metrics.retweet_count']

columns_of_interest = ['avg_d_topic1','avg_d_topic12','avg_d_topic41', 'avg_d_topic31', 'Count', 'count_topic1','count_topic12','count_topic41','count_topic31']

Calculate the autocorrelation with a lag of 1 for each column

lag_1_autocorrelation = {} for column in columns_of_interest: lagged_column = awhole[column].shift(1) autocorr = awhole[column].corr(lagged_column) lag_1_autocorrelation[column] = autocorr

for column, autocorr in lag_1_autocorrelation.items(): print(f"Lag-1 Autocorrelation for {column}: {autocorr}")

####################### cross-lagged effects

Calculate the autocorrelation with different lags for each pair of columns

autocorrelations = pd.DataFrame() for column1 in columns_of_interest: for column2 in columns_of_interest: for lag in range(1, 6): # Calculate autocorrelation for lags 1 to 5 (can adjust the range as needed) lagged_column1 = awhole[column1].shift(lag) autocorr = awhole[column2].corr(lagged_column1) autocorrelations.loc[f"Lag {lag}", f"{column1} -> {column2}"] = autocorr

The 'autocorrelations' DataFrame will contain the autocorrelation values for each pair of columns at different lags.

print(autocorrelations)

plt.figure(figsize=(10, 8)) sns.heatmap(autocorrelations, annot=True, cmap='coolwarm', vmin=-1, vmax=1) plt.title("Autocorrelation between Variables at Different Lags") plt.show()

############ cross lagged effects in batches:

columns_of_interest = ['avg_d_topic0','avg_d_topic1', 'avg_d_topic31', 'Count', 'public_metrics.like_count', 'public_metrics.reply_count', 'public_metrics.retweet_count']

Define the 'Rank' ranges for each batch

index_ranges = [(1300, 1800), (1800, 2300), (2300,2800),(2800,3300),(3300, 3800),(3800, 4300)]

Calculate the autocorrelation for each batch

autocorrelations_by_batch = {} for i, (start, end) in enumerate(index_ranges, 1):

Select data for the current batch

batch_data = awhole.loc[start:end]

# Calculate the autocorrelation for each pair of columns at different lags
autocorrelations = pd.DataFrame()
for column1 in columns_of_interest:
    for column2 in columns_of_interest:
        for lag in range(1, 6):  # Calculate autocorrelation for lags 1 to 5 (you can adjust the range as needed)
            lagged_column1 = batch_data[column1].shift(lag)
            autocorr = batch_data[column2].corr(lagged_column1)
            autocorrelations.loc[f"Lag {lag}", f"{column1} -> {column2}"] = autocorr

# Store the autocorrelations for the current batch
autocorrelations_by_batch[f"Batch {i} (Index {start} - {end})"] = autocorrelations

The 'autocorrelations_by_batch' dictionary will contain the autocorrelations for each batch.

find the optimal negative one

autocorrelations_by_batch = {} for i, (start, end) in enumerate(index_ranges, 1):

Select data for the current batch

batch_data = awhole.loc[start:end]

# Calculate the autocorrelation for each pair of columns at different lags
autocorrelations = pd.DataFrame()
for column1 in columns_of_interest:
    for column2 in columns_of_interest:
        lag_at_negative_autocorr = None
        for lag in range(1, 101):  # Calculate autocorrelation for lags 1 to 100 (you can adjust the range as needed)
            lagged_column1 = batch_data[column1].shift(lag)
            autocorr = batch_data[column2].corr(lagged_column1)
            if autocorr < 0:
                lag_at_negative_autocorr = lag
                break
        autocorrelations.loc[f"Lag at negative autocorrelation", f"{column1} -> {column2}"] = lag_at_negative_autocorr

# Store the autocorrelations for the current batch
autocorrelations_by_batch[f"Batch {i} (Index {start} - {end})"] = autocorrelations

from txt to jsonl

import json

Replace 'input.txt' with the path to your TXT file containing tweets

input_file = 'input.txt'

Replace 'output.jsonl' with the path where you want to save the JSONL file

output_file = 'output.jsonl'

Read the contents of the TXT file

with open(input_file, 'r', encoding='utf-8') as txt_file: tweets_text = txt_file.readlines()

Parse each tweet and write it as a separate line in the JSONL file

with open(output_file, 'w', encoding='utf-8') as jsonl_file: for tweet_text in tweets_text: try: tweet = json.loads(tweet_text) jsonl_file.write(json.dumps(tweet) + '\n') except json.JSONDecodeError:

Skip lines that are not valid JSON objects (e.g., headers or empty lines)

        pass

print("Conversion from TXT to JSONL completed.")

awhole.to_csv('alldataz_sum2.csv')

visualize

import matplotlib.pyplot as plt import seaborn as sns

import numpy as np

import matplotlib.pyplot as plt

def plot_violin(dataframes, titles): plt.figure(figsize=(12, 6))

for idx, df in enumerate(dataframes):
    ax = plt.subplot(1, len(dataframes), idx+1)

    # Data for 'First 1200' and 'Last 1200' rows
    data = [df['avg_d'].iloc[:1200], df['avg_d'].iloc[-1200:]]

    # Plotting the violin plots
    sns.violinplot(data=data, palette=["#add8e6", "#1e90ff"], inner=None, ax=ax)

    # Calculating the 95% confidence interval for 'First 1200' rows and adjusting the error bar size
    ci_low_first, ci_high_first = np.percentile(data[0], [2.5, 97.5])
    error_bar_size_first = (ci_high_first - ci_low_first) * 2
    ax.errorbar(0, np.mean(data[0]), yerr=error_bar_size_first/2, fmt='o', color='red', capsize=5)

    # Calculating the 95% confidence interval for 'Last 1200' rows and adjusting the error bar size
    ci_low_last, ci_high_last = np.percentile(data[1], [2.5, 97.5])
    error_bar_size_last = (ci_high_last - ci_low_last) * 2
    ax.errorbar(1, np.mean(data[1]), yerr=error_bar_size_last/2, fmt='o', color='red', capsize=5)

    plt.title(titles[idx])
    plt.ylabel('avg_d')
    plt.xticks([0, 1], ['First 1200', 'Last 1200'])

plt.tight_layout()
plt.show()

Assuming you have the dataframes loaded as df_avg_d_topic1, df_avg_d_topic12, df_avg_d_topic31, and df_avg_d_topic41

You can plot them using the function as:

plot_violin([df_avg_d_topic1, df_avg_d_topic12], ['Topic 1', 'Topic 12

Assuming you have the dataframes loaded as df_avg_d_topic1, df_avg_d_topic12, df_avg_d_topic

Assuming you have the dataframes loaded as df_avg_d_topic1, df_avg_d_topic12, df_avg_d_topic31, and df_avg_d_topic41

You can plot them using the function as:

plot_violin([df_avg_d_topic1, df_avg_d_topic12], ['Topic 1', 'Topic 12']) plot_violin([df_avg_d_topic31, df_avg_d_topic41], ['Topic 31', 'Topic 41'])

first 1200

def plot_violin_first_1200(dataframes, titles): plt.figure(figsize=(12, 6))

for idx, df in enumerate(dataframes):
    ax = plt.subplot(1, len(dataframes), idx+1)

    # Data for 'First 1200' rows
    data_first = df['avg_d'].iloc[:1200].values

    # Plotting the violin plot for 'First 1200' rows
    sns.violinplot(y=data_first, color="#add8e6", ax=ax)

    # Calculating and plotting the 95% confidence interval for 'First 1200' rows
    ci_low_first, ci_high_first = np.percentile(data_first, [2.5, 97.5])
    ax.axhline(ci_low_first, color="red", linestyle="--")
    ax.axhline(ci_high_first, color="red", linestyle="--")

    plt.title(titles[idx] + " - First 1200")
    plt.ylabel('avg_d')

plt.tight_layout()
plt.show()

plot_violin_first_1200([df_avg_d_topic1, df_avg_d_topic12], ['Topic 1', 'Topic 12'])

looks great

def plot_violin(dataframes, titles): plt.figure(figsize=(12, 6))

for idx, df in enumerate(dataframes):
    ax = plt.subplot(1, len(dataframes), idx+1)

    # Data for 'First 1200' and 'Last 1200' rows
    data_first = df['avg_d'].iloc[:1200].values
    data_last = df['avg_d'].iloc[-1200:].values

    # Plotting the violin plots
    sns.violinplot(data=[data_first, data_last], palette=["#add8e6", "#1e90ff"], ax=ax)

    # Calculating and plotting the 95% confidence interval for 'First 1200' rows
    ci_low_first, ci_high_first = np.percentile(data_first, [2.5, 97.5])
    ax.axhline(ci_low_first, color="red", linestyle="--", xmin=0.05, xmax=0.45)
    ax.axhline(ci_high_first, color="red", linestyle="--", xmin=0.05, xmax=0.45)

    # Calculating and plotting the 95% confidence interval for 'Last 1200' rows
    ci_low_last, ci_high_last = np.percentile(data_last, [2.5, 97.5])
    #ax.axhline(ci_low_last, color="pink", linestyle="--", xmin=0.55, xmax=0.95)
    #ax.axhline(ci_high_last, color="blue", linestyle="--", xmin=0.55, xmax=0.95)

    plt.title(titles[idx])
    plt.ylabel('average distance to Dominant Emotional Pattern')
    plt.xticks([0, 1], ['Before George Floyd', '1-2 months after George Floyd'])

plt.tight_layout()
plt.show()

Assuming you have the dataframes loaded as df_avg_d_topic1, df_avg_d_topic12, df_avg_d_topic31, and df_avg_d_topic41

You can plot them using the function as:

plot_violin([df_avg_d_topic1, df_avg_d_topic12], ['Topic 1: cognitive thinking', 'Topic 12: posting about BLM']) plot_violin([df_avg_d_topic31, df_avg_d_topic41], ['Topic 31: positivity', 'Topic 41: BLM protest'])

now significance tests

from scipy.stats import ttest_ind, levene, t import numpy as np

def welch_dof(x,y): """ Calculate the effective degrees of freedom for two samples. """ dof = (x.var()/x.size + y.var()/y.size)2 / ((x.var()/x.size)2 / (x.size-1) + (y.var()/y.size)**2 / (y.size-1)) return dof

def cohen_d(x, y): """ Calculate Cohen's d for effect size. """ pooled_var = (len(x) np.var(x, ddof=1) + len(y) np.var(y, ddof=1)) / (len(x) + len(y)) d = (np.mean(x) - np.mean(y)) / np.sqrt(pooled_var) return d

dataframes = [df_avg_d_topic1, df_avg_d_topic12, df_avg_d_topic31, df_avg_d_topic41] titles = ['Topic 1', 'Topic 12', 'Topic 31', 'Topic 41']

def variance_ratio(first, last): """Compute the F ratio (variance ratio) for comparing variances.""" return first.var() / last.var()

from scipy.stats import kstest, ttest_ind, levene for idx, df in enumerate(dataframes): first = df['avg_d'].iloc[:1200] last = df['avg_d'].iloc[-1200:]

# Check for NaN or infinite values
if np.any(np.isnan(first)) or np.any(np.isnan(last)):
    print(f"{titles[idx]} has NaN values.")
if np.any(np.isinf(first)) or np.any(np.isinf(last)):
    print(f"{titles[idx]} has infinite values.")

# Handle NaN values by removing them (you can also impute them if you prefer)
first = first.dropna()
last = last.dropna()

# Handle infinite values by removing them (you can also replace them if you prefer)
first = first[np.isfinite(first)]
last = last[np.isfinite(last)]

# Kolmogorov-Smirnov test for normality
_, p_value_ks_first = kstest(first, 'norm')
_, p_value_ks_last = kstest(last, 'norm')

# Test for equality of variances
_, p_value_var = levene(first, last)

# Welch's t-test
t_stat, p_value = ttest_ind(first, last, equal_var=False)

# Degrees of freedom for Welch's t-test
dof = welch_dof(first, last)

# Effect size (Cohen's d)
d = cohen_d(first, last)
delta = variance_ratio(first, last)

print(f"{titles[idx]} Glass's Δ: {delta:.4f}")
print(f"K-S test p-value (First 1200): {p_value_ks_first:.4f}")
print(f"K-S test p-value (Last 1200): {p_value_ks_last:.4f}")

print(f"{titles[idx]}:")
print(f"Levene's test p-value: {p_value_var:.4f}")
print(f"Welch's t-test: t({dof:.2f}) = {t_stat:.4f}, p = {p_value:.4f}")
print(f"Cohen's d: {d:.4f}")
print("-" * 50)

assumption of normality violated

from scipy.stats import mannwhitneyu, fligner

def rank_biserial(u, n1, n2): """Compute rank-biserial correlation as effect size for Mann-Whitney U.""" return 1 - (2u) / (n1 n2)

def variance_ratio(first, last): """Compute the F ratio (variance ratio) for comparing variances.""" return first.var() / last.var()

for idx, df in enumerate(dataframes): first = df['avg_d'].iloc[:1200] last = df['avg_d'].iloc[-1200:]

# Handle NaN and infinite values
first = first.dropna()
last = last.dropna()
first = first[np.isfinite(first)]
last = last[np.isfinite(last)]

# Mann-Whitney U test for comparing medians
u_stat, p_value_median = mannwhitneyu(first, last, alternative='two-sided')

# Rank-biserial correlation as effect size for Mann-Whitney U
rbc = rank_biserial(u_stat, len(first), len(last))

# Fligner-Killeen test for equality of variances
_, p_value_var = fligner(first, last)

# Glass's Δ for variance ratio
delta = variance_ratio(first, last)

print(f"{titles[idx]}:")
print(f"Mann-Whitney U test p-value: {p_value_median:.4f}")
print(f"Rank-biserial correlation: {rbc:.4f}")
print(f"Fligner-Killeen test p-value: {p_value_var:.4f}")
print(f"Glass's Δ: {delta:.4f}")
print("-" * 50)

nsign tests for time series

from scipy.stats import shapiro, ttest_ind

def check_normality_and_ttest(dataframes, titles): for idx, df in enumerate(dataframes):

Extract data for 'First 1200' and 'Last 1200' rows

    data_first = df['avg_d'].iloc[:1200]
    data_last = df['avg_d'].iloc[-1200:]

    # Shapiro-Wilk Test for normality
    stat_first, p_first = shapiro(data_first)
    stat_last, p_last = shapiro(data_last)

    print(f"\nFor {titles[idx]}:")
    print(f"Shapiro-Wilk Test for 'First 1200': Statistic={stat_first}, p-value={p_first}")
    print(f"Shapiro-Wilk Test for 'Last 1200': Statistic={stat_last}, p-value={p_last}")

    # If p-value is less than 0.05, the data is not normally distributed
    if p_first < 0.05:
        print("'First 1200' data is not normally distributed.")
    else:
        print("'First 1200' data is normally distributed.")

    if p_last < 0.05:
        print("'Last 1200' data is not normally distributed.")
    else:
        print("'Last 1200' data is normally distributed.")

    # If both series are approximately normal, proceed with the T-test
    if p_first >= 0.05 and p_last >= 0.05:
        t_stat, p_val = ttest_ind(data_first, data_last)
        print(f"T-test results: Statistic={t_stat}, p-value={p_val}")
        if p_val < 0.05:
            print("There is a significant difference between the means of 'First 1200' and 'Last 1200'.")
        else:
            print("There is no significant difference between the means of 'First 1200' and 'Last 1200'.")

check_normality_and_ttest([df_avg_d_topic1, df_avg_d_topic12], ['Topic 1', 'Topic 12'])

bootstrapping

import numpy as np import matplotlib.pyplot as plt

def block_bootstrap(series, block_size, numsamples): """Generate bootstrapped samples using block bootstrapping.""" samples = [] n = len(series) for in range(num_samples): sample = [] while len(sample) < n: start_idx = np.random.randint(0, n - block_size + 1) sample.extend(series[start_idx:start_idx+block_size]) samples.append(sample[:n]) return samples

Check for NaN values in the original data

if df_avg_d_topic1['avg_d'].isna().any() or df_avg_d_topic12['avg_d'].isna().any(): print("Warning: NaN values detected in the data. Please handle them before proceeding.")

You can choose to drop NaN values or fill them

# df_avg_d_topic1 = df_avg_d_topic1.dropna()
# df_avg_d_topic12 = df_avg_d_topic12.dropna()

Parameters

block_size = 50 # Size of blocks to resample num_samples = 2000 # Number of bootstrapped samples

df_avg_d_topic1 = df_avg_d_topic1.dropna(subset=['avg_d']) df_avg_d_topic12 = df_avg_d_topic12.dropna(subset=['avg_d'])

Generate bootstrapped samples

bootstrap_samples_topic1 = block_bootstrap(df_avg_d_topic1['avg_d'].values, block_size, num_samples) bootstrap_samples_topic12 = block_bootstrap(df_avg_d_topic12['avg_d'].values, block_size, num_samples)

Compute differences in means for each pair of bootstrapped samples

differences = [np.mean(sample1) - np.mean(sample2) for sample1, sample2 in zip(bootstrap_samples_topic1, bootstrap_samples_topic12)]

Remove any NaN values from differences

differences = [diff for diff in differences if not np.isnan(diff)]

Plot histogram of differences

plt.hist(differences, bins=50, edgecolor='k', alpha=0.7) plt.title('Distribution of Bootstrapped Mean Differences') plt.xlabel('Difference in Means') plt.ylabel('Frequency') plt.show()

Compute 95% confidence interval

ci_low, ci_high = np.percentile(differences, [2.5, 97.5]) print(f"95% Confidence Interval for Difference in Means: ({ci_low:.4f}, {ci_high:.4f})")

new bootstrapping

def block_bootstrap(data, block_size, num_samples): num_blocks = len(data) - block_size + 1 bootstrap_samples = []

for _ in range(num_samples):
    sample = []
    while len(sample) < len(data):
        start_idx = np.random.randint(0, num_blocks)
        block = data[start_idx:start_idx+block_size]
        sample.extend(block)
    bootstrap_samples.append(sample[:len(data)])

return bootstrap_samples

def bootstrap_difference(data1, data2, block_size=10, num_iterations=10000): differences = []

for _ in range(num_iterations):
    sample1 = block_bootstrap(data1, block_size, 1)[0]
    sample2 = block_bootstrap(data2, block_size, 1)[0]
    mean_diff = np.mean(sample1) - np.mean(sample2)
    differences.append(mean_diff)

return differences

def bootstrap_difference(data1, data2, block_size=10, num_iterations=10000): #+ this is for variance variance_differences = []

for _ in range(num_iterations):
    sample1 = block_bootstrap(data1, block_size, 1)[0]
    sample2 = block_bootstrap(data2, block_size, 1)[0]
    variance_diff = np.var(sample1) - np.var(sample2)
    variance_differences.append(variance_diff)

return variance_differences

def cohens_d_for_means(differences): mean_diff = np.mean(differences) std_diff = np.std(differences) return mean_diff / std_diff

################ Data for topic 1 first_1200_topic1 = df_avg_d_topic1['avg_d'].iloc[:1200].dropna().values last_1200_topic1 = df_avg_d_topic1['avg_d'].iloc[-1200:].dropna().values

differences = bootstrap_difference(first_1200_topic1, last_1200_topic1) plt.hist(differences, bins=50, edgecolor='k', alpha=0.7) plt.title('Distribution of Bootstrapped variance Differences for topic 1') plt.xlabel('Difference in variances') plt.ylabel('Frequency') plt.show()

Compute 95% confidence interval

ci_low, ci_high = np.percentile(differences, [2.5, 97.5]) print(f"95% Confidence Interval for Difference in Means: ({ci_low:.4f}, {ci_high:.4f})") d_means = cohens_d_for_means(differences) print(f"Cohen's d for Mean Differences: {d_means:.4f}")

################ Data for topic 12 first_1200_topic12 = df_avg_d_topic12['avg_d'].iloc[:1200].dropna().values last_1200_topic12 = df_avg_d_topic12['avg_d'].iloc[-1200:].dropna().values

differences = bootstrap_difference(first_1200_topic12, last_1200_topic12) plt.hist(differences, bins=50, edgecolor='k', alpha=0.7) plt.title('Distribution of Bootstrapped variances Differences for topic 12') plt.xlabel('Difference in variances') plt.ylabel('Frequency') plt.show()

Compute 95% confidence interval

################ Data for topic 41 first_1200_topic41 = df_avg_d_topic41['avg_d'].iloc[:1200].dropna().values last_1200_topic41 = df_avg_d_topic41['avg_d'].iloc[-1200:].dropna().values

differences = bootstrap_difference(first_1200_topic41, last_1200_topic41) plt.hist(differences, bins=50, edgecolor='k', alpha=0.7) plt.title('Distribution of Bootstrapped variances Differences for topic 41') plt.xlabel('Difference in variances') plt.ylabel('Frequency') plt.show()

Compute 95% confidence interval

ci_low, ci_high = np.percentile(differences, [2.5, 97.5]) print(f"95% Confidence Interval for Difference in variances: ({ci_low:.4f}, {ci_high:.4f})") d_means = cohens_d_for_means(differences) print(f"Cohen's d for Mean Differences: {d_means:.4f}") d_means = cohens_d_for_means(differences) print(f"Cohen's d for Mean Differences: {d_means:.4f}")

################ Data for topic 1 first_1200_topic31 = df_avg_d_topic31['avg_d'].iloc[:1200].dropna().values last_1200_topic31 = df_avg_d_topic31['avg_d'].iloc[-1200:].dropna().values

differences = bootstrap_difference(first_1200_topic31, last_1200_topic31) plt.hist(differences, bins=50, edgecolor='k', alpha=0.7) plt.title('Distribution of Bootstrapped variances Differences for topic 31') plt.xlabel('Difference in variances') plt.ylabel('Frequency') plt.show()

Compute 95% confidence interval

#########################

import pandas as pd import matplotlib.pyplot as plt

Assuming dfall is already loaded

plt.figure(figsize=(12, 6)) plt.plot(dfall['Count']) plt.title('Time Series of total amount of tweets per time step') plt.xlabel('Time steps') plt.ylabel('Amount of tweets posted about BLM') plt.grid(True) plt.show()

plt.figure(figsize=(12, 6)) plt.plot(dfall['count_topic1']) plt.title('Time Series of total amount of tweets per time step') plt.xlabel('Time steps') plt.ylabel('Amount of tweets posted about BLM') plt.grid(True) plt.show()

plt.figure(figsize=(12, 6)) plt.plot(dfall['count_topic41']) plt.title('Time Series of Count') plt.xlabel('Time steps') plt.ylabel('Count') plt.grid(True) plt.show()

HendrickxBrent / EmotionContagionDuringBlackLIvesMatter

Clustering #1

-- coding: utf-8 --

Clustering

os.chdir("C:\Users\Erik\Documents\Brent\USB-station\statsthesis")

◘df = pd.read_csv(r'dataNLPTopic.csv')

standerdize

Create a StandardScaler instance

Specify the columns to standardize

Standardize the columns

subset for clustering

topics top 18:

1,12,36,41,

31,16,39,10,51

40,27,3,21,26

56,20,14,55

clustering

DBSCAN

Load the data

Store average distances in df

reset varibales

List all variables you want to keep

Get a dictionary of your current global variables

Delete all variables except the ones you want to keep

analysis

plot

this is full data

this is smooth data

Calculate the moving average

Plot the original data and the smoothed data

plt.plot(df_avg_d_topic31['Rank'], df_avg_d['avg_d'], marker='o', linestyle='-', alpha=0.5, label='Original')

Plot the original data and the smoothed data

function to compute the data above for all datasets

plot full data

you can call this function on any dataframe like this:

process_and_plot(df_avg_d, 'Topic 0')

new

Function to add vertical lines and annotations

you can call this function on any dataframe like this:

process_and_plot(df_avg_d, 'Topic 0')

Merge 'avg_d' column into 'awhole' based on the 'Rank' column

Merge and rename the 'avg_d' columns for each dataset

Assuming 'awhole' is the dataset we have been working with, and it has the 'Rank' column as the index.

Update 'awhole' with 'avg_d' columns from each dataset

plot full data

Select the columns of interest

Compute the correlation matrix

Plot the correlation matrix as a heatmap

Select the columns of interest

Calculate the autocorrelation with a lag of 1 for each column

Calculate the autocorrelation with different lags for each pair of columns

The 'autocorrelations' DataFrame will contain the autocorrelation values for each pair of columns at different lags.

Define the 'Rank' ranges for each batch

Calculate the autocorrelation for each batch

Select data for the current batch

The 'autocorrelations_by_batch' dictionary will contain the autocorrelations for each batch.

find the optimal negative one

Select data for the current batch

from txt to jsonl

Replace 'input.txt' with the path to your TXT file containing tweets

Replace 'output.jsonl' with the path where you want to save the JSONL file

Read the contents of the TXT file

Parse each tweet and write it as a separate line in the JSONL file

Skip lines that are not valid JSON objects (e.g., headers or empty lines)

visualize

Assuming you have the dataframes loaded as df_avg_d_topic1, df_avg_d_topic12, df_avg_d_topic31, and df_avg_d_topic41

You can plot them using the function as:

plot_violin([df_avg_d_topic1, df_avg_d_topic12], ['Topic 1', 'Topic 12

Assuming you have the dataframes loaded as df_avg_d_topic1, df_avg_d_topic12, df_avg_d_topic

Assuming you have the dataframes loaded as df_avg_d_topic1, df_avg_d_topic12, df_avg_d_topic31, and df_avg_d_topic41

You can plot them using the function as:

first 1200

looks great

Assuming you have the dataframes loaded as df_avg_d_topic1, df_avg_d_topic12, df_avg_d_topic31, and df_avg_d_topic41

You can plot them using the function as:

now significance tests

assumption of normality violated

nsign tests for time series

Extract data for 'First 1200' and 'Last 1200' rows

bootstrapping