###########################################
import numpy as np
import pandas as pd
import hdbscan
from sklearn.preprocessing import StandardScaler
################ old exploratory run
# Emotion variables are in columns 21:32 of the topic-1 subset
dat = subsettopic1.iloc[:, 21:32]
aab = dat.head(10)
dat = dat.head(30000)
clust = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
clust_lab = clust.fit_predict(dat)
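# Optional sanity check on the exploratory run (a minimal sketch, assuming the fit
# above succeeded): count how many tweets fall into each cluster, with -1 = noise.
unique_labels, counts = np.unique(clust_lab, return_counts=True)
for lab, cnt in zip(unique_labels, counts):
    print(f"Cluster {lab}: {cnt} tweets")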
average_d = []
prev_steps = 71  # 72 time steps of 20 minutes = 24 hours (window = current step + previous 71 steps)
for current_step in sorted(df2['Rank'].unique()):
    # Select tweets in this step and the previous 71 steps about the given topic
    mask = (df2['Rank'] >= current_step - prev_steps) & (df2['Rank'] <= current_step)
    relevant_tweets = df2.loc[mask].copy()
    # Select tweets in this step about the given topic
    mask_current_step = (df2['Rank'] == current_step)
    current_step_tweets = df2.loc[mask_current_step].copy()
    # Perform HDBSCAN clustering on the emotion variables of relevant_tweets
    clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
    cluster_data = relevant_tweets.iloc[:, 21:32]
    # Replace infinite values, then drop rows with missing data
    cluster_data.replace([np.inf, -np.inf], np.nan, inplace=True)
    cluster_data.dropna(inplace=True)
    print(cluster_data.isnull().sum())
    # Print shape and type of the data
    print(f"Shape of the data: {cluster_data.shape}, Type of the data: {type(cluster_data)}")
    print(cluster_data.describe())
    print(cluster_data.info())
    cluster_data = cluster_data.astype(np.float64)
    cluster_labels = clusterer.fit_predict(cluster_data)
    # Get the probabilities of belonging to a cluster
    probabilities = clusterer.probabilities_
    # Add the cluster labels and probabilities to the relevant_tweets DataFrame
    relevant_tweets.loc[cluster_data.index, 'Cluster'] = cluster_labels
    relevant_tweets.loc[cluster_data.index, 'Score'] = probabilities
    # Add the 'Cluster' column to the current_step_tweets DataFrame
    current_step_tweets['Cluster'] = relevant_tweets.loc[current_step_tweets.index, 'Cluster']
    # Find the largest cluster
    largest_cluster = relevant_tweets['Cluster'].value_counts().idxmax()
    # Get the typical emotional pattern (average emotion scores) of this cluster
    typical_pattern = relevant_tweets[relevant_tweets['Cluster'] == largest_cluster].iloc[:, 21:32].mean()
    # Select the data of the current step from the largest cluster
    current_step_cluster_data = current_step_tweets[current_step_tweets['Cluster'] == largest_cluster].iloc[:, 21:32]
    # Compute the Euclidean distance of each point in current_step_cluster_data to the typical_pattern
    distances = np.sqrt(((current_step_cluster_data - typical_pattern) ** 2).sum(axis=1))
    # Compute and store the average distance
    average_distance = distances.mean()
    average_d.append(average_distance)

# Store the average distances in a DataFrame (one row per time step after the first window)
df_avg_d = pd.DataFrame({'Rank': sorted(df2['Rank'].unique())[prev_steps:],
                         'avg_d': average_d})
# DBSCAN
import pandas as pd
from sklearn.cluster import DBSCAN
from scipy.spatial import distance
import numpy as np
import time

average_d = []
num_clusters = []
prev_steps = 71  # 72 time steps of 20 minutes = 24 hours (an alternative window: past 6 hours = 17 steps)
start_time = time.time()
for current_step in sorted(df2['Rank'].unique()):
    # Select tweets in this step and the previous 71 steps about the given topic
    mask = (df2['Rank'] >= current_step - prev_steps) & (df2['Rank'] <= current_step)
    relevant_tweets = df2.loc[mask].copy()
    # Perform DBSCAN clustering on the emotion variables of relevant_tweets
    clusterer = DBSCAN(eps=0.5, min_samples=5)  # These parameters may need tuning
    cluster_data = relevant_tweets.iloc[:, 21:32]
    # Replace infinite values, then drop rows with missing data
    cluster_data.replace([np.inf, -np.inf], np.nan, inplace=True)
    cluster_data.dropna(inplace=True)
    cluster_data = cluster_data.astype(np.float64)
    cluster_labels = clusterer.fit_predict(cluster_data)
    # Add the cluster labels to the relevant_tweets DataFrame
    relevant_tweets.loc[cluster_data.index, 'Cluster'] = cluster_labels
    # Select tweets in this step about the given topic
    mask_current_step = (df2['Rank'] == current_step)
    current_step_tweets = df2.loc[mask_current_step].copy()
    # Add the 'Cluster' column to the current_step_tweets DataFrame
    current_step_tweets['Cluster'] = relevant_tweets.loc[current_step_tweets.index, 'Cluster']
    # Find the largest cluster
    largest_cluster = relevant_tweets['Cluster'].value_counts().idxmax()
    # Get the typical emotional pattern (average emotion scores) of this cluster
    typical_pattern = relevant_tweets[relevant_tweets['Cluster'] == largest_cluster].iloc[:, 21:32].mean()
    # Select the data of the current step from the largest cluster
    current_step_cluster_data = current_step_tweets[current_step_tweets['Cluster'] == largest_cluster].iloc[:, 21:32]
    # Compute the Euclidean distances of each point in current_step_cluster_data to the typical_pattern
    distances = distance.cdist(current_step_cluster_data, typical_pattern.values.reshape(1, -1), 'euclidean')
    # Compute the number of unique clusters (excluding noise, which DBSCAN labels as -1)
    n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    num_clusters.append(n_clusters)
    # Compute and store the average distance
    if len(distances) > 0:
        average_distance = distances.mean()
    else:
        average_distance = np.nan
    average_d.append(average_distance)
# Calculate the autocorrelation with a lag of 1 for each column
lag_1_autocorrelation = {}
for column in columns_of_interest:
    lagged_column = awhole[column].shift(1)
    autocorr = awhole[column].corr(lagged_column)
    lag_1_autocorrelation[column] = autocorr
for column, autocorr in lag_1_autocorrelation.items():
    print(f"Lag-1 Autocorrelation for {column}: {autocorr}")
####################### cross-lagged effects
# Calculate the lagged correlation between each pair of columns
autocorrelations = pd.DataFrame()
for column1 in columns_of_interest:
    for column2 in columns_of_interest:
        for lag in range(1, 6):  # Lags 1 to 5 (adjust the range as needed)
            lagged_column1 = awhole[column1].shift(lag)
            autocorr = awhole[column2].corr(lagged_column1)
            autocorrelations.loc[f"Lag {lag}", f"{column1} -> {column2}"] = autocorr
# The 'autocorrelations' DataFrame contains the lagged correlation values for each pair of columns
print(autocorrelations)
plt.figure(figsize=(10, 8))
sns.heatmap(autocorrelations, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Autocorrelation between Variables at Different Lags")
plt.show()
autocorrelations_by_batch = {}
for i, (start, end) in enumerate(index_ranges, 1):
    # Select data for the current batch
    batch_data = awhole.loc[start:end]
    # Calculate the lagged correlation for each pair of columns at different lags
    autocorrelations = pd.DataFrame()
    for column1 in columns_of_interest:
        for column2 in columns_of_interest:
            for lag in range(1, 6):  # Lags 1 to 5 (adjust the range as needed)
                lagged_column1 = batch_data[column1].shift(lag)
                autocorr = batch_data[column2].corr(lagged_column1)
                autocorrelations.loc[f"Lag {lag}", f"{column1} -> {column2}"] = autocorr
    # Store the correlations for the current batch
    autocorrelations_by_batch[f"Batch {i} (Index {start} - {end})"] = autocorrelations
# The 'autocorrelations_by_batch' dictionary contains the lagged correlations for each batch
# Find the first lag at which the lagged correlation turns negative
autocorrelations_by_batch = {}
for i, (start, end) in enumerate(index_ranges, 1):
    # Select data for the current batch
    batch_data = awhole.loc[start:end]
    # Calculate the lagged correlation for each pair of columns at different lags
    autocorrelations = pd.DataFrame()
    for column1 in columns_of_interest:
        for column2 in columns_of_interest:
            lag_at_negative_autocorr = None
            for lag in range(1, 101):  # Lags 1 to 100 (adjust the range as needed)
                lagged_column1 = batch_data[column1].shift(lag)
                autocorr = batch_data[column2].corr(lagged_column1)
                if autocorr < 0:
                    lag_at_negative_autocorr = lag
                    break
            autocorrelations.loc["Lag at negative autocorrelation", f"{column1} -> {column2}"] = lag_at_negative_autocorr
    # Store the results for the current batch
    autocorrelations_by_batch[f"Batch {i} (Index {start} - {end})"] = autocorrelations
# From TXT to JSONL
import json
# Replace 'input.txt' with the path to your TXT file containing tweets
input_file = 'input.txt'
# Replace 'output.jsonl' with the path where you want to save the JSONL file
output_file = 'output.jsonl'
# Read the contents of the TXT file
with open(input_file, 'r', encoding='utf-8') as txt_file:
    tweets_text = txt_file.readlines()
# Parse each tweet and write it as a separate line in the JSONL file
with open(output_file, 'w', encoding='utf-8') as jsonl_file:
    for tweet_text in tweets_text:
        try:
            tweet = json.loads(tweet_text)
            jsonl_file.write(json.dumps(tweet) + '\n')
        except json.JSONDecodeError:
            # Skip lines that are not valid JSON objects (e.g., headers or empty lines)
            pass
print("Conversion from TXT to JSONL completed.")
awhole.to_csv('alldataz_sum2.csv')

# visualize
import matplotlib.pyplot as plt
import seaborn as sns

def variance_ratio(first, last):
    """Compute the F ratio (variance ratio) for comparing variances."""
    return first.var() / last.var()
from scipy.stats import kstest, ttest_ind, levene
for idx, df in enumerate(dataframes):
    first = df['avg_d'].iloc[:1200]
    last = df['avg_d'].iloc[-1200:]
    # Check for NaN or infinite values
    if np.any(np.isnan(first)) or np.any(np.isnan(last)):
        print(f"{titles[idx]} has NaN values.")
    if np.any(np.isinf(first)) or np.any(np.isinf(last)):
        print(f"{titles[idx]} has infinite values.")
    # Handle NaN values by removing them (they could also be imputed)
    first = first.dropna()
    last = last.dropna()
    # Handle infinite values by removing them (they could also be replaced)
    first = first[np.isfinite(first)]
    last = last[np.isfinite(last)]
    # Kolmogorov-Smirnov test for normality
    _, p_value_ks_first = kstest(first, 'norm')
    _, p_value_ks_last = kstest(last, 'norm')
    # Test for equality of variances
    _, p_value_var = levene(first, last)
    # Welch's t-test
    t_stat, p_value = ttest_ind(first, last, equal_var=False)
    # Degrees of freedom for Welch's t-test
    dof = welch_dof(first, last)
    # Effect size (Cohen's d)
    d = cohen_d(first, last)
    delta = variance_ratio(first, last)
    print(f"{titles[idx]} Glass's Δ: {delta:.4f}")
    print(f"K-S test p-value (First 1200): {p_value_ks_first:.4f}")
    print(f"K-S test p-value (Last 1200): {p_value_ks_last:.4f}")
    print(f"{titles[idx]}:")
    print(f"Levene's test p-value: {p_value_var:.4f}")
    print(f"Welch's t-test: t({dof:.2f}) = {t_stat:.4f}, p = {p_value:.4f}")
    print(f"Cohen's d: {d:.4f}")
    print("-" * 50)
# Assumption of normality violated: use non-parametric tests
from scipy.stats import mannwhitneyu, fligner

def rank_biserial(u, n1, n2):
    """Compute rank-biserial correlation as effect size for Mann-Whitney U."""
    return 1 - (2 * u) / (n1 * n2)

def variance_ratio(first, last):
    """Compute the F ratio (variance ratio) for comparing variances."""
    return first.var() / last.var()

for idx, df in enumerate(dataframes):
    first = df['avg_d'].iloc[:1200]
    last = df['avg_d'].iloc[-1200:]
    # Handle NaN and infinite values
    first = first.dropna()
    last = last.dropna()
    first = first[np.isfinite(first)]
    last = last[np.isfinite(last)]
    # Mann-Whitney U test for comparing medians
    u_stat, p_value_median = mannwhitneyu(first, last, alternative='two-sided')
    # Rank-biserial correlation as effect size for Mann-Whitney U
    rbc = rank_biserial(u_stat, len(first), len(last))
    # Fligner-Killeen test for equality of variances
    _, p_value_var = fligner(first, last)
    # Variance ratio (reported as Glass's Δ)
    delta = variance_ratio(first, last)
    print(f"{titles[idx]}:")
    print(f"Mann-Whitney U test p-value: {p_value_median:.4f}")
    print(f"Rank-biserial correlation: {rbc:.4f}")
    print(f"Fligner-Killeen test p-value: {p_value_var:.4f}")
    print(f"Glass's Δ: {delta:.4f}")
    print("-" * 50)
# Significance tests for time series
from scipy.stats import shapiro, ttest_ind

def check_normality_and_ttest(dataframes, titles):
    for idx, df in enumerate(dataframes):
        # Extract data for the 'First 1200' and 'Last 1200' rows
        data_first = df['avg_d'].iloc[:1200]
        data_last = df['avg_d'].iloc[-1200:]
        # Shapiro-Wilk test for normality
        stat_first, p_first = shapiro(data_first)
        stat_last, p_last = shapiro(data_last)
        print(f"\nFor {titles[idx]}:")
        print(f"Shapiro-Wilk Test for 'First 1200': Statistic={stat_first}, p-value={p_first}")
        print(f"Shapiro-Wilk Test for 'Last 1200': Statistic={stat_last}, p-value={p_last}")
        # If the p-value is less than 0.05, the data are not normally distributed
        if p_first < 0.05:
            print("'First 1200' data is not normally distributed.")
        else:
            print("'First 1200' data is normally distributed.")
        if p_last < 0.05:
            print("'Last 1200' data is not normally distributed.")
        else:
            print("'Last 1200' data is normally distributed.")
        # If both series are approximately normal, proceed with the t-test
        if p_first >= 0.05 and p_last >= 0.05:
            t_stat, p_val = ttest_ind(data_first, data_last)
            print(f"T-test results: Statistic={t_stat}, p-value={p_val}")
            if p_val < 0.05:
                print("There is a significant difference between the means of 'First 1200' and 'Last 1200'.")
            else:
                print("There is no significant difference between the means of 'First 1200' and 'Last 1200'.")
import numpy as np
import matplotlib.pyplot as plt

def block_bootstrap(series, block_size, num_samples):
    """Generate bootstrapped samples using block bootstrapping."""
    samples = []
    n = len(series)
    for _ in range(num_samples):
        sample = []
        while len(sample) < n:
            start_idx = np.random.randint(0, n - block_size + 1)
            sample.extend(series[start_idx:start_idx + block_size])
        samples.append(sample[:n])
    return samples
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 23 23:36:39 2023

@author: Brent
"""
########################################################################################################
# Clustering
########################################################################################################
import pandas as pd
import os
os.chdir(r"C:\Users\Erik\Documents\Brent\USB-station\statsthesis")
os.chdir(r"C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata")
df = pd.read_csv(r'dataNLPTopic.csv')
dfall = pd.read_csv(r'alldataz_sum2.csv')
aa = df.head(1000)
###########################################
# standardize
###########################################
from sklearn.preprocessing import StandardScaler
# Create a StandardScaler instance
scaler = StandardScaler()
# Specify the columns to standardize
cols_to_standardize = list(aa.columns[19:30]) + list(aa.columns[-57:])
cols_to_standardize = list(df.columns[17:28]) + list(df.columns[-57:])
# Standardize the columns
aa[cols_to_standardize] = scaler.fit_transform(aa[cols_to_standardize])
df[cols_to_standardize] = scaler.fit_transform(df[cols_to_standardize])
topicsd = df['topic_1'].std()
###########################################
# subset for clustering
###########################################
# topics top 18:
# 1, 12, 36, 41,
# 31, 16, 39, 10, 51,
# 40, 27, 3, 21, 26,
# 56, 20, 14, 55
threshold = 0
subsettopic1 = df[df['topic_1'] > threshold]
subsettopic12 = df[df['topic_12'] > threshold]
subsettopic36 = df[df['topic_36'] > threshold]
subsettopic41 = df[df['topic_41'] > threshold]
subsettopic31 = df[df['topic_31'] > threshold]
subsettopic16 = df[df['topic_16'] > threshold]
subsettopic39 = df[df['topic_39'] > threshold]
subsettopic10 = df[df['topic_10'] > threshold]
subsettopic51 = df[df['topic_51'] > threshold]
subsettopic40 = df[df['topic_40'] > threshold]
subsettopic27 = df[df['topic_27'] > threshold]
subsettopic3 = df[df['topic_3'] > threshold]
subsettopic21 = df[df['topic_21'] > threshold]
subsettopic26 = df[df['topic_26'] > threshold]
subsettopic56 = df[df['topic_56'] > threshold]
subsettopic20 = df[df['topic_20'] > threshold]
subsettopic14 = df[df['topic_14'] > threshold]
subsettopic55 = df[df['topic_55'] > threshold]
subsettopic1.to_csv('subsettopic1.csv')
subsettopic12.to_csv('subsettopic12.csv')
subsettopic36.to_csv('subsettopic36.csv')
subsettopic41.to_csv('subsettopic41.csv')
subsettopic31.to_csv('subsettopic31.csv')
subsettopic16.to_csv('subsettopic16.csv')
subsettopic.to_csv('subsettopic.csv')
subsettopic.to_csv('subsettopic.csv')
subsettopic.to_csv('subsettopic.csv')
subsettopic.to_csv('subsettopic.csv')
subsettopic.to_csv('subsettopic.csv')
subsettopic.to_csv('subsettopic.csv')
a2a = subsettopic1.head(30000)
print(a2a)
df2 = subsettopic1
df2 = a2a
a2a.to_csv('df2.csv')
###########################################
# clustering
###########################################
# DBSCAN
# Load the data
df2 = pd.read_csv('df2.csv')
subsettopic1 = subsettopic1.sort_values('created_at')
a2a = subsettopic1.head(300000)
print(a2a)
df2 = subsettopic1
df2 = a2a
df2 = subsettopic31
# average_d and num_clusters are filled by the rolling-window DBSCAN loop above
# Store the average distances in a DataFrame
df_avg_d = pd.DataFrame({'Rank': sorted(df2['Rank'].unique()), 'avg_d': average_d})
df_avg_d['num_clusters'] = num_clusters
end_time = time.time()
execution_time = end_time - start_time
execution_time
# Save the results for topic 31
df_avg_d_topic31 = df_avg_d
df_avg_d.to_csv('df_avg_d_topic31.csv')
cluster_data.to_csv('cluster_data_topic31.csv')
cluster_labels_topic31 = cluster_labels
average_d_topic31 = average_d
distances_topic31 = distances
cluster_labelsDF = pd.DataFrame(cluster_labels)
############################################################
# reset variables
# List all variables you want to keep
variables_to_keep = ['a2a', 'df', 'df2', 'subsettopic1']
# Get a dictionary of your current global variables
globals_ = globals().copy()
# Delete all variables except the ones you want to keep
for var in globals_:
    if var not in variables_to_keep and not var.startswith('_'):
        del globals()[var]
##########################################################
###############################################################################
# analysis
###############################################################################
cluster_data_topic1 = pd.read_csv(r'cluster_data_topic1.csv')
cluster_data_topic12 = pd.read_csv(r'cluster_data_topic12.csv')
cluster_data_topic31 = pd.read_csv(r'cluster_data_topic31.csv')
cluster_data_topic36 = pd.read_csv(r'cluster_data_topic36.csv')
cluster_data_topic41 = pd.read_csv(r'cluster_data_topic41.csv')
df_avg_d_topic1 = pd.read_csv(r'df_avg_d_topic1.csv')
df_avg_d_topic12 = pd.read_csv(r'df_avg_d_topic12.csv')
df_avg_d_topic31 = pd.read_csv(r'df_avg_d_topic31.csv')
df_avg_d_topic36 = pd.read_csv(r'df_avg_d_topic36.csv')
df_avg_d_topic41 = pd.read_csv(r'df_avg_d_topic41.csv')
# plot
import matplotlib.pyplot as plt
plt.plot(df_avg_d['Rank'], df_avg_d['avg_d'], marker='o', linestyle='-')
###########################
# this is the full data
plt.figure(figsize=(10, 6))
plt.plot(df_avg_d_topic31['Rank'], df_avg_d_topic31['avg_d'], marker='o', linestyle='-')
plt.ylabel("Distance to dominant emotional pattern")
plt.xlabel("Time steps")
plt.title('Topic 31', fontdict=None, loc='center', pad=None)
plt.show()
###########################
# this is the smoothed data
# Calculate the moving average
window_size = 12
df_avg_d_topic31['avg_d_smooth'] = df_avg_d_topic31['avg_d'].rolling(window=window_size).mean()
# Plot the original data and the smoothed data
plt.figure(figsize=(10, 6))
plt.plot(df_avg_d_topic31['Rank'], df_avg_d_topic31['avg_d'], marker='o', linestyle='-', alpha=0.5, label='Original')
plt.plot(df_avg_d_topic31['Rank'], df_avg_d_topic31['avg_d_smooth'], marker='', linestyle='-', color='r', label='Smoothed')
plt.ylabel("Distance to dominant emotional pattern")
plt.xlabel("Time steps")
plt.title('Topic 31', fontdict=None, loc='center', pad=None)
plt.show()
# Function to compute and plot the data above for all datasets
def process_and_plot(df, topic_name, window_size=12):
    # Plot the full data and a smoothed version
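    # A minimal sketch of a possible body, reconstructed from the Topic 31
    # plotting code above; the exact styling in the original script may differ.
    df = df.copy()
    df['avg_d_smooth'] = df['avg_d'].rolling(window=window_size).mean()
    plt.figure(figsize=(10, 6))
    plt.plot(df['Rank'], df['avg_d'], marker='o', linestyle='-', alpha=0.5, label='Original')
    plt.plot(df['Rank'], df['avg_d_smooth'], linestyle='-', color='r', label='Smoothed')
    plt.ylabel("Distance to dominant emotional pattern")
    plt.xlabel("Time steps")
    plt.title(topic_name, loc='center')
    plt.legend()
    plt.show()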
# You can call this function on any dataframe like this:
# process_and_plot(df_avg_d, 'Topic 0')
process_and_plot(df_avg_d_topic1, 'Topic 1')
process_and_plot(df_avg_d_topic12, 'Topic 12')
process_and_plot(df_avg_d_topic31, 'Topic 31')
process_and_plot(df_avg_d_topic41, 'Topic 41')
# new
def process_and_plot(df, topic_name, window_size=6):
    # Function to add vertical lines and annotations
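    # A minimal sketch of a possible body, assuming the vertical lines mark
    # event time steps; the actual event positions and labels used in the
    # original script are not given here, so 'events' is a hypothetical placeholder.
    events = {}  # e.g. {'event label': rank_position}
    df = df.copy()
    df['avg_d_smooth'] = df['avg_d'].rolling(window=window_size).mean()
    plt.figure(figsize=(10, 6))
    plt.plot(df['Rank'], df['avg_d'], alpha=0.5, label='Original')
    plt.plot(df['Rank'], df['avg_d_smooth'], color='r', label='Smoothed')
    for label, rank in events.items():
        plt.axvline(x=rank, color='grey', linestyle='--')
        plt.annotate(label, xy=(rank, df['avg_d'].max()), rotation=90, va='top')
    plt.ylabel("Distance to dominant emotional pattern")
    plt.xlabel("Time steps")
    plt.title(topic_name, loc='center')
    plt.legend()
    plt.show()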
# You can call this function on any dataframe like this:
# process_and_plot(df_avg_d, 'Topic 0')
process_and_plot(df_avg_d_topic1, 'Topic 1')
process_and_plot(df_avg_d_topic12, 'Topic 12')
process_and_plot(df_avg_d_topic31, 'Topic 31')
process_and_plot(df_avg_d_topic41, 'Topic 41')
###############################
def merge_and_rename(df, topic_name):
    # Merge the 'avg_d' column into 'awhole' based on the 'Rank' column
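    # A minimal sketch of a possible body, mirroring the direct column
    # assignments further below; the 'avg_d_<topic>' naming and the use of the
    # module-level 'awhole' (indexed by 'Rank') are assumptions.
    global awhole
    awhole[f'avg_d_{topic_name.lower()}'] = pd.to_numeric(df.set_index('Rank')['avg_d'], errors='coerce')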
# Merge and rename the 'avg_d' columns for each dataset
merge_and_rename(df_avg_d, 'Topic0')
merge_and_rename(df_avg_d_topic1, 'Topic1')
merge_and_rename(df_avg_d_topic12, 'Topic12')
merge_and_rename(df_avg_d_topic31, 'Topic31')
merge_and_rename(df_avg_d_topic41, 'Topic41')
# Assuming 'awhole' is the dataset we have been working with, with the 'Rank' column as its index.
# Update 'awhole' with the 'avg_d' columns from each dataset
awhole['avg_d_topic0'] = pd.to_numeric(df_avg_d_topic1.set_index('Rank')['avg_d'], errors='coerce')
awhole['avg_d_topic1'] = pd.to_numeric(df_avg_d_topic1.set_index('Rank')['avg_d'], errors='coerce')
awhole['avg_d_topic12'] = pd.to_numeric(df_avg_d_topic12.set_index('Rank')['avg_d'], errors='coerce')
awhole['avg_d_topic31'] = pd.to_numeric(df_avg_d_topic31.set_index('Rank')['avg_d'], errors='coerce')
awhole['avg_d_topic41'] = pd.to_numeric(df_avg_d_topic41.set_index('Rank')['avg_d'], errors='coerce')
a2awhole = awhole.head(10)
###################### now use the complete dataset:
import pandas as pd
import matplotlib.pyplot as plt

def process_and_plot(awhole, topic_name, window_size=71):
    # Plot the full data (same idea as the earlier process_and_plot, with a wider smoothing window)
    pass

process_and_plot(awhole, 'topic0')  # Replace 'topic0' with the desired topic name
######################### correlation
# Select the columns of interest
columns_of_interest = ['avg_d_topic0', 'avg_d_topic1', 'avg_d_topic12', 'avg_d_topic41', 'avg_d_topic31',
                       'Count', 'public_metrics.like_count', 'public_metrics.reply_count', 'public_metrics.retweet_count']
# Compute the correlation matrix
correlation_matrix = awhole[columns_of_interest].corr()
import seaborn as sns
import matplotlib.pyplot as plt
# Plot the correlation matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.show()
########################## autocorrelation
import pandas as pd
# Select the columns of interest
columns_of_interest = ['avg_d_topic1', 'avg_d_topic12', 'avg_d_topic41', 'avg_d_topic31', 'Count',
                       'count_topic1', 'public_metrics.like_count', 'public_metrics.reply_count', 'public_metrics.retweet_count']
columns_of_interest = ['avg_d_topic1', 'avg_d_topic12', 'avg_d_topic41', 'avg_d_topic31', 'Count',
                       'count_topic1', 'count_topic12', 'count_topic41', 'count_topic31']
############ cross-lagged effects in batches:
columns_of_interest = ['avg_d_topic0', 'avg_d_topic1', 'avg_d_topic31', 'Count',
                       'public_metrics.like_count', 'public_metrics.reply_count', 'public_metrics.retweet_count']
# Define the 'Rank' ranges for each batch
index_ranges = [(1300, 1800), (1800, 2300), (2300, 2800), (2800, 3300), (3300, 3800), (3800, 4300)]
# visualize
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
def plot_violin(dataframes, titles):
    plt.figure(figsize=(12, 6))
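    # A minimal sketch of a possible continuation, assuming one violin of 'avg_d'
    # per dataframe; the exact styling in the original script is not shown.
    data = [d['avg_d'].dropna() for d in dataframes]
    sns.violinplot(data=data)
    plt.xticks(range(len(titles)), titles)
    plt.ylabel("Distance to dominant emotional pattern")
    plt.show()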
# Assuming you have the dataframes loaded as df_avg_d_topic1, df_avg_d_topic12, df_avg_d_topic31, and df_avg_d_topic41,
# you can plot them using the function as:
plot_violin([df_avg_d_topic1, df_avg_d_topic12], ['Topic 1', 'Topic 12'])
plot_violin([df_avg_d_topic31, df_avg_d_topic41], ['Topic 31', 'Topic 41'])
# first 1200
def plot_violin_first_1200(dataframes, titles):
    plt.figure(figsize=(12, 6))
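    # A minimal sketch of a possible continuation, assuming it mirrors plot_violin
    # but is restricted to the first 1200 time steps of each series.
    data = [d['avg_d'].iloc[:1200].dropna() for d in dataframes]
    sns.violinplot(data=data)
    plt.xticks(range(len(titles)), titles)
    plt.ylabel("Distance to dominant emotional pattern")
    plt.show()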
plot_violin_first_1200([df_avg_d_topic1, df_avg_d_topic12], ['Topic 1', 'Topic 12'])
# looks great
plot_violin([df_avg_d_topic1, df_avg_d_topic12], ['Topic 1: cognitive thinking', 'Topic 12: posting about BLM'])
plot_violin([df_avg_d_topic31, df_avg_d_topic41], ['Topic 31: positivity', 'Topic 41: BLM protest'])
# now significance tests
from scipy.stats import ttest_ind, levene, t
import numpy as np

def welch_dof(x, y):
    """Calculate the effective degrees of freedom for two samples."""
    dof = (x.var() / x.size + y.var() / y.size) ** 2 / (
        (x.var() / x.size) ** 2 / (x.size - 1) + (y.var() / y.size) ** 2 / (y.size - 1))
    return dof

def cohen_d(x, y):
    """Calculate Cohen's d for effect size."""
    pooled_var = (len(x) * np.var(x, ddof=1) + len(y) * np.var(y, ddof=1)) / (len(x) + len(y))
    d = (np.mean(x) - np.mean(y)) / np.sqrt(pooled_var)
    return d

dataframes = [df_avg_d_topic1, df_avg_d_topic12, df_avg_d_topic31, df_avg_d_topic41]
titles = ['Topic 1', 'Topic 12', 'Topic 31', 'Topic 41']

def variance_ratio(first, last):
    """Compute the F ratio (variance ratio) for comparing variances."""
    return first.var() / last.var()
check_normality_and_ttest([df_avg_d_topic1, df_avg_d_topic12], ['Topic 1', 'Topic 12'])
# bootstrapping
# Check for NaN values in the original data
if df_avg_d_topic1['avg_d'].isna().any() or df_avg_d_topic12['avg_d'].isna().any():
    print("Warning: NaN values detected in the data. Please handle them before proceeding.")
# You can choose to drop NaN values or fill them
# Parameters
block_size = 50     # Size of blocks to resample
num_samples = 2000  # Number of bootstrapped samples
df_avg_d_topic1 = df_avg_d_topic1.dropna(subset=['avg_d'])
df_avg_d_topic12 = df_avg_d_topic12.dropna(subset=['avg_d'])
# Generate bootstrapped samples
bootstrap_samples_topic1 = block_bootstrap(df_avg_d_topic1['avg_d'].values, block_size, num_samples)
bootstrap_samples_topic12 = block_bootstrap(df_avg_d_topic12['avg_d'].values, block_size, num_samples)
# Compute differences in means for each pair of bootstrapped samples
differences = [np.mean(sample1) - np.mean(sample2)
               for sample1, sample2 in zip(bootstrap_samples_topic1, bootstrap_samples_topic12)]
# Remove any NaN values from differences
differences = [diff for diff in differences if not np.isnan(diff)]
# Plot histogram of differences
plt.hist(differences, bins=50, edgecolor='k', alpha=0.7)
plt.title('Distribution of Bootstrapped Mean Differences')
plt.xlabel('Difference in Means')
plt.ylabel('Frequency')
plt.show()
# Compute 95% confidence interval
ci_low, ci_high = np.percentile(differences, [2.5, 97.5])
print(f"95% Confidence Interval for Difference in Means: ({ci_low:.4f}, {ci_high:.4f})")
# new bootstrapping
def block_bootstrap(data, block_size, num_samples):
    num_blocks = len(data) - block_size + 1
    bootstrap_samples = []
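    # A minimal sketch of a possible continuation, assuming blocks are drawn with
    # replacement and concatenated until each sample matches the original length
    # (as in the block_bootstrap version defined earlier in this file).
    n = len(data)
    for _ in range(num_samples):
        sample = []
        while len(sample) < n:
            start_idx = np.random.randint(0, num_blocks)
            sample.extend(data[start_idx:start_idx + block_size])
        bootstrap_samples.append(np.asarray(sample[:n]))
    return bootstrap_samples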
def bootstrap_difference(data1, data2, block_size=10, num_iterations=10000):
    # (means version; superseded by the variance version below)
    differences = []

def bootstrap_difference(data1, data2, block_size=10, num_iterations=10000):  # this version is for variances
    variance_differences = []
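    # A minimal sketch of a possible continuation for the variance version,
    # assuming each iteration block-bootstraps both series and records the
    # difference in sample variances; the original body is not shown.
    for _ in range(num_iterations):
        sample1 = block_bootstrap(data1, block_size, 1)[0]
        sample2 = block_bootstrap(data2, block_size, 1)[0]
        variance_differences.append(np.var(sample1, ddof=1) - np.var(sample2, ddof=1))
    return variance_differences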
def cohens_d_for_means(differences):
    mean_diff = np.mean(differences)
    std_diff = np.std(differences)
    return mean_diff / std_diff
################ Data for topic 1
first_1200_topic1 = df_avg_d_topic1['avg_d'].iloc[:1200].dropna().values
last_1200_topic1 = df_avg_d_topic1['avg_d'].iloc[-1200:].dropna().values
differences = bootstrap_difference(first_1200_topic1, last_1200_topic1)
plt.hist(differences, bins=50, edgecolor='k', alpha=0.7)
plt.title('Distribution of Bootstrapped Variance Differences for Topic 1')
plt.xlabel('Difference in variances')
plt.ylabel('Frequency')
plt.show()
# Compute 95% confidence interval
ci_low, ci_high = np.percentile(differences, [2.5, 97.5])
print(f"95% Confidence Interval for Difference in Variances: ({ci_low:.4f}, {ci_high:.4f})")
d_means = cohens_d_for_means(differences)
print(f"Cohen's d for Mean Differences: {d_means:.4f}")

################ Data for topic 12
first_1200_topic12 = df_avg_d_topic12['avg_d'].iloc[:1200].dropna().values
last_1200_topic12 = df_avg_d_topic12['avg_d'].iloc[-1200:].dropna().values
differences = bootstrap_difference(first_1200_topic12, last_1200_topic12)
plt.hist(differences, bins=50, edgecolor='k', alpha=0.7)
plt.title('Distribution of Bootstrapped Variance Differences for Topic 12')
plt.xlabel('Difference in variances')
plt.ylabel('Frequency')
plt.show()
# Compute 95% confidence interval
ci_low, ci_high = np.percentile(differences, [2.5, 97.5])
print(f"95% Confidence Interval for Difference in Variances: ({ci_low:.4f}, {ci_high:.4f})")
d_means = cohens_d_for_means(differences)
print(f"Cohen's d for Mean Differences: {d_means:.4f}")

################ Data for topic 41
first_1200_topic41 = df_avg_d_topic41['avg_d'].iloc[:1200].dropna().values
last_1200_topic41 = df_avg_d_topic41['avg_d'].iloc[-1200:].dropna().values
differences = bootstrap_difference(first_1200_topic41, last_1200_topic41)
plt.hist(differences, bins=50, edgecolor='k', alpha=0.7)
plt.title('Distribution of Bootstrapped Variance Differences for Topic 41')
plt.xlabel('Difference in variances')
plt.ylabel('Frequency')
plt.show()
# Compute 95% confidence interval
ci_low, ci_high = np.percentile(differences, [2.5, 97.5])
print(f"95% Confidence Interval for Difference in Variances: ({ci_low:.4f}, {ci_high:.4f})")
d_means = cohens_d_for_means(differences)
print(f"Cohen's d for Mean Differences: {d_means:.4f}")

################ Data for topic 31
first_1200_topic31 = df_avg_d_topic31['avg_d'].iloc[:1200].dropna().values
last_1200_topic31 = df_avg_d_topic31['avg_d'].iloc[-1200:].dropna().values
differences = bootstrap_difference(first_1200_topic31, last_1200_topic31)
plt.hist(differences, bins=50, edgecolor='k', alpha=0.7)
plt.title('Distribution of Bootstrapped Variance Differences for Topic 31')
plt.xlabel('Difference in variances')
plt.ylabel('Frequency')
plt.show()
# Compute 95% confidence interval
ci_low, ci_high = np.percentile(differences, [2.5, 97.5])
print(f"95% Confidence Interval for Difference in Variances: ({ci_low:.4f}, {ci_high:.4f})")
d_means = cohens_d_for_means(differences)
print(f"Cohen's d for Mean Differences: {d_means:.4f}")
#########################
import pandas as pd
import matplotlib.pyplot as plt
# Assuming dfall is already loaded
plt.figure(figsize=(12, 6))
plt.plot(dfall['Count'])
plt.title('Time Series of the total number of tweets per time step')
plt.xlabel('Time steps')
plt.ylabel('Number of tweets posted about BLM')
plt.grid(True)
plt.show()

plt.figure(figsize=(12, 6))
plt.plot(dfall['count_topic1'])
plt.title('Time Series of the number of tweets per time step (topic 1)')
plt.xlabel('Time steps')
plt.ylabel('Number of tweets posted about BLM')
plt.grid(True)
plt.show()

plt.figure(figsize=(12, 6))
plt.plot(dfall['count_topic41'])
plt.title('Time Series of Count (topic 41)')
plt.xlabel('Time steps')
plt.ylabel('Count')
plt.grid(True)
plt.show()