make sure you are in a new environment

conda activate thesisstats

!conda info --envs

pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu112

pip install torch torchvision torchaudio -f https://download.pytorch.org/whl/cu112/torch_stable.html
pip install transformers
pip install tweetnlp

import tweetnlp

RoBERTa

import concurrent.futures
import math
import os
import re
import time
import timeit
from multiprocessing import cpu_count

import dask.dataframe as dd
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
import transformers
from transformers import pipeline

# Number of worker threads handed to the thread pools further down.
cores = 10

# Load the multi-label emotion model through tweetnlp.
classifier = tweetnlp.load_model(
    'topic_classification',
    model_name='cardiffnlp/twitter-roberta-base-emotion-multilabel-latest',
)
# Smoke test; in a notebook cell this displays the prediction for the sentence.
classifier.predict("I bet everything will work out in the end :)", return_probability=True)

import os
import signal

# Get the current process ID
pid = os.getpid()

# Send SIGINT to this very process so the notebook kernel stops and can be
# restarted with a clean state.
# NOTE(review): on Windows os.kill() with anything other than CTRL_C_EVENT /
# CTRL_BREAK_EVENT terminates the process outright -- confirm that is intended.
os.kill(pid, signal.SIGINT)

# Raw string: in a normal literal "\U" starts a unicode escape (SyntaxError)
# and "\t" would silently turn into a TAB character.
os.chdir(r"C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata")
df = pd.read_csv('day31.csv')

# NOTE(review): this replaces the frame that was just read with `df2`, which
# has to exist in the session already -- confirm this line is still wanted.
df = df2
df = df.reset_index()
# Empty strings instead of NaN so the regex cleaning always receives a str.
df = df.replace(np.nan, '')

def remove(text):
    """Strip digits, links and @mentions from a tweet.

    Args:
        text: Raw tweet text.

    Returns:
        The text with every run of digits, every ``http...`` / ``www....``
        token and every ``@handle`` deleted. Whitespace is left untouched.
    """
    text = re.sub(r'[0-9]+', '', text)          # numbers
    text = re.sub(r"http\S+", "", text)         # links
    # The dot is escaped on purpose: with a bare "." the pattern also matched
    # "www" + any character + a non-space run, e.g. it cut "awwww cute" down
    # to "aw".
    text = re.sub(r"www\.\S+", "", text)        # links
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)  # remove @mentions
    return text

# Cleaned tweet text: digits, links and @mentions are stripped by remove().
df['Tweet'] = df['text'].apply(remove)

#################################################################
# after BLM

# List of words to search for
keywords = ['blm', 'blacklives', 'georgefl', 'floyd', 'racism', 'looting', 'supremacists']

# Create a regular expression pattern that matches any of the words
pattern = '|'.join(keywords)

# Boolean mask that is True if the "Tweet" column contains any of the words
# (case-insensitive; missing values count as "no match").
mask = df['Tweet'].str.contains(pattern, case=False, na=False)

# Convert the boolean mask to integers, so that True becomes 1 and False becomes 0
df['blm'] = mask.astype(int)

# Keep the English tweets that contain none of the keywords.
subset_df = df[(df['blm'] == 0) & (df['lang'] == 'en')]

#################################################################

# split dataset to train in chunks
#
# The frame is cut into consecutive parts of `one_df1` rows, written to
# deel<N>June30.csv; whatever is left after the last part is written to
# June30rest.csv.
one_df1 = 750000  # rows per part
n_parts = 26      # number of deel<N>June30.csv files the copy-pasted cells produced

df1 = df
for part in range(1, n_parts + 1):
    oneApril = df1.iloc[:one_df1]
    # Resume directly after the rows just taken. The copy-pasted cells resumed
    # at one_df1 + 1 and therefore lost one row at every cut. Resetting the
    # index makes every later part start at 0 again, as it did when the rest
    # was re-read from CSV.
    twoApril = df1.iloc[one_df1:].reset_index(drop=True)
    oneApril.to_csv(f'deel{part}June30.csv')
    df1 = twoApril

# Rows not covered by any part (empty when the data fits into n_parts parts).
twoApril.to_csv('June30rest.csv')

# new split approach
df = pd.read_csv('your_data.csv')  # Load the original data

batch_size = 600000
# Round up so the final, partially filled batch is written as well; floor
# division silently dropped the last len(df) % batch_size rows.
num_batches = math.ceil(len(df) / batch_size)

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    batch_df = df.iloc[start_idx:end_idx, :]  # Select the rows for the current batch
    batch_df.to_csv(f'batch{i+1}.csv', index=False)  # Save the batch to a separate file

####################################################################################################

# without gpu and without multiprocessing
# speed benchmark: dataset 2000 rows -> 73 seconds
#                          4000 rows -> 127.21
####################################################################################################
classifier = pipeline("text-classification", model='bhadresh-savani/distilbert-base-uncased-emotion', top_k=None)

# Get the labels (emotions) from the classifier's output
# for the first row of the dataframe
result = classifier(df2.at[0, 'Tweet'])
labels = [entry['label'] for entry in result[0]]

# One column per emotion, one row per tweet; filled in by the loop below.
emotion_df = pd.DataFrame(index=df2.index, columns=labels)

start = timeit.default_timer()
for i, tweet in df2['Tweet'].items():
    # Apply the classifier to the tweet of the current row
    result = classifier(tweet)
    # Store every score under the column named after its emotion label
    for emotion in result[0]:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[i, emotion['label']] = emotion['score']

result2 = pd.concat([df2, emotion_df], axis=1)
end = timeit.default_timer()
print("Time taken:", end - start)

####################################################################################################

# threading
# speed benchmark: dataset 2000 rows -> 37.27 seconds (i7, 10th gen, 11 cores)
#                          4000 rows -> 73.08
#                          10000 rows -> 220.31
####################################################################################################

classifier = pipeline("text-classification", model='bhadresh-savani/distilbert-base-uncased-emotion', top_k=None)

# Get the labels (emotions) from the classifier's output
# for the first row of the dataframe
result = classifier(df2.at[0, 'Tweet'])
labels = [entry['label'] for entry in result[0]]


def classify_emotion(tweet):
    """Run the classifier on one tweet.

    Args:
        tweet: Tweet text.

    Returns:
        A ``(labels, entries)`` tuple: ``entries`` is the first element of the
        classifier output (dicts with ``'label'`` and ``'score'``) and
        ``labels`` lists the labels of those entries in the same order.
    """
    result = classifier(tweet)
    labels = [entry['label'] for entry in result[0]]
    return labels, result[0]


start = time.time()

# Thread pool with 11 workers; map() hands the results back in input order.
with concurrent.futures.ThreadPoolExecutor(max_workers=11) as executor:
    results = executor.map(classify_emotion, df2['Tweet'])

    # Initialize the emotion dataframe with the same index as the original dataframe
    emotion_df = pd.DataFrame(index=df2.index, columns=labels)

    # Pair every result with the index label of its tweet. A running counter
    # is only correct while the index happens to be 0..n-1.
    for idx, (_, emotions) in zip(df2.index, results):
        for emotion in emotions:
            if emotion['label'] in emotion_df.columns:
                emotion_df.at[idx, emotion['label']] = emotion['score']

    # Concatenate the original dataframe with the emotion dataframe
    result2 = pd.concat([df2, emotion_df], axis=1)

end = time.time()
print(f"Time taken: {end - start} seconds")

####################################################################################################

# Parallel processing (multiple CPU) DASK
# speed benchmark: dataset 2000 rows -> 39.63 (i7, 10th gen, 11 cores)
#                          4000 rows -> 74.22
#                          10000 rows -> (no timing noted)
# DASK: 28.9 seconds (2000 rows), 57.63 (4000 rows)

start = time.time()


def classify_emotion(tweet):
    """Run the classifier on one tweet.

    Returns:
        A ``(labels, entries)`` tuple: ``entries`` is the first element of the
        classifier output (dicts with ``'label'`` and ``'score'``) and
        ``labels`` lists the labels of those entries in the same order.
    """
    result = classifier(tweet)
    labels = [entry['label'] for entry in result[0]]
    return labels, result[0]


# Convert the Pandas dataframe to a Dask dataframe. It gets a name of its own
# so the pandas frame `df` used elsewhere in the session is not overwritten.
ddf = dd.from_pandas(df2, npartitions=11)

# Apply classify_emotion to each tweet (evaluated lazily until compute())
results = ddf['Tweet'].map(classify_emotion)

# Compute the result as a Pandas series
results = results.compute()

# Initialize the emotion dataframe with the same index as the original dataframe
emotion_df = pd.DataFrame(index=df2.index, columns=labels)

# Populate the emotion dataframe, keyed by the index label of each tweet
# rather than by a running counter.
for idx, (_, emotions) in results.items():
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[idx, emotion['label']] = emotion['score']

# Concatenate the original dataframe with the emotion dataframe
result2 = pd.concat([df2, emotion_df], axis=1)
end = time.time()
print(f"Time taken: {end - start} seconds")

####################################################################################################

# DASK + threading
# speed benchmark: dataset 2000 rows -> 37.60 (i7, 10th gen, 2*11 cores)
#                          2000 rows -> 38.78 (i7, 10th gen, 5+6 cores)
#                          4000 rows -> 79.45 (i7, 10th gen, 11 cores)
# DASK + threading: 28.9 seconds + multiple threading = 28.5 (2000 rows),
# 55.57 (4000 rows) (using cpu 11 cores and intel 10th gen i7)

import concurrent.futures

start = time.time()


def classify_emotion(tweet):
    """Run the classifier on one tweet.

    The previous version opened a 5-worker ThreadPoolExecutor around the call
    but never submitted any work to it, so it added nothing.
    NOTE(review): dask.dataframe already runs its partitions on a threaded
    scheduler by default -- confirm whether extra threading is wanted at all.

    Returns:
        A ``(labels, entries)`` tuple: ``entries`` is the first element of the
        classifier output (dicts with ``'label'`` and ``'score'``) and
        ``labels`` lists the labels of those entries in the same order.
    """
    result = classifier(tweet)
    labels = [entry['label'] for entry in result[0]]
    return labels, result[0]


# Convert the Pandas dataframe to a Dask dataframe. It gets a name of its own
# so the pandas frame `df` used elsewhere in the session is not overwritten.
ddf = dd.from_pandas(df2, npartitions=6)

# Apply classify_emotion to each tweet (evaluated lazily until compute())
results = ddf['Tweet'].map(classify_emotion)

# Compute the result as a Pandas series
results = results.compute()

# Initialize the emotion dataframe with the same index as the original dataframe
emotion_df = pd.DataFrame(index=df2.index, columns=labels)

# Populate the emotion dataframe, keyed by the index label of each tweet
# rather than by a running counter.
for idx, (_, emotions) in results.items():
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[idx, emotion['label']] = emotion['score']

# Concatenate the original dataframe with the emotion dataframe
result2 = pd.concat([df2, emotion_df], axis=1)
end = time.time()
print(f"Time taken: {end - start} seconds")

CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))

############################### code############################### code GPU acceleration test (didn't work) ############################### code############################### code

print(torch.backends.cudnn.enabled)
print(torch.cuda.is_available())

# Check if a GPU is visible to TensorFlow. list_physical_devices() replaces
# the deprecated tf.test.is_gpu_available().
if tf.config.list_physical_devices('GPU'):
    print("Running on the GPU")
else:
    print("Running on the CPU")

# with gpu
# Device for the classifier: the first CUDA GPU if PyTorch can see one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The device has to be handed to pipeline() itself (0 = first GPU, -1 = CPU).
# NOTE(review): the tf.device(...) scope used before only affects TensorFlow
# ops; assuming the pipeline loads PyTorch weights, that is the likely reason
# the GPU test "didn't work" -- confirm on a machine with a GPU.
classifier = pipeline(
    "text-classification",
    model='bhadresh-savani/distilbert-base-uncased-emotion',
    top_k=None,
    device=0 if device.type == "cuda" else -1,
)

# Get the labels (emotions) from the classifier's output
# for the first row of the dataframe
result = classifier(df2.at[0, 'Tweet'])
labels = [entry['label'] for entry in result[0]]

# One column per emotion, one row per tweet; filled in by the loop below.
emotion_df = pd.DataFrame(index=df2.index, columns=labels)

start = timeit.default_timer()
for i, tweet in df2['Tweet'].items():
    # Apply the classifier to the text in the 'Tweet' column
    result = classifier(tweet)
    # Store every score under the column named after its emotion label
    for emotion in result[0]:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[i, emotion['label']] = emotion['score']

result2 = pd.concat([df2, emotion_df], axis=1)
end = timeit.default_timer()
print("Time taken:", end - start)

######################################################################################################################################### ######################################## start new here #######################################

import numpy as np
import pandas as pd

# option 1
import transformers
import tensorflow
from transformers import pipeline

# Keep `device` a torch.device in both branches; it used to be rebound to
# the GPU's name (a plain string) when CUDA was available.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Report the name of the current GPU
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU")

option 2

pip install tokenizers

pip install tweetnlp

import torch
import tweetnlp

# Work on the filtered subset with a fresh index and empty strings instead of NaN.
df2 = subset_df
df2 = df2.reset_index()
df2 = df2.replace(np.nan, '')

df1 = df1.reset_index()

# Worker count for the thread pools. The assignments override each other, so
# the last one that is left active wins.
n_cores = cpu_count()
cores = n_cores - 1  # don't use all cores if you want the basic programs of your computer to function properly
cores = 10  # when i'm working during the day on my computer

cores = 11  # during the night when im not working

def classify_emotion(tweet):
    """Classify one tweet with the tweetnlp model.

    Args:
        tweet: Tweet text.

    Returns:
        A ``(labels, scores)`` tuple built from the ``'probability'`` mapping
        of the prediction: ``labels`` lists its keys and ``scores`` holds one
        ``{'label': ..., 'score': ...}`` dict per key, in the same order.
    """
    # Apply the classifier to the given tweet text
    result = classifier.predict(tweet, return_probability=True)
    # Get the labels (emotions) from the result dictionary
    labels = list(result['probability'].keys())
    result = [{'label': label, 'score': score} for label, score in result['probability'].items()]
    return labels, result


# NOTE(review): this only binds a Python variable. If synchronous CUDA
# launches are wanted for debugging, the CUDA_LAUNCH_BLOCKING environment
# variable has to be set before CUDA is initialised -- confirm the intent.
CUDA_LAUNCH_BLOCKING = 1

#############################

deel1

#############################

os.chdir("C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\earlier") df2 = pd.readcsv('tweetsmai25.csv')

df2 = df2.head(1000)

df2 = df df1=df

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

result = classifier.predict(df2.at[0, 'Tweet'], return_probability=True) extract = [{'label': label, 'score': score} for label, score in result['probability'].items()] result = classifier.predict(df2.at[0, 'Tweet'], return_probability=True) labels = list(result['probability'].keys())

labels = [entry['label'] for entry in result[0]]

emotion_words = list(result['probability'].keys()) for word in emotion_words: df2[word] = ''

def classify_emotion(tweet):

Apply the classifier function to the given tweet text

    result = classifier.predict(tweet, return_probability=True)
    # Get the labels (emotions) from the result dictionary
    labels = list(result['probability'].keys())
    result = [{'label': label, 'score': score} for label, score in result['probability'].items()]
    return labels, result

start = time.time()

Initialize a ThreadPoolExecutor with 4 worker threads

with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:

Get an iterator of (index, tweet) tuples from the dataframe

tweet_iter = df1[['Tweet']].itertuples(index=True)
results = executor.map(classify_emotion, (x[1] for x in tweet_iter))
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe
for i, (labels, emotions) in enumerate(results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[i, emotion['label']] = emotion['score']
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache() end = time.time() print(f"Time taken: {end - start} seconds")

results1.to_csv('resultsday31.csv')

#############################

deel2

#############################

os.chdir("C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_10") df2 = pd.read_csv('deel2June10.csv')

df2 = df2.head(1000)

df1 = df2

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

labels = [entry['label'] for entry in result[0]]

emotion_words = list(result['probability'].keys()) for word in emotion_words: df2[word] = ''

def classify_emotion(tweet):

Apply the classifier function to the given tweet text

    result = classifier.predict(tweet, return_probability=True)
    # Get the labels (emotions) from the result dictionary
    labels = list(result['probability'].keys())
    result = [{'label': label, 'score': score} for label, score in result['probability'].items()]
    return labels, result

start = time.time()

Initialize a ThreadPoolExecutor with 4 worker threads

with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:

Get an iterator of (index, tweet) tuples from the dataframe

tweet_iter = df1[['Tweet']].itertuples(index=True)
results = executor.map(classify_emotion, (x[1] for x in tweet_iter))
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe
for i, (labels, emotions) in enumerate(results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[i, emotion['label']] = emotion['score']
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache() end = time.time() print(f"Time taken: {end - start} seconds")

results1.to_csv('results2.csv')

#############################

deel3

#############################

os.chdir("C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_10") df2 = pd.read_csv('deel3June10.csv')

df2 = df2.head(1000)

df1 = df2

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

labels = [entry['label'] for entry in result[0]]

emotion_words = list(result['probability'].keys()) for word in emotion_words: df2[word] = ''

def classify_emotion(tweet):

Apply the classifier function to the given tweet text

    result = classifier.predict(tweet, return_probability=True)
    # Get the labels (emotions) from the result dictionary
    labels = list(result['probability'].keys())
    result = [{'label': label, 'score': score} for label, score in result['probability'].items()]
    return labels, result

start = time.time()

Initialize a ThreadPoolExecutor with 4 worker threads

with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:

Get an iterator of (index, tweet) tuples from the dataframe

tweet_iter = df1[['Tweet']].itertuples(index=True)
results = executor.map(classify_emotion, (x[1] for x in tweet_iter))
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe
for i, (labels, emotions) in enumerate(results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[i, emotion['label']] = emotion['score']
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache() end = time.time() print(f"Time taken: {end - start} seconds")

results1.to_csv('results3.csv')

#############################

deel4

#############################

os.chdir("C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_02") df2 = pd.read_csv('deel4June2.csv')

df2 = df2.head(1000)

df1 = df2

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

labels = [entry['label'] for entry in result[0]]

emotion_words = list(result['probability'].keys()) for word in emotion_words: df2[word] = ''

def classify_emotion(tweet):

Apply the classifier function to the given tweet text

    result = classifier.predict(tweet, return_probability=True)
    # Get the labels (emotions) from the result dictionary
    labels = list(result['probability'].keys())
    result = [{'label': label, 'score': score} for label, score in result['probability'].items()]
    return labels, result

start = time.time()

Initialize a ThreadPoolExecutor with 4 worker threads

with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:

Get an iterator of (index, tweet) tuples from the dataframe

tweet_iter = df1[['Tweet']].itertuples(index=True)
results = executor.map(classify_emotion, (x[1] for x in tweet_iter))
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe
for i, (labels, emotions) in enumerate(results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[i, emotion['label']] = emotion['score']
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache() end = time.time() print(f"Time taken: {end - start} seconds")

results1.to_csv('results4.csv')

#############################

deel5

#############################

os.chdir("C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_02") df2 = pd.read_csv('deel5June2.csv')

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

labels = [entry['label'] for entry in result[0]]

emotion_words = list(result['probability'].keys()) for word in emotion_words: df2[word] = ''

df1 = df2.sample(n=1000)

start = time.time()

Initialize a ThreadPoolExecutor with 4 worker threads

with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:

Get an iterator of (index, tweet) tuples from the dataframe

tweet_iter = df1[['Tweet']].itertuples(index=True)
results = executor.map(classify_emotion, (x[1] for x in tweet_iter))
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe
for i, (labels, emotions) in enumerate(results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[i, emotion['label']] = emotion['score']
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache() end = time.time() print(f"Time taken: {end - start} seconds")

results1.to_csv('results5.csv')

#############################

deel6

#############################

os.chdir("C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_02") df2 = pd.read_csv('deel6June2.csv')

df2 = df2.head(1000)

df1 = df2

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

labels = [entry['label'] for entry in result[0]]

emotion_words = list(result['probability'].keys()) for word in emotion_words: df2[word] = ''

def classify_emotion(tweet):

Apply the classifier function to the given tweet text

    result = classifier.predict(tweet, return_probability=True)
    # Get the labels (emotions) from the result dictionary
    labels = list(result['probability'].keys())
    result = [{'label': label, 'score': score} for label, score in result['probability'].items()]
    return labels, result

start = time.time()

Initialize a ThreadPoolExecutor with 4 worker threads

with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:

Get an iterator of (index, tweet) tuples from the dataframe

tweet_iter = df1[['Tweet']].itertuples(index=True)
results = executor.map(classify_emotion, (x[1] for x in tweet_iter))
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe
for i, (labels, emotions) in enumerate(results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[i, emotion['label']] = emotion['score']
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache() end = time.time() print(f"Time taken: {end - start} seconds")

results1.to_csv('results6.csv')

#############################

deel7

#############################

os.chdir("C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_02") df2 = pd.read_csv('deel7June2.csv')

df2 = df2.head(1000)

df1 = df2

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

labels = [entry['label'] for entry in result[0]]

emotion_words = list(result['probability'].keys()) for word in emotion_words: df2[word] = ''

def classify_emotion(tweet):

Apply the classifier function to the given tweet text

    result = classifier.predict(tweet, return_probability=True)
    # Get the labels (emotions) from the result dictionary
    labels = list(result['probability'].keys())
    result = [{'label': label, 'score': score} for label, score in result['probability'].items()]
    return labels, result

start = time.time()

Initialize a ThreadPoolExecutor with 4 worker threads

with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:

Get an iterator of (index, tweet) tuples from the dataframe

tweet_iter = df1[['Tweet']].itertuples(index=True)
results = executor.map(classify_emotion, (x[1] for x in tweet_iter))
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe
for i, (labels, emotions) in enumerate(results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[i, emotion['label']] = emotion['score']
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache() end = time.time() print(f"Time taken: {end - start} seconds")

results1.to_csv('results7.csv')

#############################

deel8

#############################

os.chdir("C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_02") df2 = pd.read_csv('deel8June2.csv')

df2 = df2.head(1000)

df1 = df2

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

labels = [entry['label'] for entry in result[0]]

emotion_words = list(result['probability'].keys()) for word in emotion_words: df2[word] = ''

def classify_emotion(tweet):

Apply the classifier function to the given tweet text

    result = classifier.predict(tweet, return_probability=True)
    # Get the labels (emotions) from the result dictionary
    labels = list(result['probability'].keys())
    result = [{'label': label, 'score': score} for label, score in result['probability'].items()]
    return labels, result

start = time.time()

Initialize a ThreadPoolExecutor with 4 worker threads

with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:

Get an iterator of (index, tweet) tuples from the dataframe

tweet_iter = df1[['Tweet']].itertuples(index=True)
results = executor.map(classify_emotion, (x[1] for x in tweet_iter))
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe
for i, (labels, emotions) in enumerate(results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[i, emotion['label']] = emotion['score']
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache() end = time.time() print(f"Time taken: {end - start} seconds")

results1.to_csv('results8.csv')

#############################

deel9

#############################

os.chdir("C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_02") df2 = pd.read_csv('deel9June2.csv')

df2 = df2.head(1000)

df1 = df2

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

labels = [entry['label'] for entry in result[0]]

emotion_words = list(result['probability'].keys()) for word in emotion_words: df2[word] = ''

def classify_emotion(tweet):
    """Classify one tweet and return its emotion labels with their scores.

    Relies on the module-level ``classifier`` object.

    Args:
        tweet: Text passed unchanged to ``classifier.predict``.

    Returns:
        A ``(labels, scores)`` tuple. ``labels`` is the list of keys of the
        prediction's ``'probability'`` mapping; ``scores`` is a list of
        ``{'label': ..., 'score': ...}`` dicts, one per key, in the same order.
    """
    # Apply the classifier function to the given tweet text
    prediction = classifier.predict(tweet, return_probability=True)
    probabilities = prediction['probability']
    # Get the labels (emotions) from the result dictionary
    labels = list(probabilities.keys())
    scores = [{'label': label, 'score': score} for label, score in probabilities.items()]
    return labels, scores

start = time.time()

# Classify every tweet on a pool of `cores` worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
    # Executor.map returns results in input order, so they line up with the rows.
    results = list(executor.map(classify_emotion, df1['Tweet']))

# One row per tweet, one column per emotion label.
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe.
# Pair each result with its index label instead of rebinding the global `labels`.
for row, (_, emotions) in zip(emotion_df.index, results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[row, emotion['label']] = emotion['score']

# NOTE(review): df1 already carries blank columns named after each emotion
# (added before this run), so every emotion name appears twice in the output.
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache()
end = time.time()
print(f"Time taken: {end - start} seconds")

results1.to_csv('results9.csv')

#############################

# deel10

#############################

# Switch to the June 2 chunk folder and load chunk 10.
# Raw string: in a normal literal "\U" is rejected as a malformed unicode
# escape and "\t" would silently turn into a tab character.
os.chdir(r"C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_02")
df2 = pd.read_csv('deel10June2.csv')

# Keep only the first 1000 rows.
# NOTE(review): confirm this cap is still wanted for full runs.
df2 = df2.head(1000)

# df1 is an alias of df2 (same object), so columns added to df2 below
# are visible through df1 as well.
df1 = df2

# Get the labels (emotions) from the classifier's output dictionary.
# NOTE(review): `result` is not assigned in this section; it is assumed to be
# an earlier classifier.predict(..., return_probability=True) output - confirm.
# The former `result[0]` lookup contradicted the 'probability' lookup below.
labels = list(result['probability'].keys())

# Pre-create one empty-string column per emotion.
emotion_words = list(result['probability'].keys())
for word in emotion_words:
    df2[word] = ''

def classify_emotion(tweet):
    """Classify one tweet and return its emotion labels with their scores.

    Relies on the module-level ``classifier`` object.

    Args:
        tweet: Text passed unchanged to ``classifier.predict``.

    Returns:
        A ``(labels, scores)`` tuple. ``labels`` is the list of keys of the
        prediction's ``'probability'`` mapping; ``scores`` is a list of
        ``{'label': ..., 'score': ...}`` dicts, one per key, in the same order.
    """
    # Apply the classifier function to the given tweet text
    prediction = classifier.predict(tweet, return_probability=True)
    probabilities = prediction['probability']
    # Get the labels (emotions) from the result dictionary
    labels = list(probabilities.keys())
    scores = [{'label': label, 'score': score} for label, score in probabilities.items()]
    return labels, scores

start = time.time()

# Classify every tweet on a pool of `cores` worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
    # Executor.map returns results in input order, so they line up with the rows.
    results = list(executor.map(classify_emotion, df1['Tweet']))

# One row per tweet, one column per emotion label.
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe.
# Pair each result with its index label instead of rebinding the global `labels`.
for row, (_, emotions) in zip(emotion_df.index, results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[row, emotion['label']] = emotion['score']

# NOTE(review): df1 already carries blank columns named after each emotion
# (added before this run), so every emotion name appears twice in the output.
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache()
end = time.time()
print(f"Time taken: {end - start} seconds")

results1.to_csv('results10.csv')

#############################

# deel11

#############################

# Switch to the June 2 chunk folder and load chunk 11.
# Raw string: in a normal literal "\U" is rejected as a malformed unicode
# escape and "\t" would silently turn into a tab character.
os.chdir(r"C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_02")
df2 = pd.read_csv('deel11June2.csv')

# Keep only the first 1000 rows.
# NOTE(review): confirm this cap is still wanted for full runs.
df2 = df2.head(1000)

# df1 is an alias of df2 (same object), so columns added to df2 below
# are visible through df1 as well.
df1 = df2

# Get the labels (emotions) from the classifier's output dictionary.
# NOTE(review): `result` is not assigned in this section; it is assumed to be
# an earlier classifier.predict(..., return_probability=True) output - confirm.
# The former `result[0]` lookup contradicted the 'probability' lookup below.
labels = list(result['probability'].keys())

# Pre-create one empty-string column per emotion.
emotion_words = list(result['probability'].keys())
for word in emotion_words:
    df2[word] = ''

def classify_emotion(tweet):
    """Classify one tweet and return its emotion labels with their scores.

    Relies on the module-level ``classifier`` object.

    Args:
        tweet: Text passed unchanged to ``classifier.predict``.

    Returns:
        A ``(labels, scores)`` tuple. ``labels`` is the list of keys of the
        prediction's ``'probability'`` mapping; ``scores`` is a list of
        ``{'label': ..., 'score': ...}`` dicts, one per key, in the same order.
    """
    # Apply the classifier function to the given tweet text
    prediction = classifier.predict(tweet, return_probability=True)
    probabilities = prediction['probability']
    # Get the labels (emotions) from the result dictionary
    labels = list(probabilities.keys())
    scores = [{'label': label, 'score': score} for label, score in probabilities.items()]
    return labels, scores

start = time.time()

# Classify every tweet on a pool of `cores` worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
    # Executor.map returns results in input order, so they line up with the rows.
    results = list(executor.map(classify_emotion, df1['Tweet']))

# One row per tweet, one column per emotion label.
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe.
# Pair each result with its index label instead of rebinding the global `labels`.
for row, (_, emotions) in zip(emotion_df.index, results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[row, emotion['label']] = emotion['score']

# NOTE(review): df1 already carries blank columns named after each emotion
# (added before this run), so every emotion name appears twice in the output.
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache()
end = time.time()
print(f"Time taken: {end - start} seconds")

results1.to_csv('results11.csv')

############################################################################################################ ############################################################################################################ ############################################################################################################

# day 11

############################################################################################################ ############################################################################################################ ############################################################################################################

#############################

# deel1

#############################

# Switch to the June 11 chunk folder and load chunk 1.
# Raw string: in a normal literal "\U" is rejected as a malformed unicode
# escape and "\t" would silently turn into a tab character.
os.chdir(r"C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_11")
df2 = pd.read_csv('deel1June11.csv')

# Keep only the first 1000 rows.
# NOTE(review): confirm this cap is still wanted for full runs.
df2 = df2.head(1000)

# df1 is an alias of df2 (same object), so columns added to df2 below
# are visible through df1 as well.
df1 = df2

# Get the labels (emotions) from the classifier's output dictionary.
# NOTE(review): `result` is not assigned in this section; it is assumed to be
# an earlier classifier.predict(..., return_probability=True) output - confirm.
# The former `result[0]` lookup contradicted the 'probability' lookup below.
labels = list(result['probability'].keys())

# Pre-create one empty-string column per emotion.
emotion_words = list(result['probability'].keys())
for word in emotion_words:
    df2[word] = ''

def classify_emotion(tweet):
    """Classify one tweet and return its emotion labels with their scores.

    Relies on the module-level ``classifier`` object.

    Args:
        tweet: Text passed unchanged to ``classifier.predict``.

    Returns:
        A ``(labels, scores)`` tuple. ``labels`` is the list of keys of the
        prediction's ``'probability'`` mapping; ``scores`` is a list of
        ``{'label': ..., 'score': ...}`` dicts, one per key, in the same order.
    """
    # Apply the classifier function to the given tweet text
    prediction = classifier.predict(tweet, return_probability=True)
    probabilities = prediction['probability']
    # Get the labels (emotions) from the result dictionary
    labels = list(probabilities.keys())
    scores = [{'label': label, 'score': score} for label, score in probabilities.items()]
    return labels, scores

start = time.time()

# Classify every tweet on a pool of `cores` worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
    # Executor.map returns results in input order, so they line up with the rows.
    results = list(executor.map(classify_emotion, df1['Tweet']))

# One row per tweet, one column per emotion label.
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe.
# Pair each result with its index label instead of rebinding the global `labels`.
for row, (_, emotions) in zip(emotion_df.index, results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[row, emotion['label']] = emotion['score']

# NOTE(review): df1 already carries blank columns named after each emotion
# (added before this run), so every emotion name appears twice in the output.
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache()
end = time.time()
print(f"Time taken: {end - start} seconds")

results1.to_csv('results1.csv')

#############################

# deel2

#############################

# Switch to the June 11 chunk folder and load chunk 2.
# Raw string: in a normal literal "\U" is rejected as a malformed unicode
# escape and "\t" would silently turn into a tab character.
os.chdir(r"C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_11")
df2 = pd.read_csv('deel2June11.csv')

# Keep only the first 1000 rows.
# NOTE(review): confirm this cap is still wanted for full runs.
df2 = df2.head(1000)

# df1 is an alias of df2 (same object), so columns added to df2 below
# are visible through df1 as well.
df1 = df2

# Get the labels (emotions) from the classifier's output dictionary.
# NOTE(review): `result` is not assigned in this section; it is assumed to be
# an earlier classifier.predict(..., return_probability=True) output - confirm.
# The former `result[0]` lookup contradicted the 'probability' lookup below.
labels = list(result['probability'].keys())

# Pre-create one empty-string column per emotion.
emotion_words = list(result['probability'].keys())
for word in emotion_words:
    df2[word] = ''

def classify_emotion(tweet):
    """Classify one tweet and return its emotion labels with their scores.

    Relies on the module-level ``classifier`` object.

    Args:
        tweet: Text passed unchanged to ``classifier.predict``.

    Returns:
        A ``(labels, scores)`` tuple. ``labels`` is the list of keys of the
        prediction's ``'probability'`` mapping; ``scores`` is a list of
        ``{'label': ..., 'score': ...}`` dicts, one per key, in the same order.
    """
    # Apply the classifier function to the given tweet text
    prediction = classifier.predict(tweet, return_probability=True)
    probabilities = prediction['probability']
    # Get the labels (emotions) from the result dictionary
    labels = list(probabilities.keys())
    scores = [{'label': label, 'score': score} for label, score in probabilities.items()]
    return labels, scores

start = time.time()

# Classify every tweet on a pool of `cores` worker threads.
with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
    # Executor.map returns results in input order, so they line up with the rows.
    results = list(executor.map(classify_emotion, df1['Tweet']))

# One row per tweet, one column per emotion label.
emotion_df = pd.DataFrame(index=df2.index, columns=labels)
# Loop through the results and populate the emotion dataframe.
# Pair each result with its index label instead of rebinding the global `labels`.
for row, (_, emotions) in zip(emotion_df.index, results):
    for emotion in emotions:
        if emotion['label'] in emotion_df.columns:
            emotion_df.at[row, emotion['label']] = emotion['score']

# NOTE(review): df1 already carries blank columns named after each emotion
# (added before this run), so every emotion name appears twice in the output.
results1 = pd.concat([df1, emotion_df], axis=1)

torch.cuda.empty_cache()
end = time.time()
print(f"Time taken: {end - start} seconds")

results1.to_csv('results2.csv')

#############################

# deel3

#############################

# Switch to the June 11 chunk folder and load chunk 3.
# Raw string: in a normal literal "\U" is rejected as a malformed unicode
# escape and "\t" would silently turn into a tab character.
os.chdir(r"C:\Users\Brent\Documents\Psychology\KUL\thesisstatisticsdata\tweetsMA2_06_11")
df2 = pd.read_csv('deel3June11.csv')

# Keep only the first 1000 rows.
# NOTE(review): confirm this cap is still wanted for full runs.
df2 = df2.head(1000)

# df1 is an alias of df2 (same object), so columns added to df2 below
# are visible through df1 as well.
df1 = df2

# Get the labels (emotions) from the classifier's output dictionary.
# NOTE(review): `result` is not assigned in this section; it is assumed to be
# an earlier classifier.predict(..., return_probability=True) output - confirm.
# The former `result[0]` lookup contradicted the 'probability' lookup below.
labels = list(result['probability'].keys())

# Pre-create one empty-string column per emotion.
emotion_words = list(result['probability'].keys())
for word in emotion_words:
    df2[word] = ''

def classify_emotion(tweet):
    """Classify one tweet and return its emotion labels with their scores.

    Relies on the module-level ``classifier`` object.

    Args:
        tweet: Text passed unchanged to ``classifier.predict``.

    Returns:
        A ``(labels, scores)`` tuple. ``labels`` is the list of keys of the
        prediction's ``'probability'`` mapping; ``scores`` is a list of
        ``{'label': ..., 'score': ...}`` dicts, one per key, in the same order.
    """
    # Apply the classifier function to the given tweet text
    prediction = classifier.predict(tweet, return_probability=True)
    probabilities = prediction['probability']
    # Get the labels (emotions) from the result dictionary
    labels = list(probabilities.keys())
    scores = [{'label': label, 'score': score} for label, score in probabilities.items()]
    return labels, scores

HendrickxBrent / EmotionContagionDuringBlackLIvesMatter

NLP #2

make sure you are in a new environment

conda activate thesisstats

pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu112

RoBERTa

Get the current process ID

Send a signal to restart the process

text = re.sub('[0-9]+', '', text) # numbers

after BLM

List of words to search for

Create a regular expression pattern that matches any of the words

Use the str.contains() method to create a boolean mask that is True if the "Tweet" column contains any of the words

Convert the boolean mask to integers, so that True becomes 1 and False becomes 0

split dataset to train in chunks

new split approach

without gpu and without multiprocessing

speed benchmark: dataset 2000 rows -> 73 seconds

4000 rows -> 127.21

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

Apply the classifier function to the Tweet in the 'Tweet' column

threading

speed benchmark: dataset 2000 rows -> 37.27 seconds (I7, 10th edition, 11 cores)

4000 rows -> 73.08

10000 rows-> 220.31

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

Apply the classifier function to the given tweet text

Initialize a ThreadPoolExecutor with 4 worker threads

Get an iterator of (index, tweet) tuples from the dataframe

Parallel processing (multiple CPU) DASK

speed benchmark: dataset 2000 rows -> 39.63 (I7, 10th edition, 11 cores)

4000 rows -> 74.22

10000

DASK this is 28.9 seconds (2000) for 4000 = 57.63

Apply the classifier function to the given tweet text

Convert the Pandas dataframe to a Dask dataframe

Use the map method of the Dask dataframe to apply the classify_emotion function to each tweet

Compute the result as a Pandas dataframe

Initialize the emotion dataframe with the same shape as the original dataframe

Loop through the results and populate the emotion dataframe

Concatenate the original dataframe with the emotion dataframe

DASK + threading

speed benchmark: dataset 2000 rows -> 37.60 (I7, 10th edition, 2*11 cores)

2000 rows -> 38.78 (I7, 10th edition, 5+6 cores)

4000 rows -> 79.45 (I7, 10th edition, 11 cores)

DASK + threading this is 28.9 seconds + multiple threading = 28.5 (2000) for 4000 = 55.57 (using cpu 11 cores and intel 10th gen i7)

Initialize a ThreadPoolExecutor with 4 worker threads

Convert the Pandas dataframe to a Dask dataframe

Use the map method of the Dask dataframe to apply the classify_emotion function to each tweet

Compute the result as a Pandas dataframe

Initialize the emotion dataframe with the same shape as the original dataframe

Loop through the results and populate the emotion dataframe

Concatenate the original dataframe with the emotion dataframe

Check if the GPU is available

Set the device to GPU

Set the device to CPU

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

with gpu

Apply the classifier function to the text in the 'Tweet' column

option 1

option 2

pip install tokenizers

pip install tweetnlp

cores = 11 # during the night when im not working

Apply the classifier function to the given tweet text

CUDA_LAUNCH_BLOCKING=1

deel1

df2 = df2.head(1000)

Get the labels (emotions) from the classifier's output dictionary

for the first row of the dataframe

labels = [entry['label'] for entry in result[0]]

Apply the classifier function to the given tweet text

Initialize a ThreadPoolExecutor with 4 worker threads

Get an iterator of (index, tweet) tuples from the dataframe

deel2

df2 = df2.head(1000)

Get the labels (emotions) from the classifier's output dictionary

Use the `str.contains()` method to create a boolean mask that is True if the "Tweet" column contains any of the words