Closed leopensaa closed 2 years ago
@alexrods could you please paste here the code solution for the graphic? It has to be included in your comments. We need just one. This will be useful for @martin-crdev to add to their work this week, and also to get an idea and fill in the main Goal and the Acceptance Criteria of this issue. Thank you!
OK, I made a graph of the sentiment analysis of all external data reviews, and another with the sentiment analysis by category.
import os
import numpy as np
import pandas as pd
import seaborn as sns
from io import BytesIO
import matplotlib.pyplot as plt
from custom_transformers import import_data, DropNullData, DropDuplicates
from text_utils import re_breakline, re_dates, re_hiperlinks, re_money, re_negation, re_numbers, re_special_chars, re_whitespaces, ApplyRegex, StemmingProcess, StopWordsRemoval
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from joblib import load
from ml_utils import ColumnMapping
from text_utils import stopwords_removal, stemming_process, ngrams_count
from viz_utils import single_countplot, donut_plot, format_spines
from scipy.sparse import csr_matrix
# Project locations -- take a look at your project structure and adjust.
DATA_PATH = '../Data_analysis/datasets'
PIPELINES_PATH = '/NLP/pipelines'
MODELS_PATH = '/NLP/models'

# Column names used when reading and labelling the review data.
CORPUS_COL = 'review_comment_message'
TARGET_COL = 'target'
COLS_READ = [CORPUS_COL, 'review_score']
# Defining stopwords
# Portuguese stop-word list from NLTK; it is handed to both the
# StopWordsRemoval step and the TfidfVectorizer of the text prep pipeline.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# available locally -- confirm it is downloaded in the target environment.
PT_STOPWORDS = stopwords.words('portuguese')
# READING DATA
# Load the external reviews once and rename the columns to the names the
# preparation pipeline works with.
df = import_data(os.path.join(DATA_PATH, 'external_data.csv'))
df.drop('Unnamed: 0', axis=1, inplace=True)
df.rename(columns={'comment': 'review_comment_message', 'score': 'review_score'}, inplace=True)
print('Dataset Shape: ', df.shape)
df.head()  # notebook preview; has no visible effect when run as a script

# Independent working copy for the modelling steps, so they never alter `df`.
# Copying avoids reading and renaming the very same CSV a second time.
df_ml = df.copy()
print('Dataset Shape: ', df_ml.shape)
# Binary sentiment target: review scores 1-3 map to class 0,
# review scores 4-5 map to class 1.
score_map = {
    **dict.fromkeys((1, 2, 3), 0),
    **dict.fromkeys((4, 5), 1),
}
# Initial preparation: map `review_score` to the binary target column, then
# run the null and duplicate droppers imported from custom_transformers.
initial_prep_pipeline = Pipeline(steps=[
    (
        'mapper',
        ColumnMapping(
            old_col_name='review_score',
            mapping_dict=score_map,
            new_col_name=TARGET_COL,
        ),
    ),
    ('null_dropper', DropNullData()),
    ('dup_dropper', DropDuplicates()),
])

# Applying initial prep pipeline
df_prep = initial_prep_pipeline.fit_transform(df_ml)

# Rebuild a fresh index: reset_index parks the old one in an 'index'
# column, which is discarded right away.
df_prep.reset_index(inplace=True)
df_prep.drop(columns='index', inplace=True)
# Defining regex transformers to be applied
# Maps a step label to the regex helper imported from text_utils; the whole
# dict is handed to ApplyRegex below.
regex_transformers = {
    'break_line': re_breakline,
    'hiperlinks': re_hiperlinks,
    'dates': re_dates,
    'money': re_money,
    'numbers': re_numbers,
    'negation': re_negation,
    'special_chars': re_special_chars,
    'whitespaces': re_whitespaces,
}

# Building a text prep pipeline:
# regex cleaning -> stop-word removal -> stemming -> TF-IDF vectorization.
text_prep_pipeline = Pipeline([
    ('regex', ApplyRegex(regex_transformers)),
    ('stopwords', StopWordsRemoval(PT_STOPWORDS)),
    ('stemming', StemmingProcess(RSLPStemmer())),
    # min_df must be an integer >= 1 (or a float in [0, 1]) to pass
    # scikit-learn's parameter validation; 1 keeps every term, which is what
    # an integer 0 would mean.
    # NOTE(review): max_df=1 is an integer document count, not a proportion --
    # terms found in more than one document of a fitted batch are dropped.
    # Confirm that this is intended.
    ('vectorizer', TfidfVectorizer(max_features=300, min_df=1, max_df=1, stop_words=PT_STOPWORDS)),
])
# Location of the serialized classifier; the MODEL_PATH environment variable
# overrides the default relative path.
model_path = os.environ.get('MODEL_PATH', 'models/sentiment_clf_model.pkl')

# SECURITY: joblib files are pickle-based, so loading one can execute
# arbitrary code -- only point MODEL_PATH at trusted artifacts.
with open(model_path, 'rb') as pipe_file:
    # joblib reads straight from the open file object; there is no need to
    # copy the whole file into an in-memory buffer first.
    model = load(pipe_file)
# Classify every prepared review, one comment at a time.
# NOTE(review): the whole text pipeline (TF-IDF included) is re-fitted on each
# single comment and the result is padded to 500 columns, so column positions
# depend on that comment's own vocabulary -- confirm this matches the feature
# layout the loaded model was trained on.
sent_anls = []
for i, comment in enumerate(df_prep['review_comment_message']):
    try:
        comment_prep = text_prep_pipeline.fit_transform([comment])
        m_array = csr_matrix(comment_prep, shape=(1, 500)).toarray()
        sent_anls.append(model.predict(m_array)[0])
    except Exception as e:
        # Best effort: report the review that could not be scored, carry on.
        print(f"key {i}: error {e}")

# Numeric prediction next to its human-readable label (1 -> positive).
sent_dict = {
    'sentiment': sent_anls,
    'sentiment_label': ['positive' if x == 1 else "negative" for x in sent_anls],
}
sent_df = pd.DataFrame(sent_dict)
# Donut chart with the overall split between positive and negative predictions.
labelled = sent_df.query('sentiment_label in ("positive", "negative")')
fig, ax = plt.subplots(figsize=(7, 7))
donut_plot(
    labelled,
    'sentiment_label',
    label_names=labelled['sentiment_label'].value_counts().index,
    ax=ax,
    colors=['darkslateblue', 'crimson'],
)
ax.set_title("General Sentiment Analysis")
plt.show()
import json
# Category definitions; every top-level key doubles as the base name of a
# per-category CSV file read further down.
# JSON text is UTF-8, so the encoding is pinned instead of relying on the
# platform default. The `with` block closes the file by itself.
with open('../Scraping/categories/brazil.json', 'r', encoding='utf-8') as f:
    category = json.load(f)
files = list(category)
# Functions to modularize the process
def read_prep_data(path):
    """Load one review CSV and run it through the initial prep pipeline.

    Parameters
    ----------
    path : str
        Location of the CSV file, forwarded to ``import_data``.

    Returns
    -------
    DataFrame
        Output of ``initial_prep_pipeline.fit_transform`` with a freshly
        rebuilt index.
    """
    column_names = {'comment': 'review_comment_message', 'score': 'review_score'}

    reviews = import_data(path, verbose=False)
    reviews.drop(columns='Unnamed: 0', inplace=True)
    reviews.rename(columns=column_names, inplace=True)

    prepared = initial_prep_pipeline.fit_transform(reviews)
    # reset_index parks the old index in an 'index' column; discard it again.
    prepared.reset_index(inplace=True)
    prepared.drop(columns='index', inplace=True)
    return prepared
def nlp_predic(data):
    """Predict the sentiment of every review in ``data``.

    Each comment is pushed through ``text_prep_pipeline`` on its own, padded
    to a (1, 500) array and passed to ``model.predict``. Reviews that fail
    are reported on stdout and left out of the result.

    Parameters
    ----------
    data : DataFrame
        Must contain a ``review_comment_message`` column.

    Returns
    -------
    DataFrame
        Columns ``sentiment`` (raw prediction) and ``sentiment_label``
        ('positive' when the prediction equals 1, otherwise 'negative').

    Raises
    ------
    KeyError
        If ``data`` has no ``review_comment_message`` column.
    """
    # NOTE(review): the pipeline (TF-IDF included) is re-fitted per comment,
    # so column positions depend on that comment's own vocabulary -- confirm
    # this matches the feature layout the model was trained on.
    sent_anls = []
    # Iterate by position so no row is skipped when the index of ``data`` is
    # not the default 0..n-1 range.
    for i, comment in enumerate(data['review_comment_message']):
        try:
            comment_prep = text_prep_pipeline.fit_transform([comment])
            m_array = csr_matrix(comment_prep, shape=(1, 500)).toarray()
            sent_anls.append(model.predict(m_array)[0])
        except Exception as e:
            # Best effort: keep going, but make the skipped review visible
            # instead of dropping it silently.
            print(f"key {i}: error {e}")

    return pd.DataFrame({
        'sentiment': sent_anls,
        'sentiment_label': ['positive' if x == 1 else "negative" for x in sent_anls],
    })
# Helps to define the graphs position: (row, column) coordinates of a
# 4 x 5 grid, listed row by row.
slots = [(row, col) for row in range(4) for col in range(5)]
# One donut chart per category on a 4 x 5 grid (room for 20 categories).
fig, axs = plt.subplots(4, 5, figsize=(30, 24))
for i, file in enumerate(files):
    path = '../Scraping/External_data/{}.csv'.format(file)
    data = read_prep_data(path)
    prediction = nlp_predic(data)

    # Filter once and reuse the result for both the data and its labels.
    labelled = prediction.query('sentiment_label in ("positive", "negative")')
    ax = axs[slots[i]]
    donut_plot(labelled, 'sentiment_label',
               label_names=labelled['sentiment_label'].value_counts().index,
               ax=ax, colors=['darkslateblue', 'crimson'])
    ax.set_title('Category: {}'.format(file))

# Hide the grid cells that did not receive a chart.
for unused_slot in slots[len(files):]:
    axs[unused_slot].set_axis_off()
plt.show()
Reopened this issue. It is not really clear how to proceed with this chart for the dashboard, so it has to be discussed with the squad in today's Master Session.
💡 Goal
Use the trained NLP model on the external data reviews, create a new dataset with the sentiment analysis classification, and show the percentages of positive reviews by category in graphs.
🤝 Acceptance Criteria
See the Notion reference with pictures as an example.