C10-Brazilian-e-commerce-modeling-team / brazilian-e-commerce


feat: Graph 10. Web scraping insight #70

Closed leopensaa closed 2 years ago

leopensaa commented 2 years ago

💡 Goal

Use the trained NLP model on externally scraped reviews, create a new dataset with sentiment-analysis classifications, and show the percentage of positive reviews per category in graphs.

🤝 Acceptance Criteria

See the Notion reference, which includes example pictures.

alexrods commented 2 years ago

Notebook

leopensaa commented 2 years ago

@alexrods could you please paste here the code solution for the graphic? It has to be included in your comments, and we need just one. This will be useful for @martin-crdev to add it to their work this week, and also to get an idea and fill in the main Goal and the Acceptance Criteria of this issue. Thank you!

alexrods commented 2 years ago

OK, I made one graph with the sentiment analysis of all external data reviews, and another with the sentiment analysis by category.

Import libraries

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from joblib import load
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Project-local helper modules
from custom_transformers import import_data, DropNullData, DropDuplicates
from ml_utils import ColumnMapping
from text_utils import (re_breakline, re_dates, re_hiperlinks, re_money, re_negation,
                        re_numbers, re_special_chars, re_whitespaces, ApplyRegex,
                        StemmingProcess, StopWordsRemoval, stopwords_removal,
                        stemming_process, ngrams_count)
from viz_utils import single_countplot, donut_plot, format_spines

Initial Settings

DATA_PATH = '../Data_analysis/datasets'
PIPELINES_PATH = '/NLP/pipelines' # Take a look at your project structure
MODELS_PATH = '/NLP/models' # Take a look at your project structure

# Variables for reading the data
COLS_READ = ['review_comment_message', 'review_score']
CORPUS_COL = 'review_comment_message'
TARGET_COL = 'target'

# Defining stopwords
PT_STOPWORDS = stopwords.words('portuguese')
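
Note: the Portuguese stopword list and RSLPStemmer both rely on NLTK corpora that may need a one-time download in a fresh environment; a minimal sketch:

# Assumption: NLTK corpora are not pre-installed in the environment
import nltk
nltk.download('stopwords')  # backs stopwords.words('portuguese')
nltk.download('rslp')       # backs RSLPStemmer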
# READING DATA

df = import_data(os.path.join(DATA_PATH, 'external_data.csv'))
df.drop('Unnamed: 0', axis=1, inplace=True)
df.rename(columns={'comment': 'review_comment_message', 'score': 'review_score'}, inplace=True)
print('Dataset Shape: ', df.shape)
df.head()

NLP

df_ml = import_data(os.path.join(DATA_PATH, 'external_data.csv'))
df_ml.drop('Unnamed: 0', axis=1, inplace=True)
df_ml.rename(columns={'comment': 'review_comment_message', 'score': 'review_score'}, inplace=True)
print('Dataset Shape: ', df_ml.shape)

Creating a pipeline for the initial prep on the data

# Scores 1-3 map to negative (0); scores 4-5 map to positive (1)
score_map = {
    1: 0,
    2: 0,
    3: 0,
    4: 1,
    5: 1
}

initial_prep_pipeline = Pipeline([
    ('mapper', ColumnMapping(old_col_name='review_score', mapping_dict=score_map, new_col_name=TARGET_COL)),
    ('null_dropper', DropNullData()),
    ('dup_dropper', DropDuplicates())
])
# Applying the initial prep pipeline
df_prep = initial_prep_pipeline.fit_transform(df_ml)
df_prep.reset_index(drop=True, inplace=True)
# Defining regex transformers to be applied
regex_transformers = {
    'break_line': re_breakline,
    'hiperlinks': re_hiperlinks,
    'dates': re_dates,
    'money': re_money,
    'numbers': re_numbers,
    'negation': re_negation,
    'special_chars': re_special_chars,
    'whitespaces': re_whitespaces
}
# Building a text prep pipeline
text_prep_pipeline = Pipeline([
    ('regex', ApplyRegex(regex_transformers)),
    ('stopwords', StopWordsRemoval(PT_STOPWORDS)),
    ('stemming', StemmingProcess(RSLPStemmer())),
    # Note: integer min_df=0 / max_df=1 are absolute document counts, not proportions
    ('vectorizer', TfidfVectorizer(max_features=300, min_df=0, max_df=1, stop_words=PT_STOPWORDS))
])
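
One caveat worth flagging: fit_transform below refits the TF-IDF vocabulary on each single comment, so the resulting columns are not guaranteed to line up with the features the classifier was trained on (hence the reshape to 500 columns). If the pipeline fitted at training time was persisted under PIPELINES_PATH (defined above but otherwise unused), reusing it would keep the feature space consistent; a minimal sketch, where the file name is an assumption:

# Hypothetical: reuse the pipeline fitted at training time instead of refitting per comment
fitted_pipeline_file = os.path.join(PIPELINES_PATH, 'text_prep_pipeline.pkl')  # assumed file name
if os.path.exists(fitted_pipeline_file):
    fitted_text_pipeline = load(fitted_pipeline_file)
    # transform() keeps the training vocabulary; fit_transform() rebuilds it per call
    sample_features = fitted_text_pipeline.transform(['entrega rapida, produto excelente'])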

General Sentiment Analysis

model_path = os.environ.get('MODEL_PATH', 'models/sentiment_clf_model.pkl')
model = load(model_path)  # joblib.load accepts a path directly; no BytesIO needed
sent_anls = []
for i in range(len(df_prep)):
    try:
        comment_prep = text_prep_pipeline.fit_transform([df_prep['review_comment_message'][i]])
        m_array = csr_matrix(comment_prep, shape=(1, 500)).toarray()  # the classifier expects 500 features
        sent_anls.append(model.predict(m_array)[0])
    except Exception as e:
        print(f"key {i}: error {e}")

sent_dict = {
    'sentiment': sent_anls,
    'sentiment_label': ['positive' if x == 1 else 'negative' for x in sent_anls]
}

sent_df = pd.DataFrame(sent_dict)

fig, ax = plt.subplots(figsize=(7, 7))
donut_plot(sent_df.query('sentiment_label in ("positive", "negative")'), 'sentiment_label', 
           label_names=sent_df.query('sentiment_label in ("positive", "negative")')['sentiment_label'].value_counts().index,
           ax=ax, colors=['darkslateblue', 'crimson'])
ax.set_title("General Sentiment Analysis")
plt.show()

[Image: output_nlp_gral, general sentiment donut plot]
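
Since the Goal asks for the percentage of positive reviews, that number can be read straight off sent_df; a minimal sketch:

# Share of positive predictions in the external reviews
positive_pct = 100 * (sent_df['sentiment'] == 1).mean()
print('Positive reviews: {:.1f}%'.format(positive_pct))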

Categories Sentiment Analysis

import json

with open('../Scraping/categories/brazil.json', 'r') as f:
    category = json.load(f)

files = list(category)  # one scraped CSV per category key
# Functions to modularize the process

def read_prep_data(path):
    # Read one category CSV and apply the same initial prep as the main dataset
    df = import_data(path, verbose=False)
    df.drop('Unnamed: 0', axis=1, inplace=True)
    df.rename(columns={'comment': 'review_comment_message', 'score': 'review_score'}, inplace=True)
    df_prep = initial_prep_pipeline.fit_transform(df)
    df_prep.reset_index(drop=True, inplace=True)

    return df_prep

def nlp_predic(data):
    # Run the sentiment classifier over every comment in the prepped data
    sent_anls = []
    for i in range(len(data)):
        try:
            comment_prep = text_prep_pipeline.fit_transform([data['review_comment_message'][i]])
            m_array = csr_matrix(comment_prep, shape=(1, 500)).toarray()
            sent_anls.append(model.predict(m_array)[0])
        except Exception:
            pass  # skip comments that fail preprocessing or prediction

    sent_dict = {
        'sentiment': sent_anls,
        'sentiment_label': ['positive' if x == 1 else 'negative' for x in sent_anls]
    }

    return pd.DataFrame(sent_dict)
# Grid positions for the 4x5 figure below (assumes up to 20 categories)
slots = [(i, j) for i in range(4) for j in range(5)]
fig, axs = plt.subplots(4, 5, figsize=(30, 24))
for i, file in enumerate(files):
    path = '../Scraping/External_data/{}.csv'.format(file)
    data = read_prep_data(path)
    prediction = nlp_predic(data)

    donut_plot(prediction.query('sentiment_label in ("positive", "negative")'), 'sentiment_label', 
        label_names=prediction.query('sentiment_label in ("positive", "negative")')['sentiment_label'].value_counts().index,
        ax=axs[slots[i]], colors=['darkslateblue', 'crimson'])
    axs[slots[i]].set_title('Category: {}'.format(file))

plt.show()

[Image: output, grid of per-category sentiment donut plots]
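
For the dashboard, the per-category positive percentages the Goal asks for can also be collapsed into a single table instead of twenty donuts; a minimal sketch reusing read_prep_data and nlp_predic:

# One row per category with its share of positive predictions
rows = []
for file in files:
    prediction = nlp_predic(read_prep_data('../Scraping/External_data/{}.csv'.format(file)))
    rows.append({'category': file, 'positive_pct': 100 * (prediction['sentiment'] == 1).mean()})

summary = pd.DataFrame(rows).sort_values('positive_pct', ascending=False)
print(summary)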

leopensaa commented 2 years ago

Reopened this issue. It's not really clear how to proceed with this chart for the dashboard, and it has to be discussed with the squad in today's Master Session.