PrasenjeetSaha / Literature

Materials for AI/ML

Sentiment Analysis #1

Open PrasenjeetSaha opened 1 year ago

PrasenjeetSaha commented 1 year ago

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv('sentiment_data.csv')  # Replace 'sentiment_data.csv' with your dataset file

# Split the data into training and testing sets
X = data['text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data into numerical features
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train the sentiment classifier
classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)

# Predict sentiment on the test set
y_pred = classifier.predict(X_test_vec)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
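The next comment reloads a trained model and vectorizer, so the fitted objects from the script above need to be saved somewhere first. A minimal sketch of one way to do that with joblib (the file names are placeholders, not part of the original code):

import joblib

# Persist the fitted classifier and vectorizer for later reuse
# (file names are placeholders; use whatever paths suit your project)
joblib.dump(classifier, 'sentiment_model.joblib')
joblib.dump(vectorizer, 'count_vectorizer.joblib')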

PrasenjeetSaha commented 1 year ago

import pandas as pd
import joblib

# Load the trained model and vectorizer saved during training
# (the file names below are placeholders; use the paths you saved to)
model = joblib.load('sentiment_model.joblib')
vectorizer = joblib.load('count_vectorizer.joblib')

# Load the dataset or prepare your own data
new_data = pd.DataFrame({'text': ['This is a positive sentence', 'I am not happy with this product']})

# Convert the new text data into numerical features using the fitted vectorizer
new_data_vec = vectorizer.transform(new_data['text'])

# Predict the sentiment using the trained model
sentiment_pred = model.predict(new_data_vec)

# Print the predicted sentiment
for text, sentiment in zip(new_data['text'], sentiment_pred):
    print(f"Text: {text}")
    print(f"Sentiment: {sentiment}")
    print()

PrasenjeetSaha commented 1 year ago

import gpt_2_simple as gpt2
import tensorflow as tf

# Download the GPT-2 model
gpt2.download_gpt2(model_name='124M')

# Start a TensorFlow session
sess = gpt2.start_tf_sess()

# Load the GPT-2 model checkpoint
gpt2.load_gpt2(sess, model_name='124M')

# Chatbot loop
while True:
    user_input = input("User: ")

    # Generate a response using the GPT-2 model
    response = gpt2.generate(sess, model_name='124M', prefix=user_input,
                             length=50, temperature=0.7, return_as_list=True)[0]

    print("Chatbot: " + response)

PrasenjeetSaha commented 1 year ago

import os
import pandas as pd
from extract_msg import Message

folder_path = "C:/Data"
df_data = []

for file in os.listdir(folder_path):
    if file.endswith(".msg"):
        file_path = os.path.join(folder_path, file)
        try:
            with Message(file_path) as msg:
                attachment = msg.attachments['content.html']
                text = attachment.data.decode('utf-8')
                df_data.append({'filename': file, 'text': text})
        except Exception as e:
            print(f"Error processing file '{file}': {e}")

df = pd.DataFrame(df_data)

PrasenjeetSaha commented 1 year ago

import os
import pandas as pd
from extract_msg import Message

folder_path = "C:/Data"
df_data = []

for file in os.listdir(folder_path):
    if file.endswith(".msg"):
        file_path = os.path.join(folder_path, file)
        try:
            with Message(file_path) as msg:
                for att in msg.attachments:
                    if att.long_filename == "content.html":
                        text = att.data.decode('utf-8')
                        df_data.append({'filename': file, 'text': text})
                        break
        except Exception as e:
            print(f"Error processing file '{file}': {e}")

df = pd.DataFrame(df_data)

PrasenjeetSaha commented 1 year ago

import os
import pandas as pd
from extract_msg import Message
from bs4 import BeautifulSoup
from datetime import datetime

folder_path = "C:/Data"
df_data = []

for file in os.listdir(folder_path):
    if file.endswith(".msg"):
        file_path = os.path.join(folder_path, file)
        try:
            with Message(file_path) as msg:
                for att in msg.attachments:
                    if att.long_filename == "content.html":
                        html_content = att.data.decode('utf-8')
                        soup = BeautifulSoup(html_content, 'html.parser')
                        email_list = soup.find_all('div', class_='email')

                        if email_list:
                            latest_email = email_list[-1]
                            sender = latest_email.find('span', class_='sender-name').text
                            date_str = latest_email.find('span', class_='datetime').text
                            latest_date = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')

                            previous_date = None
                            for email in reversed(email_list[:-1]):
                                prev_sender = email.find('span', class_='sender-name').text
                                prev_date_str = email.find('span', class_='datetime').text
                                prev_date = datetime.strptime(prev_date_str, '%Y-%m-%d %H:%M:%S')

                                if prev_sender == sender:
                                    previous_date = prev_date
                                    break

                            df_data.append({
                                'filename': file,
                                'latest_sender': sender,
                                'latest_email_date': latest_date,
                                'previous_email_date': previous_date
                            })
                        break
        except Exception as e:
            print(f"Error processing file '{file}': {e}")

df = pd.DataFrame(df_data)

PrasenjeetSaha commented 1 year ago

Analytical scheduling, guided by past data, enhances efficiency for Tier 1 and Tier 2 teams:

  1. Task Timing: Allocate documentation reviews during low-demand periods, ensuring focus on KYC verification during peak times.
  2. Delay Prevention: Predictive analysis anticipates downstream team delays, allowing for proactive adjustments.
  3. Priority Automation: Automate high-priority tasks like urgent verifications, ensuring timely completion.
  4. Real-time Optimization: Monitor and adjust resource allocation for smoother product activations.
  5. Resource Efficiency: Utilize data to address recurring delays, optimizing KYC verification processes.

Data-driven scheduling streamlines operations, reduces delays, and improves banking product implementations for commercial clients.
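As a rough illustration of points 1-3 (a minimal sketch, not a production scheduler), the snippet below assumes a pandas DataFrame of historical task records with hypothetical columns task_type, hour_received, handle_minutes, and priority. Low-demand hours are inferred from past volume, and incoming tasks predicted to exceed an illustrative 30-minute threshold are flagged for escalation; all names and thresholds are placeholders.

import pandas as pd

# Hypothetical historical task log; in practice this would come from a workflow/ticketing export
history = pd.DataFrame({
    'task_type':      ['kyc_verification', 'doc_review', 'kyc_verification', 'doc_review'],
    'hour_received':  [10, 14, 11, 15],
    'handle_minutes': [45, 20, 60, 25],
    'priority':       ['high', 'low', 'high', 'low'],
})

# 1. Task timing: find low-demand hours from historical volume and earmark them for documentation reviews
volume_by_hour = history.groupby('hour_received').size()
low_demand_hours = volume_by_hour[volume_by_hour <= volume_by_hour.median()].index.tolist()
print("Schedule documentation reviews in hours:", low_demand_hours)

# 2. Delay prevention: use the historical average handle time per task type as a simple predictor
expected_minutes = history.groupby('task_type')['handle_minutes'].mean()

# 3. Priority automation: flag incoming tasks predicted to breach a (hypothetical) 30-minute threshold
incoming = pd.DataFrame({'task_type': ['kyc_verification', 'doc_review'], 'priority': ['high', 'low']})
incoming['predicted_minutes'] = incoming['task_type'].map(expected_minutes)
incoming['escalate'] = (incoming['priority'] == 'high') | (incoming['predicted_minutes'] > 30)
print(incoming)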

PrasenjeetSaha commented 1 year ago

import spacy
import re
from dateutil.parser import parse

def extract_info_with_spacy(input_text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(input_text)

    # Regular expressions for validating client ID and transaction ID
    client_id_pattern = r'^\d{1,11}$'
    transaction_id_pattern = r'^\d{4}-\d{5}-\d{5}$'

    # Initialize variables to store extracted information
    entities = {
        'client_id': None,
        'transaction_id': None,
        'date1': None,
        'date2': None,
        'term': None
    }

    # Flags to keep track of whether an entity is already recognized
    client_id_found = False
    transaction_id_found = False

    # Extract client ID, transaction ID, and dates from the parsed doc
    for ent in doc.ents:
        if not client_id_found and ent.label_ == 'CARDINAL' and len(ent.text) <= 11:
            context = input_text[max(0, ent.start_char - 15):min(len(input_text), ent.end_char + 15)].lower()
            if any(word in context for word in ['client', 'client id', 'client with id']):
                if re.match(client_id_pattern, ent.text):
                    entities['client_id'] = ent.text
                    client_id_found = True
                    continue  # Move to the next entity

        elif not transaction_id_found and ent.label_ == 'CARDINAL' and '-' in ent.text and ent.text.count('-') == 2:
            context = input_text[max(0, ent.start_char - 15):min(len(input_text), ent.end_char + 15)].lower()
            if 'transaction' in context or 'transaction id' in context:
                if re.match(transaction_id_pattern, ent.text):
                    entities['transaction_id'] = ent.text
                    transaction_id_found = True
                    continue  # Move to the next entity

        if ent.label_ == 'DATE':
            if not entities['date1']:
                entities['date1'] = ent.text
                continue  # Move to the next entity
            elif not entities['date2']:
                entities['date2'] = ent.text
                continue  # Move to the next entity

    # Extract term if there is a numerical quantity followed by a time period
    for token in doc:
        if token.like_num and token.i + 1 < len(doc):
            next_token = doc[token.i + 1]
            if next_token.text.lower() in {'day', 'days', 'week', 'weeks', 'month', 'months', 'year', 'years'}:
                entities['term'] = token.text + ' ' + next_token.text
                break

    # Check for month-year format dates (e.g., "March 2023") and convert them to "mm/dd/yyyy" format
    if entities['date1'] and not entities['date1'].count('/') == 2:
        try:
            parsed_date = parse(entities['date1'], fuzzy=True)
            entities['date1'] = parsed_date.strftime("%m/%d/%Y")
        except ValueError:
            pass

    if entities['date2'] and not entities['date2'].count('/') == 2:
        try:
            parsed_date = parse(entities['date2'], fuzzy=True)
            entities['date2'] = parsed_date.strftime("%m/%d/%Y")
        except ValueError:
            pass

    return entities

# Test the function with example inputs
inputs = [
    "i need to know the payment status for client 12345678910",
    "show me the payment history for client 93216956291",
    "what are all the transactions for client with id 10386392619 since March this year",
    "i want the transactions done by client 94017382949 from 2/11/2022 to 3/6/2023",
    "what is the transaction status for transaction number 16339",
    "i need transaction status for March 2023",
]

for input_text in inputs:
    entities = extract_info_with_spacy(input_text)
    print("Input Text:", input_text)
    print("Entities:", entities)
    print("=" * 50)