python-telegram-bot / python-telegram-bot

We have made you a wrapper you can't refuse
https://python-telegram-bot.org

[QUESTION] Improving keyword recognition and extraction in a Telegram bot using Python #4421

Closed: EdPyMy closed this issue 2 months ago

EdPyMy commented 2 months ago

Issue I am facing

Hello,

I am developing a Telegram bot in Python and facing challenges with keyword recognition and extraction. Despite trying various methods, including basic string matching, regular expressions, fuzzy matching (fuzzywuzzy, difflib), and NLP libraries (spaCy, NLTK), I am unable to achieve consistent keyword recognition or efficient performance on larger datasets.

What I've tried so far:

Problems:

Traceback to the issue

import re  # Regular expressions
import difflib  # Sequence comparison
import spacy  # spaCy NLP library
import nltk  # NLTK NLP library
from fuzzywuzzy import fuzz  # Fuzzy string matching
from nltk.corpus import stopwords  # NLTK stop-word lists
from nltk.stem import WordNetLemmatizer  # NLTK lemmatizer
from telebot import types  # telebot types for building keyboards
import config  # Project settings
import time  # Pauses between messages
from button_functions import send_buttons  # Helper that sends buttons
from buttons import (  # Helpers that send the various button sets
    send_calculators_buttons,
    send_loan_buttons,
    send_sub_buttons_hipotek,
    send_sub_buttons_grav,
    send_sub_buttons_acra,
)
import logging  # Logging
from config import button_phrases  # Button phrases from config

# Load NLTK resources
nltk.download('punkt')  # Tokenizer models
nltk.download('stopwords')  # Stop-word lists
nltk.download('wordnet')  # WordNet data for the lemmatizer

# Load spaCy models
nlp_en = spacy.load("en_core_web_sm")  # English model
nlp_ru = spacy.load("ru_core_news_sm")  # Russian model
# nlp_hy = spacy.load("hy_core_news_sm")  # Armenian model

# Initialize the lemmatizer and the stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')).union(set(stopwords.words('russian'))).union(
    set(config.conjunctions))  # English and Russian stop words plus conjunctions from config

def normalize_text(text, lang='en'):  # Normalize text for matching
    text = re.sub(r'[^\w\s]', '', text)  # Keep only word characters and whitespace
    logging.info(f"Text after regex cleaning: {text}")

    # Remove conjunctions from the text
    words = text.split()
    words = [word for word in words if word not in config.conjunctions]
    text = ' '.join(words)
    logging.info(f"Text after removing conjunctions: {text}")

    if lang == 'ru':  # Russian
        doc = nlp_ru(text)
    elif lang == 'hy':  # Armenian: simple whitespace tokenization
        doc = text.split()
    else:  # English (default)
        doc = nlp_en(text)

    if lang == 'hy':  # Armenian tokens are plain strings
        normalized_tokens = [token.lower() for token in doc if not re.match(r'\W', token)]
    else:  # English or Russian: lowercase lemmas, skipping punctuation and whitespace
        normalized_tokens = [token.lemma_.lower() for token in doc
                             if not token.is_punct and not token.is_space]

    normalized_text = " ".join(normalized_tokens)
    logging.info(f"Normalized text: {normalized_text}")
    return normalized_text

def fuzzy_match(input_text, phrases):  # Find the phrase with the highest fuzz.ratio
    best_match = None
    highest_ratio = 0
    normalized_input = normalize_text(input_text)
    for phrase in phrases:
        normalized_phrase = normalize_text(phrase)  # Note: re-normalized on every call
        ratio = fuzz.ratio(normalized_input, normalized_phrase)
        if ratio > highest_ratio:
            highest_ratio = ratio
            best_match = phrase
    logging.info(f"Fuzzy match result: {best_match} with ratio: {highest_ratio}")
    return best_match, highest_ratio

def get_best_match(input_text, phrases):  # Find the closest phrase with difflib
    input_text = normalize_text(input_text)
    normalized_phrases = [normalize_text(phrase) for phrase in phrases]
    matches = difflib.get_close_matches(input_text, normalized_phrases, n=1, cutoff=0.8)
    if matches:
        best_match_index = normalized_phrases.index(matches[0])
        best_match = phrases[best_match_index]
        ratio = difflib.SequenceMatcher(None, input_text, matches[0]).ratio() * 100  # Match percentage
        logging.info(f"Best match found: {best_match} with ratio: {ratio}")
        return best_match, ratio
    logging.info("No match found")
    return None, 0

def find_best_match(input_text, config_section):  # Fuzzy match first, difflib as fallback
    best_match, highest_ratio = fuzzy_match(input_text, config_section)
    if highest_ratio < 80:  # Fall back when the fuzzy score is too low
        best_match, highest_ratio = get_best_match(input_text, config_section)
    return best_match, highest_ratio

def get_matching_keywords(message, threshold=81):  # Match a message against configured phrases
    text = message.text.lower()
    logging.info(f"get_matching_keywords called with text: {text}")
    best_match = None
    highest_ratio = 0

    # Remove conjunctions
    text = ' '.join([word for word in text.split() if word not in config.conjunctions])
    logging.info(f"Text after removing conjunctions: {text}")

    # Check for matches with button phrases
    for button, phrases in config.button_phrases.items():
        for phrase in phrases:
            best_match_phrase, highest_ratio_phrase = fuzzy_match(text, [phrase.lower()])
            if highest_ratio_phrase > highest_ratio:
                best_match = button
                highest_ratio = highest_ratio_phrase

    # Check for matches with keywords
    for key, words in config.keywords.items():
        for word in words:
            best_match_word, highest_ratio_word = fuzzy_match(text, [word.lower()])
            if highest_ratio_word > highest_ratio:
                best_match = key
                highest_ratio = highest_ratio_word

    if highest_ratio >= threshold:
        logging.info(f"Best matched keyword: {best_match} with ratio: {highest_ratio}")
        return [best_match]
    logging.info(f"No exact match found, highest ratio: {highest_ratio}")
    return []

def remove_fuzzy_matches(text, phrases_dict, threshold=74):  # Remove fuzzily matched phrases from the text
    words = text.split()
    removed_phrases = []
    for key, phrases in phrases_dict.items():
        for phrase in phrases:
            phrase_words = phrase.split()
            phrase_len = len(phrase_words)
            indices_to_remove = set()
            for i in range(len(words) - phrase_len + 1):  # Slide a window over the text
                window_text = ' '.join(words[i:i + phrase_len])
                if fuzz.ratio(window_text, phrase) >= threshold:
                    indices_to_remove.update(range(i, i + phrase_len))
                    removed_phrases.append(key)
                    logging.info(f'Removed phrase: {phrase} (matched key: {key})')
                    break  # Stop after the first matching window
            if indices_to_remove:
                words = [word for i, word in enumerate(words) if i not in indices_to_remove]
                break  # Move to the next key after a match
    cleaned_text = ' '.join(words)
    logging.info(f"Cleaned text after removal: {cleaned_text}")
    return removed_phrases, cleaned_text

def handle_message(bot, message):  # Handle an incoming message
    try:
        if not hasattr(message, 'text') or not message.text:  # Ignore non-text messages
            bot.send_message(message.chat.id, "Message does not contain text", parse_mode='Markdown')
            return

        text = message.text.lower()
        logging.info(f"Received message: {text}")

        # Remove conjunctions before processing the text
        text = ' '.join([word for word in text.split() if word not in config.conjunctions])
        logging.info(f"Text after removing conjunctions: {text}")

        keywords, new_text = remove_fuzzy_matches(text, button_phrases)  # Strip button phrases first
        logging.info(f"Keywords after button phrase removal: {keywords}")

        keyword_matches, _ = remove_fuzzy_matches(new_text, config.keywords)  # Then strip keywords
        keywords.extend(keyword_matches)
        logging.info(f'Final keywords: {keywords}')

        if keywords:
            send_buttons(bot, message, keywords)  # Reply with the matching buttons
        else:
            bot.send_message(message.chat.id, "No relevant materials found with the entered content",
                             parse_mode='Markdown')
    except Exception as e:
        logging.error(f"Error handling message: {e}")
        bot.send_message(message.chat.id, "An error has occurred", parse_mode='Markdown')

def open_site(bot, message):  # Send the site link
    bot.send_message(message.chat.id,
                     'Visit the ABC Finance website: <a href="https://www.abcfinance.am/">ABC Finance</a>',
                     parse_mode='HTML')

def open_loan_calculator(bot, message):  # Send the loan calculator link
    bot.send_message(message.chat.id,
                     'Use the Loan Calculator: <a href="https://www.abcfinance.am/calculators/loancalc.html">Loan Calculator</a>',
                     parse_mode='HTML')

def open_deposit_calculator(bot, message):  # Send the deposit calculator link
    bot.send_message(message.chat.id,
                     'Use the Deposit Calculator: <a href="https://www.abcfinance.am/calculators/depositcalc.html">Deposit Calculator</a>',
                     parse_mode='HTML')

def open_pension_calculator(bot, message):  # Send the pension calculator link
    bot.send_message(message.chat.id,
                     'Use the Pension Calculator: <a href="https://www.abcfinance.am/calculators/pensioncalc.html">Pension Calculator</a>',
                     parse_mode='HTML')

def open_salary_calculator(bot, message):  # Send the salary calculator link
    bot.send_message(message.chat.id,
                     'Use the Salary Calculator: <a href="https://www.abcfinance.am/calculators/salarycalc.html">Salary Calculator</a>',
                     parse_mode='HTML')

def send_start_message(bot, message):  # Send the welcome sequence
    bot.send_message(message.chat.id, f'Hello, {message.from_user.first_name}!')
    time.sleep(2)
    bot.send_message(message.chat.id,
                     'Choose the <b>“Financial Education”</b> section, then the desired topic, and you will receive the necessary information',
                     parse_mode='HTML')
    time.sleep(2)
    bot.send_message(message.chat.id,
                     'By taking the <b>“Financial Test”</b>, you will check your financial knowledge and get ways to improve it',
                     parse_mode='HTML')
    time.sleep(2)
    bot.send_message(message.chat.id,
                     'To use the calculators, click on the blue <b>“Menu”</b> button on the left',
                     parse_mode='HTML')
    time.sleep(2)
    markup = types.ReplyKeyboardMarkup(row_width=2, resize_keyboard=True)
    itembtn1 = types.KeyboardButton('Financial Education')
    itembtn2 = types.KeyboardButton('Financial Test')
    markup.add(itembtn1, itembtn2)
    bot.send_message(message.chat.id,
                     'You can enter the words <b>“menu”</b> or press the <b>“Main Menu”</b> button to return to the main menu',
                     parse_mode='HTML', reply_markup=markup)

def send_main_menu_buttons(bot, message):  # Send the main menu keyboard
    markup = types.ReplyKeyboardMarkup(row_width=2, resize_keyboard=True)
    itembtn1 = types.KeyboardButton('Financial Education')
    itembtn2 = types.KeyboardButton('Financial Test')
    markup.add(itembtn1, itembtn2)
    bot.send_message(message.chat.id, "Choose the appropriate section", reply_markup=markup)

def send_finlearn_buttons(bot, message):  # Send the financial education keyboard
    markup = types.ReplyKeyboardMarkup(row_width=4, resize_keyboard=True)
    loan_btn = types.KeyboardButton('Loan')
    main_menu_btn = types.KeyboardButton('Main Menu')
    markup.row(loan_btn)
    markup.row(main_menu_btn)
    bot.send_message(message.chat.id, "Let's learn together", reply_markup=markup)

Related part of your code

No response

Operating System

Win 11

Version of Python, python-telegram-bot & dependencies

Python 3.12.2

Bibo-Joshi commented 2 months ago

Hi. Keyword recognition is not a task that PTB covers. How you process the data that you receive from Telegram is beyond the scope of PTB as a bot API wrapper. If you have an issue with a specific string processing library, please ask in that library's tracker.
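For illustration, a minimal sketch of that separation in python-telegram-bot v20+: the library only delivers the update, and the keyword logic is ordinary Python you can swap freely. Here extract_keywords stands in for whatever string-processing routine you choose, and "TOKEN" is a placeholder:

from telegram import Update
from telegram.ext import Application, ContextTypes, MessageHandler, filters

def extract_keywords(text: str) -> list[str]:
    # Placeholder: plug in spaCy, NLTK, rapidfuzz, or anything else here.
    return [w for w in text.lower().split() if len(w) > 3]

async def on_text(update: Update, context: ContextTypes.DEFAULT_TYPE) -> None:
    # PTB hands over the message text; the matching logic is plain Python.
    keywords = extract_keywords(update.message.text)
    await update.message.reply_text(f"Matched: {keywords}" if keywords else "No match")

def main() -> None:
    app = Application.builder().token("TOKEN").build()  # "TOKEN" is a placeholder
    app.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, on_text))
    app.run_polling()

if __name__ == "__main__":
    main()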