Open rbyh opened 1 week ago
Suggest revising get_question_names():
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the NLTK resources used below if they are not already available.
# 'stopwords' backs stopwords.words('english'); the Punkt data backs word_tokenize.
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of the
# pickled 'punkt' package, so both are requested to work across NLTK versions.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
def sanitize_string(input_string, used_names):
    """
    Derive a short, unique question name from the key words of a question.

    The question is lower-cased and tokenized; stopwords (NLTK's English list
    plus a handful of survey-specific filler words) and any token that is not
    purely alphabetic are dropped. Candidate names are then built from the
    remaining words, beginning with the last word alone and prepending one
    earlier word at a time, until a candidate is found that is not yet in
    ``used_names``.

    :param input_string: str - The input question string.
    :param used_names: set - Names already taken. The chosen name is added to
        this set in place.
    :return: str - A unique sanitized string (words joined by underscores).
    :raises ValueError: If every candidate is already in ``used_names``, or if
        no word is left after filtering.
    """
    # Survey-specific filler words that carry no distinguishing meaning.
    filler_words = {
        'much', 'your', 'any', 'all', 'did', 'out', 'is', 'what', 'how',
        'which', 'total', 'combined', 'members', 'household', 'last', 'year',
    }
    ignored = set(stopwords.words('english')) | filler_words

    # Keep only alphabetic tokens that are not in the ignore list.
    keywords = []
    for token in word_tokenize(input_string.lower()):
        if token.isalpha() and token not in ignored:
            keywords.append(token)

    # Grow the candidate from the end of the question: last word, then the
    # last two words, and so on, until one is free.
    for suffix_length in range(1, len(keywords) + 1):
        name = '_'.join(keywords[-suffix_length:])
        if name in used_names:
            continue
        used_names.add(name)
        return name.lower()

    # Every suffix was taken (or there were no usable words at all).
    raise ValueError("Unable to generate a unique question name from provided string.")
Let me know what you think of the new version!
The current logic cuts off many words at 25 characters, which makes the question names too hard to remember, and it's not clear whether they are only cut off when printed or are actually cut off in the stored names: