cdqa-suite / cdQA

⛔ [NOT MAINTAINED] An End-To-End Closed Domain Question Answering System.
https://cdqa-suite.github.io/cdQA-website/
Apache License 2.0
616 stars 191 forks source link

ValueError: empty vocabulary; perhaps the documents only contain stop words in TfidfVectorizer #365

Open DSKaarthick opened 4 years ago

DSKaarthick commented 4 years ago

when i am executing the below code ,i am getting ValueError: empty vocabulary; perhaps the documents only contain stop words in TfidfVectorizer error:

vectorizer = TfidfVectorizer(decode_error='ignore',strip_accents='unicode',stop_words='english',min_df=1,analyzer='word') tfidf= vectorizer.fit_transform([str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower()])

Python Code: """

import pandas as pd import nltk from nltk.corpus import stopwords from nltk.tokenize import word_tokenize import gensim from gensim.models import FastText from sklearn.feature_extraction.text import TfidfVectorizer from difflib import SequenceMatcher

Loading Query Text for Corpus Building

qt=pd.read_csv('C:/Demo/Query_Text.csv') qt.shape

Loading QueryText for comparing

convid =pd.read_csv('C:/Demo/ConvId_May06.csv')

convid.shape

sentences = qt

convid = convid.sort_values(['User_PUID','EventInfo_Time'], ascending=[True,True]) convid = convid.reset_index() convid['FastTextResult'] =float() convid['Tfidf'] = float() convid['TfidfWc'] = float()

model_ted = FastText(qt, window=1, min_count=1, sg=0)

for i in range(len(convid['Query_Text'])): print('i',i) if(i == len(convid)-1): break

print("The FastText Output")

ft=model_ted.wv.similarity(str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower())
ft=round(ft,4)
print(ft)
#convid.loc[convid['FastTextResult'][i+1]]=ft
convid['FastTextResult'][i+1]=ft

print("vector output ")

vectorizer = TfidfVectorizer(decode_error='ignore',strip_accents='unicode',stop_words='english',min_df=1,analyzer='word')
tfidf= vectorizer.fit_transform([str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower()])
product=(tfidf *tfidf.T).A
pro = product[0,1]
pro = round(pro,4)
print(pro)
convid['Tfidf'][i+1]=pro

print("widlchar")
wildchar_value = SequenceMatcher(str(convid['Query_Text'][i]).lower(),str(convid['Query_Text'][i+1]).lower()).ratio()

wildchar_value = round(wildchar_value,4)

convid['TfidfWc'][i+1]=wildchar_value
print(wildchar_value)

convid.to_csv('C:/Demo/ConvIdOutput_May061.csv')