import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import FastText
from sklearn.feature_extraction.text import TfidfVectorizer
from difflib import SequenceMatcher
When I execute the code below, TfidfVectorizer raises "ValueError: empty vocabulary; perhaps the documents only contain stop words":
# Build a TF-IDF matrix for one pair of consecutive queries: rows i and i+1 of
# convid['Query_Text'], each converted to str and lower-cased first.
vectorizer = TfidfVectorizer(
    decode_error='ignore',
    strip_accents='unicode',
    stop_words='english',
    min_df=1,
    analyzer='word',
)
# NOTE(review): fit_transform is the call that raises the reported
# "ValueError: empty vocabulary". Per the scikit-learn documentation this
# happens when no term survives tokenization and stop-word removal in any of
# the supplied documents -- confirm against the offending pair of queries.
tfidf = vectorizer.fit_transform([
    str(convid['Query_Text'][i]).lower(),
    str(convid['Query_Text'][i + 1]).lower(),
])
Python code:
# Dependencies of the script below, one import statement per line.
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import FastText
from sklearn.feature_extraction.text import TfidfVectorizer
from difflib import SequenceMatcher
# Load the query text used to build the corpus
# Read the corpus CSV into a DataFrame.
qt = pd.read_csv('C:/Demo/Query_Text.csv')
# Bare expression: shows the (rows, columns) tuple in an interactive session
# and has no effect when run as a script.
qt.shape
# Load the query text to be compared
# Read the CSV whose Query_Text rows are compared pairwise.
convid = pd.read_csv('C:/Demo/ConvId_May06.csv')
# Bare expression: only displayed in an interactive session.
convid.shape
# Second name for the corpus frame; not used by the code visible here.
sentences = qt

# Order rows by user and then by event time (both ascending).
convid = convid.sort_values(
    ['User_PUID', 'EventInfo_Time'],
    ascending=[True, True],
)
# Renumber the rows 0..n-1 after sorting. `drop` is not set, so pandas keeps
# the previous index as an extra column.
convid = convid.reset_index()

# Result columns, initialised to float() (0.0) for every row.
convid['FastTextResult'] = float()
convid['Tfidf'] = float()
convid['TfidfWc'] = float()

# Train a FastText model on the corpus (sg=0 selects CBOW per the gensim docs).
# NOTE(review): the whole DataFrame `qt` is passed as the corpus; gensim
# documents this argument as an iterable of token lists -- confirm that `qt`
# is in the shape the model expects.
model_ted = FastText(qt, window=1, min_count=1, sg=0)
# Walk the rows in order, printing each index as a progress trace.
for i in range(len(convid['Query_Text'])):
    print('i', i)
    # Leave the loop once the last row is reached.
    if i == len(convid) - 1:
        break

# Write the frame, including the result columns, to the output CSV.
convid.to_csv('C:/Demo/ConvIdOutput_May061.csv')