Open testpppppp opened 2 years ago
import re
def generate_ngrams(s, n):
    """Return the word-level n-grams of ``s`` as space-joined strings.

    The text is lowercased, every character that is not an ASCII letter,
    a digit or whitespace is replaced by a space, and the result is split
    into tokens on runs of whitespace. Each n-gram is ``n`` consecutive
    tokens joined by a single space.

    Args:
        s: Input text.
        n: Number of tokens per n-gram.

    Returns:
        A list of n-gram strings in order of appearance. The list is empty
        when ``s`` has fewer than ``n`` tokens or when ``n`` is less than 1.
    """
    # Lowercase first so the resulting n-grams are case-insensitive.
    s = s.lower()
    # Replace every non-alphanumeric, non-whitespace character with a space.
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    # split() with no argument splits on any run of whitespace (spaces,
    # tabs, newlines) and never yields empty tokens, so no filtering is
    # needed and tabs/newlines kept by the regex cannot leak into tokens.
    tokens = s.split()
    # Zip n views of the token list, each shifted one position further, so
    # the i-th tuple is tokens[i:i + n]; zip stops at the shortest view.
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]
https://blog.csdn.net/qq_43391414/article/details/112912107 # TfidfVectorizer in practice