pigbreeder / CodeMemo


new word discovery #11

Open testpppppp opened 1 year ago

testpppppp commented 1 year ago

https://blog.csdn.net/qq_43391414/article/details/112912107 # TfidfVectorizer practical guide

import math
import re
import numpy as np
from collections import defaultdict
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
# https://spaces.ac.cn/archives/3913
# [Chinese word segmentation series] 2. Segmentation-based new word discovery
# Pipeline (a toy end-to-end run is sketched after this snippet):
# 1. split the text to get candidate new-word spans
# 2. use tf-idf to pick out the important words
# 3. look those words up in the items of the problematic category
# 4. add those words to augment the data

def find_ngrams(input_list, n):
    return zip(*[input_list[i:] for i in range(n)])

def token(x):
    # replace punctuation with spaces, then keep only tokens longer than 3 characters
    x = x.translate({ord(c): " " for c in "!@$%^&*()[]{};:,./<>?\\|`~-=+"})
    return [xx for xx in x.split(' ') if len(xx) > 3]

def get_tfidf_top_features(documents, index=None, n_top=10):
    # rank features by summed tf-idf weight; `index` optionally restricts the sum to a subset of rows
    tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=5, tokenizer=token)
    tfidf = tfidf_vectorizer.fit_transform(documents)
    if index is not None:
        importance = np.argsort(np.asarray(tfidf[index].sum(axis=0)).ravel())[::-1]
    else:
        importance = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[::-1]
    tfidf_feature_names = np.array(tfidf_vectorizer.get_feature_names_out())  # get_feature_names() on sklearn < 1.0
    return tfidf_feature_names[importance[:n_top]]

class Find_Words:
    def __init__(self, min_count=10, min_pmi=0, tokened=True):
        self.min_count = min_count
        self.min_pmi = min_pmi
        self.tokened = tokened
        self.chars, self.pairs = defaultdict(int), defaultdict(int)  # defaultdict(int): missing keys default to 0
        self.total = 0.
        self.corpus = []
        self.catchr = '_#_'
    def text_filter(self, texts):  # pre-split sentences so we don't keep meaningless strings (not Chinese, English, or digits)
        for a in tqdm(texts):
            if self.tokened:
                yield a.split()
                continue
            for t in re.split(u'[^\u4e00-\u9fa50-9a-zA-Z]+', a):  # split on any character that is not
                                                                  # Chinese, English, or a digit
                if t:
                    yield t
    def count(self, texts):  # count single-token frequencies and adjacent-pair frequencies
        for text in self.text_filter(texts):
            if not text:
                continue
            self.chars[text[0]] += 1
            for i in range(len(text)-1):
                self.chars[text[i+1]] += 1
                self.pairs[' '.join(text[i:i+2])] += 1
                self.total += 1
        self.chars = {i:j for i,j in self.chars.items() if j >= self.min_count}  # minimum-frequency filter
        self.pairs = {i:j for i,j in self.pairs.items() if j >= self.min_count}  # minimum-frequency filter
        self.strong_segments = set()
        for ii,j in self.pairs.items():  # use pointwise mutual information to find "tightly bound" neighbours
            i = ii.split(' ')
            _ = math.log(self.total*j/(self.chars[i[0]]*self.chars[i[1]]))
            if _ >= self.min_pmi:
                self.strong_segments.add(ii)
    def find_words(self, texts):  # assemble words from the counts above
        self.words = defaultdict(int)
        for text in self.text_filter(texts):
            if not text:
                continue
            s = text[0]
            newt = []
            for i in range(len(text)-1):
                if ' '.join(text[i:i+2]) in self.strong_segments:  # "tightly bound" pair: do not break here
                    s += self.catchr + text[i+1]
                else:
                    newt.append(s)
                    self.words[s] += 1  # otherwise break and count the preceding fragment as a word
                    s = text[i+1]
            self.words[s] += 1  # the final "word"
            newt.append(s)
            self.corpus.append(' '.join(newt))
        self.words = {i:j for i,j in self.words.items() if j >= self.min_count}  # final frequency filter

# df / bdf are the product dataframes from the task at hand (not defined in this snippet)
fw = Find_Words(10, 1)  # min_count=10, min_pmi=1
fw.count(df.product_name)
fw.find_words(df.product_name)
len(fw.words)
ind = bdf[bdf.annotated_path=='Kitchen & Dining | Cookware | Griddle & Grills'].index.tolist()
get_tfidf_top_features(fw.corpus, ind, 20)
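
The toy end-to-end run mentioned in the pipeline comment above: the corpus and the lowered thresholds are made up purely for illustration, and get_tfidf_top_features is skipped because its min_df=5 needs a larger corpus.

toy_corpus = [
    "cast iron griddle pan for kitchen stove",
    "cast iron griddle plate grill",
    "non stick frying pan for kitchen",
    "cast iron grill pan with griddle surface",
]
fw_demo = Find_Words(min_count=2, min_pmi=0)  # low thresholds for the tiny corpus
fw_demo.count(toy_corpus)
fw_demo.find_words(toy_corpus)
print(fw_demo.words)       # strong neighbours glued with '_#_', e.g. 'cast_#_iron_#_griddle'
print(fw_demo.corpus[:2])  # re-joined texts, usable as input to get_tfidf_top_features on real data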
testpppppp commented 1 year ago

https://investigate.ai/text-analysis/how-to-make-scikit-learn-natural-language-processing-work-with-japanese-chinese/

import re

def generate_ngrams(s, n):
    # Convert to lowercase
    s = s.lower()

    # Replace all non-alphanumeric characters with spaces
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)

    # Break the sentence into tokens, dropping empty tokens
    tokens = [token for token in s.split(" ") if token != ""]

    # Use the zip function to generate n-grams,
    # then concatenate the tokens of each n-gram and return them
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]
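
For reference, a quick check of the helper plus a hedged sketch of one way to plug such a function into scikit-learn, handing it to TfidfVectorizer's analyzer parameter so the default tokenization is bypassed; the bigram size and sample strings are illustrative only, and get_feature_names_out assumes scikit-learn >= 1.0.

from sklearn.feature_extraction.text import TfidfVectorizer

print(generate_ngrams("Cast-iron griddle pan", 2))
# ['cast iron', 'iron griddle', 'griddle pan']

# analyzer= replaces sklearn's whole tokenization pipeline with our function
vec = TfidfVectorizer(analyzer=lambda doc: generate_ngrams(doc, 2))
X = vec.fit_transform(["cast iron griddle pan", "non stick frying pan"])
print(vec.get_feature_names_out())  # bigram vocabulary

Note that this regex keeps only [a-zA-Z0-9], so for Chinese or Japanese the character range would need widening (as with the \u4e00-\u9fa5 range in Find_Words above) or the text would need pre-segmentation.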
testpppppp commented 1 year ago

u2 new word discovery

https://spaces.ac.cn/archives/3491
https://spaces.ac.cn/archives/3913
https://spaces.ac.cn/archives/6540
https://github.com/bojone/nlp-zero/blob/master/nlp_zero.py
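
The core score behind these posts (and the Find_Words.count method above) is pointwise mutual information between adjacent segments; a quick hand check of the formula with made-up counts:

import math

# PMI(a, b) = log( N * count(ab) / (count(a) * count(b)) ), with N = total adjacent pairs
# made-up counts: N = 21, count("cast iron") = 3, count("cast") = count("iron") = 3
pmi = math.log(21 * 3 / (3 * 3))
print(round(pmi, 3))  # 1.946 -> above min_pmi, so the pair is kept as a "strong segment"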