Open zengbin93 opened 6 years ago
gensim - 源码阅读笔记
代码: https://github.com/RaRe-Technologies/gensim
corpora模块实现了多个语料库对象。
from .indexedcorpus import IndexedCorpus # noqa:F401
# must appear before the other classes
from .mmcorpus import MmCorpus # noqa:F401
from .bleicorpus import BleiCorpus # noqa:F401
from .svmlightcorpus import SvmLightCorpus # noqa:F401
from .lowcorpus import LowCorpus # noqa:F401
from .dictionary import Dictionary # noqa:F401
from .hashdictionary import HashDictionary # noqa:F401
from .wikicorpus import WikiCorpus # noqa:F401
from .textcorpus import TextCorpus, TextDirectoryCorpus # noqa:F401
from .ucicorpus import UciCorpus # noqa:F401
from .malletcorpus import MalletCorpus # noqa:F401
models模块实现了多个文本向量化方法。
from .coherencemodel import CoherenceModel # noqa:F401
from .hdpmodel import HdpModel # noqa:F401
from .ldamodel import LdaModel # noqa:F401
from .lsimodel import LsiModel # noqa:F401
from .tfidfmodel import TfidfModel # noqa:F401
from .rpmodel import RpModel # noqa:F401
from .logentropy_model import LogEntropyModel # noqa:F401
from .word2vec import Word2Vec # noqa:F401
from .doc2vec import Doc2Vec # noqa:F401
from .keyedvectors import KeyedVectors # noqa:F401
from .ldamulticore import LdaMulticore # noqa:F401
from .phrases import Phrases # noqa:F401
from .normmodel import NormModel # noqa:F401
from .atmodel import AuthorTopicModel # noqa:F401
from .ldaseqmodel import LdaSeqModel # noqa:F401
from .fasttext import FastText # noqa:F401
from .translation_matrix import TranslationMatrix, \
BackMappingTranslationMatrix # noqa:F401
parsing模块实现的主要是一些针对英文原始文本(raw text)的预处理函数,比如:词干提取(PorterStemmer)、删除停用词(remove_stopwords)等。
from .porter import PorterStemmer # noqa:F401
from .preprocessing import (remove_stopwords, strip_punctuation, strip_punctuation2, # noqa:F401
strip_tags, strip_short, strip_numeric,
strip_non_alphanum, strip_multiple_whitespaces,
split_alphanum, stem_text, preprocess_string,
preprocess_documents, read_file, read_files)
similarities模块提供了几个文本的相似性查询类。
from .docsim import Similarity, \
MatrixSimilarity, \
SparseMatrixSimilarity, \
SoftCosineSimilarity, \
WmdSimilarity # noqa:F401
在sklearn_api模块中,作者将models中实现的多数模型封装成与sklearn类似的API格式,方便熟悉sklearn的用户使用。
from .ldamodel import LdaTransformer # noqa: F401
from .lsimodel import LsiTransformer # noqa: F401
from .rpmodel import RpTransformer # noqa: F401
from .ldaseqmodel import LdaSeqTransformer # noqa: F401
from .w2vmodel import W2VTransformer # noqa: F401
from .atmodel import AuthorTopicTransformer # noqa: F401
from .d2vmodel import D2VTransformer # noqa: F401
from .text2bow import Text2BowTransformer # noqa: F401
from .tfidf import TfIdfTransformer # noqa: F401
from .hdp import HdpTransformer # noqa: F401
from .phrases import PhrasesTransformer # noqa: F401
summarization模块实现了一个自动文本摘要算法,该算法是TextRank的变种。此外,还实现了BM25文本检索算法和两个关键词提取算法。
from .summarizer import summarize, summarize_corpus # noqa:F401
from .keywords import keywords # noqa:F401
from .mz_entropy import mz_keywords # noqa:F401
from .bm25 import BM25, get_bm25_weights
文献: 1、Federico Barrios, Federico López, Luis Argerich, Rosita Wachenchauzer (2016). Variations of the Similarity Function of TextRank for Automated Summarization, https://arxiv.org/abs/1602.03606
2、Marcello A Montemurro, Damian Zanette, "Towards the quantification of the semantic information encoded in written language". Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153, DOI: 10.1142/S0219525910002530, https://arxiv.org/abs/0907.1558
gensim是NLP领域的一个工具包,实现了多个常用模型。