Open hxsnow10 opened 7 years ago
我的cppjieba.py
#encoding=utf-8 from ctypes import * import os cur_dir = os.path.dirname( os.path.abspath(__file__)) or os.getcwd() lib = cdll.LoadLibrary(cur_dir+'/libJieba.so') lib.Jieba_cut.restype = py_object lib.Jieba_tag.restype = py_object lib.Jieba_extract.restype = py_object class Tokenizer(object): def __init__(self, dict_path=cur_dir+'/dict/jieba.dict.utf8', model_path=cur_dir+'/dict/hmm_model.utf8', user_dict_path=cur_dir+'/dict/user.dict.utf8', idfPath=cur_dir+'/dict/idf.utf8', stopWordPath=cur_dir+'/dict/stop_words.utf8'): self.obj = lib.Jieba_new(dict_path, model_path, user_dict_path, idfPath, stopWordPath) def add_word(self, word, tag='n', num=50): return lib.Jieba_add_word(self.obj, word, tag, num) def load_user_dicts(self, paths): return lib.Jieba_load_dict(self.obj, paths) def dload_user_dicts(self, paths): return lib.Jieba_dload_dict(self.obj, paths) def cut(self, sentence, hmm=True): return lib.Jieba_cut(self.obj, sentence, hmm) def pos_cut(self, sentence): rval=[] r=lib.Jieba_tag(self.obj, sentence) for s in r: a=s.split('/') rval.append(('/'.join(a[:-1]),a[-1])) return rval def extract(self, sentence, topN): return lib.Jieba_extract(self.obj, sentence, topN) if __name__=='__main__': model=Tokenizer() text=''' 全国两会是数千名中外记者关注中国发展、聚焦中国命运的“新闻发布厅”。 ''' models=[] for i in range(50): model=Tokenizer() s=model.cut(text) models.append(model)
我的Jieba.cpp
#ifndef CPPJIEAB_JIEBA_H #define CPPJIEAB_JIEBA_H #include "QuerySegment.hpp" #include "KeywordExtractor.hpp" using namespace std; namespace cppjieba { class Jieba { public: Jieba(const string& dict_path, const string& model_path, const string& user_dict_path, const string& idfPath, const string& stopWordPath) : dict_trie_(dict_path, user_dict_path), model_(model_path), mp_seg_(&dict_trie_), hmm_seg_(&model_), mix_seg_(&dict_trie_, &model_), full_seg_(&dict_trie_), query_seg_(&dict_trie_, &model_), extractor(&dict_trie_, &model_, idfPath, stopWordPath) { } ~Jieba() { } struct LocWord { string word; size_t begin; size_t end; }; // struct LocWord void Cut(const string& sentence, vector<string>& words, bool hmm = true) const { mix_seg_.Cut(sentence, words, hmm); } void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const { mix_seg_.Cut(sentence, words, hmm); } void CutAll(const string& sentence, vector<string>& words) const { full_seg_.Cut(sentence, words); } void CutAll(const string& sentence, vector<Word>& words) const { full_seg_.Cut(sentence, words); } void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const { query_seg_.Cut(sentence, words, hmm); } void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const { query_seg_.Cut(sentence, words, hmm); } void CutHMM(const string& sentence, vector<string>& words) const { hmm_seg_.Cut(sentence, words); } void CutHMM(const string& sentence, vector<Word>& words) const { hmm_seg_.Cut(sentence, words); } void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const { mp_seg_.Cut(sentence, words, max_word_len); } void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const { mp_seg_.Cut(sentence, words, max_word_len); } void Tag(const string& sentence, vector<pair<string, string> >& words) const { mix_seg_.Tag(sentence, words); } string LookupTag(const string &str) const { return mix_seg_.LookupTag(str); } bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG, const int& freq=10) { return dict_trie_.InsertUserWordWeight(word, tag, freq); } void LoadUserDict(const string& filePaths){ return dict_trie_.LoadUserDict(filePaths); } void dLoadUserDict(const string& filePaths){ return dict_trie_.dLoadUserDict(filePaths); } void ResetSeparators(const string& s) { //TODO mp_seg_.ResetSeparators(s); hmm_seg_.ResetSeparators(s); mix_seg_.ResetSeparators(s); full_seg_.ResetSeparators(s); query_seg_.ResetSeparators(s); } const DictTrie* GetDictTrie() const { return &dict_trie_; } const HMMModel* GetHMMModel() const { return &model_; } private: DictTrie dict_trie_; HMMModel model_; // They share the same dict trie and model MPSegment mp_seg_; HMMSegment hmm_seg_; MixSegment mix_seg_; FullSegment full_seg_; QuerySegment query_seg_; public: KeywordExtractor extractor; }; // class Jieba } // namespace cppjieba #endif // CPPJIEAB_JIEBA_H using namespace cppjieba; char* DICT_PATH = "dict/jieba.dict.utf8"; char* HMM_PATH = "dict/hmm_model.utf8"; char* USER_DICT_PATH = "dict/user.dict.utf8"; char* IDF_PATH = "dict/idf.utf8"; char* STOP_WORD_PATH = "dict/stop_words.utf8"; #include <python2.7/Python.h> extern "C" { /*需要以下接口: seg.__init__() .add_word(word,num,tag) .del_word(word) .all_cut(sentence) .cut(sentence) .tag(sentence) key_words() */ Jieba* Jieba_new(char* dict_path, char* model_path, char* user_dict_path, char* idfPath, char* stopWordPath){ return new Jieba(dict_path, model_path, user_dict_path, idfPath, stopWordPath); } PyObject* Jieba_cut(Jieba* segmentor, char* sentence, bool hmm = true){ //cout<< segmentor->dict_trie_ << endl; PyObject* result = PyList_New(0); vector<string> words; segmentor->Cut(sentence, words, hmm); for (vector<string>::const_iterator iter =words.begin(); iter != words.end(); iter++){ PyObject* a=PyString_FromString((*iter).c_str()); PyList_Append(result,a); Py_XDECREF(a); } //free(words); return result; } bool Jieba_add_word(Jieba* segmentor, char* word, char* tag, int weight){ return segmentor->InsertUserWord(word, tag, weight); } void Jieba_load_dict(Jieba* segmentor, char* path){ return segmentor->LoadUserDict(path); } void Jieba_dload_dict(Jieba* segmentor, char* path){ return segmentor->dLoadUserDict(path); } PyObject* Jieba_tag(Jieba *segmentor, char* sentence){ PyObject* result = PyList_New(0); vector<pair<string, string> > tagers; segmentor->Tag(sentence, tagers); for (vector<pair<string, string> >::const_iterator iter =tagers.begin(); iter != tagers.end(); iter++){ /* PyObject* a=PyString_FromString((iter->first).c_str()); PyObject* b=PyString_FromString((iter->second).c_str()); PyObject* p=PyTuple_Pack(2,a,b);//很奇怪,有时候core dumped... */ string s=iter->first+'/'+iter->second; PyObject* p=PyString_FromString(s.c_str()); PyList_Append(result,p); Py_XDECREF(p); } return result; } PyObject* Jieba_extract(Jieba *segmentor, char* sentence, int topN){ PyObject* result = PyList_New(0); vector<pair<string, double> > keywords; //vector<Word> keywords; segmentor->extractor.Extract(sentence, keywords, topN); for (vector< pair<string, double> >::const_iterator iter =keywords.begin(); iter != keywords.end(); iter++){ //cout << *iter <<endl; //string s=(*iter).word+'/'+(*iter).weight.c_str(); PyObject* weight=PyFloat_FromDouble(iter->second); PyObject* word=PyString_FromString(iter->first.c_str()); PyObject* p=PyTuple_Pack(2,word,weight); PyList_Append(result,p); Py_XDECREF(weight); Py_XDECREF(word); Py_XDECREF(p); } return result; } } int main(int argc, char** argv) { vector<Jieba*> models(100); Jieba* segmentor=Jieba_new(DICT_PATH,HMM_PATH,"user_dicts/地方.txt",IDF_PATH,STOP_WORD_PATH); Jieba* segmentor1=Jieba_new(DICT_PATH,HMM_PATH,USER_DICT_PATH,IDF_PATH,STOP_WORD_PATH); PyObject* result; char* s = "他来到了网易杭研大厦"; for (int i=0;i<100;i++){ cout<< i<<endl; models[i]=Jieba_new(DICT_PATH,HMM_PATH,USER_DICT_PATH,IDF_PATH,STOP_WORD_PATH); result= Jieba_cut(models[i],s); } }
Jieba.cpp 编译后跑的没问题; 但是cppjieba.py编译成*.so文件,由cppjieba.py去调,
for i in range(50): model=Tokenizer() s=model.cut(text) models.append(model)
会在正确运行十几次后报segment fault, 发现每次的错误都出现在 MixSegment.hpp 的
while (pre_filter.HasNext()) { range = pre_filter.Next(); Cut(range.begin, range.end, wrs, hmm); }
但是看core dump, 每次报错不在里面的同一行,错误类型都是 signal 11, Segmentation fault
signal 11, Segmentation fault
我在想是python代码的一些行为 导致了C++变量一部分变成了空指针吗,但是有点搞不来了
您能从您的角度给我点建议,可能的问题所在吗?
还有个问题,就是我把你的DictTrie的LoadUserDict拿到public里, 如果我在初始化分词器里指定 自定义字典, 可以跑; 但是通过LoadUserDict 修改模型, 分词就会报错。 这个是目前不支持 还是 bug?
如果通过add_word 修改词典是可以分词的。
Hello,我最近写了个 Python 的封装:https://github.com/messense/cjieba-py
我的cppjieba.py
我的Jieba.cpp
Jieba.cpp 编译后跑的没问题; 但是cppjieba.py编译成*.so文件,由cppjieba.py去调,
会在正确运行十几次后报segment fault, 发现每次的错误都出现在 MixSegment.hpp 的
但是看core dump, 每次报错不在里面的同一行,错误类型都是
signal 11, Segmentation fault
我在想是python代码的一些行为 导致了C++变量一部分变成了空指针吗,但是有点搞不来了
您能从您的角度给我点建议,可能的问题所在吗?