yanyiwu / cppjieba

"结巴"中文分词的C++版本
MIT License

A question: core dump when called through the Python interface, while the C++ itself works fine #93

Open hxsnow10 opened 7 years ago

hxsnow10 commented 7 years ago

My cppjieba.py:

#encoding=utf-8
from ctypes import *
import os
cur_dir = os.path.dirname( os.path.abspath(__file__)) or os.getcwd()

lib = cdll.LoadLibrary(cur_dir+'/libJieba.so')
lib.Jieba_cut.restype = py_object
lib.Jieba_tag.restype = py_object
lib.Jieba_extract.restype = py_object

class Tokenizer(object):
    def __init__(self, 
        dict_path=cur_dir+'/dict/jieba.dict.utf8', 
        model_path=cur_dir+'/dict/hmm_model.utf8', 
        user_dict_path=cur_dir+'/dict/user.dict.utf8', 
        idfPath=cur_dir+'/dict/idf.utf8', 
        stopWordPath=cur_dir+'/dict/stop_words.utf8'):
        self.obj = lib.Jieba_new(dict_path, model_path, user_dict_path, idfPath, stopWordPath)

    def add_word(self, word, tag='n', num=50):
        return lib.Jieba_add_word(self.obj, word, tag, num)

    def load_user_dicts(self, paths):
        return lib.Jieba_load_dict(self.obj, paths)

    def dload_user_dicts(self, paths):
        return lib.Jieba_dload_dict(self.obj, paths)

    def cut(self, sentence, hmm=True):
        return lib.Jieba_cut(self.obj, sentence, hmm)

    def pos_cut(self, sentence):
        rval=[]
        r=lib.Jieba_tag(self.obj, sentence)
        for s in r:
            a=s.split('/')
            rval.append(('/'.join(a[:-1]),a[-1]))
        return rval

    def extract(self, sentence, topN):
        return lib.Jieba_extract(self.obj, sentence, topN)

if __name__=='__main__':
    model=Tokenizer()
    text='''
    全国两会是数千名中外记者关注中国发展、聚焦中国命运的“新闻发布厅”。
    '''

    models=[]
    for i in range(50):
        model=Tokenizer()
        s=model.cut(text)
        models.append(model)

My Jieba.cpp:

#ifndef CPPJIEAB_JIEBA_H
#define CPPJIEAB_JIEBA_H

#include "QuerySegment.hpp"
#include "KeywordExtractor.hpp"
using namespace std;

namespace cppjieba {

class Jieba {
 public:
  Jieba(const string& dict_path, 
        const string& model_path,
        const string& user_dict_path, 
        const string& idfPath, 
        const string& stopWordPath) 
    : dict_trie_(dict_path, user_dict_path),
      model_(model_path),
      mp_seg_(&dict_trie_),
      hmm_seg_(&model_),
      mix_seg_(&dict_trie_, &model_),
      full_seg_(&dict_trie_),
      query_seg_(&dict_trie_, &model_),
      extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
  }
  ~Jieba() {
  }

  struct LocWord {
    string word;
    size_t begin;
    size_t end;
  }; // struct LocWord

  void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
    mix_seg_.Cut(sentence, words, hmm);
  }
  void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
    mix_seg_.Cut(sentence, words, hmm);
  }
  void CutAll(const string& sentence, vector<string>& words) const {
    full_seg_.Cut(sentence, words);
  }
  void CutAll(const string& sentence, vector<Word>& words) const {
    full_seg_.Cut(sentence, words);
  }
  void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
    query_seg_.Cut(sentence, words, hmm);
  }
  void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
    query_seg_.Cut(sentence, words, hmm);
  }
  void CutHMM(const string& sentence, vector<string>& words) const {
    hmm_seg_.Cut(sentence, words);
  }
  void CutHMM(const string& sentence, vector<Word>& words) const {
    hmm_seg_.Cut(sentence, words);
  }
  void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
    mp_seg_.Cut(sentence, words, max_word_len);
  }
  void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
    mp_seg_.Cut(sentence, words, max_word_len);
  }

  void Tag(const string& sentence, vector<pair<string, string> >& words) const {
    mix_seg_.Tag(sentence, words);
  }
  string LookupTag(const string &str) const {
    return mix_seg_.LookupTag(str);
  }

  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG, const int& freq=10) {
    return dict_trie_.InsertUserWordWeight(word, tag, freq);
  }
  void LoadUserDict(const string& filePaths){
    return dict_trie_.LoadUserDict(filePaths);
  }
  void dLoadUserDict(const string& filePaths){
    return dict_trie_.dLoadUserDict(filePaths);
  }
  void ResetSeparators(const string& s) {
    //TODO
    mp_seg_.ResetSeparators(s);
    hmm_seg_.ResetSeparators(s);
    mix_seg_.ResetSeparators(s);
    full_seg_.ResetSeparators(s);
    query_seg_.ResetSeparators(s);
  }

  const DictTrie* GetDictTrie() const {
    return &dict_trie_;
  } 
  const HMMModel* GetHMMModel() const {
    return &model_;
  }

 private:
  DictTrie dict_trie_;
  HMMModel model_;

  // They share the same dict trie and model
  MPSegment mp_seg_;
  HMMSegment hmm_seg_;
  MixSegment mix_seg_;
  FullSegment full_seg_;
  QuerySegment query_seg_;

 public:
  KeywordExtractor extractor;
}; // class Jieba

} // namespace cppjieba

#endif // CPPJIEAB_JIEBA_H

using namespace cppjieba;
char* DICT_PATH = "dict/jieba.dict.utf8";
char* HMM_PATH = "dict/hmm_model.utf8";
char* USER_DICT_PATH = "dict/user.dict.utf8";
char* IDF_PATH = "dict/idf.utf8";
char* STOP_WORD_PATH = "dict/stop_words.utf8";

#include <python2.7/Python.h>

extern "C" {
    /* The following interfaces are needed:
    seg.__init__()
    .add_word(word,num,tag)
    .del_word(word)
    .all_cut(sentence)
    .cut(sentence)
    .tag(sentence)
    key_words()
    */
    Jieba* Jieba_new(char* dict_path, 
        char* model_path,
        char* user_dict_path, 
        char* idfPath, 
        char* stopWordPath){ 
            return new Jieba(dict_path, model_path, user_dict_path, idfPath, stopWordPath);
    }

    PyObject* Jieba_cut(Jieba* segmentor, char* sentence, bool hmm = true){
        //cout<< segmentor->dict_trie_ << endl;
        PyObject* result = PyList_New(0);
        vector<string> words;
        segmentor->Cut(sentence, words, hmm);
        for (vector<string>::const_iterator iter =words.begin(); iter != words.end(); iter++){
            PyObject* a=PyString_FromString((*iter).c_str());
            PyList_Append(result,a);
            Py_XDECREF(a);
        }
        //free(words);
        return result;
    }

    bool Jieba_add_word(Jieba* segmentor, char* word, char* tag, int weight){
        return segmentor->InsertUserWord(word, tag, weight);
    }
    void Jieba_load_dict(Jieba* segmentor, char* path){
        return segmentor->LoadUserDict(path);
    }
    void Jieba_dload_dict(Jieba* segmentor, char* path){
        return segmentor->dLoadUserDict(path);
    }
    PyObject* Jieba_tag(Jieba *segmentor, char* sentence){
        PyObject* result = PyList_New(0);
        vector<pair<string, string> > tagers;
        segmentor->Tag(sentence, tagers);
        for (vector<pair<string, string> >::const_iterator iter =tagers.begin(); iter != tagers.end(); iter++){
            /*
            PyObject* a=PyString_FromString((iter->first).c_str());
            PyObject* b=PyString_FromString((iter->second).c_str());
            PyObject* p=PyTuple_Pack(2,a,b); // strange: this sometimes core dumps...
            */
            string s=iter->first+'/'+iter->second;
            PyObject* p=PyString_FromString(s.c_str());
            PyList_Append(result,p);
            Py_XDECREF(p);
        }
        return result;
    }

    PyObject* Jieba_extract(Jieba *segmentor, char* sentence, int topN){
        PyObject* result = PyList_New(0);
        vector<pair<string, double> > keywords;
        //vector<Word> keywords;
        segmentor->extractor.Extract(sentence, keywords, topN);
        for (vector< pair<string, double> >::const_iterator iter =keywords.begin(); iter != keywords.end(); iter++){
            //cout << *iter <<endl;
            //string s=(*iter).word+'/'+(*iter).weight.c_str();
            PyObject* weight=PyFloat_FromDouble(iter->second);
            PyObject* word=PyString_FromString(iter->first.c_str());
            PyObject* p=PyTuple_Pack(2,word,weight);
            PyList_Append(result,p);
            Py_XDECREF(weight);
            Py_XDECREF(word);
            Py_XDECREF(p);

        }
        return result;
    }

}

int main(int argc, char** argv) {
    vector<Jieba*> models(100);
    Jieba* segmentor=Jieba_new(DICT_PATH,HMM_PATH,"user_dicts/地方.txt",IDF_PATH,STOP_WORD_PATH); 
    Jieba* segmentor1=Jieba_new(DICT_PATH,HMM_PATH,USER_DICT_PATH,IDF_PATH,STOP_WORD_PATH); 
    PyObject* result;
    char* s = "他来到了网易杭研大厦";
    for (int i=0;i<100;i++){
        cout<< i<<endl;
        models[i]=Jieba_new(DICT_PATH,HMM_PATH,USER_DICT_PATH,IDF_PATH,STOP_WORD_PATH);
        result= Jieba_cut(models[i],s);
    }
}

Jieba.cpp compiled as a standalone program runs fine; but when Jieba.cpp is compiled into a *.so and called from cppjieba.py,

    for i in range(50):
        model=Tokenizer()
        s=model.cut(text)
        models.append(model)

it hits a segmentation fault after ten-odd successful runs. Every crash occurs in MixSegment.hpp, in

    while (pre_filter.HasNext()) {
      range = pre_filter.Next();
      Cut(range.begin, range.end, wrs, hmm);
    }

But looking at the core dumps, the crash is not at the same line inside that loop each time; the error is always signal 11, Segmentation fault.

I suspect that some behaviour of the Python code is turning part of the C++ objects into null or dangling pointers, but I cannot quite figure it out.

Could you give me some advice, from your perspective, on where the problem might be?
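
On the ctypes side I am also wondering whether two details matter (just guesses, not a confirmed diagnosis): I never declare a restype for lib.Jieba_new, so ctypes treats the returned Jieba* as a C int and could truncate it to 32 bits on a 64-bit system once enough memory has been allocated; and cdll (CDLL) releases the GIL around each foreign call, while Jieba_cut / Jieba_tag / Jieba_extract call Python C-API functions (PyList_New, PyString_FromString) that should only run with the GIL held. Declaring the signatures explicitly and loading the library with PyDLL would look roughly like this (reusing cur_dir from the script above):

    from ctypes import PyDLL, c_void_p, c_char_p, c_bool, c_int, py_object

    # PyDLL keeps the GIL held during calls; the wrapper functions use the
    # Python C-API internally, so they must not run without the GIL.
    lib = PyDLL(cur_dir + '/libJieba.so')

    # Declare the opaque Jieba* handle explicitly so it is not truncated to
    # the default 32-bit int restype on 64-bit systems.
    lib.Jieba_new.restype = c_void_p
    lib.Jieba_new.argtypes = [c_char_p, c_char_p, c_char_p, c_char_p, c_char_p]

    lib.Jieba_cut.restype = py_object
    lib.Jieba_cut.argtypes = [c_void_p, c_char_p, c_bool]

    lib.Jieba_tag.restype = py_object
    lib.Jieba_tag.argtypes = [c_void_p, c_char_p]

    lib.Jieba_extract.restype = py_object
    lib.Jieba_extract.argtypes = [c_void_p, c_char_p, c_int]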

hxsnow10 commented 7 years ago

One more question: I moved DictTrie's LoadUserDict into the public section. If I specify the custom dictionary when constructing the segmenter, it works; but if I modify the model through LoadUserDict afterwards, segmentation crashes. Is this currently unsupported, or is it a bug?

If I modify the dictionary through add_word instead, segmentation still works.
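
In other words, roughly, with the Tokenizer wrapper above (the dictionary path is the one from my main(), and the added word is just an example):

    # Works: custom dictionary supplied at construction time.
    t1 = Tokenizer(user_dict_path=cur_dir + '/user_dicts/地方.txt')
    print(t1.cut(text))

    # Works: adding individual words after construction.
    t2 = Tokenizer()
    t2.add_word('杭研')
    print(t2.cut(text))

    # Crashes during cut(): loading a user dictionary after construction.
    t3 = Tokenizer()
    t3.load_user_dicts(cur_dir + '/user_dicts/地方.txt')
    print(t3.cut(text))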

messense commented 6 years ago

Hello, I recently wrote a Python wrapper: https://github.com/messense/cjieba-py