Open Alistair-zhong opened 3 years ago
最好把plugins下完整目录结构贴出来让我看一下,以及创建索引语句
plugins 目录:
plugins/
`-- analysis-hanlp
|-- LICENSE.txt
|-- NOTICE.txt
|-- README.url
|-- commons-codec-1.15.jar
|-- commons-logging-1.2.jar
|-- data
| |-- README.url
| |-- dictionary
| | |-- CoreNatureDictionary.mini.txt
| | |-- CoreNatureDictionary.mini.txt.bin
| | |-- CoreNatureDictionary.ngram.mini.txt
| | |-- CoreNatureDictionary.ngram.mini.txt.table.bin
| | |-- CoreNatureDictionary.ngram.txt
| | |-- CoreNatureDictionary.ngram.txt.table.bin
| | |-- CoreNatureDictionary.tr.txt
| | |-- CoreNatureDictionary.txt
| | |-- CoreNatureDictionary.txt.bin
| | |-- custom
| | | |-- #U4e0a#U6d77#U5730#U540d.txt
| | | |-- #U4eba#U540d#U8bcd#U5178.txt
| | | |-- #U5168#U56fd#U5730#U540d#U5927#U5168.txt
| | | |-- #U673a#U6784#U540d#U8bcd#U5178.txt
| | | |-- #U73b0#U4ee3#U6c49#U8bed#U8865#U5145#U8bcd#U5e93.txt
| | | |-- CustomDictionary.txt
| | | `-- CustomDictionary.txt.bin
| | |-- organization
| | | |-- nt.tr.txt
| | | |-- nt.txt
| | | `-- nt.txt.bin
| | |-- other
| | | |-- CharTable.txt
| | | |-- CharTable.txt.bin
| | | |-- CharType.bin
| | | `-- TagPKU98.csv
| | |-- person
| | | |-- nr.tr.txt
| | | |-- nr.txt
| | | |-- nr.txt.bin
| | | |-- nrf.txt
| | | |-- nrf.txt.trie.dat
| | | |-- nrj.txt
| | | |-- nrj.txt.trie.dat
| | | `-- nrj.txt.value.dat
| | |-- pinyin
| | | |-- pinyin.txt
| | | `-- pinyin.txt.bin
| | |-- place
| | | |-- ns.tr.txt
| | | |-- ns.txt
| | | `-- ns.txt.bin
| | |-- stopwords.txt
| | |-- stopwords.txt.bin
| | |-- synonym
| | | `-- CoreSynonym.txt
| | `-- tc
| | |-- hk2s.bin
| | |-- hk2t.bin
| | |-- hk2tw.bin
| | |-- s2hk.bin
| | |-- s2t.txt
| | |-- s2t.txt.bin
| | |-- s2tw.bin
| | |-- t2hk.bin
| | |-- t2hk.txt
| | |-- t2s.txt
| | |-- t2s.txt.bin
| | |-- t2tw.bin
| | |-- t2tw.txt
| | |-- tw2hk.bin
| | |-- tw2s.bin
| | `-- tw2t.bin
| |-- model
| | |-- crf
| | | `-- pku199801
| | | |-- cws.txt.bin
| | | |-- ner.txt.bin
| | | `-- pos.txt.bin
| | |-- dependency
| | | |-- NNParserModel.licence.txt
| | | |-- NNParserModel.txt.bin
| | | |-- NNParserModel.txt.description.txt
| | | |-- WordNature.txt.bi.bin
| | | |-- WordNature.txt.bin
| | | `-- perceptron.bin
| | `-- perceptron
| | |-- ctb
| | | `-- pos.bin
| | |-- large
| | | `-- cws.bin
| | |-- pku1998
| | | |-- cws.bin
| | | |-- ner.bin
| | | `-- pos.bin
| | `-- pku199801
| | |-- cws.bin
| | |-- ner.bin
| | `-- pos.bin
| `-- version.txt
|-- elasticsearch-analysis-hanlp-7.5.2.jar
|-- hanlp-portable-1.7.8.jar
|-- httpclient-4.5.13.jar
|-- httpcore-4.4.14.jar
|-- plugin-descriptor.properties
`-- plugin-security.policy
20 directories, 85 files
创建索引
PUT /life
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"life_analyzer_index":{
"type": "custom",
"tokenizer": "life_hanlp"
},
"life_analyzer_search":{
"type": "custom",
"tokenizer": "life_hanlp"
},
"life_shingle_analyzer":{
"type": "custom",
"tokenizer": "life_hanlp",
"filter": ["life_shingle_filter"]
}
},
"filter": {
"life_shingle_filter":{
"type": "shingle",
"output_unigrams": false
}
},
"tokenizer": {
"life_hanlp": {
"type": "hanlp",
"enable_custom_dictionary": true,
"enable_custom_config": true,
"enable_offset": true,
"enable_normalization": false,
"enable_number_quantifier_recognize":false,
"enable_index_mode": false,
"enable_stop_dictionary": false,
"enable_part_of_speech_tagging": false,
"enable_traditional_chinese_mode": false
}
}
},
"refresh_interval": "30s",
"index":{
"similarity":{
"life_similarity":{
"type": "BM25",
"b": 0.2,
"k1": 0.3
}
}
}
},
"mappings": {
"properties": {
"title":{
"type": "keyword"
},
"content":{
"type": "text",
"analyzer": "life_analyzer_index",
"search_analyzer": "life_analyzer_search",
"term_vector": "with_positions_offsets",
"similarity": "life_similarity",
"fields": {
"raw":{
"type": "keyword"
},
"shingles":{
"type": "text",
"analyzer": "life_shingle_analyzer",
"similarity": "life_similarity"
}
}
},
"post_id":{
"type": "long"
},
"post_name": {
"type": "keyword"
},
"status":{
"type": "keyword"
},
"weight":{
"type": "byte"
},
"updated_at":{
"type": "integer"
},
"categories":{
"type": "keyword"
}
}
}
}
@KennFalcon 有空帮忙再看看吗,麻烦你了
我装 7.10.2 时碰到过类似报错,把 analysis-hanlp 目录的属主(递归)改成 elasticsearch:elasticsearch 就好了(例如:chown -R elasticsearch:elasticsearch plugins/analysis-hanlp),你试试看。
我也碰到了这个问题,但是我没有使用 data-for-1.7.5 这个数据包。最后我修改了源码,问题解决了。我用的插件版本是 7.10.2,ES 版本是 8.10.0。
es 启动后出现错误日志,如下:
通过对比不同版本的源码发现,7.5.0 之后,`HanLPNLPAnalyzer` 中的 `createComponents` 方法的实现逻辑变了。如下:
7.5.0
7.10.2
可以发现,在 7.5.0 中,如果抛出异常,其实会提供一个默认的分词器,而 7.10.2 中并没有这样做。我的做法就是手动捕获异常,然后提供一个默认分词器,如下:
环境
Elasticsearch 版本:7.9.3
Kibana 版本:7.9.3
何时报错:在使用 HanLP 提供的 data-for-1.7.5 完整数据包、调用分词器创建索引时抛出异常;使用此插件自带的默认数据包则不报错。
报错异常
在 7.5.2 版本中使用此数据包都没有问题,但是升级了 ES 版本就报错了。不知是否和 README 里提到的 7.6 版本后的模型配置有关。@KennFalcon