infinilabs / analysis-pinyin

🛵 This Pinyin Analysis plugin converts between Chinese characters and Pinyin.

With ignore_pinyin_offset = false, the order of tokens produced by analysis is occasionally inconsistent, so documents fail to index #223

Open wuzhenglin510 opened 4 years ago

wuzhenglin510 commented 4 years ago

Error message: startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards startOffset=1,endOffset=2,lastStartOffset=4 for field 'nickname.pinyin'
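
For context, this is Lucene's index-time offset check: when a field stores term vectors with offsets (as nickname.pinyin does below), the start offset of each successive token must never decrease. A hypothetical indexing request like the following, using the field and sample text from this report, is what actually triggers the failure:

PUT /crm_fans_v1/_doc/1
{
  "nickname": "Yang小波🔜🐑"
}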

Index creation request:

PUT /crm_fans_v1
{
  "settings": {
    "number_of_shards": 1,
    "analysis": {
      "analyzer": {
        "phone_analyzer": {
          "tokenizer": "phone_tokenizer"
        },
        "pinyin_analyzer": {
          "tokenizer": "pinyin_tokenizer"
        },
        "pinyin_search_analyzer": {
          "tokenizer": "pinyin_search_tokenizer"
        }
      },
      "tokenizer": {
        "phone_tokenizer": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 3,
          "token_chars": [
            "digit"
          ]
        },
        "pinyin_tokenizer": {
          "type": "pinyin",
          "keep_first_letter": false,
          "keep_separate_first_letter": true,
          "keep_full_pinyin": false,
          "lowercase": false,
          "keep_joined_full_pinyin": false,
          "remove_duplicated_term": false,
          "keep_none_chinese": true,
          "keep_none_chinese_together": false,
          "keep_none_chinese_in_first_letter": false,
          "ignore_pinyin_offset": false,
          "trim_whitespace": true
        },
        "pinyin_search_tokenizer": {
          "type": "standard",
          "max_token_length": 1
        }
      }
    }
  },
  "mappings": {
    "_doc": {
      "properties": {
        "id": {
          "type": "long"
        },
        "busId": {
          "type": "long"
        },
        "deleted": {
          "type": "boolean"
        },
        "nickname": {
          "type": "text",
          "term_vector": "with_positions_offsets",
          "fields": {
            "keyword": {
              "type": "keyword"
            },
            "pinyin": {
              "type": "text",
              "analyzer": "pinyin_analyzer",
              "search_analyzer": "pinyin_search_analyzer",
              "term_vector": "with_positions_offsets"
            }
          }
        },
        "mobilePhone": {
          "type": "text",
          "analyzer": "phone_analyzer",
          "term_vector": "with_positions_offsets",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          }
        },
        "gender": {
          "type": "keyword"
        },
        "customerId": {
          "type": "long"
        },
        "customerType": {
          "type": "keyword"
        },
        "subscribeTime": {
          "type": "long"
        },
        "profile": {
          "type": "keyword"
        },
        "openId": {
          "type": "keyword"
        },
        "totalSendCouponTimes": {
          "type": "integer"
        },
        "currentMonthSendCouponTimes": {
          "type": "integer"
        }
      }
    }
  }
}
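
If exact offsets are not actually needed on nickname.pinyin (they matter mainly for highlighting), one workaround, a sketch based on the plugin's documented ignore_pinyin_offset option (default: true), is to flip that flag back, which sidesteps Lucene's offset check by not preserving per-token offsets:

"pinyin_tokenizer": {
  "type": "pinyin",
  "keep_first_letter": false,
  "keep_separate_first_letter": true,
  "keep_full_pinyin": false,
  "lowercase": false,
  "keep_joined_full_pinyin": false,
  "remove_duplicated_term": false,
  "keep_none_chinese": true,
  "keep_none_chinese_together": false,
  "keep_none_chinese_in_first_letter": false,
  "ignore_pinyin_offset": true,
  "trim_whitespace": true
}

The trade-off is that offset-dependent features such as highlighting become unreliable on this subfield, which is presumably why it was set to false here in the first place.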

Run the analyzer:

GET /crm_fans_v1/_analyze
{
  "text": ["Yang小波🔜🐑"],
  "analyzer": "pinyin_analyzer"
}

Incorrect token output from analysis:

{
  "tokens" : [
    {
      "token" : "Y",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "x",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "a",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "b",
      "start_offset" : 5,
      "end_offset" : 6,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "n",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "g",
      "start_offset" : 3,
      "end_offset" : 4,
      "type" : "word",
      "position" : 3
    }
  ]
}

In the output above, the token "x" (start_offset 4) is emitted before "a" (start_offset 1); since Lucene requires start offsets to never go backwards across the token stream, this matches the reported startOffset=1, lastStartOffset=4 failure exactly. Running the same request a few more times yields correctly ordered tokens:

{
  "tokens" : [
    {
      "token" : "Y",
      "start_offset" : 0,
      "end_offset" : 1,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "a",
      "start_offset" : 1,
      "end_offset" : 2,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "n",
      "start_offset" : 2,
      "end_offset" : 3,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "g",
      "start_offset" : 3,
      "end_offset" : 4,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "x",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "b",
      "start_offset" : 5,
      "end_offset" : 6,
      "type" : "word",
      "position" : 4
    }
  ]
}

medcl commented 4 years ago

Please try again with the build from the master branch.
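
For anyone retesting with a master build, a quick sanity check (reusing the request from this report) is to run the analyze call repeatedly and confirm that the start_offset values never go backwards across the token list:

GET /crm_fans_v1/_analyze
{
  "text": ["Yang小波🔜🐑"],
  "analyzer": "pinyin_analyzer"
}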