jprante / elasticsearch-analysis-decompound

Decompounding Plugin for Elasticsearch
GNU General Public License v2.0
87 stars 38 forks source link

Failure to decompound "Kinderzahnheilkunde" #28

Open cycore opened 8 years ago

cycore commented 8 years ago

The plugin fails to decompound the German word "Kinderzahnheilkunde". The resulting tokens are ["kinderzahnheilkunde", "kinderzahnhe", "ilkunde"]. The expected tokens are ["kinderzahnheilkunde", "kinder", "zahn", "heil", "kunde"].

I'm using plugin version 2.2.0.0 with Elasticsearch 2.2.0.

The index settings are:

{
        "analysis": {
            "filter": {
                "german_stop": {
                    "type": "stop",
                    "stopwords": "_german_"
                },
                "german_stemmer": {
                    "type": "stemmer",
                    "language": "light_german"
                },
                "german_decompound": {
                    "type": "decompound"
                }
            },
            "analyzer": {
                "german_with_decompounder": {
                    "tokenizer": "standard",
                    "filter": [
                            "lowercase",
                            "german_decompound",
                            "unique",
                            "german_stop",
                            "german_normalization",
                            "german_stemmer"
                    ]
                }
            }
        }
    }

I got the resulting tokens from the _analyze API with the explain=true option. The output, up to and including the german_decompound filter, is:

{
    "detail": {
        "custom_analyzer": true,
        "charfilters": [
        ],
        "tokenizer": {
            "name": "standard",
            "tokens": [
                {
                    "token": "Kinderzahnheilkunde",
                    "start_offset": 0,
                    "end_offset": 19,
                    "type": "<ALPHANUM>",
                    "position": 0,
                    "bytes": "[4b 69 6e 64 65 72 7a 61 68 6e 68 65 69 6c 6b 75 6e 64 65]",
                    "positionLength": 1
                }
            ]
        },
        "tokenfilters": [
            {
                "name": "lowercase",
                "tokens": [
                    {
                        "token": "kinderzahnheilkunde",
                        "start_offset": 0,
                        "end_offset": 19,
                        "type": "<ALPHANUM>",
                        "position": 0,
                        "bytes": "[6b 69 6e 64 65 72 7a 61 68 6e 68 65 69 6c 6b 75 6e 64 65]",
                        "positionLength": 1
                    }
                ]
            },
            {
                "name": "german_decompound",
                "tokens": [
                    {
                        "token": "kinderzahnheilkunde",
                        "start_offset": 0,
                        "end_offset": 19,
                        "type": "<ALPHANUM>",
                        "position": 0,
                        "bytes": "[6b 69 6e 64 65 72 7a 61 68 6e 68 65 69 6c 6b 75 6e 64 65]",
                        "keyword": false,
                        "positionLength": 1
                    },
                    {
                        "token": "kinderzahnhe",
                        "start_offset": 0,
                        "end_offset": 19,
                        "type": "<ALPHANUM>",
                        "position": 0,
                        "bytes": "[6b 69 6e 64 65 72 7a 61 68 6e 68 65]",
                        "keyword": false,
                        "positionLength": 1
                    },
                    {
                        "token": "ilkunde",
                        "start_offset": 0,
                        "end_offset": 19,
                        "type": "<ALPHANUM>",
                        "position": 0,
                        "bytes": "[69 6c 6b 75 6e 64 65]",
                        "keyword": false,
                        "positionLength": 1
                    }
                ]
            },

Any suggestions for getting better results would be much appreciated. Thanks!