Closed yandooo closed 4 years ago
Can you share your mapping please?
@dadoonet going to do this. It's quite big.
Sorry. I meant your index settings. Where you defined your analyzers actually.
@dadoonet yes I got it. Do you need the dynamic mapping? It's applied through a generic template. I think the problem is in minimal_polish
...but not sure
"settings": {
"number_of_shards": 3,
"number_of_replicas": 1,
"index.mapping.total_fields.limit": "200",
"index.mapping.depth.limit": "3",
"index.mapping.nested_fields.limit": "200",
"index.search.slowlog.threshold.query.warn": "200ms",
"index.search.slowlog.threshold.fetch.warn": "200ms",
"analysis": {
"filter": {
"trigram_filter": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3
},
"ar_stop_filter": {
"type": "stop",
"stopwords": [
"_arabic_"
]
},
"bg_stop_filter": {
"type": "stop",
"stopwords": [
"_bulgarian_"
]
},
"ca_stop_filter": {
"type": "stop",
"stopwords": [
"_catalan_"
]
},
"cs_stop_filter": {
"type": "stop",
"stopwords": [
"_czech_"
]
},
"da_stop_filter": {
"type": "stop",
"stopwords": [
"_danish_"
]
},
"de_stop_filter": {
"type": "stop",
"stopwords": [
"_german_"
]
},
"de_stem_filter": {
"type": "stemmer",
"name": "minimal_german"
},
"el_stop_filter": {
"type": "stop",
"stopwords": [
"_greek_"
]
},
"en_stop_filter": {
"type": "stop",
"stopwords": [
"_english_"
]
},
"en_stem_filter": {
"type": "stemmer",
"name": "minimal_english"
},
"es_stop_filter": {
"type": "stop",
"stopwords": [
"_spanish_"
]
},
"es_stem_filter": {
"type": "stemmer",
"name": "light_spanish"
},
"eu_stop_filter": {
"type": "stop",
"stopwords": [
"_basque_"
]
},
"fa_stop_filter": {
"type": "stop",
"stopwords": [
"_persian_"
]
},
"fi_stop_filter": {
"type": "stop",
"stopwords": [
"_finnish_"
]
},
"fi_stem_filter": {
"type": "stemmer",
"name": "light_finish"
},
"fr_stop_filter": {
"type": "stop",
"stopwords": [
"_french_"
]
},
"fr_stem_filter": {
"type": "stemmer",
"name": "minimal_french"
},
"he_stop_filter": {
"type": "stop",
"stopwords": [
"אני",
"את",
"אתה",
"אתך",
"אתכן",
"אתכם",
"אנחנו",
"אתן",
"אתם",
"הם",
"הן",
"היא",
"הוא",
"שלי",
"שלו",
"שלך",
"שלה",
"שלנו",
"שלכם",
"שלכן",
"שלהם",
"שלהן",
"לי",
"לו",
"לה",
"לנו",
"לכם",
"לכן",
"להם",
"להן",
"אותה",
"אותו",
"זה",
"זאת",
"אלה",
"אלו",
"תחת",
"מתחת",
"מעל",
"בין",
"עם",
"עד",
"נגר",
"על",
"אל",
"מול",
"של",
"אצל",
"כמו",
"אחר",
"אותו",
"בלי",
"לפני",
"אחרי",
"מאחורי",
"עלי",
"עליו",
"עליה",
"עליך",
"עלינו",
"עליכם",
"לעיכן",
"עליהם",
"עליהן",
"כל",
"כולם",
"כולן",
"כך",
"ככה",
"כזה",
"זה",
"זות",
"אותי",
"אותה",
"אותם",
"אותך",
"אותו",
"אותן",
"אותנו",
"ואת",
"את",
"אתכם",
"אתכן",
"איתי",
"איתו",
"איתך",
"איתה",
"איתם",
"איתן",
"איתנו",
"איתכם",
"איתכן",
"יהיה",
"תהיה",
"היתי",
"היתה",
"היה",
"להיות",
"עצמי",
"עצמו",
"עצמה",
"עצמם",
"עצמן",
"עצמנו",
"עצמהם",
"עצמהן",
"מי",
"מה",
"איפה",
"היכן",
"במקום שבו",
"אם",
"לאן",
"למקום שבו",
"מקום בו",
"איזה",
"מהיכן",
"איך",
"כיצד",
"באיזו מידה",
"מתי",
"בשעה ש",
"כאשר",
"כש",
"למרות",
"לפני",
"אחרי",
"מאיזו סיבה",
"הסיבה שבגללה",
"למה",
"מדוע",
"לאיזו תכלית",
"כי",
"יש",
"אין",
"אך",
"מנין",
"מאין",
"מאיפה",
"יכל",
"יכלה",
"יכלו",
"יכול",
"יכולה",
"יכולים",
"יכולות",
"יוכלו",
"יוכל",
"מסוגל",
"לא",
"רק",
"אולי",
"אין",
"לאו",
"אי",
"כלל",
"נגד",
"אם",
"עם",
"אל",
"אלה",
"אלו",
"אף",
"על",
"מעל",
"מתחת",
"מצד",
"בשביל",
"לבין",
"באמצע",
"בתוך",
"דרך",
"מבעד",
"באמצעות",
"למעלה",
"למטה",
"מחוץ",
"מן",
"לעבר",
"מכאן",
"כאן",
"הנה",
"הרי",
"פה",
"שם",
"אך",
"ברם",
"שוב",
"אבל",
"מבלי",
"בלי",
"מלבד",
"רק",
"בגלל",
"מכיוון",
"עד",
"אשר",
"ואילו",
"למרות",
"אס",
"כמו",
"כפי",
"אז",
"אחרי",
"כן",
"לכן",
"לפיכך",
"מאד",
"עז",
"מעט",
"מעטים",
"במידה",
"שוב",
"יותר",
"מדי",
"גם",
"כן",
"נו",
"אחר",
"אחרת",
"אחרים",
"אחרות",
"אשר",
"או"
]
},
"hi_stop_filter": {
"type": "stop",
"stopwords": [
"_hindi_"
]
},
"hu_stop_filter": {
"type": "stop",
"stopwords": [
"_hungarian_"
]
},
"hu_stem_filter": {
"type": "stemmer",
"name": "light_hungarian"
},
"hy_stop_filter": {
"type": "stop",
"stopwords": [
"_armenian_"
]
},
"id_stop_filter": {
"type": "stop",
"stopwords": [
"_indonesian_"
]
},
"it_stop_filter": {
"type": "stop",
"stopwords": [
"_italian_"
]
},
"it_stem_filter": {
"type": "stemmer",
"name": "light_italian"
},
"nl_stop_filter": {
"type": "stop",
"stopwords": [
"_dutch_"
]
},
"no_stop_filter": {
"type": "stop",
"stopwords": [
"_norwegian_"
]
},
"pl_stop_filter": {
"type": "stop",
"stopwords": [
"_polish_"
]
},
"pl_stem_filter": {
"type": "stemmer",
"name": "minimal_polish"
},
"pt_stop_filter": {
"type": "stop",
"stopwords": [
"_portuguese_"
]
},
"pt_stem_filter": {
"type": "stemmer",
"name": "minimal_portuguese"
},
"ro_stop_filter": {
"type": "stop",
"stopwords": [
"_romanian_"
]
},
"ru_stop_filter": {
"type": "stop",
"stopwords": [
"_russian_"
]
},
"ru_stem_filter": {
"type": "stemmer",
"name": "light_russian"
},
"sv_stop_filter": {
"type": "stop",
"stopwords": [
"_swedish_"
]
},
"sv_stem_filter": {
"type": "stemmer",
"name": "light_swedish"
},
"tr_stop_filter": {
"type": "stop",
"stopwords": [
"_turkish_"
]
}
},
"analyzer": {
"trigrams_analyzer": {
"filter": [
"icu_folding",
"trigram_filter"
],
"type": "custom",
"tokenizer": "icu_tokenizer"
},
"ar_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"ar_stop_filter"
]
},
"bg_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"bg_stop_filter"
]
},
"ca_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"ca_stop_filter"
]
},
"cs_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"cs_stop_filter"
]
},
"da_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"da_stop_filter"
]
},
"de_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"de_stop_filter",
"de_stem_filter"
]
},
"el_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"el_stop_filter"
]
},
"en_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"en_stop_filter",
"en_stem_filter"
]
},
"es_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"es_stop_filter",
"es_stem_filter"
]
},
"eu_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"eu_stop_filter"
]
},
"fa_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"fa_stop_filter"
]
},
"fi_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"fi_stop_filter",
"fi_stem_filter"
]
},
"fr_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"fr_stop_filter",
"fr_stem_filter"
]
},
"he_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"he_stop_filter"
]
},
"hi_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"hi_stop_filter"
]
},
"hu_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"hu_stop_filter",
"hu_stem_filter"
]
},
"hy_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"hy_stop_filter"
]
},
"id_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"id_stop_filter"
]
},
"it_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"it_stop_filter",
"it_stem_filter"
]
},
"ja_analyzer": {
"type": "custom",
"tokenizer": "kuromoji_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer"
]
},
"ko_analyzer": {
"type": "cjk"
},
"nl_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"nl_stop_filter"
]
},
"no_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"no_stop_filter"
]
},
"pl_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"pl_stop_filter",
"pl_stem_filter"
]
},
"pt_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"pt_stop_filter",
"pt_stem_filter"
]
},
"ro_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"ro_stop_filter"
]
},
"ru_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"ru_stop_filter",
"ru_stem_filter"
]
},
"sv_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"sv_stop_filter",
"sv_stem_filter"
]
},
"tr_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer",
"tr_stop_filter"
]
},
"zh_analyzer": {
"type": "custom",
"tokenizer": "smartcn_sentence",
"filter": [
"icu_folding",
"icu_normalizer",
"smartcn_word"
]
},
"default_lang_analyzer": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"filter": [
"icu_folding",
"icu_normalizer"
]
}
}
}
},
"mappings": {
"event": {
"_source": {
"enabled": true
},
"_all": {
"enabled": false
},
"_size": {
"enabled": true
},
"properties": {
"content": {
"type": "text",
"term_vector": "with_positions_offsets",
"analyzer": "default_lang_analyzer"
},
"content_lang_ar": {
"type": "text",
"analyzer": "ar_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_bg": {
"type": "text",
"analyzer": "bg_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_ca": {
"type": "text",
"analyzer": "ca_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_cs": {
"type": "text",
"analyzer": "cs_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_da": {
"type": "text",
"analyzer": "da_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_de": {
"type": "text",
"analyzer": "de_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_el": {
"type": "text",
"analyzer": "el_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_en": {
"type": "text",
"analyzer": "en_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_es": {
"type": "text",
"analyzer": "es_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_fa": {
"type": "text",
"analyzer": "fa_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_fi": {
"type": "text",
"analyzer": "fi_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_fr": {
"type": "text",
"analyzer": "fr_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_he": {
"type": "text",
"analyzer": "he_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_hi": {
"type": "text",
"analyzer": "hi_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_hu": {
"type": "text",
"analyzer": "hu_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_hy": {
"type": "text",
"analyzer": "hy_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_id": {
"type": "text",
"analyzer": "id_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_it": {
"type": "text",
"analyzer": "it_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_ja": {
"type": "text",
"analyzer": "ja_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_ko": {
"type": "text",
"analyzer": "ko_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_nl": {
"type": "text",
"analyzer": "nl_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_no": {
"type": "text",
"analyzer": "no_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_pl": {
"type": "text",
"analyzer": "pl_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_pt": {
"type": "text",
"analyzer": "pt_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_ro": {
"type": "text",
"analyzer": "ro_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_ru": {
"type": "text",
"analyzer": "ru_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_sv": {
"type": "text",
"analyzer": "sv_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_tr": {
"type": "text",
"analyzer": "tr_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_zh": {
"type": "text",
"analyzer": "zh_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang_ua": {
"type": "text",
"analyzer": "ukrainian",
"term_vector": "with_positions_offsets"
},
"content_lang": {
"type": "keyword"
},
"content_type": {
"type": "keyword"
},
"feature": {
"type": "object",
"dynamic": "true",
"properties": {
"id": {
"type": "keyword"
},
"actor_name": {
"type": "keyword"
},
"title": {
"type": "text",
"analyzer": "trigrams_analyzer",
"term_vector": "with_positions_offsets"
},
"summary": {
"type": "text",
"analyzer": "default_lang_analyzer",
"term_vector": "with_positions_offsets"
},
"content": {
"type": "text",
"analyzer": "default_lang_analyzer",
"term_vector": "with_positions_offsets"
},
"content_lang": {
"type": "keyword"
},
"file": {
"type": "attachment",
"fields": {
"content": {
"type": "text",
"term_vector": "with_positions_offsets",
"store": true
},
"title": {
"store": true,
"type": "text",
"analyzer": "default_lang_analyzer",
"term_vector": "with_positions_offsets"
},
"date": {
"store": true
},
"author": {
"store": true,
"type": "text",
"term_vector": "with_positions_offsets",
"analyzer": "trigrams_analyzer"
},
"keywords": {
"store": true
},
"content_type": {
"store": true
},
"content_length": {
"store": true
},
"language": {
"store": true
}
}
},
"created_at": {
"type": "date",
"format": "epoch_millis"
}
}
}
}
}
},
Where did you find this?
"pl_stem_filter": {
"type": "stemmer",
"name": "minimal_polish"
}
I don't see it in our docs:
@dadoonet hm... I don't really remember - I'm surprised it worked at all. Should index creation fail at creation time?
Thanks for the documentation. I'll update index definition and will try again..
@yandooo I believe it would be better to fail early. @jpountz WDYT?
@dadoonet I also did a quick review of the docs and found another mismatch in the index mapping:
"fi_stem_filter": {
"type": "stemmer",
"name": "light_finish"
},
but should be light_finnish
and ES still allows creating a new index.
@dadoonet so I tried polish_stem
and I still get the same error.
{
"error": {
"root_cause": [
{
"type": "remote_transport_exception",
"reason": "indices:data/write/index]"
}
],
"type": "illegal_argument_exception",
"reason": "Invalid stemmer class specified: Polish_stem",
"caused_by": {
"type": "class_not_found_exception",
"reason": "class_not_found_exception: org.tartarus.snowball.ext.Polish_stemStemmer"
}
},
"status": 400
}
plugin is definitely installed
ERROR: plugin directory /usr/share/elasticsearch/plugins/analysis-stempel already exists. To update the plugin, uninstall it first using 'remove analysis-stempel' command
I decided to change to plugin provided analyzer polish
and it works.
It should be ok for the time being.
AFAIK polish_stem
is a Token Filter. Not a stemmer.
So you can define it in your analyzer filters but not as a stemmer option. I believe it's included in the polish
analyzer.
I believe it would be better to fail early.
+1
cc @elastic/es-search-aggs
I believe it would be better to fail early.
I just checked this issue again. After this was raised we added an earlier check for the stemmer language with https://github.com/elastic/elasticsearch/pull/34601 that should also resolve this issue. When I checked now on 7.5.1 with the ICU and Polish stemmer plugins installed, I got an exception already at index creation time, which should be early enough to detect e.g. typos like the one mentioned here, so I'm closing this.
Elasticsearch version:
Plugins installed:
JVM version: OpenJDK v8
OS version: Linux ip-10-20-0-14 2.6.32-573.18.1.el6.x86_64 #1 SMP Tue Feb 9 22:46:17 UTC 2016 x86_64 x86_64 x86_64 GNU/Linux
Description of the problem including expected versus actual behavior:
Running the Bulk API with ~200 events and, from time to time, getting:
The entire batch fails.
The index contains a single root type with an embedded list of Object types, each of which contains a field of type attachment. The rest of the fields are pretty standard. There is also some dynamic mapping configured. This seems to happen only for this index.
Provide logs (if relevant):
There is nothing useful in ES cluster logs (3 client nodes, 6 data nodes and 3 master nodes).