Closed johtani closed 10 years ago
If the analyzer is a custom analyzer, the output shows the string produced by each charfilter, the tokens produced by the tokenizer, and the tokens produced by each tokenfilter.
{
"custom analyzer" : true,
"charfilters" : [
{"mapping" : {... // string replaced mapping charFilter} },
{"pattern_replace" : {... // string replaced pattern_replace charFilter } }
],
"tokenizer" : [ {
"token" : "aaa",
"start_offset" : 0,
...
"extended_attributes" : [ {
"...TermToBytesRefAttribute" : {
"bytes" : "value"
},
...
"...PartOfSpeechAttribute" : {
"partOfSpeech" : "名詞-一般",
"partOfSpeech (en)" : "noun-common"
},...
} ]
} , {
...
}
],
"tokenfilters" : [
{ "kuromoji_baseform" : [ { ...// tokens like tokenizer section } ] }
]
}
If the analyzer is not a custom analyzer, the output contains only a single array of tokens.
{
"custom analyzer" : false,
"analyzer" : [ {
"token" : "aaa",
"start_offset" : 0,
...
"extended_attributes" : [ {
"...TermToBytesRefAttribute" : {
"bytes" : "value"
},
...
"...PartOfSpeechAttribute" : {
"partOfSpeech" : "名詞-一般",
"partOfSpeech (en)" : "noun-common"
},...
} ]
} , {
...
}
]
}
This format does not include the tokenizer/analyzer name in the "tokenizer"/"analyzer" object. It should be improved to the following object, keyed by that name:
"analyzer" : { "standard" : [ {
"token" : "aaa",
"start_offset" : 0,
...
"extended_attributes" : [ {
"...TermToBytesRefAttribute" : {
"bytes" : "value"
},
...
"...PartOfSpeechAttribute" : {
"partOfSpeech" : "名詞-一般",
"partOfSpeech (en)" : "noun-common"
},...
} ]
} , {
...
}
] }
841cc9c22d7f1f4384e6c73cc4c02c59e133b793
This commit improves the output JSON format. The analyzer test case passes, but the tokenizer and tokenfilter combination test fails...
Currently, the output format is as follows:
request command
curl -XPOST 'localhost:9200/_extended_analyze?tokenizer=kuromoji_tokenizer&filters=kuromoji_baseform&pretty' -d '寿司が美味しかった'
response format
{
"custom_analyzer" : true,
"tokenizer" : {
"kuromoji_tokenizer" : [ {
"token" : "寿司",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1,
"extended_attributes" : {
"org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute" : {
"baseForm" : null
},
"org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute" : {
"inflectionType (en)" : null,
"inflectionType" : null,
"inflectionForm (en)" : null,
"inflectionForm" : null
},
"org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute" : {
"partOfSpeech (en)" : "noun-common",
"partOfSpeech" : "名詞-一般"
},
"org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute" : {
"reading (en)" : "sushi",
"reading" : "スシ",
"pronunciation (en)" : "sushi",
"pronunciation" : "スシ"
},
"org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute" : {
"positionLength" : 1
},
"org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute" : {
"bytes" : "[e5 af bf e5 8f b8]"
}
}
}, {
"token" : "が",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 2,
"extended_attributes" : {
"org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute" : {
"baseForm" : null
},
"org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute" : {
"inflectionType (en)" : null,
"inflectionType" : null,
"inflectionForm (en)" : null,
"inflectionForm" : null
},
"org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute" : {
"partOfSpeech (en)" : "particle-case-misc",
"partOfSpeech" : "助詞-格助詞-一般"
},
"org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute" : {
"reading (en)" : "ga",
"reading" : "ガ",
"pronunciation (en)" : "ga",
"pronunciation" : "ガ"
},
"org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute" : {
"positionLength" : 1
},
"org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute" : {
"bytes" : "[e3 81 8c]"
}
}
}, {
"token" : "美味しかっ",
"start_offset" : 3,
"end_offset" : 8,
"type" : "word",
"position" : 3,
"extended_attributes" : {
"org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute" : {
"baseForm" : "美味しい"
},
"org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute" : {
"inflectionType (en)" : "adj-group-i",
"inflectionType" : "形容詞・イ段",
"inflectionForm (en)" : "conjunctive-ta-connection",
"inflectionForm" : "連用タ接続"
},
"org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute" : {
"partOfSpeech (en)" : "adjective-main",
"partOfSpeech" : "形容詞-自立"
},
"org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute" : {
"reading (en)" : "oishika",
"reading" : "オイシカッ",
"pronunciation (en)" : "oishika",
"pronunciation" : "オイシカッ"
},
"org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute" : {
"positionLength" : 1
},
"org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute" : {
"bytes" : "[e7 be 8e e5 91 b3 e3 81 97 e3 81 8b e3 81 a3]"
}
}
}, {
"token" : "た",
"start_offset" : 8,
"end_offset" : 9,
"type" : "word",
"position" : 4,
"extended_attributes" : {
"org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute" : {
"baseForm" : null
},
"org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute" : {
"inflectionType (en)" : "special-da",
"inflectionType" : "特殊・タ",
"inflectionForm (en)" : "base",
"inflectionForm" : "基本形"
},
"org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute" : {
"partOfSpeech (en)" : "auxiliary-verb",
"partOfSpeech" : "助動詞"
},
"org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute" : {
"reading (en)" : "ta",
"reading" : "タ",
"pronunciation (en)" : "ta",
"pronunciation" : "タ"
},
"org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute" : {
"positionLength" : 1
},
"org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute" : {
"bytes" : "[e3 81 9f]"
}
}
} ]
},
"tokenfilters" : [ {
"kuromoji_baseform" : [ {
"token" : "寿司",
"start_offset" : 0,
"end_offset" : 2,
"type" : "word",
"position" : 1,
"extended_attributes" : {
"org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute" : {
"baseForm" : null
},
"org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute" : {
"inflectionType (en)" : null,
"inflectionType" : null,
"inflectionForm (en)" : null,
"inflectionForm" : null
},
"org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute" : {
"partOfSpeech (en)" : "noun-common",
"partOfSpeech" : "名詞-一般"
},
"org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute" : {
"reading (en)" : "sushi",
"reading" : "スシ",
"pronunciation (en)" : "sushi",
"pronunciation" : "スシ"
},
"org.apache.lucene.analysis.tokenattributes.KeywordAttribute" : {
"keyword" : false
},
"org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute" : {
"positionLength" : 1
},
"org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute" : {
"bytes" : "[e5 af bf e5 8f b8]"
}
}
}, {
"token" : "が",
"start_offset" : 2,
"end_offset" : 3,
"type" : "word",
"position" : 2,
"extended_attributes" : {
"org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute" : {
"baseForm" : null
},
"org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute" : {
"inflectionType (en)" : null,
"inflectionType" : null,
"inflectionForm (en)" : null,
"inflectionForm" : null
},
"org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute" : {
"partOfSpeech (en)" : "particle-case-misc",
"partOfSpeech" : "助詞-格助詞-一般"
},
"org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute" : {
"reading (en)" : "ga",
"reading" : "ガ",
"pronunciation (en)" : "ga",
"pronunciation" : "ガ"
},
"org.apache.lucene.analysis.tokenattributes.KeywordAttribute" : {
"keyword" : false
},
"org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute" : {
"positionLength" : 1
},
"org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute" : {
"bytes" : "[e3 81 8c]"
}
}
}, {
"token" : "美味しい",
"start_offset" : 3,
"end_offset" : 8,
"type" : "word",
"position" : 3,
"extended_attributes" : {
"org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute" : {
"baseForm" : "美味しい"
},
"org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute" : {
"inflectionType (en)" : "adj-group-i",
"inflectionType" : "形容詞・イ段",
"inflectionForm (en)" : "conjunctive-ta-connection",
"inflectionForm" : "連用タ接続"
},
"org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute" : {
"partOfSpeech (en)" : "adjective-main",
"partOfSpeech" : "形容詞-自立"
},
"org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute" : {
"reading (en)" : "oishika",
"reading" : "オイシカッ",
"pronunciation (en)" : "oishika",
"pronunciation" : "オイシカッ"
},
"org.apache.lucene.analysis.tokenattributes.KeywordAttribute" : {
"keyword" : false
},
"org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute" : {
"positionLength" : 1
},
"org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute" : {
"bytes" : "[e7 be 8e e5 91 b3 e3 81 97 e3 81 84]"
}
}
}, {
"token" : "た",
"start_offset" : 8,
"end_offset" : 9,
"type" : "word",
"position" : 4,
"extended_attributes" : {
"org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute" : {
"baseForm" : null
},
"org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute" : {
"inflectionType (en)" : "special-da",
"inflectionType" : "特殊・タ",
"inflectionForm (en)" : "base",
"inflectionForm" : "基本形"
},
"org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute" : {
"partOfSpeech (en)" : "auxiliary-verb",
"partOfSpeech" : "助動詞"
},
"org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute" : {
"reading (en)" : "ta",
"reading" : "タ",
"pronunciation (en)" : "ta",
"pronunciation" : "タ"
},
"org.apache.lucene.analysis.tokenattributes.KeywordAttribute" : {
"keyword" : false
},
"org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute" : {
"positionLength" : 1
},
"org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute" : {
"bytes" : "[e3 81 9f]"
}
}
} ]
} ]
}
Maybe this applies to CustomTokenizer only.
This is a remaining task.