wikimedia / search-highlighter

GitHub mirror of "search/highlighter" - our actual code is hosted with Gerrit (please see https://www.mediawiki.org/wiki/Developer_access for contributing).
100 stars 37 forks source link

regression: hunspell filter does not highlight correctly in 5.x #28

Closed rpedela closed 7 years ago

rpedela commented 7 years ago

The following example works with 2.4.1, but not with 5.1.2 or 5.2.2. When hunspell is used for stemming, a term that is stemmed and expanded (e.g. contract) is not highlighted correctly. I verified that the output of the analyzer with the Hunspell filter is the same between 2.4.1 and 5.2.2. Maybe it has something to do with the handling of term positions?

// complete Node.js example

var async = require('async');
var es = require('elasticsearch');

// Name of the analyzer under test. It is used both as the analyzer of the
// `text` field in the index mapping and for the standalone analyze request.
var TEST_ANALYZER = 'hunspell';

// Query string passed to the `match` query in both highlight searches.
var SEARCH_TERMS = 'contract';

// Index that the script deletes, recreates and searches on every run.
var INDEX_NAME = 'test_hunspell';
// Document body that gets analyzed and indexed. It contains the word
// "Contract" several times, which is what SEARCH_TERMS is expected to hit.
// The literal is one single-quoted string continued with a trailing backslash
// on each line, so the leading indentation of every continuation line is part
// of the string value. Do not re-indent or re-wrap it.
var TEST_TEXT = '\
    8-K\n1\nd67628d8k.htm\n8-K\n8-K\nUNITED STATES\nSECURITIES AND EXCHANGE \
    COMMISSION\nWashington, D.C. 20549\nFORM 8-K\nCURRENT REPORT Pursuant\nto \
    Section 13 or 15(d) of the Securities Exchange Act of 1934\nDate of Report \
    (Date of earliest event reported): August 6, 2015\nIndependence Contract \
    Drilling, Inc.\n(Exact name of registrant as specified in its charter)\n \
    Delaware\n001-36590\n37-1653648\n(State or other jurisdiction\nof \
    incorporation)\n(Commission\nFile Number)\n(I.R.S. Employer\nIdentification \
     No.)\n11601 North Galayda Street\nHouston, TX 77086\n(Address of principal \
    executive offices)\n(281) 598-1230\n(Registrant’s telephone number, including \
    area code)\nN/A (Former name or\nformer address, if changed since last \
    report)\nCheck the appropriate box below\nif the Form 8-K filing is intended \
    to simultaneously satisfy the filing obligation of the registrant under \
    any of the following provisions (see General Instruction A.2. below):\n¨\nWritten \
    communications pursuant to Rule 425 under the Securities Act (17 CFR 230.425)\
    \n¨\nSoliciting material pursuant to Rule 14a-12 under the Exchange Act (17 \
    CFR 240.14a-12)\n¨\nPre-commencement communications pursuant to Rule 14d-2(b) \
    under the Exchange Act (17 CFR 240.14d-2(b))\n¨\nPre-commencement communications \
    pursuant to Rule 13e-4(c) under the Exchange Act (17 CFR 240.13e-4(c))\nItem \
    2.02\nResults of Operations and Financial Condition On August 6, 2015, \
    Independence\nContract Drilling, Inc. (“ICD”) issued a press release reporting \
    financial results for the second quarter and the six months ended June 30, \
    2015. A copy of the press release is being furnished as Exhibit 99.1 hereto \
    and is\nincorporated herein by reference. The information furnished pursuant \
    to Item 2.02, including Exhibit 99.1, shall not be deemed\n“filed” for purposes \
    of Section 18 of the Securities Exchange Act of 1934, as amended (the “Exchange \
    Act”), is not subject to the liabilities of that section and is not deemed \
    incorporated by reference in any filing of\nICD’s under the Exchange Act or \
    the Securities Act of 1933, as amended, unless specifically identified \
    therein as being incorporated therein by reference.\nItem 9.01\nFinancial \
    Statements and Exhibits\n(d)\nExhibits\n99.1\nPress Release dated August 6, \
    2015\nSIGNATURES\nPursuant to the requirements of the Securities Exchange \
    Act of 1934, the registrant has duly caused this report to be signed on \
    its behalf by\nthe undersigned hereunto duly authorized.\nIndependence \
    Contract Drilling, Inc.\nDate: August 6, 2015\nBy:\n/s/ Philip A. \
    Choyce\nName:\nPhilip A. Choyce\nTitle:\nSenior Vice President and Chief \
    Financial Officer\nEXHIBIT INDEX\nExhibit\nNo.\nDescription\n99.1\nPress \
    Release dated August 6, 2015';

// Connection settings for the Elasticsearch client: the node is expected on
// localhost:9200 and the client is configured with apiVersion '5.0'.
var clientOptions = {
    apiVersion: '5.0',
    hosts: [ 'localhost:9200' ],
};

// Shared client used by every step of the waterfall below.
var esClient = new es.Client(clientOptions);

async.waterfall([
    function (callback) {

        var params = {
            index: INDEX_NAME,
        };

        esClient.indices.delete(params, function (err) {

            if (err && err.response) {
                var res = JSON.parse(err.response);
                if (res.error && res.error.type === 'index_not_found_exception') {
                    return callback(null);
                }
            }

            return callback(err);
        });
    },
    function (callback) {

        var params = {
            index: INDEX_NAME,
            body: {
                mappings: {
                    default: {
                        _all: { enabled: false },
                        properties: {
                            text: {
                                analyzer: TEST_ANALYZER,
                                type: 'string',
                            },
                        },
                    },
                },
                settings: {
                    analysis: {
                        char_filter: {
                            single_quotes: {
                                type: 'mapping',
                                mappings: [
                                    '\\u0091=>\\u0027',
                                    '\\u0092=>\\u0027',
                                    '\\u2018=>\\u0027',
                                    '\\u2019=>\\u0027',
                                    '\\u201B=>\\u0027'
                                ],
                            },
                        },
                        filter: {
                            en_US_porter: {
                                type: 'stemmer',
                                language: 'english',
                            },
                            en_US_hunspell: {
                                type: 'hunspell',
                                language: 'en_US',
                                dedup: true,
                            },
                            english_stopwords: {
                                type: 'stop',
                                stopwords: '_english_',
                            },
                            word_delimiter: {
                                type: 'word_delimiter',
                                catenate_all: true,
                                generate_number_parts: false,
                                generate_word_parts: false,
                                preserve_original: false,
                                split_on_case_change: false,
                                split_on_numerics: false,
                                stem_english_possessive: true,
                            },
                        },
                        analyzer: {
                            hunspell: {
                                char_filter: [ 'single_quotes' ],
                                filter: [
                                    'lowercase',
                                    'asciifolding',
                                    'word_delimiter',
                                    'english_stopwords',
                                    'en_US_hunspell',
                                ],
                                tokenizer: 'whitespace',
                            },
                            porter: {
                                char_filter: [ 'single_quotes' ],
                                filter: [
                                    'lowercase',
                                    'asciifolding',
                                    'word_delimiter',
                                    'english_stopwords',
                                    'en_US_porter',
                                ],
                                tokenizer: 'whitespace',
                            },
                        },
                    },
                },
            },
        };

        esClient.indices.create(params, function (err) {
            return callback(err);
        });
    },
    function (callback) {

        var params = {
            index: INDEX_NAME,
            analyzer: TEST_ANALYZER,
            text: TEST_TEXT,
        };

        esClient.indices.analyze(params, function (err, res) {

            if (err) {
                return callback(err);
            }

            var tokens = [];
            for (var i = 0; i < res.tokens.length; i++) {
                tokens.push(res.tokens[i].token);
            }

            console.log('----------------------------------------------------------');
            console.log('  Indexed text using ' + TEST_ANALYZER + ' analyzer.');
            console.log('----------------------------------------------------------');
            console.log(tokens.join(' '));

            return callback(null);
        });
    },
    function (callback) {

        var params = {
            index: INDEX_NAME,
            type: 'default',
            id: 1,
            body: {
                text: TEST_TEXT,
            },
            refresh: true,
        };

        esClient.index(params, function (err) {
            return callback(err);
        });
    },
    function (callback) {

        console.log('----------------------------------------------------------');
        console.log('  No highlight returned using experimental highlighter.');
        console.log('----------------------------------------------------------');

        var params = {
            index: INDEX_NAME,
            type: 'default',
            body: {
                query: {
                    match: {
                        text: {
                            query: SEARCH_TERMS,
                        },
                    },
                },
                highlight: {
                    fields: {
                        text: {
                            type: 'experimental',
                        },
                    },
                },
            },
        };

        esClient.search(params, function (err, res) {

            if (err) {
                return callback(err);
            }

            console.log(JSON.stringify(res,null,4));

            return callback(null);
        });
    },
    function (callback) {

        console.log('----------------------------------------------------------');
        console.log('  Correctly highlighted using plain highlighter.');
        console.log('----------------------------------------------------------');

        var params = {
            index: INDEX_NAME,
            type: 'default',
            body: {
                query: {
                    match: {
                        text: {
                            query: SEARCH_TERMS,
                        },
                    },
                },
                highlight: {
                    fields: {
                        text: {},
                    },
                },
            },
        };

        esClient.search(params, function (err, res) {

            if (err) {
                return callback(err);
            }

            console.log(JSON.stringify(res,null,4));

            return callback(null);
        });
    },
],
function (err) {
    esClient.close();
    if (err) {
        console.error(JSON.stringify(JSON.parse(err.response),null,4));
        console.error(err.stack);
    }
});
rpedela commented 7 years ago

Here is my hunspell config: hunspell.zip

nomoa commented 7 years ago

Thanks for the report. Should be fixed with https://gerrit.wikimedia.org/r/#/c/341317/