GitHub mirror of "search/highlighter" - our actual code is hosted with Gerrit (please see https://www.mediawiki.org/wiki/Developer_access for contributing)
100
stars
37
forks
source link
regression: hunspell filter does not highlight correctly in 5.x #28
The following example works with 2.4.1, but not with 5.1.2 or 5.2.2. When hunspell is used for stemming, a term that is stemmed and expanded (e.g. "contract") is not highlighted correctly. I verified that the output of the analyzer with the Hunspell filter is the same in 2.4.1 and 5.2.2. Maybe it has something to do with the handling of term positions?
// complete Node.js example
var async = require('async');
var es = require('elasticsearch');
// Name of the custom analyzer under test. It is assigned to the `text` field
// in the index mapping and is also passed to the analyze request.
var TEST_ANALYZER = 'hunspell';
// Query string used by the match queries whose highlights are compared.
var SEARCH_TERMS = 'contract';
// Index used by every request in this script.
var INDEX_NAME = 'test_hunspell';
// Document body that the script analyzes and indexes into the `text` field.
// The literal is a single-quoted string continued across physical lines with
// trailing backslashes, so the physical line breaks are NOT part of the value;
// only the explicit \n escapes are. Do not re-wrap or re-indent these lines:
// any added whitespace would become part of the text.
// The word "Contract" occurs several times in the text, which is what a search
// for 'contract' is expected to highlight.
var TEST_TEXT = '\
8-K\n1\nd67628d8k.htm\n8-K\n8-K\nUNITED STATES\nSECURITIES AND EXCHANGE \
COMMISSION\nWashington, D.C. 20549\nFORM 8-K\nCURRENT REPORT Pursuant\nto \
Section 13 or 15(d) of the Securities Exchange Act of 1934\nDate of Report \
(Date of earliest event reported): August 6, 2015\nIndependence Contract \
Drilling, Inc.\n(Exact name of registrant as specified in its charter)\n \
Delaware\n001-36590\n37-1653648\n(State or other jurisdiction\nof \
incorporation)\n(Commission\nFile Number)\n(I.R.S. Employer\nIdentification \
No.)\n11601 North Galayda Street\nHouston, TX 77086\n(Address of principal \
executive offices)\n(281) 598-1230\n(Registrant’s telephone number, including \
area code)\nN/A (Former name or\nformer address, if changed since last \
report)\nCheck the appropriate box below\nif the Form 8-K filing is intended \
to simultaneously satisfy the filing obligation of the registrant under \
any of the following provisions (see General Instruction A.2. below):\n¨\nWritten \
communications pursuant to Rule 425 under the Securities Act (17 CFR 230.425)\
\n¨\nSoliciting material pursuant to Rule 14a-12 under the Exchange Act (17 \
CFR 240.14a-12)\n¨\nPre-commencement communications pursuant to Rule 14d-2(b) \
under the Exchange Act (17 CFR 240.14d-2(b))\n¨\nPre-commencement communications \
pursuant to Rule 13e-4(c) under the Exchange Act (17 CFR 240.13e-4(c))\nItem \
2.02\nResults of Operations and Financial Condition On August 6, 2015, \
Independence\nContract Drilling, Inc. (“ICD”) issued a press release reporting \
financial results for the second quarter and the six months ended June 30, \
2015. A copy of the press release is being furnished as Exhibit 99.1 hereto \
and is\nincorporated herein by reference. The information furnished pursuant \
to Item 2.02, including Exhibit 99.1, shall not be deemed\n“filed” for purposes \
of Section 18 of the Securities Exchange Act of 1934, as amended (the “Exchange \
Act”), is not subject to the liabilities of that section and is not deemed \
incorporated by reference in any filing of\nICD’s under the Exchange Act or \
the Securities Act of 1933, as amended, unless specifically identified \
therein as being incorporated therein by reference.\nItem 9.01\nFinancial \
Statements and Exhibits\n(d)\nExhibits\n99.1\nPress Release dated August 6, \
2015\nSIGNATURES\nPursuant to the requirements of the Securities Exchange \
Act of 1934, the registrant has duly caused this report to be signed on \
its behalf by\nthe undersigned hereunto duly authorized.\nIndependence \
Contract Drilling, Inc.\nDate: August 6, 2015\nBy:\n/s/ Philip A. \
Choyce\nName:\nPhilip A. Choyce\nTitle:\nSenior Vice President and Chief \
Financial Officer\nEXHIBIT INDEX\nExhibit\nNo.\nDescription\n99.1\nPress \
Release dated August 6, 2015';
// Shared Elasticsearch client for all requests below: talks to a single
// node at localhost:9200 using the client's 5.0 API definition.
var esClient = new es.Client({
  hosts: [ 'localhost:9200' ],
  apiVersion: '5.0',
});
// Horizontal rule printed above and below every section title on stdout.
var SECTION_RULE = '----------------------------------------------------------';

/**
 * Prints a section title framed by two horizontal rules.
 *
 * @param {string} title - Text printed between the rules.
 */
function printBanner(title) {
  console.log(SECTION_RULE);
  console.log(title);
  console.log(SECTION_RULE);
}

/**
 * Safely decodes the raw response body attached to a client error.
 *
 * Errors that never reached the server (or whose body is not JSON) have no
 * parseable `response`; calling JSON.parse on them would throw and hide the
 * original error, so those cases yield null instead.
 *
 * @param {?Object} err - Error passed to a client callback, or null/undefined.
 * @returns {?Object} The parsed response body, or null when there is none
 *     or it cannot be parsed as JSON.
 */
function parseErrorResponse(err) {
  if (!err || typeof err.response !== 'string') {
    return null;
  }
  try {
    return JSON.parse(err.response);
  } catch (parseErr) {
    return null;
  }
}

/**
 * Step 1: deletes the test index so every run starts from a clean state.
 * An `index_not_found_exception` is treated as success; any other error is
 * forwarded and aborts the waterfall.
 *
 * @param {function(?Error)} callback - Waterfall continuation.
 */
function deleteIndexIfExists(callback) {
  var params = {
    index: INDEX_NAME,
  };
  esClient.indices.delete(params, function (err) {
    var details = parseErrorResponse(err);
    if (details && details.error &&
        details.error.type === 'index_not_found_exception') {
      return callback(null);
    }
    return callback(err);
  });
}

/**
 * Step 2: creates the test index with the `hunspell` and `porter` custom
 * analyzers and maps the `text` field to TEST_ANALYZER.
 *
 * @param {function(?Error)} callback - Waterfall continuation.
 */
function createIndex(callback) {
  var params = {
    index: INDEX_NAME,
    body: {
      mappings: {
        default: {
          _all: { enabled: false },
          properties: {
            text: {
              analyzer: TEST_ANALYZER,
              type: 'string',
            },
          },
        },
      },
      settings: {
        analysis: {
          char_filter: {
            // Maps several single-quote code points to a plain apostrophe.
            single_quotes: {
              type: 'mapping',
              mappings: [
                '\\u0091=>\\u0027',
                '\\u0092=>\\u0027',
                '\\u2018=>\\u0027',
                '\\u2019=>\\u0027',
                '\\u201B=>\\u0027'
              ],
            },
          },
          filter: {
            en_US_porter: {
              type: 'stemmer',
              language: 'english',
            },
            en_US_hunspell: {
              type: 'hunspell',
              language: 'en_US',
              dedup: true,
            },
            english_stopwords: {
              type: 'stop',
              stopwords: '_english_',
            },
            word_delimiter: {
              type: 'word_delimiter',
              catenate_all: true,
              generate_number_parts: false,
              generate_word_parts: false,
              preserve_original: false,
              split_on_case_change: false,
              split_on_numerics: false,
              stem_english_possessive: true,
            },
          },
          analyzer: {
            // The two analyzers differ only in their final stemming filter.
            hunspell: {
              char_filter: [ 'single_quotes' ],
              filter: [
                'lowercase',
                'asciifolding',
                'word_delimiter',
                'english_stopwords',
                'en_US_hunspell',
              ],
              tokenizer: 'whitespace',
            },
            porter: {
              char_filter: [ 'single_quotes' ],
              filter: [
                'lowercase',
                'asciifolding',
                'word_delimiter',
                'english_stopwords',
                'en_US_porter',
              ],
              tokenizer: 'whitespace',
            },
          },
        },
      },
    },
  };
  // Wrap the callback so only `err` is forwarded: passing `callback` directly
  // would hand the client's extra result arguments to the next waterfall step.
  esClient.indices.create(params, function (err) {
    return callback(err);
  });
}

/**
 * Step 3: runs TEST_TEXT through TEST_ANALYZER and prints the resulting
 * tokens, space-separated, under a banner.
 *
 * @param {function(?Error)} callback - Waterfall continuation.
 */
function printAnalyzedTokens(callback) {
  var params = {
    index: INDEX_NAME,
    analyzer: TEST_ANALYZER,
    text: TEST_TEXT,
  };
  esClient.indices.analyze(params, function (err, res) {
    if (err) {
      return callback(err);
    }
    var tokens = res.tokens.map(function (entry) {
      return entry.token;
    });
    printBanner(' Indexed text using ' + TEST_ANALYZER + ' analyzer.');
    console.log(tokens.join(' '));
    return callback(null);
  });
}

/**
 * Step 4: indexes TEST_TEXT as document 1 and requests a refresh so the
 * following searches can see it.
 *
 * @param {function(?Error)} callback - Waterfall continuation.
 */
function indexTestDocument(callback) {
  var params = {
    index: INDEX_NAME,
    type: 'default',
    id: 1,
    body: {
      text: TEST_TEXT,
    },
    refresh: true,
  };
  // Forward only `err`; see the note in createIndex.
  esClient.index(params, function (err) {
    return callback(err);
  });
}

/**
 * Prints a banner, runs a match query for SEARCH_TERMS on the `text` field
 * with the given highlight settings, and pretty-prints the full response.
 *
 * @param {string} title - Banner text printed before the search is issued.
 * @param {Object} highlightFieldOptions - Value used for
 *     `highlight.fields.text` in the request body.
 * @param {function(?Error)} callback - Waterfall continuation.
 */
function searchWithHighlight(title, highlightFieldOptions, callback) {
  printBanner(title);
  var params = {
    index: INDEX_NAME,
    type: 'default',
    body: {
      query: {
        match: {
          text: {
            query: SEARCH_TERMS,
          },
        },
      },
      highlight: {
        fields: {
          text: highlightFieldOptions,
        },
      },
    },
  };
  esClient.search(params, function (err, res) {
    if (err) {
      return callback(err);
    }
    console.log(JSON.stringify(res, null, 4));
    return callback(null);
  });
}

/**
 * Final waterfall handler: always closes the client, then reports the error
 * (if any). The decoded response body is printed only when one is available,
 * so errors without a JSON body still get their stack trace printed.
 *
 * @param {?Error} err - First error raised by any step, if any.
 */
function reportAndClose(err) {
  esClient.close();
  if (err) {
    var details = parseErrorResponse(err);
    if (details !== null) {
      console.error(JSON.stringify(details, null, 4));
    }
    console.error(err.stack);
  }
}

// Run the reproduction: reset the index, show the analyzer output, index the
// document, then compare the experimental highlighter with the default one.
async.waterfall([
  deleteIndexIfExists,
  createIndex,
  printAnalyzedTokens,
  indexTestDocument,
  function (callback) {
    searchWithHighlight(
      ' No highlight returned using experimental highlighter.',
      { type: 'experimental' },
      callback
    );
  },
  function (callback) {
    searchWithHighlight(
      ' Correctly highlighted using plain highlighter.',
      {},
      callback
    );
  },
], reportAndClose);
The following example works with 2.4.1, but not 5.1.2 and 5.2.2. When hunspell is used for stemming, a term that is stemmed and expanded (e.g. contract) is not highlighted correctly. I verified that the output of the analyzer with Hunspell filter is the same between 2.4.1 and 5.2.2. Maybe it has something to do with the handling of term positions?