elastic / elasticsearch

Free and Open Source, Distributed, RESTful Search Engine
https://www.elastic.co/products/elasticsearch
Other
1.51k stars 24.89k forks source link

Highlighting of a field populated via copy_to produces garbled highlights #111221

Open roytmana opened 4 months ago

roytmana commented 4 months ago

Elasticsearch Version

8.14.3

Installed Plugins

No response

Java Version

bundled

OS Version

Windows 11

Problem Description

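Highlighting a field that is populated via copy_to produces garbled highlights: the <em> tags wrap the wrong characters. In the reproduction below, searching the "combined" field for "fox" returns the highlight "<em>boa</em>t trip" rather than the expected "quick brown <em>fox</em>".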

Steps to Reproduce

PUT test-copy_to
{
  "mappings": {
    "properties": {
      "text1": {
        "type":        "text",
        "index_options": "offsets",
        "copy_to": ["combined"]
      },
      "text2": {
        "type":        "text",
        "index_options": "offsets",
        "copy_to": ["combined"]
      },
      "combined": {
        "type":        "text",
        "index_options": "offsets"
      }
    }
  }
}
PUT test-copy_to/_doc/1
{
  "text1": ["quick brown fox", "jumped over high fence"],
  "text2": ["flying over", "boat trip"]
}
GET test-copy_to/_search
{
  "query": {
    "match": {
      "combined": "fox"
    }
  },
  "highlight": {
    "fields": {
      "combined": {} 
    }
  }
}

Response:

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 1,
      "relation": "eq"
    },
    "max_score": 0.2876821,
    "hits": [
      {
        "_index": "test-copy_to",
        "_id": "1",
        "_score": 0.2876821,
        "_source": {
          "text1": [
            "quick brown fox",
            "jumped over high fence"
          ],
          "text2": [
            "flying over",
            "boat trip"
          ]
        },
        "highlight": {
          "combined": [
            "<em>boa</em>t trip"
          ]
        }
      }
    ]
  }
}

Extract from the highlight output of my original application:

"subjectTerms.combinedPreferred.name": [
  "c<em>ounterna</em>rcotics",
  "can<em>nabis\u0000co</em>"
]

Result

      {
        "fields": {
          "subjectTerms.term.name": [
            "drugs",
            "law enforcement agencies",
            "cannabis",
            "compliance oversight",
            "marijuana",
            "federal spending",
            "controlled substances",
            "state law",
            "drug enforcement"
          ],
          "subjectTerms.combinedPreferred.name": [
            "marijuana",
            "cannabis",
            "state laws",
            "counternarcotics",
            "drugs",
            "law enforcement agencies",
            "cannabis",
            "compliance oversight",
            "marijuana",
            "federal spending",
            "controlled substances",
            "state law",
            "drug enforcement"
          ]
        },
        "highlight": {
          "subjectTerms.term.name": [
            "<em>cannabis</em>"
          ],
          "subjectTerms.combinedPreferred.name": [
            "c<em>ounterna</em>rcotics",
            "can<em>nabis\u0000co</em>"
          ]
        }
      }

Query Snippet

{
  "query": {
    "bool":{
      "should" : [
        {
          "multi_match": {
            "query": "marijuana",
            "type": "phrase",
            "fields":[
              "subjectTerms.term.name^2",
              "subjectTerms.combinedPreferred.name",
              "subjectTerms.combinedPreferred.name.prefix"
            ]
          }
        }
      ]
    }
  },
  "fields":[
    "subjectTerms.combinedPreferred.name*",
    "subjectTerms.term.name*"
  ],
  "highlight": {
    "fragment_size": 200,
    "require_field_match":true,
    "fields": {
      "subjectTerms.term.name*": {
        "number_of_fragments":0
      },
      "subjectTerms.combinedPreferred.name": {
        "number_of_fragments":0
      }
    }
  },
  "size":500
}

Mapping Snippet

      "subjectTerms": {
        "properties": {
          "frequency": {
            "type": "integer"
          },
          "score": {
            "type": "integer"
          },
          "term": {
            "properties": {
              "id": {
                "type": "long",
                "copy_to": [
                  "subjectTerms.combinedPreferred.id"
                ]
              },
              "name": {
                "type": "text",
                "index_options": "offsets",
                "fields": {
                  "keyword": {
                    "type": "keyword"
                  },
                  "lowercase": {
                    "type": "keyword",
                    "normalizer": "lowercase"
                  },
                  "prefix": {
                    "type": "text",
                    "analyzer": "prefix",
                    "search_analyzer": "prefix_search",
                    "index_options": "offsets"
                  }
                },
                "copy_to": [
                  "subjectTerms.combinedPreferred.name"
                ]
              },
              "preferredTerm": {
                "properties": {
                  "id": {
                    "type": "long",
                    "copy_to": [
                      "subjectTerms.combinedPreferred.id"
                    ]
                  },
                  "name": {
                    "type": "text",
                    "index_options": "offsets",
                    "fields": {
                      "keyword": {
                        "type": "keyword"
                      },
                      "lowercase": {
                        "type": "keyword",
                        "normalizer": "lowercase"
                      },
                      "prefix": {
                        "type": "text",
                        "analyzer": "prefix",
                        "search_analyzer": "prefix_search",
                        "index_options": "offsets"
                      }
                    },
                    "copy_to": [
                      "subjectTerms.combinedPreferred.name"
                    ]
                  }
                }
              },
              "nonPreferredTerms": {
                "properties": {
                  "id": {
                    "type": "long",
                    "copy_to": [
                      "subjectTerms.combinedPreferred.id"
                    ]
                  },
                  "name": {
                    "type": "text",
                    "index_options": "offsets",
                    "fields": {
                      "keyword": {
                        "type": "keyword"
                      },
                      "lowercase": {
                        "type": "keyword",
                        "normalizer": "lowercase"
                      },
                      "prefix": {
                        "type": "text",
                        "analyzer": "prefix",
                        "search_analyzer": "prefix_search",
                        "index_options": "offsets"
                      }
                    },
                    "copy_to": [
                      "subjectTerms.combinedPreferred.name"
                    ]
                  }
                }
              }
            }
          },
          "combinedPreferred": {
            "properties": {
              "id": {
                "type": "long"
              },
              "name": {
                "type": "text",
                "index_options": "offsets",
                "fields": {
                  "keyword": {
                    "type": "keyword"
                  },
                  "lowercase": {
                    "type": "keyword",
                    "normalizer": "lowercase"
                  },
                  "prefix": {
                    "type": "text",
                    "analyzer": "prefix",
                    "search_analyzer": "prefix_search",
                    "index_options": "offsets"
                  }
                }
              }
            }
          }
        }
      }

Logs (if relevant)

No response

elasticsearchmachine commented 4 months ago

Pinging @elastic/es-search-relevance (Team:Search Relevance)