wellcomecollection / catalogue-api

:crystal_ball: The API for searching the Wellcome Collection catalogue.
https://developers.wellcomecollection.org
MIT License
4 stars 0 forks source link

API returns a 500 error when regex-like characters sneak into the search templates #696

Closed alexwlchan closed 1 year ago

alexwlchan commented 1 year ago

Observe the following error: https://api.wellcomecollection.org/catalogue/v2/works?workType=a%3C&aggregations=workType

This is coming from an unhandled error in the Elasticsearch response:

{
  "error": {
    "root_cause": [
      {
        "type": "x_content_parse_exception",
        "reason": "[1:349] [terms] failed to parse field [include]"
      }
    ],
    "type": "x_content_parse_exception",
    "reason": "[1:349] [terms] failed to parse field [include]",
    "caused_by": {
      "type": "illegal_argument_exception",
      "reason": "expected '>' at position 14"
    }
  },
  "status": 400
}

which comes from this query:

POST /works-indexed-2023-06-09/_search/template
{
  "source": " { {{#query}}   \"query\": {   \"bool\": {     \"should\": [       {         \"span_first\": {           \"match\": {             \"span_term\": {               \"query.title.shingles\": \"{{query}}\"             }           },           \"end\": 1,           \"boost\": 1000.0,           \"_name\": \"start of title\"         }       },       {         \"multi_match\": {           \"query\": \"{{query}}\",           \"fields\": [             \"query.id^1000.0\",             \"query.identifiers.value^1000.0\",             \"query.items.id^1000.0\",             \"query.items.identifiers.value^1000.0\",             \"query.images.id^1000.0\",             \"query.images.identifiers.value^1000.0\",             \"query.referenceNumber^1000.0\",             \"query.allIdentifiers^1000.0\"           ],           \"type\": \"best_fields\",           \"analyzer\": \"whitespace_analyzer\",           \"operator\": \"Or\",           \"_name\": \"identifiers\"         }       },       {         \"dis_max\": {           \"queries\": [             {               \"multi_match\": {                 \"query\": \"{{query}}\",                 \"fields\": [                   \"query.titlesAndContributors^100.0\",                   \"query.titlesAndContributors.english^100.0\",                   \"query.titlesAndContributors.shingles^100.0\"                 ],                 \"type\": \"best_fields\",                 \"minimum_should_match\": \"-30%\",                 \"operator\": \"Or\",                 \"_name\": \"title and contributor exact spellings\"               }             },             {               \"multi_match\": {                 \"query\": \"{{query}}\",                 \"fields\": [                   \"query.titlesAndContributors.arabic\",                   \"query.titlesAndContributors.bengali\",                   \"query.titlesAndContributors.french\",                   \"query.titlesAndContributors.german\",                   
\"query.titlesAndContributors.hindi\",                   \"query.titlesAndContributors.italian\"                 ],                 \"type\": \"best_fields\",                 \"minimum_should_match\": \"-30%\",                 \"operator\": \"Or\",                 \"_name\": \"non-english titles and contributors\"               }             }           ]         }       },       {         \"bool\": {           \"must\": [             {               \"multi_match\": {                 \"query\": \"{{query}}\",                 \"fields\": [                   \"query.collectionPath.path.clean\",                   \"query.collectionPath.label.cleanPath\",                   \"query.collectionPath.label\",                   \"query.collectionPath.path.keyword\"                 ],                 \"operator\": \"Or\",                 \"_name\": \"relations paths\"               }             }           ],           \"should\": [             {               \"multi_match\": {                 \"query\": \"{{query}}\",                 \"fields\": [\"query.title^100.0\", \"query.description^10.0\"],                 \"type\": \"cross_fields\",                 \"operator\": \"Or\",                 \"_name\": \"relations text\"               }             }           ]         }       },       {         \"multi_match\": {           \"query\": \"{{query}}\",           \"fields\": [             \"query.contributors.agent.label^1000.0\",             \"query.subjects.concepts.label^10.0\",             \"query.genres.concepts.label^10.0\",             \"query.production.label^10.0\",             \"query.description\",             \"query.physicalDescription\",             \"query.languages.label\",             \"query.edition\",             \"query.notes.contents\",             \"query.lettering\"           ],           \"type\": \"cross_fields\",           \"minimum_should_match\": \"-30%\",           \"operator\": \"Or\",           \"_name\": \"data\"         }       },       {   
      \"multi_match\": {           \"query\": \"{{query}}\",           \"fields\": [             \"query.title.shingles_cased^1000.0\",             \"query.alternativeTitles.shingles_cased^100.0\",             \"query.partOf.title.shingles_cased^10.0\"           ],           \"type\": \"most_fields\",           \"minimum_should_match\": \"-30%\",           \"operator\": \"Or\",           \"_name\": \"shingles cased\"         }       }     ],     \"filter\": [       {         \"term\": {           \"type\": {             \"value\": \"Visible\"           }         }       }     ],     \"minimum_should_match\": \"1\"   } } ,   {{/query}}    \"from\": \"{{from}}\",   \"size\": \"{{size}}\",   \"_source\": {     \"includes\": {{#toJson}}includes{{/toJson}}   },    {{#aggs}}   \"aggs\": {{#toJson}}aggs{{/toJson}},   {{/aggs}}    {{#postFilter}}   \"post_filter\": {{#toJson}}postFilter{{/toJson}},   {{/postFilter}}    \"sort\": [     {{#sortByDate}}     {       \"query.production.dates.range.from\": {         \"order\": \"{{sortByDate}}\"       }     },     {{/sortByDate}}     {{#sortByScore}}     {       \"_score\": {         \"order\": \"desc\"       }     },     {{/sortByScore}}     {       \"query.id\": {         \"order\": \"asc\"       }     }   ] } ",
  "params": {
    "query": null,
    "from": 0,
    "size": 10,
    "sortByDate": null,
    "sortByScore": false,
    "includes": [
      "display",
      "type"
    ],
    "aggs": {
      "format": {
        "filter": {
          "bool": {}
        },
        "aggs": {
          "format": {
            "terms": {
              "field": "aggregatableValues.workType",
              "size": 30,
              "order": [
                {
                  "_count": "desc"
                },
                {
                  "_key": "asc"
                }
              ]
            }
          },
          "self": {
            "filter": {
              "terms": {
                "query.format.id": [
                  "a<"
                ]
              }
            },
            "aggs": {
              "format": {
                "terms": {
                  "field": "aggregatableValues.workType",
                  "size": 30,
                  "include": ".*(\\\"(a<)\\\").*",
                  "min_doc_count": 0,
                  "order": [
                    {
                      "_count": "desc"
                    },
                    {
                      "_key": "asc"
                    }
                  ]
                }
              }
            }
          }
        }
      }
    },
    "postFilter": {
      "bool": {
        "must": [
          {
            "term": {
              "type": {
                "value": "Visible"
              }
            }
          },
          {
            "terms": {
              "query.format.id": [
                "a<"
              ]
            }
          }
        ]
      }
    }
  }
}

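I suspect this part of the query is the issue: angle brackets are special characters in Elasticsearch's regex syntax (`<` opens a numeric interval such as `<1-10>`), so the `<` in `a<` is being interpreted as a regex operator rather than as a literal character: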

                  "field": "aggregatableValues.workType",
                  "size": 30,
                  "include": ".*(\\\"(a<)\\\").*",
paul-butcher commented 1 year ago

I originally omitted the optional extra regex characters from the escaping (failing to spot that ALL is the default, which enables them).