dainiusjocas / lucene-grep

Grep-like utility based on Lucene Monitor compiled with GraalVM native-image
Apache License 2.0
190 stars 5 forks source link

Print available text analysis components #156

Closed dainiusjocas closed 2 years ago

dainiusjocas commented 2 years ago

E.g., running `lmgrep --show-analysis-components | jq` prints:

{
  "analyzers": [
    "arabic",
    "armenian",
    "basque",
    "bengali",
    "brazilian",
    "bulgarian",
    "catalan",
    "cjk",
    "classic",
    "collationkey",
    "czech",
    "danish",
    "dutch",
    "english",
    "estonian",
    "finnish",
    "french",
    "galician",
    "german",
    "greek",
    "hindi",
    "hungarian",
    "indonesian",
    "irish",
    "italian",
    "keyword",
    "latvian",
    "lithuanian",
    "norwegian",
    "persian",
    "polish",
    "portuguese",
    "romanian",
    "russian",
    "simple",
    "sorani",
    "spanish",
    "standard",
    "stop",
    "swedish",
    "thai",
    "turkish",
    "uax29urlemail",
    "unicodewhitespace",
    "whitespace"
  ],
  "char-filters": [
    "cjkwidth",
    "htmlstrip",
    "mapping",
    "patternreplace",
    "persian"
  ],
  "tokenizers": [
    "classic",
    "edgengram",
    "keyword",
    "letter",
    "ngram",
    "pathhierarchy",
    "pattern",
    "simplepattern",
    "simplepatternsplit",
    "standard",
    "thai",
    "uax29urlemail",
    "whitespace",
    "wikipedia"
  ],
  "token-filters": [
    "apostrophe",
    "arabicnormalization",
    "arabicstem",
    "armeniansnowballstem",
    "asciifolding",
    "basquesnowballstem",
    "bengalinormalization",
    "bengalistem",
    "brazilianstem",
    "bulgarianstem",
    "capitalization",
    "catalansnowballstem",
    "cjkbigram",
    "cjkwidth",
    "classic",
    "codepointcount",
    "commongrams",
    "commongramsquery",
    "concatenategraph",
    "czechstem",
    "danishsnowballstem",
    "daterecognizer",
    "decimaldigit",
    "delimitedboost",
    "delimitedpayload",
    "delimitedtermfrequency",
    "dictionarycompoundword",
    "dropifflagged",
    "dutchsnowballstem",
    "edgengram",
    "elision",
    "englishminimalstem",
    "englishpossessive",
    "estoniansnowballstem",
    "fingerprint",
    "finnishlightstem",
    "fixbrokenoffsets",
    "fixedshingle",
    "flattengraph",
    "frenchlightstem",
    "frenchminimalstem",
    "galicianminimalstem",
    "galicianstem",
    "germanlightstem",
    "germanminimalstem",
    "germannormalization",
    "germanstem",
    "greeklowercase",
    "greekstem",
    "hindinormalization",
    "hindistem",
    "hungarianlightstem",
    "hunspellstem",
    "hyphenatedwords",
    "hyphenationcompoundword",
    "indicnormalization",
    "indonesianstem",
    "irishlowercase",
    "irishsnowballstem",
    "italianlightstem",
    "keepword",
    "keywordmarker",
    "keywordrepeat",
    "kpsnowballstem",
    "kstem",
    "latvianstem",
    "length",
    "limittokencount",
    "limittokenoffset",
    "limittokenposition",
    "lithuaniansnowballstem",
    "lovinssnowballstem",
    "lowercase",
    "minhash",
    "ngram",
    "norwegianlightstem",
    "norwegianminimalstem",
    "norwegiannormalization",
    "numericpayload",
    "patterncapturegroup",
    "patternreplace",
    "patterntyping",
    "persiannormalization",
    "porterstem",
    "portugueselightstem",
    "portugueseminimalstem",
    "portuguesestem",
    "protectedterm",
    "removeduplicates",
    "reversestring",
    "romaniansnowballstem",
    "russianlightstem",
    "scandinavianfolding",
    "scandinaviannormalization",
    "serbiannormalization",
    "shingle",
    "snowballporter",
    "soraninormalization",
    "soranistem",
    "spanishlightstem",
    "spanishminimalstem",
    "stemmeroverride",
    "stempelpolishstem",
    "stop",
    "swedishlightstem",
    "swedishminimalstem",
    "synonym",
    "synonymgraph",
    "telugunormalization",
    "telugustem",
    "tokenoffsetpayload",
    "trim",
    "truncate",
    "turkishlowercase",
    "turkishsnowballstem",
    "type",
    "typeaspayload",
    "typeassynonym",
    "uppercase",
    "worddelimiter",
    "worddelimitergraph"
  ]
}