typesense / typesense

Open Source alternative to Algolia + Pinecone and an Easier-to-Use alternative to ElasticSearch ⚡ 🔍 ✨ Fast, typo tolerant, in-memory fuzzy Search Engine for building delightful search experiences
https://typesense.org
GNU General Public License v3.0
20.89k stars 647 forks source link

The pipe character is making some tokens not get matched #1477

Closed Kimeiga closed 9 months ago

Kimeiga commented 9 months ago

Description

I'm using Typesense to query CC-CEDICT, a Chinese dictionary that uses pipes ("|") to delimit traditional/simplified character pairs in its definitions.

Like this: 余 余 [yu2] /(literary) I; me/variant of 餘|余[yu2]/

When I search for a simplified character, nothing comes up because the pipe character is preventing the simplified character from being matched. When I replace all pipe characters in the definitions with spaces, the simplified characters get matched as normal.

Steps to reproduce

Download cedict_ts.u8


import json

def parse_cedict_line(line):
    """Parse one CC-CEDICT line into a dict, or return None for non-entries.

    An entry line has the form
    ``TRADITIONAL SIMPLIFIED [pin1 yin1] /definition 1/definition 2/``.

    Args:
        line: A single line of the dictionary file; surrounding whitespace
            (including the trailing newline) is ignored.

    Returns:
        A dict with keys ``"t"`` (traditional), ``"s"`` (simplified),
        ``"p"`` (text between the first ``[`` and the first ``]``) and
        ``"d"`` (the definitions with the outer slashes removed; inner
        slashes are kept as separators). Returns None for comment lines
        (starting with ``#``), blank lines, and lines with fewer than two
        space-separated fields.
    """
    # Strip before testing so indented comments and whitespace-only lines
    # are skipped instead of being parsed as entries.
    line = line.strip()
    if not line or line.startswith("#"):
        return None

    # Split off the two headwords only; the rest keeps its internal spacing.
    fields = line.split(" ", 2)
    if len(fields) < 2:
        # Not an entry: there is no simplified form to read.
        return None
    traditional, simplified = fields[0], fields[1]
    remaining = fields[2] if len(fields) > 2 else ""

    # Extracting pronunciation and definitions
    pronunciation_start = remaining.find("[")
    pronunciation_end = remaining.find("]")
    pronunciation = remaining[pronunciation_start + 1 : pronunciation_end]

    # Skip the closing bracket and the single space that follows it.
    definitions = remaining[pronunciation_end + 2 :].strip("/")

    return {
        "t": traditional,
        "s": simplified,
        "p": pronunciation,
        "d": definitions,
    }

def convert_cedict_to_jsonl(cedict_file_path, output_file_path):
    """Convert a CC-CEDICT text file into a JSON Lines file.

    Each input line is stripped and handed to ``parse_cedict_line``; lines
    for which the parser returns a falsy value are skipped. Every parsed
    entry is written as one JSON object per line, with non-ASCII characters
    written as-is rather than escaped.

    Args:
        cedict_file_path: Path of the UTF-8 dictionary file to read.
        output_file_path: Path of the UTF-8 JSONL file to (over)write.
    """
    with open(cedict_file_path, "r", encoding="utf-8") as source:
        with open(output_file_path, "w", encoding="utf-8") as sink:
            for raw_line in source:
                entry = parse_cedict_line(raw_line.strip())
                if not entry:
                    continue
                sink.write(json.dumps(entry, ensure_ascii=False) + "\n")

# Convert the downloaded cedict_ts.u8 into output.jsonl; both paths are relative.
convert_cedict_to_jsonl("cedict_ts.u8", "output.jsonl")
/*
 *  Our JavaScript client library works on both the server and the browser.
 *  When using the library on the browser, please be sure to use the
 *  search-only API Key rather than the master API key since the latter
 *  has write access to Typesense and you don't want to expose that.
 */
const util = require('util');
const fs = require('fs/promises');
const Typesense = require('typesense');

/**
 * Reproduction script: creates the `cedict` collection, imports
 * `output.jsonl` into it, then runs a single search against field `d`
 * and prints the full result.
 *
 * Every request is awaited in order. Without that, the import can be sent
 * before the collection exists and the search can run before any document
 * has been indexed.
 */
(async () => {
    const client = new Typesense.Client({
        'nodes': [{
            'host': 'localhost', // For Typesense Cloud use xxx.a1.typesense.net
            'port': 8108,      // For Typesense Cloud use 443
            'protocol': 'http'   // For Typesense Cloud use https
        }],
        'apiKey': 'apikey',
        'connectionTimeoutSeconds': 2
    });

    const cedictSchema = {
        'name': 'cedict',
        'fields': [
            { 'name': 't', 'type': 'string' },
            { 'name': 's', 'type': 'string' },
            { 'name': 'p', 'type': 'string' },
            { 'name': 'd', 'type': 'string' }
        ]
    };

    try {
        const response = await client.collections().create(cedictSchema);
        console.log("Collection created:", response);
    } catch (error) {
        // Best-effort: log and keep going so the script can be re-run
        // after a failed create request.
        console.error("Error creating collection:", error);
    }

    const cedictData = await fs.readFile("output.jsonl");

    // Now import these documents into Typesense
    try {
        const response = await client.collections('cedict').documents().import(cedictData);
        console.log("Documents imported:", response);
    } catch (error) {
        console.error("Error importing documents:", error);
    }

    const searchParameters = {
        'q': '观', // <- change this character around between simplified/traditional
        'query_by': 'd',
    };

    try {
        const searchResults = await client.collections('cedict')
            .documents()
            .search(searchParameters);
        console.log(util.inspect(searchResults, { showHidden: false, depth: null, colors: true }));
    } catch (error) {
        console.error("Error searching documents:", error);
    }
})().catch((error) => {
    // Anything not handled above (e.g. output.jsonl cannot be read) ends up
    // here instead of becoming an unhandled promise rejection.
    console.error(error);
    process.exitCode = 1;
});

Expected Behavior

haki@colo:~/code/typesense-test$ node typesense.js
{
  facet_counts: [],
  found: 3,
  hits: [
    {
      document: {
        d: 'Japanese variant of 觀|观[guan1]',
        id: '97662',
        p: 'guan1',
        s: '観',
        t: '観'
      },
      highlight: {
        d: {
          matched_tokens: [ '觀' ],
          snippet: 'Japanese variant of <mark>觀</mark>|观[guan1]'
        }
      },
      highlights: [
        {
          field: 'd',
          matched_tokens: [ '觀' ],
          snippet: 'Japanese variant of <mark>觀</mark>|观[guan1]'
        }
      ],
      text_match: 578730089005449300,
      text_match_info: {
        best_field_score: '1108074561536',
        best_field_weight: 15,
        fields_matched: 1,
        score: '578730089005449337',
        tokens_matched: 1
      }
    },
    {
      document: {
        d: 'variant of 觀|观[guan1]',
        id: '97463',
        p: 'guan1',
        s: '覌',
        t: '覌'
      },
      highlight: {
        d: {
          matched_tokens: [ '觀' ],
          snippet: 'variant of <mark>觀</mark>|观[guan1]'
        }
      },
      highlights: [
        {
          field: 'd',
          matched_tokens: [ '觀' ],
          snippet: 'variant of <mark>觀</mark>|观[guan1]'
        }
      ],
      text_match: 578730089005449300,
      text_match_info: {
        best_field_score: '1108074561536',
        best_field_weight: 15,
        fields_matched: 1,
        score: '578730089005449337',
        tokens_matched: 1
      }
    },
    {
      document: {
        d: 'Mt Potala at Zhoushan 舟山市 in Zhejiang, one of the Four Sacred Mountains and Bodhimanda of Guanyin 觀音|观音 (Avalokiteśvara)',
        id: '52704',
        p: 'Pu3 tuo2 shan1',
        s: '普陀山',
        t: '普陀山'
      },
      highlight: {
        d: {
          matched_tokens: [ '觀' ],
          snippet: 'Mt Potala at Zhoushan 舟山市 in Zhejiang, one of the Four Sacred Mountains and Bodhimanda of Guanyin <mark>觀</mark>音|观音 (Avalokiteśvara)'
        }
      },
      highlights: [
        {
          field: 'd',
          matched_tokens: [ '觀' ],
          snippet: 'Mt Potala at Zhoushan 舟山市 in Zhejiang, one of the Four Sacred Mountains and Bodhimanda of Guanyin <mark>觀</mark>音|观音 (Avalokiteśvara)'
        }
      ],
      text_match: 578730089005449300,
      text_match_info: {
        best_field_score: '1108074561536',
        best_field_weight: 15,
        fields_matched: 1,
        score: '578730089005449337',
        tokens_matched: 1
      }
    }
  ],
  out_of: 122289,
  page: 1,
  request_params: { collection_name: 'cedict', per_page: 10, q: '觀' },
  search_cutoff: false,
  search_time_ms: 0
}

Actual Behavior

haki@colo:~/code/typesense-test$ node typesense.js
{
  facet_counts: [],
  found: 0,
  hits: [],
  out_of: 122289,
  page: 1,
  request_params: { collection_name: 'cedict', per_page: 10, q: '观' },
  search_cutoff: false,
  search_time_ms: 0
}

Metadata

Typesense Version: 0.25.2

OS: WSL2 Ubuntu 22 LTS

Kimeiga commented 9 months ago

I think it could be because Typesense only does prefix matching: when I search for a character, it doesn't give me any results that have that character in the middle or at the end of words. This is a big problem, because although you can solve the individual-character case by adding spaces between each character, you can then no longer match a whole word supplied as the search input. Is there anything I can do to search without being limited to prefix matching?

Kimeiga commented 9 months ago

Never mind, I solved it by using infix search (see #393):

// Every dictionary field is a string with infix indexing switched on, so
// a query can match in the middle of a token and not only at its start.
let cedictSchema = {
  name: 'cedict',
  fields: ['t', 's', 'p', 'd'].map((fieldName) => ({
    name: fieldName,
    type: 'string',
    infix: true,
  })),
};

// Query all fields and always use infix matching.
let searchParameters = {
  q: '汉',
  query_by: '*',
  infix: 'always',
};