Open sheerun opened 5 years ago
tokenizer = PragmaticTokenizer::Tokenizer.new({ language: :pl, numbers: :all, downcase: false, contractions: { "os" => "osiedle", "os." => "osiedle" }, expand_contractions: true })
puts tokenizer.tokenize("Na os.Piłsudskiego")
The proper tokenization should be
["Na", "osiedle", "Piłsudskiego"]
while the tokenizer returns
["Na", "Osiedle", ".", "Piłsudskiego"]
The proper tokenization should be:
["Na", "osiedle", "Piłsudskiego"]
while the tokenizer returns: ["Na", "Osiedle", ".", "Piłsudskiego"]