CAMeL-Lab / camel_tools

A suite of Arabic natural language processing tools developed by the CAMeL Lab at New York University Abu Dhabi.
MIT License
413 stars 73 forks source link

[BUG] Top-scored analysis isn't "correct" #136

Closed mustafa0x closed 9 months ago

mustafa0x commented 10 months ago

Given the phrase 'كلم الرجل':

>>> words = 'كلم الرجل'

>>> from camel_tools.disambig.mle import MLEDisambiguator
... from camel_tools.tokenizers.morphological import MorphologicalTokenizer
... from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
... from camel_tools.morphology.database import MorphologyDB
... from camel_tools.morphology.analyzer import Analyzer
... 
... unfactored = BERTUnfactoredDisambiguator.pretrained()
... analyzer = Analyzer(db=MorphologyDB.builtin_db())
... mle_msa = MLEDisambiguator.pretrained('calima-msa-r13')
... msa_d3_tokenizer = MorphologicalTokenizer(disambiguator=mle_msa, scheme='d3tok')

# The root of كلم is reported as FOREIGN (since the analyzer assumes the word is "kilometer")
>>> analyzer.analyze(words)[0]['root']
'FOREIGN'
>>> unfactored.disambiguate([words])[0].analyses[0].analysis['root']
'FOREIGN'
>>> mle_msa.disambiguate([words])[0].analyses[0].analysis['root']
'FOREIGN'

>>> import requests
... import json
... 
... url = "https://camelira.abudhabi.nyu.edu/api/disambig"
... headers = {
...     "content-type": "application/json",
...     "Referer": "https://camelira.abudhabi.nyu.edu/",
... }
... data = {
...     "dialect": "msa",
...     "sentence": words,
... }
... response = requests.post(url, headers=headers, data=json.dumps(data))

# The Camelira API returns the correct root
>>> json.loads(response.text)['output']['disambig'][0]['analyses'][0]['analysis']['root']
'ك.ل.م'