aboSamoor / polyglot

Multilingual text (NLP) processing toolkit
http://polyglot-nlp.com
Other
2.31k stars 337 forks source link

Tokenization and NER #59

Open tgalery opened 8 years ago

tgalery commented 8 years ago

Hi there, I'm using polyglot for tokenization and NER extraction, and I'm using the output of both as features in a machine learning model. Since I know in advance which language I am processing, I instantiate a Text object using language hinting.

text = Text(text_string, hint_language_code="pt")

Now, for some reason, it seems that NER doesn't rely on the language hint passed to the constructor. It tries to infer the language again and sometimes gets it wrong (e.g. it detects Portuguese as gl). Since there are no ner2 models available for Galician, I get a ValueError: Package u'ner2.gl' not found in index.

Here is the full stack trace:

--> 238             ne_tuples = [((ent.start, ent.end), ent.tag, 1.) for ent in sent.entities]
    239             ne_entity_range = get_target_entity_range(ne_tuples, entity_name, sent.tokens)
    240             if ne_entity_range:

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/decorators.pyc in __get__(self, obj, cls)
     18     if obj is None:
     19         return self
---> 20     value = obj.__dict__[self.func.__name__] = self.func(obj)
     21     return value
     22 

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/text.pyc in entities(self)
    130     prev_tag = u'O'
    131     chunks = []
--> 132     for i, (w, tag) in enumerate(self.ne_chunker.annotate(self.words)):
    133       if tag != prev_tag:
    134         if prev_tag == u'O':

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/decorators.pyc in __get__(self, obj, cls)
     18     if obj is None:
     19         return self
---> 20     value = obj.__dict__[self.func.__name__] = self.func(obj)
     21     return value
     22 

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/text.pyc in ne_chunker(self)
     98   @cached_property
     99   def ne_chunker(self):
--> 100     return get_ner_tagger(lang=self.language.code)
    101 
    102   @cached_property

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/decorators.pyc in memoizer(*args, **kwargs)
     28     key = tuple(list(args) + sorted(kwargs.items()))
     29     if key not in cache:
---> 30       cache[key] = obj(*args, **kwargs)
     31     return cache[key]
     32   return memoizer

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/tag/base.pyc in get_ner_tagger(lang)
    190 def get_ner_tagger(lang='en'):
    191   """Return a NER tagger from the models cache."""
--> 192   return NEChunker(lang=lang)

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/tag/base.pyc in __init__(self, lang)
    102       lang: language code to decide which chunker to use.
    103     """
--> 104     super(NEChunker, self).__init__(lang=lang)
    105     self.ID_TAG = NER_ID_TAG
    106 

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/tag/base.pyc in __init__(self, lang)
     38     """
     39     self.lang = lang
---> 40     self.predictor = self._load_network()
     41     self.ID_TAG = {}
     42     self.add_bias = True

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/tag/base.pyc in _load_network(self)
    109     self.embeddings = load_embeddings(self.lang, type='cw')
    110     self.embeddings.normalize_words(inplace=True)
--> 111     self.model = load_ner_model(lang=self.lang, version=2)
    112     first_layer, second_layer = self.model
    113     def predict_proba(input_):

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/decorators.pyc in memoizer(*args, **kwargs)
     28     key = tuple(list(args) + sorted(kwargs.items()))
     29     if key not in cache:
---> 30       cache[key] = obj(*args, **kwargs)
     31     return cache[key]
     32   return memoizer

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/load.pyc in load_ner_model(lang, version)
     92   """
     93   src_dir = "ner{}".format(version)
---> 94   p = locate_resource(src_dir, lang)
     95   fh = _open(p)
     96   try:

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/load.pyc in locate_resource(name, lang, filter)
     41   p = path.join(polyglot_path, task_dir, lang)
     42   if not path.isdir(p):
---> 43     if downloader.status(package_id) != downloader.INSTALLED:
     44       raise ValueError("This resource is available in the index "
     45                        "but not downloaded, yet. Try to run\n\n"

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/downloader.pyc in status(self, info_or_id, download_dir)
    735     """
    736     if download_dir is None: download_dir = self._download_dir
--> 737     info = self._info_or_id(info_or_id)
    738 
    739     # Handle collections:

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/downloader.pyc in _info_or_id(self, info_or_id)
    505   def _info_or_id(self, info_or_id):
    506     if isinstance(info_or_id, unicode):
--> 507       return self.info(info_or_id)
    508     else:
    509       return info_or_id

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/downloader.pyc in info(self, id)
    931     if id in self._packages: return self._packages[id]
    932     if id in self._collections: return self._collections[id]
--> 933     raise ValueError('Package %r not found in index' % id)
    934 
    935   def get_collection(self, lang=None, task=None):

ValueError: Package u'ner2.gl' not found in index
alantian commented 8 years ago

Could you please provide the text that triggers this issue? We have not been able to reproduce it yet.

tgalery commented 8 years ago

Sure, here is an example:

In [1]: u = u'Akihiro Gono   é um lutador japonês de MMA. Já venceu Gegard Mousasi, Hector Lombard e Hayato Sakurai.'

In [3]: from polyglot.text import Text

In [4]: output_tuples = []

In [5]: text = Text(u, hint_language_code="pt")

In [6]: for sent in text.sentences:
   ...:     output_tuples.append([((ent.start, ent.end), ent.tag, 1.) for ent in sent.entities])
   ...:     

This then generates the following stack trace:

No handlers could be found for logger "polyglot.detect.base"
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-fcec27e389bd> in <module>()
      1 for sent in text.sentences:
----> 2     output_tuples.append([((ent.start, ent.end), ent.tag, 1.) for ent in sent.entities])
      3 

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/decorators.pyc in __get__(self, obj, cls)
     18     if obj is None:
     19         return self
---> 20     value = obj.__dict__[self.func.__name__] = self.func(obj)
     21     return value
     22 

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/text.pyc in entities(self)
    130     prev_tag = u'O'
    131     chunks = []
--> 132     for i, (w, tag) in enumerate(self.ne_chunker.annotate(self.words)):
    133       if tag != prev_tag:
    134         if prev_tag == u'O':

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/decorators.pyc in __get__(self, obj, cls)
     18     if obj is None:
     19         return self
---> 20     value = obj.__dict__[self.func.__name__] = self.func(obj)
     21     return value
     22 

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/text.pyc in ne_chunker(self)
     98   @cached_property
     99   def ne_chunker(self):
--> 100     return get_ner_tagger(lang=self.language.code)
    101 
    102   @cached_property

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/decorators.pyc in memoizer(*args, **kwargs)
     28     key = tuple(list(args) + sorted(kwargs.items()))
     29     if key not in cache:
---> 30       cache[key] = obj(*args, **kwargs)
     31     return cache[key]
     32   return memoizer

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/tag/base.pyc in get_ner_tagger(lang)
    190 def get_ner_tagger(lang='en'):
    191   """Return a NER tagger from the models cache."""
--> 192   return NEChunker(lang=lang)

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/tag/base.pyc in __init__(self, lang)
    102       lang: language code to decide which chunker to use.
    103     """
--> 104     super(NEChunker, self).__init__(lang=lang)
    105     self.ID_TAG = NER_ID_TAG
    106 

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/tag/base.pyc in __init__(self, lang)
     38     """
     39     self.lang = lang
---> 40     self.predictor = self._load_network()
     41     self.ID_TAG = {}
     42     self.add_bias = True

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/tag/base.pyc in _load_network(self)
    107   def _load_network(self):
    108     """ Building the predictor out of the model."""
--> 109     self.embeddings = load_embeddings(self.lang, type='cw')
    110     self.embeddings.normalize_words(inplace=True)
    111     self.model = load_ner_model(lang=self.lang, version=2)

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/decorators.pyc in memoizer(*args, **kwargs)
     28     key = tuple(list(args) + sorted(kwargs.items()))
     29     if key not in cache:
---> 30       cache[key] = obj(*args, **kwargs)
     31     return cache[key]
     32   return memoizer

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/load.pyc in load_embeddings(lang, task, type)
     58   """
     59   src_dir = "_".join((type, task)) if type else task
---> 60   p = locate_resource(src_dir, lang)
     61   e = Embedding.load(p)
     62   if type == "cw":

/home/preceptor/miniconda2/envs/wiki/lib/python2.7/site-packages/polyglot/load.pyc in locate_resource(name, lang, filter)
     44       raise ValueError("This resource is available in the index "
     45                        "but not downloaded, yet. Try to run\n\n"
---> 46                        "polyglot download {}".format(package_id))
     47   return path.join(p, os.listdir(p)[0])
     48 

ValueError: This resource is available in the index but not downloaded, yet. Try to run

polyglot download embeddings2.da
dicleoztur commented 7 years ago

I'm getting the same error for some texts in another language, despite passing a language hint when initializing the Text object. Is there a way to suppress this language inference in the NER step?

bilalghanem commented 5 years ago

Simply follow what the error says:

In your command line, run: polyglot download embeddings2.da