nickdavidhaynes / spacy-cld

Language detection extension for spaCy 2.0+
MIT License
111 stars 9 forks source link

Doesn't work with multiple instances of spacy #10

Open beowulfenator opened 5 years ago

beowulfenator commented 5 years ago

Attaching the language detector to more than one spaCy pipeline instance in the same process fails. Example source code:

import spacy
from spacy_cld import LanguageDetector

# First pipeline: constructing the detector and adding it to the pipeline.
nlp1 = spacy.load('en_core_web_sm')
language_detector1 = LanguageDetector()
nlp1.add_pipe(language_detector1)

# Second pipeline in the same process. Per the traceback below, the error is
# raised from LanguageDetector.__init__ when it calls Doc.set_extension for an
# extension name that already exists (presumably on this second construction).
nlp2 = spacy.load('en_core_web_md')
language_detector2 = LanguageDetector()
nlp2.add_pipe(language_detector2)

Error message:

File "/home/ubuntu/test/venv/lib/python3.5/site-packages/spacy_cld/spacy_cld.py", line 30, in __init__
Doc.set_extension(self._languages, getter=get_languages)
File "doc.pyx", line 100, in spacy.tokens.doc.Doc.set_extension
ValueError: [E090] Extension 'languages' already exists on Doc. To overwrite the existing extension, set `force=True` on `Doc.set_extension`.
Eleni170 commented 5 years ago

Hello @beowulfenator ,

You can work around this by copying LanguageDetector from spacy_cld.py into your own module and passing force=True to every set_extension call in __init__, like this:

from pycld2 import detect, error as pycld_error
from spacy.tokens import Doc, Span

def get_languages(text, cld_results=None):
    """Return the detected language codes for ``text``.

    Args:
        text: Object handed to ``detect_languages`` when no results are
            supplied; unused otherwise.
        cld_results: Optional precomputed detection results, an iterable of
            4-item entries whose second item is the language code. When
            ``None``, the results are computed with ``detect_languages(text)``.

    Returns:
        A list of language codes in result order, skipping every entry whose
        code is ``'un'`` (presumably the detector's "unknown" marker).
    """
    detections = detect_languages(text) if cld_results is None else cld_results
    return [
        code
        for (_name, code, _score, _extra) in detections
        if code != 'un'
    ]

def get_scores(text, cld_results=None):
    """Return a mapping of language code to score for ``text``.

    Args:
        text: Object handed to ``detect_languages`` when no results are
            supplied; unused otherwise.
        cld_results: Optional precomputed detection results, an iterable of
            4-item entries whose second item is the language code and whose
            third item is the raw score. When ``None``, the results are
            computed with ``detect_languages(text)``.

    Returns:
        A dict mapping each language code to its raw score divided by 100
        (presumably turning a percentage into a 0-1 fraction). Entries whose
        code is ``'un'`` are skipped.
    """
    detections = detect_languages(text) if cld_results is None else cld_results
    scores = {}
    for _name, code, raw_score, _extra in detections:
        if code == 'un':
            continue
        scores[code] = raw_score / 100.
    return scores

def detect_languages(text):
    """Run pycld2 detection on ``text.text`` and return the result entries.

    Args:
        text: Any object exposing a ``.text`` string attribute (this module
            registers the getters on both ``Doc`` and ``Span``).

    Returns:
        The third element of the tuple returned by ``detect``. If ``detect``
        raises ``pycld_error``, a single placeholder entry with the language
        code ``"error"`` and a score of ``0.0`` is returned instead, so a
        failed detection does not propagate as an exception.
    """
    try:
        _, _, detections = detect(text.text)
    except pycld_error:
        # Best-effort fallback: keep the 4-item entry shape so the callers'
        # tuple unpacking still works.
        return [[None, "error", 0.0, None]]
    return detections

class LanguageDetector(object):
    """Pipeline component that attaches language-detection extensions.

    Constructing an instance registers two getter-backed extension attributes
    on both ``Doc`` and ``Span``. Registration uses ``force=True`` so that
    creating more than one detector in the same process overwrites the
    existing extensions instead of raising the "already exists" ``ValueError``.
    """

    name = 'cld'

    def __init__(self, attrs=('languages', 'language_scores')):
        """Register the extension attributes.

        Args:
            attrs: Two-item iterable giving the extension names for the
                language list and the language-score mapping, in that order.
        """
        languages_attr, scores_attr = attrs
        self._languages = languages_attr
        self._scores = scores_attr
        # Same getters on Doc first, then Span; force=True makes repeated
        # construction safe.
        for target in (Doc, Span):
            target.set_extension(languages_attr, getter=get_languages, force=True)
            target.set_extension(scores_attr, getter=get_scores, force=True)

    def __call__(self, doc):
        """Detect languages once for ``doc``, set both extensions, return it."""
        detections = detect_languages(doc)
        languages = get_languages(doc, detections)
        doc._.set(self._languages, languages)
        scores = get_scores(doc, detections)
        doc._.set(self._scores, scores)
        return doc

After that, you can use your custom spacy_cld like this:

import spacy
from spacy_cld_custom import LanguageDetector

# First pipeline with its own detector instance.
nlp1 = spacy.load('en_core_web_sm')
language_detector1 = LanguageDetector()
nlp1.add_pipe(language_detector1)

# Second pipeline: the custom LanguageDetector passes force=True to every
# set_extension call, so this second construction is intended to overwrite
# the existing extensions rather than raise.
nlp2 = spacy.load('en_core_web_md')
language_detector2 = LanguageDetector()
nlp2.add_pipe(language_detector2)