Open beowulfenator opened 5 years ago
Hello @beowulfenator ,
You can try overriding `LanguageDetector` from `spacy_cld.py` and passing `force=True` to every `set_extension` call in `__init__`, like this:
from pycld2 import detect, error as pycld_error
from spacy.tokens import Doc, Span
def get_languages(text, cld_results=None):
    """Return the language codes found in CLD detection results.

    Args:
        text: Object passed to ``detect_languages`` when ``cld_results`` is
            not supplied. It is not used otherwise.
        cld_results: Optional precomputed detection results: an iterable of
            4-item entries whose second item is the language code. When
            ``None``, the results are computed from ``text``.

    Returns:
        A list of language codes in result order, with ``'un'`` entries
        dropped (presumably CLD2's code for "unknown" -- confirm against
        the pycld2 documentation).

    Note:
        Only ``'un'`` is filtered; the ``"error"`` placeholder produced by
        ``detect_languages`` on failure is returned like any other code.
    """
    if cld_results is None:
        cld_results = detect_languages(text)
    return [lang for (_, lang, _, _) in cld_results if lang != 'un']
def get_scores(text, cld_results=None):
    """Return a mapping of language code to score scaled by 1/100.

    Args:
        text: Object passed to ``detect_languages`` when ``cld_results`` is
            not supplied. It is not used otherwise.
        cld_results: Optional precomputed detection results: an iterable of
            4-item entries whose second item is the language code and whose
            third item is the score. When ``None``, the results are computed
            from ``text``.

    Returns:
        A dict mapping each language code to ``score / 100.``, with ``'un'``
        entries dropped. If a code appears more than once, the last entry
        wins.
    """
    if cld_results is None:
        cld_results = detect_languages(text)
    return {
        lang: score / 100.
        for (_, lang, score, _) in cld_results
        if lang != 'un'
    }
def detect_languages(text):
    """Run pycld2 detection on ``text.text`` and return the result entries.

    Args:
        text: Object exposing a ``.text`` attribute holding the string to
            analyse (a spaCy ``Doc`` or ``Span`` in this module's usage).

    Returns:
        The third element of the tuple returned by ``pycld2.detect``. If
        pycld2 raises its ``error`` exception, a single placeholder entry
        ``[[None, "error", 0.0, None]]`` is returned instead.
    """
    try:
        _, _, results = detect(text.text)
    except pycld_error:
        # Best effort: swallow the detector failure and hand back a
        # placeholder shaped like a regular result entry, so callers that
        # unpack 4-item entries keep working.
        results = [[None, "error", 0.0, None]]
    return results
class LanguageDetector(object):
    """spaCy pipeline component exposing CLD language detection results.

    Instantiating the component registers two custom extension attributes
    on both ``Doc`` and ``Span``. The extensions are registered with
    ``force=True`` so that creating several detectors (for example one per
    loaded pipeline) overwrites the existing registration instead of
    failing.
    """

    name = 'cld'

    def __init__(self, attrs=('languages', 'language_scores')):
        """Register the extension attributes.

        Args:
            attrs: Pair of extension names: the first for the list of
                detected languages, the second for the language scores.
        """
        self._languages, self._scores = attrs
        # Doc and Span share the same getters; both are given a ``.text``
        # bearing object, which is all detect_languages reads.
        for target in (Doc, Span):
            target.set_extension(
                self._languages, getter=get_languages, force=True)
            target.set_extension(
                self._scores, getter=get_scores, force=True)

    def __call__(self, doc):
        """Detect languages for ``doc``, store the results, and return it.

        Args:
            doc: The document being processed by the pipeline.

        Returns:
            The same ``doc`` object.
        """
        # Detect once and reuse the results for both attributes.
        cld_results = detect_languages(doc)
        # NOTE(review): the extensions are registered with getters, so
        # reads presumably go through the getter rather than these stored
        # values -- confirm against spaCy's extension semantics.
        doc._.set(self._languages, get_languages(doc, cld_results))
        doc._.set(self._scores, get_scores(doc, cld_results))
        return doc
After that, you can use your custom version of spacy_cld (saved here as `spacy_cld_custom.py`) like this:
# Example: attach a separate LanguageDetector to two different pipelines.
import spacy
from spacy_cld_custom import LanguageDetector
# First pipeline with its own detector instance.
nlp1 = spacy.load('en_core_web_sm')
language_detector1 = LanguageDetector()
nlp1.add_pipe(language_detector1)
# Second pipeline: creating another LanguageDetector registers the same
# extensions again, which is why the custom class passes force=True.
nlp2 = spacy.load('en_core_web_md')
language_detector2 = LanguageDetector()
nlp2.add_pipe(language_detector2)
When trying to attach the language detector to multiple instances of spaCy, it fails. Example source code:
Error message: