explosion / spaCy

💫 Industrial-strength Natural Language Processing (NLP) in Python
https://spacy.io
MIT License

ner training warning after spacy-lookups-data loaded #5789

Closed pythonBerg closed 4 years ago

pythonBerg commented 4 years ago

I am retraining old models last trained in spaCy 2.1. When attempting to train against a blank English model, I received this warning: UserWarning: [W033] Training a new parser or NER using a model with no lexeme normalization table. This may degrade the performance of the model to some degree. If this is intentional or the language you're using doesn't have a normalization table, please ignore this warning. If this is surprising, make sure you have the spacy-lookups-data package installed. The languages with lexeme normalization tables are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.

I am guessing this is related to the decoupling of the models from the vocab and other lookup data. I ran pip install spacy[lookups] and now see the package in pip list, but I continue to get this warning during training, and I can find nothing in the documentation that describes any additional import or load step.
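A sanity check I can run (a minimal sketch, assuming spaCy 2.3.x with spacy-lookups-data installed into the same interpreter that runs training):

import spacy
import spacy_lookups_data  # fails fast if the data package is missing here

print(spacy_lookups_data.__file__)  # shows which environment the data installed into

nlp = spacy.blank('en')  # v2.3 attaches the lookup tables via entry points
print(nlp.vocab.lookups.tables)  # 'lexeme_norm' should appear in this list
print(nlp.vocab.lookups.has_table('lexeme_norm'))

If has_table comes back False, the W033 warning above is expected, since NER training looks for exactly that table.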

Please help me understand what I am missing.

Windows 10, spaCy 2.3.4


from __future__ import unicode_literals, print_function
import plac
import random
import sys
import os
from pathlib import Path
import thinc.extra.datasets
import psycopg2
import spacy
from spacy.lang.en import English
from spacy.util import minibatch, compounding
import re
import regex
from collections import defaultdict, OrderedDict
import time
import warnings
spacy.prefer_gpu()

@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    provision_type=("Optional output directory", "option", "p", str),
    version=("Spacy version", "option", "v", str),
    n_texts=("Number of texts to train from", "option", "t", int),
    n_iter=("Number of training iterations", "option", "n", int))

def main(model='devModel', n_iter=30, n_texts=6000, provision_type='Lease', version='v23'):
    output_dir = os.path.join('C:/home/compscre/training/', version, 'models', provision_type, model)
    # output_dir='C:/home/compscre/training/v23/models/'+provision_type.lower()+'/'+model

    try:
        nlp = spacy.load(output_dir)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    except OSError:  # no saved model at output_dir yet, start from scratch
        nlp = spacy.blank('en')  # create blank Language class

        # also split tokens on parentheses
        infixes = nlp.Defaults.infixes + (r'[()]',)
        infix_regex = spacy.util.compile_infix_regex(infixes)
        nlp.tokenizer.infix_finditer = infix_regex.finditer

        # treat a bare period as a suffix
        suffixes = nlp.Defaults.suffixes + (r'\.',)
        suffix_regex = spacy.util.compile_suffix_regex(suffixes)
        nlp.tokenizer.suffix_search = suffix_regex.search

        print("Created blank 'en' model")

    usecat = 'ner_tokens'  # note: unused in this snippet
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    print("Loading NER Training Data")
    # load_ner_data() is defined elsewhere in this script; it returns the
    # label list and the training examples for the given provision type
    lst, TRAIN_DATA = load_ner_data(provision_type)

    for lab in lst:
        ner.add_label(lab)

    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    disabled = nlp.disable_pipes(*other_pipes)
    with warnings.catch_warnings():
        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        optimizer = nlp.begin_training()
        for itn in range(n_iter + 30):  # note: runs 30 iterations beyond the n_iter flag
            sTime = time.ctime()
            print("running iteration", itn)
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(2.0, 12.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                try:
                    nlp.update(
                        texts,  # batch of texts
                        annotations,  # batch of annotations
                        sgd=optimizer,
                        drop=0.45,  # dropout - make it harder to memorise data
                        losses=losses)
                except Exception as e:
                    print("Training failed for doc", texts[0][:100], "-", e)
            eTime = time.ctime()
            print(itn, sTime, eTime, losses)
            if output_dir is not None:
                output_dir = Path(output_dir)
                if not output_dir.exists():
                    output_dir.mkdir(parents=True)
                # re-enable the other pipes before saving so the full pipeline
                # is written to disk, then disable them again so the next
                # iteration still only trains the NER
                disabled.restore()
                nlp.to_disk(output_dir)
                disabled = nlp.disable_pipes(*other_pipes)
                print("Saved model to", output_dir)


if __name__ == '__main__':
    plac.call(main)
pythonBerg commented 4 years ago

Closed. Clearing out buffers and restarting with a clean environment seems to have resolved it.
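For the record, the usual culprit behind W033 after a seemingly successful install is pip putting spacy-lookups-data into a different interpreter than the one running the training script. A quick check (plain Python, nothing spaCy-specific):

import sys
print(sys.executable)  # the interpreter actually running the training script
# installing against exactly this interpreter rules out environment mix-ups:
#   "<that path>" -m pip install spacy[lookups]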

github-actions[bot] commented 2 years ago

This thread has been automatically locked since there has not been any recent activity after it was closed. Please open a new issue for related bugs.