explosion / spaCy

💫 Industrial-strength Natural Language Processing (NLP) in Python
https://spacy.io

Lose all the previous entities while updating my model with new entities. #3677

Closed AshishRanjan-004 closed 5 years ago

AshishRanjan-004 commented 5 years ago

I followed all the instructions in the documentation to update my existing model, but after the update all the previous entities vanish and only the new entities are recognized by the model. Here is the code with output.

1. First, the code to create and save the initial model:

```python
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]

@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def trainModel(model=None, output_dir=None, n_iter=100):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    print("training complete")

    # save model to output directory
    if output_dir is None:
        output_dir = Path("C:\\Users\\Ashish\\Desktop\\model")
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    for text, _ in TRAIN_DATA:
        doc = nlp2(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


trainModel()
```

Output:

```
Entities [('Shaka Khan', 'PERSON')]
Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3), ('Khan', 'PERSON', 1), ('?', '', 2)]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
```

2. Second, the code to update the saved model with a new entity type:

```python
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

LABEL = "ANIMAL"

TRAIN_DATA = [
    ("Horses are too tall and they pretend to care about your feelings", {"entities": [(0, 6, LABEL)]}),
    ("Do they bite?", {"entities": []}),
    ("horses are too tall and they pretend to care about your feelings", {"entities": [(0, 6, LABEL)]}),
    ("horses pretend to care about your feelings", {"entities": [(0, 6, LABEL)]}),
    ("they pretend to care about your feelings, those horses", {"entities": [(48, 54, LABEL)]}),
    ("horses?", {"entities": [(0, 6, LABEL)]}),
]

@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int),
)
def main(model="C:\\Users\\Ashish\\Desktop\\model", new_model_name="animal", output_dir=None, n_iter=70):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    # Adding extraneous labels shouldn't mess anything up
    ner.add_label("VEGETABLE")
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "who is shaka khan? this is my horse"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)


main()
```

Output:

```
Entities in 'who is shaka khan? this is my horse'
ANIMAL horse
```

So, in the code above, the updated model recognizes "horse" as ANIMAL but no longer recognizes "shaka khan" as a PERSON.

How can I avoid this situation and keep all the entities?

ENVIRONMENT:

THANKS

ines commented 5 years ago

It sounds like what you're experiencing is the "catastrophic forgetting" problem – as your model learns about the new entities, it's "forgetting" what it had previously learned.

Here are some resources and threads with more information and possible solutions:
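The most common workaround described there is to mix "revision" examples that still carry the labels the model already knows (PERSON, LOC) into the new ANIMAL training data, so the weights keep rehearsing the old entities while learning the new one. Below is a minimal sketch of that idea, reusing the spaCy v2 training API and the saved model path from the scripts above; the handful of revision sentences and the iteration count are just placeholders, not a complete recipe.

```python
import random

import spacy
from spacy.util import minibatch, compounding

# Load the model that was saved by the first script
nlp = spacy.load("C:\\Users\\Ashish\\Desktop\\model")
ner = nlp.get_pipe("ner")
ner.add_label("ANIMAL")

# New examples for the new label, mixed with "revision" examples that
# still use the previously learned labels (PERSON, LOC) so the model
# keeps seeing them during the update
TRAIN_DATA = [
    ("horses pretend to care about your feelings", {"entities": [(0, 6, "ANIMAL")]}),
    ("Do they bite?", {"entities": []}),
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]

optimizer = nlp.resume_training()
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(30):  # placeholder iteration count
        random.shuffle(TRAIN_DATA)
        losses = {}
        for batch in minibatch(TRAIN_DATA, size=compounding(1.0, 4.0, 1.001)):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        print("Losses", losses)
```

With only a couple of revision sentences this won't fully prevent forgetting; the usual advice in those threads is to include a larger set of examples for the old labels, for instance texts annotated by running the original model over raw data, alongside the new ANIMAL examples.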

AshishRanjan-004 commented 5 years ago

Thanks for the quick reply. But don't you think this should be fixed?

lock[bot] commented 5 years ago

This thread has been automatically locked since there has not been any recent activity after it was closed. Please open a new issue for related bugs.