Closed oscarvalenzuelab closed 6 months ago
It needs more training data, for example:
import random

import spacy
from spacy.training import Example

# Load a pre-existing spaCy model
nlp = spacy.load("en_core_web_sm")

# Get the ner pipeline component
ner = nlp.get_pipe("ner")

# Prepare training data.
# Entity offsets are character positions (start inclusive, end exclusive)
# and must fall on token boundaries; here text[15:34] is
# "The Grin Developers". An offset that starts in the middle of a token
# cannot be aligned to the tokenization.
train_data = [
    ("Copyright 2018 The Grin Developers", {"entities": [(15, 34, "ORG")]}),
    # Add more examples
]

# Add labels to the 'ner'
for _, annotations in train_data:
    # Default to an empty tuple so an example without entities is skipped
    # instead of raising a TypeError on None.
    for ent in annotations.get("entities", ()):
        ner.add_label(ent[2])

# Disable other pipelines during training so only 'ner' is updated
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):
    optimizer = nlp.create_optimizer()
    for itn in range(10):
        # Reshuffle every pass so the update order differs between passes
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)
        # Report the losses accumulated over this pass
        print(losses)

# Test the updated model (all pipeline components are enabled again here)
test_text = "Copyright 2018 The Grin Developers"
doc = nlp(test_text)
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
And it is used like this:
import re

# Matches a notice of the form "Copyright <4-digit year> <holder>".
# Group 1 is the year; group 2 is everything after the year up to the end
# of the line ('.' does not match a newline).
pattern = re.compile(r'Copyright (\d{4}) (.+)')

text = "Copyright 2018 The Grin Developers"
match = pattern.search(text)
# search() returns None when the text contains no such notice, so only
# unpack the groups on a successful match.
if match:
    year, entity = match.groups()
    print("Year:", year, "Entity:", entity)
Will move out the release.
Use spaCy and NLTK for "entity extraction".