Open Ejigsonpeter opened 5 years ago
I have fixed the error and will raise a PR for it. Please check.
I was unable to create a PR because of an access issue, but here is the modified train.py.
Modifications made to the original code:
- Added a `trim_entity_spans` function.
- Added `import re` at the top of the file.
- In the `train_spacy` method, added the following line:
  `TRAIN_DATA = trim_entity_spans(TRAIN_DATA)`
How to run the code:
1. Download `train.py` and change the data paths to your local paths.
2. Install the dependency: `pip3 install sklearn`
3. Run: `python3 train.py`
(Note: you should have Python 3 installed; otherwise use `python` instead of `python3`.)
############################################ NOTE ########################################################
#
# Creates NER training data in Spacy format from JSON downloaded from Dataturks.
#
# Outputs the Spacy training data which can be used for Spacy training.
#
############################################################################################################
import json
import random
import logging
import re
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export to spaCy training format.

    The export is a text file with one JSON object per line; each object
    has a 'content' string and an 'annotation' list (or null when the
    document was left unannotated).

    Args:
        dataturks_JSON_FilePath (str): Path to the Dataturks JSON export.

    Returns:
        list | None: A list of ``(text, {"entities": [(start, end, label), ...]})``
        tuples, or None if the file could not be processed (the error is
        logged rather than raised, matching the original contract).
    """
    try:
        training_data = []
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()
        for line in lines:
            line = line.strip()
            if not line:
                # Skip blank lines instead of crashing json.loads on them.
                continue
            data = json.loads(line)
            text = data['content']
            entities = []
            # Dataturks exports 'annotation': null for unannotated documents;
            # 'or []' keeps those documents instead of raising a TypeError.
            for annotation in data['annotation'] or []:
                # Only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # Handle both a list of labels and a single label.
                if not isinstance(labels, list):
                    labels = [labels]
                for label in labels:
                    # Dataturks indices are both inclusive [start, end],
                    # but spaCy uses half-open [start, end).
                    entities.append((point['start'], point['end'] + 1, label))
            training_data.append((text, {"entities": entities}))
        return training_data
    except Exception:
        # Lazy %s formatting; logging.exception records the traceback.
        logging.exception("Unable to process %s", dataturks_JSON_FilePath)
        return None
import spacy
################### Train Spacy NER.###########
def trim_entity_spans(data: list) -> list:
    """Strip leading and trailing whitespace from entity spans.

    Args:
        data (list): Training examples in spaCy JSON format, i.e. a list of
            ``(text, {'entities': [(start, end, label), ...]})`` pairs.

    Returns:
        list: The same examples with each span tightened so that it neither
        starts nor ends on a whitespace character.
    """
    whitespace = re.compile(r'\s')
    cleaned = []
    for text, annotations in data:
        trimmed = []
        for start, end, label in annotations['entities']:
            # Advance past leading whitespace (bounded by the text length).
            while start < len(text) and whitespace.match(text[start]):
                start += 1
            # Retreat past trailing whitespace.
            while end > 1 and whitespace.match(text[end - 1]):
                end -= 1
            trimmed.append([start, end, label])
        cleaned.append([text, {'entities': trimmed}])
    return cleaned
def train_spacy(train_path="/home/abhishekn/dataturks/entityrecognition/traindata.json",
                test_path="/home/abhishekn/dataturks/entityrecognition/testdata.json",
                n_iter=10):
    """Train a blank-English spaCy NER model on a Dataturks export and
    evaluate it on a held-out test file.

    Requires spaCy 2.x: uses ``spacy.gold.GoldParse`` and the
    list-of-texts ``nlp.update`` signature, both removed in spaCy 3.

    Args:
        train_path (str): Dataturks JSON training file (was hard-coded).
        test_path (str): Dataturks JSON test file (was hard-coded).
        n_iter (int): Number of training passes over the data.

    Side effects:
        Writes one ``resume<i>.txt`` file per test document containing the
        predicted entities, and prints per-entity metrics to stdout.
    """
    TRAIN_DATA = convert_dataturks_to_spacy(train_path)
    TRAIN_DATA = trim_entity_spans(TRAIN_DATA)
    nlp = spacy.blank('en')  # create blank Language class
    # Create the built-in NER pipe and add it if not already present.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # Register every entity label seen in the training data.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    # Disable every other pipe so only NER weights are updated.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],          # batch of texts
                    [annotations],   # batch of annotations
                    drop=0.2,        # dropout - make it harder to memorise data
                    sgd=optimizer,   # callable to update weights
                    losses=losses)
            print(losses)

    # ---- Test the model and evaluate it ------------------------------
    examples = convert_dataturks_to_spacy(test_path)
    d = {}
    c = 0
    for text, annot in examples:
        doc_to_test = nlp(text)
        # Group the predicted entity texts by label.
        predictions = {}
        for ent in doc_to_test.ents:
            predictions.setdefault(ent.label_, []).append(ent.text)
        # 'with' guarantees the per-document file is closed; the original
        # leaked the handle (it was even clobbered by the metric unpacking
        # below, so it could never be closed).
        with open("resume" + str(c) + ".txt", "w") as out_file:
            for label in set(predictions.keys()):
                out_file.write("\n\n")
                out_file.write(label + ":" + "\n")
                for value in set(predictions[label]):
                    out_file.write(value.replace('\n', '') + "\n")
        # Per-label metric accumulators:
        # [seen_flag, precision, recall, f-score, accuracy, count]
        d = {}
        for ent in doc_to_test.ents:
            d[ent.label_] = [0, 0, 0, 0, 0, 0]
        for ent in doc_to_test.ents:
            doc_gold_text = nlp.make_doc(text)
            gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
            # Binarise token labels into '<label>' vs 'Not <label>'.
            y_true = [ent.label_ if ent.label_ in x else 'Not ' + ent.label_
                      for x in gold.ner]
            y_pred = [x.ent_type_ if x.ent_type_ == ent.label_
                      else 'Not ' + ent.label_ for x in doc_to_test]
            if d[ent.label_][0] == 0:
                # 'f1' instead of 'f' so the unpacking cannot shadow a
                # file handle (the original's bug).
                (p, r, f1, s) = precision_recall_fscore_support(
                    y_true, y_pred, average='weighted')
                a = accuracy_score(y_true, y_pred)
                d[ent.label_][0] = 1
                d[ent.label_][1] += p
                d[ent.label_][2] += r
                d[ent.label_][3] += f1
                d[ent.label_][4] += a
                d[ent.label_][5] += 1
        c += 1
    # NOTE(review): 'd' is reset inside the loop above, so these figures
    # describe only the LAST test document — behavior kept as in the
    # original; accumulate across documents if overall metrics are wanted.
    for i in d:
        print("\n For Entity " + i + "\n")
        print("Accuracy : " + str((d[i][4] / d[i][5]) * 100) + "%")
        print("Precision : " + str(d[i][1] / d[i][5]))
        print("Recall : " + str(d[i][2] / d[i][5]))
        print("F-score : " + str(d[i][3] / d[i][5]))
# Kick off training and evaluation when the module is loaded.
# NOTE(review): this runs on import too — consider wrapping it in an
# `if __name__ == "__main__":` guard.
train_spacy()
from spacy.gold import GoldParse
ModuleNotFoundError: No module named 'spacy.gold'
Please, can someone paste working code here or tell me which version of spaCy they are using? (Note: `spacy.gold` was removed in spaCy v3, so this code requires spaCy 2.x.)