Open harishr1308 opened 5 years ago
```
ERROR:root:Unable to process C:/Users/NISHIT/Desktop/Entity-Recognition-In-Resumes-SpaCy-master/traindata.json error = 'charmap' codec can't decode byte 0x9d in position 6983: character maps to <undefined>
Traceback (most recent call last):
  File "", line 6, in convert_dataturks_to_spacy
    lines = f.readlines()
  File "C:\ProgramData\Anaconda3\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 6983: character maps to <undefined>

Traceback (most recent call last):
  File "", line 85, in <module>
    train_spacy()
  File "", line 11, in train_spacy
    for _, annotations in TRAIN_DATA:
TypeError: 'NoneType' object is not iterable
```
Has anyone found a solution to this?
This is just an encoding error. Try passing an explicit encoding (`encoding="utf-8"`) when opening the file. Worked for me.
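For illustration, a minimal sketch of that fix (the filename here is a placeholder): pass `encoding="utf-8"` to `open()` so Python doesn't fall back to the Windows default cp1252 codec, which cannot decode byte 0x9d.

```python
# Minimal sketch of the encoding fix; "traindata.json" is a placeholder path.
# Forcing UTF-8 avoids the cp1252 'charmap' UnicodeDecodeError on Windows,
# where open() defaults to the locale encoding instead of UTF-8.
with open("traindata.json", "r", encoding="utf-8") as f:
    lines = f.readlines()
```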
Hmm, now I have this error:

```
Warning: Unnamed vectors -- this won't allow multiple vectors models to be loaded. (Shape: (0, 0))
Starting iteration 0
Process finished with exit code -1073741819 (0xC0000005)
```
Just copy-paste the following code. Let me know if it helps.

Modifications made to the original code:
1. Added a `trim_entity_spans` method.
2. Added `import re` at the top.
3. In the `train_spacy` method, added the line `TRAIN_DATA = trim_entity_spans(TRAIN_DATA)`.
```python
import json
import random
import logging
import re

import spacy
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from spacy.gold import GoldParse
from spacy.scorer import Scorer


def convert_dataturks_to_spacy(dataturks_JSON_FilePath, encoding="utf-8"):
    try:
        training_data = []
        lines = []
        with open(dataturks_JSON_FilePath, 'r', encoding=encoding) as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                # only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]
                for label in labels:
                    # dataturks indices are both inclusive [start, end],
                    # but spacy is not: [start, end)
                    entities.append((point['start'], point['end'] + 1, label))

            training_data.append((text, {"entities": entities}))

        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath +
                          "\n" + "error = " + str(e))
        return None


################### Train Spacy NER ###########

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            # shrink the span until it neither starts nor ends on whitespace
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data


def train_spacy():
    TRAIN_DATA = convert_dataturks_to_spacy(
        "Your path where the traindata.json file is located/traindata.json",
        encoding="utf-8")
    TRAIN_DATA = trim_entity_spans(TRAIN_DATA)
    nlp = spacy.blank('en')  # create blank Language class

    # create the built-in pipeline components and add them to the pipeline;
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],         # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,       # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    # test the model and evaluate it
    examples = convert_dataturks_to_spacy(
        "/home/abhishekn/dataturks/entityrecognition/testdata.json",
        encoding="utf-8")
    c = 0
    for text, annot in examples:
        doc_to_test = nlp(text)

        # dump the predicted entities for this document to a text file
        d = {}
        for ent in doc_to_test.ents:
            d[ent.label_] = []
        for ent in doc_to_test.ents:
            d[ent.label_].append(ent.text)
        with open("resume" + str(c) + ".txt", "w") as out:
            for i in set(d.keys()):
                out.write("\n\n")
                out.write(i + ":" + "\n")
                for j in set(d[i]):
                    out.write(j.replace('\n', '') + "\n")

        # score each entity label once per document
        d = {}
        for ent in doc_to_test.ents:
            d[ent.label_] = [0, 0, 0, 0, 0, 0]
        for ent in doc_to_test.ents:
            doc_gold_text = nlp.make_doc(text)
            gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
            y_true = [ent.label_ if ent.label_ in x else 'Not ' + ent.label_
                      for x in gold.ner]
            y_pred = [x.ent_type_ if x.ent_type_ == ent.label_
                      else 'Not ' + ent.label_ for x in doc_to_test]
            if d[ent.label_][0] == 0:
                (p, r, f, s) = precision_recall_fscore_support(
                    y_true, y_pred, average='weighted')
                a = accuracy_score(y_true, y_pred)
                d[ent.label_][0] = 1
                d[ent.label_][1] += p
                d[ent.label_][2] += r
                d[ent.label_][3] += f
                d[ent.label_][4] += a
                d[ent.label_][5] += 1
        c += 1

    # report averaged scores (for the last document processed)
    for i in d:
        print("\n For Entity " + i + "\n")
        print("Accuracy : " + str((d[i][4] / d[i][5]) * 100) + "%")
        print("Precision : " + str(d[i][1] / d[i][5]))
        print("Recall : " + str(d[i][2] / d[i][5]))
        print("F-score : " + str(d[i][3] / d[i][5]))


train_spacy()
```
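Note that the script above never saves the trained pipeline, so every run retrains from scratch. A small sketch of persisting and reloading it with spaCy v2's standard serialization; the directory name `resume_ner_model` is arbitrary, not from the original script:

```python
# Hypothetical follow-up, not part of the original script. Assumes this runs
# at the end of train_spacy(), where nlp is still in scope.
nlp.to_disk("resume_ner_model")  # "resume_ner_model" is an arbitrary name

# Later, in a fresh process, reload the saved pipeline for inference:
import spacy
nlp2 = spacy.load("resume_ner_model")
doc = nlp2("John Doe, Software Engineer, 5 years of Python experience")
print([(ent.text, ent.label_) for ent in doc.ents])
```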
When I use the trim_entity_spans method I get a blank [] value for TRAIN_DATA. Can you post the trim_entity_spans method with the proper indentation?
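For reference, here is the `trim_entity_spans` method from the comment above with its indentation restored; the logic is unchanged:

```python
import re


def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans."""
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            # move the boundaries inward past any whitespace characters
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data
```

The method appends one cleaned example per input example, so an empty `[]` result means the input was already empty before the call; it's worth checking what `convert_dataturks_to_spacy` actually returned.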