Monikasinghjmi opened this issue 4 years ago
Hi, which dataset and which version of the code did you use? Please let me know the commit number (hash value) and I'll check them.
When I tried it on the NCBI-disease dataset it worked fine. Now I am using the Kaggle COVID-19 dataset of research papers: https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge/tasks?taskId=570.
It must be related to the pre-processing step.
In my experience, some Unicode space characters can cause this problem in the tokenization step.
I used more complex code (modified from here) for this step, but I think the normalize function from the unicodedata package can be a good option.
from unicodedata import normalize
Normalizing all the original text will help.
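For example, here is a minimal normalization sketch (the NFKC form and the extra whitespace replacement are my own choices here, not necessarily what was used for the original datasets):

from unicodedata import normalize

def clean_text(text):
    # NFKC folds many Unicode variants (e.g. the no-break space U+00A0 and
    # full-width characters) into their plain ASCII-compatible forms.
    text = normalize('NFKC', text)
    # Replace any remaining exotic whitespace with a plain space.
    return ''.join(' ' if ch.isspace() else ch for ch in text)

print(clean_text('BRCA1\u00a0mutation'))  # -> 'BRCA1 mutation'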
Also, you need to check 1) the maximum length of a word and 2) the handling of special characters such as '(', ')', '-', '_', etc. I used the code from here (check out the input_form function).
The code is from my colleagues and needs to be refactored a bit more.
Thanks :)
These special characters are in the train.tsv of each dataset, so how can they be causing an issue?
Sorry, I meant the spacing near special characters. We added a space before and after some special characters. Ex) (BRCA1) -> ( BRCA1 )
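Just as an illustration (this regex is my own sketch, not the co-authors' preprocessing code), the padding can be done like this:

import re

def pad_special_chars(text):
    # Put a space before and after every non-alphanumeric, non-space character,
    # so that e.g. "(BRCA1)" becomes "( BRCA1 )".
    text = re.sub(r'([^0-9a-zA-Z\s])', r' \1 ', text)
    # Collapse the double spaces introduced by the substitution.
    return re.sub(r'\s+', ' ', text).strip()

print(pad_special_chars('(BRCA1)'))  # -> '( BRCA1 )'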
I have used word_tokenize from nltk.tokenize to make the test.tsv file for the Kaggle tokens. I have understood what you explained. Can you direct me to tokenization code that is compatible with this model?
Hi, the pre-processing of the datasets was mostly done by other co-authors. I tried nltk for another project of mine, but it seems that nltk is not compatible with the BERT tokenizer (especially near special characters). So I took the tokenizer code from this repository by the co-authors and modified it for my own use (see the end of this comment for the modified code).
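To show the kind of mismatch I mean (this comparison is just my own illustration; the exact nltk output can vary by version):

import re
from nltk.tokenize import word_tokenize

bert_like = re.compile(r'([0-9a-zA-Z]+|[^0-9a-zA-Z])')
text = 'The "BRCA1-mutated" sample'

print(word_tokenize(text))
# typically ['The', '``', 'BRCA1-mutated', "''", 'sample'] -- quotes rewritten, hyphen kept
print([t for t in bert_like.split(text) if t and t != ' '])
# ['The', '"', 'BRCA1', '-', 'mutated', '"', 'sample'] -- punctuation split, closer to BERT's basic tokenizer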
The following is an example usage of the code.
from ops import json_to_sent, input_form
data = [{
    "pmid": "123",
    # "title": "I want coffee",  # not necessary
    "abstract": "This is a dummy data to learn how these codes (ner.ops - json_to_sent and input form) are working. Thanks."
}]
sentData = json_to_sent(data, is_raw_text=True) # set is_raw_text=True if you do not use "title"
sentData
This will split the input sequence into multiple sentences.
{'123': {'sentence': ['This is a dummy data to learn how these codes (ner.ops - json_to_sent and input form) are working.', ' Thanks.']}}
Use input_form to tokenize the sentences.
MAX_CHARS_WORD = 22
for key, values in input_form(sentData, max_input_chars_per_word = MAX_CHARS_WORD)["123"].items():
print(key+": ", values)
sentence:  ['This is a dummy data to learn how these codes (ner.ops - json_to_sent and input form) are working.', ' Thanks.']
words:  [['This', 'is', 'a', 'dummy', 'data', 'to', 'learn', 'how', 'these', 'codes', '(', 'ner', '.', 'ops', '-', 'json', '_', 'to', '_', 'sent', 'and', 'input', 'form', ')', 'are', 'working', '.'], ['Thanks', '.']]
wordPos:  [[(0, 3), (5, 6), (8, 8), (10, 14), (16, 19), (21, 22), (24, 28), (30, 32), (34, 38), (40, 44), (46, 46), (47, 49), (50, 50), (51, 53), (55, 55), (57, 60), (61, 61), (62, 63), (64, 64), (65, 68), (70, 72), (74, 78), (80, 83), (84, 84), (86, 88), (90, 96), (97, 97)], [(99, 104), (105, 105)]]
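A small sanity check I added (not part of the original example): with is_raw_text=True the sentences concatenate back to the abstract, so each (start, end) pair in wordPos is a character offset into the original abstract and can be used to recover the token:

result = input_form(sentData, max_input_chars_per_word=MAX_CHARS_WORD)["123"]
abstract = data[0]["abstract"]
for sent_words, sent_pos in zip(result["words"], result["wordPos"]):
    for word, (start, end) in zip(sent_words, sent_pos):
        # the slice equals the token (or its untruncated form if it was cut off)
        assert abstract[start:end + 1].startswith(word)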
The code (ops.py):
#
# Original code from https://github.com/dmis-lab/bern/blob/master/biobert_ner/ops.py
# Modified by Wonjin Yoon (wonjin.info) for BioBERT SeqTag task
#
import numpy as np
import re
tokenize_regex = re.compile(r'([0-9a-zA-Z]+|[^0-9a-zA-Z])')
def json_to_sent(data, is_raw_text=False):
    '''data: list of json file [{pmid,abstract,title}, ...] '''
    out = dict()
    for paper in data:
        sentences = list()
        if is_raw_text:
            # assure that paper['abstract'] is not empty
            abst = sentence_split(paper['abstract'])
            if len(abst) != 1 or len(abst[0].strip()) > 0:
                sentences.extend(abst)
        else:
            # assure that paper['title'] is not empty
            if len(CoNLL_tokenizer(paper['title'])) < 50:
                title = [paper['title']]
            else:
                title = sentence_split(paper['title'])
            if len(title) != 1 or len(title[0].strip()) > 0:
                sentences.extend(title)

            if len(paper['abstract']) > 0:
                abst = sentence_split(' ' + paper['abstract'])
                if len(abst) != 1 or len(abst[0].strip()) > 0:
                    sentences.extend(abst)

        out[paper['pmid']] = dict()
        out[paper['pmid']]['sentence'] = sentences
    return out


def input_form(sent_data, max_input_chars_per_word=20):
    '''sent_data: dict of sentence, key=pmid {pmid:[sent,sent, ...], pmid: ...}'''
    for pmid in sent_data:
        sent_data[pmid]['words'] = list()
        sent_data[pmid]['wordPos'] = list()
        doc_piv = 0
        for sent in sent_data[pmid]['sentence']:
            wids = list()
            wpos = list()
            sent_piv = 0
            tok = CoNLL_tokenizer(sent)
            for w in tok:
                if len(w) > max_input_chars_per_word:  # was 20
                    wids.append(w[:max_input_chars_per_word])  # was 10
                else:
                    wids.append(w)
                start = doc_piv + sent_piv + sent[sent_piv:].find(w)
                end = start + len(w) - 1
                sent_piv = end - doc_piv + 1
                wpos.append((start, end))
            doc_piv += len(sent)
            sent_data[pmid]['words'].append(wids)
            sent_data[pmid]['wordPos'].append(wpos)
    return sent_data


def isInt(string):
    try:
        int(string)
        return True
    except ValueError:
        return False


def isFloat(string):
    try:
        float(string)
        return True
    except ValueError:
        return False


def softmax(logits):
    out = list()
    for logit in logits:
        temp = np.subtract(logit, np.max(logit))
        p = np.exp(temp) / np.sum(np.exp(temp))
        out.append(np.max(p))
    return out


def CoNLL_tokenizer(text):
    rawTok = [t for t in tokenize_regex.split(text) if t]
    assert ''.join(rawTok) == text
    tok = [t for t in rawTok if t != ' ']
    return tok


def sentence_split(text):
    sentences = list()
    sent = ''
    piv = 0
    for idx, char in enumerate(text):
        if char in "?!":
            if idx > len(text) - 3:
                sent = text[piv:]
                piv = -1
            else:
                sent = text[piv:idx + 1]
                piv = idx + 1
        elif char == '.':
            if idx > len(text) - 3:
                sent = text[piv:]
                piv = -1
            elif (text[idx + 1] == ' ') and (
                    text[idx + 2] in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ-"' + "'"):
                sent = text[piv:idx + 1]
                piv = idx + 1

        if sent != '':
            toks = CoNLL_tokenizer(sent)
            if len(toks) > 100:
                while True:
                    rawTok = [t for t in tokenize_regex.split(sent) if t]
                    cut = ''.join(rawTok[:200])
                    sent = ''.join(rawTok[200:])
                    sentences.append(cut)

                    if len(CoNLL_tokenizer(sent)) < 100:
                        if sent.strip() == '':
                            sent = ''
                            break
                        else:
                            sentences.append(sent)
                            sent = ''
                            break
            else:
                sentences.append(sent)
                sent = ''

            if piv == -1:
                break

    if piv != -1:
        sent = text[piv:]
        toks = CoNLL_tokenizer(sent)
        if len(toks) > 100:
            while True:
                rawTok = [t for t in tokenize_regex.split(sent) if t]
                cut = ''.join(rawTok[:200])
                sent = ''.join(rawTok[200:])
                sentences.append(cut)

                if len(CoNLL_tokenizer(sent)) < 100:
                    if sent.strip() == '':
                        sent = ''
                        break
                    else:
                        sentences.append(sent)
                        sent = ''
                        break
        else:
            sentences.append(sent)
            sent = ''

    return sentences
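For completeness, here is roughly how I would turn the input_form output into a test.tsv for the NER model. I am assuming the one-token-per-line, tab-separated "token<TAB>label" layout with a blank line between sentences, which is how the BioBERT NER datasets (e.g. NCBI-disease) are formatted; please double-check it against the files you trained on.

def write_test_tsv(sent_data, pmid, path):
    # One "token<TAB>label" line per word, blank line between sentences.
    # 'O' is only a placeholder label for prediction-time input.
    with open(path, 'w', encoding='utf-8') as f:
        for sent_words in sent_data[pmid]['words']:
            for word in sent_words:
                f.write(word + '\tO\n')
            f.write('\n')

write_test_tsv(input_form(sentData), '123', 'test.tsv')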
I am trying to run ner_detokenize.py and I am getting this error. Can anyone help me with this?
437923 485173
Error! : len(ans['labels']) != len(bert_pred['labels']) : Please report us
Traceback (most recent call last):
  File "biocodes/ner_detokenize.py", line 88, in <module>
    detokenize(args.answer_path, args.token_test_path, args.label_test_path, args.output_dir)
  File "biocodes/ner_detokenize.py", line 77, in detokenize
    raise
RuntimeError: No active exception to reraise