Here is some code I used to process my own data. Note that my data was in a DataFrame, with each utterance as a row.
import spacy
import gzip
import pandas as pd
import json

# spaCy v2.x API: nlp.tagger and nlp.entity expose the tag set and NER transitions
nlp = spacy.load('en')

# Map each POS tag / NER move name to an integer id; index 0 is reserved for ''
POS = {w: i for i, w in enumerate([''] + list(nlp.tagger.labels))}
ENT = {w: i for i, w in enumerate([''] + nlp.entity.move_names)}

def extract(text):
    doc = nlp(text)
    tokens = []
    pos_ids = []
    ent_ids = []
    for token in doc:
        tokens.append(str(token))
        pos_ids.append(POS[token.tag_])
        if token.ent_type_ != '':
            # Token is inside an entity: look up the IOB-prefixed move name, e.g. 'B-PERSON'
            move_name = str(token.ent_iob_) + '-' + str(token.ent_type_)
            ent_id = ENT[move_name]
        elif token.ent_iob_ == 'O':
            # Token is outside any entity
            ent_id = ENT[token.ent_iob_]
        else:
            raise ValueError('Invalid token')
        ent_ids.append(ent_id)
    return {
        'utt': {
            'word': tokens,
            'pos_id': pos_ids,
            'ent_id': ent_ids
        }
    }
df = pd.read_csv('transcript.csv')

def get_json_element(row):
    line = row['line']
    speaker = row['speaker']
    extracts = extract(line)
    extracts['speaker'] = speaker
    extracts['role'] = speaker
    return extracts

# Build one utterance dict per DataFrame row
utts = df.apply(get_json_element, axis=1).tolist()

final = {
    'id': 'some id',
    'meeting': utts,
    'summary': []
}

# Write the record as gzipped JSON
fn = 'output_path/split_0.jsonl.gz'
with gzip.open(fn, 'wt', encoding='UTF-8') as zipfile:
    json.dump(final, zipfile)
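Not part of the original script, but here is a minimal sketch for sanity-checking the output file; it only assumes the key names ('meeting', 'summary') used in the dict above. Since json.dump writes compact single-line JSON, the file is valid JSONL with one record:

import gzip
import json

# Hedged check: re-open the gzipped file and verify the record structure
with gzip.open('output_path/split_0.jsonl.gz', 'rt', encoding='UTF-8') as f:
    for line in f:
        record = json.loads(line)
        assert 'meeting' in record and 'summary' in record
        print(len(record['meeting']), 'utterances')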
Sorry for the late reply. Here is our snippet of code for getting POS_ID and ENT_ID:
import spacy

# The parser must stay enabled here (no parser=False), because _parse_tags
# below iterates doc.sents, which requires sentence boundaries
nlp = spacy.load('en')

POS = {w: i for i, w in enumerate([''] + list(nlp.tagger.labels))}
ENT = {w: i for i, w in enumerate([''] + nlp.entity.move_names)}
def _str(s):
    """Convert PTB bracket tokens back to normal tokens."""
    if s.lower() == '-lrb-':
        s = '('
    elif s.lower() == '-rrb-':
        s = ')'
    elif s.lower() == '-lsb-':
        s = '['
    elif s.lower() == '-rsb-':
        s = ']'
    elif s.lower() == '-lcb-':
        s = '{'
    elif s.lower() == '-rcb-':
        s = '}'
    return s
def _parse_tags(parsed_text):
    output = {'word': [],
              'pos_id': [],
              'ent_id': []}
    for token in parsed_text:
        output['word'].append(_str(token.text))
        pos = token.tag_
        # Unknown tags fall back to id 0 (the '' placeholder)
        output['pos_id'].append(POS[pos] if pos in POS else 0)
        ent = 'O' if token.ent_iob_ == 'O' else (token.ent_iob_ + '-' + token.ent_type_)
        output['ent_id'].append(ENT[ent] if ent in ENT else 0)
    # Sanity check: sentence lengths must add up to the total token count
    word_idx = 0
    for sent in parsed_text.sents:
        word_idx += len(sent)
    assert word_idx == len(output['word'])
    assert len(output['word']) > 0
    return output
# usage
processed_data_obj = _parse_tags(nlp(raw_input_str))
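To make the usage concrete, here is a small hedged example; the sample sentence is mine, not from the original snippet, and it assumes the spaCy 'en' model loaded above:

doc = nlp('Barack Obama visited Paris.')
result = _parse_tags(doc)
# result['word']   -> ['Barack', 'Obama', 'visited', 'Paris', '.']
# result['pos_id'] -> integer ids from the POS mapping above
# result['ent_id'] -> integer ids from the ENT mapping above
print(result['word'], result['pos_id'], result['ent_id'])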
Hi, I have successfully run your code, and the results are quite good. Would it be convenient for you to open-source the pre-training data? Or could you tell me how to get the POS IDs and ENT IDs? Thanks.