microsoft / HMNet

Official Implementation of "A Hierarchical Network for Abstractive Meeting Summarization with Cross-Domain Pretraining"

How to build a new data set with the same format #2

Closed · lx6000 closed this issue 3 years ago

lx6000 commented 3 years ago

Hi, I have successfully run your code, and the results are quite good. Would it be possible for you to open-source the pre-training data? Or could you tell me how to obtain the POS IDs and ENT IDs? Thanks.

JohnnyC08 commented 3 years ago

Here is some code I used to process my own data.

Note that my data was in a DataFrame, with each utterance as a row.

import spacy
import gzip
import pandas as pd
import json

# spaCy 1.x/2.x-style model load; the 'en' shortcut model must be installed
nlp = spacy.load('en')
# Map every POS tag and NER move name to an integer id; index 0 is reserved for the empty tag
POS = {w: i for i, w in enumerate([''] + list(nlp.tagger.labels))}
ENT = {w: i for i, w in enumerate([''] + nlp.entity.move_names)}

def extract(text):
    doc = nlp(text)

    tokens = []
    pos_ids = []
    ent_ids = []

    for token in doc:
        tokens.append(str(token))
        pos_ids.append(POS[token.tag_])

        # Entity ids come from spaCy's NER move names (e.g. 'B-PERSON');
        # non-entity tokens map to the plain 'O' move.
        if token.ent_type_ != '':
            move_name = str(token.ent_iob_) + '-' + str(token.ent_type_)
            ent_id = ENT[move_name]
        elif token.ent_iob_ == 'O':
            ent_id = ENT[token.ent_iob_]
        else:
            raise ValueError('Invalid token')

        ent_ids.append(ent_id)

    return {
        'utt': {
            'word': tokens,
            'pos_id': pos_ids,
            'ent_id': ent_ids
        }
    }

df = pd.read_csv('transcript.csv')

def get_json_element(row):
    line = row['line']
    speaker = row['speaker']

    extracts = extract(line)
    extracts['speaker'] = speaker
    extracts['role'] = speaker

    return extracts

utts = df.apply(get_json_element, axis=1).tolist()

final = {
    'id': 'some id',
    'meeting': utts,
    'summary': []
}

fn = 'output_path/split_0.jsonl.gz'

with gzip.open(fn, 'wt', encoding='UTF-8') as zipfile:
    json.dump(final, zipfile)  
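
For reference, this is roughly the transcript.csv the snippet above expects: a speaker column and a line column, one utterance per row. The two utterances below are made up purely for illustration.

import pandas as pd

# Toy transcript with invented content; a real meeting will have many more rows.
pd.DataFrame({
    'speaker': ['PM', 'UI'],
    'line': ["Okay, let's kick off the project meeting.",
             "I'll start with the interface design."]
}).to_csv('transcript.csv', index=False)
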
xrc10 commented 3 years ago

Sorry for the late reply. Here is our snippet of code for getting POS_ID and ENT_ID:

import spacy
nlp = spacy.load('en', parser=False)  # spaCy 1.x-style kwarg to skip loading the dependency parser
# Index 0 is reserved for the empty tag in both maps
POS = {w: i for i, w in enumerate([''] + list(nlp.tagger.labels))}
ENT = {w: i for i, w in enumerate([''] + nlp.entity.move_names)}

def _str(s):
    """ Convert PTB tokens to normal tokens """
    if (s.lower() == '-lrb-'):
        s = '('
    elif (s.lower() == '-rrb-'):
        s = ')'
    elif (s.lower() == '-lsb-'):
        s = '['
    elif (s.lower() == '-rsb-'):
        s = ']'
    elif (s.lower() == '-lcb-'):
        s = '{'
    elif (s.lower() == '-rcb-'):
        s = '}'
    return s

def _parse_tags(parsed_text):
    output = {  'word': [],
                'pos_id': [],
                'ent_id': []}

    for token in parsed_text:
        output['word'].append(_str(token.text))
        pos = token.tag_
        # Fall back to id 0 (the empty tag) for anything missing from the maps
        output['pos_id'].append(POS[pos] if pos in POS else 0)

        ent = 'O' if token.ent_iob_ == 'O' else (token.ent_iob_ + '-' + token.ent_type_)
        output['ent_id'].append(ENT[ent] if ent in ENT else 0)

    # Sanity check: the sentence segmentation should cover every token exactly once
    word_idx = 0
    for sent in parsed_text.sents:
        # output['sentences'].append((word_idx, word_idx + len(sent)))
        word_idx += len(sent)

    assert word_idx == len(output['word'])
    assert len(output['word']) > 0

    return output

# usage
processed_data_obj = _parse_tags(nlp(raw_input_str))
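
A note for anyone landing here on a newer spaCy: the nlp.tagger / nlp.entity attributes used above are from spaCy 1.x/2.x. Under spaCy 3 the same label tables should be reachable through the pipeline components; the sketch below is my assumption and is not part of the original HMNet preprocessing, so double-check it against your spaCy version.

import spacy

# Assumes spaCy 3.x with en_core_web_sm installed (unverified sketch, not the original recipe)
nlp = spacy.load('en_core_web_sm', exclude=['parser', 'lemmatizer'])
# The sentencizer provides doc.sents since the dependency parser is excluded above
nlp.add_pipe('sentencizer', first=True)

# Same convention as the snippets above: index 0 is reserved for the empty/unknown tag
POS = {w: i for i, w in enumerate([''] + list(nlp.get_pipe('tagger').labels))}
ENT = {w: i for i, w in enumerate([''] + list(nlp.get_pipe('ner').move_names))}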