yzhangcs / parser

:rocket: State-of-the-art parsers for natural language.
https://parser.yzhang.site/
MIT License
829 stars 141 forks source link

CoNLLU annotation #30

Closed attardi closed 4 years ago

attardi commented 4 years ago

Here is code to preserve the annotations in the CoNLLU format.

In utils/corpus.py, change the method Corpus.load():

@classmethod
def load(cls, path, fields, max_sent_length=math.inf):
    """Read a CoNLL-U style file and build a corpus from its sentences.

    A sentence is a run of consecutive non-blank lines; blank lines act as
    separators. All lines of a sentence, comment lines included, are passed
    unchanged (apart from surrounding whitespace) to ``Sentence`` so that
    annotations can be preserved.

    Args:
        path: Path of the file to read.
        fields: One entry per column. ``None`` entries are replaced by a
            ``Field`` named after the column index.
        max_sent_length: Accepted for interface compatibility.
            NOTE(review): this implementation does not filter on it.

    Returns:
        ``cls(fields, sentences)``.
    """
    fields = [field if field is not None else Field(str(i))
              for i, field in enumerate(fields)]
    sentences = []
    lines = []
    # CoNLL-U files are UTF-8 by specification, so do not depend on the
    # platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                lines.append(line)
            elif lines:
                # A blank line closes the current sentence. Repeated or
                # leading blank lines must not produce empty sentences.
                sentences.append(Sentence(fields, lines))
                lines = []
    # Flush the last sentence when the file does not end with a blank line.
    if lines:
        sentences.append(Sentence(fields, lines))

    return cls(fields, sentences)

method Sentence.__init__():

def __init__(self, fields, lines):
    """Build a sentence from the raw lines of one CoNLL-U block.

    Lines whose first tab-separated column is a plain integer are treated as
    token lines: their columns are collected and exposed as per-field tuples.
    Every other line (``#`` comments, and IDs that are not plain integers
    such as ``1-2`` or ``1.1``) is kept verbatim in ``self.annotations``.

    ``self.annotations`` is an insertion-ordered dict that records the
    original line order. Non-token lines are stored under a unique negative
    key derived from their position; token lines are stored under their
    integer ID with an empty placeholder string.

    Args:
        fields: One entry per column; an entry is either an object with a
            ``name`` attribute or an iterable of such objects that all
            receive the same column.
        lines: The stripped, non-blank lines of a single sentence.
    """
    self.annotations = dict()
    values = []
    for i, line in enumerate(lines):
        value = line.split('\t')
        # A comment line starts with '#', so its first column can never be
        # all digits; one check covers comments and non-integer IDs alike.
        if value[0].isdigit():
            values.append(value)
            self.annotations[int(value[0])] = ''  # placeholder
        else:
            # Use the same -i-1 key for every non-token line. Mixing -i-1
            # and -i made a comment at position i collide with a range
            # line at position i+1, which silently dropped the comment.
            self.annotations[-i - 1] = line
    # Transpose rows into columns and attach each column to its field(s).
    for field, value in zip(fields, zip(*values)):
        if isinstance(field, Iterable):
            for subfield in field:
                setattr(self, subfield.name, value)
        else:
            setattr(self, field.name, value)
    self.fields = fields

and method Sentence.__repr__():

def __repr__(self):
    """Render the sentence as text, one line per entry, newline-terminated.

    Starts from a copy of ``self.annotations`` (which fixes the line order)
    and overwrites the entries keyed ``1..n`` with the tab-joined token rows
    obtained by transposing ``self.values``. Keys not already present are
    appended at the end.

    NOTE(review): ``self.values`` is not set in the visible code; it is
    presumably provided elsewhere on the class as one sequence per column.
    """
    rendered = dict(self.annotations)
    for position, row in enumerate(zip(*self.values), start=1):
        rendered[position] = '\t'.join(str(cell) for cell in row)
    body = '\n'.join(rendered.values())
    return body + '\n'
yzhangcs commented 4 years ago

Great! I will add it to the release version. Thanks.