Here is code to preserve the annotations in the CoNLL-U format.
In utils/corpus.py, change the method Corpus.load():
@classmethod
def load(cls, path, fields, max_sent_length=math.inf):
    """Read a CoNLL-U style file and build a corpus from it.

    The file is split into sentences at blank lines. Every non-blank
    line of a sentence (comments included) is handed unchanged to
    ``Sentence`` so that annotations can be preserved.

    Args:
        path: Path of the file to read.
        fields: One entry per column; a ``None`` entry is replaced by
            ``Field(str(i))`` where ``i`` is the column index.
        max_sent_length: Accepted for interface compatibility.
            NOTE(review): this value is not applied anywhere in this
            method -- confirm whether length filtering is still wanted.

    Returns:
        ``cls(fields, sentences)``.
    """
    fields = [field if field is not None else Field(str(i))
              for i, field in enumerate(fields)]
    sentences = []
    lines = []
    # CoNLL-U files are UTF-8 by specification; do not depend on the
    # platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                lines.append(line)
            elif lines:
                # A blank line closes the current sentence. Runs of blank
                # lines must not produce empty sentences.
                sentences.append(Sentence(fields, lines))
                lines = []
    # The file may end without a trailing blank line; keep the last
    # sentence instead of silently dropping it.
    if lines:
        sentences.append(Sentence(fields, lines))
    return cls(fields, sentences)
the method Sentence.__init__():
def __init__(self, fields, lines):
    """Parse the raw lines of one sentence, keeping non-token lines.

    Lines are classified as follows:

    * lines starting with ``#`` and lines whose first tab-separated
      column is not a plain integer (e.g. ``1-2`` or ``1.1``) are stored
      verbatim in ``self.annotations`` under a negative key;
    * lines whose first column is a plain integer are token rows. Their
      columns are collected, and an empty placeholder is stored in
      ``self.annotations`` under the token id so that the row keeps its
      position relative to the preserved lines.

    The token rows are then transposed into columns, and each column is
    bound to the attribute named by the matching field.

    Args:
        fields: One field (an object with a ``name`` attribute) or an
            iterable of such fields per column.
        lines: The non-blank lines of the sentence, in file order.
    """
    self.annotations = dict()
    values = []
    for i, line in enumerate(lines):
        if line.startswith('#'):
            self.annotations[-i-1] = line
            continue
        value = line.split('\t')
        if value[0].isdigit():
            values.append(value)
            self.annotations[int(value[0])] = ''  # placeholder
        else:
            # Same key scheme as comment lines. Using -i here would give
            # this line the key of a comment on the preceding line and
            # overwrite it.
            self.annotations[-i-1] = line
    # zip(*values) turns the list of token rows into per-column tuples.
    for field, value in zip(fields, zip(*values)):
        if isinstance(field, Iterable):
            # Several fields may share one column.
            for sub_field in field:
                setattr(self, sub_field.name, value)
        else:
            setattr(self, field.name, value)
    self.fields = fields
and the method Sentence.__repr__():
def __repr__(self):
    """Render the sentence as text, with preserved lines in place.

    ``self.values`` is transposed with ``zip`` into token rows, and each
    row is joined with tabs and keyed by its 1-based position. Merging
    these over ``self.annotations`` replaces the value stored under each
    matching key. Because a dict keeps the position of an existing key
    when its value is replaced, the rows appear where their placeholders
    sit among the preserved lines.

    Returns:
        The merged lines joined by newlines, followed by a final newline.
    """
    rows = {index: '\t'.join(map(str, row))
            for index, row in enumerate(zip(*self.values), 1)}
    merged = {**self.annotations, **rows}
    return '\n'.join(merged.values()) + '\n'
Here is code to preserve the annotations in the CoNLL-U format.
In utils/corpus.py, change the method Corpus.load(),
the method Sentence.__init__(),
and the method Sentence.__repr__().