Closed · aryehgigi closed this 2 years ago
sorry for the bad commit names...
Hello @aryehgigi
I went over your changes and I suggest a slightly different API.

`predict`:
```python
def predict(self,
            texts: List[str] = None,
            tokenized_texts: List[List[str]] = None,
            max_tokens_in_batch: int = 10000,
            output_file: str = None):
    if texts is not None:
        tokenized = False
        texts_input = texts
    elif tokenized_texts is not None:
        tokenized = True
        texts_input = tokenized_texts
    else:
        raise ValueError("'texts' and 'tokenized_texts' are None, need input to predict.")
    dataset = self._create_dataset(inputs=texts_input, tokenized=tokenized)
    dataloader = self._prepare_batches(dataset, max_tokens_in_batch)
```
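For illustration, here is how the two entry points would be called (the `model` instance and the example sentence are placeholders, not from the diff):

```python
# Raw-text path: spacy tokenization happens inside predict.
preds = model.predict(texts=["Alice met Bob. She smiled."])

# Pre-tokenized path: texts stays None, tokens are passed through as-is.
preds = model.predict(
    tokenized_texts=[["Alice", "met", "Bob", ".", "She", "smiled", "."]]
)
```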
Then `_create_dataset` becomes:
```python
def _create_dataset(self, inputs, tokenized=False):
    logger.info(f'Tokenize {len(inputs)} inputs...')
    # Save original input ordering ('idx') for later use
    if tokenized:
        dataset = Dataset.from_dict({'tokens': inputs, 'idx': range(len(inputs))})
        dataset = dataset.map(
            encode, batched=True, batch_size=10000,
            fn_kwargs={'tokenizer': self.tokenizer, 'nlp': None}
        )
    else:
        dataset = Dataset.from_dict({'text': inputs, 'idx': range(len(inputs))})
        dataset = dataset.map(
            encode, batched=True, batch_size=10000,
            fn_kwargs={'tokenizer': self.tokenizer, 'nlp': self.nlp}
        )
    return dataset
```
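The two branches differ only in the column name (`'text'` vs. `'tokens'`) and in whether `nlp` is forwarded to `encode`. A tiny, runnable illustration of the layout the tokenized branch builds (example data is made up):

```python
from datasets import Dataset

tokens = [["Alice", "met", "Bob", "."]]
ds = Dataset.from_dict({'tokens': tokens, 'idx': list(range(len(tokens)))})
print(ds.column_names)  # ['tokens', 'idx']; the raw-text branch has 'text' instead
```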
and `encode` becomes:
```python
def encode(batch, tokenizer, nlp):
    if nlp is not None:
        tokenized_texts = tokenize_with_spacy(batch['text'], nlp)
    else:
        tokenized_texts = batch
        # pre-tokenized input has no raw text, so there are no char offsets to map to
        tokenized_texts['offset_mapping'] = [[None] * len(tokens) for tokens in tokenized_texts['tokens']]

    encoded_batch = tokenizer(
        tokenized_texts['tokens'], add_special_tokens=True, is_split_into_words=True,
        return_length=True, return_attention_mask=False
    )
    return {
        'tokens': tokenized_texts['tokens'],
        'input_ids': encoded_batch['input_ids'],
        'length': encoded_batch['length'],
        # BPE token -> spacy tokens
        'subtoken_map': [enc.word_ids for enc in encoded_batch.encodings],
        # this we can use for speaker info. TODO: better name!
        'new_token_map': [list(range(len(tokens))) for tokens in tokenized_texts['tokens']],
        # spacy tokens -> text chars
        'offset_mapping': tokenized_texts['offset_mapping']
    }
```
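`tokenize_with_spacy` is not shown here; below is a minimal sketch of the return shape `encode` relies on (a dict with `'tokens'` plus per-token character offsets), the actual implementation may differ:

```python
def tokenize_with_spacy(texts, nlp):
    # Run the spacy pipeline over the raw texts and collect, per text,
    # the token strings and each token's (start, end) character offsets.
    tokens, offsets = [], []
    for doc in nlp.pipe(texts):
        tokens.append([tok.text for tok in doc])
        offsets.append([(tok.idx, tok.idx + len(tok.text)) for tok in doc])
    return {'tokens': tokens, 'offset_mapping': offsets}
```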
Lastly, at inference time you need to determine whether to return char-level or token-level cluster indices, so in the inference function we can do something like:
```python
for batch in dataloader:
    texts_or_tokens = batch['text'] if 'text' in batch else batch['tokens']
```
Then, since we already have the function `align_to_char_level`, we can create `align_to_token_level`, which is basically the same, just without the `token_to_char = batch['offset_mapping']` layer.
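For concreteness, a hedged sketch of what `align_to_token_level` could look like, assuming clusters arrive as lists of inclusive `(start, end)` BPE-span tuples and `subtoken_map` is the per-document list built in `encode` (none of these names are fixed by the diff):

```python
def align_to_token_level(clusters, subtoken_map):
    # Map each BPE-level span to word-level token indices via subtoken_map
    # (BPE position -> word index). align_to_char_level would additionally
    # map the result through token_to_char = batch['offset_mapping'].
    return [
        [(subtoken_map[start], subtoken_map[end]) for start, end in cluster]
        for cluster in clusters
    ]
```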
What do you think?
- adding an option to call `predict` with a tokenized input (requires passing `None` to `texts`)
- adding two corresponding methods to `CorefResult`: