Open dlduddk opened 1 year ago
def split(self, input_ids, tokenizer, logger, block_size=1024): sample = [] i = 0 while i < len(input_ids): sample = input_ids[i: i+block_size] if len(sample) == block_size: for j in range(block_size): if tokenizer.convert_ids_to_tokens(sample[block_size-1-j])[0] == '\u0120' or tokenizer.convert_ids_to_tokens(sample[block_size-1-j]).startswith("<NUM_LIT"): break if sample[block_size-1-j] in [tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.sep_token_id]: if sample[block_size-1-j] != tokenizer.bos_token_id: j -= 1 break if j == block_size-1: print(tokenizer.decode(sample)) exit() sample = sample[: block_size-1-j] # print(len(sample)) i += len(sample) pad_len = block_size-len(sample) sample += [tokenizer.pad_token_id]*pad_len self.inputs.append(sample) if len(self.inputs) % 10000 == 0: logger.info(f"{len(self.inputs)} samples")
When I use LlamaTokenizer, it becomes exit() in this code. But when I use GPT2Tokenizer, it was okay. How can I solve it?
When I use LlamaTokenizer, it becomes exit() in this code. But when I use GPT2Tokenizer, it was okay. How can I solve it?