FutureWithoutEnding opened this issue 11 months ago
Code in `data/dpr_wikitext103_1024/encode_doc.py`:
```python
def inference(**args):
    data = DPRDataset(args['data_path'])
    sampler = torch.utils.data.distributed.DistributedSampler(data)
    data_iter = DataLoader(data, batch_size=args['batch_size'], collate_fn=data.collate, sampler=sampler)
    sampler.set_epoch(0)
    text_lists, embeddings, size, counter = [], [], 0, 0
    for documents, labels in tqdm(data_iter):
        embed = inference_one_batch(documents)
        text_lists.extend(labels)
        embeddings.append(embed)
        size += len(embed)
        if len(embeddings) > args['cut_size']:
            embed = torch.cat(embeddings)
            torch.save((text_lists, embed), f'dpr_chunk_{args["local_rank"]}_{counter}.pt')
            counter += 1
            embeddings = []
    if len(embed) > 0:
        embed = torch.cat(embeddings)
        torch.save((text_lists, embed), f'dpr_chunk_{args["local_rank"]}_{counter}.pt')
```
Is this part of the code right? I think `text_lists` should also be cleared when `embeddings` is reset to `[]`. Otherwise every saved chunk keeps accumulating the labels of all earlier chunks, so the labels in each file no longer line up with its embeddings.
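A minimal sketch of the fix I have in mind (reusing `data_iter`, `args`, `tqdm`, and `inference_one_batch` from the function above; I also changed the final flush to test the pending `embeddings` list, since the original tests the last batch's `embed` and can call `torch.cat` on an empty list):

```python
text_lists, embeddings, size, counter = [], [], 0, 0
for documents, labels in tqdm(data_iter):
    embed = inference_one_batch(documents)
    text_lists.extend(labels)
    embeddings.append(embed)
    size += len(embed)
    if len(embeddings) > args['cut_size']:
        embed = torch.cat(embeddings)
        torch.save((text_lists, embed), f'dpr_chunk_{args["local_rank"]}_{counter}.pt')
        counter += 1
        embeddings = []
        text_lists = []  # reset labels together with the embeddings buffer

# flush the leftovers; checking `embeddings` (not the stale `embed`)
# avoids saving a duplicate chunk or crashing on torch.cat([]) when
# the last iteration already flushed everything
if len(embeddings) > 0:
    embed = torch.cat(embeddings)
    torch.save((text_lists, embed), f'dpr_chunk_{args["local_rank"]}_{counter}.pt')
```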