Open rahulmool opened 1 year ago
Hi, I realized we have made a small modification to the original biencoder.py script. Here is the updated function:
@classmethod
def create_biencoder_input(cls,
                           samples: List,
                           tensorizer: Tensorizer,
                           insert_title: bool,
                           num_hard_negatives: int = 0,
                           num_other_negatives: int = 0,
                           shuffle: bool = True,
                           shuffle_positives: bool = False,
                           ) -> BiEncoderBatch:
    """
    Creates a batch of the biencoder training tuple.

    :param samples: list of data items (from json) to create the batch for;
        each sample is expected to carry 'claim' and 'positive_ctxs' keys and,
        optionally, 'negative_ctxs' / 'hard_negative_ctxs'
    :param tensorizer: components to create model input tensors from a text sequence
    :param insert_title: enables title insertion at the beginning of the context
        sequences (currently unused: contexts are tensorized with title=None)
    :param num_hard_negatives: amount of hard negatives per question (taken from samples' pools)
    :param num_other_negatives: amount of other negatives per question (taken from samples' pools)
    :param shuffle: shuffles negative passages pools
    :param shuffle_positives: shuffles positive passages pools
    :return: BiEncoderBatch tuple
    """
    question_tensors = []
    ctx_tensors = []
    positive_ctx_indices = []
    hard_neg_ctx_indices = []

    for sample in samples:
        # ctx+ & [ctx-] composition: as of now, take a single (gold) ctx+ only.
        if shuffle and shuffle_positives:
            positive_ctxs = sample['positive_ctxs']
            positive_ctx = positive_ctxs[np.random.choice(len(positive_ctxs))]
        else:
            positive_ctx = sample['positive_ctxs'][0]
        positive_ctx = positive_ctx['passage']

        # Preserves the original fallback: when no hard negatives exist, the
        # plain negatives' passages serve as the hard-negative pool and the
        # "other negatives" pool stays empty. .get() avoids a KeyError on
        # samples that carry neither negatives key.
        # NOTE(review): in the first branch neg_ctxs holds raw ctx dicts while
        # positive/hard negatives are passage strings — confirm the tensorizer
        # input type if num_other_negatives > 0 is ever used.
        if 'hard_negative_ctxs' in sample:
            neg_ctxs = sample.get('negative_ctxs', [])
            hard_neg_ctxs = sample['hard_negative_ctxs']
        else:
            neg_ctxs = []
            hard_neg_ctxs = [ctx['passage'] for ctx in sample.get('negative_ctxs', [])]

        question = normalize_question(sample['claim'])

        if shuffle:
            # In-place shuffle (mutates the sample's own lists, as before).
            random.shuffle(neg_ctxs)
            random.shuffle(hard_neg_ctxs)

        neg_ctxs = neg_ctxs[0:num_other_negatives]
        hard_neg_ctxs = hard_neg_ctxs[0:num_hard_negatives]

        all_ctxs = [positive_ctx] + neg_ctxs + hard_neg_ctxs

        # Fix: hard negatives sit AFTER the positive and the other negatives
        # in all_ctxs. The previous hard-coded start index of 1 mislabeled
        # other negatives as hard negatives whenever num_other_negatives > 0.
        hard_negatives_start_idx = 1 + len(neg_ctxs)
        hard_negatives_end_idx = hard_negatives_start_idx + len(hard_neg_ctxs)

        current_ctxs_len = len(ctx_tensors)
        sample_ctxs_tensors = [tensorizer.text_to_tensor(ctx, title=None)
                               for ctx in all_ctxs]
        ctx_tensors.extend(sample_ctxs_tensors)

        # Indices are global positions within the whole batch's ctx list.
        positive_ctx_indices.append(current_ctxs_len)
        hard_neg_ctx_indices.append(
            list(range(current_ctxs_len + hard_negatives_start_idx,
                       current_ctxs_len + hard_negatives_end_idx)))
        question_tensors.append(tensorizer.text_to_tensor(question))

    # Stack per-sample 1-D tensors into (batch, seq_len) matrices.
    ctxs_tensor = torch.cat([ctx.view(1, -1) for ctx in ctx_tensors], dim=0)
    questions_tensor = torch.cat([q.view(1, -1) for q in question_tensors], dim=0)

    # Single-segment inputs: BERT-style segment ids are all zeros.
    ctx_segments = torch.zeros_like(ctxs_tensor)
    question_segments = torch.zeros_like(questions_tensor)

    return BiEncoderBatch(questions_tensor, question_segments, ctxs_tensor, ctx_segments,
                          positive_ctx_indices, hard_neg_ctx_indices)
On running run_xict.sh I get the following error. Traceback (most recent call last): File "run_xict.py", line 600, in
main()
File "run_xict.py", line 590, in main
trainer.run_train()
File "run_xict.py", line 132, in run_train
self._train_epoch(scheduler, epoch, eval_step, train_iterator)
File "run_xict.py", line 325, in _train_epoch
shuffle_positives=args.shuffle_positive_ctx
File "/scratch/22cs60r72/InformationRetrival/copy/CORA/mDPR/mool/lib/python3.7/site-packages/dpr/models/biencoder.py", line 172, in create_biencoder_input
question = normalize_question(sample["question"])
KeyError: 'question'
Should I initialize question = [] or should I do something else?