def preprocess_function_eval(examples):
    inputs, targets = [], []
    for i in range(len(examples[prompt_column])):
        # Skip examples with an empty prompt or an empty response.
        if examples[prompt_column][i] and examples[response_column][i]:
            query = examples[prompt_column][i]
            history = examples[history_column][i] if history_column is not None else None
            prompt = tokenizer.build_prompt(query, history)
            inputs.append(prompt)
            targets.append(examples[response_column][i])

    inputs = [prefix + inp for inp in inputs]
    # Inputs are padded to a common length within the batch...
    model_inputs = tokenizer(
        inputs, max_length=data_args.max_source_length, truncation=True, padding=True)
    # ...but targets are only truncated, so each label keeps its own length.
    labels = tokenizer(text_target=targets,
                       max_length=max_target_length, truncation=True)

    if data_args.ignore_pad_token_for_loss:
        # Replace pad tokens in the labels with -100 so the loss ignores them.
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs
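
For context, this function is meant to be applied to the whole validation split in batched mode; a minimal usage sketch, assuming a Hugging Face `datasets.Dataset` called `eval_dataset` and the same closure variables (`tokenizer`, `prefix`, `data_args`, column names) as above:

```python
# Minimal usage sketch; eval_dataset is assumed to be a datasets.Dataset
# loaded by the surrounding script, as in the upstream ptuning code.
eval_dataset = eval_dataset.map(
    preprocess_function_eval,
    batched=True,  # examples[...] above are batched columns
    remove_columns=eval_dataset.column_names,  # keep only the tokenized fields
    desc="Running tokenizer on validation dataset",
)
```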
I wrote my own version following the logic of your code above, but it raises an error saying that the batch-size dimensions of the target and the input must match, i.e., they have to be the same length. My eval input is instruction + input and my label is output. Can evaluation work this way, or do both the input and the output have to be instruction + input + output?
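
For what it's worth, an error like "expected input batch_size to match target batch_size" usually comes from the loss computation: the flattened logits and labels contain different numbers of tokens. For a decoder-only model such as ChatGLM, loss-based eval needs `labels` to be token-aligned with `input_ids` (same length, with the prompt portion masked out). A minimal sketch of that alignment, assuming hypothetical `prompt_ids` / `answer_ids` lists of token ids for "instruction + input" and "output"; this illustrates the idea, not the repo's exact code:

```python
# Hypothetical illustration of token-aligned labels for loss-based eval
# on a decoder-only model. prompt_ids / answer_ids are assumed token-id
# lists for "instruction + input" and "output" respectively.
input_ids = prompt_ids + answer_ids + [tokenizer.eos_token_id]
labels = [-100] * len(prompt_ids) + answer_ids + [tokenizer.eos_token_id]
assert len(input_ids) == len(labels)  # loss is computed only on the answer tokens
```

The eval snippet above gets away with shorter labels presumably because the upstream script evaluates with `predict_with_generate=True`, where labels are only used for metric computation rather than for the loss.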