huggingface / transformers

🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX.
https://huggingface.co/transformers
Apache License 2.0

Trainer KeyError: 'target_mask' #32349

Closed kunling-cxk closed 2 weeks ago

kunling-cxk commented 3 months ago

System Info


Who can help?

model = prepare_peft_model(model, model_args.peft_mode)
loss_func = TargetLMLoss(ignore_index=tokenizer.pad_token_id)

train_dataset = UnifiedSFTDataset(file=data_args.data_path, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, template = template_dict["default"])
data_collator = SFTDataCollator(tokenizer=tokenizer, max_seq_length=data_args.max_seq_length)

print(train_dataset[0].keys())
trainer = LoRATrainer(
                    model=model,
                    tokenizer=tokenizer,            # default accelerator="gpu"
                    args=training_args,
                    train_dataset=train_dataset,
                    eval_dataset=train_dataset,
                    data_collator=data_collator,
                    compute_loss=loss_func,
)
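
TargetLMLoss itself is not shown in the issue. For context, a loss of this kind typically computes the causal-LM loss only on positions where target_mask == 1, which is why the key matters. A minimal, hypothetical sketch that matches the loss_func(model, inputs, args, return_outputs) calling convention used by the custom Trainer further below (not the reporter's actual implementation):

import torch.nn as nn


class TargetLMLoss:
    """Hypothetical sketch: causal-LM loss restricted to tokens where target_mask == 1."""

    def __init__(self, ignore_index):
        self.ignore_index = ignore_index
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=ignore_index)

    def __call__(self, model, inputs, training_args, return_outputs=False):
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        target_mask = inputs['target_mask']  # the key that goes missing in the reported error

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Only assistant tokens (target_mask == 1) contribute to the loss.
        labels = input_ids.masked_fill(target_mask == 0, self.ignore_index)
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        loss = self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        return (loss, outputs) if return_outputs else loss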

Information

Tasks

Reproduction

class UnifiedSFTDataset(Dataset):
    """Unified SFT data-processing dataset."""
    def __init__(self, file, tokenizer, max_seq_length, template):
        self.tokenizer = tokenizer
        self.template_name = template.template_name        # template name
        self.system_format = template.system_format        # system format of the template
        self.user_format = template.user_format            # user format of the template
        self.assistant_format = template.assistant_format  # assistant format of the template
        self.system = template.system                      # default system prompt, if any

        self.max_seq_length = max_seq_length
        logger.info('Loading data: {}'.format(file))
        data_list = load_raw_dataset(data_dir=file)
        logger.info(f'Use template "{self.template_name}" for training')
        logger.info("There are {} data in dataset".format(len(data_list)))
        self.data_list = data_list

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, index):
        # Each sample is concatenated as: {system_format}{user_format}{assistant_format}{user_format}{assistant_format}...
        data = self.data_list[index]
        if isinstance(data, str):
            data = json.loads(data)  # convert the JSON string into a dict
        # each item is one conversation

        input_ids, target_mask = [], []
        # set the system information
        if self.system_format is not None:  # the case where a system prompt has to be handled
            system = data['system'].strip() if 'system' in data.keys() else self.system
            # the system information is not empty
            if system is not None:
                system_text = self.system_format.format(content=system)
                input_ids = self.tokenizer.encode(system_text, add_special_tokens=False)
                target_mask = [0] * len(input_ids)

        conversations = data['conversation']
        # concatenate the multi-turn conversation
        for i, conv in enumerate(conversations):
            human = conv['human'].strip()
            assistant = conv['assistant'].strip()

            human = self.user_format.format(content=human, stop_token=self.tokenizer.eos_token)
            assistant = self.assistant_format.format(content=assistant, stop_token=self.tokenizer.eos_token)

            input_tokens = self.tokenizer.encode(human, add_special_tokens=False)
            output_tokens = self.tokenizer.encode(assistant, add_special_tokens=False)

            input_ids += input_tokens + output_tokens
            target_mask += [0] * len(input_tokens) + [1] * len(output_tokens)
        assert len(input_ids) == len(target_mask)

        # truncate to the maximum length
        input_ids = input_ids[:self.max_seq_length]
        target_mask = target_mask[:self.max_seq_length]
        attention_mask = [1] * len(input_ids)

        assert len(input_ids) == len(target_mask) == len(attention_mask)
        inputs = {
            'target_mask': target_mask,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        }

        return inputs
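
SFTDataCollator is likewise not included in the report; the KeyError is raised inside it when a batch item no longer carries 'target_mask'. A minimal, hypothetical sketch of what such a collator usually does with the three lists returned by __getitem__ above, padding each one to the longest item in the batch:

import torch


class SFTDataCollator:
    """Hypothetical sketch: pads input_ids / attention_mask / target_mask to a common length."""

    def __init__(self, tokenizer, max_seq_length):
        self.pad_token_id = tokenizer.pad_token_id
        self.max_seq_length = max_seq_length

    def __call__(self, batch):
        # This lookup is where the reported KeyError would surface if 'target_mask' was stripped.
        batch_max_len = min(max(len(x['input_ids']) for x in batch), self.max_seq_length)

        input_ids, attention_mask, target_mask = [], [], []
        for x in batch:
            pad_len = batch_max_len - len(x['input_ids'])
            input_ids.append(x['input_ids'][:batch_max_len] + [self.pad_token_id] * pad_len)
            attention_mask.append(x['attention_mask'][:batch_max_len] + [0] * pad_len)
            target_mask.append(x['target_mask'][:batch_max_len] + [0] * pad_len)

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'target_mask': torch.tensor(target_mask, dtype=torch.long),
        }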

class Trainer(transformers.Trainer):
    """
    Main modification: support a custom loss computation by passing in compute_loss.
    """
    def __init__(
        self,
        model: Union[PreTrainedModel, nn.Module] = None,
        args: TrainingArguments = None,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Dataset] = None,
        eval_dataset: Optional[Dataset] = None,
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
        model_init: Callable[[], PreTrainedModel] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        callbacks: Optional[List[TrainerCallback]] = None,
        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
        preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = None,
        compute_loss=None,
        remove_unused_columns: Optional[bool] = False,
    ):
        super(Trainer, self).__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            model_init=model_init,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
            optimizers=optimizers,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        )
        self.loss_func = compute_loss
        print("trainer:", train_dataset[0].keys())

    def compute_loss(self, model, inputs, return_outputs=False):
        """
        Override how the loss is computed.
        How the loss is computed by Trainer. By default, all models return the loss in the first element.

        Subclass and override for custom behavior.
        """
        if self.loss_func is None:
            loss = super().compute_loss(model, inputs, return_outputs)
        else:
            loss = self.loss_func(model, inputs, self.args, return_outputs)
        return loss
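
Note that the remove_unused_columns parameter accepted by the custom __init__ above is never forwarded anywhere, so it currently has no effect. The flag that actually controls whether Trainer drops features the model's forward() does not accept (such as 'target_mask') is TrainingArguments.remove_unused_columns, which defaults to True. A minimal sketch of one way to wire the parameter through, assuming that is the intent:

# Sketch: inside the custom __init__ above, before super().__init__() is called.
if args is not None:
    # With the default remove_unused_columns=True, Trainer filters out any feature
    # that model.forward() does not accept, which would drop 'target_mask'.
    args.remove_unused_columns = remove_unused_columns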

class LoRATrainer(Trainer):
    """
    Modify the checkpoint saving logic so that only the LoRA weights are saved.
    """
    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        # If we are executing this function, we are the process zero, so we don't check for that.
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Saving model checkpoint to {output_dir}")
        # save the LoRA weights and config
        self.model.save_pretrained(
            output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
        )

        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))

Expected behavior

I'm sure each item of the dataset has 3 keys ('target_mask', 'input_ids', 'attention_mask'), but by the time SFTDataCollator is called, the batch items only contain 2 keys; 'target_mask' is missing.
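
One way to narrow this down is to exercise the dataset and the collator directly, outside of Trainer. If both still produce 'target_mask', the key is being removed between the dataset and the collator, and Trainer's column filtering (remove_unused_columns, enabled by default) is the usual suspect. A minimal check using the objects defined above:

# Sanity check outside of Trainer: the dataset itself yields all three keys.
sample = train_dataset[0]
print(sorted(sample.keys()))  # expected: ['attention_mask', 'input_ids', 'target_mask']

# If calling the collator directly also works, the dataset and collator agree on the keys,
# which points at Trainer's column filtering rather than the data pipeline itself.
batch = data_collator([train_dataset[0], train_dataset[1]])
print(batch.keys())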

github-actions[bot] commented 1 month ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.

LysandreJik commented 1 month ago

Hey! Sorry for the delay; should this be addressed to the TRL or Axolotl library instead? There is no SFTDataCollator in transformers.

github-actions[bot] commented 4 weeks ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.