ridgerchu / matmulfreellm

Implementation for MatMul-free LM.
Apache License 2.0

Failure to train. #9

Closed kazuya-hodatsu-336-1 closed 2 weeks ago

kazuya-hodatsu-336-1 commented 2 weeks ago

Hello.

I tried to run training with the code below, but I kept encountering errors and couldn't get it to work.

Could you provide a sample training method to create a new model?

PyTorch version: 2.3.1+cu121
Triton version: 2.2.0
Einops version: 0.8.0

import os

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoTokenizer
import torch
from datasets import load_dataset
from mmfreelm.models import HGRNBitConfig, HGRNBitModel

# Definition of CustomTrainer class
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        inputs = {key: value.to(self.args.device) for key, value in inputs.items()}
        labels = inputs.pop("labels", None)

        outputs = model(**inputs)
        logits = outputs[0]  # Assuming the first element of the output is logits

        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        else:
            loss = outputs["loss"] if "loss" in outputs else outputs[0]

        return (loss, outputs) if return_outputs else loss

# Load dataset
dataset = load_dataset('range3/wikipedia-ja-20230101', split='train[:1%]')

# Prepare tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, return_tensors="pt")

tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=100)

# Set data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Shape the dataset
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")

# Initialize model
config = HGRNBitConfig()
model = HGRNBitModel(config)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    data_collator=data_collator,
)

# Execute training
trainer.train()

Error:

TypeError                                 Traceback (most recent call last)
Cell In[2], line 71
     62 trainer = CustomTrainer(
     63     model=model,
     64     args=training_args,
   (...)
     67     data_collator=data_collator,
     68 )
     70 # Execute training
---> 71 trainer.train()

File /usr/local/lib/python3.10/dist-packages/transformers/trainer.py:1885, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1883         hf_hub_utils.enable_progress_bars()
   1884 else:
-> 1885     return inner_training_loop(
   1886         args=args,
   1887         resume_from_checkpoint=resume_from_checkpoint,
   1888         trial=trial,
   1889         ignore_keys_for_eval=ignore_keys_for_eval,
   1890     )

File /usr/local/lib/python3.10/dist-packages/transformers/trainer.py:2216, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2213     self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
   2215 with self.accelerator.accumulate(model):
-> 2216     tr_loss_step = self.training_step(model, inputs)
   2218 if (
   2219     args.logging_nan_inf_filter
   2220     and not is_torch_xla_available()
   2221     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   2222 ):
   2223     # if loss is nan or inf simply add the average of previous logged losses
   2224     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File /usr/local/lib/python3.10/dist-packages/transformers/trainer.py:3238, in Trainer.training_step(self, model, inputs)
   3235     return loss_mb.reduce_mean().detach().to(self.args.device)
   3237 with self.compute_loss_context_manager():
-> 3238     loss = self.compute_loss(model, inputs)
   3240 del inputs
   3241 torch.cuda.empty_cache()

Cell In[2], line 14, in CustomTrainer.compute_loss(self, model, inputs, return_outputs)
     11 inputs = {key: value.to(self.args.device) for key, value in inputs.items()}
     12 labels = inputs.pop("labels", None)
---> 14 outputs = model(**inputs)
     15 logits = outputs[0]  # Assuming the first element of the output is logits
     17 if labels is not None:

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
   1530     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1531 else:
-> 1532     return self._call_impl(*args, **kwargs)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
   1536 # If we don't have any hooks, we want to skip the rest of the logic in
   1537 # this function, and just call forward.
   1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or _global_backward_pre_hooks or _global_backward_hooks
   1539         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541     return forward_call(*args, **kwargs)
   1543 try:
   1544     result = None

File /usr/local/lib/python3.10/dist-packages/torch/nn/parallel/data_parallel.py:185, in DataParallel.forward(self, *inputs, **kwargs)
    183     return self.module(*inputs[0], **module_kwargs[0])
    184 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 185 outputs = self.parallel_apply(replicas, inputs, module_kwargs)
    186 return self.gather(outputs, self.output_device)

File /usr/local/lib/python3.10/dist-packages/torch/nn/parallel/data_parallel.py:200, in DataParallel.parallel_apply(self, replicas, inputs, kwargs)
    199 def parallel_apply(self, replicas: Sequence[T], inputs: Sequence[Any], kwargs: Any) -> List[Any]:
--> 200     return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])

File /usr/local/lib/python3.10/dist-packages/torch/nn/parallel/parallel_apply.py:108, in parallel_apply(modules, inputs, kwargs_tup, devices)
    107     if isinstance(output, ExceptionWrapper):
    108         output.reraise()
    109     outputs.append(output)
    110 return outputs

File /usr/local/lib/python3.10/dist-packages/torch/_utils.py:705, in ExceptionWrapper.reraise(self)
    701 except TypeError:
    702     # If the exception takes multiple arguments, don't try to
    703     # instantiate since we don't know how to
    704     raise RuntimeError(msg) from None
--> 705 raise exception

TypeError: Caught TypeError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/mmfreelm/models/hgrn_bit/modeling_hgrn_bit.py", line 253, in forward
    hidden_states, attentions, past_key_values = layer(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/mmfreelm/models/hgrn_bit/modeling_hgrn_bit.py", line 101, in forward
    hidden_states = self.attn_norm(hidden_states)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/mmfreelm/modules/layernorm.py", line 615, in forward
    return rms_norm_fn(
  File "/usr/local/lib/python3.10/dist-packages/mmfreelm/modules/layernorm.py", line 543, in rms_norm_fn
    return LayerNormFn.apply(x, weight, bias, residual, eps, prenorm, residual_in_fp32, True)
  File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 598, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/usr/local/lib/python3.10/dist-packages/mmfreelm/utils.py", line 9, in wrapper
    return fn(ctx,
  File "/usr/local/lib/python3.10/dist-packages/mmfreelm/modules/layernorm.py", line 471, in forward
    y, mean, rstd, residual_out = _layer_norm_fwd(
  File "/usr/local/lib/python3.10/dist-packages/mmfreelm/modules/layernorm.py", line 203, in _layer_norm_fwd
    _layer_norm_fwd_1pass_kernel[(M,)](
  File "/usr/local/lib/python3.10/dist-packages/triton/runtime/autotuner.py", line 153, in run
    full_nargs = {**self.nargs, **kwargs, **self.best_config.kwargs}
TypeError: 'NoneType' object is not a mapping

Am I doing something fundamentally wrong to begin with? Sorry if I'm making a basic mistake.

ridgerchu commented 2 weeks ago

Hello, thank you for reaching out about the issues you encountered while training your model. We recommend using the official Hugging Face training code: it has proven reliable and effective in our experience, we have achieved good performance results with it, and we haven't encountered any significant issues. Thanks!
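
For what it's worth, the plain Trainer should be enough here: after import mmfreelm the hgrn_bit architecture is registered with the Auto* classes, and the causal-LM variant should compute its loss internally whenever labels are passed (as HF-style *ForCausalLM models do), so no compute_loss override or bare HGRNBitModel backbone is needed. A quick, untested sanity check along those lines (a rough sketch, not an official example; it needs a GPU because the Triton kernels do not run on CPU):

import torch
import mmfreelm  # noqa: F401 -- importing mmfreelm registers hgrn_bit with the Auto* classes
from transformers import AutoModelForCausalLM
from mmfreelm.models import HGRNBitConfig

# Deliberately tiny config so the check is cheap; realistic sizes are in the
# 370M config posted further down this thread.
config = HGRNBitConfig(vocab_size=32000, hidden_size=256, num_hidden_layers=2)
model = AutoModelForCausalLM.from_config(config).cuda()

input_ids = torch.randint(0, config.vocab_size, (2, 16), device="cuda")
outputs = model(input_ids=input_ids, labels=input_ids)
print(outputs.loss)  # the model computes the causal-LM loss itself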

kazuya-hodatsu-336-1 commented 2 weeks ago

@ridgerchu Thanks for replying. I'll try it.

radna0 commented 2 weeks ago

Could anyone provide links to the official Hugging Face training code? Or, if possible, could you share what worked for you, @kazuya-hodatsu-336-1?

yumemio commented 1 day ago

@radna0 This 🤗 tutorial is my go-to resource when I want to train a model from scratch. It also has a link to the Colab notebook, which is pretty handy.

Just make sure to swap out AutoConfig for HGRNBitConfig and AutoModel for AutoModelForCausalLM:

from mmfreelm.models import HGRNBitConfig
from transformers import AutoModelForCausalLM

# Config for the 370M model
# Reference: https://huggingface.co/ridger/MMfreeLM-370M/blob/main/config.json
config_params = {
    "attn_mode": "fused_recurrent",
    "bos_token_id": 1,
    "conv_size": 4,
    "eos_token_id": 2,
    "expand_ratio": 1,
    "fuse_cross_entropy": True,
    "hidden_act": "swish",
    "hidden_ratio": 4,
    "hidden_size": 1024,
    "initializer_range": 0.02,
    "intermediate_size": None,
    "max_position_embeddings": 2048,
    "model_type": "hgrn_bit",
    "num_heads": 1,
    "num_hidden_layers": 24,
    "rms_norm_eps": 1e-06,
    "share_conv_kernel": True,
    "tie_word_embeddings": False,
    "torch_dtype": "bfloat16",
    "transformers_version": "4.40.2",
    "use_cache": True,
    "use_lower_bound": True,
    "use_short_conv": False,
    "vocab_size": 32000,
}

config = HGRNBitConfig(**config_params)
model = AutoModelForCausalLM.from_config(config)
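
From there, the rest of the tutorial workflow is the usual tokenize-then-train loop. Here is a rough, untested sketch that reuses config and model from above; the tokenizer and dataset are just placeholders (any tokenizer whose vocabulary matches vocab_size, i.e. 32000 tokens, should work):

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# The Mistral tokenizer is one example of a 32000-token vocabulary.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.pad_token = tokenizer.eos_token

# A small slice of a public corpus, purely for illustration.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:1%]")

def tokenize(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=config.max_position_embeddings,
    )

tokenized = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=2,
        num_train_epochs=1,
        learning_rate=2e-5,
    ),
    train_dataset=tokenized,
    # mlm=False makes the collator build causal-LM labels from the inputs,
    # so no custom compute_loss is needed.
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)
trainer.train()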