pytorch / pytorch

Tensors and Dynamic neural networks in Python with strong GPU acceleration
https://pytorch.org

RuntimeError: NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_() INTERNAL ASSERT FAILED at "../c10/cuda/CUDACachingAllocator.cpp":813, please report a bug to PyTorch. #130486

Open Zzv213 opened 4 months ago

Zzv213 commented 4 months ago

🐛 Describe the bug

```python
checkpoint_path = './llama_relevance_results'

training_args = transformers.TrainingArguments(
    remove_unused_columns=False,  # whether or not to automatically remove the columns unused by the model forward method
    report_to='none',  # default to ['tensorboard', 'wandb']
    num_train_epochs=5,
    per_device_train_batch_size=64,  # 8 for 65B
    gradient_accumulation_steps=1,
    #warmup_ratio=0.05,
    #max_steps=100,
    #save_steps = 10,
    save_strategy="epoch",
    save_total_limit=None,  # if a value is passed, will limit the total amount of checkpoints; deletes the older checkpoints in output_dir
    max_grad_norm=0.3,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    output_dir=checkpoint_path,
    optim="paged_adamw_32bit",
    lr_scheduler_type="constant",
    group_by_length=True,  # whether or not to group together samples of roughly the same length (to minimize padding); only useful with dynamic padding
)

print(f"training_args:\n{training_args}")

data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
    data_collator=data_collator,
    callbacks=[SavePeftModelCallback],
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

# cast selected modules to the dtypes expected for 4-bit LoRA training
for name, module in model.named_modules():
    if isinstance(module, LoraLayer):
        #module = module.to(torch.bfloat16)
        pass
    if 'norm' in name:
        module = module.to(torch.float32)
    if 'lm_head' in name or 'embed_tokens' in name:
        if hasattr(module, 'weight'):
            if module.weight.dtype == torch.float32:
                module = module.to(torch.bfloat16)

trainer.train()
```

```python
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[29], line 52
     49 if module.weight.dtype == torch.float32:
     50     module = module.to(torch.bfloat16)
---> 52 trainer.train()

File ~/ftenv/lib/python3.10/site-packages/transformers/trainer.py:1932, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs) 1930 hf_hub_utils.enable_progress_bars() 1931 else: -> 1932 return inner_training_loop( 1933 args=args, 1934 resume_from_checkpoint=resume_from_checkpoint, 1935 trial=trial, 1936 ignore_keys_for_eval=ignore_keys_for_eval, 1937 )

File ~/ftenv/lib/python3.10/site-packages/transformers/trainer.py:2268, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval) 2265 self.control = self.callback_handler.on_step_begin(args, self.state, self.control) 2267 with self.accelerator.accumulate(model): -> 2268 tr_loss_step = self.training_step(model, inputs) 2270 if ( 2271 args.logging_nan_inf_filter 2272 and not is_torch_xla_available() 2273 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) 2274 ): 2275 # if loss is nan or inf simply add the average of previous logged losses 2276 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~/ftenv/lib/python3.10/site-packages/transformers/trainer.py:3307, in Trainer.training_step(self, model, inputs) 3304 return loss_mb.reduce_mean().detach().to(self.args.device) 3306 with self.compute_loss_context_manager(): -> 3307 loss = self.compute_loss(model, inputs) 3309 del inputs 3311 kwargs = {}

File ~/ftenv/lib/python3.10/site-packages/transformers/trainer.py:3338, in Trainer.compute_loss(self, model, inputs, return_outputs) 3336 else: 3337 labels = None -> 3338 outputs = model(**inputs) 3339 # Save past state if it exists 3340 # TODO: this needs to be fixed and made cleaner later. 3341 if self.args.past_index >= 0:

File ~/ftenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, kwargs) 1530 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(args, kwargs)

File ~/ftenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, *kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(args, **kwargs) 1543 try: 1544 result = None

File ~/ftenv/lib/python3.10/site-packages/accelerate/utils/operations.py:819, in convert_outputs_to_fp32..forward(*args, kwargs) 818 def forward(*args, *kwargs): --> 819 return model_forward(args, kwargs)

File ~/ftenv/lib/python3.10/site-packages/accelerate/utils/operations.py:807, in ConvertOutputsToFp32.call(self, *args, kwargs) 806 def call(self, *args, *kwargs): --> 807 return convert_to_fp32(self.model_forward(args, kwargs))

File ~/ftenv/lib/python3.10/site-packages/torch/amp/autocast_mode.py:16, in autocast_decorator..decorate_autocast(*args, kwargs) 13 @functools.wraps(func) 14 def decorate_autocast(*args, *kwargs): 15 with autocast_instance: ---> 16 return func(args, kwargs)

File ~/ftenv/lib/python3.10/site-packages/peft/peft_model.py:1430, in PeftModelForCausalLM.forward(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, kwargs) 1428 with self._enable_peft_forward_hooks(kwargs): 1429 kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} -> 1430 return self.base_model( 1431 input_ids=input_ids, 1432 attention_mask=attention_mask, 1433 inputs_embeds=inputs_embeds, 1434 labels=labels, 1435 output_attentions=output_attentions, 1436 output_hidden_states=output_hidden_states, 1437 return_dict=return_dict, 1438 **kwargs, 1439 ) 1441 batch_size = _get_batch_size(input_ids, inputs_embeds) 1442 if attention_mask is not None: 1443 # concat prompt attention mask

File ~/ftenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, kwargs) 1530 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(args, kwargs)

File ~/ftenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, *kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(args, **kwargs) 1543 try: 1544 result = None

File ~/ftenv/lib/python3.10/site-packages/peft/tuners/tuners_utils.py:179, in BaseTuner.forward(self, *args, kwargs) 178 def forward(self, *args: Any, *kwargs: Any): --> 179 return self.model.forward(args, kwargs)

File ~/ftenv/lib/python3.10/site-packages/accelerate/hooks.py:169, in add_hook_to_module..new_forward(module, *args, kwargs) 167 output = module._old_forward(*args, *kwargs) 168 else: --> 169 output = module._old_forward(args, kwargs) 170 return module._hf_hook.post_forward(module, output)

File ~/ftenv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:1174, in LlamaForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position) 1171 return_dict = return_dict if return_dict is not None else self.config.use_return_dict 1173 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) -> 1174 outputs = self.model( 1175 input_ids=input_ids, 1176 attention_mask=attention_mask, 1177 position_ids=position_ids, 1178 past_key_values=past_key_values, 1179 inputs_embeds=inputs_embeds, 1180 use_cache=use_cache, 1181 output_attentions=output_attentions, 1182 output_hidden_states=output_hidden_states, 1183 return_dict=return_dict, 1184 cache_position=cache_position, 1185 ) 1187 hidden_states = outputs[0] 1188 if self.config.pretraining_tp > 1:

File ~/ftenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, kwargs) 1530 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(args, kwargs)

File ~/ftenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, *kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(args, **kwargs) 1543 try: 1544 result = None

File ~/ftenv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:967, in LlamaModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position) 964 all_hidden_states += (hidden_states,) 966 if self.gradient_checkpointing and self.training: --> 967 layer_outputs = self._gradient_checkpointing_func( 968 decoder_layer.call, 969 hidden_states, 970 causal_mask, 971 position_ids, 972 past_key_values, 973 output_attentions, 974 use_cache, 975 cache_position, 976 ) 977 else: 978 layer_outputs = decoder_layer( 979 hidden_states, 980 attention_mask=causal_mask, (...) 985 cache_position=cache_position, 986 )

File ~/ftenv/lib/python3.10/site-packages/torch/_compile.py:24, in _disable_dynamo..inner(*args, kwargs) 20 @functools.wraps(fn) 21 def inner(*args, *kwargs): 22 import torch._dynamo ---> 24 return torch._dynamo.disable(fn, recursive)(args, kwargs)

File ~/ftenv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py:451, in _TorchDynamoContext.call.._fn(*args, *kwargs) 449 prior = set_eval_frame(callback) 450 try: --> 451 return fn(args, **kwargs) 452 finally: 453 set_eval_frame(prior)

File ~/ftenv/lib/python3.10/site-packages/torch/_dynamo/external_utils.py:36, in wrap_inline..inner(*args, kwargs) 34 @functools.wraps(fn) 35 def inner(*args, *kwargs): ---> 36 return fn(args, kwargs)

File ~/ftenv/lib/python3.10/site-packages/torch/utils/checkpoint.py:487, in checkpoint(function, use_reentrant, context_fn, determinism_check, debug, *args, kwargs) 482 if context_fn is not noop_context_fn or debug is not False: 483 raise ValueError( 484 "Passing context_fn or debug is only supported when " 485 "use_reentrant=False." 486 ) --> 487 return CheckpointFunction.apply(function, preserve, args) 488 else: 489 gen = _checkpoint_without_reentrant_generator( 490 function, preserve, context_fn, determinism_check, debug, args, kwargs 491 )

File ~/ftenv/lib/python3.10/site-packages/torch/autograd/function.py:598, in Function.apply(cls, *args, *kwargs) 595 if not torch._C._are_functorch_transforms_active(): 596 # See NOTE: [functorch vjp and autograd interaction] 597 args = _functorch.utils.unwrap_dead_wrappers(args) --> 598 return super().apply(args, **kwargs) # type: ignore[misc] 600 if not is_setup_ctx_defined: 601 raise RuntimeError( 602 "In order to use an autograd.Function with functorch transforms " 603 "(vmap, grad, jvp, jacrev, ...), it must override the setup_context " 604 "staticmethod. For more details, please see " 605 "https://pytorch.org/docs/master/notes/extending.func.html" 606 )

File ~/ftenv/lib/python3.10/site-packages/torch/utils/checkpoint.py:262, in CheckpointFunction.forward(ctx, run_function, preserve_rng_state, args) 259 ctx.save_for_backward(tensor_inputs) 261 with torch.no_grad(): --> 262 outputs = run_function(*args) 263 return outputs

File ~/ftenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, kwargs) 1530 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(args, kwargs)

File ~/ftenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, *kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(args, **kwargs) 1543 try: 1544 result = None

File ~/ftenv/lib/python3.10/site-packages/accelerate/hooks.py:169, in add_hook_to_module..new_forward(module, *args, kwargs) 167 output = module._old_forward(*args, *kwargs) 168 else: --> 169 output = module._old_forward(args, kwargs) 170 return module._hf_hook.post_forward(module, output)

File ~/ftenv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:732, in LlamaDecoderLayer.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache, cache_position, **kwargs) 730 residual = hidden_states 731 hidden_states = self.post_attention_layernorm(hidden_states) --> 732 hidden_states = self.mlp(hidden_states) 733 hidden_states = residual + hidden_states 735 outputs = (hidden_states,)

File ~/ftenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, kwargs) 1530 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(args, kwargs)

File ~/ftenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, *kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(args, **kwargs) 1543 try: 1544 result = None

File ~/ftenv/lib/python3.10/site-packages/accelerate/hooks.py:169, in add_hook_to_module..new_forward(module, *args, kwargs) 167 output = module._old_forward(*args, *kwargs) 168 else: --> 169 output = module._old_forward(args, kwargs) 170 return module._hf_hook.post_forward(module, output)

File ~/ftenv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:215, in LlamaMLP.forward(self, x) 213 down_proj = sum(down_proj) 214 else: --> 215 down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) 217 return down_proj

File ~/ftenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, kwargs) 1530 return self._compiled_call_impl(*args, *kwargs) # type: ignore[misc] 1531 else: -> 1532 return self._call_impl(args, kwargs)

File ~/ftenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, *kwargs) 1536 # If we don't have any hooks, we want to skip the rest of the logic in 1537 # this function, and just call forward. 1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1539 or _global_backward_pre_hooks or _global_backward_hooks 1540 or _global_forward_hooks or _global_forward_pre_hooks): -> 1541 return forward_call(args, **kwargs) 1543 try: 1544 result = None

File ~/ftenv/lib/python3.10/site-packages/peft/tuners/lora/bnb.py:480, in Linear4bit.forward(self, x, *args, **kwargs) 477 if requires_conversion: 478 output = output.to(expected_dtype) --> 480 result = result + output 482 return result

RuntimeError: NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_() INTERNAL ASSERT FAILED at "../c10/cuda/CUDACachingAllocator.cpp":813, please report a bug to PyTorch.
```

Versions

```
[pip3] numpy==1.26.4
[pip3] torch==2.3.1+cu121
[pip3] triton==2.3.1
[conda] Could not collect
```

cc @ptrblck @msaroufim

jbschlosser commented 4 months ago

Hi @Zzv213, if possible, could you please provide a small, self-contained reproducible example that demonstrates the error?

ijpq commented 4 months ago

According to the NVML docs, this error indicates that the GPU or driver is in an unstable or bad state. Should the error code returned by the call through DriverAPI::get() be printed in the assertion message?
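
For reference, a minimal sketch of checking NVML outside of PyTorch (this assumes the `nvidia-ml-py` package, imported as `pynvml`, is installed; the package and calls are not part of the original report). If this also fails, the problem is in the driver/GPU state rather than in PyTorch, and the exception string carries the raw NVML error code:

```python
# Standalone NVML sanity check (no PyTorch involved).
# Assumes: pip install nvidia-ml-py   (imported as pynvml)
import pynvml

try:
    pynvml.nvmlInit()  # initialize NVML, the library the failing assert wraps
    count = pynvml.nvmlDeviceGetCount()
    for i in range(count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        print(f"GPU {i}: {pynvml.nvmlDeviceGetName(handle)}")
    pynvml.nvmlShutdown()
    print("NVML initialized and shut down cleanly")
except pynvml.NVMLError as err:
    # err carries the raw NVML error code/string for diagnosis
    print(f"NVML failed: {err}")
```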

nctu6 commented 1 month ago

Environment:
```
Driver Version: 470.161.03
CUDA Driver Version: 12.4
nvidia-cuda-cupti-cu12    12.1.105
nvidia-cuda-nvrtc-cu12    12.1.105
nvidia-cuda-runtime-cu12  12.1.105
torch                     2.4.1
transformers              4.44.2
```

Jackie-shi commented 6 days ago

@nctu6 I got the same error