Duxiaoman-DI / XuanYuan

XuanYuan: Du Xiaoman's Chinese Financial Dialogue LLM

Minimum CUDA version requirement #8

Closed · LuneZ99 closed this issue 11 months ago

LuneZ99 commented 1 year ago

I hit a problem when deploying on two different machines: the one running CUDA 11.7 throws an error.

Error when running the example on CUDA 11.7:

```
输入: 介绍下你自己
/opt/conda/conda-bld/pytorch_1695392020201/work/aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [19,0,0], thread: [96,0
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[5], line 9
      7 print(f"输入: {content}")
      8 inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
----> 9 outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.95)
     10 outputs = tokenizer.decode(outputs.cpu()[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
     11 print(f"输出: {outputs}")

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/generation/utils.py:1652, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
   1644     input_ids, model_kwargs = self._expand_inputs_for_generation(
   1645         input_ids=input_ids,
   1646         expand_size=generation_config.num_return_sequences,
   1647         is_encoder_decoder=self.config.is_encoder_decoder,
   1648         **model_kwargs,
   1649     )
   1651     # 13. run sample
-> 1652     return self.sample(
   1653         input_ids,
   1654         logits_processor=logits_processor,
   1655         logits_warper=logits_warper,
   1656         stopping_criteria=stopping_criteria,
   1657         pad_token_id=generation_config.pad_token_id,
   1658         eos_token_id=generation_config.eos_token_id,
   1659         output_scores=generation_config.output_scores,
   1660         return_dict_in_generate=generation_config.return_dict_in_generate,
   1661         synced_gpus=synced_gpus,
   1662         streamer=streamer,
   1663         **model_kwargs,
   1664     )
   1666 elif generation_mode == GenerationMode.BEAM_SEARCH:
   1667     # 11. prepare beam search scorer
   1668     beam_scorer = BeamSearchScorer(
   1669         batch_size=batch_size,
   1670         num_beams=generation_config.num_beams,
   (...)
   1675         max_length=generation_config.max_length,
   1676     )

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/generation/utils.py:2734, in GenerationMixin.sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
   2731 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
   2733 # forward pass to get next token
-> 2734 outputs = self(
   2735     **model_inputs,
   2736     return_dict=True,
   2737     output_attentions=output_attentions,
   2738     output_hidden_states=output_hidden_states,
   2739 )
   2741 if synced_gpus and this_peer_finished:
   2742     continue  # don't waste resources running the code we don't need

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/accelerate/hooks.py:164, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    162     output = module._old_forward(*args, **kwargs)
    163 else:
--> 164     output = module._old_forward(*args, **kwargs)
    165 return module._hf_hook.post_forward(module, output)

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:1038, in LlamaForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
   1035 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
   1037 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1038 outputs = self.model(
   1039     input_ids=input_ids,
   1040     attention_mask=attention_mask,
   1041     position_ids=position_ids,
   1042     past_key_values=past_key_values,
   1043     inputs_embeds=inputs_embeds,
   1044     use_cache=use_cache,
   1045     output_attentions=output_attentions,
   1046     output_hidden_states=output_hidden_states,
   1047     return_dict=return_dict,
   1048 )
   1050 hidden_states = outputs[0]
   1051 if self.config.pretraining_tp > 1:

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:925, in LlamaModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
    921     layer_outputs = torch.utils.checkpoint.checkpoint(
    922         create_custom_forward(decoder_layer), hidden_states, attention_mask, position_ids
    923     )
    924 else:
--> 925     layer_outputs = decoder_layer(
    926         hidden_states,
    927         attention_mask=attention_mask,
    928         position_ids=position_ids,
    929         past_key_value=past_key_value,
    930         output_attentions=output_attentions,
    931         use_cache=use_cache,
    932         padding_mask=padding_mask,
    933     )
    935 hidden_states = layer_outputs[0]
    937 if use_cache:

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/accelerate/hooks.py:159, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
    158 def new_forward(module, *args, **kwargs):
--> 159     args, kwargs = module._hf_hook.pre_forward(module, *args, **kwargs)
    160     if module._hf_hook.no_grad:
    161         with torch.no_grad():

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/accelerate/hooks.py:290, in AlignDevicesHook.pre_forward(self, module, *args, **kwargs)
    285         fp16_statistics = self.weights_map[name.replace("weight", "SCB")]
    286     set_module_tensor_to_device(
    287         module, name, self.execution_device, value=self.weights_map[name], fp16_statistics=fp16_statistics
    288     )
--> 290 return send_to_device(args, self.execution_device), send_to_device(
    291     kwargs, self.execution_device, skip_keys=self.skip_keys
    292 )

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/accelerate/utils/operations.py:160, in send_to_device(tensor, device, non_blocking, skip_keys)
    157     elif skip_keys is None:
    158         skip_keys = []
    159     return type(tensor)(
--> 160         {
    161             k: t if k in skip_keys else send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys)
    162             for k, t in tensor.items()
    163         }
    164     )
    165 elif hasattr(tensor, "to"):
    166     try:

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/accelerate/utils/operations.py:161, in <dictcomp>(.0)
    157     elif skip_keys is None:
    158         skip_keys = []
    159     return type(tensor)(
    160         {
--> 161             k: t if k in skip_keys else send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys)
    162             for k, t in tensor.items()
    163         }
    164     )
    165 elif hasattr(tensor, "to"):
    166     try:

File ~/miniconda3/envs/llm/lib/python3.10/site-packages/accelerate/utils/operations.py:167, in send_to_device(tensor, device, non_blocking, skip_keys)
    165 elif hasattr(tensor, "to"):
    166     try:
--> 167         return tensor.to(device, non_blocking=non_blocking)
    168     except TypeError:  # .to() doesn't accept non_blocking as kwarg
    169         return tensor.to(device)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
```
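The tail of the trace already points at the next debugging step: device-side asserts are reported asynchronously, so the Python frame shown above may not be the one that actually failed. A minimal sketch of rerunning with synchronous kernel launches to localize the assert (the env var must be set before CUDA is initialized; the checkpoint name below is only a placeholder for whichever XuanYuan model is being deployed):

```python
import os

# Must be set before the first CUDA call so kernel launches are synchronous
# and the failing op is reported at the line that issued it.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint; substitute the one actually being loaded.
model_name = "Duxiaoman-DI/XuanYuan-70B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

inputs = tokenizer("介绍下你自己", return_tensors="pt").to("cuda")
# With launch blocking on, the out-of-range index assert from IndexKernel.cu
# should surface at the exact call that triggered it.
outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True,
                         temperature=0.7, top_p=0.95)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```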

On the CUDA 11.8 machine it runs fine. Could it be that this project requires at least CUDA 11.8?
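When comparing the two machines, it also helps to record exactly which CUDA runtime each torch build was compiled against, since that usually matters more than the system toolkit version. A small sketch for collecting that information:

```python
import torch

# CUDA runtime this torch wheel was built against, e.g. "11.7" vs "11.8".
print("torch:", torch.__version__)
print("built with CUDA:", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))
    print("compute capability:", torch.cuda.get_device_capability(0))
```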

Duxiaomantech commented 1 year ago

Hi, our test environments are all on CUDA 12.0 or later; we haven't tested 11.7 yet. The failure is most likely because transformers/torch require a fairly recent CUDA version. Thanks for the feedback; we have added the CUDA environment requirements to the README.
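For anyone landing here with the same assert, a small guard along these lines can make a version mismatch obvious before loading the model. This is only a sketch: the 11.8 lower bound is just what this thread observed (the maintainers test on CUDA 12.0+), and the cu121 index URL is the standard PyTorch wheel index.

```python
import torch
from packaging import version

# Lower bound taken from this thread: 11.8 worked, 11.7 did not.
MIN_CUDA = "11.8"

built_with = torch.version.cuda  # None for CPU-only builds
if built_with is None or version.parse(built_with) < version.parse(MIN_CUDA):
    raise RuntimeError(
        f"torch {torch.__version__} was built against CUDA {built_with}; "
        f"this issue suggests >= {MIN_CUDA}. Consider reinstalling, e.g.\n"
        "  pip install torch --index-url https://download.pytorch.org/whl/cu121"
    )
print(f"torch {torch.__version__} built with CUDA {built_with}: OK")
```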