zhangyuanscall opened 1 year ago
Loading method:
```python
model = AutoModel.from_pretrained(visualchatglm_model_path, trust_remote_code=True).to(torch.cuda.current_device())
```
Environment:

```
SwissArmyTransformer 0.3.7
transformers 4.28.1
deepspeed 0.9.1
torch 1.11.0+cu113
torchaudio 0.11.0+rocm4.5.2
torchvision 0.12.0+cu113
cpm-kernels 1.0.11
einops 0.6.1
```
Error log:
```
Epoch_0:   0%|          | 0/16 [00:04<?, ?it/s]
Traceback (most recent call last):
  File "/export/App/training_platform/PinoModel/applications/VisualGLM/visual_chatglm_instructing_mergeclose_v1.py", line 229, in <module>
    train(args)
  File "/export/App/training_platform/PinoModel/applications/VisualGLM/visual_chatglm_instructing_mergeclose_v1.py", line 203, in train
    trainer.fit(logger=logger, log_interval=args.log_interval)
  File "/export/App/training_platform/PinoModel/applications/VisualGLM/coati/trainer/visual_sft_glm.py", line 134, in fit
    outputs = self.model(input_ids=prompt_ids, images=image, ...
  File "/usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/visualglm/modeling_chatglm.py", line 1462, in forward
    image_embeds = self.image_encoder(images)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/visualglm/visual.py", line 69, in forward
    enc = self.vit(image)[0]
  File "/usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/visualglm/visual.py", line 28, in forward
    return super().forward(input_ids=input_ids, position_ids=None, ...
  File "/usr/local/anaconda3/lib/python3.8/site-packages/sat/model/base_model.py", line 144, in forward
    return self.transformer(*args, **kwargs)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/sat/model/transformer.py", line 569, in forward
    layer_ret = layer(*args, layer_id=torch.tensor(i), ...
  File "/usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/sat/model/transformer.py", line 330, in forward
    return HOOKS_DEFAULT['layer_forward'](self, hidden_states, mask, ...
  File "/usr/local/anaconda3/lib/python3.8/site-packages/sat/transformer_defaults.py", line 127, in layer_forward_default
    attention_output = self.attention(attention_input, mask, **kw_args)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/sat/model/transformer.py", line 103, in forward
    return HOOKS_DEFAULT['attention_forward'](self, hidden_states, ...
  File "/usr/local/anaconda3/lib/python3.8/site-packages/sat/transformer_defaults.py", line 63, in attention_forward_default
    context_layer = attention_fn(query_layer, key_layer, value_layer, ...
  File "/usr/local/anaconda3/lib/python3.8/site-packages/sat/transformer_defaults.py", line 38, in standard_attention
    with mpu.get_cuda_rng_tracker().fork():
  File "/usr/local/anaconda3/lib/python3.8/contextlib.py", line 113, in __enter__
    return next(self.gen)
  File "/usr/local/anaconda3/lib/python3.8/site-packages/deepspeed/runtime/activation_checkpointing/checkpointing.py", line 174, in fork
    raise Exception('cuda rng state {} is not added'.format(name))
Exception: cuda rng state model-parallel-rng is not added
```
We recommend fine-tuning with the provided sat method. Because sat supports model parallelism (tensor parallel), it has to determine the model-parallel size up front in order to partition the model and register the random seeds and so on. If you train with another framework, you need to initialize some of this manually. Plain PyTorch should be fine, but it is unclear whether this conflicts with some of the initialization done when training with transformers.
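For anyone training outside the sat entry point, here is a minimal sketch of the manual initialization referred to above, assuming no tensor parallelism (model-parallel size 1); `SEED` is a placeholder for your training seed. The key point is that the `model-parallel-rng` state named in the exception must be registered with DeepSpeed's RNG tracker once per process, before the first forward pass:

```python
# Minimal sketch, not the official fix: register the CUDA RNG state
# that sat's own initialization would normally set up.
from deepspeed.runtime.activation_checkpointing.checkpointing import (
    get_cuda_rng_tracker,
    model_parallel_cuda_manual_seed,
)

SEED = 42  # hypothetical; use your actual training seed

# Derives a model-parallel seed from SEED and adds the
# 'model-parallel-rng' state that fork() looks up.
model_parallel_cuda_manual_seed(SEED)

# Roughly equivalent alternative for the single-GPU case: add the
# missing state directly under the exact name from the exception.
# get_cuda_rng_tracker().add('model-parallel-rng', SEED)
```

Call this after CUDA/distributed setup and before `trainer.fit`; the provided sat fine-tuning scripts do the equivalent internally, which is why the error does not appear there.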