ssbuild / chatglm2_finetuning

ChatGLM2-6B fine-tuning and Alpaca fine-tuning
Apache License 2.0

LoRA multi-GPU training throws an error, please take a look #24

Open fjchung opened 1 year ago

fjchung commented 1 year ago

The training command:

```bash
CUDA_VISIBLE_DEVICES=0,1 python train.py
```

The error output (library frames are under /usr/local/lib/python3.8/dist-packages):

```
Traceback (most recent call last):
  /chatglm2-dev/train.py:122 in <module>
      trainer.fit(pl_model, train_dataloaders=train_datasets)
  lightning/pytorch/trainer/trainer.py:520 in fit
  lightning/pytorch/trainer/call.py:42 in _call_and_handle_interrupt
  lightning/pytorch/strategies/launchers/subprocess_script.py:92 in launch
  lightning/pytorch/trainer/trainer.py:559 in _fit_impl
  lightning/pytorch/trainer/trainer.py:935 in _run
  lightning/pytorch/trainer/trainer.py:978 in _run_stage
  lightning/pytorch/loops/fit_loop.py:201 in run
  lightning/pytorch/loops/fit_loop.py:354 in advance
  lightning/pytorch/loops/training_epoch_loop.py:133 in run
  lightning/pytorch/loops/training_epoch_loop.py:218 in advance
  lightning/pytorch/loops/optimization/automatic.py:185 in run
  lightning/pytorch/loops/optimization/automatic.py:261 in _optimizer_step
  lightning/pytorch/trainer/call.py:142 in _call_lightning_module_hook
  lightning/pytorch/core/module.py:1265 in optimizer_step
  lightning/pytorch/core/optimizer.py:158 in step
  lightning/pytorch/strategies/ddp.py:259 in optimizer_step
  lightning/pytorch/strategies/strategy.py:224 in optimizer_step
  lightning/pytorch/plugins/precision/deepspeed.py:92 in optimizer_step
  lightning/pytorch/loops/optimization/automatic.py:140 in __call__
  lightning/pytorch/loops/optimization/automatic.py:126 in closure
  lightning/pytorch/loops/optimization/automatic.py:308 in _training_step
  lightning/pytorch/trainer/call.py:288 in _call_strategy_hook
  lightning/pytorch/strategies/ddp.py:331 in training_step
      return self.model(*args, **kwargs)
  torch/nn/modules/module.py:1130 in _call_impl
  deepspeed/utils/nvtx.py:15 in wrapped_fn
  deepspeed/runtime/engine.py:1769 in forward
      loss = self.module(*inputs, **kwargs)
  torch/nn/modules/module.py:1130 in _call_impl
  lightning/pytorch/overrides/base.py:90 in forward
  deep_training/nlp/models/transformer_base.py:552 in training_step
  deep_training/nlp/models/transformer_base.py:371 in compute_loss
  deep_training/nlp/models/transformer_base.py:117 in compute_loss
  torch/nn/modules/module.py:1130 in _call_impl
  accelerate/hooks.py:165 in new_forward
  deep_training/nlp/models/chatglm2/modeling_chatglm.py:953 in forward
  torch/nn/modules/module.py:1130 in _call_impl
  deep_training/nlp/models/chatglm2/modeling_chatglm.py:849 in forward
  torch/nn/modules/module.py:1130 in _call_impl
  deep_training/nlp/models/chatglm2/modeling_chatglm.py:639 in forward
      layer_ret = torch.utils.checkpoint.checkpoint(layer, hidden_states, attention_mask, ...)
  torch/utils/checkpoint.py:235 in checkpoint
  torch/utils/checkpoint.py:96 in forward
  torch/nn/modules/module.py:1130 in _call_impl
  accelerate/hooks.py:165 in new_forward
  deep_training/nlp/models/chatglm2/modeling_chatglm.py:551 in forward
      layernorm_output = self.input_layernorm(hidden_states)
  torch/nn/modules/module.py:1130 in _call_impl
  accelerate/hooks.py:165 in new_forward
  deep_training/nlp/models/chatglm2/modeling_chatglm.py:201 in forward
      return (self.weight * hidden_states).to(input_dtype)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!
```
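For reference, the failing frame is the elementwise multiply inside the ChatGLM2 RMSNorm. The same RuntimeError appears whenever the two operands sit on different GPUs; a minimal sketch of the failure mode (not code from this repo):

```python
import torch

# Minimal illustration: an elementwise op between tensors placed on different
# CUDA devices raises the same "Expected all tensors to be on the same device"
# error seen in the traceback above.
if torch.cuda.device_count() >= 2:
    weight = torch.ones(8, device="cuda:0")
    hidden_states = torch.randn(8, device="cuda:1")
    try:
        _ = weight * hidden_states
    except RuntimeError as e:
        print(e)  # ... found at least two devices, cuda:0 and cuda:1!
```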

train_info_args in sft_config.py is as follows:

```python
train_info_args = {
    'devices': 2,
    'data_backend': 'parquet',  # one of: record, lmdb, arrow_stream, arrow_file, parquet;
                                # lmdb is suitable for very large datasets (note: lmdb uses more storage than record)

    # pretrained model path
    **train_model_config,

    'convert_onnx': False,  # export an ONNX model
    'do_train': True,
    'train_file': ['/chatglm2-dev/data/finetune_train_examples.json'],
    'max_epochs': 20,
    'max_steps': -1,
    'optimizer': 'lion',
    # one of [lamb, adma, adamw_hf, adamw, adamw_torch, adamw_torch_fused, adamw_torch_xla,
    #         adamw_apex_fused, adafactor, adamw_anyprecision, sgd, adagrad, adamw_bnb_8bit,
    #         adamw_8bit, lion_8bit, lion_32bit, paged_adamw_32bit, paged_adamw_8bit,
    #         paged_lion_32bit, paged_lion_8bit]

    'scheduler_type': 'CAWR',
    # one of [linear, WarmupCosine, CAWR, CAL, Step, ReduceLROnPlateau, cosine, cosine_with_restarts,
    #         polynomial, constant, constant_with_warmup, inverse_sqrt, reduce_lr_on_plateau]
    'scheduler': {'T_mult': 1,
                  'rewarm_epoch_num': 0.5,  # used when max_epochs is not None
                  # 'T_0': 50000,           # used when max_epochs is None: set the number of steps
                  'verbose': False},

    # alternative scheduler settings:
    # 'scheduler_type': 'linear',            'scheduler': None,
    # 'scheduler_type': 'WarmupCosine',      'scheduler': None,
    # 'scheduler_type': 'ReduceLROnPlateau', 'scheduler': None,
    # 'scheduler_type': 'Step',              'scheduler': {'decay_rate': 0.999, 'decay_steps': 100, 'verbose': True},
    # 'scheduler_type': 'CAWR',              'scheduler': {'T_mult': 1, 'rewarm_epoch_num': 2, 'verbose': True},
    # 'scheduler_type': 'CAL',               'scheduler': {'rewarm_epoch_num': 2, 'verbose': True},

    'optimizer_betas': (0.9, 0.999),
    'train_batch_size': 1,
    'eval_batch_size': 2,
    'test_batch_size': 2,
    'learning_rate': 2e-5,
    'adam_epsilon': 1e-8,
    'gradient_accumulation_steps': 1,
    'max_grad_norm': 1.0,
    'weight_decay': 0,
    'warmup_steps': 0,
    'output_dir': './output',
    'max_seq_length': 16,     # with enough resources, 2048 is recommended to match the official setting
    'max_target_length': 16,  # maximum generation length (reserved field)
    'use_fast_tokenizer': False,
    'do_lower_case': False,
}
```
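Since the run above combines 'devices': 2 with CUDA_VISIBLE_DEVICES=0,1, a quick sanity check (a hypothetical snippet, not part of the repo) is to confirm how many GPUs the launched process actually sees:

```python
import torch

# The number of visible GPUs should match train_info_args['devices'];
# with CUDA_VISIBLE_DEVICES=0,1 this is expected to print 2.
visible = torch.cuda.device_count()
print(f"visible GPUs: {visible}")
assert visible >= 2, "train_info_args['devices'] = 2 needs at least two visible GPUs"
```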

The relevant part of main.py:

```python
# module configuration; LoRA is enabled by default
enable_deepspeed = True
enable_ptv2 = False
enable_lora = True
load_in_bit = 0  # 4 = load_in_4bit, 8 = load_in_8bit, anything else = 0

if enable_lora:
    from config.sft_config_lora import *
elif enable_ptv2:
    from config.sft_config_ptv2 import *
else:
    from config.sft_config import *

if enable_lora:
    enable_ptv2 = False
    global_args['load_in_4bit'] = load_in_bit == 4
    global_args['load_in_8bit'] = load_in_bit == 8

    if global_args['load_in_4bit']:
        global_args['quantization_config'] = None

    # check whether lora / adalora is enabled
    if 'lora' not in train_info_args and 'adalora' not in train_info_args:
        raise ValueError('please config lora or adalora')
    if train_info_args.get('lora', {}).get('with_lora', False) and train_info_args.get('adalora', {}).get('with_lora', False):
        raise Exception('lora and adalora can set one at same time !')

    train_info_args.pop('prompt', None)
elif enable_ptv2:
    enable_lora = False
    global_args['load_in_4bit'] = False
    global_args['load_in_8bit'] = False
    train_info_args.pop('lora', None)
    train_info_args.pop('adalora', None)
else:
    enable_ptv2 = False
    enable_lora = False
    global_args['load_in_4bit'] = False
    # global_args['load_in_8bit'] = False
    train_info_args.pop('lora', None)
    train_info_args.pop('adalora', None)
    train_info_args.pop('prompt', None)

# preprocessing
if 'rwkv' in train_info_args['tokenizer_name'].lower():
    train_info_args['use_fast_tokenizer'] = True


def get_deepspeed_config():
    '''
    LoRA / prompt fine-tuning uses deepspeed_offload.json;
    plain fine-tuning uses deepspeed.json
    '''
    # whether deepspeed is enabled
    if not enable_deepspeed:
        return None

    # choose the deepspeed config file
    is_need_update_config = False
    if enable_lora or enable_ptv2:
        is_need_update_config = True
        filename = os.path.join(os.path.dirname(__file__), 'deepspeed_offload.json')
    else:
        filename = os.path.join(os.path.dirname(__file__), 'deepspeed.json')

    with open(filename, mode='r', encoding='utf-8') as f:
        deepspeed_config = json.loads(f.read())

    # lora offload: sync the optimizer settings with train_info_args
    if is_need_update_config:
        optimizer = deepspeed_config.get('optimizer', None)
        if optimizer:
            optimizer['params']['betas'] = train_info_args.get('optimizer_betas', (0.9, 0.999))
            optimizer['params']['lr'] = train_info_args.get('learning_rate', 2e-5)
            optimizer['params']['eps'] = train_info_args.get('adam_epsilon', 1e-8)
    return deepspeed_config
```
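For context, a short usage sketch of get_deepspeed_config() (assuming deepspeed_offload.json contains an "optimizer" section, which is not shown in this issue):

```python
# When LoRA (or ptv2) is enabled, the offload config is loaded and its optimizer
# parameters are overwritten from train_info_args before training starts.
ds_config = get_deepspeed_config()
if ds_config is not None and 'optimizer' in ds_config:
    print(ds_config['optimizer']['params'])
    # expected to reflect train_info_args, e.g. lr=2e-5, betas=(0.9, 0.999), eps=1e-8
```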
ssbuild commented 1 year ago

```bash
pip list | grep -E "ing|torch|deep"
```

ssbuild commented 1 year ago

`export CUDA_DEVICE_ORDER="PCI_BUS_ID"`, then use one of the following approaches:

  1. Set `train_info_args['devices'] = [0,1,2,3]` and set the environment variable `CUDA_VISIBLE_DEVICES=4,5,6,7`.

  2. Set `train_info_args['devices'] = 4` and set the environment variable `CUDA_VISIBLE_DEVICES=4,5,6,7`.
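A sketch of option 2 in code form (the GPU ids 4-7 are just the example from the comment above; exporting the variables in the shell before launching works the same way):

```python
import os

# Must run before torch/lightning are imported, so the process only ever
# sees the four GPUs it is supposed to use.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"

import torch  # imported only after the environment is pinned

assert torch.cuda.device_count() == 4
# and in the config: train_info_args['devices'] = 4
```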

fjchung commented 1 year ago

> `export CUDA_DEVICE_ORDER="PCI_BUS_ID"`, then use one of the following approaches:
>
>   1. Set `train_info_args['devices'] = [0,1,2,3]` and set the environment variable `CUDA_VISIBLE_DEVICES=4,5,6,7`.
>   2. Set `train_info_args['devices'] = 4` and set the environment variable `CUDA_VISIBLE_DEVICES=4,5,6,7`.

It still doesn't work; I get the same error as before. Full-parameter fine-tuning runs fine, but multi-GPU LoRA training fails with this error.
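One hedged debugging aid (not from the repo): dump which device each parameter ends up on inside a single rank; in the failing LoRA run one would expect both cuda:0 and cuda:1 to show up, matching the RuntimeError above.

```python
from collections import Counter

def summarize_param_devices(model):
    """Print how many parameter tensors live on each device for one process/rank."""
    counts = Counter(str(p.device) for p in model.parameters())
    for device, n in sorted(counts.items()):
        print(f"{device}: {n} parameter tensors")

# e.g. call summarize_param_devices(pl_model) right before trainer.fit(...)
```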