fjchung opened this issue 1 year ago
pip list | grep -E "ing|torch|deep"
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
Then use one of the following options (see the sketch below):
- Set train_info_args['devices'] = [0,1,2,3] and set the environment variable CUDA_VISIBLE_DEVICES=4,5,6,7
- Set train_info_args['devices'] = 4 and set the environment variable CUDA_VISIBLE_DEVICES=4,5,6,7
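For example, on an 8-GPU machine where training should run on physical cards 4-7, the two settings combine as follows. This is a minimal sketch: train_info_args here is only a placeholder for the dict defined in the repository's config files, and the environment variables must be set before CUDA is initialized (e.g. at the top of train.py or in the launching shell).

import os

# Must run before torch / lightning initialize CUDA.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # enumerate GPUs in PCI bus order
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"   # expose only cards 4-7; inside the process they are renumbered 0-3

# In the config, refer to the renumbered indices:
train_info_args = {}                             # placeholder for the repository's config dict
train_info_args['devices'] = [0, 1, 2, 3]        # or, equivalently, the number of devices: train_info_args['devices'] = 4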
Still not working; I get the same error as before. Full-parameter fine-tuning works fine, but LoRA with multiple GPUs throws this error.
The training command is: CUDA_VISIBLE_DEVICES=0,1 python train.py
The error message is as follows:
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /chatglm2-dev/train.py:122 in <module> │
│ │
│ 119 │ ) │
│ 120 │ │
│ 121 │ if train_datasets is not None: │
│ ❱ 122 │ │ trainer.fit(pl_model, train_dataloaders=train_datasets) │
│ 123 │
│ 124 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 520 in fit │
│ │
│ 517 │ │ """ │
│ 518 │ │ model = _maybe_unwrap_optimized(model) │
│ 519 │ │ self.strategy._lightning_module = model │
│ ❱ 520 │ │ call._call_and_handle_interrupt( │
│ 521 │ │ │ self, self._fit_impl, model, train_dataloaders, val_datal │
│ 522 │ │ ) │
│ 523 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:42 │
│ in _call_and_handle_interrupt │
│ │
│ 39 │ """ │
│ 40 │ try: │
│ 41 │ │ if trainer.strategy.launcher is not None: │
│ ❱ 42 │ │ │ return trainer.strategy.launcher.launch(trainer_fn, args, │
│ 43 │ │ else: │
│ 44 │ │ │ return trainer_fn(*args, **kwargs) │
│ 45 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/launcher │
│ s/subprocess_script.py:92 in launch │
│ │
│ 89 │ │ """ │
│ 90 │ │ if not self.cluster_environment.creates_processes_externally: │
│ 91 │ │ │ self._call_children_scripts() │
│ ❱ 92 │ │ return function(*args, **kwargs) │
│ 93 │ │
│ 94 │ def kill(self, signum: _SIGNUM) -> None: │
│ 95 │ │ for proc in self.procs: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 559 in _fit_impl │
│ │
│ 556 │ │ │ model_provided=True, │
│ 557 │ │ │ model_connected=self.lightning_module is not None, │
│ 558 │ │ ) │
│ ❱ 559 │ │ self._run(model, ckpt_path=ckpt_path) │
│ 560 │ │ │
│ 561 │ │ assert self.state.stopped │
│ 562 │ │ self.training = False │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 935 in _run │
│ │
│ 932 │ │ # ---------------------------- │
│ 933 │ │ # RUN THE TRAINER │
│ 934 │ │ # ---------------------------- │
│ ❱ 935 │ │ results = self._run_stage() │
│ 936 │ │ │
│ 937 │ │ # ---------------------------- │
│ 938 │ │ # POST-Training CLEAN UP │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/trainer.py: │
│ 978 in _run_stage │
│ │
│ 975 │ │ │ with isolate_rng(): │
│ 976 │ │ │ │ self._run_sanity_check() │
│ 977 │ │ │ with torch.autograd.set_detect_anomaly(self._detect_anoma │
│ ❱ 978 │ │ │ │ self.fit_loop.run() │
│ 979 │ │ │ return None │
│ 980 │ │ raise RuntimeError(f"Unexpected state {self.state}") │
│ 981 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/fit_loop.py:2 │
│ 01 in run │
│ │
│ 198 │ │ while not self.done: │
│ 199 │ │ │ try: │
│ 200 │ │ │ │ self.on_advance_start() │
│ ❱ 201 │ │ │ │ self.advance() │
│ 202 │ │ │ │ self.on_advance_end() │
│ 203 │ │ │ │ self._restarting = False │
│ 204 │ │ │ except StopIteration: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/fit_loop.py:3 │
│ 54 in advance │
│ │
│ 351 │ │ assert self._data_fetcher is not None │
│ 352 │ │ self._data_fetcher.setup(combined_loader) │
│ 353 │ │ with self.trainer.profiler.profile("run_training_epoch"): │
│ ❱ 354 │ │ │ self.epoch_loop.run(self._data_fetcher) │
│ 355 │ │
│ 356 │ def on_advance_end(self) -> None: │
│ 357 │ │ trainer = self.trainer │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/training_epoc │
│ h_loop.py:133 in run │
│ │
│ 130 │ │ self.on_run_start(data_fetcher) │
│ 131 │ │ while not self.done: │
│ 132 │ │ │ try: │
│ ❱ 133 │ │ │ │ self.advance(data_fetcher) │
│ 134 │ │ │ │ self.on_advance_end() │
│ 135 │ │ │ │ self._restarting = False │
│ 136 │ │ │ except StopIteration: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/training_epoc │
│ h_loop.py:218 in advance │
│ │
│ 215 │ │ │ with trainer.profiler.profile("run_training_batch"): │
│ 216 │ │ │ │ if trainer.lightning_module.automatic_optimization: │
│ 217 │ │ │ │ │ # in automatic optimization, there can only be one │
│ ❱ 218 │ │ │ │ │ batch_output = self.automatic_optimization.run(tra │
│ 219 │ │ │ │ else: │
│ 220 │ │ │ │ │ batch_output = self.manual_optimization.run(kwargs │
│ 221 │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:185 in run │
│ │
│ 182 │ │ # ------------------------------ │
│ 183 │ │ # gradient update with accumulated gradients │
│ 184 │ │ else: │
│ ❱ 185 │ │ │ self._optimizer_step(kwargs.get("batch_idx", 0), closure) │
│ 186 │ │ │
│ 187 │ │ result = closure.consume_result() │
│ 188 │ │ if result.loss is None: │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/ │
│ automatic.py:261 in _optimizer_step │
│ │
│ 258 │ │ │ self.optim_progress.optimizer.step.increment_ready() │
│ 259 │ │ │
│ 260 │ │ # model hook │
│ ❱ 261 │ │ call._call_lightning_module_hook( │
│ 262 │ │ │ trainer, │
│ 263 │ │ │ "optimizer_step", │
│ 264 │ │ │ trainer.current_epoch, │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:142 │
│ in _call_lightning_module_hook │
│ │
│ 139 │ pl_module._current_fx_name = hook_name │
│ 140 │ │
│ 141 │ with trainer.profiler.profile(f"[LightningModule]{pl_module.__clas │
│ ❱ 142 │ │ output = fn(*args, **kwargs) │
│ 143 │ │
│ 144 │ # restore current_fx when nested context │
│ 145 │ pl_module._current_fx_name = prev_fx_name │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/core/module.py:1265 │
│ in optimizer_step │
│ │
│ 1262 │ │ │ │ │ for pg in optimizer.param_groups: │
│ 1263 │ │ │ │ │ │ pg["lr"] = lr_scale * self.learning_rate │
│ 1264 │ │ """ │
│ ❱ 1265 │ │ optimizer.step(closure=optimizer_closure) │
│ 1266 │ │
│ 1267 │ def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimiz │
│ 1268 │ │ """Override this method to change the default behaviour of │
│ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/core/optimizer.py:158 in step │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/ddp.py:259 in optimizer_step │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/strategy.py:224 in optimizer_step │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/plugins/precision/deepspeed.py:92 in optimizer_step │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/automatic.py:140 in __call__ │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/automatic.py:126 in closure │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/loops/optimization/automatic.py:308 in _training_step │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/trainer/call.py:288 in _call_strategy_hook │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/strategies/ddp.py:331 in training_step │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ /usr/local/lib/python3.8/dist-packages/deepspeed/utils/nvtx.py:15 in wrapped_fn │
│ /usr/local/lib/python3.8/dist-packages/deepspeed/runtime/engine.py:1769 in forward │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ /usr/local/lib/python3.8/dist-packages/lightning/pytorch/overrides/base.py:90 in forward │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer_base.py:552 in training_step │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer_base.py:371 in compute_loss │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/transformer_base.py:117 in compute_loss │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in new_forward │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/modeling_chatglm.py:953 in forward │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/modeling_chatglm.py:849 in forward │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/modeling_chatglm.py:639 in forward │
│ /usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:235 in checkpoint │
│ /usr/local/lib/python3.8/dist-packages/torch/utils/checkpoint.py:96 in forward │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in new_forward │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/modeling_chatglm.py:551 in forward │
│ /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130 in _call_impl │
│ /usr/local/lib/python3.8/dist-packages/accelerate/hooks.py:165 in new_forward │
│ /usr/local/lib/python3.8/dist-packages/deep_training/nlp/models/chatglm2/modeling_chatglm.py:201 in forward │
│ ❱ 201 │ │ return (self.weight * hidden_states).to(input_dtype) │
╰──────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!

The train_info_args in sft.config.py is as follows:

train_info_args = {
    'devices': 2,
    'data_backend': 'parquet',  # one of: record, lmdb, arrow_stream, arrow_file, parquet; lmdb can be used for very large datasets, but note that lmdb needs more storage than record
    # pretrained model path
    # ...
}
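As a quick sanity check that this config and the launch command agree, something like the following can be run (a sketch; it only assumes torch is importable and reuses the same GPU masking as CUDA_VISIBLE_DEVICES=0,1 python train.py):

import os
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0,1")  # same masking as the training command above
import torch

# With the mask in place the process sees exactly two devices, cuda:0 and cuda:1,
# which is what train_info_args['devices'] = 2 asks Lightning to use.
print(torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))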
The relevant part of main.py is as follows:

# module configuration; LoRA is enabled by default
enable_deepspeed = True
enable_ptv2 = False
enable_lora = True
load_in_bit = 0  # 4: load_in_4bit, 8: load_in_8bit, any other value: no quantization

if enable_lora:
    from config.sft_config_lora import *
elif enable_ptv2:
    from config.sft_config_ptv2 import *
else:
    from config.sft_config import *

if enable_lora:
    enable_ptv2 = False
    global_args['load_in_4bit'] = load_in_bit == 4
    global_args['load_in_8bit'] = load_in_bit == 8
elif enable_ptv2:
    enable_lora = False
    global_args['load_in_4bit'] = False
    global_args['load_in_8bit'] = False
    train_info_args.pop('lora', None)
    train_info_args.pop('adalora', None)
else:
    enable_ptv2 = False
    enable_lora = False
    global_args['load_in_4bit'] = False

# preprocessing
if 'rwkv' in train_info_args['tokenizer_name'].lower():
    train_info_args['use_fast_tokenizer'] = True

def get_deepspeed_config():
    '''
    lora / prompt (p-tuning v2) fine-tuning use deepspeed_offload.json
    regular fine-tuning uses deepspeed.json
    '''
    # whether deepspeed is enabled
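A minimal sketch of the selection logic that the docstring describes is shown below. It is only an illustration, not the repository's actual implementation: it assumes the enable_deepspeed / enable_lora / enable_ptv2 flags defined above and that deepspeed.json and deepspeed_offload.json sit next to this script.

import json
import os

def get_deepspeed_config():
    '''lora / p-tuning v2 fine-tuning use deepspeed_offload.json; regular fine-tuning uses deepspeed.json'''
    # whether deepspeed is enabled at all (flag from the module configuration above)
    if not enable_deepspeed:
        return None
    filename = 'deepspeed_offload.json' if (enable_lora or enable_ptv2) else 'deepspeed.json'
    with open(os.path.join(os.path.dirname(__file__), filename), mode='r', encoding='utf-8') as f:
        return json.load(f)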