OpenMOSS / CoLLiE

Collaborative Training of Large Language Models in an Efficient Way
https://openlmlab-collie.readthedocs.io
Apache License 2.0

Which version of megatron is required? #69

Closed · skepsun closed this issue 1 year ago

skepsun commented 1 year ago
/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/collie/models/llama/model.py:235 in __init__

  232
  233     def __init__(self, config: CollieConfig) -> None:
  234         super().__init__(config)
❱ 235         self.embed_tokens = tensor_parallel.VocabParallelEmbedding(
  236             self.collie_config.vocab_size,
  237             self.collie_config.hidden_size
  238         )

TypeError: VocabParallelEmbedding.__init__() missing 2 required keyword-only arguments: 'init_method' and 'config'
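This error typically means the installed megatron-core is newer than the release CoLLiE was written against: later releases made init_method and config required keyword-only arguments of VocabParallelEmbedding. A quick diagnostic sketch (not part of CoLLiE) to see what the installed build expects:

```python
import inspect

from megatron.core import tensor_parallel

# Print the constructor signature of the installed megatron-core build.
# If 'init_method' and 'config' show up as required keyword-only parameters,
# the installed release is newer than the one this line of CoLLiE expects.
print(inspect.signature(tensor_parallel.VocabParallelEmbedding.__init__))
```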

KaiLv69 commented 1 year ago

Hi, the version of megatron-core I am using is 0.1.0. The version installed directly via pip install megatron-core works fine.
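To make the environment reproducible, it may be safer to pin the exact release (pip install megatron-core==0.1.0) so a newer version with the changed signature is not picked up. A quick sketch to confirm what is installed:

```python
from importlib.metadata import version

# Should print 0.1.0 to match the environment reported above.
print(version("megatron-core"))
```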

skepsun commented 1 year ago

Thanks!

skepsun commented 1 year ago

Instantiating the trainer throws a GPU out-of-memory error; the cards are A800s:

Traceback (most recent call last):

/data/llm/collie/examples/alpaca/train.py:115 in <module>

  112 )
  113
  114 # 9. Instantiate the trainer
❱ 115 trainer = Trainer(
  116     model = model,
  117     config = config,
  118     loss_fn = GPTLMLoss(-100),

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/collie/controller/trainer.py:190 in __init__

  187
  188         callbacks = prepare_callback(callbacks)
  189         self.callback_manager = CallbackManager(callbacks)
❱ 190         self.setup_parallel_model()
  191         if isinstance(self.engine.module, PipelineGenerationMixin):
  192             self.engine.module.set_engine(self.engine)
  193         if isinstance(self.engine.module, PeftModel) and isinstance(self.engine.m

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/collie/controller/trainer.py:282 in setup_parallel_model

  279                 config=self.config,
  280             )
  281         else:
❱ 282             self.engine, self.optimizer, _, self.lr_scheduler = setup_ds_engine(
  283                 model=self.model,
  284                 optimizer=self.optimizer,
  285                 lr_scheduler=self.lr_scheduler,

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/collie/utils/dist_utils.py:91 in setup_ds_engine

   88             assert isinstance(model.get_base_model(), (CollieModelForCausalLM, Pi
   89         else:
   90             assert isinstance(model, (CollieModelForCausalLM, PipelineModel)), "C
❱  91     engine, optimizer, _, lr_scheduler = initialize(
   92         model=model,
   93         optimizer=optimizer,
   94         lr_scheduler=lr_scheduler,

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/collie/utils/dist_utils.py:620 in initialize

  617                                           config=config,
  618                                           config_class=config_class)
  619         else:
❱ 620             engine = DeepSpeedEngine(args=args,
  621                                      model=model,
  622                                      optimizer=optimizer,
  623                                      model_parameters=model_parameters,

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/deepspeed/runtime/engine.py:309 in __init__

   306             model_parameters = list(model_parameters)
   307
   308         if has_optimizer:
❱  309             self._configure_optimizer(optimizer, model_parameters)
   310             self._configure_lr_scheduler(lr_scheduler)
   311             self._report_progress(0)
   312         elif self.zero_optimization():

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/deepspeed/runtime/engine.py:1193 in _configure_optimizer

  1190             self._broadcast_model()
  1191             # TODO: maybe need to broadcast experts differently?
  1192         elif optimizer_wrapper == FP16:
❱ 1193             self.optimizer = self._configure_fp16_optimizer(basic_optimizer)
  1194         elif optimizer_wrapper == BFLOAT16:
  1195             self.optimizer = self._configure_bf16_optimizer(basic_optimizer)
  1196         else:

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/deepspeed/runtime/engine.py:1353 in _configure_fp16_optimizer

  1350                 )
  1351         else:
  1352             log_dist(f'Creating fp16 unfused optimizer with dynamic loss scale',
❱ 1353             optimizer = FP16_UnfusedOptimizer(
  1354                 optimizer,
  1355                 deepspeed=self,
  1356                 static_loss_scale=self.loss_scale(),

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/deepspeed/runtime/fp16/unfused_optimizer.py:113 in __init__

  110         self.overflow = False
  111         self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu, dee
  112
❱ 113         self.initialize_optimizer_states()
  114
  115     def zero_grad(self, set_to_none=False):
  116         """

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/deepspeed/runtime/fp16/unfused_optimizer.py:421 in initialize_optimizer_states

  418                                         dtype=param.dtype,
  419                                         device=get_accelerator().current_device_
  420
❱ 421         self.optimizer.step()
  422
  423         for i, group in enumerate(self.fp16_groups):
  424             for param in group:

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/torch/optim/optimizer.py:280 in wrapper

  277                             raise RuntimeError(f"{func} must return None or a tup
  278                                                f"but got {result}.")
  279
❱ 280                 out = func(*args, **kwargs)
  281                 self._optimizer_step_code()
  282
  283                 # call optimizer step post hooks

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/torch/optim/optimizer.py:33 in _use_grad

   30         prev_grad = torch.is_grad_enabled()
   31         try:
   32             torch.set_grad_enabled(self.defaults['differentiable'])
❱  33             ret = func(self, *args, **kwargs)
   34         finally:
   35             torch.set_grad_enabled(prev_grad)
   36         return ret

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/torch/optim/adam.py:141 in step

  138                 max_exp_avg_sqs,
  139                 state_steps)
  140
❱ 141             adam(
  142                 params_with_grad,
  143                 grads,
  144                 exp_avgs,

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/torch/optim/adam.py:281 in adam

  278     else:
  279         func = _single_tensor_adam
  280
❱ 281     func(params,
  282          grads,
  283          exp_avgs,
  284          exp_avg_sqs,

/data/conda/usr/llm/envs/llama_etuning/lib/python3.10/site-packages/torch/optim/adam.py:505 in _multi_tensor_adam

  502                 torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction2_sqrt)
  503                 denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps)
  504             else:
❱ 505                 exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
  506                 torch._foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)
  507                 denom = torch._foreach_add(exp_avg_sq_sqrt, eps)
  508

OutOfMemoryError: CUDA out of memory. Tried to allocate 500.00 MiB (GPU 1; 79.35 GiB total capacity; 77.85 GiB already allocated; 94.12 MiB free; 78.05 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
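As the error text itself suggests, when reserved memory far exceeds allocated memory the allocator may be fragmenting, and max_split_size_mb can sometimes help. A hedged sketch of setting it before CUDA is initialized (the 128 MiB value is an arbitrary illustration; this mitigates fragmentation only, not a genuinely oversized workload):

```python
import os

# Must be set before the first CUDA allocation in the process;
# 128 MiB is an illustrative value, not a recommendation from this thread.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch  # imported after setting the allocator config so it takes effect
```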
KaiLv69 commented 1 year ago

Hi, for a 7B model you can set tp_size to 4.
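A minimal sketch of what that might look like, assuming the tp_size / dp_size / pp_size fields on CollieConfig as used in the CoLLiE examples (the checkpoint path is a placeholder):

```python
from collie import CollieConfig

# Placeholder checkpoint path; substitute the LLaMA-7B weights you are using.
config = CollieConfig.from_pretrained("path/to/llama-7b")
config.tp_size = 4  # shard each weight matrix across 4 GPUs (tensor parallelism)
config.dp_size = 1  # no data parallelism in this sketch
config.pp_size = 1  # no pipeline parallelism in this sketch
```

With tp_size=4, each layer's weights are sharded across four GPUs, so the per-GPU footprint of the 7B model and its optimizer states drops roughly fourfold, which is what avoids the OOM above.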