DachengLi1 / LongChat

Official repository for LongChat and LongEval
Apache License 2.0

OutOfMemoryError: CUDA out of memory. #9

Open brewswang opened 1 year ago

brewswang commented 1 year ago

I have 9 V100 16G GPUs, but training fails with CUDA out of memory. The specific errors are as follows:

```
Formatting inputs...Skip in lazy mode
/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/_init_utils.py:295: UserWarning: FSDP is switching to use NO_SHARD instead of ShardingStrategy.FULL_SHARD since the world size is 1.
  warnings.warn(
Traceback (most recent call last):
  File "/nvme/soft/brewswang/chatgpt/LongChat/longchat/train/fine_tune/train_condense_16K.py", line 15, in <module>
    train()
  File "/nvme/soft/brewswang/chatgpt/LongChat/longchat/train/fine_tune/train.py", line 262, in train
    trainer.train()
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/transformers/trainer.py", line 1662, in train
    return inner_training_loop(
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/transformers/trainer.py", line 1749, in _inner_training_loop
    model = self._wrap_model(self.model_wrapped)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/transformers/trainer.py", line 1489, in _wrap_model
    self.model = model = FSDP(
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 391, in __init__
    _auto_wrap(auto_wrap_kwargs, fsdp_kwargs, FullyShardedDataParallel)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/_wrap_utils.py", line 73, in _auto_wrap
    _recursive_wrap(auto_wrap_kwargs, fsdp_kwargs)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 370, in _recursive_wrap
    wrapped_child, num_wrapped_params = _recursive_wrap(
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 370, in _recursive_wrap
    wrapped_child, num_wrapped_params = _recursive_wrap(
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 370, in _recursive_wrap
    wrapped_child, num_wrapped_params = _recursive_wrap(
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 388, in _recursive_wrap
    return _wrap(module, wrapper_cls, kwargs), nonwrapped_numel
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/wrap.py", line 317, in _wrap
    return wrapper_cls(module, **kwargs)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 408, in __init__
    _init_param_handle_from_module(
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/_init_utils.py", line 429, in _init_param_handle_from_module
    _init_param_handle_from_params(state, managed_params, fully_sharded_module)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/_init_utils.py", line 525, in _init_param_handle_from_params
    handle = FlatParamHandle(
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/flat_param.py", line 366, in __init__
    self._init_flat_param(params, fully_sharded_module, use_orig_params)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/flat_param.py", line 462, in _init_flat_param
    self.flat_param = FlatParamHandle.flatten_params(
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/fsdp/flat_param.py", line 505, in flatten_params
    flat_param_data = torch.cat(flat_params, dim=0)
OutOfMemoryError: CUDA out of memory. Tried to allocate 774.00 MiB (GPU 0; 15.78 GiB total capacity; 14.62 GiB already allocated; 369.69 MiB free; 14.72 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 23015) of binary: /home/chat_glm6b/anaconda3/envs/longeval/bin/python
Traceback (most recent call last):
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 798, in <module>
    main()
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
    run(args)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

longchat/train/fine_tune/train_condense_16K.py FAILED

Failures:

------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-07-02_15:05:45
  host      : localhost.localdomain
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 23015)
  error_file:
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
```
DachengLi1 commented 1 year ago

@brewswang Thanks for trying out the training code! In this release, the code has only been tested on 8xA100 for a 7B model, because the very long sequence length causes high memory consumption. To run on V100 16GB, first change the monkey patch here from flash attention to xformers.

There are several things to try:

(1) Use FSDP CPU offloading. (2) Try a lower sequence length (remember to also change the condensing ratio to match, e.g. 2 for 4K, 4 for 8K). A rough sketch is below.
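For (2), here is a minimal sketch of what a lower-sequence-length launcher script could look like on 16 GB GPUs, reusing the monkey-patch helpers from `longchat.train.monkey_patch`; the exact flag values mentioned in the comments are assumptions to adapt to your setup, not an official recipe:

```python
# Sketch only (not an official script): a 4K-context variant of train_condense_16K.py
# intended for 16 GB GPUs.

# Condense positional embeddings with a lower ratio: 2 -> 4K context (vs. 8 -> 16K).
from longchat.train.monkey_patch.llama_condense_monkey_patch import replace_llama_with_condense
replace_llama_with_condense(ratio=2)

# Swap flash attention for the xformers attention patch, which is the path V100s need.
from longchat.train.monkey_patch.llama_xformer_monkey_patch import replace_llama_attn_with_xformer
replace_llama_attn_with_xformer()

# The patches must be applied before transformers is imported by the training code.
from longchat.train.fine_tune.train import train

if __name__ == "__main__":
    # For (1), CPU offloading is enabled from the launch command, e.g. by passing
    # --fsdp "full_shard offload" together with a matching, shorter --model_max_length.
    train()
```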

Let me know if it works for you!

brewswang commented 1 year ago

My `train_condense_16K.py` file content:

```python
# Make it more memory efficient by monkey patching the LLaMA model with FlashAttn.

# Need to call this before importing transformers.
from longchat.train.monkey_patch.llama_condense_monkey_patch import replace_llama_with_condense

replace_llama_with_condense(ratio=8)

from longchat.train.monkey_patch.llama_xformer_monkey_patch import replace_llama_attn_with_xformer

replace_llama_attn_with_xformer()

from longchat.train.fine_tune.train import train

if __name__ == "__main__":
    train()
```

My train command:

```
CUDA_VISIBLE_DEVICES=0,2,3,4,5,6,7,8,9 python -m torch.distributed.run --nproc_per_node=1 \
    longchat/train/fine_tune/train_condense_16K.py \
    --model_name_or_path model/open_llama_7b/ \
    --data_path data/dummy_conversation.json \
    --bf16 False \
    --output_dir outputs/models \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --evaluation_strategy no \
    --save_strategy steps \
    --save_steps 1000 \
    --save_total_limit 1 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --fsdp "no_shard offload" \
    --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
    --tf32 False \
    --model_max_length 100 \
    --gradient_checkpointing True \
    --lazy_preprocess True
```

Output:

```
OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB (GPU 0; 15.78 GiB total capacity; 14.94 GiB already allocated; 41.69 MiB free; 15.02 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 23915) of binary: /home/chat_glm6b/anaconda3/envs/longeval/bin/python
Traceback (most recent call last):
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 798, in <module>
    main()
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
    run(args)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

longchat/train/fine_tune/train_condense_16K.py FAILED
```

DachengLi1 commented 1 year ago

Please change nproc_per_node to the number of GPUs you have; I would also suggest using 8 GPUs instead of 9.
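A quick sanity check for that value (a throwaway sketch, not part of the repo):

```python
# Print the number of CUDA devices visible to this process; PyTorch honors
# CUDA_VISIBLE_DEVICES, so set it first (e.g. to 8 of the 9 GPUs).
import torch

if __name__ == "__main__":
    print(f"visible GPUs: {torch.cuda.device_count()}  # pass this to --nproc_per_node")
```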

brewswang commented 1 year ago

I got errors:

```
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24901 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24902 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24903 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24904 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24906 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24907 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24908 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 24909 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 4 (pid: 24905) of binary: /home/chat_glm6b/anaconda3/envs/longeval/bin/python
Traceback (most recent call last):
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 798, in <module>
    main()
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
    run(args)
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/chat_glm6b/anaconda3/envs/longeval/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

longchat/train/fine_tune/train_condense_16K.py FAILED

Failures:

------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-07-02_18:08:34
  host      : localhost.localdomain
  rank      : 4 (local_rank: 4)
  exitcode  : -9 (pid: 24905)
  error_file:
  traceback : Signal 9 (SIGKILL) received by PID 24905
======================================================
```
ChaoyuHuang commented 1 year ago

> I got errors:
>
> ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 4 (pid: 24905) of binary: /home/chat_glm6b/anaconda3/envs/longeval/bin/python
> ...
> longchat/train/fine_tune/train_condense_16K.py FAILED
>
> Root Cause (first observed failure): [0]: time : 2023-07-02_18:08:34 host : localhost.localdomain rank : 4 (local_rank: 4) exitcode : -9 (pid: 24905) traceback : Signal 9 (SIGKILL) received by PID 24905

I also met this problem. I guess it was caused by host RAM rather than GPU memory: exit code -9 means the worker was killed with SIGKILL, which usually points to the OS out-of-memory killer.
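If that guess is right, it helps to check how much host RAM is free before launching; here is a minimal sketch using psutil (an extra dependency, not part of this repo):

```python
# Minimal host-RAM check before launching training (sketch only).
# An exitcode of -9 / SIGKILL usually means the OS OOM killer terminated a worker
# because host memory ran out, which is easy to hit when FSDP CPU offloading keeps
# a copy of the parameters in host RAM for every rank.
import psutil


def print_host_memory() -> None:
    vm = psutil.virtual_memory()
    print(f"host RAM: {vm.available / 2**30:.1f} GiB free of {vm.total / 2**30:.1f} GiB")


if __name__ == "__main__":
    print_host_memory()
```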