Open listwebit opened 1 year ago
We are using the BLOOM-based BELLE-7B-2M model in the Docker environment, launched with bash training_scripts/single_node/run_LoRA.sh output-lora 2; we also tried changing the 2 to 3 and it still would not run. With the previous version of the finetune code, LoRA training works fine. Why is that? Is the current LoRA path still incomplete?
The following error appears:

[2023-04-25 10:52:32,890] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory: used = 47.61 GB, percent = 18.9%
Traceback (most recent call last):
  File "main.py", line 402, in <module>
    main()
  File "main.py", line 343, in main
    model, optimizer, _, lr_scheduler = deepspeed.initialize(
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/__init__.py", line 156, in initialize
    engine = DeepSpeedEngine(args=args,
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 328, in __init__
    self._configure_optimizer(optimizer, model_parameters)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1187, in _configure_optimizer
    self.optimizer = self._configure_zero_optimizer(basic_optimizer)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1465, in _configure_zero_optimizer
    optimizer = DeepSpeedZeroOptimizer_Stage3(
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 304, in __init__
    self._setup_for_real_optimizer()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 361, in _setup_for_real_optimizer
    self.initialize_optimizer_states()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 865, in initialize_optimizer_states
    self._optimizer_step(i)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 796, in _optimizer_step
    self.optimizer.step()
  File "/opt/conda/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 69, in wrapper
    return wrapped(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/optim/optimizer.py", line 280, in wrapper
    out = func(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/ops/adam/fused_adam.py", line 129, in step
    state['exp_avg_sq'] = torch.zeros_like(p.data)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.75 GiB (GPU 1; 31.75 GiB total capacity; 28.88 GiB already allocated; 497.50 MiB free; 30.81 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Traceback (most recent call last):
  File "main.py", line 402, in <module>
    main()
  File "main.py", line 343, in main
    model, optimizer, _, lr_scheduler = deepspeed.initialize(
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/__init__.py", line 156, in initialize
    engine = DeepSpeedEngine(args=args,
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 328, in __init__
    self._configure_optimizer(optimizer, model_parameters)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1187, in _configure_optimizer
    self.optimizer = self._configure_zero_optimizer(basic_optimizer)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1465, in _configure_zero_optimizer
    optimizer = DeepSpeedZeroOptimizer_Stage3(
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 304, in __init__
    self._setup_for_real_optimizer()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 361, in _setup_for_real_optimizer
    self.initialize_optimizer_states()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 865, in initialize_optimizer_states
    self._optimizer_step(i)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 796, in _optimizer_step
    self.optimizer.step()
  File "/opt/conda/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 69, in wrapper
    return wrapped(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/optim/optimizer.py", line 280, in wrapper
    out = func(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/ops/adam/fused_adam.py", line 127, in step
    state['exp_avg'] = torch.zeros_like(p.data)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.75 GiB (GPU 2; 31.75 GiB total capacity; 25.13 GiB already allocated; 3.21 GiB free; 27.06 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
[2023-04-25 10:52:33,009] [INFO] [utils.py:785:see_memory_usage] Before initializing optimizer states
[2023-04-25 10:52:33,010] [INFO] [utils.py:786:see_memory_usage] MA 13.81 GB Max_MA 13.81 GB CA 15.72 GB Max_CA 16 GB
[2023-04-25 10:52:33,010] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory: used = 47.62 GB, percent = 18.9%
Traceback (most recent call last):
  File "main.py", line 402, in <module>
    main()
  File "main.py", line 343, in main
    model, optimizer, _, lr_scheduler = deepspeed.initialize(
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/__init__.py", line 156, in initialize
    engine = DeepSpeedEngine(args=args,
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 328, in __init__
    self._configure_optimizer(optimizer, model_parameters)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1187, in _configure_optimizer
    self.optimizer = self._configure_zero_optimizer(basic_optimizer)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1465, in _configure_zero_optimizer
    optimizer = DeepSpeedZeroOptimizer_Stage3(
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 304, in __init__
    self._setup_for_real_optimizer()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 361, in _setup_for_real_optimizer
    self.initialize_optimizer_states()
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 865, in initialize_optimizer_states
    self._optimizer_step(i)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/runtime/zero/stage3.py", line 796, in _optimizer_step
    self.optimizer.step()
  File "/opt/conda/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 69, in wrapper
    return wrapped(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/torch/optim/optimizer.py", line 280, in wrapper
    out = func(*args, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/deepspeed/ops/adam/fused_adam.py", line 129, in step
    state['exp_avg_sq'] = torch.zeros_like(p.data)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 3.75 GiB (GPU 0; 31.75 GiB total capacity; 28.88 GiB already allocated; 521.50 MiB free; 30.81 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
[2023-04-25 10:52:34,179] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 2404
[2023-04-25 10:52:34,313] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 2405
[2023-04-25 10:52:34,314] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 2406
[2023-04-25 10:52:34,315] [ERROR] [launch.py:434:sigkill_handler] ['/opt/conda/bin/python3', '-u', 'main.py', '--local_rank=2', '--sft_only_data_path', '/home/centos/liulei/belle/BELLE/data/dev1K.json', '--data_split', '10,0,0', '--model_name_or_path', '/home/centos/liulei/belle/BELLE/models/BELLE-7B-2M', '--per_device_train_batch_size', '1', '--per_device_eval_batch_size', '1', '--max_seq_len', '64', '--learning_rate', '3e-4', '--weight_decay', '0.', '--num_train_epochs', '5', '--gradient_accumulation_steps', '1', '--lr_scheduler_type', 'cosine', '--num_warmup_steps', '100', '--seed', '1234', '--gradient_checkpointing', '--zero_stage', '3', '--lora_dim', '16', '--lora_alpha', '16', '--lora_droppout', '0.05', '--lora_module_name', 'q_proj,k_proj,v_proj,o_proj,down_proj,gate_proj,up_proj', '--deepspeed', '--output_dir', './output-lora'] exits with return code = 1
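(Editor's aside, not part of the original report: the OOM message itself suggests capping the allocator's split size to reduce fragmentation. A minimal sketch of applying that hint follows; the 128 MiB value is an arbitrary example, and this only mitigates fragmentation rather than an outright shortfall of several GiB.)

```python
# Illustrative only: apply the allocator hint from the OOM message.
# PYTORCH_CUDA_ALLOC_CONF must be set before the first CUDA allocation, so the
# safest place is the very top of main.py (or exporting it in run_LoRA.sh).
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:128")  # 128 is an example value

import torch  # imported after the env var so the caching allocator picks it up
```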
The previous code loaded the model in 8-bit, so it used less GPU memory.
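(For reference, a rough sketch of what that 8-bit loading path looks like with transformers + bitsandbytes; this is illustrative rather than the repo's actual code, and only the model path is taken from the log above.)

```python
# Illustrative sketch of 8-bit model loading (not the repo's actual code).
# Requires bitsandbytes and accelerate alongside transformers.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "/home/centos/liulei/belle/BELLE/models/BELLE-7B-2M"  # path from the log above
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=True,   # quantize weights to int8 at load time
    device_map="auto",   # let accelerate place layers on the available GPUs
)
```

In that setup the int8 base weights typically stay frozen and only the LoRA adapters are trained, which is why it fits in far less memory than full-precision fine-tuning.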
Yes, I ran into this problem too, so I later switched to Stanford's fine-tuning code; a single Tesla A100 can fine-tune the 65B LLaMA with tloen/alpaca-lora.
Is it fp32 now?
Have you enabled CPU offload?
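(For context, "CPU offload" refers to the ZeRO offload settings in the DeepSpeed config that main.py passes to deepspeed.initialize(). A minimal sketch of enabling it is below; the values are assumptions, not the repo's actual config.)

```python
# Minimal sketch of a ZeRO-3 DeepSpeed config with CPU offload (assumed values,
# not the repo's actual config). Offloading optimizer states and parameters to
# host memory trades GPU memory for PCIe traffic.
ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "gradient_accumulation_steps": 1,
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
        "offload_param": {"device": "cpu", "pin_memory": True},
    },
}
```

With the optimizer offloaded, DeepSpeed uses its CPU Adam implementation, so the exp_avg/exp_avg_sq buffers that run out of GPU memory in the trace above would be allocated in host memory instead.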
Is alpaca-lora good to use?