Open · janglichao opened this issue 1 year ago
Hi @janglichao, can you please provide more information about your setup?

ds_report output
Please run ds_report to give us details about your setup.

System info (please complete the following information):
- OS: [e.g. Ubuntu 18.04]
- GPU count and types [e.g. two machines with x8 A100s each] (if applicable)
- Hugging Face Transformers/Accelerate/etc. versions
- Python version
- Any other relevant info about your setup
ds_report:
DeepSpeed general environment info:
torch install path ............... ['/home/kidd/anaconda3/envs/deepspeed/lib/python3.8/site-packages/torch']
torch version .................... 2.0.1+cu117
deepspeed install path ........... ['/home/kidd/anaconda3/envs/deepspeed/lib/python3.8/site-packages/deepspeed']
deepspeed info ................... 0.9.3+4d269c6e, 4d269c6e, master
torch cuda version ............... 11.7
torch hip version ................ None
nvcc version ..................... 12.1
deepspeed wheel compiled w. ...... torch 2.0, cuda 11.7
OS: Ubuntu 22
GPUs: 2x RTX 3090 (24 GB)
Python: 3.8
@janglichao I ran into the same problem. Have you solved it? If so, how?
run step3 with:

deepspeed --master_port 12346 DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py \
   --data_path wangrui6/Zhihu-KOL \
   --data_split 2,4,4 \
   --actor_model_name_or_path /home/kidd/projects/llms/pretrain_models/ChatGLM-6B/ \
   --critic_model_name_or_path /home/kidd/projects/llms/path_to_rm_checkpoint/ \
   --num_padding_at_beginning 1 \
   --per_device_train_batch_size 4 \
   --per_device_mini_train_batch_size 4 \
   --generation_batch_numbers 1 \
   --ppo_epochs 1 \
   --max_answer_seq_len 256 \
   --max_prompt_seq_len 256 \
   --actor_learning_rate 9.65e-6 \
   --critic_learning_rate 5e-6 \
   --actor_weight_decay 0.1 \
   --critic_weight_decay 0.1 \
   --num_train_epochs 1 \
   --lr_scheduler_type cosine \
   --gradient_accumulation_steps 1 \
   --num_warmup_steps 100 \
   --deepspeed --seed 1234 \
   --enable_hybrid_engine \
   --actor_gradient_checkpointing \
   --critic_gradient_checkpointing \
   --actor_zero_stage 2 \
   --critic_zero_stage 2 \
   --output_dir /home/kidd/projects/llms/ChatGLM-Efficient-Tuning/examples/ppo_model/ \
   &> /home/kidd/projects/llms/ChatGLM-Efficient-Tuning/examples/ppo_model/training.log

then got errors:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /DeepSpeedExamples/applications/DeepSpeed-Chat/                                                   │
│ training/step3_rlhf_finetuning/main.py:521 in <module>                                            │
│ │
│ 518 │
│ 519 │
│ 520 if __name__ == "__main__": │
│ ❱ 521 │ main() │
│ 522 │
│ │
│ /DeepSpeedExamples/applications/DeepSpeed-Chat/ │
│ training/step3_rlhf_finetuning/main.py:386 in main │
│ │
│ 383 │ │ │ │ │ │ │ │ fast_tokenizer=True) │
│ 384 │ tokenizer.pad_token = tokenizer.eos_token │
│ 385 │ │
│ ❱ 386 │ prompt_train_dataloader, unsupervised_train_dataloader, num_total_iters = create_dat │
│ 387 │ │ args=args, tokenizer=tokenizer, train_phase=3) │
│ 388 │ │
│ 389 │ # RLHF engine is responsible for creating models, loading checkpoints, ds-initialize │
│ │
│ /DeepSpeedExamples/applications/DeepSpeed-Chat/ │
│ training/step3_rlhf_finetuning/main.py:310 in create_datasets │
│ │
│ 307 │
│ 308 def create_datasets(args, tokenizer, train_phase=3): │
│ 309 │ unsupervised_training_enabled = args.unsupervised_dataset_name and args.unsupervised │
│ ❱ 310 │ prompt_train_dataset, _ = create_prompt_dataset( │
│ 311 │ │ args.local_rank, args.data_path, args.data_split, │
│ 312 │ │ args.data_output_path, train_phase, args.seed, tokenizer, │
│ 313 │ │ args.max_prompt_seq_len) │
│ │
│ /DeepSpeedExamples/applications/DeepSpeed-Chat/ │
│ training/utils/data/data_utils.py:273 in create_prompt_dataset │
│ │
│ 270 │ │
│ 271 │ if local_rank <= 0 and buf_create_cache.item() != 0: │
│ 272 │ │ if len(data_path) == 1: # Single dataset. │
│ ❱ 273 │ │ │ train_dataset, eval_dataset = create_dataset( │
│ 274 │ │ │ │ local_rank, data_path[0], data_split, output_path, train_phase, │
│ 275 │ │ │ │ seed, tokenizer, end_of_conversation_token, max_seq_len) │
│ 276 │ │ else: # Blending datasets. │
│ │
│ /DeepSpeedExamples/applications/DeepSpeed-Chat/ │
│ training/utils/data/data_utils.py:225 in create_dataset │
│ │
│ 222 │ │ │ │ │ │ │ │ │ │ │ train_phase - 1, │
│ 223 │ │ │ │ │ │ │ │ │ │ │ len(train_dataset)) │
│ 224 │ train_dataset = Subset(train_dataset, train_index) │
│ ❱ 225 │ train_dataset = create_dataset_split(train_dataset, raw_dataset, │
│ 226 │ │ │ │ │ │ │ │ │ │ train_phase, tokenizer, │
│ 227 │ │ │ │ │ │ │ │ │ │ end_of_conversation_token, │
│ 228 │ │ │ │ │ │ │ │ │ │ max_seq_len) │
│ │
│ /DeepSpeedExamples/applications/DeepSpeed-Chat/ │
│ training/utils/data/data_utils.py:199 in create_dataset_split │
│ │
│ 196 │ │ │ if prompt is not None: │
│ 197 │ │ │ │ prompt_token = tokenizer(prompt, return_tensors="pt") │
│ 198 │ │ │ │ prompt_token["input_ids"] = prompt_token["input_ids"] │
│ ❱ 199 │ │ │ │ prompt_token["attention_mask"] = prompt_token["attention_mask"] │
│ 200 │ │ │ │ for key_word in ["input_ids", "attention_mask"]: │
│ 201 │ │ │ │ │ length = prompt_token[key_word].size()[-1] │
│ 202 │ │ │ │ │ if length > max_seq_len: │
│ │
│ /home/kidd/anaconda3/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:238 in │
│ __getitem__                                                                                       │
│                                                                                                   │
│   235 │   │   If the key is an integer, get the `tokenizers.Encoding` for batch item with inde    │
│   236 │   │   """                                                                                 │
│   237 │   │   if isinstance(item, str):                                                           │
│ ❱ 238 │   │   │   return self.data[item]                                                          │
│   239 │   │   elif self._encodings is not None:                                                   │
│   240 │   │   │   return self._encodings[item]                                                    │
│   241 │   │   else:                                                                               │
╰───────────────────────────────────────────────────────────────────────────────────────────────────╯
KeyError: 'attention_mask'

[2023-05-01 18:32:09,958] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 1241775
[2023-05-01 18:32:09,958] [INFO] [launch.py:428:sigkill_handler] Killing subprocess 1241776
[2023-05-01 18:32:10,014] [ERROR] [launch.py:434:sigkill_handler] ['/home/kidd/anaconda3/bin/python', '-u', 'DeepSpeedExamples/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/main.py', '--local_rank=1', '--data_path', 'wangrui6/Zhihu-KOL', '--data_split', '2,4,4', '--actor_model_name_or_path', '/home/kidd/projects/llms/chatGLM-6B/ChatGLM-6B/pretrain_models/ChatGLM-6B/', '--critic_model_name_or_path', '/examples/path_to_rm_checkpoint/', '--num_padding_at_beginning', '1', '--per_device_train_batch_size', '4', '--per_device_mini_train_batch_size', '4', '--generation_batch_numbers', '1', '--ppo_epochs', '1', '--max_answer_seq_len', '256', '--max_prompt_seq_len', '256', '--actor_learning_rate', '9.65e-6', '--critic_learning_rate', '5e-6', '--actor_weight_decay', '0.1', '--critic_weight_decay', '0.1', '--num_train_epochs', '1', '--lr_scheduler_type', 'cosine', '--gradient_accumulation_steps', '1', '--num_warmup_steps', '100', '--deepspeed', '--seed', '1234', '--enable_hybrid_engine', '--actor_gradient_checkpointing', '--critic_gradient_checkpointing', '--actor_zero_stage', '2', '--critic_zero_stage', '2', '--output_dir', '/examples/ppo_model/'] exits with return code = 1
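For anyone hitting the same KeyError: the traceback shows that create_dataset_split (data_utils.py, around line 199) assumes the tokenizer output contains an attention_mask, but the custom ChatGLM-6B tokenizer may return only input_ids. Below is a minimal sketch to check this and a possible local workaround; the model path is the one from the command above, and falling back to an all-ones mask is an assumption on my part, not an official DeepSpeed-Chat fix.

```python
import torch
from transformers import AutoTokenizer

# Local ChatGLM-6B checkpoint path taken from the command above; replace with your own.
MODEL_PATH = "/home/kidd/projects/llms/pretrain_models/ChatGLM-6B/"

# ChatGLM-6B ships a custom tokenizer implementation, so trust_remote_code is required.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

prompt_token = tokenizer("Human: 你好 Assistant:", return_tensors="pt")
print(prompt_token.keys())  # if 'attention_mask' is missing here, step 3 fails exactly as above

# Possible workaround inside create_dataset_split: when the tokenizer returns no
# attention_mask, fall back to an all-ones mask (assumes no padding has been applied yet).
if "attention_mask" not in prompt_token:
    prompt_token["attention_mask"] = torch.ones_like(prompt_token["input_ids"])
```

If the print shows only input_ids, patching the dataset code (or the tokenizer) along these lines should at least get past the KeyError, though I have not verified how the hybrid engine behaves with ChatGLM downstream.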