(gh_LoRA) ub2004@ub2004-B85M-A0:~/llm_dev/LoRA/examples/NLG$ python3 -m torch.distributed.launch --nproc_per_node=1 src/gpt2_ft.py --train_data ./data/e2e/train.jsonl --valid_data ./data/e2e/valid.jsonl --train_batch_size 8 --grad_acc 1 --valid_batch_size 4 --seq_len 512 --model_card gpt2.md --init_checkpoint ./pretrained_checkpoints/gpt2-medium-pytorch_model.bin --platform local --clip 0.0 --lr 0.0002 --weight_decay 0.01 --correct_bias --adam_beta2 0.999 --scheduler linear --warmup_step 500 --max_epoch 5 --save_interval 1000 --lora_dim 4 --lora_alpha 32 --lora_dropout 0.1 --label_smooth 0.1 --work_dir ./trained_models/GPT2_M/e2e --random_seed 110
/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/launch.py:181: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use-env is set by default in torchrun.
If your script expects --local-rank argument to be set, please
change it to read from os.environ['LOCAL_RANK'] instead. See
https://pytorch.org/docs/stable/distributed.html#launch-utility for
further instructions
warnings.warn(
usage: gpt2_ft.py [-h] [--platform PLATFORM] [--local_rank LOCAL_RANK] [--rank RANK] [--device DEVICE] [--world_size WORLD_SIZE] [--random_seed RANDOM_SEED] [--lr LR] [--weight_decay WEIGHT_DECAY]
[--correct_bias] [--adam_epislon ADAM_EPISLON] [--no_decay_bias] [--adam_beta1 ADAM_BETA1] [--adam_beta2 ADAM_BETA2] [--scheduler {cosine,inv_sqrt,dev_perf,constant,linear,cycle}]
[--max_step MAX_STEP] [--max_epoch MAX_EPOCH] [--warmup_step WARMUP_STEP] [--i_steps I_STEPS] [--i_lrs I_LRS] --train_data TRAIN_DATA --valid_data VALID_DATA
[--train_batch_size TRAIN_BATCH_SIZE] [--valid_batch_size VALID_BATCH_SIZE] [--grad_acc GRAD_ACC] [--clip CLIP] [--seq_len SEQ_LEN] [--model_card {gpt2.sm,gpt2.md,gpt2.lg}]
[--init_checkpoint INIT_CHECKPOINT] [--fp16] [--log_interval LOG_INTERVAL] [--eval_interval EVAL_INTERVAL] [--save_interval SAVE_INTERVAL] [--work_dir WORK_DIR] [--lora_dim LORA_DIM]
[--lora_alpha LORA_ALPHA] [--obj {jlm,clm}] [--lora_dropout LORA_DROPOUT] [--label_smooth LABEL_SMOOTH] [--roll_interval ROLL_INTERVAL] [--roll_lr ROLL_LR] [--roll_step ROLL_STEP]
[--eval_epoch EVAL_EPOCH]
gpt2_ft.py: error: unrecognized arguments: --local-rank=0
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 50826) of binary: /usr/bin/python3
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/launch.py", line 196, in <module>
main()
File "/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/launch.py", line 192, in main
launch(args)
File "/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/launch.py", line 177, in launch
run(args)
File "/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
(gh_LoRA) ub2004@ub2004-B85M-A0:~/llm_dev/LoRA/examples/NLG$ python3 -m torch.distributed.launch --nproc_per_node=1 src/gpt2_ft.py --train_data ./data/e2e/train.jsonl --valid_data ./data/e2e/valid.jsonl --train_batch_size 8 --grad_acc 1 --valid_batch_size 4 --seq_len 512 --model_card gpt2.md --init_checkpoint ./pretrained_checkpoints/gpt2-medium-pytorch_model.bin --platform local --clip 0.0 --lr 0.0002 --weight_decay 0.01 --correct_bias --adam_beta2 0.999 --scheduler linear --warmup_step 500 --max_epoch 5 --save_interval 1000 --lora_dim 4 --lora_alpha 32 --lora_dropout 0.1 --label_smooth 0.1 --work_dir ./trained_models/GPT2_M/e2e --random_seed 110 /home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/launch.py:181: FutureWarning: The module torch.distributed.launch is deprecated and will be removed in future. Use torchrun. Note that --use-env is set by default in torchrun. If your script expects
--local-rank
argument to be set, please change it to read from os.environ['LOCAL_RANK']
instead. See https://pytorch.org/docs/stable/distributed.html#launch-utility for further instructions warnings.warn( usage: gpt2_ft.py [-h] [--platform PLATFORM] [--local_rank LOCAL_RANK] [--rank RANK] [--device DEVICE] [--world_size WORLD_SIZE] [--random_seed RANDOM_SEED] [--lr LR] [--weight_decay WEIGHT_DECAY] [--correct_bias] [--adam_epislon ADAM_EPISLON] [--no_decay_bias] [--adam_beta1 ADAM_BETA1] [--adam_beta2 ADAM_BETA2] [--scheduler {cosine,inv_sqrt,dev_perf,constant,linear,cycle}] [--max_step MAX_STEP] [--max_epoch MAX_EPOCH] [--warmup_step WARMUP_STEP] [--i_steps I_STEPS] [--i_lrs I_LRS] --train_data TRAIN_DATA --valid_data VALID_DATA [--train_batch_size TRAIN_BATCH_SIZE] [--valid_batch_size VALID_BATCH_SIZE] [--grad_acc GRAD_ACC] [--clip CLIP] [--seq_len SEQ_LEN] [--model_card {gpt2.sm,gpt2.md,gpt2.lg}] [--init_checkpoint INIT_CHECKPOINT] [--fp16] [--log_interval LOG_INTERVAL] [--eval_interval EVAL_INTERVAL] [--save_interval SAVE_INTERVAL] [--work_dir WORK_DIR] [--lora_dim LORA_DIM] [--lora_alpha LORA_ALPHA] [--obj {jlm,clm}] [--lora_dropout LORA_DROPOUT] [--label_smooth LABEL_SMOOTH] [--roll_interval ROLL_INTERVAL] [--roll_lr ROLL_LR] [--roll_step ROLL_STEP] [--eval_epoch EVAL_EPOCH] gpt2_ft.py: error: unrecognized arguments: --local-rank=0 ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 2) local_rank: 0 (pid: 50826) of binary: /usr/bin/python3 Traceback (most recent call last): File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/lib/python3.8/runpy.py", line 87, in _run_code exec(code, run_globals) File "/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/launch.py", line 196, in <module>
main()
File "/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/launch.py", line 192, in main
launch(args)
File "/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/launch.py", line 177, in launch
run(args)
File "/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/ub2004/.local/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
src/gpt2_ft.py FAILED
Failures: