I encountered an error related to the `NODE_RANK` environment variable. The log is as follows:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
Traceback (most recent call last):
Traceback (most recent call last):
File "metro/tools/run_metro_bodymesh.py", line 714, in <module>
File "metro/tools/run_metro_bodymesh.py", line 714, in <module>
Traceback (most recent call last):
File "metro/tools/run_metro_bodymesh.py", line 714, in <module>
main(args)main(args)
File "metro/tools/run_metro_bodymesh.py", line 587, in main
File "metro/tools/run_metro_bodymesh.py", line 587, in main
main(args)
File "metro/tools/run_metro_bodymesh.py", line 587, in main
print("Init distributed training on local rank {} ({}), rank {}, world size {}".format(args.local_rank, int(os.environ["LOCAL_RANK"]), int(os.environ["NODE_RANK"]), args.num_gpus))print("Init distributed training on local rank {} ({}), rank {}, world size {}".format(args.local_rank, int(os.environ["LOCAL_RANK"]), int(os.environ["NODE_RANK"]), args.num_gpus))
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/os.py", line 681, in __getitem__
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/os.py", line 681, in __getitem__
print("Init distributed training on local rank {} ({}), rank {}, world size {}".format(args.local_rank, int(os.environ["LOCAL_RANK"]), int(os.environ["NODE_RANK"]), args.num_gpus))
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/os.py", line 681, in __getitem__
raise KeyError(key) from Noneraise KeyError(key) from None
KeyErrorKeyError: 'NODE_RANK':
'NODE_RANK'
raise KeyError(key) from None
KeyError: 'NODE_RANK'
Traceback (most recent call last):
File "metro/tools/run_metro_bodymesh.py", line 714, in <module>
main(args)
File "metro/tools/run_metro_bodymesh.py", line 587, in main
print("Init distributed training on local rank {} ({}), rank {}, world size {}".format(args.local_rank, int(os.environ["LOCAL_RANK"]), int(os.environ["NODE_RANK"]), args.num_gpus))
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/os.py", line 681, in __getitem__
raise KeyError(key) from None
KeyError: 'NODE_RANK'
Traceback (most recent call last):
File "metro/tools/run_metro_bodymesh.py", line 714, in <module>
Traceback (most recent call last):
File "metro/tools/run_metro_bodymesh.py", line 714, in <module>
main(args)
File "metro/tools/run_metro_bodymesh.py", line 587, in main
print("Init distributed training on local rank {} ({}), rank {}, world size {}".format(args.local_rank, int(os.environ["LOCAL_RANK"]), int(os.environ["NODE_RANK"]), args.num_gpus))
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/os.py", line 681, in __getitem__
main(args)
File "metro/tools/run_metro_bodymesh.py", line 587, in main
raise KeyError(key) from None
KeyError: 'NODE_RANK'
print("Init distributed training on local rank {} ({}), rank {}, world size {}".format(args.local_rank, int(os.environ["LOCAL_RANK"]), int(os.environ["NODE_RANK"]), args.num_gpus))
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/os.py", line 681, in __getitem__
raise KeyError(key) from None
KeyError: 'NODE_RANK'
Traceback (most recent call last):
File "metro/tools/run_metro_bodymesh.py", line 714, in <module>
main(args)
File "metro/tools/run_metro_bodymesh.py", line 587, in main
print("Init distributed training on local rank {} ({}), rank {}, world size {}".format(args.local_rank, int(os.environ["LOCAL_RANK"]), int(os.environ["NODE_RANK"]), args.num_gpus))
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/os.py", line 681, in __getitem__
raise KeyError(key) from None
KeyError: 'NODE_RANK'
Traceback (most recent call last):
File "metro/tools/run_metro_bodymesh.py", line 714, in <module>
main(args)
File "metro/tools/run_metro_bodymesh.py", line 587, in main
print("Init distributed training on local rank {} ({}), rank {}, world size {}".format(args.local_rank, int(os.environ["LOCAL_RANK"]), int(os.environ["NODE_RANK"]), args.num_gpus))
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/os.py", line 681, in __getitem__
raise KeyError(key) from None
KeyError: 'NODE_RANK'
Killing subprocess 22973
Killing subprocess 22974
Killing subprocess 22975
Killing subprocess 22976
Killing subprocess 22977
Killing subprocess 22978
Killing subprocess 22979
Killing subprocess 22980
Traceback (most recent call last):
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/site-packages/torch/distributed/launch.py", line 340, in <module>
main()
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/site-packages/torch/distributed/launch.py", line 326, in main
sigkill_handler(signal.SIGTERM, None) # not coming back
File "/home/ubuntu/anaconda3/envs/metro/lib/python3.7/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler
raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
subprocess.CalledProcessError: Command '['/home/ubuntu/anaconda3/envs/metro/bin/python', '-u', 'metro/tools/run_metro_bodymesh.py', '--local_rank=7', '--train_yaml', 'Tax-H36m-coco40k-Muco-UP-Mpii/train.yaml', '--val_yaml', 'human3.6m/valid.protocol2.yaml', '--arch', 'hrnet-w64', '--num_workers', '4', '--per_gpu_train_batch_size', '4', '--per_gpu_eval_batch_size', '4', '--num_hidden_layers', '4', '--num_attention_heads', '4', '--lr', '1e-4', '--num_train_epochs', '200', '--input_feat_dim', '2051,512,128', '--hidden_feat_dim', '1024,256,128']' returned non-zero exit status 1.
Could you please suggest how to resolve this?
Thank you in advance!
Hi authors,
I am facing this problem when training the model with the command shown above.
The run fails with a `KeyError` on the `NODE_RANK` environment variable (the full log is included above). It appears the script expects `NODE_RANK` to be set, but the launcher does not export it.
Could you please give me some solutions? Thank you in advance!