I'm trying to train from scratch, and at epoch 42 I hit an assertion error because the 'spine_rest_loc' tensor contains NaN values.
Could you please help?
############# Starting Epoch 42 | LR: 0.001 #############
loss: -2.84896647 | accuvd29: 0.7797 | acc17: 0.8761: 71%|██████████████▊ | 3446/4878 [22:06<09:11, 2.60it/s]
Traceback (most recent call last):
File "./scripts/train_smpl_cam.py", line 363, in
main()
File "./scripts/train_smpl_cam.py", line 209, in main
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(opt, cfg))
File "/home/fou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, args)
File "/home/orifou/HybrIK/scripts/train_smpl_cam.py", line 310, in main_worker
loss, acc17 = train(opt, train_loader, m, criterion, optimizer, writer, i)
File "/home/orifou/HybrIK/scripts/train_smpl_cam.py", line 59, in train
output = m(inps, trans_inv=trans_inv, intrinsic_param=intrinsic_param, joint_root=root, depth_factor=depth_factor)
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward
output = self._run_ddp_forward(*inputs, *kwargs)
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969, in _run_ddp_forward
return module_to_run(inputs[0], kwargs[0])
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/orifou/HybrIK/hybrik/models/simple3dposeSMPLWithCam.py", line 346, in forward
output = self.smpl.hybrik(
File "/home/orifou/HybrIK/hybrik/models/layers/smpl/SMPL.py", line 256, in hybrik
vertices, new_joints, rot_mats, joints_from_verts = hybrik(
File "/home/orifou/HybrIK/hybrik/models/layers/smpl/lbs.py", line 357, in hybrik
rot_mats, rotate_rest_pose = batch_inverse_kinematics_transform_naive(
File "/home/orifou/HybrIK/hybrik/models/layers/smpl/lbs.py", line 821, in batch_inverse_kinematics_transform_naive
global_orient_mat = batch_get_pelvis_orient(
File "/home/orifou/HybrIK/hybrik/models/layers/smpl/lbs.py", line 994, in batch_get_pelvis_orient
assert torch.sum(torch.isnan(spine_rest_loc)
AssertionError: ('spine_rest_loc', tensor([[[nan],
Hi,
Thank you for your work!
I'm trying to train from scratch, and at epoch 42 I hit an assertion error because the 'spine_rest_loc' tensor contains NaN values. Could you please help?
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Epoch 41 | h36m err: 47.64674702400522 / 46.18705879641749 | 3dpw err: 61.5547851122922 / 57.69414353312088
############# Starting Epoch 42 | LR: 0.001 #############
loss: -2.84896647 | accuvd29: 0.7797 | acc17: 0.8761: 71%|██████████████▊ | 3446/4878 [22:06<09:11, 2.60it/s]
Traceback (most recent call last):
File "./scripts/train_smpl_cam.py", line 363, in
main()
File "./scripts/train_smpl_cam.py", line 209, in main
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(opt, cfg))
File "/home/fou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, args)
File "/home/orifou/HybrIK/scripts/train_smpl_cam.py", line 310, in main_worker
loss, acc17 = train(opt, train_loader, m, criterion, optimizer, writer, i)
File "/home/orifou/HybrIK/scripts/train_smpl_cam.py", line 59, in train
output = m(inps, trans_inv=trans_inv, intrinsic_param=intrinsic_param, joint_root=root, depth_factor=depth_factor)
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(input, kwargs)
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward
output = self._run_ddp_forward(*inputs, *kwargs)
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 969, in _run_ddp_forward
return module_to_run(inputs[0], kwargs[0])
File "/home/orifou/anaconda3/envs/hybrik2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "/home/orifou/HybrIK/hybrik/models/simple3dposeSMPLWithCam.py", line 346, in forward
output = self.smpl.hybrik(
File "/home/orifou/HybrIK/hybrik/models/layers/smpl/SMPL.py", line 256, in hybrik
vertices, new_joints, rot_mats, joints_from_verts = hybrik(
File "/home/orifou/HybrIK/hybrik/models/layers/smpl/lbs.py", line 357, in hybrik
rot_mats, rotate_rest_pose = batch_inverse_kinematics_transform_naive(
File "/home/orifou/HybrIK/hybrik/models/layers/smpl/lbs.py", line 821, in batch_inverse_kinematics_transform_naive
global_orient_mat = batch_get_pelvis_orient(
File "/home/orifou/HybrIK/hybrik/models/layers/smpl/lbs.py", line 994, in batch_get_pelvis_orient
assert torch.sum(torch.isnan(spine_rest_loc)
AssertionError: ('spine_rest_loc', tensor([[[nan],