Closed SakuraMaiii closed 7 months ago
Hi sir,
I was able to train normally before, but an error has suddenly started occurring and I don't know why. Could you help me fix this problem?
Me too.
$python tasks/run.py --config=egs/datasets/{Video_ID}/lm3d_radnerf_sr.yaml --exp_name=motion2video_nerf/{Video_ID}_head --reset
| Hparams chains: ['egs/egs_bases/radnerf/base.yaml', 'egs/egs_bases/radnerf/lm3d_radnerf.yaml', 'egs/datasets/lizzi/lm3d_radnerf.yaml', 'egs/datasets/lizzi/lm3d_radnerf_sr.yaml']
| Hparams: {
"accumulate_grad_batches": 1,
"add_eye_blink_cond": true,
"ambient_coord_dim": 3,
"ambient_loss_mode": "mae",
"amp": true,
"base_config": [
"./lm3d_radnerf.yaml"
],
"binary_data_dir": "data/binary/videos",
"bound": 1,
"camera_offset": [
0,
0,
0
],
"camera_scale": 4.0,
"clip_grad_norm": 0.0,
"clip_grad_value": 0,
"cond_dropout_rate": 0.0,
"cond_out_dim": 64,
"cond_type": "idexp_lm3d_normalized",
"cond_win_size": 1,
"cuda_ray": true,
"debug": false,
"density_thresh": 10,
"density_thresh_torso": 0.01,
"desired_resolution": 2048,
"dt_gamma": 0.00390625,
"eval_max_batches": 100,
"exp_name": "motion2video_nerf/lizzi_head",
"eye_blink_dim": 8,
"far": 0.9,
"finetune_lips": true,
"finetune_lips_start_iter": 200000,
"geo_feat_dim": 128,
"grid_interpolation_type": "linear",
"grid_size": 128,
"grid_type": "tiledgrid",
"gui_fovy": 21.24,
"gui_h": 512,
"gui_max_spp": 1,
"gui_radius": 3.35,
"gui_w": 512,
"hidden_dim_ambient": 128,
"hidden_dim_color": 128,
"hidden_dim_sigma": 128,
"individual_embedding_dim": 4,
"individual_embedding_num": 13000,
"infer": false,
"infer_audio_source_name": "",
"infer_bg_img_fname": "",
"infer_c2w_name": "",
"infer_cond_name": "",
"infer_lm3d_clamp_std": 1.5,
"infer_lm3d_lle_percent": 0.25,
"infer_lm3d_smooth_sigma": 0.0,
"infer_out_video_name": "",
"infer_scale_factor": 1.0,
"infer_smo_std": 0.0,
"infer_smooth_camera_path": true,
"infer_smooth_camera_path_kernel_size": 7,
"init_method": "tcp",
"lambda_ambient": null,
"lambda_dual_fm": 0.0,
"lambda_lap_ambient_loss": 0.0,
"lambda_lpips_loss": 0.001,
"lambda_weights_entropy": 0.0001,
"load_ckpt": "",
"load_imgs_to_memory": false,
"log2_hashmap_size": 16,
"lpips_mode": "vgg19_v2",
"lpips_start_iters": 200000,
"lr": 0.0005,
"lr_lambda_ambient": 0.01,
"max_ray_batch": 4096,
"max_steps": 16,
"max_updates": 250000,
"min_near": 0.05,
"n_rays": 65536,
"near": 0.3,
"nerf_keypoint_mode": "lm68",
"not_save_modules": [
"criterion_lpips",
"dual_disc"
],
"num_ckpt_keep": 1,
"num_layers_ambient": 3,
"num_layers_color": 2,
"num_layers_sigma": 3,
"num_sanity_val_steps": 2,
"num_steps": 16,
"num_valid_plots": 5,
"optimizer_adam_beta1": 0.9,
"optimizer_adam_beta2": 0.999,
"polygon_face_mask": true,
"print_nan_grads": false,
"processed_data_dir": "data/processed/videos",
"raw_data_dir": "data/raw/videos",
"resume_from_checkpoint": 0,
"save_best": true,
"save_codes": [
"tasks",
"modules",
"egs"
],
"save_gt": true,
"scheduler": "exponential",
"seed": 9999,
"smo_win_size": 3,
"smooth_lips": false,
"sr_start_iters": 0,
"start_rank": 0,
"target_ambient_loss": 1e-08,
"task_cls": "tasks.radnerfs.radnerf_sr.RADNeRFTask",
"tb_log_interval": 100,
"torso_head_aware": false,
"torso_individual_embedding_dim": 8,
"torso_shrink": 0.8,
"update_extra_interval": 16,
"upsample_steps": 0,
"use_window_cond": true,
"val_check_interval": 2000,
"valid_infer_interval": 10000,
"valid_monitor_key": "val_loss",
"valid_monitor_mode": "min",
"validate": false,
"video_id": "lizzi",
"warmup_updates": 0,
"weight_decay": 0,
"with_att": true,
"with_sr": true,
"work_dir": "checkpoints/motion2video_nerf/lizzi_head",
"world_size": -1,
"zero_dummy": true
}
02/20 10:22:28 PM GPU available: True, GPU used: [], world_size: 0, multi-machine training: False
/home/bingo06/AIGC/GeneFacePlusPlus/./tasks/radnerfs/dataset_utils.py:266: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
self.lm68s = torch.tensor(self.lm2ds[:, index_lm68_from_lm478, :])
val: Smooth head trajectory (rotation and translation) with a window size of 7
| Copied codes to checkpoints/motion2video_nerf/lizzi_head/codes/20240220222234.
| cond_prenet Trainable Parameters: 0.050M
| blink_embedding Trainable Parameters: 0.000M
| blink_encoder Trainable Parameters: 0.001M
| cond_att_net Trainable Parameters: 0.004M
| position_embedder Trainable Parameters: 1.807M
| ambient_net Trainable Parameters: 0.029M
| ambient_embedder Trainable Parameters: 1.807M
| sigma_net Trainable Parameters: 0.041M
| direction_embedder Trainable Parameters: 0.000M
| color_net Trainable Parameters: 0.019M
| dropout Trainable Parameters: 0.000M
| sr_net Trainable Parameters: 0.271M
Sanity Val: 0%| | 0/2 [00:00<?, ?step/s]/home/bingo06/AIGC/GeneFacePlusPlus/./tasks/radnerfs/dataset_utils.py:427: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
sample['lm68'] = torch.tensor(self.lm68s[idx].reshape([68*2]))
Sanity Val: 0%| | 0/2 [00:00<?, ?step/s]
Traceback (most recent call last):
File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 236, in run_single_process
self.train()
File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 314, in train
self.evaluate(self.task, False, 'Sanity Val', max_batches=self.num_sanity_val_steps)
File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 293, in evaluate
output = task_ref.validation_step(*args)
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/bingo06/AIGC/GeneFacePlusPlus/./tasks/radnerfs/radnerf_sr.py", line 344, in validation_step
outputs['losses'], model_out = self.run_model(sample, infer=False)
File "/home/bingo06/AIGC/GeneFacePlusPlus/./tasks/radnerfs/radnerf_sr.py", line 202, in run_model
model_out = self.model.render(rays_o, rays_d, cond_inp, bg_coords, poses, index=idx, staged=False, bg_color=bg_color, perturb=True, force_all_rays=False, cond_mask=cond_mask, eye_area_percent=eye_area_percent, **hparams)
File "/home/bingo06/AIGC/GeneFacePlusPlus/./modules/radnerfs/radnerf_sr.py", line 204, in render
results = super().render(rays_o, rays_d, cond, bg_coords, poses, index, dt_gamma, bg_color, perturb, force_all_rays, max_steps, T_thresh, cond_mask, eye_area_percent=eye_area_percent, **kwargs)
File "/home/bingo06/AIGC/GeneFacePlusPlus/./modules/radnerfs/renderer.py", line 372, in render
sigmas, rgbs, ambient = self.forward(xyzs, dirs, cond_feat, ind_code, cond_mask=cond_mask[rays_alive])
File "/home/bingo06/AIGC/GeneFacePlusPlus/./modules/radnerfs/radnerf_sr.py", line 152, in forward
pos_feat = self.position_embedder(position, bound=self.bound) # spatial feat f after E^3_{spatial} 3D grid in the paper
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/bingo06/AIGC/GeneFacePlusPlus/./modules/radnerfs/encoders/gridencoder/grid.py", line 159, in forward
outputs = grid_encode(inputs, self.embeddings, self.offsets, self.per_level_scale, self.base_resolution, inputs.requires_grad, self.gridtype_id, self.align_corners, self.interp_id)
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/autograd/function.py", line 506, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/cuda/amp/autocast_mode.py", line 98, in decorate_fwd
return fwd(*args, **kwargs)
File "/home/bingo06/AIGC/GeneFacePlusPlus/./modules/radnerfs/encoders/gridencoder/grid.py", line 54, in forward
_backend.grid_encode_forward(inputs, embeddings, offsets, outputs, B, D, C, L, S, H, dy_dx, gridtype, align_corners, interpolation)
RuntimeError: embeddings must be a CUDA tensor
Hi, this happens because CUDA_VISIBLE_DEVICES is not set, so the code tries to run on the CPU. I have updated the code, and it now uses CUDA_VISIBLE_DEVICES=0 by default.
Hi, this happens because CUDA_VISIBLE_DEVICES is not set, so the code tries to run on the CPU. I have updated the code, and it now uses CUDA_VISIBLE_DEVICES=0 by default.
Thank you, it worked for me. But when I tried to use CUDA_VISIBLE_DEVICES=0,1, it raised an error:
$!export CUDA_VISIBLE_DEVICES=0,1 && python tasks/run.py --config=egs/datasets/{Video_ID}/lm3d_radnerf_sr.yaml --exp_name=motion2video_nerf/{Video_ID}_head --reset
| set_hparams Unknow hparams: []
| Hparams chains: ['egs/egs_bases/radnerf/base.yaml', 'egs/egs_bases/radnerf/lm3d_radnerf.yaml', 'egs/datasets/lizzi/lm3d_radnerf.yaml', 'egs/datasets/lizzi/lm3d_radnerf_sr.yaml']
| Hparams: {
"accumulate_grad_batches": 1,
"add_eye_blink_cond": true,
"ambient_coord_dim": 3,
"ambient_loss_mode": "mae",
"amp": true,
"base_config": [
"./lm3d_radnerf.yaml"
],
"binary_data_dir": "data/binary/videos",
"bound": 1,
"camera_offset": [
0,
0,
0
],
"camera_scale": 4.0,
"clip_grad_norm": 0.0,
"clip_grad_value": 0,
"cond_dropout_rate": 0.0,
"cond_out_dim": 64,
"cond_type": "idexp_lm3d_normalized",
"cond_win_size": 1,
"cuda_ray": true,
"debug": false,
"density_thresh": 10,
"density_thresh_torso": 0.01,
"desired_resolution": 2048,
"dt_gamma": 0.00390625,
"eval_max_batches": 100,
"exp_name": "motion2video_nerf/lizzi_head",
"eye_blink_dim": 8,
"far": 0.9,
"finetune_lips": true,
"finetune_lips_start_iter": 200000,
"geo_feat_dim": 128,
"grid_interpolation_type": "linear",
"grid_size": 128,
"grid_type": "tiledgrid",
"gui_fovy": 21.24,
"gui_h": 512,
"gui_max_spp": 1,
"gui_radius": 3.35,
"gui_w": 512,
"hidden_dim_ambient": 128,
"hidden_dim_color": 128,
"hidden_dim_sigma": 128,
"individual_embedding_dim": 4,
"individual_embedding_num": 13000,
"infer": false,
"infer_audio_source_name": "",
"infer_bg_img_fname": "",
"infer_c2w_name": "",
"infer_cond_name": "",
"infer_lm3d_clamp_std": 1.5,
"infer_lm3d_lle_percent": 0.25,
"infer_lm3d_smooth_sigma": 0.0,
"infer_out_video_name": "",
"infer_scale_factor": 1.0,
"infer_smo_std": 0.0,
"infer_smooth_camera_path": true,
"infer_smooth_camera_path_kernel_size": 7,
"init_method": "tcp",
"lambda_ambient": null,
"lambda_dual_fm": 0.0,
"lambda_lap_ambient_loss": 0.0,
"lambda_lpips_loss": 0.001,
"lambda_weights_entropy": 0.0001,
"load_ckpt": "",
"load_imgs_to_memory": false,
"log2_hashmap_size": 16,
"lpips_mode": "vgg19_v2",
"lpips_start_iters": 200000,
"lr": 0.0005,
"lr_lambda_ambient": 0.01,
"max_ray_batch": 4096,
"max_steps": 16,
"max_updates": 250000,
"min_near": 0.05,
"n_rays": 65536,
"near": 0.3,
"nerf_keypoint_mode": "lm68",
"not_save_modules": [
"criterion_lpips",
"dual_disc"
],
"num_ckpt_keep": 1,
"num_layers_ambient": 3,
"num_layers_color": 2,
"num_layers_sigma": 3,
"num_sanity_val_steps": 2,
"num_steps": 16,
"num_valid_plots": 5,
"optimizer_adam_beta1": 0.9,
"optimizer_adam_beta2": 0.999,
"polygon_face_mask": true,
"print_nan_grads": false,
"processed_data_dir": "data/processed/videos",
"raw_data_dir": "data/raw/videos",
"resume_from_checkpoint": 0,
"save_best": true,
"save_codes": [
"tasks",
"modules",
"egs"
],
"save_gt": true,
"scheduler": "exponential",
"seed": 9999,
"smo_win_size": 3,
"smooth_lips": false,
"sr_start_iters": 0,
"start_rank": 0,
"target_ambient_loss": 1e-08,
"task_cls": "tasks.radnerfs.radnerf_sr.RADNeRFTask",
"tb_log_interval": 100,
"torso_head_aware": false,
"torso_individual_embedding_dim": 8,
"torso_shrink": 0.8,
"update_extra_interval": 16,
"upsample_steps": 0,
"use_window_cond": true,
"val_check_interval": 2000,
"valid_infer_interval": 10000,
"valid_monitor_key": "val_loss",
"valid_monitor_mode": "min",
"validate": false,
"video_id": "lizzi",
"warmup_updates": 0,
"weight_decay": 0,
"with_att": true,
"with_sr": true,
"work_dir": "checkpoints/motion2video_nerf/lizzi_head",
"world_size": -1,
"zero_dummy": true
}
02/20 11:46:13 PM GPU available: True, GPU used: [0, 1], world_size: 2, multi-machine training: False
before init_tcp!before init_tcp!
02/20 11:46:16 PM Added key: store_based_barrier_key:1 to store for rank: 0
02/20 11:46:16 PM Added key: store_based_barrier_key:1 to store for rank: 1
02/20 11:46:16 PM Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes.
02/20 11:46:16 PM Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes.
after init_tcp!
/home/bingo06/AIGC/GeneFacePlusPlus/./tasks/radnerfs/dataset_utils.py:266: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
self.lm68s = torch.tensor(self.lm2ds[:, index_lm68_from_lm478, :])
val: Smooth head trajectory (rotation and translation) with a window size of 7
| Copied codes to checkpoints/motion2video_nerf/lizzi_head/codes/20240220234623.
| cond_prenet Trainable Parameters: 0.050M
| blink_embedding Trainable Parameters: 0.000M
| blink_encoder Trainable Parameters: 0.001M
| cond_att_net Trainable Parameters: 0.004M
| position_embedder Trainable Parameters: 1.807M
| ambient_net Trainable Parameters: 0.029M
| ambient_embedder Trainable Parameters: 1.807M
| sigma_net Trainable Parameters: 0.041M
| direction_embedder Trainable Parameters: 0.000M
| color_net Trainable Parameters: 0.019M
| dropout Trainable Parameters: 0.000M
| sr_net Trainable Parameters: 0.271M
Sanity Val: 0%| | 0/2 [00:00<?, ?step/s]/home/bingo06/AIGC/GeneFacePlusPlus/./tasks/radnerfs/dataset_utils.py:427: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
sample['lm68'] = torch.tensor(self.lm68s[idx].reshape([68*2]))
Sanity Val: 0%| | 0/2 [00:00<?, ?step/s]
Traceback (most recent call last):
File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 236, in run_single_process
self.train()
File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 314, in train
self.evaluate(self.task, False, 'Sanity Val', max_batches=self.num_sanity_val_steps)
File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 288, in evaluate
output = task(*args)
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/ddp_utils.py", line 82, in forward
inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1223, in scatter
return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 52, in scatter_kwargs
inputs = scatter(inputs, target_gpus, dim) if inputs else []
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 44, in scatter
res = scatter_map(inputs)
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 31, in scatter_map
return list(zip(*map(scatter_map, obj)))
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 35, in scatter_map
return [type(obj)(i) for i in zip(*map(scatter_map, obj.items()))]
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 31, in scatter_map
return list(zip(*map(scatter_map, obj)))
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 27, in scatter_map
return Scatter.apply(target_gpus, None, dim, obj)
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/autograd/function.py", line 506, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/_functions.py", line 96, in forward
outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 189, in scatter
return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
RuntimeError: chunk expects at least a 1-dimensional tensor
Traceback (most recent call last):
File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 141, in fit
mp.start_processes(self.ddp_run,nprocs=self.num_local_gpus, args=(task_cls, copy.deepcopy(hparams)), start_method='spawn')
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 197, in start_processes
while not context.join():
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 140, in join
raise ProcessExitedException(
torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with signal SIGTERM
Traceback (most recent call last):
File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 141, in fit
mp.start_processes(self.ddp_run,nprocs=self.num_local_gpus, args=(task_cls, copy.deepcopy(hparams)), start_method='spawn')
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 197, in start_processes
while not context.join():
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 140, in join
raise ProcessExitedException(
torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with signal SIGTERM
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/bingo06/AIGC/GeneFacePlusPlus/tasks/run.py", line 28, in <module>
run_task()
File "/home/bingo06/AIGC/GeneFacePlusPlus/tasks/run.py", line 16, in run_task
task_cls.start()
File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/base_task.py", line 272, in start
trainer.fit(cls)
File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 154, in fit
subprocess.check_call(f'pkill -f "GeneFace_worker \({hparams["work_dir"]}"', shell=True)
File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/subprocess.py", line 373, in check_call
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command 'pkill -f "GeneFace_worker \(checkpoints/motion2video_nerf/lizzi_head"' returned non-zero exit status 1.
Does this mean you can’t train on multiple GPUs?
嗨,这是因为未分配 CUDA_VISIBLE_DEVICES,并且代码尝试在 CPU 上运行。我已经更新了代码,现在默认为 CUDA_VISIBLE_DEVICES=0。
thanks sir
Hi sir,
I was able to train normally before, but an error has suddenly started occurring and I don't know why. Could you help me fix this problem?