RuntimeError: embeddings must be a CUDA tensor

Hi sir,

I was able to train normally before, but now an error has suddenly started occurring and I don't know why. Can you help me fix this problem?

Me too.

$python tasks/run.py --config=egs/datasets/{Video_ID}/lm3d_radnerf_sr.yaml --exp_name=motion2video_nerf/{Video_ID}_head --reset

| Hparams chains:  ['egs/egs_bases/radnerf/base.yaml', 'egs/egs_bases/radnerf/lm3d_radnerf.yaml', 'egs/datasets/lizzi/lm3d_radnerf.yaml', 'egs/datasets/lizzi/lm3d_radnerf_sr.yaml']
| Hparams:  {
  "accumulate_grad_batches": 1,
  "add_eye_blink_cond": true,
  "ambient_coord_dim": 3,
  "ambient_loss_mode": "mae",
  "amp": true,
  "base_config": [
    "./lm3d_radnerf.yaml"
  ],
  "binary_data_dir": "data/binary/videos",
  "bound": 1,
  "camera_offset": [
    0,
    0,
    0
  ],
  "camera_scale": 4.0,
  "clip_grad_norm": 0.0,
  "clip_grad_value": 0,
  "cond_dropout_rate": 0.0,
  "cond_out_dim": 64,
  "cond_type": "idexp_lm3d_normalized",
  "cond_win_size": 1,
  "cuda_ray": true,
  "debug": false,
  "density_thresh": 10,
  "density_thresh_torso": 0.01,
  "desired_resolution": 2048,
  "dt_gamma": 0.00390625,
  "eval_max_batches": 100,
  "exp_name": "motion2video_nerf/lizzi_head",
  "eye_blink_dim": 8,
  "far": 0.9,
  "finetune_lips": true,
  "finetune_lips_start_iter": 200000,
  "geo_feat_dim": 128,
  "grid_interpolation_type": "linear",
  "grid_size": 128,
  "grid_type": "tiledgrid",
  "gui_fovy": 21.24,
  "gui_h": 512,
  "gui_max_spp": 1,
  "gui_radius": 3.35,
  "gui_w": 512,
  "hidden_dim_ambient": 128,
  "hidden_dim_color": 128,
  "hidden_dim_sigma": 128,
  "individual_embedding_dim": 4,
  "individual_embedding_num": 13000,
  "infer": false,
  "infer_audio_source_name": "",
  "infer_bg_img_fname": "",
  "infer_c2w_name": "",
  "infer_cond_name": "",
  "infer_lm3d_clamp_std": 1.5,
  "infer_lm3d_lle_percent": 0.25,
  "infer_lm3d_smooth_sigma": 0.0,
  "infer_out_video_name": "",
  "infer_scale_factor": 1.0,
  "infer_smo_std": 0.0,
  "infer_smooth_camera_path": true,
  "infer_smooth_camera_path_kernel_size": 7,
  "init_method": "tcp",
  "lambda_ambient": null,
  "lambda_dual_fm": 0.0,
  "lambda_lap_ambient_loss": 0.0,
  "lambda_lpips_loss": 0.001,
  "lambda_weights_entropy": 0.0001,
  "load_ckpt": "",
  "load_imgs_to_memory": false,
  "log2_hashmap_size": 16,
  "lpips_mode": "vgg19_v2",
  "lpips_start_iters": 200000,
  "lr": 0.0005,
  "lr_lambda_ambient": 0.01,
  "max_ray_batch": 4096,
  "max_steps": 16,
  "max_updates": 250000,
  "min_near": 0.05,
  "n_rays": 65536,
  "near": 0.3,
  "nerf_keypoint_mode": "lm68",
  "not_save_modules": [
    "criterion_lpips",
    "dual_disc"
  ],
  "num_ckpt_keep": 1,
  "num_layers_ambient": 3,
  "num_layers_color": 2,
  "num_layers_sigma": 3,
  "num_sanity_val_steps": 2,
  "num_steps": 16,
  "num_valid_plots": 5,
  "optimizer_adam_beta1": 0.9,
  "optimizer_adam_beta2": 0.999,
  "polygon_face_mask": true,
  "print_nan_grads": false,
  "processed_data_dir": "data/processed/videos",
  "raw_data_dir": "data/raw/videos",
  "resume_from_checkpoint": 0,
  "save_best": true,
  "save_codes": [
    "tasks",
    "modules",
    "egs"
  ],
  "save_gt": true,
  "scheduler": "exponential",
  "seed": 9999,
  "smo_win_size": 3,
  "smooth_lips": false,
  "sr_start_iters": 0,
  "start_rank": 0,
  "target_ambient_loss": 1e-08,
  "task_cls": "tasks.radnerfs.radnerf_sr.RADNeRFTask",
  "tb_log_interval": 100,
  "torso_head_aware": false,
  "torso_individual_embedding_dim": 8,
  "torso_shrink": 0.8,
  "update_extra_interval": 16,
  "upsample_steps": 0,
  "use_window_cond": true,
  "val_check_interval": 2000,
  "valid_infer_interval": 10000,
  "valid_monitor_key": "val_loss",
  "valid_monitor_mode": "min",
  "validate": false,
  "video_id": "lizzi",
  "warmup_updates": 0,
  "weight_decay": 0,
  "with_att": true,
  "with_sr": true,
  "work_dir": "checkpoints/motion2video_nerf/lizzi_head",
  "world_size": -1,
  "zero_dummy": true
}
02/20 10:22:28 PM GPU available: True, GPU used: [], world_size: 0, multi-machine training: False
/home/bingo06/AIGC/GeneFacePlusPlus/./tasks/radnerfs/dataset_utils.py:266: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  self.lm68s = torch.tensor(self.lm2ds[:, index_lm68_from_lm478, :])
val: Smooth head trajectory (rotation and translation) with a window size of 7
| Copied codes to checkpoints/motion2video_nerf/lizzi_head/codes/20240220222234.
| cond_prenet Trainable Parameters: 0.050M
| blink_embedding Trainable Parameters: 0.000M
| blink_encoder Trainable Parameters: 0.001M
| cond_att_net Trainable Parameters: 0.004M
| position_embedder Trainable Parameters: 1.807M
| ambient_net Trainable Parameters: 0.029M
| ambient_embedder Trainable Parameters: 1.807M
| sigma_net Trainable Parameters: 0.041M
| direction_embedder Trainable Parameters: 0.000M
| color_net Trainable Parameters: 0.019M
| dropout Trainable Parameters: 0.000M
| sr_net Trainable Parameters: 0.271M
Sanity Val:   0%|                                       | 0/2 [00:00<?, ?step/s]/home/bingo06/AIGC/GeneFacePlusPlus/./tasks/radnerfs/dataset_utils.py:427: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  sample['lm68'] = torch.tensor(self.lm68s[idx].reshape([68*2]))
Sanity Val:   0%|                                       | 0/2 [00:00<?, ?step/s]
Traceback (most recent call last):
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 236, in run_single_process
    self.train()
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 314, in train
    self.evaluate(self.task, False, 'Sanity Val', max_batches=self.num_sanity_val_steps)
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 293, in evaluate
    output = task_ref.validation_step(*args)
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./tasks/radnerfs/radnerf_sr.py", line 344, in validation_step
    outputs['losses'], model_out = self.run_model(sample, infer=False)
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./tasks/radnerfs/radnerf_sr.py", line 202, in run_model
    model_out = self.model.render(rays_o, rays_d, cond_inp, bg_coords, poses, index=idx, staged=False, bg_color=bg_color, perturb=True, force_all_rays=False, cond_mask=cond_mask, eye_area_percent=eye_area_percent, **hparams)
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./modules/radnerfs/radnerf_sr.py", line 204, in render
    results = super().render(rays_o, rays_d, cond, bg_coords, poses, index, dt_gamma, bg_color, perturb, force_all_rays, max_steps, T_thresh, cond_mask, eye_area_percent=eye_area_percent, **kwargs)
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./modules/radnerfs/renderer.py", line 372, in render
    sigmas, rgbs, ambient = self.forward(xyzs, dirs, cond_feat, ind_code, cond_mask=cond_mask[rays_alive])
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./modules/radnerfs/radnerf_sr.py", line 152, in forward
    pos_feat = self.position_embedder(position, bound=self.bound) # spatial feat f after E^3_{spatial} 3D grid in the paper
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./modules/radnerfs/encoders/gridencoder/grid.py", line 159, in forward
    outputs = grid_encode(inputs, self.embeddings, self.offsets, self.per_level_scale, self.base_resolution, inputs.requires_grad, self.gridtype_id, self.align_corners, self.interp_id)
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/cuda/amp/autocast_mode.py", line 98, in decorate_fwd
    return fwd(*args, **kwargs)
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./modules/radnerfs/encoders/gridencoder/grid.py", line 54, in forward
    _backend.grid_encode_forward(inputs, embeddings, offsets, outputs, B, D, C, L, S, H, dy_dx, gridtype, align_corners, interpolation)
RuntimeError: embeddings must be a CUDA tensor

Hi, this happens because CUDA_VISIBLE_DEVICES was not set, so the code tried to run on the CPU (note the "GPU used: []" line in the log above). I have updated the code, and it now uses CUDA_VISIBLE_DEVICES=0 by default.

Thank you, it worked for me. However, when I tried to use CUDA_VISIBLE_DEVICES=0,1, it raised an error:

$!export CUDA_VISIBLE_DEVICES=0,1 && python tasks/run.py --config=egs/datasets/{Video_ID}/lm3d_radnerf_sr.yaml --exp_name=motion2video_nerf/{Video_ID}_head --reset

| set_hparams Unknow hparams:  []
| Hparams chains:  ['egs/egs_bases/radnerf/base.yaml', 'egs/egs_bases/radnerf/lm3d_radnerf.yaml', 'egs/datasets/lizzi/lm3d_radnerf.yaml', 'egs/datasets/lizzi/lm3d_radnerf_sr.yaml']
| Hparams:  {
  "accumulate_grad_batches": 1,
  "add_eye_blink_cond": true,
  "ambient_coord_dim": 3,
  "ambient_loss_mode": "mae",
  "amp": true,
  "base_config": [
    "./lm3d_radnerf.yaml"
  ],
  "binary_data_dir": "data/binary/videos",
  "bound": 1,
  "camera_offset": [
    0,
    0,
    0
  ],
  "camera_scale": 4.0,
  "clip_grad_norm": 0.0,
  "clip_grad_value": 0,
  "cond_dropout_rate": 0.0,
  "cond_out_dim": 64,
  "cond_type": "idexp_lm3d_normalized",
  "cond_win_size": 1,
  "cuda_ray": true,
  "debug": false,
  "density_thresh": 10,
  "density_thresh_torso": 0.01,
  "desired_resolution": 2048,
  "dt_gamma": 0.00390625,
  "eval_max_batches": 100,
  "exp_name": "motion2video_nerf/lizzi_head",
  "eye_blink_dim": 8,
  "far": 0.9,
  "finetune_lips": true,
  "finetune_lips_start_iter": 200000,
  "geo_feat_dim": 128,
  "grid_interpolation_type": "linear",
  "grid_size": 128,
  "grid_type": "tiledgrid",
  "gui_fovy": 21.24,
  "gui_h": 512,
  "gui_max_spp": 1,
  "gui_radius": 3.35,
  "gui_w": 512,
  "hidden_dim_ambient": 128,
  "hidden_dim_color": 128,
  "hidden_dim_sigma": 128,
  "individual_embedding_dim": 4,
  "individual_embedding_num": 13000,
  "infer": false,
  "infer_audio_source_name": "",
  "infer_bg_img_fname": "",
  "infer_c2w_name": "",
  "infer_cond_name": "",
  "infer_lm3d_clamp_std": 1.5,
  "infer_lm3d_lle_percent": 0.25,
  "infer_lm3d_smooth_sigma": 0.0,
  "infer_out_video_name": "",
  "infer_scale_factor": 1.0,
  "infer_smo_std": 0.0,
  "infer_smooth_camera_path": true,
  "infer_smooth_camera_path_kernel_size": 7,
  "init_method": "tcp",
  "lambda_ambient": null,
  "lambda_dual_fm": 0.0,
  "lambda_lap_ambient_loss": 0.0,
  "lambda_lpips_loss": 0.001,
  "lambda_weights_entropy": 0.0001,
  "load_ckpt": "",
  "load_imgs_to_memory": false,
  "log2_hashmap_size": 16,
  "lpips_mode": "vgg19_v2",
  "lpips_start_iters": 200000,
  "lr": 0.0005,
  "lr_lambda_ambient": 0.01,
  "max_ray_batch": 4096,
  "max_steps": 16,
  "max_updates": 250000,
  "min_near": 0.05,
  "n_rays": 65536,
  "near": 0.3,
  "nerf_keypoint_mode": "lm68",
  "not_save_modules": [
    "criterion_lpips",
    "dual_disc"
  ],
  "num_ckpt_keep": 1,
  "num_layers_ambient": 3,
  "num_layers_color": 2,
  "num_layers_sigma": 3,
  "num_sanity_val_steps": 2,
  "num_steps": 16,
  "num_valid_plots": 5,
  "optimizer_adam_beta1": 0.9,
  "optimizer_adam_beta2": 0.999,
  "polygon_face_mask": true,
  "print_nan_grads": false,
  "processed_data_dir": "data/processed/videos",
  "raw_data_dir": "data/raw/videos",
  "resume_from_checkpoint": 0,
  "save_best": true,
  "save_codes": [
    "tasks",
    "modules",
    "egs"
  ],
  "save_gt": true,
  "scheduler": "exponential",
  "seed": 9999,
  "smo_win_size": 3,
  "smooth_lips": false,
  "sr_start_iters": 0,
  "start_rank": 0,
  "target_ambient_loss": 1e-08,
  "task_cls": "tasks.radnerfs.radnerf_sr.RADNeRFTask",
  "tb_log_interval": 100,
  "torso_head_aware": false,
  "torso_individual_embedding_dim": 8,
  "torso_shrink": 0.8,
  "update_extra_interval": 16,
  "upsample_steps": 0,
  "use_window_cond": true,
  "val_check_interval": 2000,
  "valid_infer_interval": 10000,
  "valid_monitor_key": "val_loss",
  "valid_monitor_mode": "min",
  "validate": false,
  "video_id": "lizzi",
  "warmup_updates": 0,
  "weight_decay": 0,
  "with_att": true,
  "with_sr": true,
  "work_dir": "checkpoints/motion2video_nerf/lizzi_head",
  "world_size": -1,
  "zero_dummy": true
}
02/20 11:46:13 PM GPU available: True, GPU used: [0, 1], world_size: 2, multi-machine training: False
before init_tcp!before init_tcp!

02/20 11:46:16 PM Added key: store_based_barrier_key:1 to store for rank: 0
02/20 11:46:16 PM Added key: store_based_barrier_key:1 to store for rank: 1
02/20 11:46:16 PM Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes.
02/20 11:46:16 PM Rank 1: Completed store-based barrier for key:store_based_barrier_key:1 with 2 nodes.
after init_tcp!
/home/bingo06/AIGC/GeneFacePlusPlus/./tasks/radnerfs/dataset_utils.py:266: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  self.lm68s = torch.tensor(self.lm2ds[:, index_lm68_from_lm478, :])
val: Smooth head trajectory (rotation and translation) with a window size of 7
| Copied codes to checkpoints/motion2video_nerf/lizzi_head/codes/20240220234623.
| cond_prenet Trainable Parameters: 0.050M
| blink_embedding Trainable Parameters: 0.000M
| blink_encoder Trainable Parameters: 0.001M
| cond_att_net Trainable Parameters: 0.004M
| position_embedder Trainable Parameters: 1.807M
| ambient_net Trainable Parameters: 0.029M
| ambient_embedder Trainable Parameters: 1.807M
| sigma_net Trainable Parameters: 0.041M
| direction_embedder Trainable Parameters: 0.000M
| color_net Trainable Parameters: 0.019M
| dropout Trainable Parameters: 0.000M
| sr_net Trainable Parameters: 0.271M
Sanity Val:   0%|                                       | 0/2 [00:00<?, ?step/s]/home/bingo06/AIGC/GeneFacePlusPlus/./tasks/radnerfs/dataset_utils.py:427: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  sample['lm68'] = torch.tensor(self.lm68s[idx].reshape([68*2]))
Sanity Val:   0%|                                       | 0/2 [00:00<?, ?step/s]
Traceback (most recent call last):
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 236, in run_single_process
    self.train()
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 314, in train
    self.evaluate(self.task, False, 'Sanity Val', max_batches=self.num_sanity_val_steps)
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 288, in evaluate
    output = task(*args)
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/ddp_utils.py", line 82, in forward
    inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1223, in scatter
    return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 52, in scatter_kwargs
    inputs = scatter(inputs, target_gpus, dim) if inputs else []
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 44, in scatter
    res = scatter_map(inputs)
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 31, in scatter_map
    return list(zip(*map(scatter_map, obj)))
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 35, in scatter_map
    return [type(obj)(i) for i in zip(*map(scatter_map, obj.items()))]
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 31, in scatter_map
    return list(zip(*map(scatter_map, obj)))
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/scatter_gather.py", line 27, in scatter_map
    return Scatter.apply(target_gpus, None, dim, obj)
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/_functions.py", line 96, in forward
    outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/nn/parallel/comm.py", line 189, in scatter
    return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
RuntimeError: chunk expects at least a 1-dimensional tensor
Traceback (most recent call last):
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 141, in fit
    mp.start_processes(self.ddp_run,nprocs=self.num_local_gpus, args=(task_cls, copy.deepcopy(hparams)), start_method='spawn')
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 197, in start_processes
    while not context.join():
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 140, in join
    raise ProcessExitedException(
torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with signal SIGTERM
Traceback (most recent call last):
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 141, in fit
    mp.start_processes(self.ddp_run,nprocs=self.num_local_gpus, args=(task_cls, copy.deepcopy(hparams)), start_method='spawn')
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 197, in start_processes
    while not context.join():
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 140, in join
    raise ProcessExitedException(
torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with signal SIGTERM

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/bingo06/AIGC/GeneFacePlusPlus/tasks/run.py", line 28, in <module>
    run_task()
  File "/home/bingo06/AIGC/GeneFacePlusPlus/tasks/run.py", line 16, in run_task
    task_cls.start()
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/base_task.py", line 272, in start
    trainer.fit(cls)
  File "/home/bingo06/AIGC/GeneFacePlusPlus/./utils/commons/trainer.py", line 154, in fit
    subprocess.check_call(f'pkill -f "GeneFace_worker \({hparams["work_dir"]}"', shell=True)
  File "/home/bingo06/miniconda3/envs/gene/lib/python3.9/subprocess.py", line 373, in check_call
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command 'pkill -f "GeneFace_worker \(checkpoints/motion2video_nerf/lizzi_head"' returned non-zero exit status 1.

Does this mean you can’t train on multiple GPUs?

yerfor / GeneFacePlusPlus

RuntimeError: embeddings must be a CUDA tensor #71