ali-vilab / dreamtalk

Official implementation of the paper: DreamTalk: When Expressive Talking Head Generation Meets Diffusion Probabilistic Models
https://dreamtalk-project.github.io/
MIT License

Error when using Nvidia GPU. #50

Open Toolfolks opened 1 month ago

Toolfolks commented 1 month ago

I have this command working okay:

python inference_for_demo_video.py --wav_path data/audio/acknowledgement_english.m4a --style_clip_path data/style_clip/3DMM/M030_front_neutral_level1_001.mat --pose_path data/pose/RichardShelby_front_neutral_level1_001.mat --image_path data/src_img/uncropped/male_face.png --cfg_scale 1.0 --max_gen_len 30 --output_name acknowledgement_english@M030_front_neutral_level1_001@male_face --device cuda

(new_dreamtalk) D:\techy\talkingHeads\dreamtalk>python testGpu.py
WAV Path: D:\techy\talkingHeads\dreamtalk\data\audio\acknowledgement_english.m4a
Output Path: D:\techy\talkingHeads\dreamtalk\tmp\acknowledgement_english@M030_front_neutral_level1_001@male_face\acknowledgement_english@M030_front_neutral_level1_001@male_face_16K.wav
PyTorch Version: 2.3.1+cpu
CUDA Available: False
CUDA Version: None
No CUDA device found.
NumPy Version: 1.22.4
SciPy Version: 1.13.1
Torchaudio Version: 2.3.1+cpu
OpenCV Version: 4.4.0
Available backends after updating PATH: ['soundfile']

However, when I switch to an environment with a GPU:

(dreamtalk) D:\techy\talkingHeads\dreamtalk>python testGpu.py
WAV Path: D:\techy\talkingHeads\dreamtalk\data\audio\acknowledgement_english.m4a
Output Path: D:\techy\talkingHeads\dreamtalk\tmp\acknowledgement_english@M030_front_neutral_level1_001@male_face\acknowledgement_english@M030_front_neutral_level1_001@male_face_16K.wav
PyTorch Version: 2.3.1+cu121
CUDA Available: True
CUDA Version: 12.1
Device Name: NVIDIA GeForce RTX 3060
NumPy Version: 1.22.4
SciPy Version: 1.10.0
Torchaudio Version: 2.3.1+cu121
OpenCV Version: 4.10.0
Available backends after updating PATH: ['soundfile']

I get this error:

Traceback (most recent call last):
  File "inference_for_demo_video.py", line 187, in <module>
    inference_one_video(
  File "C:\Users\User\.conda\envs\dreamtalk\lib\site-packages\torch\utils\_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "inference_for_demo_video.py", line 88, in inference_one_video
    gen_exp_stack = diff_net.sample(
  File "D:\techy\talkingHeads\dreamtalk\core\networks\diffusion_net.py", line 216, in sample
    return self.ddim_sample(
  File "D:\techy\talkingHeads\dreamtalk\core\networks\diffusion_net.py", line 144, in ddim_sample
    "style_clip": torch.cat([style_clip, uncond_style_clip], dim=0),
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument tensors in method wrapper_CUDA_cat)
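As far as I understand, torch.cat fails like this whenever its inputs sit on different devices. A tiny standalone example (toy tensors only, nothing DreamTalk-specific; needs a working CUDA build to reproduce):

import torch

a = torch.zeros(2, 3, device="cuda")  # on the GPU
b = torch.zeros(2, 3)                 # still on the CPU
torch.cat([a, b], dim=0)              # RuntimeError: Expected all tensors to be on the same device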

I'm new to programming, but I have spent hours going round in circles with ChatGPT.

Anyone have a solution?

Toolfolks commented 1 month ago

I finally got it going after 2-3 days. It was a pain getting all the installs done. The key change was calling .to(device) on the model and on every input tensor (see the sketch just below; the full script follows).
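Roughly, the fix is making sure everything that goes into diff_net.sample lives on the same device as the model. A sketch of the pattern, using the variable names from the full script below (not a drop-in patch on its own):

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

diff_net = get_diff_net(cfg, device)                  # model weights moved onto device inside get_diff_net
audio = audio_win.unsqueeze(0).to(device)             # wav2vec features on the same device
style_clip = style_clip_raw.unsqueeze(0).to(device)   # style clip on the same device
style_pad_mask = (
    style_pad_mask_raw.unsqueeze(0).to(device)
    if style_pad_mask_raw is not None
    else None
)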

(dreamtalk38) D:\techy\talkingHeads\dreamtalk>python testGpu.py
10.0.0
WAV Path: D:\techy\talkingHeads\dreamtalk\data\audio\acknowledgement_english.m4a
Output Path: D:\techy\talkingHeads\dreamtalk\tmp\acknowledgement_english@M030_front_neutral_level1_001@male_face\acknowledgement_english@M030_front_neutral_level1_001@male_face_16K.wav
PyTorch Version: 2.3.1
CUDA Available: True
CUDA Version: 11.8
Device Name: NVIDIA GeForce RTX 3060
number of devices 1
NumPy Version: 1.22.4
SciPy Version: 1.7.3
Torchaudio Version: 2.3.1
OpenCV Version: 4.4.0
Available backends after updating PATH: ['soundfile']
Python 3.8.19

Hope this helps someone. Here is the full script:

import argparse
import os
import shutil
import subprocess
import numpy as np
import torch
import librosa
from scipy.io import loadmat
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model

from configs.default import get_cfg_defaults
from core.networks.diffusion_net import DiffusionNet
from core.networks.diffusion_util import NoisePredictor, VarianceSchedule
from core.utils import (
    crop_src_image,
    get_pose_params,
    get_video_style_clip,
    get_wav2vec_audio_window,
)
from generators.utils import get_netG, render_video

def print_tensor_device(tensor, tensor_name):
    if isinstance(tensor, torch.Tensor):
        print(f"{tensor_name} is on device: {tensor.device}")
    elif isinstance(tensor, (list, tuple)):
        for i, t in enumerate(tensor):
            if isinstance(t, torch.Tensor):
                print(f"{tensor_name}[{i}] is on device: {t.device}")
            else:
                print(f"{tensor_name}[{i}] is not a tensor")
    else:
        print(f"{tensor_name} is not a tensor")

@torch.no_grad()
def get_diff_net(cfg, device):
    diff_net = DiffusionNet(
        cfg=cfg,
        net=NoisePredictor(cfg),
        var_sched=VarianceSchedule(
            num_steps=cfg.DIFFUSION.SCHEDULE.NUM_STEPS,
            beta_1=cfg.DIFFUSION.SCHEDULE.BETA_1,
            beta_T=cfg.DIFFUSION.SCHEDULE.BETA_T,
            mode=cfg.DIFFUSION.SCHEDULE.MODE,
        ),
    )
    checkpoint = torch.load(cfg.INFERENCE.CHECKPOINT, map_location=device)
    model_state_dict = checkpoint["model_state_dict"]
    diff_net_dict = {
        k[9:]: v for k, v in model_state_dict.items() if k.startswith("diff_net.")
    }
    diff_net.load_state_dict(diff_net_dict, strict=True)
    diff_net.to(device).eval()  # Ensure model is on the correct device
    return diff_net

@torch.no_grad()
def get_audio_feat(wav_path, output_name, wav2vec_model):
    # Placeholder function (not used here; audio features are computed in the main block below)
    pass

@torch.no_grad()
def inference_one_video(
    cfg,
    audio_path,
    style_clip_path,
    pose_path,
    output_path,
    diff_net,
    device,
    max_audio_len=None,
    sample_method="ddim",
    ddim_num_step=10,
):
    audio_raw = np.load(audio_path)
    if max_audio_len is not None:
        audio_raw = audio_raw[:max_audio_len * 50]
    gen_num_frames = len(audio_raw) // 2

    audio_win_array = get_wav2vec_audio_window(
        audio_raw,
        start_idx=0,
        num_frames=gen_num_frames,
        win_size=cfg.WIN_SIZE,
    )

    audio_win = torch.tensor(audio_win_array).to(device)
    audio = audio_win.unsqueeze(0).to(device)
    print_tensor_device(audio, "audio")

    style_clip_raw, style_pad_mask_raw = get_video_style_clip(
        style_clip_path, "", style_max_len=256, start_idx=0
    )

    style_clip = style_clip_raw.unsqueeze(0).to(device)
    print_tensor_device(style_clip, "style_clip")
    style_pad_mask = (
        style_pad_mask_raw.unsqueeze(0).to(device)
        if style_pad_mask_raw is not None
        else None
    )
    print_tensor_device(style_pad_mask, "style_pad_mask")

    # Ensure all inputs are on the same device
    gen_exp_stack = diff_net.sample(
        audio,
        style_clip,
        style_pad_mask,
        output_dim=cfg.DATASET.FACE3D_DIM,
        use_cf_guidance=cfg.CF_GUIDANCE.INFERENCE,
        cfg_scale=cfg.CF_GUIDANCE.SCALE,
        sample_method=sample_method,
        ddim_num_step=ddim_num_step,
    )
    gen_exp = gen_exp_stack[0].cpu().numpy()

    pose = get_pose_params(pose_path)
    selected_pose = pose[: len(gen_exp)] if len(pose) >= len(gen_exp) else pose[-1].unsqueeze(0).repeat(len(gen_exp), 1)
    gen_exp_pose = np.concatenate((gen_exp, selected_pose), axis=1)
    np.save(output_path, gen_exp_pose)
    return output_path

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Inference for demo")
    parser.add_argument("--wav_path", type=str, required=True, help="Path to WAV file")
    parser.add_argument("--image_path", type=str, required=True, help="Path to image file")
    parser.add_argument("--style_clip_path", type=str, required=True, help="Path to style clip MAT file")
    parser.add_argument("--pose_path", type=str, required=True, help="Path to pose file")
    parser.add_argument("--max_gen_len", type=int, default=1000, help="Maximum length (in seconds) for generating videos")
    parser.add_argument("--cfg_scale", type=float, default=1.0, help="Scale of classifier-free guidance")
    parser.add_argument("--output_name", type=str, default="test", help="Name for the output")
    parser.add_argument("--device", type=str, choices=['cpu', 'cuda'], default="cpu", help="Device to use for computation")
    parser.add_argument("--disable_img_crop", dest="img_crop", action="store_false", help="Disable image cropping")
    parser.set_defaults(img_crop=True)

    args = parser.parse_args()

    if args.device == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available. Switching to CPU.")
        args.device = "cpu"

    device = torch.device(args.device)
    print(f"Device = {device}")
    cfg = get_cfg_defaults()
    cfg.CF_GUIDANCE.SCALE = args.cfg_scale
    cfg.freeze()

    tmp_dir = os.path.join("tmp", args.output_name)
    os.makedirs(tmp_dir, exist_ok=True)

    # Preprocess the audio file to WAV format with 16kHz sample rate
    wav_16k_path = os.path.join(tmp_dir, f"{args.output_name}_16K.wav")
    command = [
        "D:\\techy\\talkingHeads\\dreamtalk\\ffmpeg\\bin\\ffmpeg.exe",
        "-y",
        "-i", args.wav_path,
        "-async", "1",
        "-ac", "1",
        "-vn",
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        wav_16k_path
    ]
    print(f"Executing command: {' '.join(command)}")
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error executing command: {e}")
        exit(1)

    # Load and process audio using librosa
    wav2vec_processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    wav2vec_model = (
        Wav2Vec2Model.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
        .eval()
        .to(device)
    )

    try:
        audio_data, _ = librosa.load(wav_16k_path, sr=16000)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        exit(1)

    inputs = wav2vec_processor(audio_data, sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        audio_embedding = wav2vec_model(inputs.input_values.to(device), return_dict=False)[0]

    audio_feat_path = os.path.join(tmp_dir, f"{args.output_name}_wav2vec.npy")
    np.save(audio_feat_path, audio_embedding[0].cpu().numpy())

    # Get source image
    src_img_path = os.path.join(tmp_dir, "src_img.png")
    if args.img_crop:
        crop_src_image(args.image_path, src_img_path, 0.4)
    else:
        shutil.copy(args.image_path, src_img_path)

    with torch.no_grad():
        # Get diffusion model and load checkpoint
        diff_net = get_diff_net(cfg, device)
        # Generate face motion
        face_motion_path = os.path.join(tmp_dir, f"{args.output_name}_facemotion.npy")
        inference_one_video(
            cfg,
            audio_feat_path,
            args.style_clip_path,
            args.pose_path,
            face_motion_path,
            diff_net,
            device,
            max_audio_len=args.max_gen_len,
        )
        # Get renderer
        renderer = get_netG("checkpoints/renderer.pt", device)
        # Render video
        output_video_path = os.path.join("output_video", f"{args.output_name}.mp4")
        render_video(
            renderer,
            src_img_path,
            face_motion_path,
            wav_16k_path,
            output_video_path,
            device,
            fps=25,
            no_move=False,
        )

        # Add watermark
        watermark = os.path.join(tmp_dir, f"{args.output_name}_watermarked.mp4")
        watermark_command = [
            "ffmpeg",
            "-i", output_video_path,
            "-vf", "drawtext=text='Your Watermark Text':x=10:y=10:fontsize=24:fontcolor=white",
            "-codec:a", "copy",
            watermark
        ]
        try:
            subprocess.run(watermark_command, check=True)
        except subprocess.CalledProcessError as e:
            print(f"Error adding watermark: {e}")

print(f"Processing complete. Output video saved to {watermark}")