ali-vilab / dreamtalk

Official implementation of the paper: DreamTalk: When Expressive Talking Head Generation Meets Diffusion Probabilistic Models
https://dreamtalk-project.github.io/
MIT License

Error when using Nvidia GPU. #50

Open Toolfolks opened 1 month ago

Toolfolks commented 1 month ago

I have this command working okay:

python inference_for_demo_video.py --wav_path data/audio/acknowledgement_english.m4a --style_clip_path data/style_clip/3DMM/M030_front_neutral_level1_001.mat --pose_path data/pose/RichardShelby_front_neutral_level1_001.mat --image_path data/src_img/uncropped/male_face.png --cfg_scale 1.0 --max_gen_len 30 --output_name acknowledgement_english@M030_front_neutral_level1_001@male_face --device cuda

(new_dreamtalk) D:\techy\talkingHeads\dreamtalk>python testGpu.py
WAV Path: D:\techy\talkingHeads\dreamtalk\data\audio\acknowledgement_english.m4a
Output Path: D:\techy\talkingHeads\dreamtalk\tmp\acknowledgement_english@M030_front_neutral_level1_001@male_face\acknowledgement_english@M030_front_neutral_level1_001@male_face_16K.wav
PyTorch Version: 2.3.1+cpu
CUDA Available: False
CUDA Version: None
No CUDA device found.
NumPy Version: 1.22.4
SciPy Version: 1.13.1
Torchaudio Version: 2.3.1+cpu
OpenCV Version: 4.4.0
Available backends after updating PATH: ['soundfile']

However, when I switch to an environment with a GPU:

(dreamtalk) D:\techy\talkingHeads\dreamtalk>python testGpu.py
WAV Path: D:\techy\talkingHeads\dreamtalk\data\audio\acknowledgement_english.m4a
Output Path: D:\techy\talkingHeads\dreamtalk\tmp\acknowledgement_english@M030_front_neutral_level1_001@male_face\acknowledgement_english@M030_front_neutral_level1_001@male_face_16K.wav
PyTorch Version: 2.3.1+cu121
CUDA Available: True
CUDA Version: 12.1
Device Name: NVIDIA GeForce RTX 3060
NumPy Version: 1.22.4
SciPy Version: 1.10.0
Torchaudio Version: 2.3.1+cu121
OpenCV Version: 4.10.0
Available backends after updating PATH: ['soundfile']

I get this error:

Traceback (most recent call last):
  File "inference_for_demo_video.py", line 187, in <module>
    inference_one_video(
  File "C:\Users\User\.conda\envs\dreamtalk\lib\site-packages\torch\utils\_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "inference_for_demo_video.py", line 88, in inference_one_video
    gen_exp_stack = diff_net.sample(
  File "D:\techy\talkingHeads\dreamtalk\core\networks\diffusion_net.py", line 216, in sample
    return self.ddim_sample(
  File "D:\techy\talkingHeads\dreamtalk\core\networks\diffusion_net.py", line 144, in ddim_sample
    "style_clip": torch.cat([style_clip, uncond_style_clip], dim=0),
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument tensors in method wrapper_CUDA_cat)
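As far as I understand, torch.cat fails like this whenever its inputs sit on different devices. A tiny standalone example (toy tensors only, nothing DreamTalk-specific; needs a working CUDA build to reproduce):

import torch

a = torch.zeros(2, 3, device="cuda")  # on the GPU
b = torch.zeros(2, 3)                 # still on the CPU
torch.cat([a, b], dim=0)              # RuntimeError: Expected all tensors to be on the same device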

I'm new to programming, but I have spent hours going round in circles with ChatGPT.

Anyone have a solution?

Toolfolks commented 1 month ago

I finally got it going after 2-3 days. It was a pain getting all the installs done. The key change was calling .to(device) on the model and on every input tensor (see the sketch just below; the full script follows).
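Roughly, the fix is making sure everything that goes into diff_net.sample lives on the same device as the model. A sketch of the pattern, using the variable names from the full script below (not a drop-in patch on its own):

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

diff_net = get_diff_net(cfg, device)                  # model weights moved onto device inside get_diff_net
audio = audio_win.unsqueeze(0).to(device)             # wav2vec features on the same device
style_clip = style_clip_raw.unsqueeze(0).to(device)   # style clip on the same device
style_pad_mask = (
    style_pad_mask_raw.unsqueeze(0).to(device)
    if style_pad_mask_raw is not None
    else None
)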

(dreamtalk38) D:\techy\talkingHeads\dreamtalk>python testGpu.py
10.0.0
WAV Path: D:\techy\talkingHeads\dreamtalk\data\audio\acknowledgement_english.m4a
Output Path: D:\techy\talkingHeads\dreamtalk\tmp\acknowledgement_english@M030_front_neutral_level1_001@male_face\acknowledgement_english@M030_front_neutral_level1_001@male_face_16K.wav
PyTorch Version: 2.3.1
CUDA Available: True
CUDA Version: 11.8
Device Name: NVIDIA GeForce RTX 3060
number of devices 1
NumPy Version: 1.22.4
SciPy Version: 1.7.3
Torchaudio Version: 2.3.1
OpenCV Version: 4.4.0
Available backends after updating PATH: ['soundfile']
Python 3.8.19

Hope this helps someone. Here is the full script:

import argparse
import os
import shutil
import subprocess
import numpy as np
import torch
import librosa
from scipy.io import loadmat
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model

from configs.default import get_cfg_defaults
from core.networks.diffusion_net import DiffusionNet
from core.networks.diffusion_util import NoisePredictor, VarianceSchedule
from core.utils import (
    crop_src_image,
    get_pose_params,
    get_video_style_clip,
    get_wav2vec_audio_window,
)
from generators.utils import get_netG, render_video

def print_tensor_device(tensor, tensor_name):
    if isinstance(tensor, torch.Tensor):
        print(f"{tensor_name} is on device: {tensor.device}")
    elif isinstance(tensor, (list, tuple)):
        for i, t in enumerate(tensor):
            if isinstance(t, torch.Tensor):
                print(f"{tensor_name}[{i}] is on device: {t.device}")
            else:
                print(f"{tensor_name}[{i}] is not a tensor")
    else:
        print(f"{tensor_name} is not a tensor")

@torch.no_grad()
def get_diff_net(cfg, device):
    diff_net = DiffusionNet(
        cfg=cfg,
        net=NoisePredictor(cfg),
        var_sched=VarianceSchedule(
            num_steps=cfg.DIFFUSION.SCHEDULE.NUM_STEPS,
            beta_1=cfg.DIFFUSION.SCHEDULE.BETA_1,
            beta_T=cfg.DIFFUSION.SCHEDULE.BETA_T,
            mode=cfg.DIFFUSION.SCHEDULE.MODE,
        ),
    )
    checkpoint = torch.load(cfg.INFERENCE.CHECKPOINT, map_location=device)
    model_state_dict = checkpoint["model_state_dict"]
    diff_net_dict = {
        k[9:]: v for k, v in model_state_dict.items() if k.startswith("diff_net.")
    }
    diff_net.load_state_dict(diff_net_dict, strict=True)
    diff_net.to(device).eval()  # Ensure model is on the correct device
    return diff_net

@torch.no_grad()
def get_audio_feat(wav_path, output_name, wav2vec_model):
    # Placeholder function (not used here; audio features are computed in the main block below)
    pass

@torch.no_grad()
def inference_one_video(
    cfg,
    audio_path,
    style_clip_path,
    pose_path,
    output_path,
    diff_net,
    device,
    max_audio_len=None,
    sample_method="ddim",
    ddim_num_step=10,
):
    audio_raw = np.load(audio_path)
    if max_audio_len is not None:
        audio_raw = audio_raw[:max_audio_len * 50]
    gen_num_frames = len(audio_raw) // 2

    audio_win_array = get_wav2vec_audio_window(
        audio_raw,
        start_idx=0,
        num_frames=gen_num_frames,
        win_size=cfg.WIN_SIZE,
    )

    audio_win = torch.tensor(audio_win_array).to(device)
    audio = audio_win.unsqueeze(0).to(device)
    print_tensor_device(audio, "audio")

    style_clip_raw, style_pad_mask_raw = get_video_style_clip(
        style_clip_path, "", style_max_len=256, start_idx=0
    )

    style_clip = style_clip_raw.unsqueeze(0).to(device)
    print_tensor_device(style_clip, "style_clip")
    style_pad_mask = (
        style_pad_mask_raw.unsqueeze(0).to(device)
        if style_pad_mask_raw is not None
        else None
    )
    print_tensor_device(style_pad_mask, "style_pad_mask")

    # Ensure all inputs are on the same device
    gen_exp_stack = diff_net.sample(
        audio,
        style_clip,
        style_pad_mask,
        output_dim=cfg.DATASET.FACE3D_DIM,
        use_cf_guidance=cfg.CF_GUIDANCE.INFERENCE,
        cfg_scale=cfg.CF_GUIDANCE.SCALE,
        sample_method=sample_method,
        ddim_num_step=ddim_num_step,
    )
    gen_exp = gen_exp_stack[0].cpu().numpy()

    pose = get_pose_params(pose_path)
    selected_pose = pose[: len(gen_exp)] if len(pose) >= len(gen_exp) else pose[-1].unsqueeze(0).repeat(len(gen_exp), 1)
    gen_exp_pose = np.concatenate((gen_exp, selected_pose), axis=1)
    np.save(output_path, gen_exp_pose)
    return output_path

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Inference for demo")
    parser.add_argument("--wav_path", type=str, required=True, help="Path to WAV file")
    parser.add_argument("--image_path", type=str, required=True, help="Path to image file")
    parser.add_argument("--style_clip_path", type=str, required=True, help="Path to style clip MAT file")
    parser.add_argument("--pose_path", type=str, required=True, help="Path to pose file")
    parser.add_argument("--max_gen_len", type=int, default=1000, help="Maximum length (in seconds) for generating videos")
    parser.add_argument("--cfg_scale", type=float, default=1.0, help="Scale of classifier-free guidance")
    parser.add_argument("--output_name", type=str, default="test", help="Name for the output")
    parser.add_argument("--device", type=str, choices=['cpu', 'cuda'], default="cpu", help="Device to use for computation")
    parser.add_argument("--disable_img_crop", dest="img_crop", action="store_false", help="Disable image cropping")
    parser.set_defaults(img_crop=True)

    args = parser.parse_args()

    if args.device == "cuda" and not torch.cuda.is_available():
        print("CUDA is not available. Switching to CPU.")
        args.device = "cpu"

    device = torch.device(args.device)
    print(f"Device = {device}")
    cfg = get_cfg_defaults()
    cfg.CF_GUIDANCE.SCALE = args.cfg_scale
    cfg.freeze()

    tmp_dir = os.path.join("tmp", args.output_name)
    os.makedirs(tmp_dir, exist_ok=True)

    # Preprocess the audio file to WAV format with 16kHz sample rate
    wav_16k_path = os.path.join(tmp_dir, f"{args.output_name}_16K.wav")
    command = [
        "D:\\techy\\talkingHeads\\dreamtalk\\ffmpeg\\bin\\ffmpeg.exe",
        "-y",
        "-i", args.wav_path,
        "-async", "1",
        "-ac", "1",
        "-vn",
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        wav_16k_path
    ]
    print(f"Executing command: {' '.join(command)}")
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error executing command: {e}")
        exit(1)

    # Load and process audio using librosa
    wav2vec_processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    wav2vec_model = (
        Wav2Vec2Model.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
        .eval()
        .to(device)
    )

    try:
        audio_data, _ = librosa.load(wav_16k_path, sr=16000)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        exit(1)

    inputs = wav2vec_processor(audio_data, sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        audio_embedding = wav2vec_model(inputs.input_values.to(device), return_dict=False)[0]

    audio_feat_path = os.path.join(tmp_dir, f"{args.output_name}_wav2vec.npy")
    np.save(audio_feat_path, audio_embedding[0].cpu().numpy())

    # Get source image
    src_img_path = os.path.join(tmp_dir, "src_img.png")
    if args.img_crop:
        crop_src_image(args.image_path, src_img_path, 0.4)
    else:
        shutil.copy(args.image_path, src_img_path)

    with torch.no_grad():
        # Get diffusion model and load checkpoint
        diff_net = get_diff_net(cfg, device)
        # Generate face motion
        face_motion_path = os.path.join(tmp_dir, f"{args.output_name}_facemotion.npy")
        inference_one_video(
            cfg,
            audio_feat_path,
            args.style_clip_path,
            args.pose_path,
            face_motion_path,
            diff_net,
            device,
            max_audio_len=args.max_gen_len,
        )
        # Get renderer
        renderer = get_netG("checkpoints/renderer.pt", device)
        # Render video
        output_video_path = os.path.join("output_video", f"{args.output_name}.mp4")
        render_video(
            renderer,
            src_img_path,
            face_motion_path,
            wav_16k_path,
            output_video_path,
            device,
            fps=25,
            no_move=False,
        )

        # Add watermark
        watermark = os.path.join(tmp_dir, f"{args.output_name}_watermarked.mp4")
        watermark_command = [
            "ffmpeg",
            "-i", output_video_path,
            "-vf", "drawtext=text='Your Watermark Text':x=10:y=10:fontsize=24:fontcolor=white",
            "-codec:a", "copy",
            watermark
        ]
        try:
            subprocess.run(watermark_command, check=True)
        except subprocess.CalledProcessError as e:
            print(f"Error adding watermark: {e}")

print(f"Processing complete. Output video saved to {watermark}")