OpenTalker / SadTalker

[CVPR 2023] SadTalker:Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation
https://sadtalker.github.io/
Other
11.23k stars 2.1k forks source link

I have found a problem: I am not sure whether the video is expected to flicker when batch_size is modified. After modifying the code of this project, I found that the faces of the generated digital humans flicker #778

Open zhangnn520 opened 6 months ago

zhangnn520 commented 6 months ago

def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path,
                        batch_size, input_yaw_list=None, input_pitch_list=None,
                        input_roll_list=None, expression_scale=1.0,
                        still_mode = False, preprocess='crop', size = 256):
    """Assemble the input dictionary consumed by the face renderer.

    Loads the source picture and the two ``.mat`` coefficient files, builds
    batched source/target tensors, and returns them in a dict.

    Side effect: the (scaled) per-frame coefficients are dumped as text to
    ``<coeff_path without extension>.txt``.

    Args:
        coeff_path: ``.mat`` file holding the generated ``'coeff_3dmm'`` array.
        pic_path: path of the source image.
        first_coeff_path: ``.mat`` file holding the source ``'coeff_3dmm'`` array.
        audio_path: stored unchanged in the result under ``'audio_path'``.
        batch_size: number of copies of the source tensors, and the leading
            dimension of the reshaped target tensor.
        input_yaw_list, input_pitch_list, input_roll_list: optional inputs
            forwarded to ``gen_camera_pose``; skipped when ``None``.
        expression_scale: multiplier applied to the first 64 coefficient
            columns of every generated frame.
        still_mode: when true, columns 64 onward of every generated frame are
            overwritten with the source's columns 64 onward.
        preprocess: if it contains ``'full'`` (case-insensitive), 73 source
            columns are kept and columns 70 onward are appended to each frame.
        size: side length the source image is resized to.

    Returns:
        dict with keys ``'source_image'``, ``'source_semantics'``,
        ``'frame_num'``, ``'target_semantics_list'``, ``'video_name'``,
        ``'audio_path'`` and, when the matching list is given,
        ``'yaw_c_seq'`` / ``'pitch_c_seq'`` / ``'roll_c_seq'``.
    """
    semantic_radius = 13
    coeff_stem = os.path.splitext(coeff_path)[0]
    video_name = os.path.splitext(os.path.basename(coeff_path))[0]

    data = {}

    # Source picture -> float image -> (size, size, 3) -> channels-first,
    # then one copy per batch element.
    picture = img_as_float32(np.array(Image.open(pic_path)))
    picture = transform.resize(picture, (size, size, 3)).transpose((2, 0, 1))
    data['source_image'] = (
        torch.FloatTensor(picture).unsqueeze(0).repeat(batch_size, 1, 1, 1)
    )

    source_mat = scio.loadmat(first_coeff_path)
    generated_mat = scio.loadmat(coeff_path)

    # 'full' preprocessing keeps three extra source columns (73 instead of
    # 70); the generated frames always start from their first 70 columns.
    use_full = 'full' in preprocess.lower()
    source_width = 73 if use_full else 70
    source_semantics = source_mat['coeff_3dmm'][:1, :source_width]
    generated_3dmm = generated_mat['coeff_3dmm'][:, :70]

    windowed_source = transform_semantic_1(source_semantics, semantic_radius)
    data['source_semantics'] = (
        torch.FloatTensor(windowed_source).unsqueeze(0).repeat(batch_size, 1, 1)
    )

    # Target frames: scale the first 64 columns of every frame.
    generated_3dmm[:, :64] = generated_3dmm[:, :64] * expression_scale

    if use_full:
        # Append the source's columns 70+ to every generated frame.
        extra_columns = np.repeat(
            source_semantics[:, 70:], generated_3dmm.shape[0], axis=0
        )
        generated_3dmm = np.concatenate([generated_3dmm, extra_columns], axis=1)

    if still_mode:
        # Pin columns 64+ of every frame to the source's values.
        generated_3dmm[:, 64:] = np.repeat(
            source_semantics[:, 64:], generated_3dmm.shape[0], axis=0
        )

    # Text dump of the coefficients: each value is cut to 7 characters.
    with open(coeff_stem + '.txt', 'w') as dump:
        for frame_coeffs in generated_3dmm:
            dump.write(
                ''.join(str(value)[:7] + '  ' + '\t' for value in frame_coeffs)
            )
            dump.write('\n')

    frame_num = generated_3dmm.shape[0]
    data['frame_num'] = frame_num
    target_semantics_list = [
        transform_semantic_target(generated_3dmm, idx, semantic_radius)
        for idx in range(frame_num)
    ]

    # Pad with copies of the last frame so the frame count is a multiple of
    # batch_size and the reshape below divides evenly.
    leftover = frame_num % batch_size
    if leftover != 0:
        target_semantics_list.extend(
            [target_semantics_list[-1]] * (batch_size - leftover)
        )

    # Presumably (frame_num, coeff_dim, 2 * semantic_radius + 1) before the
    # reshape, per the original author's note -- confirm against
    # transform_semantic_target.
    stacked = np.array(target_semantics_list)
    coeff_dim, window = stacked.shape[-2], stacked.shape[-1]
    # Row-major split: batch row b receives a contiguous run of frames.
    stacked = stacked.reshape(batch_size, -1, coeff_dim, window)
    data['target_semantics_list'] = torch.FloatTensor(stacked)
    data['video_name'] = video_name
    data['audio_path'] = audio_path

    # Optional camera-pose sequences, added only for the lists supplied.
    pose_inputs = (
        ('yaw_c_seq', input_yaw_list),
        ('pitch_c_seq', input_pitch_list),
        ('roll_c_seq', input_roll_list),
    )
    for key, pose_list in pose_inputs:
        if pose_list is not None:
            data[key] = torch.FloatTensor(
                gen_camera_pose(pose_list, frame_num, batch_size)
            )

    return data
zhangnn520 commented 6 months ago

Uploading 视频测试2##新年快乐_full.mp4…

zhangnn520 commented 6 months ago

This is the situation. I do not know whether the maintainers have run into it, and I do not know how to fix this bug. I want to speed up digital-human synthesis, but I am stuck at a batch size of 1.