KwaiVGI / LivePortrait

Bring portraits to life!
https://liveportrait.github.io

Driving image instead of driving video #229

Closed · killah-t-cell closed this issue 1 week ago

killah-t-cell commented 1 month ago

I wrote custom code to take in a driving image instead of video... it mostly works:

[Screenshots: three examples where the transfer works well]

but sometimes it suffers from poor restitching and facial deformation.

[Screenshots: three failure cases with poor restitching and facial deformation]

Here's my code

    def execute_face_transfer(self, args: ArgumentConfig):
        # for convenience
        inf_cfg = self.live_portrait_wrapper.inference_cfg
        crop_cfg = self.cropper.crop_cfg

        ######## load source input ########
        img_rgb = args.source  # Assuming args.source is already a numpy array
        img_rgb = resize_to_limit(img_rgb, inf_cfg.source_max_dim, inf_cfg.source_division)
        source_rgb_lst = [img_rgb]

        ######## process driving info ########
        driving_rgb = args.driving  # Assuming args.driving is already a numpy array
        driving_rgb = resize_to_limit(driving_rgb, inf_cfg.source_max_dim, inf_cfg.source_division)

        ######## prepare for pasteback ########
        flag_normalize_lip = inf_cfg.flag_normalize_lip
        lip_delta_before_animation = None

        ######## process source info ########
        if inf_cfg.flag_do_crop:
            crop_info = self.cropper.crop_source_image(source_rgb_lst[0], crop_cfg)
            if crop_info is None:
                raise Exception("No face detected in the source image!")
            source_lmk = crop_info['lmk_crop']
            img_crop_256x256 = crop_info['img_crop_256x256']
        else:
            source_lmk = self.cropper.calc_lmk_from_cropped_image(source_rgb_lst[0])
            img_crop_256x256 = cv2.resize(source_rgb_lst[0], (256, 256))

        I_s = self.live_portrait_wrapper.prepare_source(img_crop_256x256)
        x_s_info = self.live_portrait_wrapper.get_kp_info(I_s)
        x_c_s = x_s_info['kp']
        R_s = get_rotation_matrix(x_s_info['pitch'], x_s_info['yaw'], x_s_info['roll'])
        f_s = self.live_portrait_wrapper.extract_feature_3d(I_s)
        x_s = self.live_portrait_wrapper.transform_keypoint(x_s_info)

        if flag_normalize_lip and inf_cfg.flag_relative_motion and source_lmk is not None:
            c_d_lip_before_animation = [0.]
            combined_lip_ratio_tensor_before_animation = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip_before_animation, source_lmk)
            if combined_lip_ratio_tensor_before_animation[0][0] >= inf_cfg.lip_normalize_threshold:
                lip_delta_before_animation = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor_before_animation)

        if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:
            mask_ori_float = prepare_paste_back(inf_cfg.mask_crop, crop_info['M_c2o'], dsize=(source_rgb_lst[0].shape[1], source_rgb_lst[0].shape[0]))

        ######## process driving info ########
        driving_crop_info = self.cropper.crop_source_image(driving_rgb, crop_cfg)
        if driving_crop_info is None:
            raise Exception("No face detected in the driving image!")
        driving_img_crop_256x256 = driving_crop_info['img_crop_256x256']

        I_d = self.live_portrait_wrapper.prepare_source(driving_img_crop_256x256)
        x_d_info = self.live_portrait_wrapper.get_kp_info(I_d)

        ######## animate (single frame) ########
        R_d = x_d_info['R'] if 'R' in x_d_info.keys() else get_rotation_matrix(x_d_info['pitch'], x_d_info['yaw'], x_d_info['roll'])

        if inf_cfg.flag_relative_motion:
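            # NOTE: with a single driving image, the source is effectively the reference
            # frame, so these "relative" terms reduce to the absolute driving values
            # (e.g. x_s + (x_d - x_s) == x_d, and (R_d @ R_s^T) @ R_s == R_d).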
            R_new = (R_d @ R_s.permute(0, 2, 1)) @ R_s
            delta_new = x_s_info['exp'] + (x_d_info['exp'] - x_s_info['exp'])
            scale_new = x_s_info['scale'] * (x_d_info['scale'] / x_s_info['scale'])
            t_new = x_s_info['t'] + (x_d_info['t'] - x_s_info['t'])
        else:
            R_new = R_d
            delta_new = x_d_info['exp']
            scale_new = x_s_info['scale']
            t_new = x_d_info['t']

        t_new[..., 2].fill_(0)  # zero tz
        x_d_i_new = scale_new * (x_c_s @ R_new + delta_new) + t_new

        # Algorithm 1:
        if not inf_cfg.flag_stitching and not inf_cfg.flag_eye_retargeting and not inf_cfg.flag_lip_retargeting:
            # without stitching or retargeting
            if flag_normalize_lip and lip_delta_before_animation is not None:
                x_d_i_new += lip_delta_before_animation
        elif inf_cfg.flag_stitching and not inf_cfg.flag_eye_retargeting and not inf_cfg.flag_lip_retargeting:
            # with stitching and without retargeting
            if flag_normalize_lip and lip_delta_before_animation is not None:
                x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new) + lip_delta_before_animation
            else:
                x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new)
        else:
            eyes_delta, lip_delta = None, None
            if inf_cfg.flag_eye_retargeting and source_lmk is not None:
                c_d_eyes = self.live_portrait_wrapper.calc_ratio([driving_crop_info['lmk_crop']])[0]
                combined_eye_ratio_tensor = self.live_portrait_wrapper.calc_combined_eye_ratio(c_d_eyes, source_lmk)
                eyes_delta = self.live_portrait_wrapper.retarget_eye(x_s, combined_eye_ratio_tensor)
            if inf_cfg.flag_lip_retargeting and source_lmk is not None:
                c_d_lip = self.live_portrait_wrapper.calc_ratio([driving_crop_info['lmk_crop']])[1]
                combined_lip_ratio_tensor = self.live_portrait_wrapper.calc_combined_lip_ratio(c_d_lip, source_lmk)
                lip_delta = self.live_portrait_wrapper.retarget_lip(x_s, combined_lip_ratio_tensor)

            if inf_cfg.flag_relative_motion:
                x_d_i_new = x_s + (eyes_delta if eyes_delta is not None else 0) + (lip_delta if lip_delta is not None else 0)
            else:
                x_d_i_new = x_d_i_new + (eyes_delta if eyes_delta is not None else 0) + (lip_delta if lip_delta is not None else 0)

            if inf_cfg.flag_stitching:
                x_d_i_new = self.live_portrait_wrapper.stitching(x_s, x_d_i_new)

        out = self.live_portrait_wrapper.warp_decode(f_s, x_s, x_d_i_new)
        I_p = self.live_portrait_wrapper.parse_output(out['out'])[0]

        # Convert BGR to RGB
        I_p_rgb = cv2.cvtColor(I_p, cv2.COLOR_BGR2RGB)
        source_rgb = cv2.cvtColor(source_rgb_lst[0], cv2.COLOR_BGR2RGB)

        if inf_cfg.flag_pasteback and inf_cfg.flag_do_crop and inf_cfg.flag_stitching:
            I_p_pstbk = paste_back(I_p_rgb, crop_info['M_c2o'], source_rgb, mask_ori_float)
            return I_p_pstbk
        else:
            return I_p_rgb

Is there anything I am doing wrong? Is the problem that my faces aren't always frontal? Also, is there a way to manipulate more emotions through sliders, not only the eyes and mouth? I noticed the "exp" is simply read from the driving video, and I figured it could be made interpretable and controllable as well!
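For the slider question, one possible approach (a sketch only, not part of LivePortrait's API; `expression_slider` is a hypothetical parameter) is to blend the driving expression into the source expression before constructing x_d_i_new:

    # Hypothetical sketch, not LivePortrait API: blend the driving expression into the
    # source expression with a user-controlled slider in [0, 1].
    def blend_expression(x_s_exp, x_d_exp, slider):
        # slider = 0.0 keeps the source expression; 1.0 applies the full driving expression
        return x_s_exp + slider * (x_d_exp - x_s_exp)

    # inside execute_face_transfer, replacing the delta_new line:
    # delta_new = blend_expression(x_s_info['exp'], x_d_info['exp'], expression_slider)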

aihacker111 commented 1 month ago

@killah-t-cell It seems like the keypoints come from both the source image and the driving image you use: the result is controlled by the landmarks of the driving image and then pasted back onto the original image. You can swap the two images and compare the two results. By the way, great work!

You can try it: https://www.researchgate.net/figure/General-overview-of-pose-transfer-using-the-proposed-method_fig1_358603007
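A quick sketch of that swap-and-compare test (placeholder names, not repo API; it assumes the custom execute_face_transfer from the first post):

    # `pipeline`, `src_img`, and `drv_img` are placeholders for the caller's own objects.
    args.source, args.driving = src_img, drv_img
    out_forward = pipeline.execute_face_transfer(args)   # src animated by drv

    args.source, args.driving = drv_img, src_img         # reversed
    out_reversed = pipeline.execute_face_transfer(args)  # drv animated by src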

Mayorc1978 commented 1 month ago

Nice. Try talking with the author of this issue: https://github.com/KwaiVGI/LivePortrait/issues/216

YoungofNUAA commented 1 month ago

(Quotes the original post and code in full.)

Have you solved this problem yet?

killah-t-cell commented 1 month ago

I did, actually. I made it so the driving image's expression is scaled iteratively toward the desired emotion (almost like mimicking the frames of a driving video), and that did much better.
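Roughly, that idea can be sketched as stepping the expression (and translation) from the source values toward the driving values over several intermediate steps and rendering each one. A minimal sketch, assuming the x_s_info / x_d_info dicts from the snippet above and a hypothetical `render_frame` wrapper around the x_d_i_new / warp_decode / parse_output steps (this is not the author's actual code):

    # Sketch: ramp from the source expression to the driving expression over n_steps.
    def iterative_transfer(x_s_info, x_d_info, render_frame, n_steps=10):
        result = None
        for i in range(1, n_steps + 1):
            alpha = i / n_steps  # fraction of the driving expression applied this step
            exp_i = x_s_info['exp'] + alpha * (x_d_info['exp'] - x_s_info['exp'])
            t_i = x_s_info['t'] + alpha * (x_d_info['t'] - x_s_info['t'])
            result = render_frame(exp_i, t_i)  # hypothetical rendering helper
        return result  # the final frame carries the full target expression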

YoungofNUAA commented 1 month ago

(Quotes the reply above.)

Can you share the latest code? I am new to this.

YoungofNUAA commented 1 month ago

Hi, still looking forward to your reply.

zzzweakman commented 1 week ago

The newly added feature now supports this functionality, so this issue is closed.