williamyang1991 / VToonify

[SIGGRAPH Asia 2022] VToonify: Controllable High-Resolution Portrait Video Style Transfer

How to apply VToonify to multiple people in this project? #78

Open Ohjunghh opened 1 month ago

Ohjunghh commented 1 month ago

I tried VToonify on multiple people, but the output video looks weird: the faces are not recognized properly and the style gets applied to the background. Is there a way to run VToonify on multiple people? How should I modify style_transfer.py?

williamyang1991 commented 1 month ago

Our project is mainly designed to toonify a single person. To stylize multiple people, you could crop each person out, toonify each crop separately, and fuse the results back into a single frame.
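
For example, a rough sketch of the fusing step (not tested; the helper name, box format, and feather width are just illustrative, not part of this repo). It blends each stylized crop back into the frame with a feathered mask instead of a hard rectangular paste:

import cv2
import numpy as np

def fuse_stylized_face(frame, stylized_face, box, feather=31):
    # Illustrative helper: blend a stylized crop back into the full BGR frame
    # with a soft mask so the transition to the untouched background is gradual.
    # feather should stay small relative to the box size.
    x1, y1, x2, y2 = box
    h, w = y2 - y1, x2 - x1
    face = cv2.resize(stylized_face, (w, h)).astype(np.float32)

    # soft alpha mask: 1 in the center of the box, fading to 0 at its borders
    mask = np.zeros((h, w), dtype=np.float32)
    margin = feather // 2
    mask[margin:h - margin, margin:w - margin] = 1.0
    mask = cv2.GaussianBlur(mask, (feather, feather), 0)[..., None]

    roi = frame[y1:y2, x1:x2].astype(np.float32)
    frame[y1:y2, x1:x2] = (mask * face + (1.0 - mask) * roi).astype(np.uint8)
    return frame

cv2.seamlessClone is another option if you prefer a Poisson blend over a simple alpha mask.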

Ohjunghh commented 1 month ago

The boundary of the stylized square region is too visible. Can you spot the problem in this code?

import os
import argparse
import numpy as np
import cv2
import torch
from torchvision import transforms
import torch.nn.functional as F
from tqdm import tqdm
from model.vtoonify import VToonify
from util import save_image, tensor2cv2, load_psp_standalone
from PIL import Image
import dlib
from model.bisenet.model import BiSeNet

class TestOptions():
    def __init__(self):
        self.parser = argparse.ArgumentParser(description="Style Transfer")
        self.parser.add_argument("--content", type=str, default='./data/077436.jpg', help="path of the content image/video")
        self.parser.add_argument("--style_id", type=int, default=26, help="the id of the style image")
        self.parser.add_argument("--style_degree", type=float, default=0.5, help="style degree for VToonify-D")
        self.parser.add_argument("--color_transfer", action="store_true", help="transfer the color of the style")
        self.parser.add_argument("--ckpt", type=str, default='./checkpoint/vtoonify_d_cartoon/vtoonify_s_d_c.pt', help="path of the saved model")
        self.parser.add_argument("--output_path", type=str, default='./output/', help="path of the output images")
        self.parser.add_argument("--style_encoder_path", type=str, default='./checkpoint/encoder.pt', help="path of the style encoder")
        self.parser.add_argument("--exstyle_path", type=str, default=None, help="path of the extrinsic style code")
        self.parser.add_argument("--faceparsing_path", type=str, default='./checkpoint/faceparsing.pth', help="path of the face parsing model")
        self.parser.add_argument("--video", action="store_true", help="if true, video stylization; if false, image stylization")
        self.parser.add_argument("--cpu", action="store_true", help="if true, only use cpu")
        self.parser.add_argument("--backbone", type=str, default='dualstylegan', help="dualstylegan | toonify")
        self.parser.add_argument("--batch_size", type=int, default=4, help="batch size of frames when processing video")
        self.parser.add_argument("--yolo_model_path", type=str, default='../face_recognition_2/627.pt', help="path to the YOLO model")

    def parse(self):
        self.opt = self.parser.parse_args()
        args = vars(self.opt)
        print('Load options')
        for name, value in sorted(args.items()):
            print(f'{name}: {value}')
        return self.opt

def detect_faces_yolo(model, img, confidence_t=0.5, face_class=4):
    # YOLOv5 hub models expect RGB input; keep only detections of the face class
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    results = model(img_rgb)

    faces = []
    for det in results.xyxy[0]:
        x1, y1, x2, y2, conf, cls = det
        cls = int(cls)
        if conf >= confidence_t and cls == face_class:
            faces.append((int(x1), int(y1), int(x2), int(y2)))
    return faces

def detect_landmarks_dlib(image, predictor, x1, y1, x2, y2):
    face_roi = image[y1:y2, x1:x2]
    gray = cv2.cvtColor(face_roi, cv2.COLOR_BGR2GRAY)

    detector = dlib.get_frontal_face_detector()
    rects = detector(gray, 1)

    if len(rects) == 0:
        print("No landmarks detected in the face region")
        return None

    # if dlib finds several rectangles in the ROI, only the last one's landmarks are kept
    for rect in rects:
        shape = predictor(gray, rect)
        landmarks = np.array([[p.x, p.y] for p in shape.parts()])

    # shift the ROI-relative landmarks back to full-frame coordinates
    landmarks[:, 0] += x1
    landmarks[:, 1] += y1

    return landmarks

def align_face(image, landmarks):
    # FFHQ-style alignment: build an oriented crop quad from the eye and mouth landmarks
    lm_eye_left = landmarks[36:42]
    lm_eye_right = landmarks[42:48]

    eye_left = np.mean(lm_eye_left, axis=0)
    eye_right = np.mean(lm_eye_right, axis=0)
    eye_avg = (eye_left + eye_right) * 0.5
    eye_to_eye = eye_right - eye_left
    mouth_avg = (landmarks[48] + landmarks[54]) * 0.5
    eye_to_mouth = mouth_avg - eye_avg

    x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
    x /= np.hypot(*x)
    x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
    y = np.flipud(x) * [-1, 1]
    c = eye_avg + eye_to_mouth * 0.1
    quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
    qsize = np.hypot(*x) * 2

    # OpenCV frames are BGR; convert to RGB before building the PIL image fed to the encoder
    img = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    img = img.transform((256, 256), Image.QUAD, (quad + 0.5).flatten(), Image.BILINEAR)

    return img

if __name__ == "__main__":
    parser = TestOptions()
    args = parser.parse()
    print('*'*98)

    device = "cpu" if args.cpu else "cuda"

    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])

    confidence_t = 0.5
    yolo_model = torch.hub.load('ultralytics/yolov5', 'custom', path=args.yolo_model_path, force_reload=True).to(device)
    yolo_model.conf = confidence_t
    yolo_model.classes = None
    yolo_model.agnostic_nms = False

    vtoonify = VToonify(backbone=args.backbone)
    vtoonify.load_state_dict(torch.load(args.ckpt, map_location=lambda storage, loc: storage)['g_ema'])
    vtoonify.to(device)

    pspencoder = load_psp_standalone(args.style_encoder_path, device)    

    if args.backbone == 'dualstylegan':
        exstyles = np.load(args.exstyle_path, allow_pickle='TRUE').item()
        stylename = list(exstyles.keys())[args.style_id]
        exstyle = torch.tensor(exstyles[stylename]).to(device)
        with torch.no_grad():
            exstyle = vtoonify.zplus2wplus(exstyle)

    print('Load models successfully!')

    filename = args.content
    basename = os.path.basename(filename).split('.')[0]
    print(f'Processing {filename} with vtoonify_{args.backbone[0]}')

    predictor = dlib.shape_predictor('./checkpoint/shape_predictor_68_face_landmarks.dat')

    if args.video:
        video_cap = cv2.VideoCapture(filename)
        num = int(video_cap.get(cv2.CAP_PROP_FRAME_COUNT))

        output_video_path = os.path.join(args.output_path, f"{basename}_stylized.mp4")
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        fps = int(video_cap.get(cv2.CAP_PROP_FPS))
        frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        videoWriter = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

        parsingpredictor = BiSeNet(n_classes=19)
        parsingpredictor.load_state_dict(torch.load(args.faceparsing_path, map_location=lambda storage, loc: storage))
        parsingpredictor.to(device).eval()

        for i in tqdm(range(num)):
            success, frame = video_cap.read()
            if not success:
                break

            frame_copy = frame.copy()

            faces = detect_faces_yolo(yolo_model, frame, confidence_t)
            if not faces:
                # frames from cv2.VideoCapture are already BGR; write them unchanged
                videoWriter.write(frame)
                continue

            for (x1, y1, x2, y2) in faces:
                landmarks = detect_landmarks_dlib(frame, predictor, x1, y1, x2, y2)
                if landmarks is None:
                    continue

                aligned_face = align_face(frame, landmarks)
                face_tensor = transform(aligned_face).unsqueeze(dim=0).to(device)

                with torch.no_grad():
                    x_p = F.interpolate(parsingpredictor(2 * (F.interpolate(face_tensor, scale_factor=2, mode='bilinear', align_corners=False)))[0],
                                        scale_factor=0.5, recompute_scale_factor=False).detach()

                inputs = torch.cat((face_tensor, x_p / 16.), dim=1)

                with torch.no_grad():
                    s_w = pspencoder(face_tensor)
                    s_w = vtoonify.zplus2wplus(s_w)
                    if args.backbone == 'dualstylegan':
                        s_w[:, :7] = exstyle[:, :7]

                    y_tilde = vtoonify(inputs, s_w.repeat(inputs.size(0), 1, 1), d_s=args.style_degree)
                    y_tilde = torch.clamp(y_tilde, -1, 1)

                # tensor2cv2 returns an RGB image; convert back to BGR for OpenCV
                stylized_face_np = tensor2cv2(y_tilde[0].cpu())
                stylized_face_np_bgr = cv2.cvtColor(stylized_face_np, cv2.COLOR_RGB2BGR)

                # hard rectangular paste: the stylized crop is resized to the YOLO box
                # and copied over the original pixels without any blending
                frame_copy[y1:y2, x1:x2] = cv2.resize(stylized_face_np_bgr, (x2 - x1, y2 - y1))

            # frame_copy is already BGR, so it can be written directly
            videoWriter.write(frame_copy)

        videoWriter.release()
        video_cap.release()

    print('Transfer style successfully!')