Open Ohjunghh opened 1 month ago
Our project is mainly designed to toonify a single person. To stylize multiple people, maybe you should crop each person out and toonify seperately, and fuse the results back to a single frame
The boundary of the square vtoonify is too clear. Can you find the problem in this code?
import os
import argparse
import numpy as np
import cv2
import torch
from torchvision import transforms
import torch.nn.functional as F
from tqdm import tqdm
from model.vtoonify import VToonify
from util import save_image, tensor2cv2, load_psp_standalone
from PIL import Image
import dlib
from model.bisenet.model import BiSeNet
class TestOptions():
def __init__(self):
self.parser = argparse.ArgumentParser(description="Style Transfer")
self.parser.add_argument("--content", type=str, default='./data/077436.jpg', help="path of the content image/video")
self.parser.add_argument("--style_id", type=int, default=26, help="the id of the style image")
self.parser.add_argument("--style_degree", type=float, default=0.5, help="style degree for VToonify-D")
self.parser.add_argument("--color_transfer", action="store_true", help="transfer the color of the style")
self.parser.add_argument("--ckpt", type=str, default='./checkpoint/vtoonify_d_cartoon/', help="path of the saved model")
self.parser.add_argument("--output_path", type=str, default='./output/', help="path of the output images")
self.parser.add_argument("--style_encoder_path", type=str, default='./checkpoint/', help="path of the style encoder")
self.parser.add_argument("--exstyle_path", type=str, default=None, help="path of the extrinsic style code")
self.parser.add_argument("--faceparsing_path", type=str, default='./checkpoint/faceparsing.pth', help="path of the face parsing model")
self.parser.add_argument("--video", action="store_true", help="if true, video stylization; if false, image stylization")
self.parser.add_argument("--cpu", action="store_true", help="if true, only use cpu")
self.parser.add_argument("--backbone", type=str, default='dualstylegan', help="dualstylegan | toonify")
self.parser.add_argument("--batch_size", type=int, default=4, help="batch size of frames when processing video")
self.parser.add_argument("--yolo_model_path", type=str, default='../face_recognition_2/', help="path to the YOLO model")
def parse(self):
self.opt = self.parser.parse_args()
args = vars(self.opt)
print('Load options')
for name, value in sorted(args.items()):
print(f'{name}: {value}')
return self.opt
def detect_faces_yolo(model, img, confidence_t=0.5, face_class=4):
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
results = model(img_rgb)
faces = []
for det in results.xyxy[0]:
x1, y1, x2, y2, conf, cls = det
cls = int(cls)
if conf >= confidence_t and cls == face_class:
faces.append((int(x1), int(y1), int(x2), int(y2)))
return faces
def detect_landmarks_dlib(image, predictor, x1, y1, x2, y2):
face_roi = image[y1:y2, x1:x2]
gray = cv2.cvtColor(face_roi, cv2.COLOR_BGR2GRAY)
detector = dlib.get_frontal_face_detector()
rects = detector(gray, 1)
if len(rects) == 0:
print("No landmarks detected in the face region")
return None
for rect in rects:
shape = predictor(gray, rect)
landmarks = np.array([[p.x, p.y] for p in])
landmarks[:, 0] += x1
landmarks[:, 1] += y1
return landmarks
def align_face(image, landmarks):
lm_eye_left = landmarks[36:42]
lm_eye_right = landmarks[42:48]
eye_left = np.mean(lm_eye_left, axis=0)
eye_right = np.mean(lm_eye_right, axis=0)
eye_avg = (eye_left + eye_right) * 0.5
eye_to_eye = eye_right - eye_left
mouth_avg = (landmarks[48] + landmarks[54]) * 0.5
eye_to_mouth = mouth_avg - eye_avg
x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
x /= np.hypot(*x)
x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
y = np.flipud(x) * [-1, 1]
c = eye_avg + eye_to_mouth * 0.1
quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
qsize = np.hypot(*x) * 2
img = Image.fromarray(image)
img = img.transform((256, 256), Image.QUAD, (quad + 0.5).flatten(), Image.BILINEAR)
return img
if __name__ == "__main__":
parser = TestOptions()
args = parser.parse()
device = "cpu" if args.cpu else "cuda"
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])
confidence_t = 0.5
yolo_model = torch.hub.load('ultralytics/yolov5', 'custom', path=args.yolo_model_path, force_reload=True).to(device)
yolo_model.conf = confidence_t
yolo_model.classes = None
yolo_model.agnostic_nms = False
vtoonify = VToonify(backbone=args.backbone)
vtoonify.load_state_dict(torch.load(args.ckpt, map_location=lambda storage, loc: storage)['g_ema'])
pspencoder = load_psp_standalone(args.style_encoder_path, device)
if args.backbone == 'dualstylegan':
exstyles = np.load(args.exstyle_path, allow_pickle='TRUE').item()
stylename = list(exstyles.keys())[args.style_id]
exstyle = torch.tensor(exstyles[stylename]).to(device)
with torch.no_grad():
exstyle = vtoonify.zplus2wplus(exstyle)
print('Load models successfully!')
filename = args.content
basename = os.path.basename(filename).split('.')[0]
print(f'Processing {filename} with vtoonify_{args.backbone[0]}')
predictor = dlib.shape_predictor('./checkpoint/shape_predictor_68_face_landmarks.dat')
video_cap = cv2.VideoCapture(filename)
num = int(video_cap.get(cv2.CAP_PROP_FRAME_COUNT))
output_video_path = os.path.join(args.output_path, f"{basename}_stylized.mp4")
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = int(video_cap.get(cv2.CAP_PROP_FPS))
frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
videoWriter = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
parsingpredictor = BiSeNet(n_classes=19)
parsingpredictor.load_state_dict(torch.load(args.faceparsing_path, map_location=lambda storage, loc: storage))
for i in tqdm(range(num)):
success, frame =
if not success:
frame_copy = frame.copy()
faces = detect_faces_yolo(yolo_model, frame, confidence_t)
if not faces:
videoWriter.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
for (x1, y1, x2, y2) in faces:
landmarks = detect_landmarks_dlib(frame, predictor, x1, y1, x2, y2)
if landmarks is None:
aligned_face = align_face(frame, landmarks)
face_tensor = transform(aligned_face).unsqueeze(dim=0).to(device)
with torch.no_grad():
x_p = F.interpolate(parsingpredictor(2 * (F.interpolate(face_tensor, scale_factor=2, mode='bilinear', align_corners=False)))[0],
scale_factor=0.5, recompute_scale_factor=False).detach()
inputs =, x_p / 16.), dim=1)
with torch.no_grad():
s_w = pspencoder(face_tensor)
s_w = vtoonify.zplus2wplus(s_w)
if args.backbone == 'dualstylegan':
s_w[:, :7] = exstyle[:, :7]
y_tilde = vtoonify(inputs, s_w.repeat(inputs.size(0), 1, 1), d_s=args.style_degree)
y_tilde = torch.clamp(y_tilde, -1, 1)
stylized_face_np = tensor2cv2(y_tilde[0].cpu())
stylized_face_np_bgr = cv2.cvtColor(stylized_face_np, cv2.COLOR_RGB2BGR)
frame_copy[y1:y2, x1:x2] = cv2.resize(stylized_face_np_bgr, (x2 - x1, y2 - y1))
frame_bgr = cv2.cvtColor(frame_copy, cv2.COLOR_RGB2BGR)
videoWriter.write(cv2.cvtColor(frame_bgr, cv2.COLOR_RGB2BGR))
print('Transfer style successfully!')
I think the output video is weird because I've tried vtoonify on multiple people, but I didn't recognize their faces properly and applied a style to the background. Is there a way to make vtoonify on multiple people? How do I fix