tencent-ailab / IP-Adapter

The image prompt adapter is designed to enable a pretrained text-to-image diffusion model to generate images with image prompt.
Apache License 2.0

adapter-faceid-portrait-v11_sd15 model.bin + ControlNet #299

Closed s1ntecs closed 4 months ago

s1ntecs commented 4 months ago

When using ip-adapter-faceid-portrait-v11_sd15 model.bin, the pose from ControlNet OpenPose is ignored. Do I understand correctly that ControlNet does not work with this model?

xiaohu2015 commented 4 months ago

It should work with ControlNet (did you also test ip-adapter-faceid-portrait-v1_sd15?)

s1ntecs commented 4 months ago

[attached image: faceid]

Thanks for the quick response, it really worked. I'll put the code here in case someone reads the comments or has the same question.


import random

import cv2
import numpy as np
import torch
from insightface.app import FaceAnalysis  # only needed for the optional block below
from insightface.utils import face_align  # only needed for the optional block below

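# Optional: extract the face embedding directly with insightface
# instead of the FaceidAcquirer helper used further down: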
# app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
# app.prepare(ctx_id=0, det_size=(640, 640))

# image = cv2.imread("./2.png")
# faces = app.get(image)

# faceid_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
# face_image = face_align.norm_crop(image, landmark=faces[0].kps, image_size=224) # you can also segment the face

from diffusers import (DDIMScheduler,
                       AutoencoderKL,
                       ControlNetModel,
                       StableDiffusionControlNetPipeline)
from PIL import Image
from utils import FaceidAcquirer  # local helper wrapping the insightface steps above

from ip_adapter.ip_adapter_faceid import IPAdapterFaceIDPlus
from ip_adapter.ip_adapter_faceid_separate import IPAdapterFaceID

v2 = False
base_model_path = "SG161222/Realistic_Vision_V4.0_noVAE"
vae_model_path = "stabilityai/sd-vae-ft-mse"
image_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
ip_ckpt = "models/ip-adapter-faceid-portrait-v11_sd15.bin" if not v2 else "./models/ip-adapter-faceid-plusv2_sd15.bin"
device = "cuda"

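# Extract the face ID embedding(s); n_cond is the number of reference faces
# the portrait adapter conditions on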
app = FaceidAcquirer()
faceid_embeds = app.get_multi_embeds(["2.png"])
n_cond = faceid_embeds.shape[1]

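# DDIM scheduler configured as in the official IP-Adapter FaceID demos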
noise_scheduler = DDIMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
    steps_offset=1,
)
vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)
# The condition image built below is a Canny edge map, so load the matching
# Canny ControlNet (for pose control, use the OpenPose variant as sketched
# after this snippet, with a pose image as the condition)
controlnet_model_path = "lllyasviel/control_v11p_sd15_canny"
controlnet = ControlNetModel.from_pretrained(controlnet_model_path,
                                             torch_dtype=torch.float16)

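# ControlNet-conditioned SD pipeline; IP-Adapter patches its UNet attention
# processors to inject the face embedding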
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    base_model_path,
    controlnet=controlnet,
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    vae=vae,
    feature_extractor=None,
    safety_checker=None
)

# load ip-adapter (the portrait checkpoint uses the "separate" FaceID variant)
# ip_model = IPAdapterFaceIDPlus(pipe, image_encoder_path, ip_ckpt, device)  # for the plus/v2 checkpoints
ip_model = IPAdapterFaceID(pipe, ip_ckpt, device, num_tokens=16, n_cond=n_cond)
# generate image
prompt = "Vincent van Gogh style painting of a man"
negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality, blurry"
image = Image.open("./1.png")
image = np.array(image)
low_threshold = 100
high_threshold = 200

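# Build the ControlNet condition: a 3-channel Canny edge map of ./1.png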
image = cv2.Canny(image, low_threshold, high_threshold)

image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
canny_image = Image.fromarray(image)
seed = random.randint(0, 100000)
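# scale weights the face-ID condition relative to the text prompt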
images = ip_model.generate(
    scale=1,
    prompt=prompt,
    image=canny_image,
    negative_prompt=negative_prompt,
    faceid_embeds=faceid_embeds,
    guidance_scale=8,
    num_samples=1, width=512, height=768,
    num_inference_steps=30, seed=seed
)
for i, image in enumerate(images):
    image.save(f"output_{i}.jpg")
canny_image.save("canny_image.jpg")
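
Since the original question was about OpenPose: a minimal sketch of swapping the Canny condition for a pose one, assuming the controlnet_aux package for pose detection (everything else stays as above).

from controlnet_aux import OpenposeDetector

# Detect a pose skeleton from the reference image
openpose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
pose_image = openpose(Image.open("./1.png"))

# Load the matching OpenPose ControlNet checkpoint
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/control_v11p_sd15_openpose", torch_dtype=torch.float16
)

# Rebuild the pipeline and IPAdapterFaceID exactly as above,
# then pass image=pose_image to ip_model.generate()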