tencent-ailab / IP-Adapter

The image prompt adapter is designed to enable a pretrained text-to-image diffusion model to generate images with an image prompt.
Apache License 2.0

[Demo Bug] Failed to run IP-Adapter-FaceID-Portrait demo #315

Open weiweiwang opened 3 months ago

weiweiwang commented 3 months ago

Code

Copied from https://huggingface.co/h94/IP-Adapter-FaceID:

import logging
import sys

import cv2
import torch
from PIL import Image
from diffusers import StableDiffusionPipeline, DDIMScheduler, AutoencoderKL
from insightface.app import FaceAnalysis
from ip_adapter.ip_adapter_faceid import IPAdapterFaceID


def image_grid(imgs, rows, cols):
    # arrange a list of equally sized PIL images into a rows x cols grid
    assert len(imgs) == rows * cols

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols * w, rows * h))
    # grid_w, grid_h = grid.size

    for i, img in enumerate(imgs):
        grid.paste(img, box=(i % cols * w, i // cols * h))
    return grid

base_model_path = "SG161222/Realistic_Vision_V4.0_noVAE"
vae_model_path = "stabilityai/sd-vae-ft-mse"
ip_ckpt = "ip-adapter-faceid-portrait_sd15.bin"
device = "cpu"

def main():
    app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
    app.prepare(ctx_id=0, det_size=(640, 640))

    images = ["01.jpeg", "02.jpeg", "03.jpeg", "04.jpeg", "05.jpeg"]

    # extract one normed 512-d face-ID embedding per reference image with insightface
    faceid_embeds = []
    for image_file in images:
        image = cv2.imread(image_file)
        faces = app.get(image)
        faceid_embeds.append(torch.from_numpy(faces[0].normed_embedding).unsqueeze(0).unsqueeze(0))
    # stack the per-image embeddings along dim=1 -> shape (1, 5, 512)
    faceid_embeds = torch.cat(faceid_embeds, dim=1)

    noise_scheduler = DDIMScheduler(
        num_train_timesteps=1000,
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        clip_sample=False,
        set_alpha_to_one=False,
        steps_offset=1,
    )
    vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)
    pipe = StableDiffusionPipeline.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        scheduler=noise_scheduler,
        vae=vae,
        feature_extractor=None,
        safety_checker=None
    )

    # load ip-adapter
    ip_model = IPAdapterFaceID(pipe, ip_ckpt, device, num_tokens=16)

    # generate image
    prompt = "photo of a woman in red dress in a garden"
    negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality, blurry"

    images = ip_model.generate(
        prompt=prompt,
        negative_prompt=negative_prompt,
        faceid_embeds=faceid_embeds,
        num_samples=4,
        width=512,
        height=512,
        num_inference_steps=30,
        seed=2023
    )
    output_image = image_grid(images, rows=1, cols=4)
    output_image.save("results/portrait.png")
    # print(images[0])

if __name__ == '__main__':
    main()

Problem

1. Error(s) in loading state_dict for ModuleList

Traceback (most recent call last):
  File "/Users/wangweiwei/Library/Mobile Documents/com~apple~CloudDocs/PycharmProjects/ai-demo/faceediting/faceid.py", line 96, in <module>
    main()
  File "/Users/wangweiwei/Library/Mobile Documents/com~apple~CloudDocs/PycharmProjects/ai-demo/faceediting/faceid.py", line 74, in main
    ip_model = IPAdapterFaceID(pipe, ip_ckpt, device, num_tokens=16)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/wangweiwei/opt/anaconda3/envs/ai-demo/lib/python3.11/site-packages/ip_adapter/ip_adapter_faceid.py", line 133, in __init__
    self.load_ip_adapter()
  File "/Users/wangweiwei/opt/anaconda3/envs/ai-demo/lib/python3.11/site-packages/ip_adapter/ip_adapter_faceid.py", line 180, in load_ip_adapter
    ip_layers.load_state_dict(state_dict["ip_adapter"])
  File "/Users/wangweiwei/opt/anaconda3/envs/ai-demo/lib/python3.11/site-packages/torch/nn/modules/module.py", line 2184, in load_state_dict
    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for ModuleList:
    Missing key(s) in state_dict: "0.to_q_lora.down.weight", "0.to_q_lora.up.weight", "0.to_k_lora.down.weight", "0.to_k_lora.up.weight", "0.to_v_lora.down.weight", "0.to_v_lora.up.weight", "0.to_out_lora.down.weight", "0.to_out_lora.up.weight", "1.to_q_lora.down.weight", "1.to_q_lora.up.weight", "1.to_k_lora.down.weight", "1.to_k_lora.up.weight", "1.to_v_lora.down.weight", "1.to_v_lora.up.weight", "1.to_out_lora.down.weight", "1.to_out_lora.up.weight", "2.to_q_lora.down.weight", "2.to_q_lora.up.weight", "2.to_k_lora.down.weight", "2.to_k_lora.up.weight", "2.to_v_lora.down.weight", "2.to_v_lora.up.weight", "2.to_out_lora.down.weight", "2.to_out_lora.up.weight", "3.to_q_lora.down.weight", "3.to_q_lora.up.weight", "3.to_k_lora.down.weight", "3.to_k_lora.up.weight", "3.to_v_lora.down.weight", "3.to_v_lora.up.weight", "3.to_out_lora.down.weight", "3.to_out_lora.up.weight", "4.to_q_lora.down.weight", "4.to_q_lora.up.weight", "4.to_k_lora.down.weight", "4.to_k_lora.up.weight", "4.to_v_lora.down.weight", "4.to_v_lora.up.weight", "4.to_out_lora.down.weight", "4.to_out_lora.up.weight", "5.to_q_lora.down.weight", "5.to_q_lora.up.weight", "5.to_k_lora.down.weight", "5.to_k_lora.up.weight", "5.to_v_lora.down.weight", "5.to_v_lora.up.weight", "5.to_out_lora.down.weight", "5.to_out_lora.up.weight", "6.to_q_lora.down.weight", "6.to_q_lora.up.weight", "6.to_k_lora.down.weight", "6.to_k_lora.up.weight", "6.to_v_lora.down.weight", "6.to_v_lora.up.weight", "6.to_out_lora.down.weight", "6.to_out_lora.up.weight", "7.to_q_lora.down.weight", "7.to_q_lora.up.weight", "7.to_k_lora.down.weight", "7.to_k_lora.up.weight", "7.to_v_lora.down.weight", "7.to_v_lora.up.weight", "7.to_out_lora.down.weight", "7.to_out_lora.up.weight", "8.to_q_lora.down.weight", "8.to_q_lora.up.weight", "8.to_k_lora.down.weight", "8.to_k_lora.up.weight", "8.to_v_lora.down.weight", "8.to_v_lora.up.weight", "8.to_out_lora.down.weight", "8.to_out_lora.up.weight", "9.to_q_lora.down.weight", "9.to_q_lora.up.weight", "9.to_k_lora.down.weight", "9.to_k_lora.up.weight", "9.to_v_lora.down.weight", "9.to_v_lora.up.weight", "9.to_out_lora.down.weight", "9.to_out_lora.up.weight", "10.to_q_lora.down.weight", "10.to_q_lora.up.weight", "10.to_k_lora.down.weight", "10.to_k_lora.up.weight", "10.to_v_lora.down.weight", "10.to_v_lora.up.weight", "10.to_out_lora.down.weight", "10.to_out_lora.up.weight", "11.to_q_lora.down.weight", "11.to_q_lora.up.weight", "11.to_k_lora.down.weight", "11.to_k_lora.up.weight", "11.to_v_lora.down.weight", "11.to_v_lora.up.weight", "11.to_out_lora.down.weight", "11.to_out_lora.up.weight", "12.to_q_lora.down.weight", "12.to_q_lora.up.weight", "12.to_k_lora.down.weight", "12.to_k_lora.up.weight", "12.to_v_lora.down.weight", "12.to_v_lora.up.weight", "12.to_out_lora.down.weight", "12.to_out_lora.up.weight", "13.to_q_lora.down.weight", "13.to_q_lora.up.weight", "13.to_k_lora.down.weight", "13.to_k_lora.up.weight", "13.to_v_lora.down.weight", "13.to_v_lora.up.weight", "13.to_out_lora.down.weight", "13.to_out_lora.up.weight", "14.to_q_lora.down.weight", "14.to_q_lora.up.weight", "14.to_k_lora.down.weight", "14.to_k_lora.up.weight", "14.to_v_lora.down.weight", "14.to_v_lora.up.weight", "14.to_out_lora.down.weight", "14.to_out_lora.up.weight", "15.to_q_lora.down.weight", "15.to_q_lora.up.weight", "15.to_k_lora.down.weight", "15.to_k_lora.up.weight", "15.to_v_lora.down.weight", "15.to_v_lora.up.weight", "15.to_out_lora.down.weight", "15.to_out_lora.up.weight", "16.to_q_lora.down.weight", "16.to_q_lora.up.weight", 
"16.to_k_lora.down.weight", "16.to_k_lora.up.weight", "16.to_v_lora.down.weight", "16.to_v_lora.up.weight", "16.to_out_lora.down.weight", "16.to_out_lora.up.weight", "17.to_q_lora.down.weight", "17.to_q_lora.up.weight", "17.to_k_lora.down.weight", "17.to_k_lora.up.weight", "17.to_v_lora.down.weight", "17.to_v_lora.up.weight", "17.to_out_lora.down.weight", "17.to_out_lora.up.weight", "18.to_q_lora.down.weight", "18.to_q_lora.up.weight", "18.to_k_lora.down.weight", "18.to_k_lora.up.weight", "18.to_v_lora.down.weight", "18.to_v_lora.up.weight", "18.to_out_lora.down.weight", "18.to_out_lora.up.weight", "19.to_q_lora.down.weight", "19.to_q_lora.up.weight", "19.to_k_lora.down.weight", "19.to_k_lora.up.weight", "19.to_v_lora.down.weight", "19.to_v_lora.up.weight", "19.to_out_lora.down.weight", "19.to_out_lora.up.weight", "20.to_q_lora.down.weight", "20.to_q_lora.up.weight", "20.to_k_lora.down.weight", "20.to_k_lora.up.weight", "20.to_v_lora.down.weight", "20.to_v_lora.up.weight", "20.to_out_lora.down.weight", "20.to_out_lora.up.weight", "21.to_q_lora.down.weight", "21.to_q_lora.up.weight", "21.to_k_lora.down.weight", "21.to_k_lora.up.weight", "21.to_v_lora.down.weight", "21.to_v_lora.up.weight", "21.to_out_lora.down.weight", "21.to_out_lora.up.weight", "22.to_q_lora.down.weight", "22.to_q_lora.up.weight", "22.to_k_lora.down.weight", "22.to_k_lora.up.weight", "22.to_v_lora.down.weight", "22.to_v_lora.up.weight", "22.to_out_lora.down.weight", "22.to_out_lora.up.weight", "23.to_q_lora.down.weight", "23.to_q_lora.up.weight", "23.to_k_lora.down.weight", "23.to_k_lora.up.weight", "23.to_v_lora.down.weight", "23.to_v_lora.up.weight", "23.to_out_lora.down.weight", "23.to_out_lora.up.weight", "24.to_q_lora.down.weight", "24.to_q_lora.up.weight", "24.to_k_lora.down.weight", "24.to_k_lora.up.weight", "24.to_v_lora.down.weight", "24.to_v_lora.up.weight", "24.to_out_lora.down.weight", "24.to_out_lora.up.weight", "25.to_q_lora.down.weight", "25.to_q_lora.up.weight", "25.to_k_lora.down.weight", "25.to_k_lora.up.weight", "25.to_v_lora.down.weight", "25.to_v_lora.up.weight", "25.to_out_lora.down.weight", "25.to_out_lora.up.weight", "26.to_q_lora.down.weight", "26.to_q_lora.up.weight", "26.to_k_lora.down.weight", "26.to_k_lora.up.weight", "26.to_v_lora.down.weight", "26.to_v_lora.up.weight", "26.to_out_lora.down.weight", "26.to_out_lora.up.weight", "27.to_q_lora.down.weight", "27.to_q_lora.up.weight", "27.to_k_lora.down.weight", "27.to_k_lora.up.weight", "27.to_v_lora.down.weight", "27.to_v_lora.up.weight", "27.to_out_lora.down.weight", "27.to_out_lora.up.weight", "28.to_q_lora.down.weight", "28.to_q_lora.up.weight", "28.to_k_lora.down.weight", "28.to_k_lora.up.weight", "28.to_v_lora.down.weight", "28.to_v_lora.up.weight", "28.to_out_lora.down.weight", "28.to_out_lora.up.weight", "29.to_q_lora.down.weight", "29.to_q_lora.up.weight", "29.to_k_lora.down.weight", "29.to_k_lora.up.weight", "29.to_v_lora.down.weight", "29.to_v_lora.up.weight", "29.to_out_lora.down.weight", "29.to_out_lora.up.weight", "30.to_q_lora.down.weight", "30.to_q_lora.up.weight", "30.to_k_lora.down.weight", "30.to_k_lora.up.weight", "30.to_v_lora.down.weight", "30.to_v_lora.up.weight", "30.to_out_lora.down.weight", "30.to_out_lora.up.weight", "31.to_q_lora.down.weight", "31.to_q_lora.up.weight", "31.to_k_lora.down.weight", "31.to_k_lora.up.weight", "31.to_v_lora.down.weight", "31.to_v_lora.up.weight", "31.to_out_lora.down.weight", "31.to_out_lora.up.weight". 

After modifying ip_adapter_faceid.py, line 180, to call ip_layers.load_state_dict(state_dict["ip_adapter"], False) instead of ip_layers.load_state_dict(state_dict["ip_adapter"]), the following problem occurs:

2. Sizes of tensors must match except in dimension 1

Traceback (most recent call last):
  File "/Users/wangweiwei/Library/Mobile Documents/com~apple~CloudDocs/PycharmProjects/ai-demo/faceediting/faceid.py", line 98, in <module>
    main()
  File "/Users/wangweiwei/Library/Mobile Documents/com~apple~CloudDocs/PycharmProjects/ai-demo/faceediting/faceid.py", line 82, in main
    images = ip_model.generate(
             ^^^^^^^^^^^^^^^^^^
  File "/Users/wangweiwei/opt/anaconda3/envs/ai-demo/lib/python3.11/site-packages/ip_adapter/ip_adapter_faceid.py", line 237, in generate
    prompt_embeds = torch.cat([prompt_embeds_, image_prompt_embeds], dim=1)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 4 but got size 20 for tensor number 1 in the list.
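
Note: the positional False in the workaround above is the strict argument of torch.nn.Module.load_state_dict, so the missing LoRA keys are simply skipped rather than loaded; a minimal sketch of the equivalent, more explicit call:

# non-strict loading ignores the missing "*_lora.*" keys instead of raising,
# so the checkpoint/class mismatch is deferred, not resolved
ip_layers.load_state_dict(state_dict["ip_adapter"], strict=False)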
xiaohu2015 commented 3 months ago

follow this: https://huggingface.co/h94/IP-Adapter-FaceID#ip-adapter-faceid-portrait
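
For reference, a minimal sketch of the key difference in the linked portrait section (the README is authoritative for the exact arguments): the portrait checkpoint contains no LoRA weights, which is why ip_adapter.ip_adapter_faceid raises the missing "*_lora.*" keys error, and the demo there imports the adapter from ip_adapter.ip_adapter_faceid_separate instead. Assuming the rest of the script above stays the same:

from ip_adapter.ip_adapter_faceid_separate import IPAdapterFaceID

# plain (non-LoRA) attention processors, matching the portrait checkpoint layout
ip_model = IPAdapterFaceID(pipe, ip_ckpt, device, num_tokens=16)

images = ip_model.generate(
    prompt=prompt,
    negative_prompt=negative_prompt,
    faceid_embeds=faceid_embeds,  # (1, 5, 512): five stacked reference-face embeddings
    num_samples=4,
    width=512,
    height=512,
    num_inference_steps=30,
    seed=2023,
)

This would also explain the second error: the non-portrait class appears to treat the five stacked embeddings as five separate conditions, yielding 4 x 5 = 20 image-prompt rows against the 4 text-prompt rows.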