huggingface / diffusers

🤗 Diffusers: State-of-the-art diffusion models for image and audio generation in PyTorch and FLAX.
https://huggingface.co/docs/diffusers
Apache License 2.0

Can't use ip adapter faceid for some reason #8817

Closed: alexblattner closed this issue 3 months ago

alexblattner commented 3 months ago

Describe the bug

Tensor dimensions don't match when generating with IP-Adapter FaceID Plus v2: the run fails inside the UNet's image projection layer with "Sizes of tensors must match except in dimension 1. Expected size 2 but got size 6".

Reproduction

from insightface.app import FaceAnalysis
from insightface.utils import face_align

import numpy as np
import cv2
import PIL
from PIL import Image
from diffusers import ControlNetModel, T2IAdapter, StableDiffusionPipeline, EulerDiscreteScheduler, AutoencoderKL
from rubberDiffusers import StableDiffusionRubberPipeline  # custom pipeline, not used in this repro
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection, CLIPModel, CLIPFeatureExtractor, AutoConfig
import torch

USE_LOCAL_FILES = True
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
clip_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", cache_dir="model_cache", local_files_only=USE_LOCAL_FILES)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", cache_dir="model_cache", local_files_only=USE_LOCAL_FILES)
vae = AutoencoderKL.from_single_file("./vae-ft-mse-840000-ema-pruned.safetensors")
image=Image.open('gigachadcolor.png').convert('RGB')
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
    cache_dir="model_cache",
    torch_dtype=torch.float16,
    local_files_only=True
).to('cuda')
pipe = StableDiffusionPipeline.from_single_file(
    './speedlabs_1.safetensors',
    safety_checker=None,
    local_files_only=True,
    vae=vae,
    image_encoder=image_encoder,
    torch_dtype=torch.float16
).to('cuda')
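# FaceID checkpoints don't include an image encoder, hence image_encoder_folder=None;
# the CLIP-ViT-H image encoder is supplied to the pipeline above instead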
pipe.load_ip_adapter("ip-adapter-faceid-plusv2_sd15.bin", subfolder=None, weight_name="ip-adapter-faceid-plusv2_sd15.bin", image_encoder_folder=None)
pipe.set_ip_adapter_scale(1)
seeds = [1, 2, 3]

ref_images_embeds = []
ip_adapter_images = []
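# detect the reference face with insightface and extract its identity embedding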
app = FaceAnalysis(name="buffalo_l", root='insightface', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
app.prepare(ctx_id=0, det_size=(640, 640))
image = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)
faces = app.get(image)
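# align and crop the face to 224x224, the input size of the CLIP image encoder used by FaceID Plus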
ip_adapter_images.append(face_align.norm_crop(image, landmark=faces[0].kps, image_size=224))
image = torch.from_numpy(faces[0].normed_embedding)
ref_images_embeds.append(image.unsqueeze(0))
ref_images_embeds = torch.stack(ref_images_embeds, dim=0).unsqueeze(0)
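# zero embeds serve as the negative (unconditional) face embeds for classifier-free guidance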
neg_ref_images_embeds = torch.zeros_like(ref_images_embeds)
id_embeds = torch.cat([neg_ref_images_embeds, ref_images_embeds]).to(dtype=torch.float16, device="cuda")

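# positional args: ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
# note: num_images_per_prompt is 1 here while the pipeline call below generates len(seeds) == 3 images;
# this mismatch turns out to be the bug (see the follow-up comment)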
clip_embeds = pipe.prepare_ip_adapter_image_embeds([ip_adapter_images], None, torch.device("cuda"), 1, True)[0]

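# FaceID Plus: the projection layer also consumes the CLIP embeds of the face crop;
# shortcut=True enables the Plus v2 behavior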
pipe.unet.encoder_hid_proj.image_projection_layers[0].clip_embeds = clip_embeds.to(dtype=torch.float16)
pipe.unet.encoder_hid_proj.image_projection_layers[0].shortcut = True
kwargs = dict()
kwargs['ip_adapter_image_embeds'] = [id_embeds]
kwargs['generator'] = [torch.Generator(device="cuda").manual_seed(i) for i in seeds]
kwargs['num_images_per_prompt'] = len(seeds)
# kwargs['controlnet_image'] = [cimage]
# kwargs['controlnet_conditioning_scale'] = [1.0]
# kwargs['ip_adapter_image'] = Image.open(ip_adapter_image1).convert('RGB')
# kwargs['image'] = image
# kwargs['strength'] = 1.0
kwargs['clip_skip'] = 2
kwargs['num_inference_steps'] = 20
kwargs['height'] = 1024
kwargs['width'] = 768
kwargs['prompt'] = "a woman"
imgs1 = pipe(**kwargs).images
imgs1[0].save('11.png')

Running the latest version of diffusers.

Logs

Traceback (most recent call last):
  File "/home/alex/cog/P1/r.py", line 67, in <module>
    imgs1=pipe(**kwargs).images
  File "/home/alex/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/alex/.local/lib/python3.10/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py", line 1005, in __call__
    noise_pred = self.unet(
  File "/home/alex/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/alex/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/alex/.local/lib/python3.10/site-packages/diffusers/models/unets/unet_2d_condition.py", line 1168, in forward
    encoder_hidden_states = self.process_encoder_hidden_states(
  File "/home/alex/.local/lib/python3.10/site-packages/diffusers/models/unets/unet_2d_condition.py", line 1039, in process_encoder_hidden_states
    image_embeds = self.encoder_hid_proj(image_embeds)
  File "/home/alex/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/alex/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/alex/.local/lib/python3.10/site-packages/diffusers/models/embeddings.py", line 1032, in forward
    image_embed = image_projection_layer(image_embed)
  File "/home/alex/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/alex/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/alex/.local/lib/python3.10/site-packages/diffusers/models/embeddings.py", line 996, in forward
    latents = block(x, latents, residual)
  File "/home/alex/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/alex/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/alex/.local/lib/python3.10/site-packages/diffusers/models/embeddings.py", line 919, in forward
    encoder_hidden_states = torch.cat([encoder_hidden_states, latents], dim=-2)
RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 2 but got size 6 for tensor number 1 in the list.

System Info

WSL (Windows 11)

Who can help?

@fabiorigano

alexblattner commented 3 months ago

The issue was that I wrote

clip_embeds = pipe.prepare_ip_adapter_image_embeds([ip_adapter_images], None, torch.device("cuda"), 1, True)[0]

instead of

clip_embeds = pipe.prepare_ip_adapter_image_embeds([ip_adapter_images], None, torch.device("cuda"), 3, True)[0]

since I'm using 3 seeds. The fourth argument is num_images_per_prompt: with 1, the CLIP embeds were prepared for a batch of 2 (1 image doubled for classifier-free guidance), while the id embeds were expanded to a batch of 6 (3 images doubled for guidance), which is exactly the "Expected size 2 but got size 6" mismatch in the traceback.
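
For anyone hitting the same error, a minimal sketch of the corrected preparation step, assuming the same pipe, ip_adapter_images, and seeds defined in the repro above:

num_images = len(seeds)  # 3 seeds, so 3 images per prompt

# num_images_per_prompt (4th argument) must match the pipeline call;
# do_classifier_free_guidance=True then doubles the batch (2 * 3 = 6),
# matching the batch of the id embeds
clip_embeds = pipe.prepare_ip_adapter_image_embeds(
    [ip_adapter_images],
    None,
    torch.device("cuda"),
    num_images,
    True,
)[0]
pipe.unet.encoder_hid_proj.image_projection_layers[0].clip_embeds = clip_embeds.to(dtype=torch.float16)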