Combining community pipeline for image generation

Fqlox commented 1 month ago

Describe the bug

I cannot use both the Stable Diffusion XL reference pipeline and InstantID in the same pipeline. I get the error "'FrozenDict' object has no attribute 'block_out_channels'".

Reproduction

from stable_diffusion_xl_reference import StableDiffusionXLReferencePipeline
from pipeline_stable_diffusion_xl_instantid import StableDiffusionXLInstantIDPipeline, draw_kps

controlnet_path = f'path/to/instant/id'

# load IdentityNet
identityNet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)

pipe = StableDiffusionXLReferencePipeline.from_pretrained(
    "../path/to/model",
    torch_dtype=torch.float16,
    #use_safetensors=True,
    variant="fp16").to('cuda')

pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

pipe_instant = StableDiffusionXLInstantIDPipeline(
    pipe,
    #vae = pipe.vae, # I tried both witout and with the VAE
    text_encoder = pipe.text_encoder,
    text_encoder_2 = pipe.text_encoder_2,
    tokenizer = pipe.tokenizer,
    tokenizer_2 = pipe.tokenizer_2,
    unet = pipe.unet,
    scheduler = pipe.scheduler,
    feature_extractor = pipe.feature_extractor,
    controlnet= [identityNet],
)

Logs

{
    "name": "AttributeError",
    "message": "'FrozenDict' object has no attribute 'block_out_channels'",
    "stack": "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)\nCell \u001b[1;32mIn[3], line 28\u001b[0m\n\u001b[0;32m     20\u001b[0m pipe \u001b[38;5;241m=\u001b[39m StableDiffusionXLReferencePipeline\u001b[38;5;241m.\u001b[39mfrom_pretrained(\n\u001b[0;32m     21\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m../models/StableDiffusion/RealvisXLv40_lightning\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m     22\u001b[0m     torch_dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat16,\n\u001b[0;32m     23\u001b[0m     \u001b[38;5;66;03m#use_safetensors=True,\u001b[39;00m\n\u001b[0;32m     24\u001b[0m     variant\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfp16\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m     26\u001b[0m pipe\u001b[38;5;241m.\u001b[39mscheduler \u001b[38;5;241m=\u001b[39m UniPCMultistepScheduler\u001b[38;5;241m.\u001b[39mfrom_config(pipe\u001b[38;5;241m.\u001b[39mscheduler\u001b[38;5;241m.\u001b[39mconfig)\n\u001b[1;32m---> 28\u001b[0m pipe_instant \u001b[38;5;241m=\u001b[39m \u001b[43mStableDiffusionXLInstantIDPipeline\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m     29\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpipe\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     30\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;66;43;03m#vae = pipe.vae, \u001b[39;49;00m\n\u001b[0;32m     31\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtext_encoder\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext_encoder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     32\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtext_encoder_2\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext_encoder_2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     33\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtokenizer\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenizer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     34\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtokenizer_2\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtokenizer_2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     35\u001b[0m \u001b[43m    \u001b[49m\u001b[43munet\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43munet\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     36\u001b[0m \u001b[43m    \u001b[49m\u001b[43mscheduler\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscheduler\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     37\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;66;43;03m#safety_checker = pipe.safety_checker,\u001b[39;49;00m\n\u001b[0;32m     38\u001b[0m \u001b[43m    \u001b[49m\u001b[43mfeature_extractor\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mpipe\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfeature_extractor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     39\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcontrolnet\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\u001b[43midentityNet\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     40\u001b[0m \u001b[43m    
\u001b[49m\u001b[38;5;66;43;03m#torch_dtype=torch.float16\u001b[39;49;00m\n\u001b[0;32m     41\u001b[0m \u001b[43m)\u001b[49m\n\u001b[0;32m     44\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m     45\u001b[0m \u001b[38;5;124;03mresult_img = pipe_instant(ref_image=input_image,\u001b[39;00m\n\u001b[0;32m     46\u001b[0m \u001b[38;5;124;03m                prompt=\"1girl\",\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m     55\u001b[0m \u001b[38;5;124;03mresult_img.show()\u001b[39;00m\n\u001b[0;32m     56\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\nFile \u001b[1;32me:\\conda\\envs\\rayban\\lib\\site-packages\\diffusers\\pipelines\\controlnet\\pipeline_controlnet_sd_xl.py:211\u001b[0m, in \u001b[0;36mStableDiffusionXLControlNetPipeline.__init__\u001b[1;34m(self, vae, text_encoder, text_encoder_2, tokenizer, tokenizer_2, unet, controlnet, scheduler, force_zeros_for_empty_prompt, add_watermarker, feature_extractor, image_encoder)\u001b[0m\n\u001b[0;32m    197\u001b[0m     controlnet \u001b[38;5;241m=\u001b[39m MultiControlNetModel(controlnet)\n\u001b[0;32m    199\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mregister_modules(\n\u001b[0;32m    200\u001b[0m     vae\u001b[38;5;241m=\u001b[39mvae,\n\u001b[0;32m    201\u001b[0m     text_encoder\u001b[38;5;241m=\u001b[39mtext_encoder,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    209\u001b[0m     image_encoder\u001b[38;5;241m=\u001b[39mimage_encoder,\n\u001b[0;32m    210\u001b[0m )\n\u001b[1;32m--> 211\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvae_scale_factor \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m (\u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvae\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mblock_out_channels\u001b[49m) \u001b[38;5;241m-\u001b[39m 
\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m    212\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mimage_processor \u001b[38;5;241m=\u001b[39m VaeImageProcessor(vae_scale_factor\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvae_scale_factor, do_convert_rgb\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m    213\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol_image_processor \u001b[38;5;241m=\u001b[39m VaeImageProcessor(\n\u001b[0;32m    214\u001b[0m     vae_scale_factor\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvae_scale_factor, do_convert_rgb\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, do_normalize\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m    215\u001b[0m )\n\n\u001b[1;31mAttributeError\u001b[0m: 'FrozenDict' object has no attribute 'block_out_channels'"
}

System Info

diffusers version: 0.25.0
Platform: Windows-10-10.0.19045-SP0
Python version: 3.10.14
PyTorch version (GPU?): 2.2.2 (True)
Huggingface_hub version: 0.22.2
Transformers version: 4.36.2
Accelerate version: 0.29.2
xFormers version: not installed
Using GPU in script?: yes
Using distributed or parallel set-up in script?: no

Who can help?

@yiyixuxu @sayakpaul @DN6 @stevhliu

tolgacangoz commented 1 month ago

I installed diffusers from the source and this seems to work:

import torch
from diffusers import DiffusionPipeline, ControlNetModel
from diffusers import UniPCMultistepScheduler

controlnet_path = f'path/to/instant/id'

# load IdentityNet
identityNet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)

pipe = DiffusionPipeline.from_pretrained(
     "stabilityai/stable-diffusion-xl-base-1.0",
     custom_pipeline="stable_diffusion_xl_reference",
     torch_dtype=torch.float16,
     use_safetensors=True,
     variant="fp16").to('cuda')

pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe_instant = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    custom_pipeline="pipeline_stable_diffusion_xl_instantid",
    #vae = pipe.vae, # I tried both witout and with the VAE
    text_encoder = pipe.text_encoder,
    text_encoder_2 = pipe.text_encoder_2,
    tokenizer = pipe.tokenizer,
    tokenizer_2 = pipe.tokenizer_2,
    unet = pipe.unet,
    scheduler = pipe.scheduler,
    feature_extractor = pipe.feature_extractor,
    controlnet = identityNet,
)

yiyixuxu commented 1 month ago

You can create your InstantID pipeline from the SDXL reference pipeline with this script:

import torch
from diffusers import DiffusionPipeline, ControlNetModel
from diffusers import UniPCMultistepScheduler

# load IdentityNet
identityNet = ControlNetModel.from_pretrained("InstantX/InstantID", subfolder ="ControlNetModel", torch_dtype=torch.float16)

pipe = DiffusionPipeline.from_pretrained(
     "stabilityai/stable-diffusion-xl-base-1.0",
     custom_pipeline="stable_diffusion_xl_reference",
     torch_dtype=torch.float16,
     use_safetensors=True,
     variant="fp16").to('cuda')

pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe_instant = DiffusionPipeline.from_pipe(
    pipe,
    custom_pipeline="pipeline_stable_diffusion_xl_instantid",
    controlnet = identityNet,
)

Fqlox commented 1 month ago

@standardAI I did install diffusers from source, but then I had to downgrade due to an error with InstantID. When I generate using both the reference and InstantID arguments, and when I generate with only the InstantID arguments, I get the same result: the reference arguments do not impact the generation.

image_plus_ref = pipe_instant(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=4,
    guidance_scale=1.2,
    image_proj_model_in_features=face_emb,
    image_embeds=face_emb,
    image=face_kps,
    controlnet_conditioning_scale=0.8,
    seed = 42,
    reference_attn=True,
    reference_adain=True,
    ref_image = ref_image
).images[0]

And

image = pipe_instant(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=4,
    guidance_scale=1.2,
    image_proj_model_in_features=face_emb,
    image_embeds=face_emb,
    image=face_kps,
    controlnet_conditioning_scale=0.8,
    seed = 42,
).images[0]

[Note that I am using a Lightning diffusion model.]

@yiyixuxu Since I'm on diffusers==0.26.3, the `from_pipe` method does not seem to work.

huggingface / diffusers