tencent-ailab / IP-Adapter

The image prompt adapter is designed to enable a pretrained text-to-image diffusion model to generate images with an image prompt.
Apache License 2.0

h94/IP-Adapter-FaceID is not working - I am trying to code a Gradio app #189

Open FurkanGozukara opened 9 months ago

FurkanGozukara commented 9 months ago

All libraries are installed, following the instructions here: https://huggingface.co/h94/IP-Adapter-FaceID

The error is:

Traceback (most recent call last):
  File "G:\IP-Adapter-FaceID\venv\lib\site-packages\diffusers\utils\import_utils.py", line 710, in _get_module
    return importlib.import_module("." + module_name, self.__name__)
  File "C:\Python3108\lib\importlib\__init__.py", line 126, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1050, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 688, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "G:\IP-Adapter-FaceID\venv\lib\site-packages\diffusers\models\autoencoder_kl.py", line 21, in <module>
    from ..loaders import FromOriginalVAEMixin
  File "G:\IP-Adapter-FaceID\venv\lib\site-packages\diffusers\loaders.py", line 1164, in <module>
    class LoraLoaderMixin:
  File "G:\IP-Adapter-FaceID\venv\lib\site-packages\diffusers\loaders.py", line 2393, in LoraLoaderMixin
    text_encoder: Optional[PreTrainedModel] = None,
NameError: name 'PreTrainedModel' is not defined

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "G:\IP-Adapter-FaceID\web-ui.py", line 8, in <module>
    from diffusers import StableDiffusionPipeline, DDIMScheduler, AutoencoderKL
  File "<frozen importlib._bootstrap>", line 1075, in _handle_fromlist
  File "G:\IP-Adapter-FaceID\venv\lib\site-packages\diffusers\utils\import_utils.py", line 701, in __getattr__
    value = getattr(module, name)
  File "G:\IP-Adapter-FaceID\venv\lib\site-packages\diffusers\utils\import_utils.py", line 700, in __getattr__
    module = self._get_module(self._class_to_module[name])
  File "G:\IP-Adapter-FaceID\venv\lib\site-packages\diffusers\utils\import_utils.py", line 712, in _get_module
    raise RuntimeError(
RuntimeError: Failed to import diffusers.models.autoencoder_kl because of the following error (look up to see its traceback):
name 'PreTrainedModel' is not defined
Press any key to continue . . .

My web-ui.py:
import gradio as gr
import os
import cv2
import numpy as np
import torch
from PIL import Image
from insightface.app import FaceAnalysis
from diffusers import StableDiffusionPipeline, DDIMScheduler, AutoencoderKL
from ip_adapter.ip_adapter_faceid import IPAdapterFaceID

# Function to list models in the 'models' folder
def list_models():
    return [f for f in os.listdir('models') if os.path.isdir(os.path.join('models', f))]

# Cache for loaded models
model_cache = {}

# Function to load and cache model
def load_model(model_name):
    if model_name in model_cache:
        return model_cache[model_name]

    # Model paths
    base_model_path = os.path.join('models', model_name)
    vae_model_path = "stabilityai/sd-vae-ft-mse"
    ip_ckpt = "ip-adapter-faceid_sd15.bin"
    device = "cuda"
    noise_scheduler = DDIMScheduler(
        num_train_timesteps=1000,
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        clip_sample=False,
        set_alpha_to_one=False,
        steps_offset=1,
    )

    # Load model components
    vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)
    pipe = StableDiffusionPipeline.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        scheduler=noise_scheduler,
        vae=vae,
        feature_extractor=None,
        safety_checker=None
    )
    ip_model = IPAdapterFaceID(pipe, ip_ckpt, device)

    # Cache the model
    model_cache[model_name] = ip_model
    return ip_model

# Function to process image and generate output
def generate_image(input_image, positive_prompt, negative_prompt, model_name):
    # Load and prepare the model
    ip_model = load_model(model_name)

    # Convert input image to the format expected by the model
    input_image = input_image.convert('RGB')
    input_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
    app = FaceAnalysis(name="buffalo_l", providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
    app.prepare(ctx_id=0, det_size=(640, 640))
    faces = app.get(input_image)
    faceid_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)

    # Generate the image
    generated_images = ip_model.generate(
        prompt=positive_prompt,
        negative_prompt=negative_prompt,
        faceid_embeds=faceid_embeds,
        num_samples=1,
        width=512,
        height=768,
        num_inference_steps=30,
        seed=2023
    )

    # Save the generated image
    if not os.path.exists('outputs'):
        os.makedirs('outputs')
    image_path = os.path.join('outputs', f'generated_{len(os.listdir("outputs"))+1}.png')
    generated_images[0].save(image_path)

    return generated_images[0], f'Saved as {image_path}'

# Get the list of models
models_list = list_models()

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("Image Generation App")
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(type='pil')
            positive_prompt = gr.Textbox(label="Positive Prompt")
            negative_prompt = gr.Textbox(label="Negative Prompt")
            model_selector = gr.Dropdown(label="Select Model", choices=models_list, value=models_list[0] if models_list else None)
        with gr.Column():
            output_image = gr.Image()
            output_text = gr.Textbox(label="Output Info")

    generate_btn = gr.Button("Generate")
    generate_btn.click(generate_image, inputs=[input_image, positive_prompt, negative_prompt, model_selector], outputs=[output_image, output_text])

demo.launch()
FurkanGozukara commented 9 months ago

Here is the pip freeze of the venv:

(venv) G:\IP-Adapter-FaceID\venv\Scripts>pip freeze
aiofiles==23.2.1
albumentations==1.3.1
altair==5.2.0
annotated-types==0.6.0
anyio==3.7.1
attrs==23.1.0
certifi==2022.12.7
charset-normalizer==2.1.1
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
contourpy==1.2.0
cycler==0.12.1
Cython==3.0.7
diffusers==0.22.1
easydict==1.11
exceptiongroup==1.2.0
fastapi==0.105.0
ffmpy==0.3.1
filelock==3.9.0
flatbuffers==23.5.26
fonttools==4.47.0
fsspec==2023.12.2
gradio==4.11.0
gradio_client==0.7.3
h11==0.14.0
httpcore==1.0.2
httpx==0.26.0
huggingface-hub==0.20.1
humanfriendly==10.0
idna==3.4
imageio==2.33.1
importlib-metadata==7.0.0
importlib-resources==6.1.1
insightface==0.7.3
ip-adapter @ git+https://github.com/tencent-ailab/IP-Adapter.git@6843f295d4a7c651d243e84667a197b68591a980
Jinja2==3.1.2
joblib==1.3.2
jsonschema==4.20.0
jsonschema-specifications==2023.11.2
kiwisolver==1.4.5
lazy_loader==0.3
markdown-it-py==3.0.0
MarkupSafe==2.1.3
matplotlib==3.8.2
mdurl==0.1.2
mpmath==1.3.0
networkx==3.0
numpy==1.24.1
onnx==1.15.0
onnxruntime==1.16.3
opencv-python-headless==4.8.1.78
orjson==3.9.10
packaging==23.2
pandas==2.1.4
Pillow==9.3.0
prettytable==3.9.0
protobuf==4.25.1
pydantic==2.5.2
pydantic_core==2.14.5
pydub==0.25.1
Pygments==2.17.2
pyparsing==3.1.1
pyreadline3==3.4.1
python-dateutil==2.8.2
python-multipart==0.0.6
pytz==2023.3.post1
PyYAML==6.0.1
qudida==0.0.4
referencing==0.32.0
regex==2023.10.3
requests==2.28.1
rich==13.7.0
rpds-py==0.15.2
safetensors==0.4.1
scikit-image==0.22.0
scikit-learn==1.3.2
scipy==1.11.4
semantic-version==2.10.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.0
starlette==0.27.0
sympy==1.12
threadpoolctl==3.2.0
tifffile==2023.12.9
tomlkit==0.12.0
toolz==0.12.0
torch==2.1.2+cu118
torchaudio==2.1.2+cu118
torchvision==0.16.2+cu118
tqdm==4.66.1
typer==0.9.0
typing_extensions==4.9.0
tzdata==2023.3
urllib3==1.26.13
uvicorn==0.25.0
wcwidth==0.2.12
websockets==11.0.3
zipp==3.17.0

(venv) G:\IP-Adapter-FaceID\venv\Scripts>
xiaohu2015 commented 9 months ago

I think you should install transformers
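For reference, PreTrainedModel is defined by the transformers package (diffusers imports it in loaders.py, which is where the NameError is raised), and transformers is absent from the pip freeze above. Installing it inside the venv should clear the error:

pip install transformers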

FurkanGozukara commented 9 months ago

transformers

Good catch.

FurkanGozukara commented 9 months ago

I think you should install transformers

Giving the full folder path didn't work; it is expecting a repo ID.

How can I fix it?

pipe = StableDiffusionPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float32,  # Use float32 for better compatibility
    scheduler=DDIMScheduler(),
    vae=vae,
    feature_extractor=None,
    safety_checker=None
).to(device)
xiaohu2015 commented 9 months ago

You can use a repo ID or a local model path for base_model_path.
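As a minimal sketch of both forms (the repo ID and local folder below are placeholders, not the exact paths from this thread). Note that from_pretrained expects a diffusers-format folder, i.e. one containing model_index.json and the unet/vae/text_encoder subfolders, rather than a bare .safetensors file:

from diffusers import StableDiffusionPipeline
import torch

# Option 1: a Hugging Face repo ID (downloaded to / loaded from the local cache)
pipe = StableDiffusionPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V4.0_noVAE",  # example repo ID
    torch_dtype=torch.float16,
)

# Option 2: a local folder in diffusers format (model_index.json + subfolders)
pipe = StableDiffusionPipeline.from_pretrained(
    "models/my_diffusers_model",  # hypothetical local path
    torch_dtype=torch.float16,
)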

FurkanGozukara commented 9 months ago

you can use repo id or local model path for base_model_path

Thank you for the reply. It is expecting a repo ID. Any ideas?

[image]

xiaohu2015 commented 9 months ago

Hi, if you use a safetensors model, you can use:

from diffusers import StableDiffusionPipeline

pipeline = StableDiffusionPipeline.from_single_file(
    "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors"
)

Refer to https://huggingface.co/docs/diffusers/using-diffusers/using_safetensors
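from_single_file also accepts a local filesystem path, which fits the setup in this thread where the checkpoint sits inside the models folder. A minimal sketch (the filename matches the console output further down; the rest is an assumption):

from diffusers import StableDiffusionPipeline
import torch

# Load a single-file .safetensors checkpoint from disk instead of a URL
pipe = StableDiffusionPipeline.from_single_file(
    "models/Realistic_Vision_V5.1.safetensors",  # assumed local path to the checkpoint
    torch_dtype=torch.float16,
).to("cuda")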

FurkanGozukara commented 9 months ago

@xiaohu2015 thank you so much for the answers.

For some reason I am getting noise output, shown here:

[image]

The code:

def load_model(model_name):
    if model_name in model_cache:
        return model_cache[model_name]

    # Model paths
    base_model_path = os.path.join("models", model_name)
    vae_model_path = "stabilityai/sd-vae-ft-mse"
    ip_ckpt = "ip-adapter-faceid_sd15.bin"
    device = "cuda"

    # Check if the base model path exists
    if not os.path.exists(base_model_path):
        raise FileNotFoundError(f"Base model path {base_model_path} does not exist.")

    noise_scheduler = DDIMScheduler(
        num_train_timesteps=1000,
        beta_start=0.00085,
        beta_end=0.012,
        beta_schedule="scaled_linear",
        clip_sample=False,
        set_alpha_to_one=False,
        steps_offset=1,
    )

    # Load model components
    vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)
    pipe = StableDiffusionPipeline.from_single_file(
        base_model_path,
        scheduler=noise_scheduler,
        torch_dtype=torch.float16,
        vae=vae,
        feature_extractor=None,
        safety_checker=None,
    ).to(device)

    ip_model = IPAdapterFaceID(pipe, ip_ckpt, device)  # Assuming this is correct

    # Cache the model
    model_cache[model_name] = ip_model
    return ip_model

# Function to process image and generate output
def generate_image(input_image, positive_prompt, negative_prompt, model_name):
    # Load and prepare the model
    ip_model = load_model(model_name)

    # Convert input image to the format expected by the model
    input_image = input_image.convert("RGB")
    input_image = cv2.cvtColor(np.array(input_image), cv2.COLOR_RGB2BGR)
    app = FaceAnalysis(
        name="buffalo_l", providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
    )
    app.prepare(ctx_id=0, det_size=(640, 640))
    faces = app.get(input_image)
    if not faces:
        raise ValueError("No faces found in the image.")

    faceid_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)

    # Generate the image
    generated_images = ip_model.generate(
        prompt=positive_prompt,
        negative_prompt=negative_prompt,
        faceid_embeds=faceid_embeds,
        num_samples=1,
        width=512,
        height=768,
        num_inference_steps=30,
        seed=2023,
    )

The console output:

Found model files: ['Realistic_Vision_V5.1.safetensors']
Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.
G:\IP-Adapter-FaceID\venv\lib\site-packages\transformers\models\clip\feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.
  warnings.warn(
G:\IP-Adapter-FaceID\venv\lib\site-packages\diffusers\pipelines\pipeline_utils.py:761: FutureWarning: `torch_dtype` is deprecated and will be removed in version 0.25.0.
  deprecate("torch_dtype", "0.25.0", "")
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CUDAExecutionProvider': {'cudnn_conv_algo_search': 'EXHAUSTIVE', 'device_id': '0', 'cudnn_conv1d_pad_to_nc1d': '0', 'has_user_compute_stream': '0', 'gpu_external_alloc': '0', 'enable_cuda_graph': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_free': '0', 'gpu_external_empty_cache': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'cudnn_conv_use_max_workspace': '1', 'tunable_op_enable': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0'}, 'CPUExecutionProvider': {}}
find model: C:\Users\King/.insightface\models\buffalo_l\1k3d68.onnx landmark_3d_68 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CUDAExecutionProvider': {'cudnn_conv_algo_search': 'EXHAUSTIVE', 'device_id': '0', 'cudnn_conv1d_pad_to_nc1d': '0', 'has_user_compute_stream': '0', 'gpu_external_alloc': '0', 'enable_cuda_graph': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_free': '0', 'gpu_external_empty_cache': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'cudnn_conv_use_max_workspace': '1', 'tunable_op_enable': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0'}, 'CPUExecutionProvider': {}}
find model: C:\Users\King/.insightface\models\buffalo_l\2d106det.onnx landmark_2d_106 ['None', 3, 192, 192] 0.0 1.0
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CUDAExecutionProvider': {'cudnn_conv_algo_search': 'EXHAUSTIVE', 'device_id': '0', 'cudnn_conv1d_pad_to_nc1d': '0', 'has_user_compute_stream': '0', 'gpu_external_alloc': '0', 'enable_cuda_graph': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_free': '0', 'gpu_external_empty_cache': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'cudnn_conv_use_max_workspace': '1', 'tunable_op_enable': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0'}, 'CPUExecutionProvider': {}}
find model: C:\Users\King/.insightface\models\buffalo_l\det_10g.onnx detection [1, 3, '?', '?'] 127.5 128.0
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CUDAExecutionProvider': {'cudnn_conv_algo_search': 'EXHAUSTIVE', 'device_id': '0', 'cudnn_conv1d_pad_to_nc1d': '0', 'has_user_compute_stream': '0', 'gpu_external_alloc': '0', 'enable_cuda_graph': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_free': '0', 'gpu_external_empty_cache': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'cudnn_conv_use_max_workspace': '1', 'tunable_op_enable': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0'}, 'CPUExecutionProvider': {}}
find model: C:\Users\King/.insightface\models\buffalo_l\genderage.onnx genderage ['None', 3, 96, 96] 0.0 1.0
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CUDAExecutionProvider': {'cudnn_conv_algo_search': 'EXHAUSTIVE', 'device_id': '0', 'cudnn_conv1d_pad_to_nc1d': '0', 'has_user_compute_stream': '0', 'gpu_external_alloc': '0', 'enable_cuda_graph': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_free': '0', 'gpu_external_empty_cache': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'cudnn_conv_use_max_workspace': '1', 'tunable_op_enable': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0'}, 'CPUExecutionProvider': {}}
find model: C:\Users\King/.insightface\models\buffalo_l\w600k_r50.onnx recognition ['None', 3, 112, 112] 127.5 127.5
set det-size: (640, 640)
G:\IP-Adapter-FaceID\venv\lib\site-packages\insightface\utils\transform.py:68: FutureWarning: `rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.
To use the future default and silence this warning we advise to pass `rcond=None`, to keep using the old, explicitly pass `rcond=-1`.
  P = np.linalg.lstsq(X_homo, Y)[0].T # Affine matrix. 3 x 4
100%|██████████| 30/30 [00:03<00:00,  8.21it/s]
FurkanGozukara commented 9 months ago

Here is my pip freeze:

(venv) G:\IP-Adapter-FaceID\venv\Scripts>pip freeze
accelerate==0.25.0
aiofiles==23.2.1
albumentations==1.3.1
altair==5.2.0
annotated-types==0.6.0
antlr4-python3-runtime==4.9.3
anyio==3.7.1
attrs==23.1.0
certifi==2022.12.7
charset-normalizer==2.1.1
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
contourpy==1.2.0
cycler==0.12.1
Cython==3.0.7
diffusers==0.24.0
easydict==1.11
einops==0.7.0
exceptiongroup==1.2.0
fastapi==0.105.0
ffmpy==0.3.1
filelock==3.9.0
flatbuffers==23.5.26
fonttools==4.47.0
fsspec==2023.12.2
gradio==4.11.0
gradio_client==0.7.3
h11==0.14.0
httpcore==1.0.2
httpx==0.26.0
huggingface-hub==0.20.1
humanfriendly==10.0
idna==3.4
imageio==2.33.1
importlib-metadata==7.0.0
importlib-resources==6.1.1
insightface==0.7.3
ip-adapter @ git+https://github.com/tencent-ailab/IP-Adapter.git@6843f295d4a7c651d243e84667a197b68591a980
Jinja2==3.1.2
joblib==1.3.2
jsonschema==4.20.0
jsonschema-specifications==2023.11.2
kiwisolver==1.4.5
lazy_loader==0.3
markdown-it-py==3.0.0
MarkupSafe==2.1.3
matplotlib==3.8.2
mdurl==0.1.2
mpmath==1.3.0
networkx==3.0
numpy==1.24.1
omegaconf==2.3.0
onnx==1.15.0
onnxruntime-gpu==1.16.3
opencv-python-headless==4.8.1.78
orjson==3.9.10
packaging==23.2
pandas==2.1.4
Pillow==9.3.0
prettytable==3.9.0
protobuf==4.25.1
psutil==5.9.7
pydantic==2.5.2
pydantic_core==2.14.5
pydub==0.25.1
Pygments==2.17.2
pyparsing==3.1.1
pyreadline3==3.4.1
python-dateutil==2.8.2
python-multipart==0.0.6
pytz==2023.3.post1
PyYAML==6.0.1
qudida==0.0.4
referencing==0.32.0
regex==2023.10.3
requests==2.28.1
rich==13.7.0
rpds-py==0.15.2
safetensors==0.4.1
scikit-image==0.22.0
scikit-learn==1.3.2
scipy==1.11.4
semantic-version==2.10.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.0
starlette==0.27.0
sympy==1.12
threadpoolctl==3.2.0
tifffile==2023.12.9
tokenizers==0.15.0
tomlkit==0.12.0
toolz==0.12.0
torch==2.1.2+cu118
torchaudio==2.1.2+cu118
torchvision==0.16.2+cu118
tqdm==4.66.1
transformers==4.36.2
typer==0.9.0
typing_extensions==4.9.0
tzdata==2023.3
urllib3==1.26.13
uvicorn==0.25.0
wcwidth==0.2.12
websockets==11.0.3
zipp==3.17.0

(venv) G:\IP-Adapter-FaceID\venv\Scripts>
xiaohu2015 commented 9 months ago

Can you test this model? https://huggingface.co/SG161222/Realistic_Vision_V4.0_noVAE/tree/main

I will also test your code next week.

xiaohu2015 commented 9 months ago

[image]

prompt = "photo of a man wearing a white suit a garden"
negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality, blurry"

pipe = StableDiffusionPipeline.from_single_file(
        base_model_path,
        scheduler=noise_scheduler, # DDIM works well
        torch_dtype=torch.float16,
        vae=vae,
        feature_extractor=None,
        safety_checker=None,
    ).to(device)
xiaohu2015 commented 9 months ago

Yes, I used Realistic_Vision_V5.1 to generate it.

You can convert the safetensors model to a diffusers model:

pipe = StableDiffusionPipeline.from_single_file("Realistic_Vision_V5.1.safetensors")
pipe.save_pretrained("Realistic_Vision_V5.1")

Then you can use:

pipe = StableDiffusionPipeline.from_pretrained(
        base_model_path,
        scheduler=noise_scheduler, # DDIM works well
        torch_dtype=torch.float16,
        vae=vae,
        feature_extractor=None,
        safety_checker=None,
    ).to(device)
FurkanGozukara commented 9 months ago

monochrome, lowres, bad anatomy, worst quality, low quality, blurry

When I use the diffusers format it works, but why is loading from a single file not working? It doesn't make sense.

Also, I am preparing this Gradio app for people, and they really wouldn't like having double the storage size.

[image]

xiaohu2015 commented 9 months ago

I also don't know why.