huggingface / diffusers

🤗 Diffusers: State-of-the-art diffusion models for image and audio generation in PyTorch and FLAX.
https://huggingface.co/docs/diffusers
Apache License 2.0

stabilityai/stable-diffusion-2-1 does not appear to have a file named config.json. #7935

Open HaoqianSong opened 4 months ago

HaoqianSong commented 4 months ago

Describe the bug

/home/pcl/Minconda/conda3/envs/DIFT/bin/python3.10 /home/pcl/DETR/SDAseg/others/DIFT-main/demo.py

Traceback (most recent call last):
  File "/home/pcl/Minconda/conda3/envs/DIFT/lib/python3.10/site-packages/diffusers/configuration_utils.py", line 337, in load_config
    config_file = hf_hub_download(
  File "/home/pcl/Minconda/conda3/envs/DIFT/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py", line 120, in _inner_fn
    return fn(*args, **kwargs)
  File "/home/pcl/Minconda/conda3/envs/DIFT/lib/python3.10/site-packages/huggingface_hub/file_download.py", line 1291, in hf_hub_download
    raise LocalEntryNotFoundError(
huggingface_hub.utils._errors.LocalEntryNotFoundError: Connection error, and we cannot find the requested files in the disk cache. Please try again or make sure your Internet connection is on.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/pcl/DETR/SDAseg/others/DIFT-main/demo.py", line 12, in <module>
    dift = SDFeaturizer()
  File "/home/pcl/DETR/SDAseg/others/DIFT-main/src/models/dift_sd.py", line 192, in __init__
    unet = MyUNet2DConditionModel.from_pretrained(sd_id, subfolder="unet")
  File "/home/pcl/Minconda/conda3/envs/DIFT/lib/python3.10/site-packages/diffusers/models/modeling_utils.py", line 472, in from_pretrained
    config, unused_kwargs, commit_hash = cls.load_config(
  File "/home/pcl/Minconda/conda3/envs/DIFT/lib/python3.10/site-packages/diffusers/configuration_utils.py", line 364, in load_config
    raise EnvironmentError(
OSError: stabilityai/stable-diffusion-2-1 does not appear to have a file named config.json.

Process finished with exit code 1
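
For context, the final OSError is secondary: the primary failure in the traceback is the LocalEntryNotFoundError, i.e. the download of unet/config.json failed with a connection error and the file was not in the local cache. A minimal sketch (not a fix proposed in this thread, just one way to isolate the problem) that checks whether the file can be fetched at all via huggingface_hub:

    from huggingface_hub import hf_hub_download

    # unet/config.json does exist in stabilityai/stable-diffusion-2-1; if this call
    # also fails, the problem is network access (proxy/firewall/offline machine),
    # not a missing file in the repo.
    path = hf_hub_download(
        repo_id="stabilityai/stable-diffusion-2-1",
        filename="config.json",
        subfolder="unet",
    )
    print(path)  # prints the local cache path on success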

Reproduction

raise EnvironmentError(

OSError: stabilityai/stable-diffusion-2-1 does not appear to have a file named config.json.

def __init__(self, sd_id='stabilityai/stable-diffusion-2-1', null_prompt=''):  #  /home/pcl/DETR/SDAseg/SDAseg/dift/config.json
        unet = MyUNet2DConditionModel.from_pretrained(sd_id, subfolder="unet")
        onestep_pipe = OneStepSDPipeline.from_pretrained(sd_id, unet=unet, safety_checker=None)
        onestep_pipe.vae.decoder = None
        onestep_pipe.scheduler = DDIMScheduler.from_pretrained(sd_id, subfolder="scheduler")
        gc.collect()
        onestep_pipe = onestep_pipe.to("cuda")
        onestep_pipe.enable_attention_slicing()
        onestep_pipe.enable_xformers_memory_efficient_attention()
        null_prompt_embeds = onestep_pipe._encode_prompt(
            prompt=null_prompt,
            device='cuda',
            num_images_per_prompt=1,
            do_classifier_free_guidance=False) # [1, 77, dim]
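
Every from_pretrained call in this __init__ has to resolve stabilityai/stable-diffusion-2-1 on the Hub, so one possible workaround sketch (not something proposed in the thread) is to download the checkpoint once while a connection is available and pass the local snapshot directory as sd_id, so the constructor never needs to reach the Hub:

    from huggingface_hub import snapshot_download

    # One-time download; from_pretrained accepts a local directory, so SDFeaturizer
    # can then be constructed without any further Hub round-trips.
    local_sd = snapshot_download("stabilityai/stable-diffusion-2-1")
    dift = SDFeaturizer(sd_id=local_sd)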

Logs

No response

System Info

diffusers[torch]==0.15.0

Who can help?

No response

tolgacangoz commented 4 months ago

SDFeaturizer, MyUNet2DConditionModel, and OneStepSDPipeline don't come from diffusers. Could you elaborate on them? It would be more appropriate to open this issue first in the repo they come from.

yiyixuxu commented 4 months ago

Hi:

are you getting the same error with this code?

from diffusers import UNet2DConditionModel
sd_id='stabilityai/stable-diffusion-2-1'
unet = UNet2DConditionModel.from_pretrained(sd_id, subfolder="unet")
HaoqianSong commented 4 months ago

Yes, this comes from the DIFT project (https://github.com/Tsingularity/dift/tree/main); the code is below:

from diffusers import StableDiffusionPipeline
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
from typing import Any, Callable, Dict, List, Optional, Union
from diffusers.models.unet_2d_condition import UNet2DConditionModel
from diffusers import DDIMScheduler
import gc
import os
from PIL import Image
from torchvision.transforms import PILToTensor

class MyUNet2DConditionModel(UNet2DConditionModel):
    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        up_ft_indices,
        encoder_hidden_states: torch.Tensor,
        class_labels: Optional[torch.Tensor] = None,
        timestep_cond: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None):
        r"""
        Args:
            sample (torch.FloatTensor): (batch, channel, height, width) noisy inputs tensor
            timestep (torch.FloatTensor or float or int): (batch) timesteps
            encoder_hidden_states (torch.FloatTensor): (batch, sequence_length, feature_dim) encoder hidden states
            cross_attention_kwargs (dict, optional): a kwargs dictionary that, if specified, is passed along
                to the AttnProcessor as defined under self.processor in diffusers.cross_attention
        """

    # By default samples have to be at least a multiple of the overall upsampling factor.

    # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
    # However, the upsampling interpolation output size can be forced to fit any upsampling size
    # on the fly if necessary.
    default_overall_up_factor = 2**self.num_upsamplers

    # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
    forward_upsample_size = False
    upsample_size = None

    if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
        # logger.info("Forward upsample size to force interpolation output size.")
        forward_upsample_size = True

    # prepare attention_mask
    if attention_mask is not None:
        attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
        attention_mask = attention_mask.unsqueeze(1)

    # 0. center input if necessary
    if self.config.center_input_sample:
        sample = 2 * sample - 1.0

    # 1. time
    timesteps = timestep
    if not torch.is_tensor(timesteps):
        # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
        # This would be a good case for the `match` statement (Python 3.10+)
        is_mps = sample.device.type == "mps"
        if isinstance(timestep, float):
            dtype = torch.float32 if is_mps else torch.float64
        else:
            dtype = torch.int32 if is_mps else torch.int64
        timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
    elif len(timesteps.shape) == 0:
        timesteps = timesteps[None].to(sample.device)

    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
    timesteps = timesteps.expand(sample.shape[0])

    t_emb = self.time_proj(timesteps)

    # timesteps does not contain any weights and will always return f32 tensors
    # but time_embedding might actually be running in fp16. so we need to cast here.
    # there might be better ways to encapsulate this.
    t_emb = t_emb.to(dtype=self.dtype)

    emb = self.time_embedding(t_emb, timestep_cond)

    if self.class_embedding is not None:
        if class_labels is None:
            raise ValueError("class_labels should be provided when num_class_embeds > 0")

        if self.config.class_embed_type == "timestep":
            class_labels = self.time_proj(class_labels)

        class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
        emb = emb + class_emb

    # 2. pre-process
    sample = self.conv_in(sample)

    # 3. down
    down_block_res_samples = (sample,)
    for downsample_block in self.down_blocks:
        if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
            sample, res_samples = downsample_block(
                hidden_states=sample,
                temb=emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
            )
        else:
            sample, res_samples = downsample_block(hidden_states=sample, temb=emb)

        down_block_res_samples += res_samples

    # 4. mid
    if self.mid_block is not None:
        sample = self.mid_block(
            sample,
            emb,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
            cross_attention_kwargs=cross_attention_kwargs,
        )

    # 5. up
    up_ft = {}
    for i, upsample_block in enumerate(self.up_blocks):

        if i > np.max(up_ft_indices):
            break

        is_final_block = i == len(self.up_blocks) - 1

        res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
        down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

        # if we have not reached the final block and need to forward the
        # upsample size, we do it here
        if not is_final_block and forward_upsample_size:
            upsample_size = down_block_res_samples[-1].shape[2:]

        if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
            sample = upsample_block(
                hidden_states=sample,
                temb=emb,
                res_hidden_states_tuple=res_samples,
                encoder_hidden_states=encoder_hidden_states,
                cross_attention_kwargs=cross_attention_kwargs,
                upsample_size=upsample_size,
                attention_mask=attention_mask,
            )
        else:
            sample = upsample_block(
                hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
            )

        if i in up_ft_indices:
            up_ft[i] = sample.detach()

    output = {}
    output['up_ft'] = up_ft
    return output

class OneStepSDPipeline(StableDiffusionPipeline):
    @torch.no_grad()
    def __call__(
        self,
        img_tensor,
        t,
        up_ft_indices,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None
    ):
        device = self._execution_device
        latents = self.vae.encode(img_tensor).latent_dist.sample() * self.vae.config.scaling_factor
        t = torch.tensor(t, dtype=torch.long, device=device)
        noise = torch.randn_like(latents).to(device)
        latents_noisy = self.scheduler.add_noise(latents, noise, t)
        unet_output = self.unet(latents_noisy,
                                t,
                                up_ft_indices,
                                encoder_hidden_states=prompt_embeds,
                                cross_attention_kwargs=cross_attention_kwargs)
        return unet_output

class SDFeaturizer:
    def __init__(self, sd_id='stabilityai/stable-diffusion-2-1', null_prompt=''):  # /home/pcl/DETR/SDAseg/SDAseg/dift/config.json
        unet = MyUNet2DConditionModel.from_pretrained(sd_id, subfolder="unet")
        onestep_pipe = OneStepSDPipeline.from_pretrained(sd_id, unet=unet, safety_checker=None)
        onestep_pipe.vae.decoder = None
        onestep_pipe.scheduler = DDIMScheduler.from_pretrained(sd_id, subfolder="scheduler")
        gc.collect()
        onestep_pipe = onestep_pipe.to("cuda")
        onestep_pipe.enable_attention_slicing()
        onestep_pipe.enable_xformers_memory_efficient_attention()
        null_prompt_embeds = onestep_pipe._encode_prompt(
            prompt=null_prompt,
            device='cuda',
            num_images_per_prompt=1,
            do_classifier_free_guidance=False)  # [1, 77, dim]

        self.null_prompt_embeds = null_prompt_embeds
        self.null_prompt = null_prompt
        self.pipe = onestep_pipe

@torch.no_grad()
def forward(self,
            img_tensor,
            prompt='',
            t=261,
            up_ft_index=1,
            ensemble_size=8):
    '''
    Args:
        img_tensor: should be a single torch tensor in the shape of [1, C, H, W] or [C, H, W]
        prompt: the prompt to use, a string
        t: the time step to use, should be an int in the range of [0, 1000]
        up_ft_index: which upsampling block of the U-Net to extract feature, you can choose [0, 1, 2, 3]
        ensemble_size: the number of repeated images used in the batch to extract features
    Return:
        unet_ft: a torch tensor in the shape of [1, c, h, w]
    '''
    img_tensor = img_tensor.repeat(ensemble_size, 1, 1, 1).cuda() # ensem, c, h, w
    if prompt == self.null_prompt:
        prompt_embeds = self.null_prompt_embeds
    else:
        prompt_embeds = self.pipe._encode_prompt(
            prompt=prompt,
            device='cuda',
            num_images_per_prompt=1,
            do_classifier_free_guidance=False) # [1, 77, dim]
    prompt_embeds = prompt_embeds.repeat(ensemble_size, 1, 1)
    unet_ft_all = self.pipe(
        img_tensor=img_tensor,
        t=t,
        up_ft_indices=[up_ft_index],
        prompt_embeds=prompt_embeds)
    unet_ft = unet_ft_all['up_ft'][up_ft_index] # ensem, c, h, w
    unet_ft = unet_ft.mean(0, keepdim=True) # 1,c,h,w
    return unet_ft

class SDFeaturizer4Eval(SDFeaturizer):
    def __init__(self, sd_id='stabilityai/stable-diffusion-2-1', null_prompt='', cat_list=[]):
        super().__init__(sd_id, null_prompt)
        with torch.no_grad():
            cat2prompt_embeds = {}
            for cat in cat_list:
                prompt = f"a photo of a {cat}"
                prompt_embeds = self.pipe._encode_prompt(
                    prompt=prompt,
                    device='cuda',
                    num_images_per_prompt=1,
                    do_classifier_free_guidance=False)  # [1, 77, dim]
                cat2prompt_embeds[cat] = prompt_embeds
        self.cat2prompt_embeds = cat2prompt_embeds

        self.pipe.tokenizer = None
        self.pipe.text_encoder = None
        gc.collect()
        torch.cuda.empty_cache()

@torch.no_grad()
def forward(self,
            img,
            category=None,
            img_size=[768, 768],
            t=261,
            up_ft_index=1,
            ensemble_size=8):
    if img_size is not None:
        img = img.resize(img_size)
    img_tensor = (PILToTensor()(img) / 255.0 - 0.5) * 2
    img_tensor = img_tensor.unsqueeze(0).repeat(ensemble_size, 1, 1, 1).cuda() # ensem, c, h, w
    if category in self.cat2prompt_embeds:
        prompt_embeds = self.cat2prompt_embeds[category]
    else:
        prompt_embeds = self.null_prompt_embeds
    prompt_embeds = prompt_embeds.repeat(ensemble_size, 1, 1).cuda()
    unet_ft_all = self.pipe(
        img_tensor=img_tensor,
        t=t,
        up_ft_indices=[up_ft_index],
        prompt_embeds=prompt_embeds)
    unet_ft = unet_ft_all['up_ft'][up_ft_index] # ensem, c, h, w
    unet_ft = unet_ft.mean(0, keepdim=True) # 1,c,h,w
    return unet_ft
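
As a usage reference for the SDFeaturizer.forward docstring above, here is a minimal sketch of how the featurizer is driven once loading works (the image file name is hypothetical; the preprocessing mirrors SDFeaturizer4Eval):

    from PIL import Image
    from torchvision.transforms import PILToTensor

    dift = SDFeaturizer()                                  # loads stabilityai/stable-diffusion-2-1
    img = Image.open('cat.png').convert('RGB')             # hypothetical input image
    img_tensor = (PILToTensor()(img) / 255.0 - 0.5) * 2    # [C, H, W], scaled to [-1, 1]
    ft = dift.forward(img_tensor, prompt='a photo of a cat', t=261, up_ft_index=1, ensemble_size=8)
    print(ft.shape)                                        # [1, c, h, w]
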
github-actions[bot] commented 5 days ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.