Describe the bug

clip guided generates a very poor result，

this is my code:

`logger = logging.get_logger(name) # pylint: disable=invalid-name

class CLIPGuidedStableDiffusionPipeline(DiffusionPipeline): r""" Pipeline for text-to-image generation using Stable Diffusion. This model inherits from [DiffusionPipeline]. Check the superclass documentation for the generic methods the library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) Args: vae ([AutoencoderKL]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. text_encoder ([CLIPTextModel]): Frozen text-encoder. Stable Diffusion uses the text portion of CLIP, specifically the clip-vit-large-patch14 variant. tokenizer (CLIPTokenizer): Tokenizer of class CLIPTokenizer. unet ([UNet2DConditionModel]): Conditional U-Net architecture to denoise the encoded image latents. scheduler ([SchedulerMixin]): A scheduler to be used in combination with unet to denoise the encoded image latents. Can be one of [DDIMScheduler], [LMSDiscreteScheduler], or [PNDMScheduler]. safety_checker ([StableDiffusionSafetyChecker]): Classification module that estimates whether generated images could be considered offensive or harmful. Please, refer to the model card for details. feature_extractor ([CLIPFeatureExtractor]): Model that extracts features from generated images to be used as inputs for the safety_checker. """ _optional_components = ["safety_checker", "feature_extractor"]

def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    clip_model: CLIPModel,
    clip_tokenizer: CLIPTokenizer,
    tokenizer: CLIPTokenizer,
    unet: UNet2DConditionModel,
    scheduler: Union[
        DDIMScheduler,
        PNDMScheduler,
        LMSDiscreteScheduler,
        EulerDiscreteScheduler,
        EulerAncestralDiscreteScheduler,
        DPMSolverMultistepScheduler,
    ],
    safety_checker: StableDiffusionSafetyChecker,
    feature_extractor: CLIPFeatureExtractor,
    requires_safety_checker: bool = True,
):
    super().__init__()

    if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
        deprecation_message = (
            f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
            f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
            "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
            " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
            " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
            " file"
        )
        deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
        new_config = dict(scheduler.config)
        new_config["steps_offset"] = 1
        scheduler._internal_dict = FrozenDict(new_config)

    if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
        deprecation_message = (
            f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
            " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
            " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
            " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
            " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
        )
        deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
        new_config = dict(scheduler.config)
        new_config["clip_sample"] = False
        scheduler._internal_dict = FrozenDict(new_config)

    if safety_checker is None and requires_safety_checker:
        logger.warning(
            f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
            " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
            " results in services or applications open to the public. Both the diffusers team and Hugging Face"
            " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
            " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
            " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
        )

    if safety_checker is not None and feature_extractor is None:
        raise ValueError(
            "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
            " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
        )

    is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
        version.parse(unet.config._diffusers_version).base_version
    ) < version.parse("0.9.0.dev0")
    is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
    if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
        deprecation_message = (
            "The configuration file of the unet has set the default `sample_size` to smaller than"
            " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
            " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
            " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
            " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
            " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
            " in the config might lead to incorrect results in future versions. If you have downloaded this"
            " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
            " the `unet/config.json` file"
        )
        deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
        new_config = dict(unet.config)
        new_config["sample_size"] = 64
        unet._internal_dict = FrozenDict(new_config)

    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        clip_model=clip_model,
        clip_tokenizer=clip_tokenizer,
        tokenizer=tokenizer,
        unet=unet,
        scheduler=scheduler,
        safety_checker=safety_checker,
        feature_extractor=feature_extractor,
    )
    self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
    self.register_to_config(requires_safety_checker=requires_safety_checker)
    self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
    cut_out_size = (
        feature_extractor.size
        if isinstance(feature_extractor.size, int)
        else feature_extractor.size["shortest_edge"]
    )
    self.make_cutouts = MakeCutouts(cut_out_size)

    set_requires_grad(self.text_encoder, False)
    set_requires_grad(self.clip_model, False)
def enable_xformers_memory_efficient_attention(self):
    r"""
    Enable memory efficient attention as implemented in xformers.
    When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
    time. Speed up at training time is not guaranteed.
    Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
    is used.
    """
    self.unet.set_use_memory_efficient_attention_xformers(True)

def disable_xformers_memory_efficient_attention(self):
    r"""
    Disable memory efficient attention as implemented in xformers.
    """
    self.unet.set_use_memory_efficient_attention_xformers(False)

def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
    r"""
    Enable sliced attention computation.
    When this option is enabled, the attention module will split the input tensor in slices, to compute attention
    in several steps. This is useful to save some memory in exchange for a small speed decrease.
    Args:
        slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
            When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
            a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
            `attention_head_dim` must be a multiple of `slice_size`.
    """
    if slice_size == "auto":
        if isinstance(self.unet.config.attention_head_dim, int):
            # half the attention head size is usually a good trade-off between
            # speed and memory
            slice_size = self.unet.config.attention_head_dim // 2
        else:
            # if `attention_head_dim` is a list, take the smallest head size
            slice_size = min(self.unet.config.attention_head_dim)

    self.unet.set_attention_slice(slice_size)

def disable_attention_slicing(self):
    r"""
    Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
    back to computing attention in one step.
    """
    # set slice_size = `None` to disable `attention slicing`
    self.enable_attention_slicing(None)

def enable_vae_slicing(self):
    r"""
    Enable sliced VAE decoding.
    When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
    steps. This is useful to save some memory and allow larger batch sizes.
    """
    self.vae.enable_slicing()

def disable_vae_slicing(self):
    r"""
    Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
    computing decoding in one step.
    """
    self.vae.disable_slicing()

def enable_sequential_cpu_offload(self, gpu_id=0):
    r"""
    Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
    text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
    `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
    """
    if is_accelerate_available():
        from accelerate import cpu_offload
    else:
        raise ImportError("Please install accelerate via `pip install accelerate`")

    device = torch.device(f"cuda:{gpu_id}")

    for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
        if cpu_offloaded_model is not None:
            cpu_offload(cpu_offloaded_model, device)

    if self.safety_checker is not None:
        # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
        # fix by only offloading self.safety_checker for now
        cpu_offload(self.safety_checker.vision_model, device)

@property
def _execution_device(self):
    r"""
    Returns the device on which the pipeline's models will be executed. After calling
    `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
    hooks.
    """
    if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
        return self.device
    for module in self.unet.modules():
        if (
            hasattr(module, "_hf_hook")
            and hasattr(module._hf_hook, "execution_device")
            and module._hf_hook.execution_device is not None
        ):
            return torch.device(module._hf_hook.execution_device)
    return self.device

def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
    r"""
    Encodes the prompt into text encoder hidden states.
    Args:
        prompt (`str` or `list(int)`):
            prompt to be encoded
        device: (`torch.device`):
            torch device
        num_images_per_prompt (`int`):
            number of images that should be generated per prompt
        do_classifier_free_guidance (`bool`):
            whether to use classifier free guidance or not
        negative_prompt (`str` or `List[str]`):
            The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
            if `guidance_scale` is less than `1`).
    """
    batch_size = len(prompt) if isinstance(prompt, list) else 1

    text_inputs = self.tokenizer(
        prompt,
        padding="max_length",
        max_length=self.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids

    if not torch.equal(text_input_ids, untruncated_ids):
        removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
        logger.warning(
            "The following part of your input was truncated because CLIP can only handle sequences up to"
            f" {self.tokenizer.model_max_length} tokens: {removed_text}"
        )

    if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
        attention_mask = text_inputs.attention_mask.to(device)
    else:
        attention_mask = None

    text_embeddings = self.text_encoder(
        text_input_ids.to(device),
        attention_mask=attention_mask,
    )
    text_embeddings = text_embeddings[0]

    # duplicate text embeddings for each generation per prompt, using mps friendly method
    bs_embed, seq_len, _ = text_embeddings.shape
    text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
    text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)

    # get unconditional embeddings for classifier free guidance
    if do_classifier_free_guidance:
        uncond_tokens: List[str]
        if negative_prompt is None:
            uncond_tokens = [""] * batch_size
        elif type(prompt) is not type(negative_prompt):
            raise TypeError(
                f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                f" {type(prompt)}."
            )
        elif isinstance(negative_prompt, str):
            uncond_tokens = [negative_prompt]
        elif batch_size != len(negative_prompt):
            raise ValueError(
                f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                " the batch size of `prompt`."
            )
        else:
            uncond_tokens = negative_prompt

        max_length = text_input_ids.shape[-1]
        uncond_input = self.tokenizer(
            uncond_tokens,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )

        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
            attention_mask = uncond_input.attention_mask.to(device)
        else:
            attention_mask = None

        uncond_embeddings = self.text_encoder(
            uncond_input.input_ids.to(device),
            attention_mask=attention_mask,
        )
        uncond_embeddings = uncond_embeddings[0]

        # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
        seq_len = uncond_embeddings.shape[1]
        uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1)
        uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)

        # For classifier free guidance, we need to do two forward passes.
        # Here we concatenate the unconditional and text embeddings into a single batch
        # to avoid doing two forward passes
        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

    return text_embeddings

def run_safety_checker(self, image, device, dtype):
    if self.safety_checker is not None:
        safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device)
        image, has_nsfw_concept = self.safety_checker(
            images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
        )
    else:
        has_nsfw_concept = None
    return image, has_nsfw_concept

def decode_latents(self, latents):
    latents = 1 / 0.18215 * latents
    image = self.vae.decode(latents).sample
    image = (image / 2 + 0.5).clamp(0, 1)
    # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
    image = image.cpu().permute(0, 2, 3, 1).float().numpy()
    return image

def prepare_extra_step_kwargs(self, generator, eta):
    # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
    # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
    # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
    # and should be between [0, 1]

    accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
    extra_step_kwargs = {}
    if accepts_eta:
        extra_step_kwargs["eta"] = eta

    # check if the scheduler accepts generator
    accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
    if accepts_generator:
        extra_step_kwargs["generator"] = generator
    return extra_step_kwargs

def check_inputs(self, prompt, height, width, callback_steps):
    if not isinstance(prompt, str) and not isinstance(prompt, list):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

    if (callback_steps is None) or (
        callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
    ):
        raise ValueError(
            f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
            f" {type(callback_steps)}."
        )

def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
    shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
    if latents is None:
        if device.type == "mps":
            # randn does not work reproducibly on mps
            latents = torch.randn(shape, generator=generator, device="cpu", dtype=dtype).to(device)
        else:
            latents = torch.randn(shape, generator=generator, device=device, dtype=dtype)
    else:
        if latents.shape != shape:
            raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
        latents = latents.to(device)

    # scale the initial noise by the standard deviation required by the scheduler
    latents = latents * self.scheduler.init_noise_sigma
    return latents

@torch.enable_grad()
def cond_fn(
    self,
    latents,
    timestep,
    index,
    text_embeddings,
    noise_pred_original,
    text_embeddings_clip,
    clip_guidance_scale,
    num_cutouts,
    use_cutouts=True,
):
    latents = latents.detach().requires_grad_()

    if isinstance(self.scheduler, (LMSDiscreteScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler)):
        sigma = self.scheduler.sigmas[index]
        # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
        latent_model_input = latents / ((sigma**2 + 1) ** 0.5)
    else:
        latent_model_input = latents

    # predict the noise residual
    noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample

    if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler)):
        alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
        beta_prod_t = 1 - alpha_prod_t
        # compute predicted original sample from predicted noise also called
        # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
        pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5)

        fac = torch.sqrt(beta_prod_t)
        sample = pred_original_sample * (fac) + latents * (1 - fac)
    elif isinstance(self.scheduler,  (LMSDiscreteScheduler, EulerDiscreteScheduler, EulerAncestralDiscreteScheduler)):
        sigma = self.scheduler.sigmas[index]
        sample = latents - sigma * noise_pred
    else:
        raise ValueError(f"scheduler type {type(self.scheduler)} not supported")

    sample = 1 / 0.18215 * sample
    image = self.vae.decode(sample).sample
    image = (image / 2 + 0.5).clamp(0, 1)

    if use_cutouts:
        image = self.make_cutouts(image, num_cutouts)
    else:
        image = transforms.Resize(feature_extractor.size["shortest_edge"])(image)
    image = self.normalize(image).to(latents.dtype)

    image_embeddings_clip = self.clip_model.get_image_features(image)
    image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, dim=-1, keepdim=True)

    if use_cutouts:
        dists = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip)
        dists = dists.view([num_cutouts, sample.shape[0], -1])
        loss = dists.sum(2).mean(0).sum() * clip_guidance_scale
    else:
        loss = spherical_dist_loss(image_embeddings_clip, text_embeddings_clip).mean() * clip_guidance_scale

    grads = -torch.autograd.grad(loss, latents)[0]

    if isinstance(self.scheduler, (LMSDiscreteScheduler, EulerAncestralDiscreteScheduler)):
        latents = latents.detach() + grads * (sigma**2)
        noise_pred = noise_pred_original
    else:
        noise_pred = noise_pred_original - torch.sqrt(beta_prod_t) * grads
    return noise_pred, latents    

@torch.no_grad()
def __call__(
    self,
    prompt: Union[str, List[str]],
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    clip_guidance_scale: Optional[float] = 100,
    clip_prompt: Optional[Union[str, List[str]]] = None,
    eta: float = 0.0,
    num_cutouts: Optional[int] = 4,
    use_cutouts: Optional[bool] = True,
    generator: Optional[torch.Generator] = None,
    latents: Optional[torch.FloatTensor] = None,
    output_type: Optional[str] = "pil",
    max_embeddings_multiples: Optional[int] = 3,
    return_dict: bool = True,
    callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
    callback_steps: Optional[int] = 1,
):

    # 0. Default height and width to unet
    height = height or self.unet.config.sample_size * self.vae_scale_factor
    width = width or self.unet.config.sample_size * self.vae_scale_factor

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(prompt, height, width, callback_steps)

    # 2. Define call parameters
    batch_size = 1 if isinstance(prompt, str) else len(prompt)
    device = self._execution_device
    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
    # corresponds to doing no classifier free guidance.
    do_classifier_free_guidance = guidance_scale > 1.0

    # 3. Encode input prompt
    text_embeddings = self._encode_prompt(
        prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
    )

    if clip_guidance_scale > 0:
        if clip_prompt is not None:
            clip_text_input = self.clip_tokenizer(
                clip_prompt,
                padding="max_length",
                max_length=self.clip_tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            ).input_ids.to(self.device)
        else:
            # duplicate text embeddings clip for each generation per prompt
            text_input = self.clip_tokenizer(
                    prompt,
                    padding="max_length",
                    max_length= self.clip_tokenizer.model_max_length,
                    truncation=True,
                    return_tensors="pt",
                            )
        clip_text_input = text_input.input_ids.to(self.device)
        text_embeddings_clip = self.clip_model.get_text_features(clip_text_input)
        text_embeddings_clip = text_embeddings_clip / text_embeddings_clip.norm(p=2, dim=-1, keepdim=True)
        # duplicate text embeddings clip for each generation per prompt
        text_embeddings_clip = text_embeddings_clip.repeat_interleave(num_images_per_prompt, dim=0)

    do_classifier_free_guidance = guidance_scale > 1.0
    # 4. Prepare timesteps
    self.scheduler.set_timesteps(num_inference_steps, device=device)
    timesteps = self.scheduler.timesteps

    # 5. Prepare latent variables
    num_channels_latents = self.unet.in_channels
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        text_embeddings.dtype,
        device,
        generator,
        latents,
    )

    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 7. Denoising loop
    num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # perform clip guidance
            if clip_guidance_scale > 0:
                text_embeddings_for_guidance = (
                    text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings
                )
                noise_pred, latents = self.cond_fn(
                    latents,
                    t,
                    i,
                    text_embeddings_for_guidance,
                    noise_pred,
                    text_embeddings_clip,
                    clip_guidance_scale,
                    num_cutouts,
                    use_cutouts,
                )

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

            # call the callback, if provided
            if (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    callback(i, t, latents)

    # 8. Post-processing
    image = self.decode_latents(latents)

    # 9. Run safety checker
    image, has_nsfw_concept = self.run_safety_checker(image, device, text_embeddings.dtype)

    # 10. Convert to PIL
    if output_type == "pil":
        image = self.numpy_to_pil(image)

    if not return_dict:
        return (image, has_nsfw_concept)

    return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)`

`import torch from diffusers import EulerDiscreteScheduler,DDIMScheduler from transformers import CLIPFeatureExtractor, CLIPModel

model_name= r"D:\stable_diffusion模型合辑\stabilityai_stable-diffusion-2" clip_model_path= r'D:\CLIP_models\clip-vit-large-patch14-336' model_name= r"D:\stable_diffusion模型合辑\stabilityai_stable-diffusion-2"

feature_extractor = CLIPFeatureExtractor.from_pretrained(clip_model_path) clip_model = CLIPModel.from_pretrained(clip_model_path, torch_dtype=torch.float16) clip_tokenizer= CLIPTokenizer.from_pretrained(clip_model_path)

Use the Euler scheduler here instead

scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler", prediction_type="v_prediction")

Setup the scheduler and pipeline

scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False, prediction_type="v_prediction") # <- make sure we are doing v_prediction

guided_pipeline = CLIPGuidedStableDiffusionPipeline.from_pretrained( pretrained_model_name_or_path=model_name, clip_model=clip_model, clip_tokenizer= clip_tokenizer, feature_extractor=feature_extractor, local_files_only=True, safety_checker=None, revision="fp16", torch_dtype=torch.float16, ) guided_pipeline.enable_attention_slicing(5) guided_pipeline = guided_pipeline.to("cuda") `

`#prompt = "fantasy book cover, full moon, fantasy forest landscape, golden vector elements, fantasy magic, dark light night, intricate, elegant, sharp focus, illustration, highly detailed, digital painting, concept art, matte, art by WLOP and Artgerm and Albert Bierstadt, masterpiece" negative_prompt= '''lowres, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, artist name, (((ugly))), (((duplicate))), ((morbid)), ((mutilated)), out of frame, extra fingers, mutated hands, ((poorly drawn hands)), ((poorly drawn face)), (((mutation))), (((deformed))), ((blurry)), ((bad anatomy)), (((bad proportions))), ((extra limbs)), cloned face, (((disfigured))), bad proportions, (malformed limbs), ((missing arms)), ((missing legs)), (((extra arms))), (((extra legs))), (fused fingers), (too many fingers)''' prompt= 'a big fat cat'

generator = torch.Generator(device="cuda").manual_seed(844440) images = [] for i in range(4): image = guided_pipeline( prompt, negative_prompt=negative_prompt, num_inference_steps=65, guidance_scale=7.5, clip_guidance_scale=100, num_cutouts=4, use_cutouts=True, generator=generator, ).images[0]

image.save(f"clip_sd_imgs\image_{i}.png") # save images locally
images.append(image)

Reproduction

No response

Logs

No response

System Info

diffusers==0.9.0

huggingface / diffusers

Can some of you help me look at my code? #1473