Closed cthulhu-tww closed 1 month ago
hi, this is the same as #7033; it will be handled in a future release: https://github.com/huggingface/diffusers/issues/7033#issuecomment-1953427912
ok thank you
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
Please note that issues that do not follow the contributing guidelines are likely to be ignored.
I got a similar error. I created a new class `Region_AnimateDiffPipeline` and pass `cross_attention_kwargs` through to a custom `RegionAnimatediff_Processor`, but I get this warning:

```
cross_attention_kwargs ['region_list', 'height', 'width'] are not expected by RegionAnimatediff_Processor and will be ignored.
```
```python
class Region_AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
    r"""
    Pipeline for text-to-video generation.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        tokenizer (`CLIPTokenizer`):
            A [`~transformers.CLIPTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
        motion_adapter ([`MotionAdapter`]):
            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    """

    model_cpu_offload_seq = "text_encoder->unet->vae"

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        motion_adapter: MotionAdapter,
        scheduler: Union[
            DDIMScheduler,
            PNDMScheduler,
            LMSDiscreteScheduler,
            EulerDiscreteScheduler,
            EulerAncestralDiscreteScheduler,
            DPMSolverMultistepScheduler,
        ],
    ):
        super().__init__()
        unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            motion_adapter=motion_adapter,
            scheduler=scheduler,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        self.set_region_processor()

    def set_region_processor(self):
        unet_lora_attn_procs = {}
        for name, attn_processor in self.unet.attn_processors.items():
            if "motion_modules" not in name:
                new_module = RegionAnimatediff_Processor().to(self.unet.device)
                unet_lora_attn_procs[name] = new_module
            else:
                unet_lora_attn_procs[name] = attn_processor
        self.unet.set_attn_processor(unet_lora_attn_procs)

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        num_frames: Optional[int] = 16,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_videos_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: Optional[int] = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        clip_skip: Optional[int] = None,
    ):
        # Initialize cross_attention_kwargs if None
        if cross_attention_kwargs is None:
            cross_attention_kwargs = {}

        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
        num_videos_per_prompt = 1

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
        )

        # 2. Define call parameters
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0

        # 3. Encode input prompt
        text_encoder_lora_scale = (
            cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
        )
        # prompt_embeds, negative_prompt_embeds = self.encode_prompt(
        #     prompt,
        #     device,
        #     num_videos_per_prompt,
        #     do_classifier_free_guidance,
        #     negative_prompt,
        #     prompt_embeds=prompt_embeds,
        #     negative_prompt_embeds=negative_prompt_embeds,
        #     lora_scale=text_encoder_lora_scale,
        #     clip_skip=clip_skip,
        # )
        # # For classifier free guidance, we need to do two forward passes.
        # # Here we concatenate the unconditional and text embeddings into a single batch
        # # to avoid doing two forward passes
        # if do_classifier_free_guidance:
        #     prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
        prompt_embeds, region_list = self.encode_region_prompt(
            prompt,
            device,
            num_videos_per_prompt,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )

        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            num_channels_latents,
            num_frames,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        if cross_attention_kwargs is None:
            cross_attention_kwargs = {}

        b, l, d = prompt_embeds.size()
        region_list_repeat = []
        for idx in range(len(region_list)):
            region_emb, region_pos = region_list[idx]
            region_emb_repeat = region_emb[:, None].repeat(1, num_frames, 1, 1).reshape(b * num_frames, l, d)
            region_list_repeat.append((region_emb_repeat, region_pos))
        cross_attention_kwargs.update({
            'region_list': region_list_repeat,
            'height': height,
            'width': width,
        })
        print(cross_attention_kwargs)

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample
```
```python
import math

import torch
from torch import einsum, nn
from einops import rearrange


class RegionAnimatediff_Processor(nn.Module):
    def __init__(self):
        super().__init__()

    def region_rewrite(self, attn, hidden_states, query, region_list, height, width):
        def get_region_mask(region_list, feat_height, feat_width):
            exclusive_mask = torch.zeros((feat_height, feat_width))
            for region in region_list:
                start_h, start_w, end_h, end_w = region[-1]
                start_h, start_w, end_h, end_w = math.ceil(start_h * feat_height), math.ceil(
                    start_w * feat_width), math.floor(end_h * feat_height), math.floor(end_w * feat_width)
                exclusive_mask[start_h:end_h, start_w:end_w] += 1
            return exclusive_mask

        dtype = query.dtype
        seq_lens = query.shape[1]
        downscale = math.sqrt(height * width / seq_lens)

        # 0: context, >=1: may be overlap
        feat_height, feat_width = int(height // downscale), int(width // downscale)
        region_mask = get_region_mask(region_list, feat_height, feat_width)

        query = rearrange(query, 'b (h w) c -> b h w c', h=feat_height, w=feat_width)
        hidden_states = rearrange(hidden_states, 'b (h w) c -> b h w c', h=feat_height, w=feat_width)

        new_hidden_state = torch.zeros_like(hidden_states)
        new_hidden_state[:, region_mask == 0, :] = hidden_states[:, region_mask == 0, :]

        replace_ratio = 1.0
        new_hidden_state[:, region_mask != 0, :] = (1 - replace_ratio) * hidden_states[:, region_mask != 0, :]

        for region in region_list:
            region_key, region_value, region_box = region

            if attn.upcast_attention:
                query = query.float()
                region_key = region_key.float()

            start_h, start_w, end_h, end_w = region_box
            start_h, start_w, end_h, end_w = math.ceil(start_h * feat_height), math.ceil(
                start_w * feat_width), math.floor(end_h * feat_height), math.floor(end_w * feat_width)

            # print("we are here now")
            # print(region_key.size())
            # print(query.size())
            attention_region = einsum(
                'b h w c, b n c -> b h w n', query[:, start_h:end_h, start_w:end_w, :], region_key
            ) * attn.scale
            if attn.upcast_softmax:
                attention_region = attention_region.float()

            attention_region = attention_region.softmax(dim=-1)
            attention_region = attention_region.to(dtype)

            hidden_state_region = einsum('b h w n, b n c -> b h w c', attention_region, region_value)
            new_hidden_state[:, start_h:end_h, start_w:end_w, :] += \
                replace_ratio * (hidden_state_region / (
                    region_mask.reshape(1, *region_mask.shape, 1)[:, start_h:end_h, start_w:end_w, :]
                ).to(query.device))

        new_hidden_state = rearrange(new_hidden_state, 'b h w c -> b (h w) c')
        return new_hidden_state

    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None,
                 scale=1.0, temb=None, **cross_attention_kwargs):
        residual = hidden_states
        is_cross = encoder_hidden_states is not None

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )
        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        query = attn.head_to_batch_dim(query)
        key = attn.head_to_batch_dim(key)
        value = attn.head_to_batch_dim(value)

        attention_probs = attn.get_attention_scores(query, key, attention_mask)
        hidden_states = torch.bmm(attention_probs, value)

        if is_cross:
            region_list = []
            for region in cross_attention_kwargs['region_list']:
                region_key = attn.to_k(region[0])
                region_value = attn.to_v(region[0])
                region_key = attn.head_to_batch_dim(region_key)
                region_value = attn.head_to_batch_dim(region_value)
                region_list.append((region_key, region_value, region[1]))

            hidden_states = self.region_rewrite(
                attn=attn,
                hidden_states=hidden_states,
                query=query,
                region_list=region_list,
                height=cross_attention_kwargs['height'],
                width=cross_attention_kwargs['width'])

        hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states
```
@JustinKai0527
In your `RegionAnimatediff_Processor` definition, if you explicitly list these arguments in the signature, you won't get the warning:

```python
class RegionAnimatediff_Processor(nn.Module):
    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None,
                 scale=1.0, temb=None, region_list=None, height=None, width=None):
        ...
```
The original issue is fixed, so I'm closing for now! @JustinKai0527 feel free to open a new issue if the solution I provided does not solve your problem :)
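For context, the warning appears because diffusers inspects the processor's `__call__` signature and drops any `cross_attention_kwargs` keys that are not named parameters there, which is why a bare `**cross_attention_kwargs` catch-all still triggers it. A simplified sketch of that filtering logic (paraphrased for illustration, not the exact diffusers source):

```python
import inspect


def filter_cross_attention_kwargs(processor, cross_attention_kwargs):
    # Collect the named parameters of the processor's __call__.
    accepted = set(inspect.signature(processor.__call__).parameters.keys())
    # Keys that are not named parameters are reported and then dropped before
    # the processor is invoked.
    unused = [k for k in cross_attention_kwargs if k not in accepted]
    if unused:
        print(f"cross_attention_kwargs {unused} are not expected by "
              f"{processor.__class__.__name__} and will be ignored.")
    return {k: v for k, v in cross_attention_kwargs.items() if k in accepted}
```

With `region_list`, `height`, and `width` declared explicitly, as in the snippet above, they survive this filter and arrive in the processor as keyword arguments.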
@yiyixuxu thanks for replying. If I change the arguments, the warning no longer appears, but what I want is to receive the `cross_attention_kwargs` inside this processor, so how do I get them? I first attach the processor to the UNet, and then when I call the pipeline I want the `cross_attention_kwargs` to flow back into the processor:
```python
        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            motion_adapter=motion_adapter,
            scheduler=scheduler,
        )
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
        self.set_region_processor()

    def set_region_processor(self):
        unet_lora_attn_procs = {}
        for name, attn_processor in self.unet.attn_processors.items():
            if "motion_modules" not in name:
                new_module = RegionAnimatediff_Processor().to(self.unet.device)
                unet_lora_attn_procs[name] = new_module
            else:
                unet_lora_attn_procs[name] = attn_processor
        self.unet.set_attn_processor(unet_lora_attn_procs)
```
```python
        cross_attention_kwargs.update({
            'region_list': region_list_repeat,
            'height': height,
            'width': width,
        })
        print(cross_attention_kwargs)

        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample
```
@cthulhu-tww it should work (the kwargs get passed to the attention processor without a warning) if you do this: https://github.com/huggingface/diffusers/issues/7695#issuecomment-2401142102 If not, please share a reproducible code example :)
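For readers hitting the same question, here is a minimal sketch of the suggested pattern. The class below is a made-up stand-in (not the full `RegionAnimatediff_Processor`); it only demonstrates that values passed through `cross_attention_kwargs` arrive as the explicitly named parameters, and it delegates the actual attention computation to the stock `AttnProcessor2_0`:

```python
from diffusers.models.attention_processor import AttnProcessor2_0


class RegionKwargsProcessor(AttnProcessor2_0):
    # Naming region_list, height, and width as parameters means the matching
    # keys in cross_attention_kwargs are forwarded here instead of being
    # filtered out with a warning.
    def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None,
                 temb=None, region_list=None, height=None, width=None, *args, **kwargs):
        if region_list is not None:
            # The region data passed at the pipeline call is available here.
            print(f"got {len(region_list)} regions for a {height}x{width} target")
        # For this sketch, fall back to the standard attention computation.
        return super().__call__(attn, hidden_states, encoder_hidden_states=encoder_hidden_states,
                                attention_mask=attention_mask, temb=temb)
```

Registering it on the UNet (as `set_region_processor` does above) and then calling the pipeline with `cross_attention_kwargs={'region_list': ..., 'height': ..., 'width': ...}` should print from inside the processor, confirming that the values flow through.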
Describe the bug
What is this error message? It doesn't cause the task to fail, it just keeps printing. Thank you.
Reproduction
```python
import torch
from diffusers import StableDiffusionXLPipeline
from diffusers.image_processor import IPAdapterMaskProcessor
from diffusers.utils import load_image
from transformers import CLIPVisionModelWithProjection

image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "./repository/h94-IP-Adapter",
    subfolder="models/image_encoder",
    torch_dtype=torch.float16,
)
pipeline = StableDiffusionXLPipeline.from_pretrained(
    "./repository/stable-diffusion-xl-base-1.0",
    image_encoder=image_encoder,
    torch_dtype=torch.float16,
    use_safetensors=True,
).to("cuda")
pipeline.enable_model_cpu_offload()

pipeline.load_ip_adapter(
    "./repository/h94-IP-Adapter",
    subfolder="sdxl_models",
    weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"] * 2,
)
pipeline.set_ip_adapter_scale([0.6] * 2)

generator = torch.Generator(device="cpu").manual_seed(4)
mask1 = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_mask1.png")
mask2 = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_mask2.png")

output_height = 1024
output_width = 1024
processor = IPAdapterMaskProcessor()
masks = processor.preprocess([mask1, mask2])

face_image1 = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl1.png")
face_image2 = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_mask_girl2.png")
ip_images = [[face_image1], [face_image2]]

image = pipeline(
    prompt="2 girls",
    ip_adapter_image=ip_images,
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    num_inference_steps=20,
    num_images_per_prompt=1,
    generator=generator,
    cross_attention_kwargs={"ip_adapter_masks": masks},
).images[0]
image.save("test.png")
```
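If it helps to pin down where the message comes from: only the IP-Adapter attention processors declare `ip_adapter_masks` in their `__call__` signature, so the remaining processors in the UNet report the key as unexpected and drop it, while the masks still reach the layers that use them. A small diagnostic sketch (assuming the `pipeline` object from the reproduction above) to list which processors do not accept the key:

```python
import inspect

# List the attention processors whose __call__ does not declare `ip_adapter_masks`;
# these are the layers that would emit the "not expected ... will be ignored" warning.
for name, proc in pipeline.unet.attn_processors.items():
    params = inspect.signature(proc.__call__).parameters
    if "ip_adapter_masks" not in params:
        print(f"{name}: {type(proc).__name__} does not accept ip_adapter_masks")
```

This matches the behaviour described above: the warning keeps printing but does not affect the generated image.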
Logs
System Info
diffusers 0.27.2 python 3.10 windows
Who can help?
No response