YanzuoLu / CFLD

[CVPR 2024 Highlight] Coarse-to-Fine Latent Diffusion for Pose-Guided Person Image Synthesis
MIT License
165 stars 11 forks source link

Issues for running playground.ipynb #23

Closed tonywang-sh closed 3 months ago

tonywang-sh commented 3 months ago

Running playground.ipynb fails at the line "unet = UNet(cfg).eval().requires_grad_(False).cuda()". The full error message is shown below.


ValueError                                Traceback (most recent call last)
Cell In[4], line 4
      2 vae = VariationalAutoencoder(pretrained_path="pretrained_models/vae").eval().requires_grad_(False).cuda()
      3 model = build_model(cfg).eval().requires_grad_(False).cuda()
----> 4 unet = UNet(cfg).eval().requires_grad_(False).cuda()

File /data/CFLD/models/unet.py:1913, in UNet.__init__(self, cfg)
   1910 def __init__(self, cfg):
   1911     super().__init__()
-> 1913     self.model = ResidualUNet2DConditionModel.from_pretrained(
   1914         cfg.MODEL.UNET_CONFIG.PRETRAINED_PATH, use_safetensors = False)
   1915     self.model.requires_grad_(False)
   1916     self.model.enable_xformers_memory_efficient_attention()

File /usr/local/lib/python3.9/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
    111 if check_use_auth_token:
    112     kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 114 return fn(*args, **kwargs)

File /usr/local/lib/python3.9/site-packages/diffusers/models/modeling_utils.py:650, in ModelMixin.from_pretrained(cls, pretrained_model_name_or_path, kwargs) 647 if low_cpu_mem_usage: 648 # Instantiate model with empty weights 649 with accelerate.init_empty_weights(): --> 650 model = cls.from_config(config, unused_kwargs) 652 # if device_map is None, load the state dict and move the params from meta device to the cpu 653 if device_map is None:

File /usr/local/lib/python3.9/site-packages/diffusers/configuration_utils.py:260, in ConfigMixin.from_config(cls, config, return_unused_kwargs, kwargs) 258 # Return model and optionally state and/or unused_kwargs 259 print("init_dict: ", init_dict, flush=True) --> 260 model = cls(init_dict) 262 # make sure to also save config parameters that might be used for compatible classes 263 # update _class_name 264 if "_class_name" in hidden_dict:

File /usr/local/lib/python3.9/site-packages/diffusers/configuration_utils.py:654, in register_to_config..inner_init(self, args, kwargs) 652 new_kwargs = {config_init_kwargs, new_kwargs} 653 getattr(self, "register_to_config")(new_kwargs) --> 654 init(self, args, **init_kwargs)

File /data/CFLD/models/unet.py:1564, in ResidualUNet2DConditionModel.init(self, sample_size, in_channels, out_channels, center_input_sample, flip_sin_to_cos, freq_shift, down_block_types, mid_block_type, up_block_types, only_cross_attention, block_out_channels, layers_per_block, downsample_padding, mid_block_scale_factor, act_fn, norm_num_groups, norm_eps, cross_attention_dim, transformer_layers_per_block, encoder_hid_dim, encoder_hid_dim_type, attention_head_dim, num_attention_heads, dual_cross_attention, use_linear_projection, class_embed_type, addition_embed_type, addition_time_embed_dim, num_class_embeds, upcast_attention, resnet_time_scale_shift, resnet_skip_time_act, resnet_out_scale_factor, time_embedding_type, time_embedding_dim, time_embedding_act_fn, timestep_post_act, time_cond_proj_dim, conv_in_kernel, conv_out_kernel, projection_class_embeddings_input_dim, class_embeddings_concat, mid_block_only_cross_attention, cross_attention_norm, addition_embed_type_num_heads) 1561 else: 1562 add_upsample = False -> 1564 up_block = get_residual_up_block( 1565 up_block_type, 1566 num_layers=reversed_layers_per_block[i] + 1, 1567 transformer_layers_per_block=reversed_transformer_layers_per_block[i], 1568 in_channels=input_channel, 1569 out_channels=output_channel, 1570 prev_output_channel=prev_output_channel, 1571 temb_channels=blocks_time_embed_dim, 1572 add_upsample=add_upsample, 1573 resnet_eps=norm_eps, 1574 resnet_act_fn=act_fn, 1575 resnet_groups=norm_num_groups, 1576 cross_attention_dim=reversed_cross_attention_dim[i], 1577 num_attention_heads=reversed_num_attention_heads[i], 1578 dual_cross_attention=dual_cross_attention, 1579 use_linear_projection=use_linear_projection, 1580 only_cross_attention=only_cross_attention[i], 1581 upcast_attention=upcast_attention, 1582 resnet_time_scale_shift=resnet_time_scale_shift, 1583 resnet_skip_time_act=resnet_skip_time_act, 1584 resnet_out_scale_factor=resnet_out_scale_factor, 1585 
cross_attention_norm=cross_attention_norm, 1586 attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, 1587 ) 1588 self.up_blocks.append(up_block) 1589 prev_output_channel = output_channel

File /data/CFLD/models/unet.py:1061, in get_residual_up_block(up_block_type, num_layers, in_channels, out_channels, prev_output_channel, temb_channels, add_upsample, resnet_eps, resnet_act_fn, transformer_layers_per_block, num_attention_heads, resnet_groups, cross_attention_dim, dual_cross_attention, use_linear_projection, only_cross_attention, upcast_attention, resnet_time_scale_shift, resnet_skip_time_act, resnet_out_scale_factor, cross_attention_norm, attention_head_dim, upsample_type) 1059 if cross_attention_dim is None: 1060 raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D") -> 1061 return ResidualCrossAttnUpBlock2D( 1062 num_layers=num_layers, 1063 transformer_layers_per_block=transformer_layers_per_block, 1064 in_channels=in_channels, 1065 out_channels=out_channels, 1066 prev_output_channel=prev_output_channel, 1067 temb_channels=temb_channels, 1068 add_upsample=add_upsample, 1069 resnet_eps=resnet_eps, 1070 resnet_act_fn=resnet_act_fn, 1071 resnet_groups=resnet_groups, 1072 cross_attention_dim=cross_attention_dim, 1073 num_attention_heads=num_attention_heads, 1074 dual_cross_attention=dual_cross_attention, 1075 use_linear_projection=use_linear_projection, 1076 only_cross_attention=only_cross_attention, 1077 upcast_attention=upcast_attention, 1078 resnet_time_scale_shift=resnet_time_scale_shift, 1079 ) 1080 elif up_block_type == "SimpleCrossAttnUpBlock2D": 1081 if cross_attention_dim is None:

File /data/CFLD/models/unet.py:890, in ResidualCrossAttnUpBlock2D.init(self, in_channels, out_channels, prev_output_channel, temb_channels, dropout, num_layers, transformer_layers_per_block, resnet_eps, resnet_time_scale_shift, resnet_act_fn, resnet_groups, resnet_pre_norm, num_attention_heads, cross_attention_dim, output_scale_factor, add_upsample, dual_cross_attention, use_linear_projection, only_cross_attention, upcast_attention) 874 resnets.append( 875 ResidualResnetBlock2D( 876 in_channels=resnet_in_channels + res_skip_channels, (...) 886 ) 887 ) 888 if not dual_cross_attention: 889 attentions.append( --> 890 ResidualTransformer2DModel( 891 num_attention_heads, 892 out_channels // num_attention_heads, 893 in_channels=out_channels, 894 num_layers=transformer_layers_per_block, 895 cross_attention_dim=cross_attention_dim, 896 norm_num_groups=resnet_groups, 897 use_linear_projection=use_linear_projection, 898 only_cross_attention=only_cross_attention, 899 upcast_attention=upcast_attention, 900 ) 901 ) 902 else: 903 attentions.append( 904 DualTransformer2DModel( 905 num_attention_heads, (...) 911 ) 912 )

File /usr/local/lib/python3.9/site-packages/diffusers/configuration_utils.py:654, in register_to_config..inner_init(self, args, kwargs) 652 new_kwargs = {config_init_kwargs, new_kwargs} 653 getattr(self, "register_to_config")(new_kwargs) --> 654 init(self, args, **init_kwargs)

File /data/CFLD/models/unet.py:502, in ResidualTransformer2DModel.init(self, num_attention_heads, attention_head_dim, in_channels, out_channels, num_layers, dropout, norm_num_groups, cross_attention_dim, attention_bias, sample_size, num_vector_embeds, patch_size, activation_fn, num_embeds_ada_norm, use_linear_projection, only_cross_attention, upcast_attention, norm_type, norm_elementwise_affine) 479 @register_to_config 480 def init( 481 self, (...) 500 norm_elementwise_affine: bool = True, 501 ): --> 502 super(Transformer2DModel, self).init() 503 self.use_linear_projection = use_linear_projection 504 self.num_attention_heads = num_attention_heads

File /usr/local/lib/python3.9/site-packages/diffusers/configuration_utils.py:654, in register_to_config..inner_init(self, args, kwargs) 652 new_kwargs = {config_init_kwargs, new_kwargs} 653 getattr(self, "register_to_config")(new_kwargs) --> 654 init(self, args, **init_kwargs)

File /usr/local/lib/python3.9/site-packages/diffusers/models/transformers/transformer_2d.py:151, in Transformer2DModel.init(self, num_attention_heads, attention_head_dim, in_channels, out_channels, num_layers, dropout, norm_num_groups, cross_attention_dim, attention_bias, sample_size, num_vector_embeds, patch_size, activation_fn, num_embeds_ada_norm, use_linear_projection, only_cross_attention, double_self_attention, upcast_attention, norm_type, norm_elementwise_affine, norm_eps, attention_type, caption_channels, interpolation_scale) 146 raise ValueError( 147 f"Cannot define both num_vector_embeds: {num_vector_embeds} and patch_size: {patch_size}. Make" 148 " sure that either num_vector_embeds or num_patches is None." 149 ) 150 elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches: --> 151 raise ValueError( 152 f"Has to define in_channels: {in_channels}, num_vector_embeds: {num_vector_embeds}, or patch_size:" 153 f" {patch_size}. Make sure that in_channels, num_vector_embeds or num_patches is not None." 154 ) 156 # 2. Define input layers 157 if self.is_input_continuous:

ValueError: Has to define in_channels: None, num_vector_embeds: None, or patch_size: None. Make sure that in_channels, num_vector_embeds or num_patches is not None.

YanzuoLu commented 3 months ago

Hi @tonywang-sh, it seems that you failed to download the complete VAE checkpoints. Please make sure the pretrained models are fully downloaded and available.