suzukimain commented 1 year ago

Describe the bug

When I call `TextToVideoZeroPipeline` (text-to-video) with `num_inference_steps=30`, it fails with `TraitError: setting max < min`; with `num_inference_steps=60`, the same script runs without error.

Reproduction

import torch import imageio from diffusers import TextToVideoZeroPipeline import numpy as np

model_id = "runwayml/stable-diffusion-v1-5" pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda") seed = 0 video_length = 8 chunk_size = 4 prompt = "A panda is playing guitar on times square"

Generate the video chunk-by-chunk

result = [] chunk_ids = np.arange(0, video_length, chunk_size - 1) generator = torch.Generator(device="cuda")

num_inference_steps=30

for i in range(len(chunk_ids)): print(f"Processing chunk {i + 1} / {len(chunk_ids)}") ch_start = chunk_ids[i] ch_end = video_length if i == len(chunk_ids) - 1 else chunk_ids[i + 1]

Attach the first frame for Cross Frame Attention

frame_ids = [0] + list(range(ch_start, ch_end))
# Fix the seed for the temporal consistency
generator.manual_seed(seed)
output = pipe(prompt=prompt,video_length=len(frame_ids), generator=generator, num_inference_steps=num_inference_steps,frame_ids=frame_ids)
result.append(output.images[1:])

Concatenate chunks and save

result = np.concatenate(result) result = [(r * 255).astype("uint8") for r in result] imageio.mimsave("video2.mp4", result, fps=4)

Logs

Loading pipeline components...: 100%
7/7 [00:48<00:00, 8.43s/it]
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.
Processing chunk 1 / 3
/usr/local/lib/python3.10/dist-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py:237: FutureWarning: `_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple.
  deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
---------------------------------------------------------------------------
TraitError                                Traceback (most recent call last)
<ipython-input-2-18538a708b96> in <cell line: 20>()
     26     # Fix the seed for the temporal consistency
     27     generator.manual_seed(seed)
---> 28     output = pipe(prompt=prompt,video_length=len(frame_ids), generator=generator, num_inference_steps=num_inference_steps,frame_ids=frame_ids)
     29     result.append(output.images[1:])
     30 

11 frames
/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py in decorate_context(*args, **kwargs)
    113     def decorate_context(*args, **kwargs):
    114         with ctx_factory():
--> 115             return func(*args, **kwargs)
    116 
    117     return decorate_context

/usr/local/lib/python3.10/dist-packages/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py in __call__(self, prompt, video_length, height, width, num_inference_steps, guidance_scale, negative_prompt, num_videos_per_prompt, eta, generator, latents, motion_field_strength_x, motion_field_strength_y, output_type, return_dict, callback, callback_steps, t0, t1, frame_ids)
    562 
    563         # Perform the first backward process up to time T_1
--> 564         x_1_t1 = self.backward_loop(
    565             timesteps=timesteps[: -t1 - 1],
    566             prompt_embeds=prompt_embeds,

/usr/local/lib/python3.10/dist-packages/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py in backward_loop(self, latents, timesteps, prompt_embeds, guidance_scale, callback, callback_steps, num_warmup_steps, extra_step_kwargs, cross_attention_kwargs)
    389         do_classifier_free_guidance = guidance_scale > 1.0
    390         num_steps = (len(timesteps) - num_warmup_steps) // self.scheduler.order
--> 391         with self.progress_bar(total=num_steps) as progress_bar:
    392             for i, t in enumerate(timesteps):
    393                 # expand the latents if we are doing classifier free guidance

/usr/local/lib/python3.10/dist-packages/diffusers/pipelines/pipeline_utils.py in progress_bar(self, iterable, total)
   1710             return tqdm(iterable, **self._progress_bar_config)
   1711         elif total is not None:
-> 1712             return tqdm(total=total, **self._progress_bar_config)
   1713         else:
   1714             raise ValueError("Either `total` or `iterable` has to be defined.")

/usr/local/lib/python3.10/dist-packages/tqdm/notebook.py in __init__(self, *args, **kwargs)
    231         unit_scale = 1 if self.unit_scale is True else self.unit_scale or 1
    232         total = self.total * unit_scale if self.total else self.total
--> 233         self.container = self.status_printer(self.fp, total, self.desc, self.ncols)
    234         self.container.pbar = proxy(self)
    235         self.displayed = False

/usr/local/lib/python3.10/dist-packages/tqdm/notebook.py in status_printer(_, total, desc, ncols)
    108             raise ImportError(WARN_NOIPYW)
    109         if total:
--> 110             pbar = IProgress(min=0, max=total)
    111         else:  # No total? Show info style bar with no progress tqdm status
    112             pbar = IProgress(min=0, max=1)

/usr/local/lib/python3.10/dist-packages/ipywidgets/widgets/widget_float.py in __init__(self, value, **kwargs)
     24         if value is not None:
     25             kwargs['value'] = value
---> 26         super(_Float, self).__init__(**kwargs)
     27 
     28 

/usr/local/lib/python3.10/dist-packages/ipywidgets/widgets/widget.py in __init__(self, **kwargs)
    475         """Public constructor"""
    476         self._model_id = kwargs.pop('model_id', None)
--> 477         super(Widget, self).__init__(**kwargs)
    478 
    479         Widget._call_widget_constructed(self)

/usr/local/lib/python3.10/dist-packages/traitlets/traitlets.py in __init__(self, *args, **kwargs)
   1355             changed = set(kwargs) & set(self._traits)
   1356             for key in changed:
-> 1357                 value = self._traits[key]._cross_validate(self, getattr(self, key))
   1358                 self.set_trait(key, value)
   1359                 changes[key]['new'] = value

/usr/local/lib/python3.10/dist-packages/traitlets/traitlets.py in _cross_validate(self, obj, value)
    741         if self.name in obj._trait_validators:
    742             proposal = Bunch({"trait": self, "value": value, "owner": obj})
--> 743             value = obj._trait_validators[self.name](obj, proposal)
    744         elif hasattr(obj, "_%s_validate" % self.name):
    745             meth_name = "_%s_validate" % self.name

/usr/local/lib/python3.10/dist-packages/traitlets/traitlets.py in __call__(self, *args, **kwargs)
   1227         """Pass `*args` and `**kwargs` to the handler's function if it exists."""
   1228         if hasattr(self, "func"):
-> 1229             return self.func(*args, **kwargs)
   1230         else:
   1231             return self._init_call(*args, **kwargs)

/usr/local/lib/python3.10/dist-packages/ipywidgets/widgets/widget_float.py in _validate_max(self, proposal)
     54         max = proposal['value']
     55         if max < self.min:
---> 56             raise TraitError('setting max < min')
     57         if max < self.value:
     58             self.value = max

TraitError: setting max < min

System Info

diffusers version: 0.21.4
Platform: Linux-5.15.120+-x86_64-with-glibc2.35
Python version: 3.10.12
PyTorch version (GPU?): 2.0.1+cu118 (True)
Huggingface_hub version: 0.17.3
Transformers version: 4.34.0
Accelerate version: not installed
xFormers version: not installed
Using GPU in script?:
Using distributed or parallel set-up in script?:

Who can help?

No response

DN6 commented 1 year ago

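Hi @suzukimain, I believe you have to set the `t0` and `t1` arguments in the pipeline based on the number of inference steps. The first backward pass runs over `timesteps[: -t1 - 1]`, and the defaults (`t0=44`, `t1=47`) assume more steps than 30: with `num_inference_steps=30` that slice is empty, so the progress bar receives a negative `total`, which is what raises `TraitError: setting max < min`. Pick values with `t0 < t1 < num_inference_steps`. See: https://huggingface.co/docs/diffusers/v0.21.0/en/api/pipelines/text_to_video_zero#diffusers.TextToVideoZeroPipeline.__call__.t1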