siliconflow / onediff

OneDiff: An out-of-the-box acceleration library for diffusion models.
https://github.com/siliconflow/onediff/wiki
Apache License 2.0
1.69k stars 102 forks source link

Repeated compilation of the txt2image model under dynamic resolution #849

Closed lijunliangTG closed 4 months ago

lijunliangTG commented 6 months ago

Describe the bug

A clear and concise description of what the bug is.

When the pipeline is run on inputs with different resolutions, the program recompiles the model graph multiple times.

Your environment

OS

Ubuntu 20.04.4 LTS

OneDiff git commit id

6077d115

OneFlow version info

Run python -m oneflow --doctor and paste it here. image

How To Reproduce

Steps to reproduce the behavior(code or script):

import os
import argparse
import time

import torch
from safetensors.torch import load_file
from diffusers import StableDiffusionXLPipeline
from onediffx import compile_pipe, compiler_config, save_pipe, load_pipe
from huggingface_hub import hf_hub_download
import oneflow

# Detect whether diffusers exposes (and enables) the PEFT backend; main()
# refuses to load LoRA checkpoints when this flag is False.
try:
    # Import the flag by name: the imports visible in this file only bind
    # ``StableDiffusionXLPipeline``, not the ``diffusers`` module itself, so
    # ``diffusers.utils.USE_PEFT_BACKEND`` would raise NameError and the flag
    # would silently always be False.
    from diffusers.utils import USE_PEFT_BACKEND
except ImportError:
    # diffusers is missing or does not export the flag.
    USE_PEFT_BACKEND = False

def main(args):
    """Generate SDXL-Lightning images over a grid of resolutions, timing each run.

    Builds a ``StableDiffusionXLPipeline`` from ``args.base`` using the
    checkpoint ``args.cpkt`` (loaded and fused as LoRA weights when the name
    contains ``"lora"``, otherwise loaded as a full UNet state dict),
    optionally wraps it with ``compile_pipe``, and then, for every
    height/width pair, runs one warmup generation followed by one timed
    generation. The first image of each timed run is saved to
    ``text2img-sdxl-light-out-{height}-{width}.png``.

    Args:
        args: Parsed command-line namespace. Reads ``base``, ``repo``,
            ``cpkt``, ``prompt``, ``seed``, ``compile``, ``save_graph``,
            ``save_graph_dir``, ``load_graph`` and ``load_graph_dir``.
            ``variant`` is used when present and falls back to ``"fp16"``.

    Raises:
        ValueError: If the character of ``args.cpkt`` at the position right
            after the ``"sdxl_lightning_"`` prefix is not a digit.
        SystemExit: With status 0 when a LoRA checkpoint is requested but
            ``USE_PEFT_BACKEND`` is false.
    """
    # Local import: only needed for the early-exit path below.
    import sys

    OUTPUT_TYPE = "pil"

    # The step count is read from the single character that follows the
    # "sdxl_lightning_" prefix position in the checkpoint file name.
    prefix_len = len("sdxl_lightning_")
    n_steps = int(args.cpkt[prefix_len : prefix_len + 1])

    is_lora_cpkt = "lora" in args.cpkt

    # Honour --variant instead of silently ignoring it; namespaces built
    # without that attribute keep the previous hard-coded value.
    variant = getattr(args, "variant", "fp16")

    if args.compile:
        from onediff.schedulers import EulerDiscreteScheduler
    else:
        from diffusers import EulerDiscreteScheduler

    def resolve_checkpoint():
        """Return the local checkpoint path, or download it from the hub."""
        local_path = os.path.join(args.repo, args.cpkt)
        if os.path.isfile(local_path):
            return local_path
        return hf_hub_download(args.repo, args.cpkt)

    if is_lora_cpkt:
        if not USE_PEFT_BACKEND:
            print("PEFT backend is required for load_lora_weights")
            # sys.exit instead of the site-provided exit(), which is meant
            # for interactive sessions; the exit status is unchanged.
            sys.exit(0)
        pipe = StableDiffusionXLPipeline.from_pretrained(
            args.base, torch_dtype=torch.float16, variant=variant
        ).to("cuda")
        pipe.load_lora_weights(resolve_checkpoint())
        pipe.fuse_lora()
    else:
        from diffusers import UNet2DConditionModel

        unet = UNet2DConditionModel.from_config(args.base, subfolder="unet").to(
            "cuda", torch.float16
        )
        unet.load_state_dict(load_file(resolve_checkpoint(), device="cuda"))
        pipe = StableDiffusionXLPipeline.from_pretrained(
            args.base, unet=unet, torch_dtype=torch.float16, variant=variant
        ).to("cuda")

    pipe.scheduler = EulerDiscreteScheduler.from_config(
        pipe.scheduler.config, timestep_spacing="trailing"
    )

    if pipe.vae.dtype == torch.float16 and pipe.vae.config.force_upcast:
        pipe.upcast_vae()

    # Compile the pipeline
    if args.compile:
        pipe = compile_pipe(pipe)
        if args.load_graph:
            print("Loading graphs...")
            load_pipe(pipe, args.load_graph_dir)

    def generate(height, width):
        """Run the pipeline once at the given resolution and return its images."""
        return pipe(
            prompt=args.prompt,
            height=height,
            width=width,
            num_inference_steps=n_steps,
            guidance_scale=0,
            output_type=OUTPUT_TYPE,
        ).images

    print("Warmup with running graphs...")
    torch.manual_seed(args.seed)
    heights = [1024, 1152, 1360, 1536]
    widths = [1024, 1152, 1360, 1536]

    for height in heights:
        for width in widths:
            print(f"cur H : {height} W: {width}")
            # Warmup run at this resolution; its output is discarded.
            generate(height, width)

            # Normal run
            print("Normal run...")
            # Re-seed so every timed run starts from the same RNG state.
            torch.manual_seed(args.seed)
            start_t = time.time()
            image = generate(height, width)
            end_t = time.time()
            print(f"e2e ({n_steps} steps) elapsed: {end_t - start_t} s")

            image[0].save(f"text2img-sdxl-light-out-{height}-{width}.png")

            if args.save_graph:
                print("Saving graphs...")
                save_pipe(pipe, args.save_graph_dir)

if __name__ == '__main__':
    # Script entry point: declare the command-line options, parse them and
    # hand the resulting namespace to main().
    cli = argparse.ArgumentParser()

    # (flag, add_argument keyword arguments), registered in this order.
    # Alternative prompt kept for reference:
    #   "street style, detailed, raw photo, woman, face, shot on CineStill 800T"
    option_table = [
        (
            "--base",
            dict(type=str, default="/share_nfs/hf_models/stable-diffusion-xl-base-1.0"),
        ),
        ("--repo", dict(type=str, default="/share_nfs/hf_models/SDXL-Lightning")),
        ("--cpkt", dict(type=str, default="sdxl_lightning_4step_unet.safetensors")),
        ("--variant", dict(type=str, default="fp16")),
        ("--prompt", dict(type=str, default="A girl smiling")),
        ("--save_graph", dict(action="store_true")),
        ("--load_graph", dict(action="store_true")),
        ("--save_graph_dir", dict(type=str, default="cached_pipe")),
        ("--load_graph_dir", dict(type=str, default="cached_pipe")),
        ("--seed", dict(type=int, default=1)),
        (
            "--compile",
            dict(
                # Any of "true", "1", "yes" (case-insensitive) enables
                # compilation; every other value disables it.
                type=(lambda x: str(x).lower() in ["true", "1", "yes"]),
                default=True,
            ),
        ),
    ]
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    args = cli.parse_args()
    main(args)

The complete error message

via

export ONEDIFF_DEBUG=1
export ONEFLOW_RUN_GRAPH_BY_VM=1

I get the logs below

image

(sd2) lijunliang@oneflow-25:~/project/onediff$ python /data/home/lijunliang/project/onediff/onediff_diffusers_extensions/examples/text_to_image_sdxl_light.py | tee -a txt2img_debug_56.txt
WARNING [2024-05-06 08:54:27] /data/home/lijunliang/anaconda3/envs/sd2/lib/python3.10/site-packages/onediff/infer_compiler/transform/manager.py:119 - Pydantic version 1.10.15 is too low, please upgrade to 2.5.2 or higher.
Loading pipeline components...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00,  9.56it/s]
W20240506 08:55:02.956883 1588721 cuda_stream.cpp:49] Runtime version 12.1 of cuBLAS incompatible with compiletime version 12.2.
Warmup with running graphs...
cur H : 1024 W: 1024
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:45<00:00, 11.33s/it]
Normal run...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 18.69it/s]
e2e (4 steps) elapsed: 0.6722152233123779 s
cur H : 1024 W: 1152
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 12.82it/s]
Normal run...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 13.05it/s]
e2e (4 steps) elapsed: 0.8152527809143066 s
cur H : 1024 W: 1360
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 11.36it/s]
Normal run...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 11.54it/s]
e2e (4 steps) elapsed: 0.9450609683990479 s
cur H : 1024 W: 1536
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 10.75it/s]
Stack trace (most recent call last) in thread 1588721:
   Object "/data/home/lijunliang/anaconda3/envs/sd2/lib/python3.10/site-packages/oneflow/../oneflow.libs/liboneflow-b60dc70f.so", at 0x7f781badb9d7, in 
   Object "/data/home/lijunliang/anaconda3/envs/sd2/lib/python3.10/site-packages/oneflow/../oneflow.libs/liboneflow-b60dc70f.so", at 0x7f781badb24c, in 
   Object "/data/home/lijunliang/anaconda3/envs/sd2/lib/python3.10/site-packages/oneflow/../oneflow.libs/liboneflow-b60dc70f.so", at 0x7f781bad6ac8, in vm::ThreadCtx::TryReceiveAndRun()
   Object "/data/home/lijunliang/anaconda3/envs/sd2/lib/python3.10/site-packages/oneflow/../oneflow.libs/liboneflow-b60dc70f.so", at 0x7f781ba79264, in vm::EpStreamPolicyBase::Run(vm::Instruction*) const
   Object "/data/home/lijunliang/anaconda3/envs/sd2/lib/python3.10/site-packages/oneflow/../oneflow.libs/liboneflow-b60dc70f.so", at 0x7f781ba7c567, in vm::Instruction::Compute()
   Object "/data/home/lijunliang/anaconda3/envs/sd2/lib/python3.10/site-packages/oneflow/../oneflow.libs/liboneflow-b60dc70f.so", at 0x7f781bb038bf, in vm::FuseInstructionPolicy::Compute(vm::Instruction*)
   Object "/data/home/lijunliang/anaconda3/envs/sd2/lib/python3.10/site-packages/oneflow/../oneflow.libs/liboneflow-b60dc70f.so", at 0x7f781ba7c567, in vm::Instruction::Compute()
   Object "/data/home/lijunliang/anaconda3/envs/sd2/lib/python3.10/site-packages/oneflow/../oneflow.libs/liboneflow-b60dc70f.so", at 0x7f781ba83948, in vm::OpCallInstructionPolicy::Compute(vm::Instruction*)
   Object "/data/home/lijunliang/anaconda3/envs/sd2/lib/python3.10/site-packages/oneflow/../oneflow.libs/liboneflow-b60dc70f.so", at 0x7f781ba83619, in 
   Object "/data/home/lijunliang/anaconda3/envs/sd2/lib/python3.10/site-packages/oneflow/../oneflow.libs/liboneflow-b60dc70f.so", at 0x7f781ba7e76a, in 
   Object "/data/home/lijunliang/anaconda3/envs/sd2/lib/python3.10/site-packages/oneflow/../oneflow.libs/liboneflow-b60dc70f.so", at 0x7f781323ad3c, in 

Additional context

GPU INFO.txt txt2img_debug_withempty.txt image

lijunliangTG commented 6 months ago

image Adding ignores = ("vae",) fixes this.

lijunliangTG commented 6 months ago

Compilation works fine with torch image

lijunliangTG commented 6 months ago
    if pipe.vae.dtype == torch.float16 and pipe.vae.config.force_upcast:
        pipe.upcast_vae()

    # Compile the pipeline
    if args.compile:
        # pipe = compile_pipe(pipe,)
        compiled_decoder = oneflow_compile(pipe.vae.decoder)
        pipe.vae.decoder = compiled_decoder
        if args.load_graph:
            print("Loading graphs...")
            load_pipe(pipe, args.load_graph_dir)

Compiled with oneflow_compile, it doesn't work properly OUTPUT_FILE_6_withcomplie.txt output.txt

Adding oneflow.cuda.empty_cache() at the end of each loop iteration lets the script run to completion. output_1.txt OUTPUT_FILE_6_oneflowempty.txt