Hi there,
I tried compiling an sdxl model but only get a speedup of about 0.005 s.
Is there anything to consider when using sd-turbo and sdxl-turbo?
My script boils down to this:
import torch
from diffusers import (AutoPipelineForImage2Image,
                       EulerAncestralDiscreteScheduler)
from diffusers.utils import load_image
from sfast.compilers.stable_diffusion_pipeline_compiler import (
    compile, CompilationConfig)
# Compiled diffusion pipeline; populated by init() and read by generate().
model = None
# Declared global in generate(); presumably meant to hold the last
# generated image — NOTE(review): never actually assigned there.
output = None
def load_model():
    """Build the SDXL-Turbo img2img pipeline in fp16 and move it to CUDA.

    Returns the ready-to-use pipeline; the caller owns it (this function
    does not touch the module-level ``model`` global).
    """
    pipe = AutoPipelineForImage2Image.from_pretrained(
        "../models/sdxl-turbo/",
        torch_dtype=torch.float16,
        variant="fp16",
    )
    # Turbo checkpoints are sampled with an ancestral Euler scheduler.
    pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(
        pipe.scheduler.config)
    pipe.safety_checker = None
    pipe.to(torch.device('cuda'))
    return pipe
def init():
    """Load the pipeline and compile it with stable-fast.

    Sets the module-level ``model`` global to the compiled pipeline.

    NOTE(review): a CUDA-graph-compiled pipeline only reaches full speed
    after a few warmup invocations — the first calls trace and capture the
    graphs.  Benchmark steady-state iterations, not the first run, or the
    measured speedup will be near zero.
    """
    global model
    model = load_model()
    config = CompilationConfig.Default()
    # stable-fast's documented fast path also relies on xformers and
    # Triton; with only enable_cuda_graph set, most of the potential
    # speedup is left on the table.  Enable them when available.
    try:
        import xformers  # noqa: F401
        config.enable_xformers = True
    except ImportError:
        pass
    try:
        import triton  # noqa: F401
        config.enable_triton = True
    except ImportError:
        pass
    config.enable_cuda_graph = True
    model = compile(model, config)
def generate(prompt, w, h, steps, strength, image=None):
    """Run img2img generation and store the result in the global ``output``.

    Fixes three defects in the original: the init image came from an
    undefined name ``inputImg`` (NameError); the result was bound to a dead
    local ``myimg`` despite the ``global output`` declaration; and nothing
    was returned.

    Parameters:
        prompt: text prompt for the pipeline.
        w, h: currently unused — NOTE(review): the img2img call here does
            not pass them; confirm whether output size should be forced.
        steps: number of inference steps.
        strength: denoising strength for img2img.
        image: init image for img2img (new, backward-compatible parameter
            replacing the undefined global ``inputImg``).

    Returns:
        The generated PIL image (also stored in the global ``output``).
    """
    global output
    # guidance_scale=0.0: the turbo models are distilled for CFG-free
    # sampling, matching the original call.
    output = model(prompt=prompt, image=image, num_inference_steps=steps,
                   strength=strength, guidance_scale=0.0).images[0]
    return output
Hi there, I tried compiling an sdxl model but only get a speedup of about 0.005 s. Is there anything to consider when using sd-turbo and sdxl-turbo?
My script boils down to this: