chengzeyi / stable-fast

Best inference performance optimization framework for HuggingFace Diffusers on NVIDIA GPUs.
MIT License

any gains to be had on sdxl-turbo? #123

[Open] vibber opened this issue 4 months ago

vibber commented 4 months ago

Hi there, I tried compiling an sdxl-turbo model but only got a speedup of about 0.005 s. Is there anything to consider when using sd-turbo and sdxl-turbo?

My script boils down to this:

import torch
from diffusers import (AutoPipelineForImage2Image,
                       EulerAncestralDiscreteScheduler)
from diffusers.utils import load_image
from sfast.compilers.stable_diffusion_pipeline_compiler import (
    compile, CompilationConfig)

model = None
output = None

def load_model():
    model = AutoPipelineForImage2Image.from_pretrained(
        "../models/sdxl-turbo/", torch_dtype=torch.float16, variant="fp16")
    model.scheduler = EulerAncestralDiscreteScheduler.from_config(
        model.scheduler.config)
    model.safety_checker = None
    model.to(torch.device('cuda'))
    return model

def init():
    global model
    model = load_model()

    config = CompilationConfig.Default()
    config.enable_cuda_graph = True
    model = compile(model, config)

def generate(prompt, w, h, steps, strength):
    global output
    # inputImg is assumed to be loaded elsewhere (e.g. via load_image)
    output = model(prompt=prompt, image=inputImg, num_inference_steps=steps,
                   strength=strength, guidance_scale=0.0).images[0]

chengzeyi commented 4 months ago

@vibber Please enable Triton and xformers.
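The suggested flags correspond to fields on stable-fast's CompilationConfig. A minimal sketch of the adjusted init(), assuming the xformers and triton packages are installed (the import guards skip a flag if its package is missing):

```python
from sfast.compilers.stable_diffusion_pipeline_compiler import (
    compile, CompilationConfig)

config = CompilationConfig.Default()
# xformers provides fused attention kernels; guard in case it is not installed.
try:
    import xformers
    config.enable_xformers = True
except ImportError:
    pass
# Triton enables additional fused/JIT-compiled kernels.
try:
    import triton
    config.enable_triton = True
except ImportError:
    pass
config.enable_cuda_graph = True
model = compile(model, config)
```

With Triton and xformers enabled alongside CUDA graphs, the compiled UNet and VAE typically see a much larger speedup than the 0.005 s reported above; exact gains depend on GPU, resolution, and step count.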