aifartist / ArtSpew

An infinite number of monkeys randomly throwing paint at a canvas

Working on linux without GUI #14

Open aokocax opened 7 months ago

aokocax commented 7 months ago

Hello, I updated the maxperf file for systems that run on the CLI only. It does not save the files to disk at the moment. While I was checking whether there was any problem, I noticed that it created only 6 images in the genImage function. Sorry for the code, I am actually a C# developer :)

```python
import sys
import PIL
import os

import numpy as np

import torch
from diffusers import AutoPipelineForText2Image
from sfast.compilers.stable_diffusion_pipeline_compiler import (compile, CompilationConfig)

torch.set_grad_enabled(False)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

mw = None
batchSize = 10
prompts = ['Evil space kitty', 'Cute dog in hat, H.R. Giger style', 'Horse wearing a tie', 'Cartoon pig', 'Donkey on Mars', 'Cute kitties baked in a cake', 'Boxing chickens on farm, Maxfield Parish style', 'Future spaceship', 'A city of the past', 'Jabba the Hut wearing jewelery', 'istanbul photo scenery', 'a nice girl with hat', 'a dog playing footbal', 'an umbrella and raining', 'paper cut plane flying on a desk', 'a cup coffee and child toys', 'space ship on a lake', 'a knife and a fork on a table', 'futuristic microphone', 'an apple, a banana, a melon']

def dwencode(pipe, prompts, batchSize: int, nTokens: int):
    tokenizer = pipe.tokenizer
    text_encoder = pipe.text_encoder

    if nTokens < 0 or nTokens > 75:
        raise BaseException("n random tokens must be between 0 and 75")

    if nTokens > 0:
        randIIs = torch.randint(low=0, high=49405, size=(batchSize, nTokens), device='cuda')

    text_inputs = tokenizer(
        prompts,
        padding = "max_length",
        max_length = tokenizer.model_max_length,
        truncation = True,
        return_tensors = "pt",
    ).to('cuda')

    tii = text_inputs.input_ids

    # Find the end-of-text token, which determines the prompt length (pl)
    # in terms of user tokens.
    #pl = np.where(tii[0] == 49407)[0][0] - 1
    pl = (tii[0] == torch.tensor(49407, device='cuda')).nonzero()[0][0].item() - 1

    if nTokens > 0:
        # TODO: Efficiency
        for i in range(batchSize):
            tii[i][1+pl:1+pl+nTokens] = randIIs[i]
            tii[i][1+pl+nTokens] = 49407

    if False:
        for bi in range(batchSize):
            print(f"{mw.seqno:05d}-{bi:02d}: ", end='')
            for tid in tii[bi][1:1+pl+nTokens]:
                print(f"{tokenizer.decode(tid)} ", end='')
            print('')

    prompt_embeds = text_encoder(tii.to('cuda'), attention_mask=None)
    prompt_embeds = prompt_embeds[0]
    prompt_embeds = prompt_embeds.to(dtype=pipe.unet.dtype, device='cuda')

    bs_embed, seq_len, _ = prompt_embeds.shape
    prompt_embeds = prompt_embeds.repeat(1, 1, 1)
    prompt_embeds = prompt_embeds.view(bs_embed * 1, seq_len, -1)

    return prompt_embeds

pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/sd-turbo", torch_dtype=torch.float16, variant="fp16")
pipe.to("cuda")

pipe.unet.to(memory_format=torch.channels_last)

from diffusers import AutoencoderTiny
pipe.vae = AutoencoderTiny.from_pretrained('madebyollin/taesd', torch_device='cuda', torch_dtype=torch.float16)
pipe.vae = pipe.vae.cuda()

pipe.set_progress_bar_config(disable=True)

if True:
    config = CompilationConfig.Default()

    # xformers and Triton are suggested for achieving best performance.
    # It might be slow for Triton to generate, compile and fine-tune kernels.
    try:
        import xformers
        config.enable_xformers = True
    except ImportError:
        print('xformers not installed, skip')
    # NOTE:
    # When GPU VRAM is insufficient or the architecture is too old, Triton might be slow.
    # Disable Triton if you encounter this problem.
    try:
        import triton
        config.enable_triton = True
    except ImportError:
        print('Triton not installed, skip')
    # NOTE:
    # CUDA Graph is suggested for small batch sizes and small resolutions to reduce CPU overhead.
    # My implementation can handle dynamic shape with increased need for GPU memory.
    # But when your GPU VRAM is insufficient or the image resolution is high,
    # CUDA Graph could cause less efficient VRAM utilization and slow down the inference,
    # especially when on Windows or WSL which has the "shared VRAM" mechanism.
    # If you meet problems related to it, you should disable it.
    config.enable_cuda_graph = True

    if True:
        config.enable_jit = True
        config.enable_jit_freeze = True
        config.trace_scheduler = True
        config.enable_cnn_optimization = True
        config.preserve_parameters = False
        config.prefer_lowp_gemm = True

    pipe = compile(pipe, config)

def genImage(output_dir, seqno, prompts, batchSize):
    global pipe
    seed = random.randint(0, 2147483647)
    torch.manual_seed(seed)

    images = genit(0, prompts=prompts, batchSize=batchSize, nSteps=1)
    for idx, img in enumerate(images):
        img_path = os.path.join(output_dir, f'image_{seqno}_{idx}.png')
        #img.save(img_path)
        print(img_path)
    return len(images)

import time
import random
import torch

def genit(mode, prompts, batchSize, nSteps):

    tm0 = time.time()

    pe = dwencode(pipe, prompts, batchSize, 9)
    images = pipe(
        prompt_embeds = pe,
        width=512, height=512,
        num_inference_steps = nSteps,
        guidance_scale = 1,
        output_type="pil",
        return_dict=False
    )[0]
    #print(f"time = {(1000*(time.time() - tm0)):3.1f} milliseconds")

    return images

if __name__ == '__main__':
    output_dir = 'spew'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    seqno = 0
    if len(sys.argv) == 2:
        batchSize = int(sys.argv[1])
        if batchSize > 20:
            print('Batchsize must not be greater than 20.')
            sys.exit(1)
        prompts = prompts[:batchSize]
    else:
        batchSize = 20
    start_time = time.time()
    counter = 0

    while True:
        seqno += 1
        counter += 1
        genImage(output_dir, seqno, prompts, batchSize)
        current_time = time.time()
        if current_time - start_time >= 1:
            print(f"{counter} iterations in the last second.")
            start_time = current_time
            counter = 0

```
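
In case it helps, here is a minimal sketch (not part of the script above, `genImageAndSave` is just an illustrative name) of how the saving could be re-enabled and the per-batch image count checked. It assumes the same `genit` helper and `prompts` list defined above:

```python
import os
import random
import torch

def genImageAndSave(output_dir, seqno, prompts, batchSize):
    # Variant of genImage with saving re-enabled: writes each image in the
    # batch to disk and reports how many images the pipeline returned.
    os.makedirs(output_dir, exist_ok=True)
    seed = random.randint(0, 2147483647)
    torch.manual_seed(seed)

    images = genit(0, prompts=prompts, batchSize=batchSize, nSteps=1)
    # The batch dimension of prompt_embeds (and therefore the number of
    # returned images) comes from len(prompts), not from the batchSize
    # argument, so a short prompt list yields fewer images than expected.
    print(f"batch {seqno}: requested {batchSize}, got {len(images)} images")
    for idx, img in enumerate(images):
        img.save(os.path.join(output_dir, f'image_{seqno}_{idx}.png'))
    return len(images)
```

If this prints fewer images than expected, the first thing I would compare is the length of the prompt list actually passed to `genit` against `batchSize`.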