Yes, I am going to see if I can write a nice function to make this more user-friendly; I'm testing it now.
I see what the challenge is: the shapes of the negative prompt embeds and the input prompt embeds need to match, and I'm fixing the logic now. The current code works when the prompt and negative prompt embeds have the same length, but there may be situations where the negative prompt is longer than the prompt.
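To illustrate the constraint with a standalone sketch (not the pipeline source, just the shape check): for classifier-free guidance, diffusers concatenates the negative and positive embeds along the batch dimension, so torch.cat requires every other dimension, including the sequence length, to agree.

import torch

prompt_embeds = torch.randn(1, 77, 768)            # (batch, seq_len, hidden) for SD 1.5
negative_prompt_embeds = torch.randn(1, 154, 768)  # a longer negative prompt
try:
    # roughly what the pipeline does internally for classifier-free guidance
    torch.cat([negative_prompt_embeds, prompt_embeds])
except RuntimeError as err:
    print(err)  # sizes of tensors must match except in dimension 0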
Here is my solution, which I have tested to confirm that prompts and negative prompts of varying lengths diffuse to the same results:
from diffusers import StableDiffusionPipeline
import torch
import random
# 1. load model
model_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")
pipe.enable_sequential_cpu_offload() # my graphics card VRAM is very low
def get_pipeline_embeds(pipeline, prompt, negative_prompt, device):
    """Get pipeline embeds for prompts bigger than the max length of the pipe.

    :param pipeline: the StableDiffusionPipeline whose tokenizer and text encoder are used
    :param prompt: the (possibly very long) prompt string
    :param negative_prompt: the (possibly very long) negative prompt string
    :param device: device to move the token ids to
    :return: (prompt_embeds, negative_prompt_embeds) with matching shapes
    """
    max_length = pipeline.tokenizer.model_max_length

    # simple way to estimate which prompt is longer: count whitespace-separated
    # words as a rough proxy for the token count
    count_prompt = len(prompt.split(" "))
    count_negative_prompt = len(negative_prompt.split(" "))

    # tokenize the longer prompt without truncation, then pad the shorter one
    # up to the same length so both embed tensors end up the same shape
    if count_prompt >= count_negative_prompt:
        input_ids = pipeline.tokenizer(prompt, return_tensors="pt", truncation=False).input_ids.to(device)
        shape_max_length = input_ids.shape[-1]
        negative_ids = pipeline.tokenizer(negative_prompt, truncation=False, padding="max_length",
                                          max_length=shape_max_length, return_tensors="pt").input_ids.to(device)
    else:
        negative_ids = pipeline.tokenizer(negative_prompt, return_tensors="pt", truncation=False).input_ids.to(device)
        shape_max_length = negative_ids.shape[-1]
        input_ids = pipeline.tokenizer(prompt, return_tensors="pt", truncation=False, padding="max_length",
                                       max_length=shape_max_length).input_ids.to(device)

    # encode in chunks of at most model_max_length tokens and concatenate the
    # per-chunk hidden states along the sequence dimension
    concat_embeds = []
    neg_embeds = []
    for i in range(0, shape_max_length, max_length):
        concat_embeds.append(pipeline.text_encoder(input_ids[:, i: i + max_length])[0])
        neg_embeds.append(pipeline.text_encoder(negative_ids[:, i: i + max_length])[0])

    return torch.cat(concat_embeds, dim=1), torch.cat(neg_embeds, dim=1)
# repeat the base texts enough times to exceed the 77-token limit
# (note the trailing space so words from adjacent repeats don't merge)
prompt = (22 + random.randint(1, 10)) * "a photo of an astronaut riding a horse on mars "
negative_prompt = (22 + random.randint(1, 10)) * "some negative texts "
print("Our inputs ", prompt, negative_prompt, len(prompt.split(" ")), len(negative_prompt.split(" ")))
prompt_embeds, negative_prompt_embeds = get_pipeline_embeds(pipe, prompt, negative_prompt, "cuda")
image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds).images[0]
image.save("done.png")
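If you want to compare runs deterministically while testing, you can also pass a seeded generator (a standard StableDiffusionPipeline argument; the seed value here is arbitrary):

generator = torch.Generator(device="cuda").manual_seed(42)
image = pipe(prompt_embeds=prompt_embeds,
             negative_prompt_embeds=negative_prompt_embeds,
             generator=generator).images[0]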
Please let me know if you find any issues :)
I haven't tried AUTOMATIC1111 yet, but it seems to have managed to overcome the 77-token limit.
Would it be possible to implement this workaround with yours?