damian0815 / compel

A prompting enhancement library for transformers-type text embedding systems
MIT License

improve long prompt handling logic to split >75 token prompts at previous punctuation #59

Open damian0815 opened 11 months ago

damian0815 commented 11 months ago

at the moment compel hard-splits prompts at 75 tokens. because self-attention does not work outside of a single 75 token chunk, this can cause degraded generation especially if a word is split in the middle.

compel should backtrack from 75 tokens to find the previous . ; or , character (searching in that order) and split there.
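
a rough sketch of the idea (illustrative only, not Compel's internals; it backtracks to any of those punctuation tokens rather than searching '.', ';', ',' in order):

def split_at_punctuation(token_ids, punctuation_ids, chunk_size=75):
    # greedily fill each chunk, but back off to the most recent punctuation
    # token instead of hard-splitting in the middle of a word
    chunks = []
    start = 0
    while len(token_ids) - start > chunk_size:
        end = start + chunk_size
        split = next((i + 1 for i in range(end - 1, start, -1)
                      if token_ids[i] in punctuation_ids), end)
        chunks.append(token_ids[start:split])
        start = split
    chunks.append(token_ids[start:])
    return chunks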

tg-bomze commented 10 months ago

Hey, @damian0815. Thanks for Compel. It's a very useful library.

I noticed that long prompts are currently handled differently than in AUTOMATIC1111's webUI. Like you said, Compel splits the tokens into chunks while disregarding the separating punctuation (",", ".", ":"). I've written a function that should bring the generation results closer to those obtained in AUTOMATIC1111's webUI.

For starters:

import torch
from compel import Compel
from diffusers import StableDiffusionPipeline, DDIMScheduler

pipeline = StableDiffusionPipeline.from_single_file(
    "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
    torch_dtype=torch.float16,
)
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
pipeline.to("cuda")

compel = Compel(tokenizer=pipeline.tokenizer, text_encoder=pipeline.text_encoder)

prompt = "A highly detailed matte painting of retro futurist mount fuji hosting the olympics, aerial photography, ultrawide lens, by dan mumford, yusuke murata, makoto shinkai, ross tran, cosmic, heavenly, god rays, intricate detail, cinematic, 8 k, cel shaded, unreal engine, featured on artstation, pixiv, hd sharp 3d model vray render in pixar squareenix game anime manga toriyama miyazaki style trending on pixiv skeb"
prompt_emb = compel.build_conditioning_tensor(prompt)

negative_prompt = "deformed iris, deformed pupils, text, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, camera"
negative_prompt_emb = compel.build_conditioning_tensor(negative_prompt)

image = pipeline(
    prompt_embeds=prompt_emb,
    negative_prompt_embeds=negative_prompt_emb,
    width=512,
    height=512,
    num_inference_steps=30,
    num_images_per_prompt=1,
    guidance_scale=7,
    generator=torch.Generator(device='cuda').manual_seed(42)
).images[0]

This is the result without truncation: (image)

And here's the result with truncation: (image)

In the webUI from AUTOMATIC1111, if we use the same parameters and the same model, we get: (image)

Here is the necessary function that encodes the prompt and negative prompt correctly:

from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

eos_token = pipeline.tokenizer.eos_token_id
bos_token = pipeline.tokenizer.bos_token_id
comma_token = pipeline.tokenizer.encode(',')[1]
dot_token = pipeline.tokenizer.encode('.')[1]
colon_token = pipeline.tokenizer.encode(':')[1]
max_tokens = pipeline.tokenizer.model_max_length - 2

def concat_tensor(t):
    t_list = torch.split(t, 1, dim=0)
    t = torch.cat(t_list, dim=1)
    return t

def tokenize_line(line, pipeline): # split into chunks
    actual_tokens = pipeline.tokenizer.encode(line.strip(), truncation=False)[1:-1]
    chunks = []
    chunk = []
    for item in actual_tokens:
        chunk.append(item)
        if len(chunk) == max_tokens:
            if chunk[-1] not in [comma_token, dot_token, colon_token]:
                # backtrack to the most recent separating punctuation token
                for i in range(max_tokens-1, -1, -1):
                    if chunk[i] in [comma_token, dot_token, colon_token]:
                        temp_chunk = [bos_token] + chunk[:i+1] + [eos_token] * (max_tokens+1 - len(chunk[:i+1]))
                        chunks.append(temp_chunk)
                        chunk = chunk[i+1:]
                        break
                else:
                    # no punctuation found: hard-split the full chunk
                    temp_chunk = [bos_token] + chunk + [eos_token] * (max_tokens+1 - len(chunk))
                    chunks.append(temp_chunk)
                    chunk = []
            else:
                temp_chunk = [bos_token] + chunk + [eos_token] * (max_tokens+1 - len(chunk))
                chunks.append(temp_chunk)
                chunk = []
    if chunk:
        # pad the final partial chunk with eos tokens
        temp_chunk = [bos_token] + chunk + [eos_token] * (max_tokens+1 - len(chunk))
        chunks.append(temp_chunk)

    return chunks

def get_embeddings_for_long_prompt(prompt, negative_prompt, pipeline):
    prompt_tokens = tokenize_line(prompt, pipeline)
    neg_prompt_tokens = tokenize_line(negative_prompt, pipeline)

    if len(prompt_tokens) != len(neg_prompt_tokens):
        diff = abs(len(prompt_tokens) - len(neg_prompt_tokens))
        if len(prompt_tokens) > len(neg_prompt_tokens):
            neg_prompt_tokens.extend([[bos_token] + [eos_token]*(max_tokens+1)]*diff)
        elif len(neg_prompt_tokens) > len(prompt_tokens):
            prompt_tokens.extend([[bos_token] + [eos_token]*(max_tokens+1)]*diff)

    prompt_embeds = []
    neg_prompt_embeds = []

    for chunks_batch in [prompt_tokens, neg_prompt_tokens]:
        tokens_tensor = pad_sequence([torch.tensor(chunk) for chunk in chunks_batch], batch_first=True).to(pipeline.device)
        attention_mask = torch.ones_like(tokens_tensor).to(pipeline.device) # Create attention mask
        model_output = pipeline.text_encoder(tokens_tensor, attention_mask=attention_mask) # encode all chunks in one batch

        if chunks_batch is prompt_tokens:
            prompt_embeds.append(model_output.last_hidden_state)
        else:
            neg_prompt_embeds.append(model_output.last_hidden_state)

    prompt_embeds = torch.cat(prompt_embeds, dim=1)
    neg_prompt_embeds = torch.cat(neg_prompt_embeds, dim=1)

    return concat_tensor(prompt_embeds), concat_tensor(neg_prompt_embeds)

Next, call the function:

prompt_emb, neg_prompt_embeds = get_embeddings_for_long_prompt(prompt, negative_prompt, pipeline)

And this is the result: telegram-cloud-photo-size-2-5422552642955104932-x

Here's a link to a Colab to test it out. The code probably looks bloated, but I hope it's useful to you. Compel is an awesome tool, but it lacks proper handling of long prompts.

damian0815 commented 10 months ago

thanks for the suggestion! i don't suppose you'd be willing to look at the logic for how Compel actually tokenizes and adapt this to that? this does not handle fragment weighting, which makes things a bit more complicated. the internals of how this is done are due for a refactor at some point anyway.

damian0815 commented 10 months ago

however my perennial advice is always to write shorter prompts. instead of adding words, try changing or deleting.

tg-bomze commented 10 months ago

thanks for the suggestion! i don't suppose you'd be willing to look at the logic for how Compel actually tokenizes and adapt this to that? this does not handle fragment weighting, which makes things a bit more complicated. the internals of how this is done are due for a refactor at some point anyway.

I was thinking of making a PR for Compel myself, but then ran into the problem that Compel now handles prompt and negative prompt independently of each other. In AUTOMATIC1111 one of the important conditions is that the number of chunks must be the same. That is, if we have a prompt split into two chunks of 75 tokens each, and the negative prompt has only a couple of words, then before encoding the tokens of the negative prompt we must create another chunk containing only bos and 76 eos. Because of this, I couldn't see how to fit my functions into Compel.

damian0815 commented 10 months ago

I was thinking of making a PR for Compel myself, but then ran into the problem that Compel now handles prompt and negative prompt independently of each other.

actually Compel does handle this. you can either use the batch __call__ api:

[prompt, negative_prompt] = compel([prompt, negative_prompt])

this will automatically pad whichever of prompt or negative_prompt is shorter to match the number of 75-token chunks in the other.

alternatively, call Compel.pad_conditioning_tensors_to_same_length()
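
for example (a minimal sketch; it assumes the Compel instance is built with truncate_long_prompts=False, since otherwise both tensors are already a single 77-token chunk):

compel = Compel(tokenizer=pipeline.tokenizer, text_encoder=pipeline.text_encoder,
                truncate_long_prompts=False)

conditioning = compel.build_conditioning_tensor(prompt)
negative_conditioning = compel.build_conditioning_tensor(negative_prompt)

# pad the shorter tensor so both have the same number of 77-token chunks
[conditioning, negative_conditioning] = compel.pad_conditioning_tensors_to_same_length(
    [conditioning, negative_conditioning])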

tg-bomze commented 10 months ago

actually Compel does handle this. you can either use the batch __call__ api:

[prompt, negative_prompt] = compel([prompt, negative_prompt])

this will automatically pad whichever of prompt or negative_prompt is shorter to match the number of 75-token chunks in the other.

alternatively, call Compel.pad_conditioning_tensors_to_same_length()

Thanks for the tip. I decided to do it as an add-on on top of Compel. Here are the functions:

def concat_tensor(t):
    t_list = torch.split(t, 1, dim=0)
    t = torch.cat(t_list, dim=1)
    return t

def merge_embeds(prompt_chunks):
    # weighted sum of the per-chunk embeddings; earlier chunks get larger weights
    num_chunks = len(prompt_chunks)
    power_prompt = 1/(num_chunks*(num_chunks+1)//2)
    prompt_embs = compel(prompt_chunks)
    t_list = list(torch.split(prompt_embs, 1, dim=0))
    for i in range(num_chunks):
        t_list[-(i+1)] = t_list[-(i+1)] * ((i+1)*power_prompt)
    prompt_emb = torch.stack(t_list, dim=0).sum(dim=0)
    return prompt_emb

def detokenize(chunk, actual_prompt):
    # reconstruct the chunk's text from CLIP BPE tokens, using the original
    # prompt to decide whether each '</w>' marker was a space or not
    chunk[-1] = chunk[-1].replace('</w>', '')
    chunked_prompt = ''.join(chunk).strip()
    while '</w>' in chunked_prompt:
        if actual_prompt[chunked_prompt.find('</w>')] == ' ':
            chunked_prompt = chunked_prompt.replace('</w>', ' ', 1)
        else:
            chunked_prompt = chunked_prompt.replace('</w>', '', 1)
    actual_prompt = actual_prompt.replace(chunked_prompt, '')
    return chunked_prompt.strip(), actual_prompt.strip()

def tokenize_line(line, tokenizer): # split into chunks
    actual_prompt = line.lower().strip()
    actual_tokens = tokenizer.tokenize(actual_prompt)
    max_tokens = tokenizer.model_max_length - 2
    comma_token = tokenizer.tokenize(',')[0]

    chunks = []
    chunk = []
    for item in actual_tokens:
        chunk.append(item)
        if len(chunk) == max_tokens:
            if chunk[-1] != comma_token:
                for i in range(max_tokens-1, -1, -1):
                    if chunk[i] == comma_token:
                        actual_chunk, actual_prompt = detokenize(chunk[:i+1], actual_prompt)
                        chunks.append(actual_chunk)
                        chunk = chunk[i+1:]
                        break
                else:
                    actual_chunk, actual_prompt = detokenize(chunk, actual_prompt)
                    chunks.append(actual_chunk)
                    chunk = []
            else:
                actual_chunk, actual_prompt = detokenize(chunk, actual_prompt)
                chunks.append(actual_chunk)
                chunk = []
    if chunk:
        actual_chunk, _ = detokenize(chunk, actual_prompt)
        chunks.append(actual_chunk)

    return chunks
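
A quick arithmetic check of the weighting in merge_embeds above (the variable names here are just illustrative): for n chunks the first chunk is scaled by n/(n*(n+1)/2) and the last by 1/(n*(n+1)/2), so the factors sum to 1.

n = 3
power = 1 / (n * (n + 1) // 2)                 # 1/6
weights = [(n - i) * power for i in range(n)]  # first chunk .. last chunk
print(weights)                                 # [0.5, 0.333..., 0.166...] -> sums to 1.0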

The functions are called as follows:

prompt = "A highly detailed matte painting of retro futurist mount fuji hosting the olympics, aerial photography, ultrawide lens, by dan mumford, yusuke murata, makoto shinkai, ross tran, cosmic, heavenly, god rays, intricate detail, cinematic, 8 k, cel shaded, unreal engine, featured on artstation, pixiv, hd sharp 3d model vray render in pixar squareenix game anime manga toriyama miyazaki style trending on pixiv skeb"
negative_prompt = "deformed iris, deformed pupils, text, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, camera"

prompt_emb = merge_embeds(tokenize_line(prompt, pipeline.tokenizer))
negative_prompt_emb = merge_embeds(tokenize_line(negative_prompt, pipeline.tokenizer))

And we get a result similar to the webUI from AUTOMATIC1111: (image)

Colab: https://colab.research.google.com/drive/1Gfk1pVbvaSYKoW2MPlRQhf4llChSFNWM?usp=sharing

damian0815 commented 10 months ago

do note that a1111 uses a different engine (CompVis) than compel+diffusers (i.e. diffusers), so it's not a good idea to expect 1:1 results

sarmientoj24 commented 9 months ago

@tg-bomze do you happen to know how to replicate how A1111 (1) loads multiple LoRAs with different scales, and (2) only loads a LoRA when it appears in the prompt?

tg-bomze commented 9 months ago

@tg-bomze do you happen to know how to replicate how A1111 (1) loads multiple LoRAs with different scales, and (2) only loads a LoRA when it appears in the prompt?

In an A1111 prompt a LoRA looks like this: a dog <lora:lora_name:1.0>, but that tag is just initialization. For diffusers, that part of the prompt is removed (leaving just a dog), and the LoRA is initialized after creating the pipeline, for example using this script, like this:

import torch
from safetensors.torch import load_file
from lora_sdxl import create_network_from_weights

device = "cuda"

def add_new_lora(unet, text_encoder, path_to_lora, lora_weight):
    sd = load_file(str(path_to_lora))
    network, sd = create_network_from_weights(lora_weight, None, text_encoder, unet, sd)
    network.apply_to(text_encoder, unet)
    network.load_state_dict(sd, False)
    network.to(device, dtype=torch.float16)

add_new_lora(pipeline.unet, pipeline.text_encoder, './lora_name.safetensors', 1.0)
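
As an alternative sketch (not part of the script above, and the exact API depends on your diffusers version), newer diffusers releases can load LoRA weights directly on the pipeline and apply the scale at inference time:

# hypothetical local file name; any LoRA in safetensors format should work
pipeline.load_lora_weights(".", weight_name="lora_name.safetensors")

image = pipeline(
    "a dog",
    cross_attention_kwargs={"scale": 1.0},  # LoRA strength
    num_inference_steps=30,
).images[0]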

tg-bomze commented 9 months ago

Update: proper partitioning of weighted prompts into chunks is now supported (A1111 format only). You need only one file; just write from prompt_parser import get_embed_new

import torch
from compel import Compel
from diffusers import StableDiffusionPipeline, DDIMScheduler
from prompt_parser import get_embed_new

pipeline = StableDiffusionPipeline.from_single_file(
    "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
    torch_dtype=torch.float16,
)
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
pipeline.to("cuda")

compel = Compel(tokenizer=pipeline.tokenizer, text_encoder=pipeline.text_encoder, truncate_long_prompts=False)

Example:

prompt = "A highly detailed matte painting of retro futurist mount fuji hosting the olympics, aerial photography, ultrawide lens, by dan mumford, yusuke murata, makoto shinkai, ross tran, cosmic, heavenly, god rays, intricate detail, cinematic, 8 k, cel shaded, unreal engine, featured on artstation, pixiv, hd sharp 3d model vray render in pixar squareenix game anime manga toriyama miyazaki style trending on pixiv skeb"
negative_prompt = "deformed iris, deformed pupils, text, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, camera"

prompt_emb = get_embed_new(prompt, pipeline, compel)
negative_prompt_emb = get_embed_new(negative_prompt, pipeline, compel)

image = pipeline(
    prompt_embeds=prompt_emb,
    negative_prompt_embeds=negative_prompt_emb,
    width=512,
    height=512,
    num_inference_steps=30,
    num_images_per_prompt=1,
    guidance_scale=7,
    generator=torch.Generator(device='cuda').manual_seed(42)
).images[0]
image

Colab

segalinc commented 8 months ago

@tg-bomze do you also have this file for the SDXL version? Compel 2.0.2 still seems to have the issue: Token indices sequence length is longer than the specified maximum sequence length for this model (176 > 77). Running this sequence through the model will result in indexing errors

damian0815 commented 7 months ago

the Compel code already works around that message, you can ignore it

andreineamtu commented 1 month ago

Update: proper partitioning of weighted prompts into chunks is now supported (A1111 format only). You need only one file; just write from prompt_parser import get_embed_new


I run into this issue: ValueError: prompt_embeds and negative_prompt_embeds must have the same shape when passed directly, but got: prompt_embeds torch.Size([1, 154, 768]) != negative_prompt_embeds torch.Size([1, 77, 768]).
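
The shapes differ because the prompt produced two 77-token chunks and the negative prompt only one. As mentioned earlier in the thread, Compel can pad the shorter tensor before the pipeline call; a minimal sketch (assuming prompt_emb and negative_prompt_emb are the tensors built above, compel is the instance from the quoted snippet, and get_embed_new returns plain conditioning tensors):

[prompt_emb, negative_prompt_emb] = compel.pad_conditioning_tensors_to_same_length(
    [prompt_emb, negative_prompt_emb]
)  # pads the [1, 77, 768] negative embedding up to [1, 154, 768]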