Linaqruf / kohya-trainer

Adapted from https://note.com/kohya_ss/n/nbf7ce8d80f29 for easier cloning
Apache License 2.0
1.82k stars 296 forks source link

Aspect ratio bucketing in SDXL lora training #343

Open ArtLeav opened 4 months ago

ArtLeav commented 4 months ago

Hello! I have a question/problem: how can I make sure that resizing (presumably referring to image resizing) does not reduce the longer side of the image to 1024 pixels, but instead maintains specific ratios such as 768x1344? I tried making `bucket_resolution` higher (up to 2048) but it doesn't help. [image attached]

ArtLeav commented 4 months ago

My attempt to solve it looks like this:


# @title ## **3.4. Bucketing and Latents Caching**
# Restore variables (training_dir, train_data_dir, model_path, vae_path, ...)
# saved by earlier notebook cells (IPython storemagic).
%store -r

# @markdown This code will create buckets based on the `bucket_resolution` provided for multi-aspect ratio training, and then convert all images within the `train_data_dir` to latents.
# meta_clean.json (merged captions) is the input of prepare_buckets_latents.py,
# which writes the bucketed latents metadata to meta_lat.json.
bucketing_json    = os.path.join(training_dir, "meta_lat.json")
metadata_json     = os.path.join(training_dir, "meta_clean.json")
bucket_resolution = 1344  # @param {type:"slider", min:512, max:2048, step:64}
bucket_reso_steps = 32  # @param {type:"slider", min:64, max:2048, step:32}
min_bucket_reso = 512  # @param {type:"slider", min:512, max:1600, step:64}
max_bucket_reso = 1536  # @param {type:"slider", min:512, max:1600, step:64}

mixed_precision   = "no"  # @param ["no", "fp16", "bf16"] {allow-input: false}
skip_existing     = False  # @param{type:"boolean"}
flip_aug          = False  # @param{type:"boolean"}
# @markdown Use `clean_caption` option to clean such as duplicate tags, `women` to `girl`, etc
clean_caption     = True #@param {type:"boolean"}
#@markdown Use the `recursive` option to process subfolders as well
recursive         = True #@param {type:"boolean"}

# Arguments for merge_all_to_metadata.py. Keys prefixed with "_" are emitted
# by generate_args as bare positional arguments; all other keys become
# --key options.
metadata_config = {
    "_train_data_dir": train_data_dir,
    "_out_json": metadata_json,
    "recursive": recursive,
    "full_path": recursive,
    "clean_caption": clean_caption
}

# Arguments for prepare_buckets_latents.py.
# NOTE(review): bucket_reso_steps is defined above but never added to this
# config, so the script will fall back to its own default — confirm intended.
bucketing_config = {
    "min_bucket_reso": f"{min_bucket_reso}",
    "max_bucket_reso": f"{max_bucket_reso}",
    "_train_data_dir": train_data_dir,
    "_in_json": metadata_json,
    "_out_json": bucketing_json,
    # Prefer a dedicated VAE when one is configured, else the base model.
    "_model_name_or_path": vae_path if vae_path else model_path,
    "recursive": recursive,
    "full_path": recursive,
    "flip_aug": flip_aug,
    "skip_existing": skip_existing,
    "batch_size": 1,
    "max_data_loader_n_workers": 2,
    "max_resolution": f"{bucket_resolution}, {bucket_resolution}",
    "mixed_precision": mixed_precision,
}

def generate_args(config):
    """Build a shell argument string from a config dict.

    Emission rules:
    - keys starting with "_" are positional: only the quoted value is emitted;
    - str values become --key="value";
    - bool values become a bare --key flag when True and are omitted when False;
    - int/float values become --key=value.

    Returns the space-separated argument string.
    """
    parts = []
    for key, value in config.items():
        if key.startswith("_"):
            parts.append(f'"{value}"')
        elif isinstance(value, bool):
            # Must be checked before int: bool is a subclass of int in Python.
            if value:
                parts.append(f"--{key}")
        elif isinstance(value, str):
            parts.append(f'--{key}="{value}"')
        elif isinstance(value, (int, float)):
            parts.append(f"--{key}={value}")
    return " ".join(parts)

# Render the config dicts into CLI argument strings.
merge_metadata_args = generate_args(metadata_config)
prepare_buckets_args = generate_args(bucketing_config)

merge_metadata_command = f"python merge_all_to_metadata.py {merge_metadata_args}"
prepare_buckets_command = f"python prepare_buckets_latents.py {prepare_buckets_args}"

# The scripts live in finetune_dir; "!" runs a shell command (IPython magic).
os.chdir(finetune_dir)
!{merge_metadata_command}
# NOTE(review): presumably a short pause so the first command settles before
# the second starts — confirm it is actually needed.
time.sleep(1)
!{prepare_buckets_command}

but it upscales 1024x1024 images to 1536

ArtLeav commented 4 months ago

I made a simple script to resize pictures, but I don't know how to integrate it with the latent bucketing step


from PIL import Image
import os

# Source and destination folders for the batch resize.
src_dir = r'C:\Path\to\Images\Input'
dst_dir = r'C:\Path\to\Images\Output'

# JPEG quality passed to Image.save below.
quality_val = 100

# Candidate target resolutions as (width, height) pairs, covering portrait,
# square and landscape aspect ratios.
resolutions = [(1024, 1024), (896, 1152), (832, 1216), (768, 1344), (640, 1536), (1152, 896), (1216, 832), (1344, 768), (1536, 640)]

def resize_and_crop(img, size):
    """Resize img to cover size (width, height), then center-crop the excess.

    The image is scaled (aspect ratio preserved) so the target box is fully
    covered, then the overflow along one axis is cropped symmetrically.
    Returns a new PIL image; the input image is not modified.

    Fix over the original: the crop box is computed with integer offsets
    instead of float /2 divisions, so the crop window is exact rather than
    subject to Pillow's float-box rounding.
    """
    target_w, target_h = size
    img_ratio = img.size[0] / float(img.size[1])
    target_ratio = target_w / float(target_h)

    if target_ratio > img_ratio:
        # Image is proportionally taller: match widths, crop top/bottom.
        scaled_h = int(target_w * img.size[1] / img.size[0])
        img = img.resize((target_w, scaled_h), Image.LANCZOS)
        top = (img.size[1] - target_h) // 2
        img = img.crop((0, top, img.size[0], top + target_h))
    elif target_ratio < img_ratio:
        # Image is proportionally wider: match heights, crop left/right.
        scaled_w = int(target_h * img.size[0] / img.size[1])
        img = img.resize((scaled_w, target_h), Image.LANCZOS)
        left = (img.size[0] - target_w) // 2
        img = img.crop((left, 0, left + target_w, img.size[1]))
    else:
        # Aspect ratios already match: plain resize, no crop needed.
        img = img.resize((target_w, target_h), Image.LANCZOS)
    return img

def closest(lst, K):
    """Return the (width, height) pair in lst whose aspect ratio is nearest to K."""
    return min(lst, key=lambda res: abs(res[0] / res[1] - K))

# Batch-convert every image in src_dir: pick the closest-ratio target
# resolution, resize-and-crop to it, and save the result as JPEG in dst_dir.
for filename in os.listdir(src_dir):
    if filename.endswith(('.jpg', '.png', '.jpeg')):  # add file types as needed
        # Fix over the original: open inside a context manager so the source
        # file handle is closed promptly instead of leaking (Image.open is
        # lazy and otherwise keeps the file open).
        with Image.open(os.path.join(src_dir, filename)) as img:
            closest_res = closest(resolutions, img.size[0] / img.size[1])
            new_img = resize_and_crop(img, closest_res)
        if new_img.mode == 'RGBA':
            # JPEG cannot store an alpha channel; drop it before saving.
            new_img = new_img.convert('RGB')
        new_filename = f'{os.path.splitext(filename)[0]}.jpg'
        new_img.save(os.path.join(dst_dir, new_filename), quality=quality_val)
Taikakim commented 4 months ago

I just set the `bucket_no_upscale` flag, which fixed it, and changed the max resolution just in case:

But the weird thing was that when my training resolution was set to 1472, for example, I started to get really bad results after just a few hundred steps. So for now I'm keeping the training resolution at 1024px and the output is fine. But I'm not sure what the code is doing internally with images that are larger.

# @title ## **3.4. Bucketing and Latents Caching**
# Restore variables (training_dir, train_data_dir, model_path, ...) saved by
# earlier notebook cells (IPython storemagic).
%store -r

# @markdown This code will create buckets based on the `bucket_resolution` provided for multi-aspect ratio training, and then convert all images within the `train_data_dir` to latents.
# meta_clean.json (merged captions) is the input of prepare_buckets_latents.py,
# which writes the bucketed latents metadata to meta_lat.json.
bucketing_json    = os.path.join(training_dir, "meta_lat.json")
metadata_json     = os.path.join(training_dir, "meta_clean.json")
bucket_resolution = 1472  # @param {type:"slider", min:512, max:2048, step:32}
mixed_precision   = "bf16"  # @param ["no", "fp16", "bf16"] {allow-input: false}
flip_aug          = False  # @param{type:"boolean"}
# @markdown Use `clean_caption` option to clean such as duplicate tags, `women` to `girl`, etc
clean_caption     = False #@param {type:"boolean"}
#@markdown Use the `recursive` option to process subfolders as well
recursive         = False #@param {type:"boolean"}
skip_existing     = True #@param {type: "boolean"}
# NOTE(review): presumably tells prepare_buckets_latents.py not to upscale
# images into larger buckets — confirm against the script's --bucket_no_upscale
# option.
bucket_no_upscale = True #@param {type: "boolean"}

# Arguments for merge_all_to_metadata.py. Keys prefixed with "_" are emitted
# by generate_args as bare positional arguments; all other keys become
# --key options.
metadata_config = {
    "_train_data_dir": train_data_dir,
    "_out_json": metadata_json,
    "recursive": recursive,
    "full_path": recursive,
    "clean_caption": clean_caption
}

# Arguments for prepare_buckets_latents.py.
bucketing_config = {
    "_train_data_dir": train_data_dir,
    "_in_json": metadata_json,
    "_out_json": bucketing_json,
    "_model_name_or_path": model_path,
    "recursive": recursive,
    "full_path": recursive,
    "flip_aug": flip_aug,
    "batch_size": 24,
    "max_data_loader_n_workers": 8,
    "max_resolution": f"{bucket_resolution}, {bucket_resolution}",
    "mixed_precision": mixed_precision,
    "skip_existing": skip_existing,
    "bucket_no_upscale": bucket_no_upscale
}

def generate_args(config):
    """Build a shell argument string from a config dict.

    Emission rules:
    - keys starting with "_" are positional: only the quoted value is emitted;
    - str values become --key="value";
    - bool values become a bare --key flag when True and are omitted when False;
    - int/float values become --key=value.

    Returns the space-separated argument string.
    """
    parts = []
    for key, value in config.items():
        if key.startswith("_"):
            parts.append(f'"{value}"')
        elif isinstance(value, bool):
            # Must be checked before int: bool is a subclass of int in Python.
            if value:
                parts.append(f"--{key}")
        elif isinstance(value, str):
            parts.append(f'--{key}="{value}"')
        elif isinstance(value, (int, float)):
            parts.append(f"--{key}={value}")
    return " ".join(parts)

# Render the config dicts into CLI argument strings.
merge_metadata_args = generate_args(metadata_config)
prepare_buckets_args = generate_args(bucketing_config)

merge_metadata_command = f"python merge_all_to_metadata.py {merge_metadata_args}"
prepare_buckets_command = f"python prepare_buckets_latents.py {prepare_buckets_args}"

# The scripts live in finetune_dir; "!" runs a shell command (IPython magic).
os.chdir(finetune_dir)
!{merge_metadata_command}
# NOTE(review): presumably a short pause so the first command settles before
# the second starts — confirm it is actually needed.
time.sleep(1)
!{prepare_buckets_command}