[BUG?] LoRA training will generate a bugged/broken file if the network alpha is set above 1.

Atoli commented 9 months ago

Any LoRA I train with a network alpha higher than 1 ends in NaN loss and subsequently does not work when tested:

gui1

Here are my full settings. It's a standard LoRA training, nothing out of the ordinary:

gui2

In text version:

{
  "LoRA_type": "Standard",
  "LyCORIS_preset": "full",
  "adaptive_noise_scale": 0,
  "additional_parameters": "",
  "block_alphas": "",
  "block_dims": "",
  "block_lr_zero_threshold": "",
  "bucket_no_upscale": true,
  "bucket_reso_steps": 64,
  "cache_latents": true,
  "cache_latents_to_disk": false,
  "caption_dropout_every_n_epochs": 0.0,
  "caption_dropout_rate": 0,
  "caption_extension": "",
  "clip_skip": 2,
  "color_aug": false,
  "conv_alpha": 1,
  "conv_block_alphas": "",
  "conv_block_dims": "",
  "conv_dim": 1,
  "debiased_estimation_loss": false,
  "decompose_both": false,
  "dim_from_weights": false,
  "down_lr_weight": "",
  "enable_bucket": true,
  "epoch": 10,
  "factor": -1,
  "flip_aug": false,
  "full_bf16": false,
  "full_fp16": false,
  "gradient_accumulation_steps": "1",
  "gradient_checkpointing": false,
  "keep_tokens": "0",
  "learning_rate": 0.0001,
  "logging_dir": "",
  "lora_network_weights": "",
  "lr_scheduler": "cosine_with_restarts",
  "lr_scheduler_args": "",
  "lr_scheduler_num_cycles": "3",
  "lr_scheduler_power": "",
  "lr_warmup": "10",
  "max_bucket_reso": 2048,
  "max_data_loader_n_workers": "0",
  "max_resolution": "512,512",
  "max_timestep": 1000,
  "max_token_length": "75",
  "max_train_epochs": "",
  "max_train_steps": "",
  "mem_eff_attn": false,
  "mid_lr_weight": "",
  "min_bucket_reso": 256,
  "min_snr_gamma": 5,
  "min_timestep": 0,
  "mixed_precision": "bf16",
  "model_list": "custom",
  "module_dropout": 0,
  "multires_noise_discount": 0,
  "multires_noise_iterations": 0,
  "network_alpha": 16,
  "network_dim": 32,
  "network_dropout": 0,
  "no_token_padding": false,
  "noise_offset": 0,
  "noise_offset_type": "Original",
  "num_cpu_threads_per_process": 2,
  "optimizer": "AdamW8bit",
  "optimizer_args": "",
  "output_dir": "C:/Users/PC/Desktop",
  "output_name": "TEST",
  "persistent_data_loader_workers": false,
  "pretrained_model_name_or_path": "F:/AI_Checkpoints/Stable-diffusion/nai.ckpt",
  "prior_loss_weight": 1.0,
  "random_crop": false,
  "rank_dropout": 0,
  "reg_data_dir": "",
  "resume": "",
  "sample_every_n_epochs": 0,
  "sample_every_n_steps": 0,
  "sample_prompts": "",
  "sample_sampler": "euler_a",
  "save_every_n_epochs": 10,
  "save_every_n_steps": 0,
  "save_last_n_steps": 0,
  "save_last_n_steps_state": 0,
  "save_model_as": "safetensors",
  "save_precision": "fp16",
  "save_state": false,
  "scale_v_pred_loss_like_noise_pred": false,
  "scale_weight_norms": 0,
  "sdxl": false,
  "sdxl_cache_text_encoder_outputs": false,
  "sdxl_no_half_vae": true,
  "seed": "1234",
  "shuffle_caption": false,
  "stop_text_encoder_training": 0,
  "text_encoder_lr": 5e-05,
  "train_batch_size": 2,
  "train_data_dir": "D:/Datasets/CLIPtests",
  "train_on_input": true,
  "training_comment": "",
  "unet_lr": 0.0005,
  "unit": 1,
  "up_lr_weight": "",
  "use_cp": false,
  "use_wandb": false,
  "v2": false,
  "v_parameterization": false,
  "v_pred_like_loss": 0,
  "vae": "",
  "vae_batch_size": 0,
  "wandb_api_key": "",
  "weighted_captions": false,
  "xformers": "xformers"
}

3Dkirill commented 7 months ago

Hey, I can reproduce it. Is there a solution for this? Maybe the learning rate needs to be adjusted differently? (RTX 3080 Ti)

bmaltais commented 7 months ago

Please report this to kohya on his sd-scripts repo. I only wrap his code with a GUI. He is the one who needs to fix this issue.

On Fri, Feb 16, 2024 at 13:01 3Dkirill @.***> wrote:

Hey, I can reproduce it. Is there a solution for this? Maybe the learning rate needs to be adjusted differently? (RTX 3080 Ti)

— Reply to this email directly, view it on GitHub https://github.com/bmaltais/kohya_ss/issues/1758#issuecomment-1948995409, or unsubscribe https://github.com/notifications/unsubscribe-auth/ABZA34TNVDDHSLTGMNOZ3I3YT6NGVAVCNFSM6AAAAABAN2Z3PWVHI2DSMVQWIX3LMV43OSLTON2WKQ3PNVWWK3TUHMYTSNBYHE4TKNBQHE . You are receiving this because you modified the open/close state.Message ID: @.***>

3Dkirill commented 7 months ago

Thank you for the response and for your work

bmaltais / kohya_ss

[BUG?] LoRA training will generate a bugged/broken file if the network alpha is set above 1. #1758