Open betterftr opened 4 months ago
Update: did a fresh install on master branch with full default cascade settings, this time I even included training the prior (thought maybe the issue is with tenc only) but the issue is still there, this is the config:
{
"__version": 4,
"training_method": "FINE_TUNE",
"model_type": "STABLE_CASCADE_1",
"debug_mode": false,
"debug_dir": "C:/train/debug",
"workspace_dir": "C:\\train",
"cache_dir": "C:\\train",
"tensorboard": true,
"tensorboard_expose": false,
"continue_last_backup": false,
"include_train_config": "NONE",
"base_model_name": "stabilityai/stable-cascade-prior",
"weight_dtype": "BFLOAT_16",
"output_dtype": "BFLOAT_16",
"output_model_format": "SAFETENSORS",
"output_model_destination": "models/model",
"gradient_checkpointing": true,
"force_circular_padding": false,
"concept_file_name": "training_concepts/concepts.json",
"concepts": null,
"aspect_ratio_bucketing": true,
"latent_caching": true,
"clear_cache_before_training": false,
"learning_rate_scheduler": "CONSTANT",
"custom_learning_rate_scheduler": null,
"scheduler_params": [],
"learning_rate": 1.0,
"learning_rate_warmup_steps": 10,
"learning_rate_cycles": 1,
"epochs": 100,
"batch_size": 1,
"gradient_accumulation_steps": 1,
"ema": "OFF",
"ema_decay": 0.999,
"ema_update_step_interval": 5,
"dataloader_threads": 2,
"train_device": "cuda",
"temp_device": "cpu",
"train_dtype": "BFLOAT_16",
"fallback_train_dtype": "BFLOAT_16",
"enable_autocast_cache": false,
"only_cache": false,
"resolution": "512",
"attention_mechanism": "XFORMERS",
"align_prop": false,
"align_prop_probability": 0.1,
"align_prop_loss": "AESTHETIC",
"align_prop_weight": 0.01,
"align_prop_steps": 20,
"align_prop_truncate_steps": 0.5,
"align_prop_cfg_scale": 7.0,
"mse_strength": 1.0,
"mae_strength": 0.0,
"vb_loss_strength": 1.0,
"loss_weight_fn": "MIN_SNR_GAMMA",
"loss_weight_strength": 5.0,
"dropout_probability": 0.0,
"loss_scaler": "NONE",
"learning_rate_scaler": "NONE",
"offset_noise_weight": 0.0,
"perturbation_noise_weight": 0.0,
"rescale_noise_scheduler_to_zero_terminal_snr": false,
"force_v_prediction": false,
"force_epsilon_prediction": false,
"min_noising_strength": 0.0,
"max_noising_strength": 1.0,
"timestep_distribution": "UNIFORM",
"noising_weight": 0.0,
"noising_bias": 0.0,
"unet": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": 0,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true
},
"prior": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": 0,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true
},
"text_encoder": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": 0,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true
},
"text_encoder_layer_skip": 0,
"text_encoder_2": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": 30,
"stop_training_after_unit": "EPOCH",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true
},
"text_encoder_2_layer_skip": 0,
"text_encoder_3": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": 30,
"stop_training_after_unit": "EPOCH",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true
},
"text_encoder_3_layer_skip": 0,
"vae": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "FLOAT_32",
"dropout_probability": 0.0,
"train_embedding": true
},
"effnet_encoder": {
"__version": 0,
"model_name": "C:/train/cascade/effnet_encoder.safetensors",
"include": true,
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true
},
"decoder": {
"__version": 0,
"model_name": "stabilityai/stable-cascade",
"include": true,
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true
},
"decoder_text_encoder": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true
},
"decoder_vqgan": {
"__version": 0,
"model_name": "",
"include": true,
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE",
"dropout_probability": 0.0,
"train_embedding": true
},
"masked_training": false,
"unmasked_probability": 0.1,
"unmasked_weight": 0.1,
"normalize_masked_area_loss": false,
"embedding_learning_rate": null,
"preserve_embedding_norm": false,
"embedding": {
"__version": 0,
"uuid": "0ed09eb2-a5b2-46f2-8837-0e2d2f38d351",
"model_name": "",
"placeholder": "<embedding>",
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"token_count": 1,
"initial_embedding_text": "*"
},
"additional_embeddings": [],
"embedding_weight_dtype": "FLOAT_32",
"lora_model_name": "",
"lora_rank": 16,
"lora_alpha": 1.0,
"lora_weight_dtype": "FLOAT_32",
"bundle_additional_embeddings": true,
"optimizer": {
"__version": 0,
"optimizer": "PRODIGY",
"adam_w_mode": false,
"alpha": null,
"amsgrad": false,
"beta1": 0.9,
"beta2": 0.999,
"beta3": null,
"bias_correction": false,
"block_wise": false,
"capturable": false,
"centered": false,
"clip_threshold": null,
"d0": 1e-06,
"d_coef": 2.0,
"dampening": null,
"decay_rate": null,
"decouple": true,
"differentiable": false,
"eps": 1e-08,
"eps2": null,
"foreach": false,
"fsdp_in_use": false,
"fused": false,
"fused_back_pass": false,
"growth_rate": "inf",
"initial_accumulator_value": null,
"is_paged": false,
"log_every": null,
"lr_decay": null,
"max_unorm": null,
"maximize": false,
"min_8bit_size": null,
"momentum": null,
"nesterov": false,
"no_prox": false,
"optim_bits": null,
"percentile_clipping": null,
"r": null,
"relative_step": false,
"safeguard_warmup": true,
"scale_parameter": false,
"stochastic_rounding": true,
"use_bias_correction": true,
"use_triton": false,
"warmup_init": false,
"weight_decay": 0.05,
"weight_lr_power": null,
"decoupled_decay": false,
"fixed_decay": false,
"rectify": false,
"degenerated_to_sgd": false,
"k": null,
"xi": null,
"n_sma_threshold": null,
"ams_bound": false,
"adanorm": false,
"adam_debias": false
},
"optimizer_defaults": {
"ADAFACTOR": {
"__version": 0,
"optimizer": "ADAFACTOR",
"adam_w_mode": false,
"alpha": null,
"amsgrad": false,
"beta1": null,
"beta2": null,
"beta3": null,
"bias_correction": false,
"block_wise": false,
"capturable": false,
"centered": false,
"clip_threshold": 1.0,
"d0": null,
"d_coef": null,
"dampening": null,
"decay_rate": -0.8,
"decouple": false,
"differentiable": false,
"eps": 1e-30,
"eps2": 0.001,
"foreach": false,
"fsdp_in_use": false,
"fused": false,
"fused_back_pass": false,
"growth_rate": null,
"initial_accumulator_value": null,
"is_paged": false,
"log_every": null,
"lr_decay": null,
"max_unorm": null,
"maximize": false,
"min_8bit_size": null,
"momentum": null,
"nesterov": false,
"no_prox": false,
"optim_bits": null,
"percentile_clipping": null,
"r": null,
"relative_step": false,
"safeguard_warmup": false,
"scale_parameter": false,
"stochastic_rounding": true,
"use_bias_correction": false,
"use_triton": false,
"warmup_init": false,
"weight_decay": 0.0,
"weight_lr_power": null,
"decoupled_decay": false,
"fixed_decay": false,
"rectify": false,
"degenerated_to_sgd": false,
"k": null,
"xi": null,
"n_sma_threshold": null,
"ams_bound": false,
"adanorm": false,
"adam_debias": false
},
"PRODIGY": {
"__version": 0,
"optimizer": "PRODIGY",
"adam_w_mode": false,
"alpha": null,
"amsgrad": false,
"beta1": 0.9,
"beta2": 0.999,
"beta3": null,
"bias_correction": false,
"block_wise": false,
"capturable": false,
"centered": false,
"clip_threshold": null,
"d0": 1e-06,
"d_coef": 2.0,
"dampening": null,
"decay_rate": null,
"decouple": true,
"differentiable": false,
"eps": 1e-08,
"eps2": null,
"foreach": false,
"fsdp_in_use": false,
"fused": false,
"fused_back_pass": false,
"growth_rate": "inf",
"initial_accumulator_value": null,
"is_paged": false,
"log_every": null,
"lr_decay": null,
"max_unorm": null,
"maximize": false,
"min_8bit_size": null,
"momentum": null,
"nesterov": false,
"no_prox": false,
"optim_bits": null,
"percentile_clipping": null,
"r": null,
"relative_step": false,
"safeguard_warmup": true,
"scale_parameter": false,
"stochastic_rounding": true,
"use_bias_correction": true,
"use_triton": false,
"warmup_init": false,
"weight_decay": 0.05,
"weight_lr_power": null,
"decoupled_decay": false,
"fixed_decay": false,
"rectify": false,
"degenerated_to_sgd": false,
"k": null,
"xi": null,
"n_sma_threshold": null,
"ams_bound": false,
"adanorm": false,
"adam_debias": false
}
},
"sample_definition_file_name": "training_samples/test.json",
"samples": null,
"sample_after": 5,
"sample_after_unit": "NEVER",
"sample_image_format": "JPG",
"samples_to_tensorboard": true,
"non_ema_sampling": true,
"backup_after": 30,
"backup_after_unit": "MINUTE",
"rolling_backup": false,
"rolling_backup_count": 3,
"backup_before_save": true,
"save_after": 0,
"save_after_unit": "NEVER",
"save_filename_prefix": ""
}
Having similar issue on main branch, but when training SDXL lora, I can see:
So Text Encoder LR displays as expected but UNET LR not.
Found the problem: changing safeguard warmup/coef doesn't break it, but changing initial D or bias correction does. Orange: Prodigy's default settings loaded. Blue: bias correction turned on. Green: initial D changed to 1e-07.
this is on master branch
Having similar issue on main branch, but when training SDXL lora, I can see:
[snip]
So Text Encoder LR displays as expected but UNET LR not.
This just looks like a display scale issue, view the right graph at the same scale as the left graph.
Having similar issue on main branch, but when training SDXL lora, I can see: [snip] So Text Encoder LR displays as expected but UNET LR not.
This just looks like a display scale issue, view the right graph at the same scale as the left graph.
You are right. I missed the different scales.
Meanwhile, I am still having a similar problem to OP's: even when using default Prodigy settings, the LR shows as I set it with the standard scheduler (constant, cosine, etc.) but not as adaptive.
I am now seeing the same behavior with other adaptive optimizers too. Here is data from a recent run, using Adafactor (adaptive) and cosine with restart. The lora is trained okay.
Is this expected behavior for adaptive schedulers? Or should I be seeing 'actual LR', like what OP posted above?
Adafactor is different and, should you turn that mode on, will not show a derived LR on the tensorboard graphs.
For Prodigy, I'd need to see your graph and your settings. It should show the true LR.
A Prodigy default setting run, using cosine with hard restart, single cycle. Stopped early:
The whole setting or just the optimizer part?
What exactly is the problem? That looks fine. Prodigy found your learning rate. OP is finding Prodigy not moving from the d0 value.
I was expecting the LR to be more 'rough' and not 'smooth', due to adaptive optimizer modifying LR throughout the run, like this example shown on the Optimizer wiki page:
Did you mean that in the run above, Prodigy found 0.0034 and set it just once, and didn't find a different LR for the rest of run?
It's only rough like that image if it doesn't find a good enough estimate for the problem at first. It found its best estimate early, and never needed to increase it again.
Did you mean that in the run above, Prodigy found 0.0034 and set it just once, and didn't find a different LR for the rest of run?
Correct.
Interesting. Here is another run. Prodigy default. Completely different training data from the above.
I did 9 long runs like these, across 3 different datasets (300+ images each). Some with default Prodigy settings, some changed according to Optimizer wiki page. I saw the same behavior: LR set once.
Is that behavior so common? Or am I just lucky?
Adafactor is different and, should you turn that mode on, will not show a derived LR on the tensorboard graphs.
Side question: is there a way to make Adafactor log derived LR to tensorboard? Or is that inherently impossible due to how the Adafactor optimizer works?
It is working fine for me. I am just interested in how it adjusts LR under the hood.
update again: reinstalled windows, I have only a git, a python 3.10, an nvidia driver and OneTrainer. Results are still the same. Also did test kohya as well to see if it is maybe OT's problem but got the same results (they have cu118+torch212): Kohya SDXL
deep orange=og prodigy settings
orange=text encoder only training with args: decouple=True weight_decay=0.05 d_coef=2 use_bias_correction=True safeguard_warmup=True betas=0.9,0.99
blue=unet only training with args: decouple=True weight_decay=0.05 d_coef=2 use_bias_correction=True safeguard_warmup=True betas=0.9,0.99
So this either means I have a problem with my clean windows 11 system or with my 4090.
So I went further and tested a different Prodigy implementation namely from pytorch-optimizer (https://pytorch-optimizers.readthedocs.io/en/latest/optimizer/)
I changed the Prodigy implementation in OT to this one (had to rename some stuff, like decouple to weight_decouple, as per the documentation on their site), turned on bias correction, which gave me the problem in the og implementation, and voila:
it is working. So the problem must be og Prodigy optimizer. Or something in that combined with something with my setup
1 problem still remains; the different colors are different combinations of safeguard warmup, decouple and bias correction, the orange one is a changed Initial D (1e-07), which still breaks it even in this implementation:
What happened?
Trying to train Cascade tenc only with Prodigy, but no matter what I tried to change, the LR stays at the initial D value of the optimizer setting and does not move, and the model does not learn anything.
(sd3_attention_mask branch)
What did you expect would happen?
config:
Relevant log output
No response
Output of `pip freeze`