Nerogar / OneTrainer

OneTrainer is a one-stop solution for all your stable diffusion training needs.
GNU Affero General Public License v3.0
1.78k stars 149 forks source link

[Bug]: Enabling safeguard warmup produces NaN #253

Closed crapthings closed 7 months ago

crapthings commented 7 months ago

What happened?

Enabling the safeguard warmup option produces NaN.

image image
{
    "__version": 3,
    "training_method": "FINE_TUNE",
    "model_type": "STABLE_DIFFUSION_15_INPAINTING",
    "debug_mode": false,
    "debug_dir": "debug",
    "workspace_dir": "/home/zznet/workspace/onetrainer-finetune-inpainting",
    "cache_dir": "/home/zznet/workspace/onetrainer-finetune-inpainting/cache",
    "tensorboard": true,
    "tensorboard_expose": true,
    "continue_last_backup": false,
    "include_train_config": "NONE",
    "base_model_name": "/home/zznet/workspace/common-model/majicmixRealistic_v7-inpainting.safetensors",
    "weight_dtype": "FLOAT_16",
    "output_dtype": "FLOAT_16",
    "output_model_format": "SAFETENSORS",
    "output_model_destination": "/home/zznet/workspace/onetrainer-finetune-inpainting/model-inpainting.safetensors",
    "gradient_checkpointing": false,
    "concept_file_name": "training_concepts/concepts.json",
    "concepts": [
        {
            "__version": 0,
            "image": {
                "__version": 0,
                "enable_crop_jitter": false,
                "enable_random_flip": false,
                "enable_fixed_flip": false,
                "enable_random_rotate": false,
                "enable_fixed_rotate": false,
                "random_rotate_max_angle": 0.0,
                "enable_random_brightness": false,
                "enable_fixed_brightness": false,
                "random_brightness_max_strength": 0.0,
                "enable_random_contrast": false,
                "enable_fixed_contrast": false,
                "random_contrast_max_strength": 0.0,
                "enable_random_saturation": false,
                "enable_fixed_saturation": false,
                "random_saturation_max_strength": 0.0,
                "enable_random_hue": false,
                "enable_fixed_hue": false,
                "random_hue_max_strength": 0.0,
                "enable_resolution_override": false,
                "resolution_override": "512"
            },
            "text": {
                "__version": 0,
                "prompt_source": "concept",
                "prompt_path": "/home/zznet/workspace/1-dataset/prompt.txt",
                "enable_tag_shuffling": false,
                "tag_delimiter": ",",
                "keep_tags_count": 1
            },
            "name": "",
            "path": "/home/zznet/workspace/1-dataset/10-woman",
            "seed": 135888675,
            "enabled": true,
            "include_subdirectories": false,
            "image_variations": 1,
            "text_variations": 1,
            "repeats": 1.0,
            "loss_weight": 1.0
        }
    ],
    "circular_mask_generation": true,
    "random_rotate_and_crop": false,
    "aspect_ratio_bucketing": true,
    "latent_caching": true,
    "clear_cache_before_training": true,
    "learning_rate_scheduler": "CONSTANT",
    "learning_rate": 1.0,
    "learning_rate_warmup_steps": 0,
    "learning_rate_cycles": 1,
    "epochs": 101,
    "batch_size": 1,
    "gradient_accumulation_steps": 1,
    "ema": "GPU",
    "ema_decay": 0.998,
    "ema_update_step_interval": 5,
    "train_device": "cuda",
    "temp_device": "cpu",
    "train_dtype": "FLOAT_16",
    "fallback_train_dtype": "BFLOAT_16",
    "enable_autocast_cache": true,
    "only_cache": false,
    "resolution": "512,768,896,1024,1152",
    "attention_mechanism": "SDP",
    "align_prop": false,
    "align_prop_probability": 0.1,
    "align_prop_loss": "AESTHETIC",
    "align_prop_weight": 0.01,
    "align_prop_steps": 20,
    "align_prop_truncate_steps": 0.5,
    "align_prop_cfg_scale": 7.0,
    "mse_strength": 1.0,
    "mae_strength": 0.0,
    "vb_loss_strength": 1.0,
    "loss_weight_fn": "CONSTANT",
    "loss_weight_strength": 5.0,
    "dropout_probability": 0.0,
    "loss_scaler": "NONE",
    "learning_rate_scaler": "NONE",
    "offset_noise_weight": 0.0,
    "perturbation_noise_weight": 0.0,
    "rescale_noise_scheduler_to_zero_terminal_snr": false,
    "force_v_prediction": false,
    "force_epsilon_prediction": false,
    "min_noising_strength": 0.0,
    "max_noising_strength": 1.0,
    "noising_weight": 0.0,
    "noising_bias": 0.5,
    "unet": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": 0,
        "stop_training_after_unit": "NEVER",
        "learning_rate": 1.0,
        "weight_dtype": "FLOAT_16"
    },
    "prior": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": 0,
        "stop_training_after_unit": "NEVER",
        "learning_rate": null,
        "weight_dtype": "NONE"
    },
    "text_encoder": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": 0,
        "stop_training_after_unit": "NEVER",
        "learning_rate": null,
        "weight_dtype": "FLOAT_16"
    },
    "text_encoder_layer_skip": 0,
    "text_encoder_2": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": 30,
        "stop_training_after_unit": "EPOCH",
        "learning_rate": null,
        "weight_dtype": "NONE"
    },
    "text_encoder_2_layer_skip": 0,
    "vae": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": null,
        "stop_training_after_unit": "NEVER",
        "learning_rate": null,
        "weight_dtype": "FLOAT_32"
    },
    "effnet_encoder": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": null,
        "stop_training_after_unit": "NEVER",
        "learning_rate": null,
        "weight_dtype": "NONE"
    },
    "decoder": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": null,
        "stop_training_after_unit": "NEVER",
        "learning_rate": null,
        "weight_dtype": "NONE"
    },
    "decoder_text_encoder": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": null,
        "stop_training_after_unit": "NEVER",
        "learning_rate": null,
        "weight_dtype": "NONE"
    },
    "decoder_vqgan": {
        "__version": 0,
        "model_name": "",
        "train": true,
        "stop_training_after": null,
        "stop_training_after_unit": "NEVER",
        "learning_rate": null,
        "weight_dtype": "NONE"
    },
    "masked_training": true,
    "unmasked_probability": 0.0,
    "unmasked_weight": 0.0,
    "normalize_masked_area_loss": true,
    "embeddings": [
        {
            "__version": 0,
            "model_name": "",
            "train": true,
            "stop_training_after": null,
            "stop_training_after_unit": "NEVER",
            "token_count": 1,
            "initial_embedding_text": "*",
            "weight_dtype": "FLOAT_32"
        }
    ],
    "embedding_weight_dtype": "FLOAT_32",
    "lora_model_name": "",
    "lora_rank": 16,
    "lora_alpha": 1.0,
    "lora_weight_dtype": "FLOAT_32",
    "optimizer": {
        "__version": 0,
        "optimizer": "PRODIGY",
        "adam_w_mode": false,
        "alpha": null,
        "amsgrad": false,
        "beta1": 0.9,
        "beta2": 0.999,
        "beta3": null,
        "bias_correction": false,
        "block_wise": false,
        "capturable": false,
        "centered": false,
        "clip_threshold": null,
        "d0": 1e-06,
        "d_coef": 1.0,
        "dampening": null,
        "decay_rate": null,
        "decouple": false,
        "differentiable": false,
        "eps": 1e-08,
        "eps2": null,
        "foreach": false,
        "fsdp_in_use": false,
        "fused": false,
        "fused_back_pass": false,
        "growth_rate": "inf",
        "initial_accumulator_value": null,
        "is_paged": false,
        "log_every": null,
        "lr_decay": null,
        "max_unorm": null,
        "maximize": false,
        "min_8bit_size": null,
        "momentum": null,
        "nesterov": false,
        "no_prox": false,
        "optim_bits": null,
        "percentile_clipping": null,
        "relative_step": false,
        "safeguard_warmup": false,
        "scale_parameter": false,
        "stochastic_rounding": true,
        "use_bias_correction": true,
        "use_triton": false,
        "warmup_init": false,
        "weight_decay": 0.01
    },
    "optimizer_defaults": {
        "ADAMW": {
            "__version": 0,
            "optimizer": "ADAMW",
            "adam_w_mode": false,
            "alpha": null,
            "amsgrad": false,
            "beta1": 0.9,
            "beta2": 0.999,
            "beta3": null,
            "bias_correction": false,
            "block_wise": false,
            "capturable": false,
            "centered": false,
            "clip_threshold": null,
            "d0": null,
            "d_coef": null,
            "dampening": null,
            "decay_rate": null,
            "decouple": false,
            "differentiable": false,
            "eps": 1e-08,
            "eps2": null,
            "foreach": false,
            "fsdp_in_use": false,
            "fused": false,
            "fused_back_pass": false,
            "growth_rate": null,
            "initial_accumulator_value": null,
            "is_paged": false,
            "log_every": null,
            "lr_decay": null,
            "max_unorm": null,
            "maximize": false,
            "min_8bit_size": null,
            "momentum": null,
            "nesterov": false,
            "no_prox": false,
            "optim_bits": null,
            "percentile_clipping": null,
            "relative_step": false,
            "safeguard_warmup": false,
            "scale_parameter": false,
            "stochastic_rounding": true,
            "use_bias_correction": false,
            "use_triton": false,
            "warmup_init": false,
            "weight_decay": 0.01
        },
        "PRODIGY": {
            "__version": 0,
            "optimizer": "PRODIGY",
            "adam_w_mode": false,
            "alpha": null,
            "amsgrad": false,
            "beta1": 0.9,
            "beta2": 0.999,
            "beta3": null,
            "bias_correction": false,
            "block_wise": false,
            "capturable": false,
            "centered": false,
            "clip_threshold": null,
            "d0": 1e-06,
            "d_coef": 1.0,
            "dampening": null,
            "decay_rate": null,
            "decouple": false,
            "differentiable": false,
            "eps": 1e-08,
            "eps2": null,
            "foreach": false,
            "fsdp_in_use": false,
            "fused": false,
            "fused_back_pass": false,
            "growth_rate": "inf",
            "initial_accumulator_value": null,
            "is_paged": false,
            "log_every": null,
            "lr_decay": null,
            "max_unorm": null,
            "maximize": false,
            "min_8bit_size": null,
            "momentum": null,
            "nesterov": false,
            "no_prox": false,
            "optim_bits": null,
            "percentile_clipping": null,
            "relative_step": false,
            "safeguard_warmup": false,
            "scale_parameter": false,
            "stochastic_rounding": true,
            "use_bias_correction": true,
            "use_triton": false,
            "warmup_init": false,
            "weight_decay": 0.01
        }
    },
    "sample_definition_file_name": "training_samples/samples.json",
    "samples": [],
    "sample_after": 10,
    "sample_after_unit": "MINUTE",
    "sample_image_format": "JPG",
    "samples_to_tensorboard": true,
    "non_ema_sampling": true,
    "backup_after": 30,
    "backup_after_unit": "NEVER",
    "rolling_backup": false,
    "rolling_backup_count": 3,
    "backup_before_save": false,
    "save_after": 10,
    "save_after_unit": "EPOCH",
    "save_filename_prefix": ""
}

What did you expect would happen?

Training should not produce NaN.

Relevant log output

No response

Output of pip freeze

No response

mx commented 7 months ago

Not a bug. Using safeguard warmup without any warmup steps causes Prodigy to misestimate its learning rate parameter. Don't enable it if you don't use warmup steps.