bmaltais / kohya_ss

Apache License 2.0

[Help] Kohya - Terminal logs freeze during LoRA training #2096

Closed: Jinouga31 closed this issue 6 months ago

Jinouga31 commented 6 months ago

Hello everyone,

I am currently facing a really frustrating problem with LoRA SDXL training in Kohya, and I am surprised not to have found anyone else reporting the same issue.

During training, the logs randomly freeze without any error: the step counter stops advancing entirely, yet the GPU keeps running at full load while the training itself makes no progress. Here is a screenshot: [image]

The terminal logs shown in the image have been in this state for over an hour. Training is no longer progressing, yet the process is clearly still alive, judging by the resources the GPU is consuming.

I have even left the logs frozen overnight; the run never unblocked on its own, so the training never finished.
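To make a stall like this measurable rather than eyeballing the terminal, one minimal sketch is to watch whether the console log keeps growing. This assumes the trainer's output is redirected to a file (e.g. `... > train.log 2>&1`); the path and timeout below are placeholders, not values from this thread:

```python
# stall_watch.py - minimal sketch: flag a training run whose log stops growing.
# Assumes the kohya_ss console output is redirected to a file; the log path
# and timeout are placeholders, not values from this thread.
import os
import sys
import time

LOG_PATH = "train.log"   # placeholder: wherever the trainer's output is teed
STALL_SECONDS = 600      # treat the run as stalled after 10 min of silence
POLL_SECONDS = 30

def main() -> None:
    last_size = -1
    last_change = time.time()
    while True:
        try:
            size = os.path.getsize(LOG_PATH)
        except OSError:
            size = -1  # log file not created yet
        if size != last_size:
            last_size = size
            last_change = time.time()
        elif time.time() - last_change > STALL_SECONDS:
            print(f"No new log output for {STALL_SECONDS}s - run looks stalled.",
                  file=sys.stderr)
            last_change = time.time()  # avoid repeating the warning every poll
        time.sleep(POLL_SECONDS)

if __name__ == "__main__":
    main()
```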

Has anyone had this problem, and if so, what was the solution? Thanks. I have been stuck on this for two weeks, so I need to ask for help. My GPU is an RTX 4080 Super. Here is my Kohya .json config; I hit the same problem regardless of the config used:

```json
{
  "LoRA_type": "Standard",
  "LyCORIS_preset": "full",
  "adaptive_noise_scale": 0,
  "additional_parameters": "--network_train_unet_only",
  "block_alphas": "",
  "block_dims": "",
  "block_lr_zero_threshold": "",
  "bucket_no_upscale": true,
  "bucket_reso_steps": 64,
  "cache_latents": true,
  "cache_latents_to_disk": true,
  "caption_dropout_every_n_epochs": 0.0,
  "caption_dropout_rate": 0,
  "caption_extension": ".txt",
  "clip_skip": "1",
  "color_aug": false,
  "constrain": 0.0,
  "conv_alpha": 64,
  "conv_block_alphas": "",
  "conv_block_dims": "",
  "conv_dim": 64,
  "debiased_estimation_loss": false,
  "decompose_both": false,
  "dim_from_weights": false,
  "down_lr_weight": "",
  "enable_bucket": true,
  "epoch": 5,
  "factor": -1,
  "flip_aug": false,
  "fp8_base": false,
  "full_bf16": false,
  "full_fp16": false,
  "gpu_ids": "",
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": true,
  "keep_tokens": "0",
  "learning_rate": 1.0,
  "logging_dir": "",
  "lora_network_weights": "",
  "lr_scheduler": "constant_with_warmup",
  "lr_scheduler_args": "",
  "lr_scheduler_num_cycles": "",
  "lr_scheduler_power": "",
  "lr_warmup": 0,
  "max_bucket_reso": 2048,
  "max_data_loader_n_workers": "0",
  "max_grad_norm": 1,
  "max_resolution": "1024,1024",
  "max_timestep": 1000,
  "max_token_length": "75",
  "max_train_epochs": "",
  "max_train_steps": "",
  "mem_eff_attn": false,
  "mid_lr_weight": "",
  "min_bucket_reso": 512,
  "min_snr_gamma": 10,
  "min_timestep": 0,
  "mixed_precision": "bf16",
  "model_list": "custom",
  "module_dropout": 0.1,
  "multi_gpu": false,
  "multires_noise_discount": 0.2,
  "multires_noise_iterations": 8,
  "network_alpha": 32,
  "network_dim": 64,
  "network_dropout": 0,
  "noise_offset": 0.0357,
  "noise_offset_type": "Multires",
  "num_cpu_threads_per_process": 2,
  "num_machines": 1,
  "num_processes": 1,
  "optimizer": "Adafactor",
  "optimizer_args": "\"scale_parameter=False\", \"relative_step=False\", \"warmup_init=False\" ",
  "output_dir": "",
  "output_name": "SakuraXL3",
  "persistent_data_loader_workers": false,
  "pretrained_model_name_or_path": "C:/Users/Lah/Documents/Stable Diffusion/Forge/webui/models/Stable-diffusion/ponyDiffusionV6XL_v6StartWithThisOne.safetensors",
  "prior_loss_weight": 1.0,
  "random_crop": false,
  "rank_dropout": 0.1,
  "rank_dropout_scale": false,
  "reg_data_dir": "",
  "rescaled": false,
  "resume": "",
  "sample_every_n_epochs": 0,
  "sample_every_n_steps": 0,
  "sample_prompts": "",
  "sample_sampler": "euler_a",
  "save_every_n_epochs": 1,
  "save_every_n_steps": 0,
  "save_last_n_steps": 0,
  "save_last_n_steps_state": 0,
  "save_model_as": "safetensors",
  "save_precision": "fp16",
  "save_state": false,
  "scale_v_pred_loss_like_noise_pred": false,
  "scale_weight_norms": 0,
  "sdxl": true,
  "sdxl_cache_text_encoder_outputs": false,
  "sdxl_no_half_vae": true,
  "seed": "",
  "shuffle_caption": false,
  "stop_text_encoder_training": 0,
  "text_encoder_lr": 0.0,
  "train_batch_size": 2,
  "train_data_dir": "",
  "train_norm": false,
  "train_on_input": true,
  "training_comment": "",
  "unet_lr": 0.0,
  "unit": 1,
  "up_lr_weight": "",
  "use_cp": true,
  "use_scalar": false,
  "use_tucker": false,
  "use_wandb": false,
  "v2": false,
  "v_parameterization": false,
  "v_pred_like_loss": 0,
  "vae": "",
  "vae_batch_size": 0,
  "wandb_api_key": "",
  "weighted_captions": false,
  "xformers": "xformers"
}
```
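Several path fields above ("train_data_dir", "output_dir", "logging_dir") are empty, presumably blanked for posting. A quick pre-flight check of such a config file can rule out path problems before a long run; the sketch below is a suggestion, and the filename is a hypothetical placeholder:

```python
# check_config.py - sketch: sanity-check a kohya_ss GUI config before training.
# The filename is a placeholder; the keys checked mirror the config above.
import json
from pathlib import Path

cfg = json.loads(Path("SakuraXL3_config.json").read_text(encoding="utf-8"))

# Paths that must point at real files/directories for training to start.
for key in ("pretrained_model_name_or_path", "train_data_dir"):
    value = cfg.get(key, "")
    if not value or not Path(value).exists():
        print(f"warning: {key!r} is empty or does not exist: {value!r}")

# The posted config runs the data loader single-threaded
# ("max_data_loader_n_workers": "0"), which is worth noting in a bug report.
print("data loader workers:", cfg.get("max_data_loader_n_workers"))
```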
bmaltais commented 6 months ago

Hi, this is interesting, but it is something kohya needs to look into in his main sd-scripts repo. I only provide a GUI wrapper for his great scripts. I suggest you open a separate issue in his GitHub repo to see if he can find a solution; maybe others have already raised it there: https://github.com/kohya-ss/sd-scripts/tree/main
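When opening that upstream issue, a stack dump of the hung process makes the report much more actionable, since it shows exactly where the trainer is stuck. One sketch, using the third-party py-spy tool (not mentioned in this thread; install with `pip install py-spy`), is to dump the Python stacks of the still-running training process from a second terminal:

```python
# dump_stacks.py - sketch: capture where a hung trainer is stuck, using the
# third-party py-spy tool (pip install py-spy); not part of this thread.
import subprocess
import sys

def dump(pid: int) -> None:
    # "py-spy dump" prints the current Python stack of every thread in the
    # target process without stopping it - useful evidence for a bug report.
    subprocess.run(["py-spy", "dump", "--pid", str(pid)], check=True)

if __name__ == "__main__":
    dump(int(sys.argv[1]))  # usage: python dump_stacks.py <training PID>
```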