bmaltais / kohya_ss

Apache License 2.0

[Help] Kohya - Terminal logs freeze during LoRA training #2096

Closed: Jinouga31 closed this issue 6 months ago

Jinouga31 commented 6 months ago

Hello everyone,

I am currently facing a really frustrating problem with LoRA SDXL training in Kohya, and I am surprised not to have found anyone else reporting the same issue.

During training, the logs randomly freeze without any error: the step counter stops advancing entirely, yet the GPU keeps running at full load while the training itself makes no progress. Here is a screenshot: [image]

The terminal logs shown in the image have been in this state for over an hour. Training is no longer progressing, yet the process is clearly still alive, judging by the resources the GPU is consuming.

I have even left the logs frozen overnight; the run never unblocked on its own, so the training never finished.
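To make a stall like this measurable rather than eyeballing the terminal, one minimal sketch is to watch whether the console log keeps growing. This assumes the trainer's output is redirected to a file (e.g. `... > train.log 2>&1`); the path and timeout below are placeholders, not values from this thread:

```python
# stall_watch.py - minimal sketch: flag a training run whose log stops growing.
# Assumes the kohya_ss console output is redirected to a file; the log path
# and timeout are placeholders, not values from this thread.
import os
import sys
import time

LOG_PATH = "train.log"   # placeholder: wherever the trainer's output is teed
STALL_SECONDS = 600      # treat the run as stalled after 10 min of silence
POLL_SECONDS = 30

def main() -> None:
    last_size = -1
    last_change = time.time()
    while True:
        try:
            size = os.path.getsize(LOG_PATH)
        except OSError:
            size = -1  # log file not created yet
        if size != last_size:
            last_size = size
            last_change = time.time()
        elif time.time() - last_change > STALL_SECONDS:
            print(f"No new log output for {STALL_SECONDS}s - run looks stalled.",
                  file=sys.stderr)
            last_change = time.time()  # avoid repeating the warning every poll
        time.sleep(POLL_SECONDS)

if __name__ == "__main__":
    main()
```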

Has anyone had this problem, and if so, what was the solution? Thanks. I have been stuck on this for two weeks, so I need to ask for help. My GPU is an RTX 4080 Super. Here is my Kohya .json config; I hit the same problem regardless of the config used:

```json
{
  "LoRA_type": "Standard",
  "LyCORIS_preset": "full",
  "adaptive_noise_scale": 0,
  "additional_parameters": "--network_train_unet_only",
  "block_alphas": "",
  "block_dims": "",
  "block_lr_zero_threshold": "",
  "bucket_no_upscale": true,
  "bucket_reso_steps": 64,
  "cache_latents": true,
  "cache_latents_to_disk": true,
  "caption_dropout_every_n_epochs": 0.0,
  "caption_dropout_rate": 0,
  "caption_extension": ".txt",
  "clip_skip": "1",
  "color_aug": false,
  "constrain": 0.0,
  "conv_alpha": 64,
  "conv_block_alphas": "",
  "conv_block_dims": "",
  "conv_dim": 64,
  "debiased_estimation_loss": false,
  "decompose_both": false,
  "dim_from_weights": false,
  "down_lr_weight": "",
  "enable_bucket": true,
  "epoch": 5,
  "factor": -1,
  "flip_aug": false,
  "fp8_base": false,
  "full_bf16": false,
  "full_fp16": false,
  "gpu_ids": "",
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": true,
  "keep_tokens": "0",
  "learning_rate": 1.0,
  "logging_dir": "",
  "lora_network_weights": "",
  "lr_scheduler": "constant_with_warmup",
  "lr_scheduler_args": "",
  "lr_scheduler_num_cycles": "",
  "lr_scheduler_power": "",
  "lr_warmup": 0,
  "max_bucket_reso": 2048,
  "max_data_loader_n_workers": "0",
  "max_grad_norm": 1,
  "max_resolution": "1024,1024",
  "max_timestep": 1000,
  "max_token_length": "75",
  "max_train_epochs": "",
  "max_train_steps": "",
  "mem_eff_attn": false,
  "mid_lr_weight": "",
  "min_bucket_reso": 512,
  "min_snr_gamma": 10,
  "min_timestep": 0,
  "mixed_precision": "bf16",
  "model_list": "custom",
  "module_dropout": 0.1,
  "multi_gpu": false,
  "multires_noise_discount": 0.2,
  "multires_noise_iterations": 8,
  "network_alpha": 32,
  "network_dim": 64,
  "network_dropout": 0,
  "noise_offset": 0.0357,
  "noise_offset_type": "Multires",
  "num_cpu_threads_per_process": 2,
  "num_machines": 1,
  "num_processes": 1,
  "optimizer": "Adafactor",
  "optimizer_args": "\"scale_parameter=False\", \"relative_step=False\", \"warmup_init=False\" ",
  "output_dir": "",
  "output_name": "SakuraXL3",
  "persistent_data_loader_workers": false,
  "pretrained_model_name_or_path": "C:/Users/Lah/Documents/Stable Diffusion/Forge/webui/models/Stable-diffusion/ponyDiffusionV6XL_v6StartWithThisOne.safetensors",
  "prior_loss_weight": 1.0,
  "random_crop": false,
  "rank_dropout": 0.1,
  "rank_dropout_scale": false,
  "reg_data_dir": "",
  "rescaled": false,
  "resume": "",
  "sample_every_n_epochs": 0,
  "sample_every_n_steps": 0,
  "sample_prompts": "",
  "sample_sampler": "euler_a",
  "save_every_n_epochs": 1,
  "save_every_n_steps": 0,
  "save_last_n_steps": 0,
  "save_last_n_steps_state": 0,
  "save_model_as": "safetensors",
  "save_precision": "fp16",
  "save_state": false,
  "scale_v_pred_loss_like_noise_pred": false,
  "scale_weight_norms": 0,
  "sdxl": true,
  "sdxl_cache_text_encoder_outputs": false,
  "sdxl_no_half_vae": true,
  "seed": "",
  "shuffle_caption": false,
  "stop_text_encoder_training": 0,
  "text_encoder_lr": 0.0,
  "train_batch_size": 2,
  "train_data_dir": "",
  "train_norm": false,
  "train_on_input": true,
  "training_comment": "",
  "unet_lr": 0.0,
  "unit": 1,
  "up_lr_weight": "",
  "use_cp": true,
  "use_scalar": false,
  "use_tucker": false,
  "use_wandb": false,
  "v2": false,
  "v_parameterization": false,
  "v_pred_like_loss": 0,
  "vae": "",
  "vae_batch_size": 0,
  "wandb_api_key": "",
  "weighted_captions": false,
  "xformers": "xformers"
}
```
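Several path fields above ("train_data_dir", "output_dir", "logging_dir") are empty, presumably blanked for posting. A quick pre-flight check of such a config file can rule out path problems before a long run; the sketch below is a suggestion, and the filename is a hypothetical placeholder:

```python
# check_config.py - sketch: sanity-check a kohya_ss GUI config before training.
# The filename is a placeholder; the keys checked mirror the config above.
import json
from pathlib import Path

cfg = json.loads(Path("SakuraXL3_config.json").read_text(encoding="utf-8"))

# Paths that must point at real files/directories for training to start.
for key in ("pretrained_model_name_or_path", "train_data_dir"):
    value = cfg.get(key, "")
    if not value or not Path(value).exists():
        print(f"warning: {key!r} is empty or does not exist: {value!r}")

# The posted config runs the data loader single-threaded
# ("max_data_loader_n_workers": "0"), which is worth noting in a bug report.
print("data loader workers:", cfg.get("max_data_loader_n_workers"))
```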
bmaltais commented 6 months ago

Hi, this is interesting, but it is something kohya needs to look into in his main sd-scripts repo. I only provide a GUI wrapper for his great scripts. I suggest you open a separate issue in his GitHub repo to see if he can find a solution; maybe others have already raised it there: https://github.com/kohya-ss/sd-scripts/tree/main
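When opening that upstream issue, a stack dump of the hung process makes the report much more actionable, since it shows exactly where the trainer is stuck. One sketch, using the third-party py-spy tool (not mentioned in this thread; install with `pip install py-spy`), is to dump the Python stacks of the still-running training process from a second terminal:

```python
# dump_stacks.py - sketch: capture where a hung trainer is stuck, using the
# third-party py-spy tool (pip install py-spy); not part of this thread.
import subprocess
import sys

def dump(pid: int) -> None:
    # "py-spy dump" prints the current Python stack of every thread in the
    # target process without stopping it - useful evidence for a bug report.
    subprocess.run(["py-spy", "dump", "--pid", str(pid)], check=True)

if __name__ == "__main__":
    dump(int(sys.argv[1]))  # usage: python dump_stacks.py <training PID>
```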