bmaltais / kohya_ss

Apache License 2.0
9.7k stars 1.25k forks source link

[Bug] `subprocess.CalledProcessError` occurred while running `train_network.py` script #2887

Open baicai99 opened 1 month ago

baicai99 commented 1 month ago

Issue Content:

Description: The error usually occurs after LoRA training has been running for some time.

I encountered a `subprocess.CalledProcessError` when running the `train_network.py` script with the specified configuration file. The script failed and returned a non-zero exit status (Exit Status: 3221225477, i.e. 0xC0000005, the Windows access-violation code).

Below is the full error traceback:

Traceback (most recent call last):
  File "C:\Users\boringsoft\anaconda3\envs\kohyass\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\boringsoft\anaconda3\envs\kohyass\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:\Users\boringsoft\Desktop\kohya_ss\venv\Scripts\accelerate.EXE\__main__.py", line 7, in <module>
  File "C:\Users\boringsoft\Desktop\kohya_ss\venv\lib\site-packages\accelerate\commands\accelerate_cli.py", line 47, in main
    args.func(args)
  File "C:\Users\boringsoft\Desktop\kohya_ss\venv\lib\site-packages\accelerate\commands\launch.py", line 1017, in launch_command
    simple_launcher(args)
  File "C:\Users\boringsoft\Desktop\kohya_ss\venv\lib\site-packages\accelerate\commands\launch.py", line 637, in simple_launcher
    raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
subprocess.CalledProcessError: Command '['C:\\Users\\boringsoft\\Desktop\\kohya_ss\\venv\\Scripts\\python.exe', 'C:/Users/boringsoft/Desktop/kohya_ss/sd-scripts/train_network.py', '--config_file', 'C:\\Users\\boringsoft\\Desktop\\zhaolusi_train\\models/config_lora-20241007-171434.toml']' returned non-zero exit status 3221225477.
bucket_reso_steps = 64
cache_latents = true
caption_extension = ".txt"
clip_skip = 1
dynamo_backend = "no"
epoch = 1
gradient_accumulation_steps = 1
huber_c = 0.1
huber_schedule = "snr"
learning_rate = 1.0
logging_dir = "C:\\Users\\boringsoft\\Desktop\\zhaolusi_train\\logs"
loss_type = "l2"
lr_scheduler = "cosine_with_restarts"
lr_scheduler_args = []
lr_scheduler_num_cycles = 2
lr_scheduler_power = 1
lr_warmup_steps = 198
max_bucket_reso = 2048
max_data_loader_n_workers = 0
max_grad_norm = 1
max_timestep = 1000
max_token_length = 75
max_train_epochs = 10
max_train_steps = 1984
min_bucket_reso = 256
mixed_precision = "fp16"
multires_noise_discount = 0.3
network_alpha = 64
network_args = []
network_dim = 128
network_module = "networks.lora"
noise_offset_type = "Original"
optimizer_args = []
optimizer_type = "AdamW8bit"
output_dir = "C:\\Users\\boringsoft\\Desktop\\zhaolusi_train\\models"
output_name = "zhaolusi"
pretrained_model_name_or_path = "C:/Users/boringsoft/Desktop/kohya_ss/models/真人模特-majicMIX realistic 麦橘写实_v7.safetensors"
prior_loss_weight = 1
resolution = "768,768"
sample_every_n_steps = 200
sample_prompts = "C:\\Users\\boringsoft\\Desktop\\zhaolusi_train\\models\\prompt.txt"
sample_sampler = "euler_a"
save_every_n_epochs = 1
save_model_as = "safetensors"
save_precision = "bf16"
text_encoder_lr = 4e-5
train_batch_size = 1
train_data_dir = "C:\\Users\\boringsoft\\Desktop\\zhaolusi_train\\images"
unet_lr = 0.0001
xformers = true
{
  "LoRA_type": "Standard",
  "LyCORIS_preset": "full",
  "adaptive_noise_scale": 0,
  "additional_parameters": "",
  "async_upload": false,
  "block_alphas": "",
  "block_dims": "",
  "block_lr_zero_threshold": "",
  "bucket_no_upscale": false,
  "bucket_reso_steps": 64,
  "bypass_mode": false,
  "cache_latents": true,
  "cache_latents_to_disk": false,
  "caption_dropout_every_n_epochs": 0,
  "caption_dropout_rate": 0,
  "caption_extension": ".txt",
  "clip_skip": 1,
  "color_aug": false,
  "constrain": 0,
  "conv_alpha": 1,
  "conv_block_alphas": "",
  "conv_block_dims": "",
  "conv_dim": 1,
  "dataset_config": "",
  "debiased_estimation_loss": false,
  "decompose_both": false,
  "dim_from_weights": false,
  "dora_wd": false,
  "down_lr_weight": "",
  "dynamo_backend": "no",
  "dynamo_mode": "default",
  "dynamo_use_dynamic": false,
  "dynamo_use_fullgraph": false,
  "enable_bucket": false,
  "epoch": 1,
  "extra_accelerate_launch_args": "",
  "factor": -1,
  "flip_aug": false,
  "fp8_base": false,
  "full_bf16": false,
  "full_fp16": false,
  "gpu_ids": "",
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": false,
  "huber_c": 0.1,
  "huber_schedule": "snr",
  "huggingface_path_in_repo": "",
  "huggingface_repo_id": "",
  "huggingface_repo_type": "",
  "huggingface_repo_visibility": "",
  "huggingface_token": "",
  "ip_noise_gamma": 0,
  "ip_noise_gamma_random_strength": false,
  "keep_tokens": 0,
  "learning_rate": 1,
  "log_tracker_config": "",
  "log_tracker_name": "",
  "log_with": "",
  "logging_dir": "C:\\Users\\boringsoft\\Desktop\\zhaolusi_train\\logs",
  "loss_type": "l2",
  "lr_scheduler": "cosine_with_restarts",
  "lr_scheduler_args": "",
  "lr_scheduler_num_cycles": 2,
  "lr_scheduler_power": 1,
  "lr_warmup": 10,
  "main_process_port": 0,
  "masked_loss": false,
  "max_bucket_reso": 2048,
  "max_data_loader_n_workers": 0,
  "max_grad_norm": 1,
  "max_resolution": "768,768",
  "max_timestep": 1000,
  "max_token_length": 75,
  "max_train_epochs": 10,
  "max_train_steps": 0,
  "mem_eff_attn": false,
  "metadata_author": "",
  "metadata_description": "",
  "metadata_license": "",
  "metadata_tags": "",
  "metadata_title": "",
  "mid_lr_weight": "",
  "min_bucket_reso": 256,
  "min_snr_gamma": 0,
  "min_timestep": 0,
  "mixed_precision": "fp16",
  "model_list": "custom",
  "module_dropout": 0,
  "multi_gpu": false,
  "multires_noise_discount": 0.3,
  "multires_noise_iterations": 0,
  "network_alpha": 64,
  "network_dim": 128,
  "network_dropout": 0,
  "network_weights": "",
  "noise_offset": 0,
  "noise_offset_random_strength": false,
  "noise_offset_type": "Original",
  "num_cpu_threads_per_process": 2,
  "num_machines": 1,
  "num_processes": 1,
  "optimizer": "AdamW8bit",
  "optimizer_args": "",
  "output_dir": "C:\\Users\\boringsoft\\Desktop\\zhaolusi_train\\models",
  "output_name": "zhaolusi",
  "persistent_data_loader_workers": false,
  "pretrained_model_name_or_path": "C:/Users/boringsoft/Desktop/kohya_ss/models/\u771f\u4eba\u6a21\u7279-majicMIX realistic \u9ea6\u6a58\u5199\u5b9e_v7.safetensors",
  "prior_loss_weight": 1,
  "random_crop": false,
  "rank_dropout": 0,
  "rank_dropout_scale": false,
  "reg_data_dir": "",
  "rescaled": false,
  "resume": "",
  "resume_from_huggingface": "",
  "sample_every_n_epochs": 0,
  "sample_every_n_steps": 200,
  "sample_prompts": "zhaolusi, 1girl, portrait,  --w 512, --h 512, --d 6666666, --l 7, --s 20 ",
  "sample_sampler": "euler_a",
  "save_every_n_epochs": 1,
  "save_every_n_steps": 0,
  "save_last_n_steps": 0,
  "save_last_n_steps_state": 0,
  "save_model_as": "safetensors",
  "save_precision": "bf16",
  "save_state": false,
  "save_state_on_train_end": false,
  "save_state_to_huggingface": false,
  "scale_v_pred_loss_like_noise_pred": false,
  "scale_weight_norms": 0,
  "sdxl": false,
  "sdxl_cache_text_encoder_outputs": false,
  "sdxl_no_half_vae": false,
  "seed": 0,
  "shuffle_caption": false,
  "stop_text_encoder_training_pct": 0,
  "text_encoder_lr": 4e-05,
  "train_batch_size": 1,
  "train_data_dir": "C:\\Users\\boringsoft\\Desktop\\zhaolusi_train\\images",
  "train_norm": false,
  "train_on_input": true,
  "training_comment": "",
  "unet_lr": 0.0001,
  "unit": 1,
  "up_lr_weight": "",
  "use_cp": false,
  "use_scalar": false,
  "use_tucker": false,
  "v2": false,
  "v_parameterization": false,
  "v_pred_like_loss": 0,
  "vae": "",
  "vae_batch_size": 0,
  "wandb_api_key": "",
  "wandb_run_name": "",
  "weighted_captions": false,
  "xformers": "xformers"
}
baicai99 commented 1 month ago

This is suspected to be a bug caused by overheating of 13th- and 14th-generation Intel CPUs. Please try updating to the latest BIOS.