Akegarasu / lora-scripts

LoRA & Dreambooth training scripts & GUI using kohya-ss's trainer, for diffusion models.
GNU Affero General Public License v3.0

sdxl训练错误 (SDXL training error) #224

Closed · wzgrx closed this issue 1 year ago

wzgrx commented 1 year ago

```
100%|█████████████████████████████████████| 105/105 [00:00<00:00, 107598.81it/s]
caching latents...
  0%|          | 0/105 [00:01<?, ?it/s]
Traceback (most recent call last):
  /root/lora-scripts/./sd-scripts/sdxl_train_network.py:176 in <module>
    173   args = train_util.read_config_from_file(args, parser)
    174
    175   trainer = SdxlNetworkTrainer()
  ❱ 176   trainer.train(args)
    177

  /root/lora-scripts/sd-scripts/train_network.py:252 in train
    249       vae.requires_grad_(False)
    250       vae.eval()
    251       with torch.no_grad():
  ❱ 252           train_dataset_group.cache_latents(vae, args.vae_batch
    253       vae.to("cpu")
    254       if torch.cuda.is_available():
    255           torch.cuda.empty_cache()

  /root/lora-scripts/sd-scripts/library/train_util.py:1839 in cache_latents
    1836   def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=Fals
    1837       for i, dataset in enumerate(self.datasets):
    1838           print(f"[Dataset {i}]")
  ❱ 1839           dataset.cache_latents(vae, vae_batch_size, cache_to_disk,
    1840
    1841   def cache_text_encoder_outputs(
    1842       self, tokenizers, text_encoders, device, weight_dtype, cache

  /root/lora-scripts/sd-scripts/library/train_util.py:876 in cache_latents
    873       # iterate batches: batch doesn't have image, image will be lo
    874       print("caching latents...")
    875       for batch in tqdm(batches, smoothing=1, total=len(batches)):
  ❱ 876           cache_batch_latents(vae, cache_to_disk, batch, subset.fli
    877
    878   # weight_dtypeを指定するとText Encoderそのもの、およひ出力がweigh
    879   # SDXLでのみ有効だが、datasetのメソッドとする必要があるので、sdxl

  /root/lora-scripts/sd-scripts/library/train_util.py:2182 in cache_batch_latents
    2179   for info, latent, flipped_latent in zip(image_infos, latents, fli
    2180       # check NaN
    2181       if torch.isnan(latents).any() or (flipped_latent is not None
  ❱ 2182           raise RuntimeError(f"NaN detected in latents: {info.absol
    2183
    2184       if cache_to_disk:
    2185           save_latents_to_disk(info.latents_npz, latent, info.laten

RuntimeError: NaN detected in latents: train/10_tags/00001-0-002havc1ly1gugdqwjqk2j62c0340kjl02.png

Traceback (most recent call last):
  /root/miniconda3/lib/python3.10/runpy.py:196 in _run_module_as_main
    193   main_globals = sys.modules["__main__"].__dict__
    194   if alter_argv:
    195       sys.argv[0] = mod_spec.origin
  ❱ 196   return _run_code(code, main_globals, None,
    197                    "__main__", mod_spec)
    198
    199 def run_module(mod_name, init_globals=None,

  /root/miniconda3/lib/python3.10/runpy.py:86 in _run_code
    83               loader = loader,
    84               package = pkg_name,
    85               spec = mod_spec)
  ❱ 86   exec(code, run_globals)
    87   return run_globals
    88
    89 def _run_module_code(code, init_globals=None,

  /root/miniconda3/lib/python3.10/site-packages/accelerate/commands/launch.py:928 in <module>
    925
    926
    927 if __name__ == "__main__":
  ❱ 928     main()
    929

  /root/miniconda3/lib/python3.10/site-packages/accelerate/commands/launch.py:924 in main
    921 def main():
    922     parser = launch_command_parser()
    923     args = parser.parse_args()
  ❱ 924     launch_command(args)
    925
    926
    927 if __name__ == "__main__":

  /root/miniconda3/lib/python3.10/site-packages/accelerate/commands/launch.py:918 in launch_command
    915   elif defaults is not None and defaults.compute_environment == Comp
    916       sagemaker_launcher(defaults, args)
    917   else:
  ❱ 918       simple_launcher(args)
    919
    920
    921 def main():

  /root/miniconda3/lib/python3.10/site-packages/accelerate/commands/launch.py:580 in simple_launcher
    577   process.wait()
    578   if process.returncode != 0:
    579       if not args.quiet:
  ❱ 580           raise subprocess.CalledProcessError(returncode=process.ret
    581       else:
    582           sys.exit(1)
    583

CalledProcessError: Command '['/root/miniconda3/bin/python', './sd-scripts/sdxl_train_network.py', '--config_file', '/root/lora-scripts/config/autosave/20230907-171833.toml']' returned non-zero exit status 1.
17:19:02-111442 ERROR    Training failed / 训练失败
```
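For anyone debugging the same failure: the traceback shows that sd-scripts aborts latent caching as soon as `torch.isnan()` finds NaN in the VAE output for an image (`cache_batch_latents` in `library/train_util.py`). Below is a minimal diagnostic sketch, not part of lora-scripts, that assumes `diffusers`, `torch`, `numpy`, and `Pillow` are installed, that the base SDXL VAE from `stabilityai/stable-diffusion-xl-base-1.0` is representative of the checkpoint in use, and that the flagged image path is reachable. It encodes that image once in fp16 and once in fp32 to show whether the NaNs come from half precision or from the image itself.

```python
# Diagnostic sketch (illustrative only, not lora-scripts code): encode the image that
# the trainer flagged with an SDXL VAE in fp16 and fp32, then apply the same
# torch.isnan() check that cache_batch_latents uses.
import numpy as np
import torch
from diffusers import AutoencoderKL
from PIL import Image

IMAGE = "train/10_tags/00001-0-002havc1ly1gugdqwjqk2j62c0340kjl02.png"  # path from the error
VAE_REPO = "stabilityai/stable-diffusion-xl-base-1.0"                   # assumed base model

def encode_and_check(dtype):
    vae = AutoencoderKL.from_pretrained(VAE_REPO, subfolder="vae", torch_dtype=dtype).to("cuda")
    img = Image.open(IMAGE).convert("RGB").resize((1024, 1024))
    x = torch.from_numpy(np.array(img)).float() / 127.5 - 1.0           # scale pixels to [-1, 1]
    x = x.permute(2, 0, 1).unsqueeze(0).to("cuda", dtype)               # HWC -> 1x3xHxW
    with torch.no_grad():
        latents = vae.encode(x).latent_dist.sample()
    print(dtype, "NaN in latents:", torch.isnan(latents).any().item())

encode_and_check(torch.float16)  # frequently NaN with the original SDXL VAE in half precision
encode_and_check(torch.float32)  # should be clean if the image file itself is fine
```

If fp32 is clean while fp16 produces NaN, the image is fine and the problem is the half-precision VAE rather than the dataset.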

Training parameters:

```toml
model_train_type = "sdxl-lora"
pretrained_model_name_or_path = "./sd-models/XL_v0.1.safetensors"
v2 = false
train_data_dir = "./train"
prior_loss_weight = 1
resolution = "1024,1024"
enable_bucket = false
min_bucket_reso = 256
max_bucket_reso = 2_048
bucket_reso_steps = 32
output_name = "aki"
output_dir = "./output"
save_model_as = "safetensors"
save_precision = "fp16"
save_every_n_epochs = 1
max_train_epochs = 10
train_batch_size = 2
gradient_checkpointing = false
network_train_unet_only = true
network_train_text_encoder_only = false
learning_rate = 0.0001
unet_lr = 0.0001
text_encoder_lr = 0.00001
lr_scheduler = "cosine_with_restarts"
lr_warmup_steps = 0
lr_scheduler_num_cycles = 1
optimizer_type = "AdaFactor"
min_snr_gamma = 5
network_module = "networks.lora"
network_dim = 32
network_alpha = 16
network_dropout = 0
log_with = "tensorboard"
logging_dir = "./logs"
caption_extension = ".txt"
shuffle_caption = true
weighted_captions = false
keep_tokens = 0
max_token_length = 255
multires_noise_iterations = 8
multires_noise_discount = 0.3
seed = 1_337
clip_skip = 2
mixed_precision = "fp16"
full_fp16 = false
full_bf16 = false
xformers = true
lowram = false
cache_latents = false
cache_latents_to_disk = true
cache_text_encoder_outputs = false
cache_text_encoder_outputs_to_disk = false
persistent_data_loader_workers = false
optimizer_args = [ "scale_parameter=False", "relative_step=False", "warmup_init=False" ]
```
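Note on this config: `mixed_precision = "fp16"` together with latent caching means the VAE encodes the dataset in half precision, and the original SDXL VAE is known to overflow to NaN in fp16 on some inputs, which is exactly where this run fails. The rough sketch below is again an illustration only (same assumptions as the snippet above, reusing `train_data_dir = "./train"` and the 1024x1024 resolution from this config); it scans the whole training folder and lists every image whose fp16 latents contain NaN, which helps distinguish one bad file from a precision problem affecting many files.

```python
# Rough folder-wide scan (hypothetical helper, not part of lora-scripts): list every
# training image whose fp16 SDXL latents contain NaN.
from pathlib import Path
import numpy as np
import torch
from diffusers import AutoencoderKL
from PIL import Image

vae = AutoencoderKL.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="vae", torch_dtype=torch.float16
).to("cuda")

exts = {".png", ".jpg", ".jpeg", ".webp"}
bad = []
for path in sorted(p for p in Path("./train").rglob("*") if p.suffix.lower() in exts):
    img = Image.open(path).convert("RGB").resize((1024, 1024))
    x = torch.from_numpy(np.array(img)).float() / 127.5 - 1.0   # scale pixels to [-1, 1]
    x = x.permute(2, 0, 1).unsqueeze(0).to("cuda", torch.float16)
    with torch.no_grad():
        latents = vae.encode(x).latent_dist.sample()
    if torch.isnan(latents).any():
        bad.append(path)

print(f"{len(bad)} image(s) produced NaN latents in fp16:")
for p in bad:
    print(" ", p)
```

If many files show up here, the usual direction is to keep the VAE out of fp16 for caching (for example via an fp32 or fp16-fixed VAE), rather than to edit the images.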

yhygta commented 1 year ago

I ran into the same problem. How did you solve it?

yhygta commented 1 year ago

Found it: https://github.com/Akegarasu/lora-scripts/issues/207#issuecomment-1722425967