Closed wzgrx closed 1 year ago
100%|█████████████████████████████████████| 105/105 [00:00<00:00, 107598.81it/s] caching latents... 0%| | 0/105 [00:01<?, ?it/s] ╭───────────────────── Traceback (most recent call last) ──────────────────────╮ │ /root/lora-scripts/./sd-scripts/sdxl_train_network.py:176 in │ │ │ │ 173 │ args = train_util.read_config_from_file(args, parser) │ │ 174 │ │ │ 175 │ trainer = SdxlNetworkTrainer() │ │ ❱ 176 │ trainer.train(args) │ │ 177 │ │ │ │ /root/lora-scripts/sd-scripts/train_network.py:252 in train │ │ │ │ 249 │ │ │ vae.requiresgrad(False) │ │ 250 │ │ │ vae.eval() │ │ 251 │ │ │ with torch.no_grad(): │ │ ❱ 252 │ │ │ │ train_dataset_group.cache_latents(vae, args.vaebatch │ │ 253 │ │ │ vae.to("cpu") │ │ 254 │ │ │ if torch.cuda.is_available(): │ │ 255 │ │ │ │ torch.cuda.empty_cache() │ │ │ │ /root/lora-scripts/sd-scripts/library/train_util.py:1839 in cache_latents │ │ │ │ 1836 │ def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=Fals │ │ 1837 │ │ for i, dataset in enumerate(self.datasets): │ │ 1838 │ │ │ print(f"[Dataset {i}]") │ │ ❱ 1839 │ │ │ dataset.cache_latents(vae, vae_batch_size, cache_to_disk, │ │ 1840 │ │ │ 1841 │ def cache_text_encoder_outputs( │ │ 1842 │ │ self, tokenizers, text_encoders, device, weightdtype, cache │ │ │ │ /root/lora-scripts/sd-scripts/library/train_util.py:876 in cache_latents │ │ │ │ 873 │ │ # iterate batches: batch doesn't have image, image will be lo │ │ 874 │ │ print("caching latents...") │ │ 875 │ │ for batch in tqdm(batches, smoothing=1, total=len(batches)): │ │ ❱ 876 │ │ │ cache_batch_latents(vae, cache_to_disk, batch, subset.fli │ │ 877 │ │ │ 878 │ # weight_dtypeを指定するとText Encoderそのもの、およひ出力がweigh │ │ 879 │ # SDXLでのみ有効だが、datasetのメソッドとする必要があるので、sdxl │ │ │ │ /root/lora-scripts/sd-scripts/library/train_util.py:2182 in │ │ cache_batch_latents │ │ │ │ 2179 │ for info, latent, flipped_latent in zip(image_infos, latents, fli │ │ 2180 │ │ # check NaN │ │ 2181 │ │ if torch.isnan(latents).any() or (flipped_latent is not None │ │ ❱ 2182 │ 
│ │ raise RuntimeError(f"NaN detected in latents: {info.absol │ │ 2183 │ │ │ │ 2184 │ │ if cache_to_disk: │ │ 2185 │ │ │ save_latents_to_disk(info.latents_npz, latent, info.laten │ ╰──────────────────────────────────────────────────────────────────────────────╯ RuntimeError: NaN detected in latents: train/10_tags/00001-0-002havc1ly1gugdqwjqk2j62c0340kjl02.png ╭───────────────────── Traceback (most recent call last) ──────────────────────╮ │ /root/miniconda3/lib/python3.10/runpy.py:196 in _run_module_as_main │ │ │ │ 193 │ main_globals = sys.modules["main"].dict │ │ 194 │ if alter_argv: │ │ 195 │ │ sys.argv[0] = mod_spec.origin │ │ ❱ 196 │ return _run_code(code, main_globals, None, │ │ 197 │ │ │ │ │ "main", mod_spec) │ │ 198 │ │ 199 def run_module(mod_name, init_globals=None, │ │ │ │ /root/miniconda3/lib/python3.10/runpy.py:86 in _run_code │ │ │ │ 83 │ │ │ │ │ loader = loader, │ │ 84 │ │ │ │ │ package = pkg_name, │ │ 85 │ │ │ │ │ spec = mod_spec) │ │ ❱ 86 │ exec(code, run_globals) │ │ 87 │ return run_globals │ │ 88 │ │ 89 def _run_module_code(code, init_globals=None, │ │ │ │ /root/miniconda3/lib/python3.10/site-packages/accelerate/commands/launch.py: │ │ 928 in │ │ │ │ 925 │ │ 926 │ │ 927 if name == "main": │ │ ❱ 928 │ main() │ │ 929 │ │ │ │ /root/miniconda3/lib/python3.10/site-packages/accelerate/commands/launch.py: │ │ 924 in main │ │ │ │ 921 def main(): │ │ 922 │ parser = launch_command_parser() │ │ 923 │ args = parser.parse_args() │ │ ❱ 924 │ launch_command(args) │ │ 925 │ │ 926 │ │ 927 if name == "main": │ │ │ │ /root/miniconda3/lib/python3.10/site-packages/accelerate/commands/launch.py: │ │ 918 in launch_command │ │ │ │ 915 │ elif defaults is not None and defaults.compute_environment == Comp │ │ 916 │ │ sagemaker_launcher(defaults, args) │ │ 917 │ else: │ │ ❱ 918 │ │ simple_launcher(args) │ │ 919 │ │ 920 │ │ 921 def main(): │ │ │ │ /root/miniconda3/lib/python3.10/site-packages/accelerate/commands/launch.py: │ │ 580 in simple_launcher │ │ │ │ 577 │ 
process.wait() │ │ 578 │ if process.returncode != 0: │ │ 579 │ │ if not args.quiet: │ │ ❱ 580 │ │ │ raise subprocess.CalledProcessError(returncode=process.ret │ │ 581 │ │ else: │ │ 582 │ │ │ sys.exit(1) │ │ 583 │ ╰──────────────────────────────────────────────────────────────────────────────╯ CalledProcessError: Command '['/root/miniconda3/bin/python', './sd-scripts/sdxl_train_network.py', '--config_file', '/root/lora-scripts/config/autosave/20230907-171833.toml']' returned non-zero exit status 1. 17:19:02-111442 ERROR Training failed / 训练失败
训练参数 model_train_type = "sdxl-lora" pretrained_model_name_or_path = "./sd-models/XL_v0.1.safetensors" v2 = false train_data_dir = "./train" prior_loss_weight = 1 resolution = "1024,1024" enable_bucket = false min_bucket_reso = 256 max_bucket_reso = 2_048 bucket_reso_steps = 32 output_name = "aki" output_dir = "./output" save_model_as = "safetensors" save_precision = "fp16" save_every_n_epochs = 1 max_train_epochs = 10 train_batch_size = 2 gradient_checkpointing = false network_train_unet_only = true network_train_text_encoder_only = false learning_rate = 0.0001 unet_lr = 0.0001 text_encoder_lr = 0.00001 lr_scheduler = "cosine_with_restarts" lr_warmup_steps = 0 lr_scheduler_num_cycles = 1 optimizer_type = "AdaFactor" min_snr_gamma = 5 network_module = "networks.lora" network_dim = 32 network_alpha = 16 network_dropout = 0 log_with = "tensorboard" logging_dir = "./logs" caption_extension = ".txt" shuffle_caption = true weighted_captions = false keep_tokens = 0 max_token_length = 255 multires_noise_iterations = 8 multires_noise_discount = 0.3 seed = 1_337 clip_skip = 2 mixed_precision = "fp16" full_fp16 = false full_bf16 = false xformers = true lowram = false cache_latents = false cache_latents_to_disk = true cache_text_encoder_outputs = false cache_text_encoder_outputs_to_disk = false persistent_data_loader_workers = false optimizer_args = [ "scale_parameter=False", "relative_step=False", "warmup_init=False" ]
我也遇到了一样的问题，请问你是怎么解决的呢？
找到解决办法了，请参考：https://github.com/Akegarasu/lora-scripts/issues/207#issuecomment-1722425967
100%|█████████████████████████████████████| 105/105 [00:00<00:00, 107598.81it/s] caching latents... 0%| | 0/105 [00:01<?, ?it/s] ╭───────────────────── Traceback (most recent call last) ──────────────────────╮ │ /root/lora-scripts/./sd-scripts/sdxl_train_network.py:176 in │
│ │
│ 173 │ args = train_util.read_config_from_file(args, parser) │
│ 174 │ │
│ 175 │ trainer = SdxlNetworkTrainer() │
│ ❱ 176 │ trainer.train(args) │
│ 177 │
│ │
│ /root/lora-scripts/sd-scripts/train_network.py:252 in train │
│ │
│ 249 │ │ │ vae.requiresgrad(False) │
│ 250 │ │ │ vae.eval() │
│ 251 │ │ │ with torch.no_grad(): │
│ ❱ 252 │ │ │ │ train_dataset_group.cache_latents(vae, args.vaebatch │
│ 253 │ │ │ vae.to("cpu") │
│ 254 │ │ │ if torch.cuda.is_available(): │
│ 255 │ │ │ │ torch.cuda.empty_cache() │
│ │
│ /root/lora-scripts/sd-scripts/library/train_util.py:1839 in cache_latents │
│ │
│ 1836 │ def cache_latents(self, vae, vae_batch_size=1, cache_to_disk=Fals │
│ 1837 │ │ for i, dataset in enumerate(self.datasets): │
│ 1838 │ │ │ print(f"[Dataset {i}]") │
│ ❱ 1839 │ │ │ dataset.cache_latents(vae, vae_batch_size, cache_to_disk, │
│ 1840 │ │
│ 1841 │ def cache_text_encoder_outputs( │
│ 1842 │ │ self, tokenizers, text_encoders, device, weightdtype, cache │
│ │
│ /root/lora-scripts/sd-scripts/library/train_util.py:876 in cache_latents │
│ │
│ 873 │ │ # iterate batches: batch doesn't have image, image will be lo │
│ 874 │ │ print("caching latents...") │
│ 875 │ │ for batch in tqdm(batches, smoothing=1, total=len(batches)): │
│ ❱ 876 │ │ │ cache_batch_latents(vae, cache_to_disk, batch, subset.fli │
│ 877 │ │
│ 878 │ # weight_dtypeを指定するとText Encoderそのもの、およひ出力がweigh │
│ 879 │ # SDXLでのみ有効だが、datasetのメソッドとする必要があるので、sdxl │
│ │
│ /root/lora-scripts/sd-scripts/library/train_util.py:2182 in │
│ cache_batch_latents │
│ │
│ 2179 │ for info, latent, flipped_latent in zip(image_infos, latents, fli │
│ 2180 │ │ # check NaN │
│ 2181 │ │ if torch.isnan(latents).any() or (flipped_latent is not None │
│ ❱ 2182 │ │ │ raise RuntimeError(f"NaN detected in latents: {info.absol │
│ 2183 │ │ │
│ 2184 │ │ if cache_to_disk: │
│ 2185 │ │ │ save_latents_to_disk(info.latents_npz, latent, info.laten │
╰──────────────────────────────────────────────────────────────────────────────╯
RuntimeError: NaN detected in latents:
train/10_tags/00001-0-002havc1ly1gugdqwjqk2j62c0340kjl02.png
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /root/miniconda3/lib/python3.10/runpy.py:196 in _run_module_as_main │
│ │
│ 193 │ main_globals = sys.modules["main"].dict │
│ 194 │ if alter_argv: │
│ 195 │ │ sys.argv[0] = mod_spec.origin │
│ ❱ 196 │ return _run_code(code, main_globals, None, │
│ 197 │ │ │ │ │ "main", mod_spec) │
│ 198 │
│ 199 def run_module(mod_name, init_globals=None, │
│ │
│ /root/miniconda3/lib/python3.10/runpy.py:86 in _run_code │
│ │
│ 83 │ │ │ │ │ loader = loader, │
│ 84 │ │ │ │ │ package = pkg_name, │
│ 85 │ │ │ │ │ spec = mod_spec) │
│ ❱ 86 │ exec(code, run_globals) │
│ 87 │ return run_globals │
│ 88 │
│ 89 def _run_module_code(code, init_globals=None, │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/accelerate/commands/launch.py: │
│ 928 in │
│ │
│ 925 │
│ 926 │
│ 927 if name == "main": │
│ ❱ 928 │ main() │
│ 929 │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/accelerate/commands/launch.py: │
│ 924 in main │
│ │
│ 921 def main(): │
│ 922 │ parser = launch_command_parser() │
│ 923 │ args = parser.parse_args() │
│ ❱ 924 │ launch_command(args) │
│ 925 │
│ 926 │
│ 927 if name == "main": │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/accelerate/commands/launch.py: │
│ 918 in launch_command │
│ │
│ 915 │ elif defaults is not None and defaults.compute_environment == Comp │
│ 916 │ │ sagemaker_launcher(defaults, args) │
│ 917 │ else: │
│ ❱ 918 │ │ simple_launcher(args) │
│ 919 │
│ 920 │
│ 921 def main(): │
│ │
│ /root/miniconda3/lib/python3.10/site-packages/accelerate/commands/launch.py: │
│ 580 in simple_launcher │
│ │
│ 577 │ process.wait() │
│ 578 │ if process.returncode != 0: │
│ 579 │ │ if not args.quiet: │
│ ❱ 580 │ │ │ raise subprocess.CalledProcessError(returncode=process.ret │
│ 581 │ │ else: │
│ 582 │ │ │ sys.exit(1) │
│ 583 │
╰──────────────────────────────────────────────────────────────────────────────╯
CalledProcessError: Command '['/root/miniconda3/bin/python',
'./sd-scripts/sdxl_train_network.py', '--config_file',
'/root/lora-scripts/config/autosave/20230907-171833.toml']' returned non-zero
exit status 1.
17:19:02-111442 ERROR Training failed / 训练失败
训练参数：
model_train_type = "sdxl-lora"
pretrained_model_name_or_path = "./sd-models/XL_v0.1.safetensors"
v2 = false
train_data_dir = "./train"
prior_loss_weight = 1
resolution = "1024,1024"
enable_bucket = false
min_bucket_reso = 256
max_bucket_reso = 2_048
bucket_reso_steps = 32
output_name = "aki"
output_dir = "./output"
save_model_as = "safetensors"
save_precision = "fp16"
save_every_n_epochs = 1
max_train_epochs = 10
train_batch_size = 2
gradient_checkpointing = false
network_train_unet_only = true
network_train_text_encoder_only = false
learning_rate = 0.0001
unet_lr = 0.0001
text_encoder_lr = 0.00001
lr_scheduler = "cosine_with_restarts"
lr_warmup_steps = 0
lr_scheduler_num_cycles = 1
optimizer_type = "AdaFactor"
min_snr_gamma = 5
network_module = "networks.lora"
network_dim = 32
network_alpha = 16
network_dropout = 0
log_with = "tensorboard"
logging_dir = "./logs"
caption_extension = ".txt"
shuffle_caption = true
weighted_captions = false
keep_tokens = 0
max_token_length = 255
multires_noise_iterations = 8
multires_noise_discount = 0.3
seed = 1_337
clip_skip = 2
mixed_precision = "fp16"
full_fp16 = false
full_bf16 = false
xformers = true
lowram = false
cache_latents = false
cache_latents_to_disk = true
cache_text_encoder_outputs = false
cache_text_encoder_outputs_to_disk = false
persistent_data_loader_workers = false
optimizer_args = [ "scale_parameter=False", "relative_step=False", "warmup_init=False" ]