Open sashasubbbb opened 11 months ago
I am unable to train with --gradient_checkpointing enabled. I get: RuntimeError: element 0 of variables does not require grad and does not have a grad_fn. Any idea how to fix that?
RuntimeError: element 0 of variables does not require grad and does not have a grad_fn
Full training command:
accelerate launch --num_cpu_threads_per_process=2 "./train_network.py" --enable_bucket --min_bucket_reso=256 --max_bucket_reso=1024 --pretrained_model_name_or_path="B:/AIimages/stable-diffusion-webui/models/Stable-diffusion/model.ckpt" --train_data_dir="B:\AIimages\training\data" --resolution="576,576" --output_dir="B:\AIimages\stable-diffusion-webui\models\Lora\lora\out" --logging_dir="B:\AIimages\stable-diffusion-webui\models\Lora\lora\out" --network_alpha="128" --save_model_as=safetensors --network_module=networks.lora --unet_lr=0.0001 --network_train_unet_only --network_dim=128 --output_name="test_v1" --lr_scheduler_num_cycles="48" --no_half_vae --learning_rate="1.0" --lr_scheduler="cosine" --lr_warmup_steps="34" --train_batch_size="8" --max_train_steps="684" --save_every_n_epochs="1" --mixed_precision="fp16" --save_precision="fp16" --seed="420" --caption_extension=".txt" --cache_latents --cache_latents_to_disk --optimizer_type="AdamW8bit" --optimizer_args betas=0.9,0.999 weight_decay=0.01 --max_data_loader_n_workers="0" --max_token_length=225 --clip_skip=2 --bucket_reso_steps=1 --min_snr_gamma=10 --flip_aug --shuffle_caption --gradient_checkpointing --xformers --bucket_no_upscale --noise_offset=0.0 --sample_sampler=euler_a --sample_prompts="B:\AIimages\stable-diffusion-webui\models\Lora\lora\out\sample\prompt.txt" --sample_every_n_epochs="1"
Full error:
epoch 1/7 B:\AIimages\sd-scripts\kohya_ss\venv\lib\site-packages\torch\utils\checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None warnings.warn("None of the inputs have requires_grad=True. Gradients will be None") ╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ B:\AIimages\sd-scripts\kohya_ss\train_network.py:990 in <module> │ │ │ │ 987 │ args = train_util.read_config_from_file(args, parser) │ │ 988 │ │ │ 989 │ trainer = NetworkTrainer() │ │ ❱ 990 │ trainer.train(args) │ │ 991 │ │ │ │ B:\AIimages\sd-scripts\kohya_ss\train_network.py:798 in train │ │ │ │ 795 │ │ │ │ │ │ │ 796 │ │ │ │ │ loss = loss.mean() # 平均なのでbatch_sizeで割る必要なし │ │ 797 │ │ │ │ │ │ │ ❱ 798 │ │ │ │ │ accelerator.backward(loss) │ │ 799 │ │ │ │ │ if accelerator.sync_gradients and args.max_grad_norm != 0.0: │ │ 800 │ │ │ │ │ │ params_to_clip = network.get_trainable_params() │ │ 801 │ │ │ │ │ │ accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) │ │ │ │ B:\AIimages\sd-scripts\kohya_ss\venv\lib\site-packages\accelerate\accelerator.py:1743 in │ │ backward │ │ │ │ 1740 │ │ elif self.distributed_type == DistributedType.MEGATRON_LM: │ │ 1741 │ │ │ return │ │ 1742 │ │ elif self.scaler is not None: │ │ ❱ 1743 │ │ │ self.scaler.scale(loss).backward(**kwargs) │ │ 1744 │ │ else: │ │ 1745 │ │ │ loss.backward(**kwargs) │ │ 1746 │ │ │ │ B:\AIimages\sd-scripts\kohya_ss\venv\lib\site-packages\torch\_tensor.py:487 in backward │ │ │ │ 484 │ │ │ │ create_graph=create_graph, │ │ 485 │ │ │ │ inputs=inputs, │ │ 486 │ │ │ ) │ │ ❱ 487 │ │ torch.autograd.backward( │ │ 488 │ │ │ self, gradient, retain_graph, create_graph, inputs=inputs │ │ 489 │ │ ) │ │ 490 │ │ │ │ B:\AIimages\sd-scripts\kohya_ss\venv\lib\site-packages\torch\autograd\__init__.py:200 in │ │ backward │ │ │ │ 197 │ # The reason we repeat same the comment below is that │ │ 198 │ # some Python versions print out the first line of a multi-line function │ │ 
199 │ # calls in the traceback and some print out the last line │ │ ❱ 200 │ Variable._execution_engine.run_backward( # Calls into the C++ engine to run the bac │ │ 201 │ │ tensors, grad_tensors_, retain_graph, create_graph, inputs, │ │ 202 │ │ allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to ru │ │ 203 │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn steps: 0%| | 0/684 [00:01<?, ?it/s] ╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ C:\Users\Aleks\AppData\Local\Programs\Python\Python310\lib\runpy.py:196 in │ │ _run_module_as_main │ │ │ │ 193 │ main_globals = sys.modules["__main__"].__dict__ │ │ 194 │ if alter_argv: │ │ 195 │ │ sys.argv[0] = mod_spec.origin │ │ ❱ 196 │ return _run_code(code, main_globals, None, │ │ 197 │ │ │ │ │ "__main__", mod_spec) │ │ 198 │ │ 199 def run_module(mod_name, init_globals=None, │ │ │ │ C:\Users\Aleks\AppData\Local\Programs\Python\Python310\lib\runpy.py:86 in _run_code │ │ │ │ 83 │ │ │ │ │ __loader__ = loader, │ │ 84 │ │ │ │ │ __package__ = pkg_name, │ │ 85 │ │ │ │ │ __spec__ = mod_spec) │ │ ❱ 86 │ exec(code, run_globals) │ │ 87 │ return run_globals │ │ 88 │ │ 89 def _run_module_code(code, init_globals=None, │ │ │ │ in <module>:7 │ │ │ │ 4 from accelerate.commands.accelerate_cli import main │ │ 5 if __name__ == '__main__': │ │ 6 │ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) │ │ ❱ 7 │ sys.exit(main()) │ │ 8 │ │ │ │ B:\AIimages\sd-scripts\kohya_ss\venv\lib\site-packages\accelerate\commands\accelerate_cli.py:45 │ │ in main │ │ │ │ 42 │ │ exit(1) │ │ 43 │ │ │ 44 │ # Run │ │ ❱ 45 │ args.func(args) │ │ 46 │ │ 47 │ │ 48 if __name__ == "__main__": │ │ │ │ B:\AIimages\sd-scripts\kohya_ss\venv\lib\site-packages\accelerate\commands\launch.py:918 in │ │ launch_command │ │ │ │ 915 │ elif defaults is not None 
and defaults.compute_environment == ComputeEnvironment.AMA │ │ 916 │ │ sagemaker_launcher(defaults, args) │ │ 917 │ else: │ │ ❱ 918 │ │ simple_launcher(args) │ │ 919 │ │ 920 │ │ 921 def main(): │ │ │ │ B:\AIimages\sd-scripts\kohya_ss\venv\lib\site-packages\accelerate\commands\launch.py:580 in │ │ simple_launcher │ │ │ │ 577 │ process.wait() │ │ 578 │ if process.returncode != 0: │ │ 579 │ │ if not args.quiet: │ │ ❱ 580 │ │ │ raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd) │ │ 581 │ │ else: │ │ 582 │ │ │ sys.exit(1) │ │ 583 │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
Using --network_train_unet_only together with --gradient_checkpointing may cause this error (with LoRA).
I am unable to train with --gradient_checkpointing enabled:
RuntimeError: element 0 of variables does not require grad and does not have a grad_fn
Any idea how to fix that? Full training command:
Full error: