kohya-ss / sd-scripts


OOM when training a Flux LoRA on 8 GB VRAM (4060 Mobile) #1526

Closed: oleg996 closed this issue 1 week ago

oleg996 commented 2 weeks ago

I'm getting an OOM when trying to train a Flux LoRA on a 4060 Mobile. My config:

```json
{
  "LoRA_type": "Flux1",
  "LyCORIS_preset": "full",
  "adaptive_noise_scale": 0,
  "additional_parameters": "",
  "ae": "/home/oleg/AI/ComfyUI/models/vae/ae.sft",
  "apply_t5_attn_mask": false,
  "async_upload": false,
  "block_alphas": "",
  "block_dims": "",
  "block_lr_zero_threshold": "",
  "bucket_no_upscale": true,
  "bucket_reso_steps": 64,
  "bypass_mode": false,
  "cache_latents": true,
  "cache_latents_to_disk": true,
  "caption_dropout_every_n_epochs": 0,
  "caption_dropout_rate": 0,
  "caption_extension": ".txt",
  "clip_l": "/home/oleg/AI/ComfyUI/models/clip/clip_l.safetensors",
  "clip_skip": 1,
  "color_aug": false,
  "constrain": 0,
  "conv_alpha": 1,
  "conv_block_alphas": "",
  "conv_block_dims": "",
  "conv_dim": 1,
  "dataset_config": "",
  "debiased_estimation_loss": false,
  "decompose_both": false,
  "dim_from_weights": false,
  "discrete_flow_shift": 3,
  "dora_wd": false,
  "down_lr_weight": "",
  "dynamo_backend": "no",
  "dynamo_mode": "reduce-overhead",
  "dynamo_use_dynamic": false,
  "dynamo_use_fullgraph": false,
  "enable_bucket": true,
  "epoch": 1,
  "extra_accelerate_launch_args": "",
  "factor": -1,
  "flip_aug": false,
  "flux1_cache_text_encoder_outputs": true,
  "flux1_cache_text_encoder_outputs_to_disk": true,
  "flux1_checkbox": true,
  "fp8_base": true,
  "fp8_base_unet": false,
  "full_bf16": false,
  "full_fp16": false,
  "gpu_ids": "",
  "gradient_accumulation_steps": 1,
  "gradient_checkpointing": true,
  "guidance_scale": 1,
  "highvram": false,
  "huber_c": 0.1,
  "huber_schedule": "snr",
  "huggingface_path_in_repo": "",
  "huggingface_repo_id": "",
  "huggingface_repo_type": "",
  "huggingface_repo_visibility": "",
  "huggingface_token": "",
  "ip_noise_gamma": 0,
  "ip_noise_gamma_random_strength": false,
  "keep_tokens": 0,
  "learning_rate": 0.0003,
  "log_config": false,
  "log_tracker_config": "",
  "log_tracker_name": "",
  "log_with": "",
  "logging_dir": "./test/logs-saruman",
  "loraplus_lr_ratio": 0,
  "loraplus_text_encoder_lr_ratio": 0,
  "loraplus_unet_lr_ratio": 0,
  "loss_type": "l2",
  "lowvram": false,
  "lr_scheduler": "constant",
  "lr_scheduler_args": "",
  "lr_scheduler_num_cycles": 1,
  "lr_scheduler_power": 1,
  "lr_scheduler_type": "",
  "lr_warmup": 0,
  "main_process_port": 0,
  "masked_loss": false,
  "max_bucket_reso": 2048,
  "max_data_loader_n_workers": 0,
  "max_grad_norm": 1,
  "max_resolution": "512,512",
  "max_timestep": 1000,
  "max_token_length": 75,
  "max_train_epochs": 0,
  "max_train_steps": 1000,
  "mem_eff_attn": false,
  "mem_eff_save": false,
  "metadata_author": "",
  "metadata_description": "",
  "metadata_license": "",
  "metadata_tags": "",
  "metadata_title": "",
  "mid_lr_weight": "",
  "min_bucket_reso": 256,
  "min_snr_gamma": 7,
  "min_timestep": 0,
  "mixed_precision": "bf16",
  "model_list": "custom",
  "model_prediction_type": "raw",
  "module_dropout": 0,
  "multi_gpu": false,
  "multires_noise_discount": 0.3,
  "multires_noise_iterations": 0,
  "network_alpha": 16,
  "network_dim": 9,
  "network_dropout": 0,
  "network_weights": "",
  "noise_offset": 0.05,
  "noise_offset_random_strength": false,
  "noise_offset_type": "Original",
  "num_cpu_threads_per_process": 10,
  "num_machines": 1,
  "num_processes": 1,
  "optimizer": "Adafactor",
  "optimizer_args": "\"relative_step=False\" \"scale_parameter=False\" \"warmup_init=False\"",
  "output_dir": "put the full path to output folder here",
  "output_name": "Flux.my-oleglora",
  "persistent_data_loader_workers": false,
  "pretrained_model_name_or_path": "/home/oleg/AI/ComfyUI/models/diffusion_models/flux1-dev-fp8.safetensors",
  "prior_loss_weight": 1,
  "random_crop": false,
  "rank_dropout": 0,
  "rank_dropout_scale": false,
  "reg_data_dir": "",
  "rescaled": false,
  "resume": "",
  "resume_from_huggingface": "",
  "sample_every_n_epochs": 0,
  "sample_every_n_steps": 0,
  "sample_prompts": "saruman posing under a stormy lightning sky, photorealistic --w 832 --h 1216 --s 20 --l 4 --d 42",
  "sample_sampler": "euler",
  "save_every_n_epochs": 1,
  "save_every_n_steps": 50,
  "save_last_n_steps": 0,
  "save_last_n_steps_state": 0,
  "save_model_as": "safetensors",
  "save_precision": "bf16",
  "save_state": false,
  "save_state_on_train_end": false,
  "save_state_to_huggingface": false,
  "scale_v_pred_loss_like_noise_pred": false,
  "scale_weight_norms": 0,
  "sdxl": false,
  "sdxl_cache_text_encoder_outputs": true,
  "sdxl_no_half_vae": true,
  "seed": 42,
  "shuffle_caption": false,
  "split_mode": true,
  "split_qkv": false,
  "stop_text_encoder_training_pct": 0,
  "t5xxl": "/home/oleg/AI/ComfyUI/models/clip/t5xxl_fp8_e4m3fn tiny.safetensors",
  "t5xxl_max_token_length": 10,
  "text_encoder_lr": 0,
  "timestep_sampling": "sigmoid",
  "train_batch_size": 3,
  "train_blocks": "single",
  "train_data_dir": "/home/oleg/AI/LORA/kohya_ss/dataset/images",
  "train_norm": false,
  "train_on_input": true,
  "training_comment": "",
  "unet_lr": 0.0003,
  "unit": 1,
  "up_lr_weight": "",
  "use_cp": false,
  "use_scalar": false,
  "use_tucker": false,
  "v2": false,
  "v_parameterization": false,
  "v_pred_like_loss": 0,
  "vae": "",
  "vae_batch_size": 0,
  "wandb_api_key": "",
  "wandb_run_name": "",
  "weighted_captions": false,
  "xformers": "sdpa"
}
```

The OOM happens on "Load text encoders into gpu". The T5XXL fp8 file is 4.6 GB and should fit inside the VRAM.

oleg996 commented 2 weeks ago

The training command:

```
/home/oleg/AI/LORA/kohya_ss/venv/bin/accelerate launch --dynamo_backend no --dynamo_mode reduce-overhead --mixed_precision bf16 --num_processes 1 --num_machines 1 --num_cpu_threads_per_process 10 /home/oleg/AI/LORA/kohya_ss/sd-scripts/flux_train.py --config_file put the full path to output folder here/config_dreambooth-20240829-111431.toml
```

kohya-ss commented 2 weeks ago

Unfortunately T5XXL fp8 is not supported. Only fp16 is supported for T5XXL.
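If you only have the fp8 file, one possible stopgap is to upcast it to fp16 once and point `t5xxl` at the converted file. A minimal sketch (untested; this is just the standard safetensors API, not a feature of this repo, and the file names are placeholders):

```python
# Sketch: upcast an fp8 T5XXL checkpoint to fp16 and re-save it, so the
# fp16-only loading path can read it. Requires a torch build with fp8 dtypes.
import torch
from safetensors.torch import load_file, save_file

src = "t5xxl_fp8_e4m3fn.safetensors"  # placeholder input path
dst = "t5xxl_fp16.safetensors"        # placeholder output path

state = load_file(src)
state = {
    k: v.to(torch.float16) if v.is_floating_point() else v
    for k, v in state.items()
}
save_file(state, dst)
```

Note the fp16 copy is roughly twice the size, so on 8 GB VRAM it would still depend on caching the text encoder outputs rather than keeping T5 resident.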

oleg996 commented 2 weeks ago

Sad. No plans to add it?

oleg996 commented 2 weeks ago

Found a "hacky" way to run training: offload the T5 back to the CPU.

mayjay10 commented 2 weeks ago

> Found a "hacky" way to run training: offload the T5 back to the CPU.

Similar issue here. How did you 'hack' it to offload the T5 to the CPU, please?

oleg996 commented 2 weeks ago

In flux_train_network.py, comment out lines 196 and 197 like this:

```python
        # When TE is not be trained, it will not be prepared so we need to use explicit autocast
        logger.info("move text encoders to gpu")
        # text_encoders[0].to(accelerator.device, dtype=weight_dtype)
        # text_encoders[1].to(accelerator.device, dtype=weight_dtype)
        with accelerator.autocast():
```
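With those two lines commented out, the text encoders simply stay on the CPU and `accelerator.autocast()` handles the dtype during encoding; since `flux1_cache_text_encoder_outputs` is on, T5 only has to run once per caption, so the slow CPU pass is a one-time cost. An equivalent pattern (a hypothetical sketch, not the actual patch; `text_encoders`, `accelerator`, and `weight_dtype` mirror the names in flux_train_network.py, and `encode_fn` stands in for the caching call) would move the encoders over just for the caching step and then send them back:

```python
# Hypothetical sketch: borrow VRAM only for the one-time caching pass,
# then return the text encoders to the CPU so the transformer has room.
import torch

def cache_then_offload(text_encoders, accelerator, weight_dtype, encode_fn):
    for te in text_encoders:
        te.to(accelerator.device, dtype=weight_dtype)  # move to GPU briefly
    with torch.no_grad():
        outputs = encode_fn()  # encode all captions once
    for te in text_encoders:
        te.to("cpu")  # give the VRAM back
    torch.cuda.empty_cache()
    return outputs
```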
oleg996 commented 1 week ago

FP8 T5 support was added. Now you can train a LoRA on 8 GB VRAM.