kohya-ss / sd-scripts

Flux CUDA memory issue on H100 #1517

Closed. defnotkenski closed this issue 3 weeks ago.

defnotkenski commented 3 weeks ago

Hey guys,

I'm getting CUDA out-of-memory errors while running the flux_train.py script on an H100 with 80GB of VRAM, using the Prodigy optimizer.

Any configuration recommendations to get this working? Thanks.

The error in question:

[Screenshot: CUDA out-of-memory traceback]

The executed command:

[Screenshot: the flux_train.py launch command]

My complete configuration:

{
  "adaptive_noise_scale": "",
  "ae": "",
  "alpha_mask": "",
  "apply_t5_attn_mask": "",
  "async_upload": "",
  "blockwise_fused_optimizers": "",
  "bucket_no_upscale": true,
  "bucket_reso_steps": "",
  "cache_info": "",
  "cache_latents": "",
  "cache_latents_to_disk": "",
  "cache_text_encoder_outputs": "",
  "cache_text_encoder_outputs_to_disk": "",
  "caption_dropout_every_n_epochs": "",
  "caption_dropout_rate": "",
  "caption_extension": "",
  "caption_prefix": "",
  "caption_separator": "",
  "caption_suffix": "",
  "caption_tag_dropout_rate": "",
  "clip_l": "",
  "clip_skip": 1,
  "color_aug": "",
  "conditioning_data_dir": "",
  "config_file": "",
  "console_log_file": "",
  "console_log_level": "",
  "console_log_simple": "",
  "cpu_offload_checkpointing": "",
  "dataset_class": "",
  "dataset_config": "",
  "dataset_repeats": "",
  "ddp_gradient_as_bucket_view": "",
  "ddp_static_graph": "",
  "ddp_timeout": "",
  "debiased_estimation_loss": "",
  "debug_dataset": "",
  "deepspeed": "",
  "disable_mmap_load_safetensors": "",
  "discrete_flow_shift": 3.0,
  "double_blocks_to_swap": "",
  "dynamo_backend": "",
  "enable_bucket": true,
  "enable_wildcard": "",
  "face_crop_aug_range": "",
  "flip_aug": "",
  "fp16_master_weights_and_gradients": "",
  "fp8_base": "",
  "full_bf16": true,
  "full_fp16": "",
  "fused_backward_pass": "",
  "fused_optimizer_groups": "",
  "gradient_accumulation_steps": "",
  "gradient_checkpointing": true,
  "guidance_scale": 1.0,
  "highvram": true,
  "huber_c": "",
  "huber_schedule": "",
  "huggingface_path_in_repo": "",
  "huggingface_repo_id": "",
  "huggingface_repo_type": "",
  "huggingface_repo_visibility": "",
  "huggingface_token": "",
  "in_json": "",
  "ip_noise_gamma": "",
  "ip_noise_gamma_random_strength": "",
  "keep_tokens": "",
  "keep_tokens_separator": "",
  "learning_rate": 1,
  "log_config": "",
  "log_prefix": "",
  "log_tracker_config": "",
  "log_tracker_name": "",
  "log_with": "",
  "logging_dir": "",
  "logit_mean": 0.0,
  "logit_std": 1.0,
  "loss_type": "l2",
  "lowram": "",
  "lr_scheduler": "cosine",
  "lr_scheduler_args": "",
  "lr_scheduler_num_cycles": "",
  "lr_scheduler_power": "",
  "lr_scheduler_type": "",
  "lr_warmup_steps": "",
  "masked_loss": "",
  "max_bucket_reso": 2048,
  "max_data_loader_n_workers": "",
  "max_grad_norm": "",
  "max_timestep": "",
  "max_token_length": "",
  "max_train_epochs": 130,
  "max_train_steps": 0,
  "mem_eff_attn": "",
  "mem_eff_save": "",
  "min_bucket_reso": "",
  "min_snr_gamma": 5,
  "min_timestep": "",
  "mixed_precision": "bf16",
  "mode_scale": "",
  "model_prediction_type": "sigma_scaled",
  "multires_noise_discount": 0.1,
  "multires_noise_iterations": 6,
  "noise_offset": "",
  "noise_offset_random_strength": "",
  "offload_optimizer_device": "",
  "offload_optimizer_nvme_path": "",
  "offload_param_device": "",
  "offload_param_nvme_path": "",
  "optimizer_args": ["decouple=True", "weight_decay=0.01", "d_coef=0.8", "use_bias_correction=True", "safeguard_warmup=True", "betas=0.9,0.99"],
  "optimizer_type": "Prodigy",
  "output_config": "",
  "output_dir": "",
  "output_name": "",
  "persistent_data_loader_workers": "",
  "pretrained_model_name_or_path": "",
  "random_crop": "",
  "reg_data_dir": "",
  "resolution": "1024,1024",
  "resume": "",
  "resume_from_huggingface": "",
  "sample_at_first": "",
  "sample_every_n_epochs": "",
  "sample_every_n_steps": "",
  "sample_prompts": "",
  "sample_sampler": "",
  "save_every_n_epochs": "",
  "save_every_n_steps": "",
  "save_last_n_epochs": "",
  "save_last_n_epochs_state": "",
  "save_last_n_steps": "",
  "save_last_n_steps_state": "",
  "save_model_as": "safetensors",
  "save_n_epoch_ratio": "",
  "save_precision": "bf16",
  "save_state": "",
  "save_state_on_train_end": "",
  "save_state_to_huggingface": "",
  "scale_v_pred_loss_like_noise_pred": "",
  "sdpa": "",
  "secondary_separator": "",
  "seed": "",
  "shuffle_caption": "",
  "sigmoid_scale": 1.0,
  "single_blocks_to_swap": "",
  "skip_latents_validity_check": "",
  "t5xxl": "",
  "t5xxl_max_token_length": "",
  "text_encoder_batch_size": "",
  "timestep_sampling": "sigma",
  "token_warmup_min": "",
  "token_warmup_step": "",
  "tokenizer_cache_dir": "",
  "torch_compile": "",
  "train_batch_size": "",
  "train_data_dir": "",
  "use_8bit_adam": "",
  "use_lion_optimizer": "",
  "use_safetensors": "",
  "v2": "",
  "v_parameterization": "",
  "v_pred_like_loss": "",
  "vae": "",
  "vae_batch_size": "",
  "wandb_api_key": "",
  "wandb_run_name": "",
  "weighted_captions": "",
  "weighting_scheme": "none",
  "xformers": "",
  "zero3_init_flag": "",
  "zero3_save_16bit_model": "",
  "zero_stage": "",
  "zero_terminal_snr": ""
}

My Accelerate config:

command_file: null
commands: null
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: 'NO'
downcast_bf16: 'no'
dynamo_backend: 'NO'
fsdp_config: {}
gpu_ids: all
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
megatron_lm_config: {}
mixed_precision: 'bf16'
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
tpu_name: null
tpu_zone: null
use_cpu: false
tristanwqy commented 3 weeks ago

You'd better cache all the latents and text encoder (TE) outputs, especially the TE outputs.

If you don't cache the TE outputs, the whole text encoder has to stay on the GPU for every batch, which costs a huge amount of memory.

Turning off highvram and using xformers may help as well.
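
For reference, that advice maps onto keys that are already present (but empty) in the configuration posted above. A minimal sketch of just the keys that would change, assuming the empty-string values simply mean "unset":

{
  "cache_latents": true,
  "cache_latents_to_disk": true,
  "cache_text_encoder_outputs": true,
  "cache_text_encoder_outputs_to_disk": true,
  "highvram": "",
  "xformers": true
}

Caching to disk is optional; it just avoids recomputing the latents and TE outputs on later runs. The main point is that cached TE outputs keep the text encoders out of GPU memory during the training loop.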

BootsofLagrangian commented 3 weeks ago

With cached outputs, both the latents and the text conditions, you can fine-tune Flux on an 80GB GPU.

I tested on 1 x A100 80GB with adamw8bit.
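
If the Prodigy optimizer state is part of what overflows memory, switching to the 8-bit AdamW mentioned above is a small config change. A hedged sketch, assuming sd-scripts accepts "AdamW8bit" as the optimizer_type spelling and that the Prodigy-specific optimizer_args are dropped:

{
  "optimizer_type": "AdamW8bit",
  "optimizer_args": []
}

Note that "learning_rate": 1 in the posted config only makes sense with Prodigy's adaptive step sizing; with AdamW8bit it would need a conventional learning rate.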

defnotkenski commented 3 weeks ago

> With cached outputs, both the latents and the text conditions, you can fine-tune Flux on an 80GB GPU.
>
> I tested on 1 x A100 80GB with adamw8bit.

> You'd better cache all the latents and text encoder (TE) outputs, especially the TE outputs.
>
> If you don't cache the TE outputs, the whole text encoder has to stay on the GPU for every batch, which costs a huge amount of memory.
>
> Turning off highvram and using xformers may help as well.

Thanks, that did the trick. Ran into another error, but at least it's not a memory issue anymore, haha.