kohya-ss / sd-scripts

Flux CUDA memory issue on H100 #1517

Closed. defnotkenski closed this issue 3 weeks ago.

defnotkenski commented 3 weeks ago

Hey guys,

I'm getting CUDA out-of-memory errors while running the flux_train.py script on an H100 with 80GB of VRAM, using the Prodigy optimizer.

Any configuration recommendations to get this working? Thanks.

The error in question:

[Screenshot: CUDA out-of-memory traceback]

The executed command:

[Screenshot: the flux_train.py launch command]

My complete configuration:

{
  "adaptive_noise_scale": "",
  "ae": "",
  "alpha_mask": "",
  "apply_t5_attn_mask": "",
  "async_upload": "",
  "blockwise_fused_optimizers": "",
  "bucket_no_upscale": true,
  "bucket_reso_steps": "",
  "cache_info": "",
  "cache_latents": "",
  "cache_latents_to_disk": "",
  "cache_text_encoder_outputs": "",
  "cache_text_encoder_outputs_to_disk": "",
  "caption_dropout_every_n_epochs": "",
  "caption_dropout_rate": "",
  "caption_extension": "",
  "caption_prefix": "",
  "caption_separator": "",
  "caption_suffix": "",
  "caption_tag_dropout_rate": "",
  "clip_l": "",
  "clip_skip": 1,
  "color_aug": "",
  "conditioning_data_dir": "",
  "config_file": "",
  "console_log_file": "",
  "console_log_level": "",
  "console_log_simple": "",
  "cpu_offload_checkpointing": "",
  "dataset_class": "",
  "dataset_config": "",
  "dataset_repeats": "",
  "ddp_gradient_as_bucket_view": "",
  "ddp_static_graph": "",
  "ddp_timeout": "",
  "debiased_estimation_loss": "",
  "debug_dataset": "",
  "deepspeed": "",
  "disable_mmap_load_safetensors": "",
  "discrete_flow_shift": 3.0,
  "double_blocks_to_swap": "",
  "dynamo_backend": "",
  "enable_bucket": true,
  "enable_wildcard": "",
  "face_crop_aug_range": "",
  "flip_aug": "",
  "fp16_master_weights_and_gradients": "",
  "fp8_base": "",
  "full_bf16": true,
  "full_fp16": "",
  "fused_backward_pass": "",
  "fused_optimizer_groups": "",
  "gradient_accumulation_steps": "",
  "gradient_checkpointing": true,
  "guidance_scale": 1.0,
  "highvram": true,
  "huber_c": "",
  "huber_schedule": "",
  "huggingface_path_in_repo": "",
  "huggingface_repo_id": "",
  "huggingface_repo_type": "",
  "huggingface_repo_visibility": "",
  "huggingface_token": "",
  "in_json": "",
  "ip_noise_gamma": "",
  "ip_noise_gamma_random_strength": "",
  "keep_tokens": "",
  "keep_tokens_separator": "",
  "learning_rate": 1,
  "log_config": "",
  "log_prefix": "",
  "log_tracker_config": "",
  "log_tracker_name": "",
  "log_with": "",
  "logging_dir": "",
  "logit_mean": 0.0,
  "logit_std": 1.0,
  "loss_type": "l2",
  "lowram": "",
  "lr_scheduler": "cosine",
  "lr_scheduler_args": "",
  "lr_scheduler_num_cycles": "",
  "lr_scheduler_power": "",
  "lr_scheduler_type": "",
  "lr_warmup_steps": "",
  "masked_loss": "",
  "max_bucket_reso": 2048,
  "max_data_loader_n_workers": "",
  "max_grad_norm": "",
  "max_timestep": "",
  "max_token_length": "",
  "max_train_epochs": 130,
  "max_train_steps": 0,
  "mem_eff_attn": "",
  "mem_eff_save": "",
  "min_bucket_reso": "",
  "min_snr_gamma": 5,
  "min_timestep": "",
  "mixed_precision": "bf16",
  "mode_scale": "",
  "model_prediction_type": "sigma_scaled",
  "multires_noise_discount": 0.1,
  "multires_noise_iterations": 6,
  "noise_offset": "",
  "noise_offset_random_strength": "",
  "offload_optimizer_device": "",
  "offload_optimizer_nvme_path": "",
  "offload_param_device": "",
  "offload_param_nvme_path": "",
  "optimizer_args": ["decouple=True", "weight_decay=0.01", "d_coef=0.8", "use_bias_correction=True", "safeguard_warmup=True", "betas=0.9,0.99"],
  "optimizer_type": "Prodigy",
  "output_config": "",
  "output_dir": "",
  "output_name": "",
  "persistent_data_loader_workers": "",
  "pretrained_model_name_or_path": "",
  "random_crop": "",
  "reg_data_dir": "",
  "resolution": "1024,1024",
  "resume": "",
  "resume_from_huggingface": "",
  "sample_at_first": "",
  "sample_every_n_epochs": "",
  "sample_every_n_steps": "",
  "sample_prompts": "",
  "sample_sampler": "",
  "save_every_n_epochs": "",
  "save_every_n_steps": "",
  "save_last_n_epochs": "",
  "save_last_n_epochs_state": "",
  "save_last_n_steps": "",
  "save_last_n_steps_state": "",
  "save_model_as": "safetensors",
  "save_n_epoch_ratio": "",
  "save_precision": "bf16",
  "save_state": "",
  "save_state_on_train_end": "",
  "save_state_to_huggingface": "",
  "scale_v_pred_loss_like_noise_pred": "",
  "sdpa": "",
  "secondary_separator": "",
  "seed": "",
  "shuffle_caption": "",
  "sigmoid_scale": 1.0,
  "single_blocks_to_swap": "",
  "skip_latents_validity_check": "",
  "t5xxl": "",
  "t5xxl_max_token_length": "",
  "text_encoder_batch_size": "",
  "timestep_sampling": "sigma",
  "token_warmup_min": "",
  "token_warmup_step": "",
  "tokenizer_cache_dir": "",
  "torch_compile": "",
  "train_batch_size": "",
  "train_data_dir": "",
  "use_8bit_adam": "",
  "use_lion_optimizer": "",
  "use_safetensors": "",
  "v2": "",
  "v_parameterization": "",
  "v_pred_like_loss": "",
  "vae": "",
  "vae_batch_size": "",
  "wandb_api_key": "",
  "wandb_run_name": "",
  "weighted_captions": "",
  "weighting_scheme": "none",
  "xformers": "",
  "zero3_init_flag": "",
  "zero3_save_16bit_model": "",
  "zero_stage": "",
  "zero_terminal_snr": ""
}

My Accelerate config:

command_file: null
commands: null
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: 'NO'
downcast_bf16: 'no'
dynamo_backend: 'NO'
fsdp_config: {}
gpu_ids: all
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
megatron_lm_config: {}
mixed_precision: 'bf16'
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
tpu_name: null
tpu_zone: null
use_cpu: false
tristanwqy commented 3 weeks ago

You'd better cache all the latents and text encoder (TE) outputs, especially the TE outputs.

If you don't cache the TE outputs, the whole text encoder has to stay on the GPU for every batch, which costs a huge amount of memory.

Turning off highvram and using xformers may help as well.
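
For reference, that advice maps onto keys that are already present (but empty) in the configuration posted above. A minimal sketch of just the keys that would change, assuming the empty-string values simply mean "unset":

{
  "cache_latents": true,
  "cache_latents_to_disk": true,
  "cache_text_encoder_outputs": true,
  "cache_text_encoder_outputs_to_disk": true,
  "highvram": "",
  "xformers": true
}

Caching to disk is optional; it just avoids recomputing the latents and TE outputs on later runs. The main point is that cached TE outputs keep the text encoders out of GPU memory during the training loop.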

BootsofLagrangian commented 3 weeks ago

With cached outputs, both the latents and the text conditions, you can fine-tune Flux on an 80GB GPU.

I tested on 1 x A100 80GB with adamw8bit.
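
If the Prodigy optimizer state is part of what overflows memory, switching to the 8-bit AdamW mentioned above is a small config change. A hedged sketch, assuming sd-scripts accepts "AdamW8bit" as the optimizer_type spelling and that the Prodigy-specific optimizer_args are dropped:

{
  "optimizer_type": "AdamW8bit",
  "optimizer_args": []
}

Note that "learning_rate": 1 in the posted config only makes sense with Prodigy's adaptive step sizing; with AdamW8bit it would need a conventional learning rate.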

defnotkenski commented 3 weeks ago

> With cached outputs, both the latents and the text conditions, you can fine-tune Flux on an 80GB GPU.
>
> I tested on 1 x A100 80GB with adamw8bit.

> You'd better cache all the latents and text encoder (TE) outputs, especially the TE outputs.
>
> If you don't cache the TE outputs, the whole text encoder has to stay on the GPU for every batch, which costs a huge amount of memory.
>
> Turning off highvram and using xformers may help as well.

Thanks, that did the trick. Ran into another error, but at least it's not a memory issue anymore, haha.