thomasbtnfr opened this issue 2 weeks ago
How many GPUs do you have? What is your run script?
I'm running on 1 node with 4 H100 GPUs.
The script:
import logging
import time

import torch

# xFuser imports, as used by xDiT's Flux example scripts
from xfuser import xFuserArgs, xFuserFluxPipeline
from xfuser.config import FlexibleArgumentParser
from xfuser.core.distributed import (
    get_data_parallel_rank,
    get_data_parallel_world_size,
    get_runtime_state,
    get_world_group,
)


def main():
    parser = FlexibleArgumentParser(description="xFuser Arguments")
    args = xFuserArgs.add_cli_args(parser).parse_args()
    engine_args = xFuserArgs.from_cli_args(args)
    engine_config, input_config = engine_args.create_config()
    engine_config.runtime_config.dtype = torch.bfloat16
    local_rank = get_world_group().local_rank

    pipe = xFuserFluxPipeline.from_pretrained(
        pretrained_model_name_or_path=engine_config.model_config.model,
        engine_config=engine_config,
        torch_dtype=torch.bfloat16,
    )

    if args.enable_sequential_cpu_offload:
        pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
        logging.info(f"rank {local_rank} sequential CPU offload enabled")
    else:
        pipe = pipe.to(f"cuda:{local_rank}")

    parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")

    # Warm-up run; this is where the error occurs
    pipe.prepare_run(input_config, steps=1)

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()
    output = pipe(
        height=input_config.height,
        width=input_config.width,
        prompt=input_config.prompt,
        num_inference_steps=input_config.num_inference_steps,
        output_type=input_config.output_type,
        max_sequence_length=256,
        guidance_scale=0.0,
        generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
    )
    end_time = time.time()
    elapsed_time = end_time - start_time
    peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")

    parallel_info = (
        f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
        f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
        f"tp{engine_args.tensor_parallel_degree}_"
        f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
    )
    if input_config.output_type == "pil":
        dp_group_index = get_data_parallel_rank()
        num_dp_groups = get_data_parallel_world_size()
        dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
        if pipe.is_dp_last_group():
            for i, image in enumerate(output.images):
                image_rank = dp_group_index * dp_batch_size + i
                image_name = f"result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png"
                image.save(image_name)

    if get_world_group().rank == get_world_group().world_size - 1:
        print(
            f"generation time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB"
        )
    get_runtime_state().destory_distributed_env()


if __name__ == "__main__":
    main()
The command I use to run this script is:
python xdit.py \
--model models/FLUX.1-schnell \
--pipefusion_parallel_degree 4 \
--num_inference_steps 20 \
--warmup_steps 1 \
--prompt "A small dog"
I suppose you ran it on 4 GPUs. We cannot reproduce the error. Can you successfully run examples/run.sh?
My file is similar to examples/run.sh. The only difference is that I'm working in a SLURM environment.
Here is an example of a SLURM script I'm running (2 GPUs):
#!/bin/bash
#SBATCH --job-name=xdit
#SBATCH --gres=gpu:2
#SBATCH --ntasks-per-node=2
#SBATCH --nodes=1
#SBATCH --time=00:30:00
## load my python environment
# ...
## launch script on every node
set -x
## define MASTER_ADDR and MASTER_PORT
# MASTER_ADDR=
# MASTER_PORT=
export LAUNCHER="torchrun --nnodes 1 --nproc_per_node 2 \
--nnodes $SLURM_NNODES \
--rdzv_backend c10d \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT"
CMD="xdit_flux.py \
--model models/FLUX.1-dev \
--pipefusion_parallel_degree 2 \
--num_inference_steps 28 \
--warmup_steps 1 \
--prompt 'A small dog' \
--height 1024 \
--width 1024 \
--no_use_resolution_binning"
srun bash -c "$LAUNCHER --node_rank \$SLURM_PROCID $CMD"
With this command, I always get an error in prepare_run.
0%| | 0/1 [00:00<?, ?it/s]
0%| | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 8.25it/s]
0%| | 0/1 [00:02<?, ?it/s]
[rank1]: Traceback (most recent call last):
[rank1]: File "xdit_flux.py", line 101, in <module>
[rank1]: main()
[rank1]: File "xdit_flux.py", line 56, in main
[rank1]: pipe.prepare_run(input_config, steps=1)
[rank1]: File "site-packages/xfuser/model_executor/pipelines/pipeline_flux.py", line 70, in prepare_run
[rank1]: self.__call__(
[rank1]: File "site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank1]: return func(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^
[rank1]: File ".local_xdit/lib/python3.12/site-packages/xfuser/model_executor/pipelines/base_pipeline.py", line 218, in wrapper
[rank1]: return func(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^
[rank1]: File ".local_xdit/lib/python3.12/site-packages/xfuser/model_executor/pipelines/base_pipeline.py", line 166, in data_parallel_fn
[rank1]: return func(self, *args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File ".local_xdit/lib/python3.12/site-packages/xfuser/model_executor/pipelines/base_pipeline.py", line 186, in check_naive_forward_fn
[rank1]: return func(self, *args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File ".local_xdit/lib/python3.12/site-packages/xfuser/model_executor/pipelines/pipeline_flux.py", line 332, in __call__
[rank1]: latents = self._sync_pipeline(
[rank1]: ^^^^^^^^^^^^^^^^^^^^
[rank1]: File ".local_xdit/lib/python3.12/site-packages/xfuser/model_executor/pipelines/pipeline_flux.py", line 432, in _sync_pipeline
[rank1]: latents = get_pp_group().pipeline_recv()
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File ".local_xdit/lib/python3.12/site-packages/xfuser/core/distributed/group_coordinator.py", line 925, in pipeline_recv
[rank1]: self._check_shape_and_buffer(recv_prev=True, name=name, segment_idx=idx)
[rank1]: File ".local_xdit/lib/python3.12/site-packages/xfuser/core/distributed/group_coordinator.py", line 796, in _check_shape_and_buffer
[rank1]: recv_prev_shape = self._communicate_shapes(
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File ".local_xdit/lib/python3.12/site-packages/xfuser/core/distributed/group_coordinator.py", line 859, in _communicate_shapes
[rank1]: reqs = torch.distributed.batch_isend_irecv(ops)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "torch/distributed/distributed_c10d.py", line 2372, in batch_isend_irecv
[rank1]: p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag)
[rank1]: File "torch/distributed/distributed_c10d.py", line 2110, in irecv
[rank1]: return pg.recv([tensor], group_src_rank, tag)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: torch.distributed.DistBackendError: NCCL error in: torch/csrc/distributed/c10d/NCCLUtils.hpp:317, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.21.5
[rank1]: ncclUnhandledCudaError: Call to CUDA function failed.
[rank1]: Last error:
[rank1]: Cuda failure 'an illegal memory access was encountered'
Btw, in the log of my previous message, I wasn't using torchrun, which certainly explains the small difference... Perhaps the call to torchrun in SLURM is incorrect?
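One way to check that is to launch a tiny script with the exact same srun/$LAUNCHER line and print the rank/GPU mapping each process ends up with. A minimal sketch, assuming torchrun sets RANK/LOCAL_RANK/WORLD_SIZE as usual (the file name check_env.py and everything in it are illustrative, not part of xDiT):
import os
import torch
import torch.distributed as dist

def main():
    # RANK/LOCAL_RANK/WORLD_SIZE are set by torchrun; SLURM_PROCID comes from srun.
    rank = int(os.environ.get("RANK", -1))
    local_rank = int(os.environ.get("LOCAL_RANK", -1))
    world_size = int(os.environ.get("WORLD_SIZE", -1))
    slurm_procid = os.environ.get("SLURM_PROCID", "unset")

    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")

    # A one-element all_reduce confirms NCCL can communicate across all ranks.
    t = torch.ones(1, device=f"cuda:{local_rank}")
    dist.all_reduce(t)
    print(
        f"SLURM_PROCID={slurm_procid} RANK={rank} LOCAL_RANK={local_rank} "
        f"WORLD_SIZE={world_size} visible_gpus={torch.cuda.device_count()} "
        f"all_reduce_sum={t.item()}"
    )
    dist.destroy_process_group()

if __name__ == "__main__":
    main()
Each LOCAL_RANK should map to a distinct GPU and the all_reduce sum should equal WORLD_SIZE; duplicated local ranks or a WORLD_SIZE larger than the number of GPUs would point at the srun/torchrun combination rather than at xDiT.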
Can you check your environment with a simple 2-GPU PyTorch P2P program?
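A minimal sketch of such a check, assuming two ranks launched with torchrun --nproc_per_node 2 and the NCCL backend; it exercises the same batch_isend_irecv path that fails in the traceback above (the file name p2p_check.py is just an example):
import os
import torch
import torch.distributed as dist

def main():
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")

    tensor = torch.full((4,), float(rank), device=f"cuda:{local_rank}")
    # Rank 0 sends its tensor to rank 1; rank 1 receives into its buffer.
    if rank == 0:
        ops = [dist.P2POp(dist.isend, tensor, 1)]
    else:
        ops = [dist.P2POp(dist.irecv, tensor, 0)]
    for req in dist.batch_isend_irecv(ops):
        req.wait()
    torch.cuda.synchronize()
    print(f"rank {rank}: tensor = {tensor.tolist()}")  # both ranks should print zeros

    dist.destroy_process_group()

if __name__ == "__main__":
    main()
If this small program fails with the same ncclUnhandledCudaError, the problem is likely in the environment or launcher rather than in xfuser.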
Hello,
I have an error when running the Flux example script with multiple GPUs (H100). I tested with 2 and with 4 GPUs; with only one GPU there is no error and generation works fine. I tested by varying pipefusion_parallel_degree between 2 and 4. The error occurs at this point in the script: pipe.prepare_run(input_config, steps=1). Here is the error message:
Environment information:
Any ideas?