Tested on both A800 and 4090 with the following script:

gpu=4
prompt="a photo of an astronaut riding a horse on mars"
args=(
  --model /mnt/models/source/
  --use_onediff                    # compile the backbone with onediff (nexfort backend)
  --pipefusion_parallel_degree 2   # 2-way PipeFusion
  --use_cfg_parallel               # x 2-way CFG parallel -> matches the 4 GPUs above
  --height 1024
  --width 1024
  --prompt "$prompt"
  --num_inference_steps 12
)
torchrun --nproc_per_node=${gpu} sd3_example.py "${args[@]}"
This fails inside pipe.prepare_run with torch._dynamo.exc.InternalTorchDynamoError: CUDA error: misaligned address:
[rank0]: Traceback (most recent call last):
[rank0]: File "/data/script/sd3_xdit/sd3_example.py", line 107, in <module>
[rank0]: main()
[rank0]: File "/data/script/sd3_xdit/sd3_example.py", line 36, in main
[rank0]: pipe.prepare_run(input_config)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/xfuser/model_executor/pipelines/pipeline_stable_diffusion_3.py", line 75, in prepare_run
[rank0]: self.__call__(
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
[rank0]: return func(*args, **kwargs)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/xfuser/model_executor/pipelines/base_pipeline.py", line 166, in data_parallel_fn
[rank0]: return func(self, *args, **kwargs)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/xfuser/model_executor/pipelines/base_pipeline.py", line 186, in check_naive_forward_fn
[rank0]: return func(self, *args, **kwargs)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/xfuser/model_executor/pipelines/pipeline_stable_diffusion_3.py", line 348, in __call__
[rank0]: latents = self._sync_pipeline(
[rank0]: File "/opt/conda/lib/python3.10/site-packages/xfuser/model_executor/pipelines/pipeline_stable_diffusion_3.py", line 450, in _sync_pipeline
[rank0]: latents, encoder_hidden_states = self._backbone_forward(
[rank0]: File "/opt/conda/lib/python3.10/site-packages/xfuser/model_executor/pipelines/pipeline_stable_diffusion_3.py", line 746, in _backbone_forward
[rank0]: noise_pred, encoder_hidden_states = self.transformer(
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/onediff/infer_compiler/backends/nexfort/deployable_module.py", line 55, in deploy_function
[rank0]: return compiled_model(*args, **kwargs)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 451, in _fn
[rank0]: return fn(*args, **kwargs)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/xfuser/model_executor/models/transformers/transformer_sd3.py", line 149, in forward
[rank0]: encoder_hidden_states, hidden_states = block(
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 921, in catch_errors
[rank0]: return callback(frame, cache_entry, hooks, frame_state, skip=1)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 786, in _convert_frame
[rank0]: result = inner_convert(
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 400, in _convert_frame_assert
[rank0]: return _compile(
[rank0]: File "/opt/conda/lib/python3.10/contextlib.py", line 79, in inner
[rank0]: return func(*args, **kwds)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 703, in _compile
[rank0]: raise InternalTorchDynamoError(str(e)).with_traceback(
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 676, in _compile
[rank0]: guarded_code = compile_inner(code, one_graph, hooks, transform)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 262, in time_wrapper
[rank0]: r = func(*args, **kwargs)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 535, in compile_inner
[rank0]: out_code = transform_code_object(code, transform)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/bytecode_transformation.py", line 1036, in transform_code_object
[rank0]: transformations(instructions, code_options)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 176, in _fn
[rank0]: torch.cuda.set_rng_state(cuda_rng_state) # type: ignore[possibly-undefined]
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/cuda/random.py", line 74, in set_rng_state
[rank0]: _lazy_call(cb)
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/cuda/__init__.py", line 223, in _lazy_call
[rank0]: callable()
[rank0]: File "/opt/conda/lib/python3.10/site-packages/torch/cuda/random.py", line 72, in cb
[rank0]: default_generator.set_state(new_state_copy)
[rank0]: torch._dynamo.exc.InternalTorchDynamoError: CUDA error: misaligned address
[rank0]: CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
[rank0]: For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
[rank0]: Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
[rank0]: You can suppress this exception and fall back to eager by setting:
[rank0]: import torch._dynamo
[rank0]: torch._dynamo.config.suppress_errors = True
W1114 14:59:22.109000 140607834957632 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 13577 closing signal SIGTERM
W1114 14:59:22.109000 140607834957632 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 13578 closing signal SIGTERM
W1114 14:59:22.110000 140607834957632 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 13579 closing signal SIGTERM
E1114 14:59:22.396000 140607834957632 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 13576) of binary: /opt/conda/bin/python
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.3.0', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
run(args)
File "/opt/conda/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
elastic_launch(
File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
sd3_example.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-11-14_14:59:22
host : atmodels-optim-v2-9mbrz-1286637184
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 13576)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
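For reference, the next debugging steps suggested by the error output itself are shown below as a minimal sketch reusing the args array from the script above. Rerunning with CUDA_LAUNCH_BLOCKING=1 is taken directly from the error message; dropping --use_onediff is only an assumption about where the misaligned address comes from (the traceback surfaces it inside the onediff/nexfort-compiled transformer), not a confirmed fix.

# Synchronous kernel launches: the misaligned-address error should then be
# reported at its real call site (debugging aid suggested in the error message).
CUDA_LAUNCH_BLOCKING=1 torchrun --nproc_per_node=${gpu} sd3_example.py "${args[@]}"

# Same run without --use_onediff, to check whether the nexfort-compiled
# transformer is what triggers the error.
args_eager=()
for a in "${args[@]}"; do
  [[ $a == --use_onediff ]] || args_eager+=("$a")
done
torchrun --nproc_per_node=${gpu} sd3_example.py "${args_eager[@]}"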