Closed jackalcooper closed 2 years ago
能稳定复现吗?
能稳定复现吗?
能
When run without gdb, the Python stack is slightly different, but it also
contains a `_numpy` frame:
RUN_SLOW=1 python3 -m pytest tests/test_pipelines_oneflow.py::PipelineTesterMixin::test_stable_diffusion_memory_chunking
============================================ test session starts ============================================
platform linux -- Python 3.8.10, pytest-7.0.1, pluggy-1.0.0
rootdir: /home/caishenghang/diffusers
plugins: timeout-2.1.0, forked-1.4.0, xdist-2.5.0
collected 1 item
tests/test_pipelines_oneflow.py Fatal Python error: Aborted
Thread 0x00007f902dfff700 (most recent call first):
File "/usr/lib/python3.8/threading.py", line 306 in wait
File "/usr/lib/python3.8/threading.py", line 558 in wait
File "/home/caishenghang/.local/lib/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007f93ce7e0740 (most recent call first):
File "/mnt/DATA/csh/oneflow-cuda-bare-metal/python/oneflow/framework/tensor.py", line 473 in _numpy
File "/mnt/DATA/csh/oneflow-cuda-bare-metal/python/oneflow/framework/tensor.py", line 175 in _item
File "/home/caishenghang/diffusers/src/diffusers/modeling_oneflow_utils.py", line 44 in extract_scalar
File "/home/caishenghang/diffusers/src/diffusers/schedulers/scheduling_pndm_oneflow.py", line 382 in _get_prev_sample
File "/home/caishenghang/diffusers/src/diffusers/schedulers/scheduling_pndm_oneflow.py", line 338 in step_plms
File "/home/caishenghang/diffusers/src/diffusers/schedulers/scheduling_pndm_oneflow.py", line 223 in step
File "/home/caishenghang/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_oneflow.py", line 280 in __call__
File "/mnt/DATA/csh/oneflow-cuda-bare-metal/python/oneflow/autograd/autograd_mode.py", line 154 in wrapper
File "/home/caishenghang/diffusers/tests/test_pipelines_oneflow.py", line 1188 in test_stable_diffusion_memory_chunking
File "/usr/lib/python3.8/unittest/case.py", line 633 in _callTestMethod
File "/usr/lib/python3.8/unittest/case.py", line 676 in run
File "/usr/lib/python3.8/unittest/case.py", line 736 in __call__
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/unittest.py", line 327 in runtest
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 168 in pytest_runtest_call
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_callers.py", line 39 in _multicall
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_manager.py", line 80 in _hookexec
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_hooks.py", line 265 in __call__
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 261 in <lambda>
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 340 in from_call
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 260 in call_runtest_hook
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 221 in call_and_report
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 132 in runtestprotocol
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 113 in pytest_runtest_protocol
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_callers.py", line 39 in _multicall
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_manager.py", line 80 in _hookexec
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_hooks.py", line 265 in __call__
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/main.py", line 347 in pytest_runtestloop
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_callers.py", line 39 in _multicall
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_manager.py", line 80 in _hookexec
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_hooks.py", line 265 in __call__
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/main.py", line 322 in _main
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/main.py", line 268 in wrap_session
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/main.py", line 315 in pytest_cmdline_main
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_callers.py", line 39 in _multicall
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_manager.py", line 80 in _hookexec
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_hooks.py", line 265 in __call__
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/config/__init__.py", line 165 in main
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/config/__init__.py", line 188 in console_main
File "/home/caishenghang/.local/lib/python3.8/site-packages/pytest/__main__.py", line 5 in <module>
File "/usr/lib/python3.8/runpy.py", line 87 in _run_code
File "/usr/lib/python3.8/runpy.py", line 194 in _run_module_as_main
Aborted (core dumped)
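Update: it is an OOM (out-of-memory) error. pytest was capturing the output, so the error message was not shown; rerunning with `--capture=no` reveals it: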
PYTHONUNBUFFERED=1 RUN_SLOW=1 python3 -m pytest --capture=no tests/test_pipelines_oneflow.py::PipelineTesterMixin::test_stable_diffusion_memory_chunking
============================================ test session starts ============================================
platform linux -- Python 3.8.10, pytest-7.0.1, pluggy-1.0.0
rootdir: /home/caishenghang/diffusers
plugins: timeout-2.1.0, forked-1.4.0, xdist-2.5.0
collected 1 item
tests/test_pipelines_oneflow.py OneFlowPNDMScheduler {'num_train_timesteps': 1000, 'beta_start': 0.00085, 'beta_end': 0.012, 'beta_schedule': 'scaled_linear', 'trained_betas': None, 'skip_prk_steps': True, 'set_alpha_to_one': False, 'steps_offset': 0}
OneFlowUNet2DConditionModel {'sample_size': 64, 'in_channels': 4, 'out_channels': 4, 'center_input_sample': False, 'flip_sin_to_cos': True, 'freq_shift': 0, 'down_block_types': ['CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D'], 'up_block_types': ['UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'], 'block_out_channels': [320, 640, 1280, 1280], 'layers_per_block': 2, 'downsample_padding': 1, 'mid_block_scale_factor': 1, 'act_fn': 'silu', 'norm_num_groups': 32, 'norm_eps': 1e-05, 'cross_attention_dim': 768, 'attention_head_dim': 8}
OneFlowAutoencoderKL {'in_channels': 3, 'out_channels': 3, 'down_block_types': ['DownEncoderBlock2D', 'DownEncoderBlock2D', 'DownEncoderBlock2D', 'DownEncoderBlock2D'], 'up_block_types': ['UpDecoderBlock2D', 'UpDecoderBlock2D', 'UpDecoderBlock2D', 'UpDecoderBlock2D'], 'block_out_channels': [128, 256, 512, 512], 'layers_per_block': 2, 'act_fn': 'silu', 'latent_channels': 4, 'norm_num_groups': 32, 'sample_size': 512}
0%| | 0/11 [00:00<?, ?it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
9%|██████▋ | 1/11 [00:00<00:09, 1.04it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
18%|█████████████▎ | 2/11 [00:01<00:04, 1.91it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
27%|███████████████████▉ | 3/11 [00:01<00:03, 2.50it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
36%|██████████████████████████▌ | 4/11 [00:01<00:02, 2.94it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
45%|█████████████████████████████████▏ | 5/11 [00:01<00:01, 3.23it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
55%|███████████████████████████████████████▊ | 6/11 [00:02<00:01, 3.38it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
64%|██████████████████████████████████████████████▍ | 7/11 [00:02<00:01, 3.50it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
73%|█████████████████████████████████████████████████████ | 8/11 [00:02<00:00, 3.62it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
82%|███████████████████████████████████████████████████████████▋ | 9/11 [00:02<00:00, 3.68it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
91%|█████████████████████████████████████████████████████████████████▍ | 10/11 [00:03<00:00, 3.75it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
100%|████████████████████████████████████████████████████████████████████████| 11/11 [00:03<00:00, 3.19it/s]
0%| | 0/11 [00:00<?, ?it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
9%|██████▋ | 1/11 [00:00<00:03, 3.20it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
18%|█████████████▎ | 2/11 [00:00<00:02, 3.43it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
27%|███████████████████▉ | 3/11 [00:00<00:02, 3.97it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
36%|██████████████████████████▌ | 4/11 [00:01<00:01, 4.28it/s]prev_timestep.dtype oneflow.int64
prev_timestep.device cpu:0
prev_timestep.shape oneflow.Size([])
W20220930 10:27:09.851022 3479310 ep_backend_allocator.cpp:37] OOM error is detected, process will exit. And it will start to reset CUDA device for releasing device memory.
F20220930 10:27:09.850808 3479310 virtual_machine_engine.cpp:390] out of memory
Error message from /mnt/DATA/csh/oneflow-cuda-bare-metal/oneflow/core/vm/virtual_machine_engine.cpp:390
instruction->Prepare(): reset device
File "/mnt/DATA/csh/oneflow-cuda-bare-metal/oneflow/core/vm/virtual_machine_engine.cpp", line 390, in DispatchInstruction
instruction->Prepare()
File "/mnt/DATA/csh/oneflow-cuda-bare-metal/oneflow/core/vm/op_call_instruction_policy.cpp", line 31, in Prepare
AllocateOutputBlobsMemory(op_call_instruction_policy, allocator)
File "/mnt/DATA/csh/oneflow-cuda-bare-metal/oneflow/core/vm/op_call_instruction_policy.cpp", line 82, in AllocateOutputBlobsMemory
blob_object->TryAllocateBlobBodyMemory(allocator)
File "/mnt/DATA/csh/oneflow-cuda-bare-metal/oneflow/core/eager/eager_blob_object.cpp", line 113, in TryAllocateBlobBodyMemory
allocator->Allocate(&dptr, required_body_bytes)
File "/mnt/DATA/csh/oneflow-cuda-bare-metal/oneflow/core/vm/bin_allocator.h", line 392, in Allocate
AllocateBlockToExtendTotalMem(aligned_size)
File "/mnt/DATA/csh/oneflow-cuda-bare-metal/oneflow/core/vm/bin_allocator.h", line 305, in AllocateBlockToExtendTotalMem
backend_->Allocate(&mem_ptr, final_allocate_bytes)
Error Type: oneflow.ErrorProto.out_of_memory_error
*** Check failure stack trace: ***
@ 0x7efaa9296cd3 google::LogMessage::Fail()
@ 0x7efaa929bf2b google::LogMessage::SendToLog()
@ 0x7efaa929693f google::LogMessage::Flush()
@ 0x7efaa92971ff google::LogMessageFatal::~LogMessageFatal()
@ 0x7efacb0e09ba _ZZN7oneflow2vm20VirtualMachineEngine19DispatchInstructionIXadL_ZNS1_34BusyWaitInstructionsDoneThenShrinkEPNS0_6StreamERKNS0_11ScheduleCtxEEEEEvPNS0_11InstructionES7_ENKUlPKcE_clESB_
@ 0x7efacb0e1f4a oneflow::vm::VirtualMachineEngine::DispatchInstruction<>()
@ 0x7efacb0dc517 oneflow::vm::VirtualMachineEngine::DispatchAndPrescheduleInstructions()
@ 0x7efacb0dd72b oneflow::vm::VirtualMachineEngine::Schedule()
@ 0x7efacb0d1099 oneflow::VirtualMachine::ScheduleLoop()
@ 0x7efb870f3de4 (unknown)
@ 0x7efc3cd9d609 start_thread
@ 0x7efc3ced7163 clone
Fatal Python error: Aborted
Thread 0x00007ef8a9f5a700 (most recent call first):
File "/usr/lib/python3.8/threading.py", line 306 in wait
File "/usr/lib/python3.8/threading.py", line 558 in wait
File "/home/caishenghang/.local/lib/python3.8/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib/python3.8/threading.py", line 932 in _bootstrap_inner
File "/usr/lib/python3.8/threading.py", line 890 in _bootstrap
Thread 0x00007efc3cbec740 (most recent call first):
File "/mnt/DATA/csh/oneflow-cuda-bare-metal/python/oneflow/framework/tensor.py", line 473 in _numpy
File "/mnt/DATA/csh/oneflow-cuda-bare-metal/python/oneflow/framework/tensor.py", line 175 in _item
File "/home/caishenghang/diffusers/src/diffusers/modeling_oneflow_utils.py", line 44 in extract_scalar
File "/home/caishenghang/diffusers/src/diffusers/schedulers/scheduling_pndm_oneflow.py", line 386 in _get_prev_sample
File "/home/caishenghang/diffusers/src/diffusers/schedulers/scheduling_pndm_oneflow.py", line 338 in step_plms
File "/home/caishenghang/diffusers/src/diffusers/schedulers/scheduling_pndm_oneflow.py", line 223 in step
File "/home/caishenghang/diffusers/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_oneflow.py", line 280 in __call__
File "/mnt/DATA/csh/oneflow-cuda-bare-metal/python/oneflow/autograd/autograd_mode.py", line 154 in wrapper
File "/home/caishenghang/diffusers/tests/test_pipelines_oneflow.py", line 1188 in test_stable_diffusion_memory_chunking
File "/usr/lib/python3.8/unittest/case.py", line 633 in _callTestMethod
File "/usr/lib/python3.8/unittest/case.py", line 676 in run
File "/usr/lib/python3.8/unittest/case.py", line 736 in __call__
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/unittest.py", line 327 in runtest
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 168 in pytest_runtest_call
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_callers.py", line 39 in _multicall
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_manager.py", line 80 in _hookexec
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_hooks.py", line 265 in __call__
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 261 in <lambda>
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 340 in from_call
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 260 in call_runtest_hook
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 221 in call_and_report
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 132 in runtestprotocol
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/runner.py", line 113 in pytest_runtest_protocol
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_callers.py", line 39 in _multicall
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_manager.py", line 80 in _hookexec
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_hooks.py", line 265 in __call__
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/main.py", line 347 in pytest_runtestloop
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_callers.py", line 39 in _multicall
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_manager.py", line 80 in _hookexec
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_hooks.py", line 265 in __call__
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/main.py", line 322 in _main
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/main.py", line 268 in wrap_session
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/main.py", line 315 in pytest_cmdline_main
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_callers.py", line 39 in _multicall
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_manager.py", line 80 in _hookexec
File "/home/caishenghang/.local/lib/python3.8/site-packages/pluggy/_hooks.py", line 265 in __call__
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/config/__init__.py", line 165 in main
File "/home/caishenghang/.local/lib/python3.8/site-packages/_pytest/config/__init__.py", line 188 in console_main
File "/home/caishenghang/.local/lib/python3.8/site-packages/pytest/__main__.py", line 5 in <module>
File "/usr/lib/python3.8/runpy.py", line 87 in _run_code
File "/usr/lib/python3.8/runpy.py", line 194 in _run_module_as_main
Aborted (core dumped)
Summary
code
terminal output
stack in GDB
A short description about the bug/issue
Code to reproduce bug
Please post a minimal example to repro the bug. GitHub Gist or repo is highly recommended.
System Information
python3 -m oneflow --doctor
):