Closed · AnddyWang closed this issue 3 years ago
@gitabtion During training, before even one epoch finishes, I get a CUDA out-of-memory error:
Traceback (most recent call last):
File "main.py", line 101, in <module>
main()
File "main.py", line 93, in main
trainer.fit(model, train_loader, valid_loader)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 510, in fit
results = self.accelerator_backend.train()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 57, in train
return self.train_or_test()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in train_or_test
results = self.trainer.train()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in train
self.train_loop.run_training_epoch()
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 549, in run_training_epoch
batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 678, in run_training_batch
self.trainer.hiddens)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 802, in training_step_and_backward
self.backward(result, optimizer, opt_idx)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 829, in backward
result.closure_loss, optimizer, opt_idx, *args, **kwargs
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 109, in backward
model.backward(closure_loss, optimizer, opt_idx, *args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1162, in backward
loss.backward(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/torch/tensor.py", line 185, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/opt/conda/lib/python3.7/site-packages/torch/autograd/init.py", line 127, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 666.00 MiB (GPU 0; 15.78 GiB total capacity; 12.76 GiB already allocated; 184.19 MiB free; 13.50 GiB reserved in total by PyTorch)
Exception raised from malloc at /opt/conda/conda-bld/pytorch_1595629403081/work/c10/cuda/CUDACachingAllocator.cpp:272 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x4d (0x7eff1c9bf77d in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: + 0x20626 (0x7eff1cc17626 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #2: + 0x214f4 (0x7eff1cc184f4 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #3: + 0x21b81 (0x7eff1cc18b81 in /opt/conda/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #4: at::native::empty_cuda(c10::ArrayRef, c10::TensorOptions const&, c10::optional) + 0x249 (0x7eff1fb27c79 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #5: + 0xd25dc9 (0x7eff1db4adc9 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #6: + 0xd3fbf7 (0x7eff1db64bf7 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #7: + 0xe450dd (0x7eff4fc7e0dd in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #8: + 0xe453f7 (0x7eff4fc7e3f7 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #9: at::empty(c10::ArrayRef, c10::TensorOptions const&, c10::optional) + 0xfa (0x7eff4fd88e7a in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #10: at::native::empty_like(at::Tensor const&, c10::TensorOptions const&, c10::optional) + 0x49e (0x7eff4fa0709e in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #11: + 0xfe3521 (0x7eff4fe1c521 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #12: + 0x101ecc3 (0x7eff4fe57cc3 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #13: at::empty_like(at::Tensor const&, c10::TensorOptions const&, c10::optional) + 0x101 (0x7eff4fd6bf91 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #14: at::Tensor at::native::(anonymous namespace)::host_softmax_backward<at::native::(anonymous namespace)::LogSoftMaxBackwardEpilogue, true>(at::Tensor const&, at::Tensor const&, long, bool) + 0x16c (0x7eff1f275eac in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #15: at::native::log_softmax_backward_cuda(at::Tensor const&, at::Tensor const&, long, at::Tensor const&) + 0x8d (0x7eff1f25117d in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #16: + 0xd13a40 (0x7eff1db38a40 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #17: + 0xe6f636 (0x7eff4fca8636 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #18: at::_log_softmax_backward_data(at::Tensor const&, at::Tensor const&, long, at::Tensor const&) + 0x119 (0x7eff4fd36aa9 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #19: + 0x2c217ff (0x7eff51a5a7ff in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #20: + 0xe6f636 (0x7eff4fca8636 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #21: at::_log_softmax_backward_data(at::Tensor const&, at::Tensor const&, long, at::Tensor const&) + 0x119 (0x7eff4fd36aa9 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #22: torch::autograd::generated::LogSoftmaxBackward::apply(std::vector<at::Tensor, std::allocator >&&) + 0x1d7 (0x7eff518d64b7 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #23: + 0x30d1017 (0x7eff51f0a017 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #24: torch::autograd::Engine::evaluate_function(std::shared_ptr&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr const&) + 0x1400 (0x7eff51f05860 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #25: torch::autograd::Engine::thread_main(std::shared_ptr const&) + 0x451 (0x7eff51f06401 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #26: torch::autograd::Engine::thread_init(int, std::shared_ptr const&, bool) + 0x89 (0x7eff51efe579 in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #27: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr const&, bool) + 0x4a (0x7eff5622d99a in /opt/conda/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #28: + 0xc819d (0x7eff58d7819d in /opt/conda/lib/python3.7/site-packages/torch/lib/../../../.././libstdc++.so.6)
frame #29: + 0x76db (0x7eff7154a6db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #30: clone + 0x3f (0x7eff7127388f in /lib/x86_64-linux-gnu/libc.so.6)

Just lower the batch size a little. After one complete train + validation pass, GPU memory usage will not keep growing.
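For reference, a minimal self-contained sketch of that fix. The model, data, and batch-size values below are illustrative placeholders, not this repository's actual code or config; the `accumulate_grad_batches` line is an optional extra (standard PyTorch Lightning), not part of the reply above.

```python
# Illustrative sketch only: a toy model and random data stand in for the real
# BERT correction model; the batch-size numbers are placeholders.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl


class ToyModule(pl.LightningModule):
    """Tiny stand-in for the real LightningModule trained by main.py."""

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return nn.functional.cross_entropy(self.layer(x), y)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        self.log("val_loss", nn.functional.cross_entropy(self.layer(x), y))

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)


train_ds = TensorDataset(torch.randn(256, 32), torch.randint(0, 2, (256,)))
valid_ds = TensorDataset(torch.randn(64, 32), torch.randint(0, 2, (64,)))

# Halve (or quarter) the batch size until the OOM disappears ...
batch_size = 16
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_ds, batch_size=batch_size)

trainer = pl.Trainer(
    max_epochs=1,
    # ... and, optionally, accumulate gradients so the *effective* batch size
    # (16 * 2 = 32 here) stays close to the original one at a lower peak memory cost.
    accumulate_grad_batches=2,
)
trainer.fit(ToyModule(), train_loader, valid_loader)
```

Gradient accumulation spreads one optimizer step over several smaller forward/backward passes, so parameter updates see roughly the same effective batch while the per-step activation memory drops.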