PaddlePaddle / PaddleOCR

Awesome multilingual OCR toolkit based on PaddlePaddle (a practical ultra-lightweight OCR system that supports recognition of 80+ languages, provides data annotation and synthesis tools, and supports training and deployment on server, mobile, embedded, and IoT devices)
https://paddlepaddle.github.io/PaddleOCR/
Apache License 2.0

The program crashes when calling the C++ inference library from multiple threads for OCR inference!!!! #8823

Closed marsbzp closed 1 year ago

marsbzp commented 1 year ago

System Environment: CentOS
Version: Paddle_inference 2.3, PaddleOCR 2.4, CUDA 10.1
Command Code:
Complete Error Message: The program crashes when calling the C++ inference library from multiple threads for OCR inference, using the recognition model ch_ppocr_server_v2.0_rec_infer downloaded from the official site. The crash is intermittent: the more threads, the more likely it is to occur. When it does not crash, inference results are normal. It almost always crashes in the RNN operator, so could there be a problem with Paddle's RNN operator implementation? (Each thread has its own detection and recognition models; no models are shared between threads.) This is essentially the same as the issue below, and the description there can be used to reproduce it: https://github.com/PaddlePaddle/PaddleOCR/issues/6514
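For context, below is a minimal, hypothetical sketch (not the actual repro code from this report) of the per-thread setup described above, using the Paddle Inference 2.x C++ API: every thread builds its own predictor from the same ch_ppocr_server_v2.0_rec_infer model directory and runs recognition in a loop. The model file names, input shape, iteration count, and thread count are assumptions for illustration only.

```cpp
// Hypothetical minimal sketch: one recognition predictor per thread, nothing
// shared between threads, mirroring the setup described in this report.
#include <functional>
#include <numeric>
#include <string>
#include <thread>
#include <vector>
#include "paddle_inference_api.h"  // include path depends on your paddle_inference layout

void rec_worker(const std::string &model_dir, int thread_id) {
  paddle_infer::Config config;
  // Exported inference model files (file names assumed).
  config.SetModel(model_dir + "/inference.pdmodel",
                  model_dir + "/inference.pdiparams");
  config.EnableUseGpu(500 /*MB*/, 0 /*GPU id*/);
  auto predictor = paddle_infer::CreatePredictor(config);  // per-thread predictor

  // Dummy normalized input, 1 x 3 x 32 x 320 (typical rec input shape, assumed).
  std::vector<float> input(1 * 3 * 32 * 320, 0.5f);
  for (int iter = 0; iter < 1000; ++iter) {  // exercise the RNN op repeatedly
    auto in = predictor->GetInputHandle(predictor->GetInputNames()[0]);
    in->Reshape({1, 3, 32, 320});
    in->CopyFromCpu(input.data());
    predictor->Run();  // crash reported inside cudnnRNNForwardInference
    auto out = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
    std::vector<int> shape = out->shape();
    std::vector<float> logits(std::accumulate(shape.begin(), shape.end(), 1,
                                              std::multiplies<int>()));
    out->CopyToCpu(logits.data());
  }
}

int main() {
  const std::string model_dir = "./ch_ppocr_server_v2.0_rec_infer";
  std::vector<std::thread> workers;
  for (int i = 0; i < 14; ++i)  // more threads -> easier to hit the crash
    workers.emplace_back(rec_worker, model_dir, i);
  for (auto &t : workers) t.join();
  return 0;
}
```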

The error output is as follows:

```
Detected boxes num: 8
The detection visualized image saved in ./ocr_vis.png
Detected boxes num: 8
The detection visualized image saved in ./ocr_vis.png
Detected boxes num: 8
Detected boxes num: 8
Detected boxes num: 8
The detection visualized image saved in ./ocr_vis.png
The detection visualized image saved in ./ocr_vis.png
The detection visualized image saved in ./ocr_vis.png

Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7ffd397fe700 (LWP 5157)]
0x00007fff5308b04e in ?? () from /lib64/libcuda.so.1
Missing separate debuginfos, use: debuginfo-install glibc-2.17-324.el7_9.x86_64 libgcc-4.8.5-44.el7.x86_64 libgomp-4.8.5-44.el7.x86_64
(gdb) bt
#0  0x00007fff5308b04e in ?? () from /lib64/libcuda.so.1
#1  0x00007fff53204b0f in ?? () from /lib64/libcuda.so.1
#2  0x00007fff530751e0 in ?? () from /lib64/libcuda.so.1
#3  0x00007fff531dded6 in ?? () from /lib64/libcuda.so.1
#4  0x00007fff52f85a1b in ?? () from /lib64/libcuda.so.1
#5  0x00007fff52f85c98 in ?? () from /lib64/libcuda.so.1
#6  0x00007fff52f85cde in ?? () from /lib64/libcuda.so.1
#7  0x00007fff5310c806 in cuLaunchKernel () from /lib64/libcuda.so.1
#8  0x00007fff64b5aa19 in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#9  0x00007fff64b5aaa7 in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#10 0x00007fff64b90e9b in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#11 0x00007fff647f83de in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#12 0x00007fff647f29ea in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#13 0x00007fff646d7a76 in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#14 0x00007fff647a4bf3 in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#15 0x00007fff647e908f in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#16 0x00007fff647ebfd8 in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#17 0x00007fff647ec7bf in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#18 0x00007fff647f23b0 in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#19 0x00007fff645a4342 in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#20 0x00007fff645cf335 in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#21 0x00007fff645d3e81 in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#22 0x00007fff645c1d20 in ?? () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#23 0x00007fff645c277f in cudnnRNNForwardInference () from /usr/local/cuda-10.1/lib64/libcudnn.so.7
#24 0x00007fff995599ef in ?? () from /home/share/disk2/zhangjinlong21/inference_project/ocr_inference/cpp_infer_release/3rdparty/paddle_inference_2.3/paddle/lib/libpaddle_inference.so
#25 0x00007fff9955e2f4 in void phi::RnnKernel<float, phi::GPUContext>(phi::GPUContext const&, phi::DenseTensor const&, std::vector<phi::DenseTensor const, std::allocator<phi::DenseTensor const> > const&, std::vector<phi::DenseTensor const, std::allocator<phi::DenseTensor const> > const&, paddle::optional<phi::DenseTensor const&>, float, bool, int, int, int, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int, bool, phi::DenseTensor, phi::DenseTensor, std::vector<phi::DenseTensor, std::allocator<phi::DenseTensor> >, phi::DenseTensor*) () from /home/share/disk2/zhangjinlong21/inference_project/ocr_inference/cpp_infer_release/3rdparty/paddle_inference_2.3/paddle/lib/libpaddle_inference.so
#26 0x00007fff9955ee5d in void phi::KernelImpl<void ()(phi::GPUContext const&, phi::DenseTensor const&, std::vector<phi::DenseTensor const, std::allocator<phi::DenseTensor const> > const&, std::vector<phi::DenseTensor const, std::allocator<phi::DenseTensor const> > const&, paddle::optional<phi::DenseTensor const&>, float, bool, int, int, int, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int, bool, phi::DenseTensor, phi::DenseTensor, std::vector<phi::DenseTensor, std::allocator<phi::DenseTensor> >, phi::DenseTensor), &(void phi::RnnKernel<float, phi::GPUContext>(phi::GPUContext const&, phi::DenseTensor const&, std::vector<phi::DenseTensor const, std::allocator<phi::DenseTensor const> > const&, std::vector<phi::DenseTensor const, std::allocator<phi::DenseTensor const> > const&, paddle::optional<phi::DenseTensor const&>, float, bool, int, int, int, std::cxx11::basic_string<char, std::char_traits, std::allocator > const&, int, bool, phi::DenseTensor, phi::DenseTensor, std::vector<phi::DenseTensor, std::allocator<phi::DenseTensor> >, phi::DenseTensor))>::KernelCallHelper<std::vector<phi::DenseTensor const, std::allocator<phi::DenseTensor const> > const&, std::vector<phi::DenseTensor const, std::allocator<phi::DenseTensor const*> > const&, paddle::optional<phi::DenseTensor const&>, float, bool, int, int, int, std::cxx11::basic_string<char, std::char_traits, std::allocator > const&, int, bool, phi::DenseTensor, phi::DenseTensor, std::vector<phi::DenseTensor, std::allocator<phi::DenseTensor> >, phi::DenseTensor, phi::TypeTag >::Compute<1, 1, 0, 0, phi::GPUContext const, phi::DenseTensor const>(phi::KernelContext, phi::GPUContext const&, phi::DenseTensor const&) () from /home/share/disk2/zhangjinlong21/inference_project/ocr_inference/cpp_infer_release/3rdparty/paddle_inference_2.3/paddle/lib/libpaddle_inference.so
#27 0x00007fff9cefaa3a in paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, phi::Place const&, paddle::framework::RuntimeContext*) const () from /home/share/disk2/zhangjinlong21/inference_project/ocr_inference/cpp_infer_release/3rdparty/paddle_inference_2.3/paddle/lib/libpaddle_inference.so
#28 0x00007fff9cefb629 in paddle::framework::OperatorWithKernel::RunImpl(paddle::framework::Scope const&, phi::Place const&) const () from /home/share/disk2/zhangjinlong21/inference_project/ocr_inference/cpp_infer_release/3rdparty/paddle_inference_2.3/paddle/lib/libpaddle_inference.so
#29 0x00007fff9ceec10b in paddle::framework::OperatorBase::Run(paddle::framework::Scope const&, phi::Place const&) () from /home/share/disk2/zhangjinlong21/inference_project/ocr_inference/cpp_infer_release/3rdparty/paddle_inference_2.3/paddle/lib/libpaddle_inference.so
#30 0x00007fff96fa23d0 in paddle::framework::NaiveExecutor::Run() () from /home/share/disk2/zhangjinlong21/inference_project/ocr_inference/cpp_infer_release/3rdparty/paddle_inference_2.3/paddle/lib/libpaddle_inference.so
#31 0x00007fff96bfd8fb in paddle::AnalysisPredictor::ZeroCopyRun() () from /home/share/disk2/zhangjinlong21/inference_project/ocr_inference/cpp_infer_release/3rdparty/paddle_inference_2.3/paddle/lib/libpaddle_inference.so
#32 0x000000000044198c in PaddleOCR::CRNNRecognizer::Run (this=0x7ffd397ecfd0, img_list=..., times=0x7ffd397ecf70) at /home/share/disk1/bzp/ocr_inference_multithread/cpp_infer_qm/src/ocr_rec.cpp:63
#33 0x000000000042ce4a in predictor_thread (cv_all_img_names=..., thread_id=14) at /home/share/disk1/bzp/ocr_inference_multithread/cpp_infer_qm/src/main.cpp:139
#34 0x00000000004314e7 in std::invoke_impl<void, void ()(std::vector<cv::String, std::allocator >, int), std::vector<cv::String, std::allocator >, int>(std::__invoke_other, void (&&)(std::vector<cv::String, std::allocator >, int), std::vector<cv::String, std::allocator >&&, int&&) (f=<unknown type in /home/share/disk1/bzp/ocr_inference_multithread/cpp_infer_qm/build/ppocr, CU 0x0, DIE 0x40211>, __args#0=<unknown type in /home/share/disk1/bzp/ocr_inference_multithread/cpp_infer_qm/build/ppocr, CU 0x0, DIE 0x40234>, __args#1=<unknown type in /home/share/disk1/bzp/ocr_inference_multithread/cpp_infer_qm/build/ppocr, CU 0x0, DIE 0x40244>) at /usr/local/gcc-8.2/include/c++/8.2.0/bits/invoke.h:60
#35 0x000000000042fca9 in std::invoke<void ()(std::vector<cv::String, std::allocator >, int), std::vector<cv::String, std::allocator >, int>(void (&&)(std::vector<cv::String, std::allocator >, int), std::vector<cv::String, std::allocator >&&, int&&) (fn=<unknown type in /home/share/disk1/bzp/ocr_inference_multithread/cpp_infer_qm/build/ppocr, CU 0x0, DIE 0x42458>, __args#0=<unknown type in /home/share/disk1/bzp/ocr_inference_multithread/cpp_infer_qm/build/ppocr, CU 0x0, DIE 0x4247a>, __args#1=<unknown type in /home/share/disk1/bzp/ocr_inference_multithread/cpp_infer_qm/build/ppocr, CU 0x0, DIE 0x42489>) at /usr/local/gcc-8.2/include/c++/8.2.0/bits/invoke.h:95
#36 0x00000000004350b9 in std::thread::_Invoker<std::tuple<void (*)(std::vector<cv::String, std::allocator >, int), std::vector<cv::String, std::allocator >, int> >::_M_invoke<0ul, 1ul, 2ul> (this=0x2ace388) at /usr/local/gcc-8.2/include/c++/8.2.0/thread:234
#37 0x0000000000435058 in std::thread::_Invoker<std::tuple<void (*)(std::vector<cv::String, std::allocator >, int), std::vector<cv::String, std::allocator >, int> >::operator() (this=0x2ace388) at /usr/local/gcc-8.2/include/c++/8.2.0/thread:243
#38 0x000000000043503c in std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (*)(std::vector<cv::String, std::allocator >, int), std::vector<cv::String, std::allocator >, int> > >::_M_run (this=0x2ace380) at /usr/local/gcc-8.2/include/c++/8.2.0/thread:186
#39 0x00007ffff7f3f19d in std::execute_native_thread_routine (__p=0x2ace380) at /home/nwani/m3/conda-bld/compilers_linux-64_1560109574129/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
#40 0x00007fff62ccaea5 in start_thread () from /lib64/libpthread.so.0
#41 0x00007fff624db9fd in clone () from /lib64/libc.so.6
(gdb)
```

yiouejv commented 1 year ago

Has this been solved?

marsbzp commented 1 year ago

> Has this been solved?

No, it isn't solved. I would suggest looking at how ONNX Runtime implements this and reworking the Paddle inference library accordingly. I tested the same model with ONNX Runtime 1.4.0 and it runs fine, so the crash is probably caused by Paddle calling the cuDNN RNN operator for inference in its implementation.
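For anyone weighing that workaround, here is a rough, hypothetical sketch of running the recognition model through the ONNX Runtime C++ API instead. It assumes the Paddle model has already been converted to ONNX (e.g. with paddle2onnx); the file name, input/output tensor names, and input shape below are placeholders, not verified values.

```cpp
// Hypothetical sketch: run the converted recognition model with ONNX Runtime.
#include <onnxruntime_cxx_api.h>
#include <cstdint>
#include <vector>

int main() {
  // rec_model.onnx is an assumed name for the paddle2onnx-converted model.
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ppocr-rec");
  Ort::SessionOptions opts;
  opts.SetIntraOpNumThreads(1);
  Ort::Session session(env, "rec_model.onnx", opts);

  // Dummy 1 x 3 x 32 x 320 input; real name/shape depend on the exported model.
  std::vector<int64_t> dims = {1, 3, 32, 320};
  std::vector<float> input(1 * 3 * 32 * 320, 0.5f);
  Ort::MemoryInfo mem =
      Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
  Ort::Value input_tensor = Ort::Value::CreateTensor<float>(
      mem, input.data(), input.size(), dims.data(), dims.size());

  const char *input_names[] = {"x"};                   // assumed input name
  const char *output_names[] = {"softmax_0.tmp_0"};    // assumed output name
  auto outputs = session.Run(Ort::RunOptions{nullptr}, input_names,
                             &input_tensor, 1, output_names, 1);

  float *logits = outputs.front().GetTensorMutableData<float>();
  (void)logits;  // CTC decoding of the recognition output would go here
  return 0;
}
```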

github-actions[bot] commented 1 year ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed in 7 days if no further activity occurs. Thank you for your contributions.