pykeio / ort

Fast ML inference & training for Rust with ONNX Runtime
https://ort.pyke.io/
Apache License 2.0
859 stars 100 forks source link

ONNX Runtime with CUDA: LoadLibrary failed with error 126 #232

Closed 86maid closed 3 months ago

86maid commented 3 months ago

An error occurs when calling `ort::ExecutionProvider::register` to register the CUDA execution provider.

  let builder = ort::Session::builder()?;

        let cuda = ort::CUDAExecutionProvider::default()
            .with_device_id(device_id)
            .with_arena_extend_strategy(ort::ArenaExtendStrategy::NextPowerOfTwo)
            .with_memory_limit(2 * 1024 * 1024 * 1024)
            .with_conv_algorithm_search(ort::CUDAExecutionProviderCuDNNConvAlgoSearch::Exhaustive)
            .with_copy_in_default_stream(true);

        if !ort::ExecutionProvider::is_available(&cuda)? {
            anyhow::bail!("Please compile ONNX Runtime with CUDA!")
        }

        ort::ExecutionProvider::register(&cuda, &builder).map_err(|v| {
            anyhow::anyhow!("Please check if ONNX Runtime is compiled with CUDA support: {v}")
        })?;

Output:

called `Result::unwrap()` on an `Err` value: Please check if ONNX Runtime is compiled with CUDA support: D:\a\ort-artifacts-staging\ort-artifacts-staging\onnxruntime\onnxruntime\core\session\provider_bridge_ort.cc:1426 onnxruntime::ProviderLibrary::Get [ONNXRuntimeError] : 1 : FAIL : LoadLibrary failed with error 126 "" when trying to load "C:\Users\XChuang233\Desktop\ddddocr-rust\ddddocr\ddddocr\target\debug\deps\onnxruntime_providers_cuda.dll"

stack backtrace:
   0:     0x7ff77fd59028 - std::backtrace_rs::backtrace::dbghelp64::trace
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\..\..\backtrace\src\backtrace\dbghelp64.rs:91
   1:     0x7ff77fd59028 - std::backtrace_rs::backtrace::trace_unsynchronized
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\..\..\backtrace\src\backtrace\mod.rs:66
   2:     0x7ff77fd59028 - std::sys_common::backtrace::_print_fmt
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\sys_common\backtrace.rs:68
   3:     0x7ff77fd59028 - std::sys_common::backtrace::_print::impl$0::fmt
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\sys_common\backtrace.rs:44
   4:     0x7ff77fd79629 - core::fmt::rt::Argument::fmt
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\core\src\fmt\rt.rs:165
   5:     0x7ff77fd79629 - core::fmt::write
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\core\src\fmt\mod.rs:1157
   6:     0x7ff77fd552a1 - std::io::Write::write_fmt<alloc::vec::Vec<u8,alloc::alloc::Global> >
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\io\mod.rs:1832
   7:     0x7ff77fd58e06 - std::sys_common::backtrace::print
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\sys_common\backtrace.rs:34
   8:     0x7ff77fd5b628 - std::panicking::default_hook::closure$1
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\panicking.rs:271
   9:     0x7ff77fd5b20d - std::panicking::default_hook
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\panicking.rs:295
  10:     0x7ff77f860fe2 - alloc::boxed::impl$50::call
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\alloc\src\boxed.rs:2036
  11:     0x7ff77f860fe2 - test::test_main::closure$0
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\test\src\lib.rs:137
  12:     0x7ff77fd5bb87 - alloc::boxed::impl$50::call
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\alloc\src\boxed.rs:2036
  13:     0x7ff77fd5bb87 - std::panicking::rust_panic_with_hook
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\panicking.rs:799
  14:     0x7ff77fd5ba17 - std::panicking::begin_panic_handler::closure$0
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\panicking.rs:664
  15:     0x7ff77fd5999f - std::sys_common::backtrace::__rust_end_short_backtrace<std::panicking::begin_panic_handler::closure_env$0,never$>
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\sys_common\backtrace.rs:171
  16:     0x7ff77fd5b6c8 - std::panicking::begin_panic_handler
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\panicking.rs:652
  17:     0x7ff77fd84534 - core::panicking::panic_fmt
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\core\src\panicking.rs:72
  18:     0x7ff77fd84a30 - core::result::unwrap_failed
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\core\src\result.rs:1654
  19:     0x7ff77f735ecd - enum2$<core::result::Result<ddddocr::Ddddocr,anyhow::Error> >::unwrap
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081\library\core\src\result.rs:1077
  20:     0x7ff77f735ecd - ddddocr::tests::cuda
                               at C:\Users\XChuang233\Desktop\ddddocr-rust\ddddocr\ddddocr\src\lib.rs:1564
  21:     0x7ff77f7a37c8 - ddddocr::tests::cuda::closure$0
                               at C:\Users\XChuang233\Desktop\ddddocr-rust\ddddocr\ddddocr\src\lib.rs:1563
  22:     0x7ff77f76ebc2 - core::ops::function::FnOnce::call_once<ddddocr::tests::cuda::closure_env$0,tuple$<> >
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081\library\core\src\ops\function.rs:250
  23:     0x7ff77f867a80 - core::ops::function::FnOnce::call_once
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\core\src\ops\function.rs:250
  24:     0x7ff77f867a80 - test::__rust_begin_short_backtrace<enum2$<core::result::Result<tuple$<>,alloc::string::String> >,enum2$<core::result::Result<tuple$<>,alloc::string::String> > (*)()>
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\test\src\lib.rs:623
  25:     0x7ff77f8669a2 - test::run_test::closure$0
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\test\src\lib.rs:569
  26:     0x7ff77f8254db - test::run_test::closure$1
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\test\src\lib.rs:597
  27:     0x7ff77f8254db - std::sys_common::backtrace::__rust_begin_short_backtrace<test::run_test::closure_env$1,tuple$<> >
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\sys_common\backtrace.rs:155
  28:     0x7ff77f82ae6d - std::thread::impl$0::spawn_unchecked_::closure$2::closure$0
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\thread\mod.rs:542
  29:     0x7ff77f82ae6d - core::panic::unwind_safe::impl$25::call_once
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\core\src\panic\unwind_safe.rs:272
  30:     0x7ff77f82ae6d - std::panicking::try::do_call
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\panicking.rs:559
  31:     0x7ff77f82ae6d - std::panicking::try
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\panicking.rs:523
  32:     0x7ff77f82ae6d - std::panic::catch_unwind
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\panic.rs:149
  33:     0x7ff77f82ae6d - std::thread::impl$0::spawn_unchecked_::closure$2
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\thread\mod.rs:541
  34:     0x7ff77f82ae6d - core::ops::function::FnOnce::call_once<std::thread::impl$0::spawn_unchecked_::closure_env$2<test::run_test::closure_env$1,tuple$<> >,tuple$<> >
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\core\src\ops\function.rs:250
  35:     0x7ff77fd66ead - alloc::boxed::impl$48::call_once
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\alloc\src\boxed.rs:2022
  36:     0x7ff77fd66ead - alloc::boxed::impl$48::call_once
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\alloc\src\boxed.rs:2022
  37:     0x7ff77fd66ead - std::sys::pal::windows::thread::impl$0::new::thread_start
                               at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library\std\src\sys\pal\windows\thread.rs:52
  38:     0x7ffbef1e7344 - BaseThreadInitThunk
  39:     0x7ffbef9dcc91 - RtlUserThreadStart

Environment:

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\extras\demo_suite>nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Wed_Apr_17_19:36:51_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.5, V12.5.40
Build cuda_12.5.r12.5/compiler.34177558_0

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\extras\demo_suite>deviceQuery.exe
deviceQuery.exe Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "NVIDIA GeForce GTX 1650"
  CUDA Driver Version / Runtime Version          12.5 / 12.5
  CUDA Capability Major/Minor version number:    7.5
  Total amount of global memory:                 4096 MBytes (4294508544 bytes)
  (14) Multiprocessors, ( 64) CUDA Cores/MP:     896 CUDA Cores
  GPU Max Clock rate:                            1515 MHz (1.51 GHz)
  Memory Clock rate:                             6001 Mhz
  Memory Bus Width:                              128-bit
  L2 Cache Size:                                 1048576 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               zu bytes
  Total amount of shared memory per block:       zu bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1024
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          zu bytes
  Texture alignment:                             zu bytes
  Concurrent copy and kernel execution:          Yes with 6 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
  Device supports Unified Addressing (UVA):      Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      No
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 12.5, CUDA Runtime Version = 12.5, NumDevs = 1, Device0 = NVIDIA GeForce GTX 1650
Result = PASS

C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.5\extras\demo_suite>bandwidthTest.exe
[CUDA Bandwidth Test] - Starting...
Running on...

 Device 0: NVIDIA GeForce GTX 1650
 Quick Mode

 Host to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     6486.0

 Device to Host Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     6369.7

 Device to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)        Bandwidth(MB/s)
   33554432                     157015.5

Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

Cargo.toml:

[features]
load-dynamic = ["ort/load-dynamic"]
cuda = ["ort/cuda"]
default = ["cuda"]

[dependencies]
ort = { git = "https://github.com/pykeio/ort.git", rev = "0407adb" }

My operating system is Windows 10 22H2, and I am using the compiled DLL.

86maid commented 3 months ago

Another issue is that `cargo test` cannot find the DLL: the automatically generated DLL is placed in `target\debug`, so it has to be copied manually to `target\debug\deps`.