LaurentMazare / tch-rs

Rust bindings for the C++ api of PyTorch.
Apache License 2.0
4.28k stars 340 forks source link

CUDA error: driver shutting down #741

Closed arthur19q3 closed 1 year ago

arthur19q3 commented 1 year ago

I am trying to run this gradient descent test:

use tch::nn::{Module, OptimizerConfig};
use tch::{kind, nn, Device, Tensor};

/// Builds a tiny module computing `x * x1 + exp(x) * x2`, where `x1` and
/// `x2` are trainable parameter vectors of length `dim` registered under
/// the given variable-store path, initialized to zero.
fn my_module(p: nn::Path, dim: i64) -> impl nn::Module {
    let linear_weight = p.zeros("x1", &[dim]);
    let exp_weight = p.zeros("x2", &[dim]);
    // Closure captures the parameters by move; gradients flow through them.
    nn::func(move |xs| xs * &linear_weight + xs.exp() * &exp_weight)
}

/// Runs a short SGD training loop on CPU against all-zero dummy data.
fn gradient_descent() {
    let vs = nn::VarStore::new(Device::Cpu);
    let model = my_module(vs.root(), 7);
    let mut optimizer = nn::Sgd::default().build(&vs, 1e-2).unwrap();
    // 49 optimization steps (loop index is unused).
    for _step in 0..49 {
        // Dummy mini-batches made of zeros.
        let inputs = Tensor::zeros(&[7], kind::FLOAT_CPU);
        let targets = Tensor::zeros(&[7], kind::FLOAT_CPU);
        // Sum-of-squared-errors loss.
        let squared_err = (model.forward(&inputs) - targets).pow_tensor_scalar(2);
        let loss = squared_err.sum(kind::Kind::Float);
        optimizer.backward_step(&loss);
    }
}

/// Entry point: runs the gradient-descent example once.
fn main() {
    gradient_descent()
}

but somehow failed:


terminate called after throwing an instance of 'c10::Error'
  what():  CUDA error: driver shutting down
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Exception raised from c10_cuda_check_implementation at ../c10/cuda/CUDAException.cpp:44 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x6b (0x7fd26ce5a6bb in /usr/lib/libtorch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xbf (0x7fd26ce555ef in /usr/lib/libtorch/lib/libc10.so)
frame #2: c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool) + 0x58f (0x7fd26c849f5f in /usr/lib/libtorch/lib/libc10_cuda.so)
frame #3: <unknown function> + 0x20b2a (0x7fd26c820b2a in /usr/lib/libtorch/lib/libc10_cuda.so)
frame #4: <unknown function> + 0x45c895a (0x7fd2717c895a in /usr/lib/libtorch/lib/libtorch_cpu.so)
frame #5: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x3a (0x7fd2717c963a in /usr/lib/libtorch/lib/libtorch_cpu.so)
frame #6: <unknown function> + 0xe1943 (0x7fd2dc0e1943 in /usr/lib/libstdc++.so.6)
frame #7: <unknown function> + 0x8744b (0x7fd26cc9d44b in /usr/lib/libc.so.6)
frame #8: <unknown function> + 0x10ae40 (0x7fd26cd20e40 in /usr/lib/libc.so.6)
Process finished with exit code 134 (interrupted by signal 6: SIGABRT)

I am using CUDA 12.1 (the only version available in the Arch Linux repo at the moment) and have installed cuDNN.

nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Mon_Apr__3_17:16:06_PDT_2023
Cuda compilation tools, release 12.1, V12.1.105
Build cuda_12.1.r12.1/compiler.32688072_0

LD_LIBRARY_PATH is also set according to where CUDA and libtorch are installed:

LD_LIBRARY_PATH=/usr/lib/libtorch/lib:/opt/cuda/targets/x86_64-linux/lib/:$LD_LIBRARY_PATH;

arthur19q3 commented 1 year ago

Fixed by changing the `Device` and the tensor-options (kind/device) parameters.


/// Runs a short SGD training loop on all-zero dummy data.
///
/// Uses the GPU when one is available, and falls back to CPU otherwise —
/// hard-coding `Device::Cuda(0)` would abort on machines without a working
/// CUDA driver. Tensors are created with the same device as the `VarStore`
/// so forward/backward never mixes devices.
fn gradient_descent() {
    let device = Device::cuda_if_available();
    let vs = nn::VarStore::new(device);
    let my_module = my_module(vs.root(), 7);
    let mut opt = nn::Sgd::default().build(&vs, 1e-2).unwrap();
    for _idx in 1..50 {
        // Dummy mini-batches made of zeros, placed on `device` via a
        // (Kind, Device) options tuple instead of the CUDA-only constant.
        let xs = Tensor::zeros(&[7], (kind::Kind::Float, device));
        let ys = Tensor::zeros(&[7], (kind::Kind::Float, device));
        // Sum-of-squared-errors loss.
        let loss = (my_module.forward(&xs) - ys).pow_tensor_scalar(2).sum(kind::Kind::Float);
        opt.backward_step(&loss);
    }
}

/// Program entry point — kicks off the training example.
fn main() {
    gradient_descent()
}