dfalbel / torch

torch from R!
http://dfalbel.github.io/torch
Other
50 stars 5 forks source link

CUDA backend #19

Closed dselivanov closed 5 years ago

dselivanov commented 5 years ago

I've added these lines to device.hpp and trying to allocate tensor on GPU.

library(torch)
x <- matrix(runif(100), ncol = 2)
x_t <- tensor(x, device = "CUDA")

This throws

terminate called after throwing an instance of 'std::bad_alloc' what(): std::bad_alloc Aborted

Any ideas?

dfalbel commented 5 years ago

No idea.

I am having the same issue. I added a function:

// [[Rcpp::export]]
// Exposes torch::cuda::is_available() to R: TRUE when libtorch can see a
// usable CUDA runtime, FALSE otherwise.
bool tch_cuda_available () {
  const bool cuda_found = torch::cuda::is_available();
  return cuda_found;
}

And it returns FALSE. So maybe torch is not finding cuda yet.

dfalbel commented 5 years ago

This is the CMake file they use to find torch:

# FindTorch
# -------
#
# Finds the Torch library
#
# This will define the following variables:
#
#   TORCH_FOUND        -- True if the system has the Torch library
#   TORCH_INCLUDE_DIRS -- The include directories for torch
#   TORCH_LIBRARIES    -- Libraries to link against
#   TORCH_CXX_FLAGS    -- Additional (required) compiler flags
#
# and the following imported targets:
#
#   torch

include(FindPackageHandleStandardArgs)

# Honour an explicit install prefix from the environment when provided.
if (DEFINED ENV{TORCH_INSTALL_PREFIX})
  set(TORCH_INSTALL_PREFIX $ENV{TORCH_INSTALL_PREFIX})
else()
  # Assume we are in <install-prefix>/share/cmake/Torch/TorchConfig.cmake
  # Fix: the original recomputed CMAKE_CURRENT_LIST_DIR with
  # get_filename_component(), clobbering the builtin variable of the same
  # name (provided by CMake since 2.8.3). Use the builtin directly.
  get_filename_component(TORCH_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE)
endif()

# Include directories. Some libtorch layouts keep headers under
# <prefix>/lib/include, others under <prefix>/include — probe for the
# former and fall back to the latter.
if (EXISTS "${TORCH_INSTALL_PREFIX}/lib/include")
  set(TORCH_INCLUDE_DIRS
    "${TORCH_INSTALL_PREFIX}/lib/include"
    "${TORCH_INSTALL_PREFIX}/lib/include/torch/csrc/api/include")
else()
  set(TORCH_INCLUDE_DIRS
    "${TORCH_INSTALL_PREFIX}/include"
    "${TORCH_INSTALL_PREFIX}/include/torch/csrc/api/include")
endif()

# Library dependencies.
find_package(Caffe2 REQUIRED PATHS "${CMAKE_CURRENT_LIST_DIR}/../Caffe2")

# Locate the core torch shared library and expose it as an imported target;
# TORCH_LIBRARIES is the convenience list consumers link against.
find_library(TORCH_LIBRARY torch PATHS "${TORCH_INSTALL_PREFIX}/lib")
add_library(torch UNKNOWN IMPORTED)
set(TORCH_LIBRARIES torch ${Caffe2_MAIN_LIBS})

if (1)
  # NOTE(review): the literal "1" looks like the configure-time value of a
  # @USE_CUDA@ template placeholder — confirm against the upstream
  # TorchConfig.cmake.in. Everything in this branch is CUDA-only wiring.
  if(MSVC)
    # Default NvToolsExt location; the environment may override it below.
    set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt")
    # Fix: the original `if ($ENV{NVTOOLEXT_HOME})` expands the value in
    # place — when the variable is unset this becomes `if()` (a parse
    # error), and when set to a space-containing path it splits into
    # multiple arguments. Test for the variable's existence instead.
    if (DEFINED ENV{NVTOOLEXT_HOME})
      set(NVTOOLEXT_HOME $ENV{NVTOOLEXT_HOME})
    endif()
    set(TORCH_CUDA_LIBRARIES
      ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib
      ${CUDA_LIBRARIES})
    list(APPEND TORCH_INCLUDE_DIRS ${NVTOOLEXT_HOME}/include)
  elseif(APPLE)
    set(TORCH_CUDA_LIBRARIES
      ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib
      ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib
      ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib
      ${CUDA_LIBRARIES})
  else()
    # NOTE(review): CUDA_CUDA_LIB / CUDA_NVRTC_LIB / CUDA_TOOLKIT_ROOT_DIR
    # are not set anywhere in this file — presumably the Caffe2 package
    # found above provides them; verify.
    set(TORCH_CUDA_LIBRARIES
      ${CUDA_CUDA_LIB}
      ${CUDA_NVRTC_LIB}
      ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so
      ${CUDA_LIBRARIES})
  endif()
  list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES})
endif()

# When we build libtorch with the old GCC ABI, dependent libraries must too.
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
  set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=0")
endif()

# Attach location and usage requirements to the imported target.
set_property(TARGET torch PROPERTY IMPORTED_LOCATION "${TORCH_LIBRARY}")
set_property(TARGET torch PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${TORCH_INCLUDE_DIRS}")
# NOTE(review): CXX_STANDARD is not an INTERFACE_ property, so it does not
# propagate from an imported target to consumers — kept as-is to preserve
# the original behavior.
set_property(TARGET torch PROPERTY CXX_STANDARD 11)
if (TORCH_CXX_FLAGS)
  set_property(TARGET torch PROPERTY INTERFACE_COMPILE_OPTIONS "${TORCH_CXX_FLAGS}")
endif()

find_package_handle_standard_args(torch DEFAULT_MSG TORCH_LIBRARY TORCH_INCLUDE_DIRS)

We should probably find which libraries they link with. I think the relevant things are:

${CUDA_CUDA_LIB}
${CUDA_NVRTC_LIB}
${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvToolsExt.so
${CUDA_LIBRARIES})
dfalbel commented 5 years ago

I tried compiling a simple example cpp with GPU support and had no success too.

I followed the steps here: https://pytorch.org/cppdocs/installing.html (with GPU download link).

And the app source code was:

#include <torch/torch.h>
#include <iostream>

int main() {
  torch::Tensor tensor = torch::randn({3, 4}, torch::dtype(torch::kFloat32).device(torch::kCUDA, 1).requires_grad(true));
  std::cout << tensor << std::endl;
}

This compiles fine, but has a runtime error:

dani@dfalbel-System-Product-Name:~/torch2/example-app/build$ ./example-app
terminate called after throwing an instance of 'c10::Error'
  what():  p ASSERT FAILED at /home/dani/libtorch/include/c10/impl/DeviceGuardImplInterface.h:125, please report a bug to PyTorch. DeviceGuardImpl for cuda is not available (getDeviceGuardImpl at /home/dani/libtorch/include/c10/impl/DeviceGuardImplInterface.h:125)

I did not find any post of people having trouble when compiling cpp programs with GPU support with torch. Let me know if you can compile at least a simple program. Maybe we should open an issue in pytorch.

Also, it will work only with cuda9.0 and cudnn > 7. I tested with cuda9.2 and got strange errors.

dfalbel commented 5 years ago

It just worked for me on your branch.

> x <- torch::tensor(c(1,2), device = "CUDA")
> x
tensor 
 1
 2
[ Variable[CUDAFloatType]{2} ]

The problem was that I was mistakenly using the CPU lib. This happens if you try installing the CPU version first and then install the GPU version. Since the configure script assumes that libtorch is already downloaded, it does not download the right version.

Maybe the default TORCH_HOME when TORCH_BACKEND="CUDA" should be $HOME/libtorch-cuda to avoid this.

Also, it will work only with cuda9.0 and cudnn > 7. I tested with cuda9.2 and got strange errors.

dselivanov commented 5 years ago

I didn't have cudnn installed; maybe this is the root of the problem — will try.

dselivanov commented 5 years ago

I've installed cudnn and can confirm that now it works fine.