modestyachts / neural_kernels_code

36 stars 5 forks source link

RuntimeError: Could not compile function #1

Open Haoxiang-Wang opened 4 years ago

Haoxiang-Wang commented 4 years ago

I am using your Docker setup to run on an Ubuntu 18.04 workstation with 4x 2080 Ti GPUs, and I eventually hit the following error. Could you give me a hint on how to resolve this issue?

Current Count Is:  1084000
Current Count Is:  1085000
Data_q size start 1085764
  0%|                               | 0/2500000000 [00:00<?, ?it/s]Context already set..
Context already set..
Context already set..
Context already set..
Layer KWARGS: [{}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {'precision': 'float64'}, {'store_norm': False, 'precision': 'float64'}, {'precision': 'float64', 'store_norm': True}, {'store_norm': False, 'precision': 'float64'}]
Layer KWARGS: [{}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {'precision': 'float64'}, {'store_norm': False, 'precision': 'float64'}, {'precision': 'float64', 'store_norm': True}, {'store_norm': False, 'precision': 'float64'}]
Layer KWARGS: [{}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {'precision': 'float64'}, {'store_norm': False, 'precision': 'float64'}, {'precision': 'float64', 'store_norm': True}, {'store_norm': False, 'precision': 'float64'}]
Layer KWARGS: [{}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {}, {'store_norm': False}, {'store_norm': False}, {'precision': 'float64'}, {'store_norm': False, 'precision': 'float64'}, {'precision': 'float64', 'store_norm': True}, {'store_norm': False, 'precision': 'float64'}]
TC "conv3_input" was not explicitly compiled for inputs of sizes:
  torch.Size([8, 30, 30, 1]) torch.Size([8, 30, 30, 1])
....Generate implicit MappingOptions
TC "conv3_input" was not explicitly compiled for inputs of sizes:
  torch.Size([8, 30, 30, 1]) torch.Size([8, 30, 30, 1])
....Generate implicit MappingOptions
TC "conv3_input" was not explicitly compiled for inputs of sizes:
  torch.Size([8, 30, 30, 1]) torch.Size([8, 30, 30, 1])
....Generate implicit MappingOptions
TC "conv3_input" was not explicitly compiled for inputs of sizes:
  torch.Size([8, 30, 30, 1]) torch.Size([8, 30, 30, 1])
....Generate implicit MappingOptions
E0618 20:39:27.390336    39] Compilation failure for nvrtc(NVRTC_ERROR_INVALID_OPTION):
nvrtc: error: invalid value for --gpu-architecture (-arch)
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
#define if_then_else(cond,a,b) ((cond) ? (a) : (b))

#ifndef __CUDACC_RTC__
// Can't include system dependencies with NVRTC
// Can't include cuda_fp16.h with NVRTC due to transitive system dependencies
#include <cuda_fp16.h>

#define inff __int_as_float(0x7f800000)
#define inf __longlong_as_double(0x7ff0000000000000LL)

// Before CUDA 9, syncwarp is a noop since warps are always synchronized.
#if (!defined(__clang__) && __CUDACC_VER_MAJOR__ < 9) || \
    ( defined(__clang__) && CUDA_VERSION < 9000)
inline __device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {}

extern "C" {
__global__ void conv3_input_8_30_1_30(int B, int H, int P, int W, float* pconv_output, const float* pX, const float* pY) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float (*conv_output)[8][(30 + -2)][(30 + -2)][(30 + -2)][(30 + -2)] = reinterpret_cast<float (*)[8][(30 + -2)][(30 + -2)][(30 + -2)][(30 + -2)]>(pconv_output);
  const float (*X)[30][30][1] = reinterpret_cast<const float (*)[30][30][1]>(pX);
  const float (*Y)[30][30][1] = reinterpret_cast<const float (*)[30][30][1]>(pY);
  for (int c7 = 0; c7 <= 7; c7 += 1) {
    for (int c8 = 0; c8 <= 7; c8 += 1) {
      for (int c9 = 0; c9 <= 27; c9 += 1) {
        for (int c10 = 0; c10 <= 27; c10 += 1) {
          for (int c11 = t1; c11 <= 27; c11 += 8) {
            conv_output[c7][c8][c9][c10][c11][t0] = (float)0.000000;
            for (int c13 = 0; c13 <= 2; c13 += 1) {
              for (int c14 = 0; c14 <= 2; c14 += 1) {
                conv_output[c7][c8][c9][c10][c11][t0] = (conv_output[c7][c8][c9][c10][c11][t0] + (X[c7][(c9 + c13)][(c10 + c14)][0]*Y[c8][(c11 + c13)][(t0 + c14)][0]));
Process Process-4:
Traceback (most recent call last):
  File "/root/conda/lib/python3.6/multiprocessing/", line 258, in _bootstrap
  File "/root/conda/lib/python3.6/multiprocessing/", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/neural_kernels_code/", line 351, in _kernel_gen_help
    kx = dnet.forward(x_b, y_b, gpu=gpu_idx, pp_net=net).cpu().numpy().squeeze()
  File "/neural_kernels_code/", line 127, in forward
    prev_norm = self.layers[0](x_b, x_b, **self.kwargs_list[0])
  File "/neural_kernels_code/", line 434, in conv3zp_input
    return res.conv3_input(x,y)/(3*3)
  File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/", line 348, in fun
    tc_def_name, *inputs, outputs=outputs, unchecked=unchecked)
  File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/", line 398, in __call__
    implicit_compile(self, entry_point, *inputs)
  File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/", line 392, in implicit_compile
    entry_point, inputs, mapping_options)
RuntimeError: Could not compile function
E0618 20:39:27.467238    37] Compilation failure for nvrtc(NVRTC_ERROR_INVALID_OPTION):
nvrtc: error: invalid value for --gpu-architecture (-arch)
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
#define if_then_else(cond,a,b) ((cond) ? (a) : (b))

#ifndef __CUDACC_RTC__
// Can't include system dependencies with NVRTC
// Can't include cuda_fp16.h with NVRTC due to transitive system dependencies
#include <cuda_fp16.h>

#define inff __int_as_float(0x7f800000)
#define inf __longlong_as_double(0x7ff0000000000000LL)

// Before CUDA 9, syncwarp is a noop since warps are always synchronized.
#if (!defined(__clang__) && __CUDACC_VER_MAJOR__ < 9) || \
    ( defined(__clang__) && CUDA_VERSION < 9000)
inline __device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {}

extern "C" {
__global__ void conv3_input_8_30_1_30(int B, int H, int P, int W, float* pconv_output, const float* pX, const float* pY) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float (*conv_output)[8][(30 + -2)][(30 + -2)][(30 + -2)][(30 + -2)] = reinterpret_cast<float (*)[8][(30 + -2)][(30 + -2)][(30 + -2)][(30 + -2)]>(pconv_output);
  const float (*X)[30][30][1] = reinterpret_cast<const float (*)[30][30][1]>(pX);
  const float (*Y)[30][30][1] = reinterpret_cast<const float (*)[30][30][1]>(pY);
  for (int c7 = 0; c7 <= 7; c7 += 1) {
    for (int c8 = 0; c8 <= 7; c8 += 1) {
      for (int c9 = 0; c9 <= 27; c9 += 1) {
        for (int c10 = 0; c10 <= 27; c10 += 1) {
          for (int c11 = t1; c11 <= 27; c11 += 8) {
            conv_output[c7][c8][c9][c10][c11][t0] = (float)0.000000;
            for (int c13 = 0; c13 <= 2; c13 += 1) {
              for (int c14 = 0; c14 <= 2; c14 += 1) {
                conv_output[c7][c8][c9][c10][c11][t0] = (conv_output[c7][c8][c9][c10][c11][t0] + (X[c7][(c9 + c13)][(c10 + c14)][0]*Y[c8][(c11 + c13)][(t0 + c14)][0]));
Process Process-2:
Traceback (most recent call last):
  File "/root/conda/lib/python3.6/multiprocessing/", line 258, in _bootstrap
  File "/root/conda/lib/python3.6/multiprocessing/", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/neural_kernels_code/", line 351, in _kernel_gen_help
    kx = dnet.forward(x_b, y_b, gpu=gpu_idx, pp_net=net).cpu().numpy().squeeze()
  File "/neural_kernels_code/", line 127, in forward
    prev_norm = self.layers[0](x_b, x_b, **self.kwargs_list[0])
  File "/neural_kernels_code/", line 434, in conv3zp_input
    return res.conv3_input(x,y)/(3*3)
  File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/", line 348, in fun
    tc_def_name, *inputs, outputs=outputs, unchecked=unchecked)
  File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/", line 398, in __call__
    implicit_compile(self, entry_point, *inputs)
  File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/", line 392, in implicit_compile
    entry_point, inputs, mapping_options)
RuntimeError: Could not compile function
E0618 20:39:27.564579    36] Compilation failure for nvrtc(NVRTC_ERROR_INVALID_OPTION):
nvrtc: error: invalid value for --gpu-architecture (-arch)
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
#define if_then_else(cond,a,b) ((cond) ? (a) : (b))

#ifndef __CUDACC_RTC__
// Can't include system dependencies with NVRTC
// Can't include cuda_fp16.h with NVRTC due to transitive system dependencies
#include <cuda_fp16.h>

#define inff __int_as_float(0x7f800000)
#define inf __longlong_as_double(0x7ff0000000000000LL)

// Before CUDA 9, syncwarp is a noop since warps are always synchronized.
#if (!defined(__clang__) && __CUDACC_VER_MAJOR__ < 9) || \
    ( defined(__clang__) && CUDA_VERSION < 9000)
inline __device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {}

extern "C" {
__global__ void conv3_input_8_30_1_30(int B, int H, int P, int W, float* pconv_output, const float* pX, const float* pY) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float (*conv_output)[8][(30 + -2)][(30 + -2)][(30 + -2)][(30 + -2)] = reinterpret_cast<float (*)[8][(30 + -2)][(30 + -2)][(30 + -2)][(30 + -2)]>(pconv_output);
  const float (*X)[30][30][1] = reinterpret_cast<const float (*)[30][30][1]>(pX);
  const float (*Y)[30][30][1] = reinterpret_cast<const float (*)[30][30][1]>(pY);
  for (int c7 = 0; c7 <= 7; c7 += 1) {
    for (int c8 = 0; c8 <= 7; c8 += 1) {
      for (int c9 = 0; c9 <= 27; c9 += 1) {
        for (int c10 = 0; c10 <= 27; c10 += 1) {
          for (int c11 = t1; c11 <= 27; c11 += 8) {
            conv_output[c7][c8][c9][c10][c11][t0] = (float)0.000000;
            for (int c13 = 0; c13 <= 2; c13 += 1) {
              for (int c14 = 0; c14 <= 2; c14 += 1) {
                conv_output[c7][c8][c9][c10][c11][t0] = (conv_output[c7][c8][c9][c10][c11][t0] + (X[c7][(c9 + c13)][(c10 + c14)][0]*Y[c8][(c11 + c13)][(t0 + c14)][0]));
Process Process-1:
Traceback (most recent call last):
  File "/root/conda/lib/python3.6/multiprocessing/", line 258, in _bootstrap
  File "/root/conda/lib/python3.6/multiprocessing/", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/neural_kernels_code/", line 351, in _kernel_gen_help
    kx = dnet.forward(x_b, y_b, gpu=gpu_idx, pp_net=net).cpu().numpy().squeeze()
  File "/neural_kernels_code/", line 127, in forward
    prev_norm = self.layers[0](x_b, x_b, **self.kwargs_list[0])
  File "/neural_kernels_code/", line 434, in conv3zp_input
    return res.conv3_input(x,y)/(3*3)
  File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/", line 348, in fun
    tc_def_name, *inputs, outputs=outputs, unchecked=unchecked)
  File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/", line 398, in __call__
    implicit_compile(self, entry_point, *inputs)
  File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/", line 392, in implicit_compile
    entry_point, inputs, mapping_options)
RuntimeError: Could not compile function
E0618 20:39:27.591982    38] Compilation failure for nvrtc(NVRTC_ERROR_INVALID_OPTION):
nvrtc: error: invalid value for --gpu-architecture (-arch)
template<typename T> inline __device__ T floord(T n, T d) {
  return n < 0 ? - (-n + d - 1)/d : n / d;
#define if_then_else(cond,a,b) ((cond) ? (a) : (b))

#ifndef __CUDACC_RTC__
// Can't include system dependencies with NVRTC
// Can't include cuda_fp16.h with NVRTC due to transitive system dependencies
#include <cuda_fp16.h>

#define inff __int_as_float(0x7f800000)
#define inf __longlong_as_double(0x7ff0000000000000LL)

// Before CUDA 9, syncwarp is a noop since warps are always synchronized.
#if (!defined(__clang__) && __CUDACC_VER_MAJOR__ < 9) || \
    ( defined(__clang__) && CUDA_VERSION < 9000)
inline __device__ void __syncwarp(unsigned mask = 0xFFFFFFFF) {}

extern "C" {
__global__ void conv3_input_8_30_1_30(int B, int H, int P, int W, float* pconv_output, const float* pX, const float* pY) {
  int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
  int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
  float (*conv_output)[8][(30 + -2)][(30 + -2)][(30 + -2)][(30 + -2)] = reinterpret_cast<float (*)[8][(30 + -2)][(30 + -2)][(30 + -2)][(30 + -2)]>(pconv_output);
  const float (*X)[30][30][1] = reinterpret_cast<const float (*)[30][30][1]>(pX);
  const float (*Y)[30][30][1] = reinterpret_cast<const float (*)[30][30][1]>(pY);
  for (int c7 = 0; c7 <= 7; c7 += 1) {
    for (int c8 = 0; c8 <= 7; c8 += 1) {
      for (int c9 = 0; c9 <= 27; c9 += 1) {
        for (int c10 = 0; c10 <= 27; c10 += 1) {
          for (int c11 = t1; c11 <= 27; c11 += 8) {
            conv_output[c7][c8][c9][c10][c11][t0] = (float)0.000000;
            for (int c13 = 0; c13 <= 2; c13 += 1) {
              for (int c14 = 0; c14 <= 2; c14 += 1) {
                conv_output[c7][c8][c9][c10][c11][t0] = (conv_output[c7][c8][c9][c10][c11][t0] + (X[c7][(c9 + c13)][(c10 + c14)][0]*Y[c8][(c11 + c13)][(t0 + c14)][0]));
Process Process-3:
Traceback (most recent call last):
  File "/root/conda/lib/python3.6/multiprocessing/", line 258, in _bootstrap
  File "/root/conda/lib/python3.6/multiprocessing/", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/neural_kernels_code/", line 351, in _kernel_gen_help
    kx = dnet.forward(x_b, y_b, gpu=gpu_idx, pp_net=net).cpu().numpy().squeeze()
  File "/neural_kernels_code/", line 127, in forward
    prev_norm = self.layers[0](x_b, x_b, **self.kwargs_list[0])
  File "/neural_kernels_code/", line 434, in conv3zp_input
    return res.conv3_input(x,y)/(3*3)
  File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/", line 348, in fun
    tc_def_name, *inputs, outputs=outputs, unchecked=unchecked)
  File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/", line 398, in __call__
    implicit_compile(self, entry_point, *inputs)
  File "/root/conda/lib/python3.6/site-packages/tensor_comprehensions/", line 392, in implicit_compile
    entry_point, inputs, mapping_options)
RuntimeError: Could not compile function

After I interrupt it with Ctrl+C, the following traceback is printed:

^CTraceback (most recent call last):
  File "", line 156, in <module>
  File "", line 51, in main
    K_train, K_test = generate_kernels(cfg, X_train, X_test)
  File "", line 101, in generate_kernels
    K_train = kernel_gen.generate_kernel_parallel(cfg.KERNEL, X_train, X_train, num_gpus=cfg.SYSTEM.NUM_GPUS, symmetric=True, batch_size=cfg.SYSTEM.BATCH_SIZE, cache_path=cfg.SYSTEM.CACHE_PATH, float32=cfg.SYSTEM.FLOAT_32, extra_info={"kernel_type": "Train"})
  File "/neural_kernels_code/", line 438, in generate_kernel_parallel
    progress = done_q.get()
  File "/root/conda/lib/python3.6/multiprocessing/", line 94, in get
    res = self._recv_bytes()
  File "/root/conda/lib/python3.6/multiprocessing/", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/root/conda/lib/python3.6/multiprocessing/", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/root/conda/lib/python3.6/multiprocessing/", line 379, in _recv
    chunk = read(handle, remaining)
^CError in atexit._run_exitfuncs:
Traceback (most recent call last):
  File "/root/conda/lib/python3.6/multiprocessing/", line 262, in _run_finalizers
  File "/root/conda/lib/python3.6/multiprocessing/", line 186, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/root/conda/lib/python3.6/multiprocessing/", line 191, in _finalize_join
  File "/root/conda/lib/python3.6/", line 1056, in join
  File "/root/conda/lib/python3.6/", line 1072, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
Vaishaal commented 4 years ago

Ah, I am pretty sure the code only works on V100 or Titan V GPUs right now.

I am working on a JAX-based implementation that supports more GPU types, but alas, it is not out yet.

Haoxiang-Wang commented 4 years ago

@Vaishaal Yes, a V100 can indeed run the code. But I found that your generate_kernel_parallel function broke on a new (MNIST-style) dataset on a single GPU, while the single-threaded generate_kernel works fine. So I suggest making the single-threaded one the default for the single-GPU case.

Vaishaal commented 4 years ago

Noted! What was the exception? That should not happen. I can try to fix it.