SHI-Labs / Neighborhood-Attention-Transformer

Neighborhood Attention Transformer, arXiv 2022 / CVPR 2023. Dilated Neighborhood Attention Transformer, arXiv 2022.
MIT License

some problem during train #85

Closed guoguo1314 closed 1 year ago

guoguo1314 commented 1 year ago

When training with CUDA, the following error occurs:

```
RuntimeError:

#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)

typedef long long int int64_t;
typedef unsigned int uint32_t;
typedef signed char int8_t;
typedef unsigned char uint8_t;  // NOTE: this MUST be "unsigned char"! "char" is equivalent to "signed char"
typedef short int16_t;
static_assert(sizeof(int64_t) == 8, "expected size does not match");
static_assert(sizeof(uint32_t) == 4, "expected size does not match");
static_assert(sizeof(int8_t) == 1, "expected size does not match");
constexpr int num_threads = 128;
constexpr int thread_work_size = 4; // TODO: make template substitution once we decide where those vars live
constexpr int block_work_size = thread_work_size * num_threads;
//TODO use _assert_fail, because assert is disabled in non-debug builds
#define ERROR_UNSUPPORTED_CAST assert(false);

namespace std {

using ::signbit; using ::isfinite; using ::isinf; using ::isnan;
using ::abs;
using ::acos; using ::acosf; using ::asin; using ::asinf; using ::atan; using ::atanf; using ::atan2; using ::atan2f; using ::ceil; using ::ceilf; using ::cos; using ::cosf; using ::cosh; using ::coshf;
using ::exp; using ::expf;
using ::fabs; using ::fabsf; using ::floor; using ::floorf;
using ::fmod; using ::fmodf;
using ::frexp; using ::frexpf; using ::ldexp; using ::ldexpf;
using ::log; using ::logf;
using ::log10; using ::log10f; using ::modf; using ::modff;
using ::pow; using ::powf;
using ::sin; using ::sinf; using ::sinh; using ::sinhf;
using ::sqrt; using ::sqrtf; using ::tan; using ::tanf;
using ::tanh; using ::tanhf;
using ::acosh; using ::acoshf; using ::asinh; using ::asinhf; using ::atanh; using ::atanhf; using ::cbrt; using ::cbrtf;
using ::copysign; using ::copysignf;
using ::erf; using ::erff; using ::erfc; using ::erfcf; using ::exp2; using ::exp2f; using ::expm1; using ::expm1f; using ::fdim; using ::fdimf; using ::fmaf; using ::fma; using ::fmax; using ::fmaxf; using ::fmin; using ::fminf; using ::hypot; using ::hypotf; using ::ilogb; using ::ilogbf; using ::lgamma; using ::lgammaf; using ::llrint; using ::llrintf; using ::llround; using ::llroundf; using ::log1p; using ::log1pf; using ::log2; using ::log2f; using ::logb; using ::logbf; using ::lrint; using ::lrintf; using ::lround; using ::lroundf;
using ::nan; using ::nanf;
using ::nearbyint; using ::nearbyintf; using ::nextafter; using ::nextafterf; using ::remainder; using ::remainderf; using ::remquo; using ::remquof; using ::rint; using ::rintf; using ::round; using ::roundf; using ::scalbln; using ::scalblnf; using ::scalbn; using ::scalbnf; using ::tgamma; using ::tgammaf; using ::trunc; using ::truncf;

} // namespace std

// NB: Order matters for this macro; it is relied upon in
// _promoteTypesLookup and the serialization format.
// Note, some types have ctype as void because we don't support them in codegen
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
  _(uint8_t, Byte)                       /* 0 */  \
  _(int8_t, Char)                        /* 1 */  \
  _(int16_t, Short)                      /* 2 */  \
  _(int, Int)                            /* 3 */  \
  _(int64_t, Long)                       /* 4 */  \
  _(at::Half, Half)                      /* 5 */  \
  _(float, Float)                        /* 6 */  \
  _(double, Double)                      /* 7 */  \
  _(std::complex<at::Half>, ComplexHalf) /* 8 */  \
  _(std::complex<float>, ComplexFloat)   /* 9 */  \
  _(std::complex<double>, ComplexDouble) /* 10 */ \
  _(bool, Bool)                          /* 11 */ \
  _(void, QInt8)                         /* 12 */ \
  _(void, QUInt8)                        /* 13 */ \
  _(void, QInt32)                        /* 14 */ \
  _(at::BFloat16, BFloat16)              /* 15 */

#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(_) \
  _(uint8_t, Byte) \
  _(int8_t, Char) \
  _(int16_t, Short) \
  _(int, Int) \
  _(int64_t, Long) \
  _(at::Half, Half) \
  _(float, Float) \
  _(double, Double) \
  _(std::complex<at::Half>, ComplexHalf) \
  _(std::complex<float>, ComplexFloat) \
  _(std::complex<double>, ComplexDouble) \
  _(bool, Bool) \
  _(at::BFloat16, BFloat16)

enum class ScalarType : int8_t {
#define DEFINE_ENUM(_1, n) n,
  AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ENUM)
#undef DEFINE_ENUM
  Undefined,
  NumOptions
};

template <typename T, int size>
struct Array {
  T data[size];

  __device__ T operator[](int i) const { return data[i]; }
  __device__ T& operator[](int i) { return data[i]; }
  Array() = default;
  Array(const Array&) = default;
  Array& operator=(const Array&) = default;
  __device__ Array(T x) {
    for (int i = 0; i < size; i++) {
      data[i] = x;
    }
  }
};

template <typename scalar_t>
__device__ inline scalar_t load(char* base_ptr, uint32_t offset) {
  return *(reinterpret_cast<scalar_t*>(base_ptr) + offset);
}

template <typename scalar_t>
__device__ inline void store(scalar_t value, char* base_ptr, uint32_t offset) {
  *(reinterpret_cast<scalar_t*>(base_ptr) + offset) = value;
}

// aligned vector generates vectorized load/store on CUDA
template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
  scalar_t val[vec_size];
};

template <typename T>
T erfc_kernel(T a) { return erfc(a); }

// TODO: setup grid-stride loop
extern "C" __global__ void erfc_kernel_vectorized4_kernel(
    const int N,
    Array<char*, 1+1> data,
    float scalar_val) //[1+1],
{
  constexpr int vec_size = 4;
  int remaining = N - block_work_size * blockIdx.x;
  auto thread_idx = threadIdx.x;
  int idx = blockIdx.x;
  float arg0[4];

  float out0[4];

  if (remaining < block_work_size) {
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++){
      if (thread_idx >= remaining) {
        break;
      }
      int linear_idx = thread_idx + block_work_size * idx;
      arg0[j] = load<float>(data[1], linear_idx);

      thread_idx += num_threads;
    }
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      if ((threadIdx.x  + j*num_threads) < remaining) {
        out0[j] = erfc_kernel<float>(arg0[j] );
      }
    }
    thread_idx = threadIdx.x;
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      if (thread_idx >= remaining) {
          break;
      }
      int linear_idx = thread_idx + block_work_size * idx;
      store<float>(out0[j], data[0], linear_idx);

      thread_idx += num_threads;
    }
  } else {
    static constexpr int loop_size = thread_work_size / vec_size;

    //actual loading
    using vec_t_input = aligned_vector<float, vec_size>;
    vec_t_input* vec0 = reinterpret_cast<vec_t_input*>(data[0+1]) + block_work_size / vec_size * idx;

    #pragma unroll
    for (int i = 0; i<loop_size; i++){
      vec_t_input v;
      v = vec0[thread_idx];
      #pragma unroll
      for (int j=0; j < vec_size; j++){
        arg0[vec_size * i + j] = v.val[j];
      }

      thread_idx += num_threads;
    }

    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      out0[j] = erfc_kernel<float>(arg0[j] );
    }

    using vec_t_output = aligned_vector<float, vec_size>;
    vec_t_output* to_0 = reinterpret_cast<vec_t_output*>(data[0]) + block_work_size / vec_size * idx;

    int thread_idx = threadIdx.x;
    #pragma unroll
    for (int i = 0; i<loop_size; i++){
      vec_t_output v;
      #pragma unroll
      for (int j=0; j<vec_size; j++){
      v.val[j] = out0[vec_size * i + j];
      }
      to_0[thread_idx] = v;

      thread_idx += num_threads;
    }
  }

}

nvrtc: error: invalid value for --gpu-architecture (-arch)
```

Training on CPU works normally.

The installed versions are shown in the attached screenshot.

thanks a million!!!

alihassanijr commented 1 year ago

Hello and thank you for your interest.

Could you please share which version of NATTEN you're on, and confirm you are not adding any external modules to the code?

The issue does not appear to be related to this repository specifically.

guoguo1314 commented 1 year ago

Thank you very much! NATTEN is at version 0.14.6 (the other packages are shown in the screenshot above). Yes, I just integrated NATLayer into the model used in our research direction; it ran normally before the integration. After integrating it, this error occurred when training on a 4090ti.

alihassanijr commented 1 year ago

As far as I can tell the issue is that part of the code you're attempting to run tries to just-in-time compile something, and the arguments there are incorrect. That's why you're getting:

nvrtc: error: invalid value for --gpu-architecture (-arch)

Could you check whether a forward and backward pass into NATLayer works separately?


```python
# from .nat import NATLayer
import torch

nat_layer = NATLayer(dim=32, num_heads=4, kernel_size=7, dilation=1).cuda()
x = torch.randn((2, 14, 14, 32)).cuda()  # channel (last) dim must equal dim=32

x_out = nat_layer(x)
x_out_sum = x_out.sum()

x_out_sum.backward()
```

stevenwalton commented 1 year ago

@alihassanijr is the natten version correct? He has 0.14.6+torch1120cu113 but his CUDA version is 11.7 (mine ends in `torch1130cu117`, i.e. PyTorch 1.13 with CUDA 11.7). I just want to verify, because the simple solution could be reinstalling PyTorch and ensuring the versions are correct.

this mistake was made by training with 4090ti.

Can you also verify this? The 4090Ti is not publicly released.
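
For reference, a minimal sketch of how to check which combination is actually installed, assuming standard pip installs of torch and natten and that `natten` exposes a `__version__` attribute:

```python
import torch
import natten

# PyTorch build and the CUDA toolkit it was compiled against
print(torch.__version__)               # e.g. 1.12.0+cu113
print(torch.version.cuda)              # e.g. 11.3
print(torch.cuda.get_device_name(0))   # the actual GPU, e.g. NVIDIA GeForce RTX 4090

# NATTEN build; its +torchXXXXcuYYY suffix should match the two values above
print(natten.__version__)              # e.g. 0.14.6+torch1120cu113
```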

guoguo1314 commented 1 year ago

As far as I can tell the issue is that part of the code you're attempting to run tries to just-in-time compile something, and the arguments there are incorrect. That's why you're getting:

nvrtc: error: invalid value for --gpu-architecture (-arch)

Could you check whether a forward and backward pass into NATLayer works separately?

```python
# from .nat import NATLayer
import torch

nat_layer = NATLayer(dim=32, num_heads=4, kernel_size=7, dilation=1).cuda()
x = torch.randn((2, 14, 14, 32)).cuda()  # channel (last) dim must equal dim=32

x_out = nat_layer(x)
x_out_sum = x_out.sum()

x_out_sum.backward()
```

OK, I can try, thanks!

guoguo1314 commented 1 year ago

@alihassanijr is the natten version correct? He has 0.14.6+torch1120cu113 but his CUDA version is 11.7 (mine ends in `torch1130cu117`, i.e. PyTorch 1.13 with CUDA 11.7). I just want to verify, because the simple solution could be reinstalling PyTorch and ensuring the versions are correct.

this mistake was made by training with 4090ti.

Can you also verify this? The 4090Ti is not publicly released.

I said it wrong, it's a 4090. As for the package issue: I am installing a specific package for our research direction, which automatically installs torch and CUDA packages such as nvidia-cuda-cupti-cu11, so I cannot pin the torch version. But I wanted to use NATTEN again; the CUDA version I installed is 11.3, and according to the table on the PyTorch official website CUDA 11.3 supports torch 1.12.0, so I installed torch 1.12.0 to override the torch that had been installed automatically. Maybe the problem is there, but I haven't found a good way around it. Oh, it occurs to me that I could replace the CUDA 11.3 packages with the automatically downloaded CUDA 11.7 ones. I am running other code these days, so I can try this method in a few days. Thanks.

alihassanijr commented 1 year ago

Just to be clear, there is a difference between the CUDA driver/compiler version you have installed, and the one you'd see in your PyTorch version. The cuda version suffix in your PyTorch version (if you installed via pip) indicates which version of CUDA was used to compile the PyTorch binaries you're downloading.

Your local CUDA compiler can be on a different version compared to your PyTorch (though not usually a good idea), but as long as the version of NATTEN you install exactly matches your torch AND the version of CUDA it was compiled with, everything should work out.

But again, to be very clear, the error you shared indicates that something in your code is trying to compile CUDA code, and it fails because of an argument mismatch with your compiler, which is probably because of the mismatch between your CUDA version and PyTorch build. However, neither of those are related to NATTEN, since you already have it installed. Therefore, the issue here is completely unrelated to this project.
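
For reference, a minimal sketch of a check that exercises the same JIT path without NATTEN, assuming `erfc` on CUDA is compiled through NVRTC at runtime as the trace above suggests; if this also fails with the same nvrtc error, the problem is in the torch/CUDA installation rather than in NATTEN:

```python
import torch

# erfc on CUDA is JIT-compiled with NVRTC in this setup (see the
# erfc_kernel_vectorized4_kernel source in the error above), so this should
# hit the same --gpu-architecture failure if a build mismatch is the cause.
x = torch.randn(1024, device="cuda")
y = torch.erfc(x)
torch.cuda.synchronize()
print(y.sum().item())
```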

guoguo1314 commented 1 year ago

OK, I see, thank you!!!

alihassanijr commented 1 year ago

I'm closing the issue now, but feel free to reopen if you still have questions related to NA/NATTEN.