Pandarenlql opened this issue 1 year ago
Update: I have hit this `nvrtc: error: invalid value for --gpu-architecture (-arch)` error many times, and I think it comes from the `Tensor.prod` op. Changing line 25 of test.py to `level_start_index = torch.cat((shapes.new_zeros((1, )), (shapes[:, 0] * shapes[:, 1]).cumsum(0)[:-1]))` works well for me.
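For clarity, here is a minimal, self-contained sketch of that substitution. The `shapes` values below are made up for illustration; only the replacement of `shapes.prod(1)` with an explicit column product mirrors the change to test.py:

```python
import torch

# Hypothetical (height, width) pairs standing in for the `shapes` tensor
# that test.py builds; in the real script these come from the feature maps.
shapes = torch.as_tensor([(64, 64), (32, 32), (16, 16)], dtype=torch.long).cuda()

# Original line: shapes.prod(1) triggers nvrtc JIT compilation of a prod
# reduction kernel, which fails with the -arch error on this setup.
# level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))

# Workaround: multiply the two columns explicitly, so no reduction kernel
# needs to be JIT-compiled.
level_start_index = torch.cat(
    (shapes.new_zeros((1, )), (shapes[:, 0] * shapes[:, 1]).cumsum(0)[:-1])
)
print(level_start_index)  # tensor([   0, 4096, 5120], device='cuda:0') for these made-up shapes
```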
Hi, thanks for your excellent work! I ran into a problem when running the following commands:
```
cd ./models/ops
sh ./make.sh
# unit test (should see all checking is True)
python test.py
```
My graphics card is an RTX 4090, my CUDA version is 11.6, PyTorch is 1.12.0, and cudatoolkit is 11.6.
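In case it helps to confirm the mismatch, a quick check of what PyTorch reports for the card and toolkit (the RTX 4090 is compute capability 8.9, which the nvrtc shipped with a CUDA 11.6 PyTorch build may not accept as a `--gpu-architecture` value):

```python
import torch

print(torch.__version__)                    # 1.12.0 in this environment
print(torch.version.cuda)                   # 11.6 in this environment
print(torch.cuda.get_device_name(0))        # NVIDIA GeForce RTX 4090
print(torch.cuda.get_device_capability(0))  # (8, 9) -> sm_89
```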
The following is the traceback from test.py:
```
Traceback (most recent call last):
  File "test.py", line 24, in <module>
    level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
RuntimeError:
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)

typedef long long int int64_t;
typedef unsigned int uint32_t;
typedef signed char int8_t;
typedef unsigned char uint8_t;  // NOTE: this MUST be "unsigned char"! "char" is equivalent to "signed char"
typedef short int16_t;
static_assert(sizeof(int64_t) == 8, "expected size does not match");
static_assert(sizeof(uint32_t) == 4, "expected size does not match");
static_assert(sizeof(int8_t) == 1, "expected size does not match");
constexpr int num_threads = 128;
constexpr int thread_work_size = 4; // TODO: make template substitution once we decide where those vars live
constexpr int block_work_size = thread_work_size * num_threads;
//TODO use _assert_fail, because assert is disabled in non-debug builds
#define ERROR_UNSUPPORTED_CAST assert(false);

namespace std {

using ::signbit; using ::isfinite; using ::isinf; using ::isnan;
using ::abs;
using ::acos; using ::acosf; using ::asin; using ::asinf; using ::atan; using ::atanf; using ::atan2; using ::atan2f; using ::ceil; using ::ceilf; using ::cos; using ::cosf; using ::cosh; using ::coshf;
using ::exp; using ::expf;
using ::fabs; using ::fabsf; using ::floor; using ::floorf;
using ::fmod; using ::fmodf;
using ::frexp; using ::frexpf; using ::ldexp; using ::ldexpf;
using ::log; using ::logf;
using ::log10; using ::log10f; using ::modf; using ::modff;
using ::pow; using ::powf;
using ::sin; using ::sinf; using ::sinh; using ::sinhf;
using ::sqrt; using ::sqrtf; using ::tan; using ::tanf;
using ::tanh; using ::tanhf;
using ::acosh; using ::acoshf; using ::asinh; using ::asinhf; using ::atanh; using ::atanhf; using ::cbrt; using ::cbrtf;
using ::copysign; using ::copysignf;
using ::erf; using ::erff; using ::erfc; using ::erfcf; using ::exp2; using ::exp2f; using ::expm1; using ::expm1f; using ::fdim; using ::fdimf; using ::fmaf; using ::fma; using ::fmax; using ::fmaxf; using ::fmin; using ::fminf; using ::hypot; using ::hypotf; using ::ilogb; using ::ilogbf; using ::lgamma; using ::lgammaf; using ::llrint; using ::llrintf; using ::llround; using ::llroundf; using ::log1p; using ::log1pf; using ::log2; using ::log2f; using ::logb; using ::logbf; using ::lrint; using ::lrintf; using ::lround; using ::lroundf;
using ::nan; using ::nanf;
using ::nearbyint; using ::nearbyintf; using ::nextafter; using ::nextafterf; using ::remainder; using ::remainderf; using ::remquo; using ::remquof; using ::rint; using ::rintf; using ::round; using ::roundf; using ::scalbln; using ::scalblnf; using ::scalbn; using ::scalbnf; using ::tgamma; using ::tgammaf; using ::trunc; using ::truncf;

} // namespace std

// NB: Order matters for this macro; it is relied upon in
// _promoteTypesLookup and the serialization format.
// Note, some types have ctype as void because we don't support them in codegen
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
_(uint8_t, Byte) /* 0 */ \
_(int8_t, Char) /* 1 */ \
_(int16_t, Short) /* 2 */ \
_(int, Int) /* 3 */ \
_(int64_t, Long) /* 4 */ \
_(at::Half, Half) /* 5 */ \
_(float, Float) /* 6 */ \
_(double, Double) /* 7 */ \
_(std::complex<at::Half>, ComplexHalf) /* 8 */ \
_(std::complex<float>, ComplexFloat) /* 9 */ \
_(std::complex<double>, ComplexDouble) /* 10 */ \
_(bool, Bool) /* 11 */ \
_(void, QInt8) /* 12 */ \
_(void, QUInt8) /* 13 */ \
_(void, QInt32) /* 14 */ \
_(at::BFloat16, BFloat16) /* 15 */ \

#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(_) \
_(uint8_t, Byte) \
_(int8_t, Char) \
_(int16_t, Short) \
_(int, Int) \
_(int64_t, Long) \
_(at::Half, Half) \
_(float, Float) \
_(double, Double) \
_(std::complex<at::Half>, ComplexHalf) \
_(std::complex<float>, ComplexFloat) \
_(std::complex<double>, ComplexDouble) \
_(bool, Bool) \
_(at::BFloat16, BFloat16)

enum class ScalarType : int8_t {
#define DEFINE_ENUM(_1, n) n,
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ENUM)
#undef DEFINE_ENUM
  NumOptions };

template <typename T, int size> struct Array {
  T data[size];
  __device__ T operator[](int i) const { return data[i]; }
  __device__ T& operator[](int i) { return data[i]; }
  Array() = default;
  Array(const Array&) = default;
  Array& operator=(const Array&) = default;
  __device__ Array(T x) { for (int i = 0; i < size; i++) { data[i] = x; } }
};

template <typename T>
struct DivMod {
  T div;
  T mod;
  __device__ DivMod(T _div, T _mod) { div = _div; mod = _mod; }
};

//
struct IntDivider {
  IntDivider() = default;
  __device__ inline unsigned int div(unsigned int n) const { unsigned int t = __umulhi(n, m1); return (t + n) >> shift; }
  __device__ inline unsigned int mod(unsigned int n) const { return n - div(n) * divisor; }
  __device__ inline DivMod<unsigned int> divmod(unsigned int n) const {
    unsigned int q = div(n);
    return DivMod<unsigned int>(q, n - q * divisor);
  }
  unsigned int divisor;  // d above.
  unsigned int m1;       // Magic number: m' above.
  unsigned int shift;    // Shift amounts.
};

template <int NARGS>
struct TrivialOffsetCalculator {
  // The offset for each argument. Wrapper around fixed-size array.
  // The offsets are in # of elements, not in bytes.
  Array<unsigned int, NARGS> get(unsigned int linear_idx) const {
    Array<unsigned int, NARGS> offsets;
    #pragma unroll
};

template <int NARGS>
struct OffsetCalculator {
  OffsetCalculator() = default;
  __device__ __forceinline__ Array<unsigned int, NARGS> get(unsigned int linear_idx) const {
    Array<unsigned int, NARGS> offsets;
    #pragma unroll
  }
};

#define C10_HOST_DEVICE __host__ __device__
#define C10_DEVICE __device__

template <typename T>
__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
{
  return __shfl_down_sync(mask, value, delta, width);
}

#if 0
template <typename T>
__device__ __forceinline__ std::complex<T> WARP_SHFL_DOWN(std::complex<T> value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
{
  return std::complex<T>(
    __shfl_down_sync(mask, value.real(), delta, width),
    __shfl_down_sync(mask, value.imag(), delta, width));
}
#endif

// aligned vector generates vectorized load/store on CUDA
template<typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
  scalar_t val[vec_size];
};

C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) {
  // get GCD of num and denom using Euclid's algorithm.
  // Can replace this with std::gcd if we ever support c++17.
  size_t a = denominator;
  size_t b = numerator;
  while (b != 0) {
    a %= b;
    // swap(a, b)
    size_t tmp = a;
    a = b;
    b = tmp;
  }
}

struct ReduceConfig {
  //has to match host-side ReduceConfig in the eager code
  static constexpr int BLOCK_X = 0;
  static constexpr int BLOCK_Y = 1;
  static constexpr int CTA = 2;

  static constexpr int input_vec_size = 4;
  int element_size_bytes;
  int num_inputs;
  int num_outputs;
  int step_input = 1;
  int step_output = 1;
  int ctas_per_output = 1;
  int input_mult[3] = {0, 0, 0};
  int output_mult[2] = {0, 0};

  int block_width;
  int block_height;
  int num_threads;

  bool vectorize_input = false;
  int output_vec_size = 1;

  C10_HOST_DEVICE bool should_block_x_reduce() const { return input_mult[BLOCK_X] != 0; }
  C10_HOST_DEVICE bool should_block_y_reduce() const { return input_mult[BLOCK_Y] != 0; }
  C10_HOST_DEVICE bool should_global_reduce() const { return input_mult[CTA] != 0; }

  C10_DEVICE bool should_store(int output_idx) const { return output_idx < num_outputs && (!should_block_x_reduce() || threadIdx.x == 0) && (!should_block_y_reduce() || threadIdx.y == 0); }
  C10_DEVICE bool should_reduce_tail() const { return (!should_block_y_reduce() || threadIdx.y == 0) && (!should_global_reduce() || blockIdx.y == 0); }

  C10_HOST_DEVICE int input_idx() const { int lane = threadIdx.x; int warp = threadIdx.y; int cta2 = blockIdx.y; return (lane * input_mult[BLOCK_X] + warp * input_mult[BLOCK_Y] + cta2 * input_mult[CTA]); }

  template <int output_vec_size>
  C10_HOST_DEVICE int output_idx() const {
    int lane = threadIdx.x;
    int warp = threadIdx.y;
    int cta1 = blockIdx.x;
    return (lane * output_mult[BLOCK_X] +
            warp * output_mult[BLOCK_Y] +
            cta1 * step_output) * output_vec_size;
  }

  C10_DEVICE int shared_memory_offset(int offset) const { return threadIdx.x + (threadIdx.y + offset) * blockDim.x; }

  C10_DEVICE int staging_memory_offset(int cta2) const { int offset = cta2 + blockIdx.x * gridDim.y; if (!should_block_x_reduce()) { offset = threadIdx.x + offset * blockDim.x; } return offset; }
};

//TODO this will need to be different for more generic reduction functions
namespace reducer {
  using scalar_t = int64_t;
  using arg_t = int64_t;
  using out_scalar_t = int64_t;

  inline __device__ arg_t combine(arg_t a, arg_t b) { return a * b; }
  inline __device__ out_scalar_t project(arg_t arg) { return (out_scalar_t) arg; }
  inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { return WARP_SHFL_DOWN(arg, offset); }
  inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { return acc; }

  // wrap a normal reduction that ignores the index
  inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) { return combine(acc, val); }
}

struct ReduceJitOp {
  using scalar_t = int64_t;
  using arg_t = int64_t;
  using out_scalar_t = int64_t;

  using InputCalculator = OffsetCalculator<1>;
  using OutputCalculator = OffsetCalculator<2>;

  // static constexpr bool can_accumulate_in_output =
  //   std::is_convertible<arg_t, out_scalar_t>::value
  //   && std::is_convertible<out_scalar_t, arg_t>::value;

  static constexpr int input_vec_size = ReduceConfig::input_vec_size;

  arg_t ident;
  ReduceConfig config;
  InputCalculator input_calc;
  OutputCalculator output_calc;
  const void* src;
  const char* dst[2]; //it accepts at most two destinations
  // acc_buf used for accumulation among sub Tensor Iterator when accumulation on
  // output is not permissible
  void* acc_buf;
  // cta_buf used for accumulation between blocks during global reduction
  void* cta_buf;
  int* semaphores;
  int64_t base_idx;
  bool accumulate;
  bool final_output;
  int noutputs;

  C10_DEVICE void run() const {
    extern __shared__ char shared_memory[];
    uint32_t output_idx = config.output_idx<1>();
    uint32_t input_idx = config.input_idx();
    auto base_offsets1 = output_calc.get(output_idx)[1];
  }

  template <int output_vec_size>
  C10_DEVICE Array<arg_t, output_vec_size> thread_reduce(const scalar_t* data) const {
    if (config.vectorize_input) {
      assert(output_vec_size == 1);
      // reduce at the header of input_slice where memory is not aligned,
      // so that thread_reduce will have an aligned memory to work on.
      return {input_vectorized_thread_reduce_impl(data)};
    } else {
      uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t);
      bool is_contiguous = (input_calc.dims == 1 && element_stride == 1);
      if (is_contiguous) {
        return thread_reduce_impl<output_vec_size>(data, [](uint32_t idx) { return idx; });
      } else if (input_calc.dims == 1) {
        return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return idx * element_stride; });
      } else {
        return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); });
      }
    }
  }

  C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const {
    uint32_t end = config.num_inputs;
  }

  template <int output_vec_size, typename offset_calc_t>
  C10_DEVICE Array<arg_t, output_vec_size> thread_reduce_impl(const scalar_t* data, offset_calc_t calc) const {
    uint32_t idx = config.input_idx();
    const uint32_t end = config.num_inputs;
    const uint32_t stride = config.step_input;
    const int vt0 = 4;
  }

  template <int output_vec_size>
  C10_DEVICE Array<arg_t, output_vec_size> block_x_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
    using args_vec_t = Array<arg_t, output_vec_size>;
    int dim_x = blockDim.x;
    args_vec_t* shared = (args_vec_t*)shared_memory;
    if (dim_x > warpSize) {
      int address_base = threadIdx.x + threadIdx.y * blockDim.x;
      shared[address_base] = value;
      for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) {
        __syncthreads();
        if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) {
          args_vec_t other = shared[address_base + offset];
          #pragma unroll
  }

  template <int output_vec_size>
  C10_DEVICE Array<arg_t, output_vec_size> block_y_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
    using args_vec_t = Array<arg_t, output_vec_size>;
    args_vec_t* shared = (args_vec_t*)shared_memory;
    shared[config.shared_memory_offset(0)] = value;
    for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
      __syncthreads();
      if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
        args_vec_t other = shared[config.shared_memory_offset(offset)];
        #pragma unroll
  }

  C10_DEVICE bool mark_block_finished() const {
    __shared__ bool is_last_block_done_shared;
  }

  template <int output_vec_size>
  C10_DEVICE Array<arg_t, output_vec_size> accumulate_in_output(
    Array<out_scalar_t*, output_vec_size> out,
    Array<arg_t, output_vec_size> value
  ) const {
    Array<arg_t, output_vec_size> ret;
    #pragma unroll
  }

  C10_DEVICE out_scalar_t get_accumulated_output(out_scalar_t* out, arg_t value) const { assert(!final_output); return (out_scalar_t)value; }

  template <class T>
  C10_DEVICE void set_results(const T x, const uint32_t base_offset) const {
    assert(noutputs == 1);
    auto res = (out_scalar_t*)((char*)dst[0] + base_offset);
    *res = x;
  }

  //TODO - multi-output reduction - we won't be able to use thrust::pair
  //just explicitly specify typed output reads/writes
  //Currently implemented for max of two outputs
  // template<class T1, class T2>
  // C10_DEVICE void set_results(const thrust::pair<T1, T2> x, const index_t base_offset) const {
  //   if (noutputs >= 1) {
  //     auto res0 = (T1*)((char*)dst[0] + base_offset);
  //     *res0 = x.first;
  //   }
  //   if (noutputs >= 2) {
  //     // base offset is computed assuming element size being sizeof(T1), so we need to make a
  //     // correction to obtain the correct base offset
  //     auto res1 = (T2*) ((char*) dst[1] + base_offset / sizeof(T1) * sizeof(T2));
  //     *res1 = x.second;
  //   }
  // }

  template <int output_vec_size>
  C10_DEVICE void set_results_to_output(Array<arg_t, output_vec_size> value, Array<uint32_t, output_vec_size> base_offset) const {
    assert(final_output);
    #pragma unroll
  }

  template <int output_vec_size>
  C10_DEVICE Array<arg_t, output_vec_size> global_reduce(Array<arg_t, output_vec_size> value, Array<arg_t, output_vec_size>* acc, char* shared_memory) const {
    using arg_vec_t = Array<arg_t, output_vec_size>;
    using out_ptr_vec_t = Array<out_scalar_t*, output_vec_size>;
    using offset_vec_t = Array<uint32_t, output_vec_size>;
  }
};

extern "C"
__launch_bounds__(512, 4)
__global__ void reduction_prod_kernel(ReduceJitOp r){
  r.run();
}
nvrtc: error: invalid value for --gpu-architecture (-arch)
```