Open · danth opened 1 year ago

Adds support for Fooocus. This is a draft because it's not working properly on AMD yet.
Bumped the version, but there is still no improvement on the AMD side of things.
For context, this is the error message:
```
Traceback (most recent call last):
  File "/nix/store/b2rrw1dmg01lyywrc33rvwlvg04vfrwm-fooocus-source/modules/async_worker.py", line 822, in worker
    handler(task)
  File "/nix/store/vi69k4prkwj1wqjz84p1ni960wd9dcha-python3-3.11.6-env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/vi69k4prkwj1wqjz84p1ni960wd9dcha-python3-3.11.6-env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/b2rrw1dmg01lyywrc33rvwlvg04vfrwm-fooocus-source/modules/async_worker.py", line 425, in handler
    expansion = pipeline.final_expansion(t['task_prompt'], t['task_seed'])
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/vi69k4prkwj1wqjz84p1ni960wd9dcha-python3-3.11.6-env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/vi69k4prkwj1wqjz84p1ni960wd9dcha-python3-3.11.6-env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/b2rrw1dmg01lyywrc33rvwlvg04vfrwm-fooocus-source/extras/expansion.py", line 117, in __call__
    features = self.model.generate(**tokenized_kwargs,
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/vi69k4prkwj1wqjz84p1ni960wd9dcha-python3-3.11.6-env/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/nix/store/vi69k4prkwj1wqjz84p1ni960wd9dcha-python3-3.11.6-env/lib/python3.11/site-packages/transformers/generation/utils.py", line 1719, in generate
    return self.sample(
           ^^^^^^^^^^^^
  File "/nix/store/vi69k4prkwj1wqjz84p1ni960wd9dcha-python3-3.11.6-env/lib/python3.11/site-packages/transformers/generation/utils.py", line 2856, in sample
    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
RuntimeError:
#pragma clang force_cuda_host_device begin
#ifdef __HIPCC__
#define ERROR_UNSUPPORTED_CAST ;
// corresponds to aten/src/ATen/native/cuda/thread_constants.h
#define CUDA_OR_ROCM_NUM_THREADS 256
// corresponds to aten/src/ATen/cuda/detail/OffsetCalculator.cuh
#define MAX_DIMS 16
#ifndef __forceinline__
#define __forceinline__ inline __attribute__((always_inline))
#endif
#else
//TODO use _assert_fail, because assert is disabled in non-debug builds
#define ERROR_UNSUPPORTED_CAST assert(false);
#define CUDA_OR_ROCM_NUM_THREADS 128
#define MAX_DIMS 25
#endif
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)
typedef long long int int64_t;
typedef unsigned int uint32_t;
typedef signed char int8_t;
typedef unsigned char uint8_t; // NOTE: this MUST be "unsigned char"! "char" is equivalent to "signed char"
typedef short int16_t;
static_assert(sizeof(int64_t) == 8, "expected size does not match");
static_assert(sizeof(uint32_t) == 4, "expected size does not match");
static_assert(sizeof(int8_t) == 1, "expected size does not match");
constexpr int num_threads = CUDA_OR_ROCM_NUM_THREADS;
constexpr int thread_work_size = 4; // TODO: make template substitution once we decide where those vars live
constexpr int block_work_size = thread_work_size * num_threads;
namespace std {
using ::signbit;
using ::isfinite;
using ::isinf;
using ::isnan;
using ::abs;
using ::acos;
using ::acosf;
using ::asin;
using ::asinf;
using ::atan;
using ::atanf;
using ::atan2;
using ::atan2f;
using ::ceil;
using ::ceilf;
using ::cos;
using ::cosf;
using ::cosh;
using ::coshf;
using ::exp;
using ::expf;
using ::fabs;
using ::fabsf;
using ::floor;
using ::floorf;
using ::fmod;
using ::fmodf;
using ::frexp;
using ::frexpf;
using ::ldexp;
using ::ldexpf;
using ::log;
using ::logf;
using ::log10;
using ::log10f;
using ::modf;
using ::modff;
using ::pow;
using ::powf;
using ::sin;
using ::sinf;
using ::sinh;
using ::sinhf;
using ::sqrt;
using ::sqrtf;
using ::tan;
using ::tanf;
using ::tanh;
using ::tanhf;
using ::acosh;
using ::acoshf;
using ::asinh;
using ::asinhf;
using ::atanh;
using ::atanhf;
using ::cbrt;
using ::cbrtf;
using ::copysign;
using ::copysignf;
using ::erf;
using ::erff;
using ::erfc;
using ::erfcf;
using ::exp2;
using ::exp2f;
using ::expm1;
using ::expm1f;
using ::fdim;
using ::fdimf;
using ::fmaf;
using ::fma;
using ::fmax;
using ::fmaxf;
using ::fmin;
using ::fminf;
using ::hypot;
using ::hypotf;
using ::ilogb;
using ::ilogbf;
using ::lgamma;
using ::lgammaf;
using ::llrint;
using ::llrintf;
using ::llround;
using ::llroundf;
using ::log1p;
using ::log1pf;
using ::log2;
using ::log2f;
using ::logb;
using ::logbf;
using ::lrint;
using ::lrintf;
using ::lround;
using ::lroundf;
using ::nan;
using ::nanf;
using ::nearbyint;
using ::nearbyintf;
using ::nextafter;
using ::nextafterf;
using ::remainder;
using ::remainderf;
using ::remquo;
using ::remquof;
using ::rint;
using ::rintf;
using ::round;
using ::roundf;
using ::scalbln;
using ::scalblnf;
using ::scalbn;
using ::scalbnf;
using ::tgamma;
using ::tgammaf;
using ::trunc;
using ::truncf;
} // namespace std
// NB: Order matters for this macro; it is relied upon in
// _promoteTypesLookup and the serialization format.
// Note, some types have ctype as void because we don't support them in codegen
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
_(uint8_t, Byte) /* 0 */ \
_(int8_t, Char) /* 1 */ \
_(int16_t, Short) /* 2 */ \
_(int, Int) /* 3 */ \
_(int64_t, Long) /* 4 */ \
_(at::Half, Half) /* 5 */ \
_(float, Float) /* 6 */ \
_(double, Double) /* 7 */ \
_(std::complex<at::Half>, ComplexHalf) /* 8 */ \
_(std::complex<float>, ComplexFloat) /* 9 */ \
_(std::complex<double>, ComplexDouble) /* 10 */ \
_(bool, Bool) /* 11 */ \
_(void, QInt8) /* 12 */ \
_(void, QUInt8) /* 13 */ \
_(void, QInt32) /* 14 */ \
_(at::BFloat16, BFloat16) /* 15 */ \
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_QINT(_) \
_(uint8_t, Byte) \
_(int8_t, Char) \
_(int16_t, Short) \
_(int, Int) \
_(int64_t, Long) \
_(at::Half, Half) \
_(float, Float) \
_(double, Double) \
_(std::complex<at::Half>, ComplexHalf) \
_(std::complex<float>, ComplexFloat) \
_(std::complex<double>, ComplexDouble) \
_(bool, Bool) \
_(at::BFloat16, BFloat16)
enum class ScalarType : int8_t {
#define DEFINE_ENUM(_1, n) n,
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ENUM)
#undef DEFINE_ENUM
Undefined,
NumOptions
};
template <typename T, int size>
struct Array {
T data[size];
__device__ T operator[](int i) const {
return data[i];
}
__device__ T& operator[](int i) {
return data[i];
}
Array() = default;
Array(const Array&) = default;
Array& operator=(const Array&) = default;
__device__ Array(T x) {
for (int i = 0; i < size; i++) {
data[i] = x;
}
}
};
template <typename T>
struct DivMod {
T div;
T mod;
__device__ DivMod(T _div, T _mod) {
div = _div;
mod = _mod;
}
};
//<unsigned int>
struct IntDivider {
IntDivider() = default;
__device__ inline unsigned int div(unsigned int n) const {
unsigned int t = __umulhi(n, m1);
return (t + n) >> shift;
}
__device__ inline unsigned int mod(unsigned int n) const {
return n - div(n) * divisor;
}
__device__ inline DivMod<unsigned int> divmod(unsigned int n) const {
unsigned int q = div(n);
return DivMod<unsigned int>(q, n - q * divisor);
}
unsigned int divisor; // d above.
unsigned int m1; // Magic number: m' above.
unsigned int shift; // Shift amounts.
};
template <int NARGS>
struct TrivialOffsetCalculator {
// The offset for each argument. Wrapper around fixed-size array.
// The offsets are in # of elements, not in bytes.
Array<unsigned int, NARGS> get(unsigned int linear_idx) const {
Array<unsigned int, NARGS> offsets;
#pragma unroll
for (int arg = 0; arg < NARGS; arg++) {
offsets[arg] = linear_idx;
}
return offsets;
}
};
template<int NARGS>
struct OffsetCalculator {
OffsetCalculator() = default;
__device__ __forceinline__ Array<unsigned int, NARGS> get(unsigned int linear_idx) const {
Array<unsigned int, NARGS> offsets;
#pragma unroll
for (int arg = 0; arg < NARGS; ++arg) {
offsets[arg] = 0;
}
#pragma unroll
for (int dim = 0; dim < MAX_DIMS; ++dim) {
if (dim == dims) {
break;
}
auto divmod = sizes_[dim].divmod(linear_idx);
linear_idx = divmod.div;
#pragma unroll
for (int arg = 0; arg < NARGS; ++arg) {
offsets[arg] += divmod.mod * strides_[dim][arg];
}
//printf("offset calc thread dim size stride offset %d %d %d %d %d %d %d %d\n",
//threadIdx.x, dim, sizes_[dim].divisor, strides_[dim][0], offsets[0], linear_idx, divmod.div, divmod.mod);
}
return offsets;
}
int dims;
IntDivider sizes_[MAX_DIMS];
// NOTE: this approach will not support nInputs == 0
unsigned int strides_[MAX_DIMS][NARGS];
};
#define C10_HOST_DEVICE __host__ __device__
#define C10_DEVICE __device__
#if defined(__clang__) && defined(__HIP__)
#ifndef __forceinline__
#define __forceinline__ inline __attribute__((always_inline))
#endif
// until ROCm support for kernel asserts is restored
#define assert(expr) (static_cast<void>(0))
#endif
template <typename T>
__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
{
#if defined(__clang__) && defined(__HIP__)
return __shfl_down(value, delta, width);
#else
return __shfl_down_sync(mask, value, delta, width);
#endif
}
#if 0
template <typename T>
__device__ __forceinline__ std::complex<T> WARP_SHFL_DOWN(std::complex<T> value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
{
return std::complex<T>(
#if defined(__clang__) && defined(__HIP__)
__shfl_down(value.real(), delta, width),
__shfl_down(value.imag(), delta, width));
#else
__shfl_down_sync(mask, value.real(), delta, width),
__shfl_down_sync(mask, value.imag(), delta, width));
#endif
}
#endif
// aligned vector generates vectorized load/store on CUDA
template<typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
scalar_t val[vec_size];
};
C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) {
// get GCD of num and denom using Euclid's algorithm.
// Can replace this with std::gcd if we ever support c++17.
size_t a = denominator;
size_t b = numerator;
while (b != 0) {
a %= b;
// swap(a,b)
size_t tmp = a;
a = b;
b = tmp;
}
// a is now the GCD
numerator /= a;
denominator /= a;
}
struct ReduceConfig {
//has to match host-side ReduceConfig in the eager code
static constexpr int BLOCK_X = 0;
static constexpr int BLOCK_Y = 1;
static constexpr int CTA = 2;
static constexpr int input_vec_size = 4;
int element_size_bytes;
int num_inputs;
int num_outputs;
int step_input = 1;
int step_output = 1;
int ctas_per_output = 1;
int input_mult[3] = {0, 0, 0};
int output_mult[2] = {0, 0};
int block_width;
int block_height;
int num_threads;
bool vectorize_input = false;
int output_vec_size = 1;
C10_HOST_DEVICE bool should_block_x_reduce() const {
return input_mult[BLOCK_X] != 0;
}
C10_HOST_DEVICE bool should_block_y_reduce() const {
return input_mult[BLOCK_Y] != 0;
}
C10_HOST_DEVICE bool should_global_reduce() const {
return input_mult[CTA] != 0;
}
C10_DEVICE bool should_store(int output_idx) const {
return output_idx < num_outputs &&
(!should_block_x_reduce() || threadIdx.x == 0) &&
(!should_block_y_reduce() || threadIdx.y == 0);
}
C10_DEVICE bool should_reduce_tail() const {
return (!should_block_y_reduce() || threadIdx.y == 0) &&
(!should_global_reduce() || blockIdx.y == 0);
}
C10_HOST_DEVICE int input_idx() const {
int lane = threadIdx.x;
int warp = threadIdx.y;
int cta2 = blockIdx.y;
return (lane * input_mult[BLOCK_X] +
warp * input_mult[BLOCK_Y] +
cta2 * input_mult[CTA]);
}
template <int output_vec_size>
C10_HOST_DEVICE int output_idx() const {
int lane = threadIdx.x;
int warp = threadIdx.y;
int cta1 = blockIdx.x;
return (lane * output_mult[BLOCK_X] +
warp * output_mult[BLOCK_Y] +
cta1 * step_output) * output_vec_size;
}
C10_DEVICE int shared_memory_offset(int offset) const {
return threadIdx.x + (threadIdx.y + offset) * blockDim.x;
}
C10_DEVICE int staging_memory_offset(int cta2) const {
int offset = cta2 + blockIdx.x * gridDim.y;
if (!should_block_x_reduce()) {
offset = threadIdx.x + offset * blockDim.x;
}
return offset;
}
};
//TODO this will need to be different for more generic reduction functions
namespace reducer {
using scalar_t = int64_t;
using arg_t = int64_t;
using out_scalar_t = int64_t;
inline __device__ arg_t combine(arg_t a, arg_t b) { return a * b; }
inline __device__ out_scalar_t project(arg_t arg) {
return (out_scalar_t) arg;
}
inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) {
return WARP_SHFL_DOWN(arg, offset);
}
inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) {
return acc;
}
// wrap a normal reduction that ignores the index
inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) {
return combine(acc, val);
}
}
struct ReduceJitOp {
using scalar_t = int64_t;
using arg_t = int64_t;
using out_scalar_t = int64_t;
using InputCalculator = OffsetCalculator<1>;
using OutputCalculator = OffsetCalculator<2>;
// static constexpr bool can_accumulate_in_output =
// std::is_convertible<arg_t, out_scalar_t>::value
// && std::is_convertible<out_scalar_t, arg_t>::value;
static constexpr int input_vec_size = ReduceConfig::input_vec_size;
arg_t ident;
ReduceConfig config;
InputCalculator input_calc;
OutputCalculator output_calc;
const void* src;
const char* dst[2]; //it accepts at most two destinations
// acc_buf used for accumulation among sub Tensor Iterator when accumulation on
// output is not permissible
void* acc_buf;
// cta_buf used for accumulation between blocks during global reduction
void* cta_buf;
int* semaphores;
int64_t base_idx;
bool accumulate;
bool final_output;
int noutputs;
C10_DEVICE void run() const {
extern __shared__ char shared_memory[];
uint32_t output_idx = config.output_idx<1>();
uint32_t input_idx = config.input_idx();
auto base_offsets1 = output_calc.get(output_idx)[1];
using arg_vec_t = Array<arg_t, 1>;
arg_vec_t value;
if (output_idx < config.num_outputs && input_idx < config.num_inputs) {
const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1);
value = thread_reduce<1>(input_slice);
}
if (config.should_block_y_reduce()) {
value = block_y_reduce<1>(value, shared_memory);
}
if (config.should_block_x_reduce()) {
value = block_x_reduce<1>(value, shared_memory);
}
using out_ptr_vec_t = Array<out_scalar_t*, 1>;
using offset_vec_t = Array<uint32_t, 1>;
offset_vec_t base_offsets;
out_ptr_vec_t out;
#pragma unroll
for (int i = 0; i < 1; i++) {
base_offsets[i] = output_calc.get(output_idx + i)[0];
out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
}
arg_vec_t* acc = nullptr;
if (acc_buf != nullptr) {
size_t numerator = sizeof(arg_t);
size_t denominator = sizeof(out_scalar_t);
reduce_fraction(numerator, denominator);
acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator));
}
if (config.should_global_reduce()) {
value = global_reduce<1>(value, acc, shared_memory);
} else if (config.should_store(output_idx)) {
if (accumulate) {
#pragma unroll
for (int i = 0; i < 1; i++) {
value[i] = reducer::translate_idx(value[i], base_idx);
}
}
if (acc == nullptr) {
if (accumulate) {
value = accumulate_in_output<1>(out, value);
}
if (final_output) {
set_results_to_output<1>(value, base_offsets);
} else {
#pragma unroll
for (int i = 0; i < 1; i++) {
*(out[i]) = get_accumulated_output(out[i], value[i]);
}
}
} else {
if (accumulate) {
#pragma unroll
for (int i = 0; i < 1; i++) {
value[i] = reducer::combine((*acc)[i], value[i]);
}
}
if (final_output) {
set_results_to_output<1>(value, base_offsets);
} else {
*acc = value;
}
}
}
}
template <int output_vec_size>
C10_DEVICE Array<arg_t, output_vec_size> thread_reduce(const scalar_t* data) const {
if (config.vectorize_input) {
assert(output_vec_size == 1);
// reduce at the header of input_slice where memory is not aligned,
// so that thread_reduce will have an aligned memory to work on.
return {input_vectorized_thread_reduce_impl(data)};
} else {
uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t);
bool is_contiguous = (input_calc.dims == 1 && element_stride == 1);
if (is_contiguous) {
return thread_reduce_impl<output_vec_size>(data, [](uint32_t idx) { return idx; });
} else if (input_calc.dims == 1) {
return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return idx * element_stride; });
} else {
return thread_reduce_impl<output_vec_size>(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); });
}
}
}
C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const {
uint32_t end = config.num_inputs;
// Handle the head of input slice where data is not aligned
arg_t value = ident;
constexpr int align_bytes = alignof(aligned_vector<scalar_t, input_vec_size>);
constexpr int align_elements = align_bytes / sizeof(scalar_t);
int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t);
if (shift > 0) {
data -= shift;
end += shift;
if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){
value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift);
}
end -= align_elements;
data += align_elements;
shift = align_elements - shift;
}
// Do the vectorized reduction
using load_t = aligned_vector<scalar_t, input_vec_size>;
uint32_t idx = config.input_idx();
const uint32_t stride = config.step_input;
// Multiple accumulators to remove dependency between unrolled loops.
arg_t value_list[input_vec_size];
value_list[0] = value;
#pragma unroll
for (int i = 1; i < input_vec_size; i++) {
value_list[i] = ident;
}
scalar_t values[input_vec_size];
load_t *values_vector = reinterpret_cast<load_t*>(&values[0]);
while (idx * input_vec_size + input_vec_size - 1 < end) {
*values_vector = reinterpret_cast<const load_t*>(data)[idx];
#pragma unroll
for (uint32_t i = 0; i < input_vec_size; i++) {
value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i);
}
idx += stride;
}
// tail
uint32_t tail_start = end - end % input_vec_size;
if (config.should_reduce_tail()) {
int idx = tail_start + threadIdx.x;
if (idx < end) {
value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift);
}
}
// combine accumulators
#pragma unroll
for (int i = 1; i < input_vec_size; i++) {
value_list[0] = reducer::combine(value_list[0], value_list[i]);
}
return value_list[0];
}
template <int output_vec_size, typename offset_calc_t>
C10_DEVICE Array<arg_t, output_vec_size> thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const {
uint32_t idx = config.input_idx();
const uint32_t end = config.num_inputs;
const uint32_t stride = config.step_input;
const int vt0=4;
using arg_vec_t = Array<arg_t, output_vec_size>;
using load_t = aligned_vector<scalar_t, output_vec_size>;
const load_t* data = reinterpret_cast<const load_t*>(data_);
// Multiple accumulators to remove dependency between unrolled loops.
arg_vec_t value_list[vt0];
#pragma unroll
for (int i = 0; i < vt0; i++) {
#pragma unroll
for (int j = 0; j < output_vec_size; j++) {
value_list[i][j] = ident;
}
}
load_t values[vt0];
while (idx + (vt0 - 1) * stride < end) {
#pragma unroll
for (uint32_t i = 0; i < vt0; i++) {
values[i] = data[calc(idx + i * stride) / output_vec_size];
}
#pragma unroll
for (uint32_t i = 0; i < vt0; i++) {
#pragma unroll
for (uint32_t j = 0; j < output_vec_size; j++) {
value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride);
}
}
idx += stride * vt0;
}
// tail
int idx_ = idx;
#pragma unroll
for (uint32_t i = 0; i < vt0; i++) {
if (idx >= end) {
break;
}
values[i] = data[calc(idx) / output_vec_size];
idx += stride;
}
idx = idx_;
#pragma unroll
for (uint32_t i = 0; i < vt0; i++) {
if (idx >= end) {
break;
}
#pragma unroll
for (uint32_t j = 0; j < output_vec_size; j++) {
value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx);
}
idx += stride;
}
// combine accumulators
#pragma unroll
for (int i = 1; i < vt0; i++) {
#pragma unroll
for (uint32_t j = 0; j < output_vec_size; j++) {
value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]);
}
}
return value_list[0];
}
template <int output_vec_size>
C10_DEVICE Array<arg_t, output_vec_size> block_x_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
using args_vec_t = Array<arg_t, output_vec_size>;
int dim_x = blockDim.x;
args_vec_t* shared = (args_vec_t*)shared_memory;
if (dim_x > warpSize) {
int address_base = threadIdx.x + threadIdx.y*blockDim.x;
shared[address_base] = value;
for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) {
__syncthreads();
if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) {
args_vec_t other = shared[address_base + offset];
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::combine(value[i], other[i]);
}
shared[address_base] = value;
}
}
dim_x = warpSize;
}
__syncthreads();
for (int offset = 1; offset < dim_x; offset <<= 1) {
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
arg_t other = reducer::warp_shfl_down(value[i], offset);
value[i] = reducer::combine(value[i], other);
}
}
return value;
}
template <int output_vec_size>
C10_DEVICE Array<arg_t, output_vec_size> block_y_reduce(Array<arg_t, output_vec_size> value, char* shared_memory) const {
using args_vec_t = Array<arg_t, output_vec_size>;
args_vec_t* shared = (args_vec_t*)shared_memory;
shared[config.shared_memory_offset(0)] = value;
for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) {
__syncthreads();
if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) {
args_vec_t other = shared[config.shared_memory_offset(offset)];
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::combine(value[i], other[i]);
}
shared[config.shared_memory_offset(0)] = value;
}
}
return value;
}
C10_DEVICE bool mark_block_finished() const {
__shared__ bool is_last_block_done_shared;
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0) {
int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1);
is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1);
}
__syncthreads();
return is_last_block_done_shared;
}
template <int output_vec_size>
C10_DEVICE Array<arg_t, output_vec_size> accumulate_in_output(
Array<out_scalar_t*, output_vec_size> out,
Array<arg_t, output_vec_size> value
) const {
Array<arg_t, output_vec_size> ret;
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
ret[i] = reducer::combine(*(out[i]), value[i]);
}
return ret;
}
C10_DEVICE out_scalar_t get_accumulated_output(
out_scalar_t* out, arg_t value
) const {
assert(!final_output);
return (out_scalar_t)value;
}
template<class T>
C10_DEVICE void set_results(const T x, const uint32_t base_offset) const {
assert(noutputs == 1);
auto res = (out_scalar_t*)((char*)dst[0] + base_offset);
*res = x;
}
//TODO - multi-output reduction - we won't be able to use thrust::pair
//just explicitly specify typed output reads/writes
//Currently implemented for max of two outputs
// template<class T1, class T2>
// C10_DEVICE void set_results(const thrust::pair<T1, T2> x, const index_t base_offset) const {
// if (noutputs >= 1) {
// auto res0 = (T1*)((char*)dst[0] + base_offset);
// *res0 = x.first;
// }
// if (noutputs >= 2) {
// // base offset is computed assuming element size being sizeof(T1), so we need to make a
// // correction to obtain the correct base offset
// auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2));
// *res1 = x.second;
// }
// }
template <int output_vec_size>
C10_DEVICE void set_results_to_output(Array<arg_t, output_vec_size> value, Array<uint32_t, output_vec_size> base_offset) const {
assert(final_output);
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
set_results(reducer::project(value[i]), base_offset[i]);
}
}
template <int output_vec_size>
C10_DEVICE Array<arg_t, output_vec_size> global_reduce(Array<arg_t, output_vec_size> value, Array<arg_t, output_vec_size> *acc, char* shared_memory) const {
using arg_vec_t = Array<arg_t, output_vec_size>;
using out_ptr_vec_t = Array<out_scalar_t*, output_vec_size>;
using offset_vec_t = Array<uint32_t, output_vec_size>;
arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf;
uint32_t output_idx = config.output_idx<output_vec_size>();
offset_vec_t base_offsets;
out_ptr_vec_t out;
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
base_offsets[i] = output_calc.get(output_idx + i)[0];
out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]);
}
bool should_store = config.should_store(output_idx);
if (should_store) {
uint32_t offset = config.staging_memory_offset(blockIdx.y);
reduce_buffer[offset] = value;
}
__threadfence(); // make sure writes are globally visible
__syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
bool is_last_block_done = mark_block_finished();
if (is_last_block_done) {
value = ident;
if (config.should_block_x_reduce()) {
uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x;
uint32_t step = blockDim.x * blockDim.y;
for (; input_offset < config.ctas_per_output; input_offset += step) {
uint32_t idx = config.staging_memory_offset(input_offset);
arg_vec_t next = reduce_buffer[idx];
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::combine(value[i], next[i]);
}
}
} else {
uint32_t input_offset = threadIdx.y;
uint32_t step = blockDim.y;
for (; input_offset < config.ctas_per_output; input_offset += step) {
uint32_t idx = config.staging_memory_offset(input_offset);
arg_vec_t next = reduce_buffer[idx];
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::combine(value[i], next[i]);
}
}
}
value = block_y_reduce(value, shared_memory);
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}
if (should_store) {
if (accumulate) {
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::translate_idx(value[i], base_idx);
}
}
if (acc == nullptr) {
if (accumulate) {
value = accumulate_in_output<output_vec_size>(out, value);
}
if (final_output) {
set_results_to_output<output_vec_size>(value, base_offsets);
} else {
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
*(out[i]) = get_accumulated_output(out[i], value[i]);
}
}
} else {
if (accumulate) {
#pragma unroll
for (int i = 0; i < output_vec_size; i++) {
value[i] = reducer::combine((*acc)[i], value[i]);
}
}
if (final_output) {
set_results_to_output<output_vec_size>(value, base_offsets);
} else {
*acc = value;
}
}
}
}
return value;
}
};
extern "C"
__launch_bounds__(512, 4)
__global__ void reduction_prod_kernel(ReduceJitOp r){
r.run();
}
#pragma clang force_cuda_host_device end
hip runtime failed to load.
Error: Please provide architecture for which code is to be generated.
```
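For anyone else debugging this: the wall of C++ in the error is the source of PyTorch's JIT-compiled `reduction_prod_kernel`, which the `.prod(dim=0)` call inside `transformers` triggers at runtime, and the two trailing lines show the HIP runtime failing to load, after which hipRTC has no GPU architecture to compile for. A minimal sketch, assuming a ROCm build of PyTorch (where `torch.cuda` is the HIP backend) and a visible AMD GPU, that should reproduce the failure without going through Fooocus:

```python
import torch

# The dump above is the JIT-compiled prod reduction specialized for int64
# (scalar_t = int64_t in the kernel source), so an int64 prod on the GPU
# should hit the same hipRTC compilation path and fail the same way.
x = torch.ones(4, 4, dtype=torch.int64, device="cuda")
print(x.prod(dim=0))
```

If this fails identically, the problem is in how the ROCm toolchain is wired into the environment rather than anything Fooocus-specific. On consumer GPUs that ROCm does not officially support, exporting `HSA_OVERRIDE_GFX_VERSION` (for example `10.3.0` on RDNA2 cards) before starting Python is a commonly suggested workaround, though I have not confirmed it resolves this particular failure.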