artyom-beilis / pytorch_dlprim

DLPrimitives/OpenCL out of tree backend for pytorch
http://blog.dlprimitives.org/
MIT License

Error when trying to use dlprimitives in the diffusion model #78

Open YuHaozhe0703 opened 3 weeks ago

YuHaozhe0703 commented 3 weeks ago

Hi Artyom, I've been trying to use dlprimitives in a diffusion model trained on the MNIST dataset.
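For context, here is a minimal sketch of how my script targets the OpenCL backend. This is only an approximation: the real setup is in the zip attached below, and the library path passed to `torch.ops.load_library` is a guess for my Windows source build (recent pytorch_dlprim builds use `import pytorch_ocl` instead):

```python
import torch

# Sketch only: register the pytorch_dlprim OpenCL backend. The .dll path is an
# assumption for a Windows source build; recent packaged builds instead do
# `import pytorch_ocl`.
torch.ops.load_library("C:/deps/pytorch_dlprim/build/pt_ocl.dll")

device = torch.device("ocl:0")             # first OpenCL device, as in --device=ocl:0
model = torch.nn.Linear(4, 4).to(device)   # stand-in for the UNet defined in model.py
x = torch.randn(8, 4).to(device)
print(model(x).cpu())                      # move back to CPU to print
```

With the device selected this way, running train_mnist.py produces the following output: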

(torch) C:\deps\pytorch_dlprim\Diffusion>python train_mnist.py --device=ocl:0
Accessing device #0:NVIDIA GeForce RTX 2060 on NVIDIA CUDA
C:\deps\pytorch_dlprim\Diffusion\model.py:63: UserWarning: The operator 'aten::gather.out' is not currently supported on the ocl backend. Please open an issue at for requesting support https://github.com/artyom-beilis/pytorch_dlprim/issues (Triggered internally at C:\deps\pytorch_dlprim\src\tensor_ops.cpp:313.)
  return self.sqrt_alphas_cumprod.gather(-1,t).reshape(x_0.shape[0],1,1,1)*x_0+ \
C:\deps\pytorch_dlprim\Diffusion\model.py:64: UserWarning: The operator 'aten::gather.out' is not currently supported on the ocl backend. Please open an issue at for requesting support https://github.com/artyom-beilis/pytorch_dlprim/issues (Triggered internally at C:\deps\pytorch_dlprim\src\tensor_ops.cpp:313.)
  self.sqrt_one_minus_alphas_cumprod.gather(-1,t).reshape(x_0.shape[0],1,1,1)*noise
C:\venv\torch\lib\site-packages\torch\nn\functional.py:2210: UserWarning: The operator 'aten::index_select' is not currently supported on the ocl backend. Please open an issue at for requesting support https://github.com/artyom-beilis/pytorch_dlprim/issues (Triggered internally at C:\deps\pytorch_dlprim\src\tensor_ops.cpp:313.)
  return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
C:\venv\torch\lib\site-packages\torch\nn\functional.py:3950: UserWarning: The operator 'aten::upsample_bilinear2d.out' is not currently supported on the ocl backend. Please open an issue at for requesting support https://github.com/artyom-beilis/pytorch_dlprim/issues (Triggered internally at C:\deps\pytorch_dlprim\src\tensor_ops.cpp:313.)
  return torch._C._nn.upsample_bilinear2d(input, output_size, align_corners, scale_factors)
C:\venv\torch\lib\site-packages\torch\autograd\__init__.py:197: UserWarning: The operator 'aten::upsample_bilinear2d_backward.grad_input' is not currently supported on the ocl backend. Please open an issue at for requesting support https://github.com/artyom-beilis/pytorch_dlprim/issues (Triggered internally at C:\deps\pytorch_dlprim\src\tensor_ops.cpp:313.)
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
C:\venv\torch\lib\site-packages\torch\autograd\__init__.py:197: UserWarning: The operator 'aten::embedding_dense_backward' is not currently supported on the ocl backend. Please open an issue at for requesting support https://github.com/artyom-beilis/pytorch_dlprim/issues (Triggered internally at C:\deps\pytorch_dlprim\src\tensor_ops.cpp:313.)
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Epoch[1/100], Step[0/469], loss:1.13105, lr:0.00004
Epoch[1/100], Step[10/469], loss:1.10539, lr:0.00004
Epoch[1/100], Step[20/469], loss:1.08877, lr:0.00004
Epoch[1/100], Step[30/469], loss:1.06681, lr:0.00004
Epoch[1/100], Step[40/469], loss:1.04868, lr:0.00004
Epoch[1/100], Step[50/469], loss:1.03398, lr:0.00004
Epoch[1/100], Step[60/469], loss:1.02504, lr:0.00004
Epoch[1/100], Step[70/469], loss:1.01928, lr:0.00004
Epoch[1/100], Step[80/469], loss:0.99907, lr:0.00004
Epoch[1/100], Step[90/469], loss:0.98367, lr:0.00004
Epoch[1/100], Step[100/469], loss:0.96129, lr:0.00004
Epoch[1/100], Step[110/469], loss:0.95355, lr:0.00004
Epoch[1/100], Step[120/469], loss:0.94075, lr:0.00004
Epoch[1/100], Step[130/469], loss:0.92182, lr:0.00004
Epoch[1/100], Step[140/469], loss:0.89627, lr:0.00004
Epoch[1/100], Step[150/469], loss:0.89367, lr:0.00004
Epoch[1/100], Step[160/469], loss:0.86775, lr:0.00004
Epoch[1/100], Step[170/469], loss:0.86031, lr:0.00004
Epoch[1/100], Step[180/469], loss:0.83219, lr:0.00004
Epoch[1/100], Step[190/469], loss:0.81324, lr:0.00004
Epoch[1/100], Step[200/469], loss:0.79009, lr:0.00004
Epoch[1/100], Step[210/469], loss:0.78913, lr:0.00004
Epoch[1/100], Step[220/469], loss:0.76493, lr:0.00004
Epoch[1/100], Step[230/469], loss:0.75102, lr:0.00004
Epoch[1/100], Step[240/469], loss:0.74663, lr:0.00004
Epoch[1/100], Step[250/469], loss:0.71013, lr:0.00004
Epoch[1/100], Step[260/469], loss:0.70468, lr:0.00004
Epoch[1/100], Step[270/469], loss:0.69016, lr:0.00004
Epoch[1/100], Step[280/469], loss:0.69655, lr:0.00004
Epoch[1/100], Step[290/469], loss:0.67101, lr:0.00004
Epoch[1/100], Step[300/469], loss:0.68522, lr:0.00004
Epoch[1/100], Step[310/469], loss:0.65023, lr:0.00004
Epoch[1/100], Step[320/469], loss:0.65304, lr:0.00004
Epoch[1/100], Step[330/469], loss:0.64394, lr:0.00004
Epoch[1/100], Step[340/469], loss:0.62623, lr:0.00004
Epoch[1/100], Step[350/469], loss:0.61216, lr:0.00004
Epoch[1/100], Step[360/469], loss:0.61336, lr:0.00004
Epoch[1/100], Step[370/469], loss:0.59924, lr:0.00004
Epoch[1/100], Step[380/469], loss:0.58977, lr:0.00004
Epoch[1/100], Step[390/469], loss:0.59183, lr:0.00004
Epoch[1/100], Step[400/469], loss:0.57439, lr:0.00004
Epoch[1/100], Step[410/469], loss:0.56940, lr:0.00004
Epoch[1/100], Step[420/469], loss:0.56418, lr:0.00004
Epoch[1/100], Step[430/469], loss:0.55326, lr:0.00004
Epoch[1/100], Step[440/469], loss:0.55371, lr:0.00004
Epoch[1/100], Step[450/469], loss:0.53270, lr:0.00004
Epoch[1/100], Step[460/469], loss:0.53683, lr:0.00004
Epoch 1 completed in 405.32 seconds
Sampling:   0%|          | 0/1000 [00:00<?, ?it/s]
C:\deps\pytorch_dlprim\Diffusion\model.py:99: UserWarning: The operator 'aten::gather.out' is not currently supported on the ocl backend. Please open an issue at for requesting support https://github.com/artyom-beilis/pytorch_dlprim/issues (Triggered internally at C:\deps\pytorch_dlprim\src\tensor_ops.cpp:313.)
  alpha_t=self.alphas.gather(-1,t).reshape(x_t.shape[0],1,1,1)
C:\deps\pytorch_dlprim\Diffusion\model.py:100: UserWarning: The operator 'aten::gather.out' is not currently supported on the ocl backend. Please open an issue at for requesting support https://github.com/artyom-beilis/pytorch_dlprim/issues (Triggered internally at C:\deps\pytorch_dlprim\src\tensor_ops.cpp:313.)
  alpha_t_cumprod=self.alphas_cumprod.gather(-1,t).reshape(x_t.shape[0],1,1,1)
C:\deps\pytorch_dlprim\Diffusion\model.py:101: UserWarning: The operator 'aten::gather.out' is not currently supported on the ocl backend. Please open an issue at for requesting support https://github.com/artyom-beilis/pytorch_dlprim/issues (Triggered internally at C:\deps\pytorch_dlprim\src\tensor_ops.cpp:313.)
  beta_t=self.betas.gather(-1,t).reshape(x_t.shape[0],1,1,1)
1 error generated. Failed Program Code:

=========================

#define PARAMS PARAM_INPUT(long,0) PARAM_OUTPUT(long,0)
#define PREPARE_LOAD_INPUT_ALL PREPARE_LOAD_INPUT(long,0) ;\
typedef long typeof_x0;\
typedef long typeof_y0;\

#define REDUCE_INIT_ALL REDUCE_INIT(long,0) ;\
reduce_y0 = LONG;
#define LOAD_INPUT_ALL LOAD_INPUT(0);\

#define LOAD_REDUCE_ALL LOAD_REDUCE(0);\

#define SAVE_REDUCE_ALL SAVE_REDUCE(0);\

#define LOAD_REDUCED_SAVE_GLOBAL_ALL LOAD_REDUCED_SAVE_GLOBAL(0);\

#define REDUCE reduce_y0 = y0 < reduce_y0 ? y0 : reduce_y0;
#define CALC y0=x0;

///////////////////////////////////////////////////////////////////////////////
///
/// Copyright (c) 2021-2022 Artyom Beilis artyomtnk@yahoo.com
///
/// MIT License, see LICENSE.TXT
///
///////////////////////////////////////////////////////////////////////////////

#ifndef REDUCE_DIMS
#error "REDUCE_DIMS must be defined"
#endif

#ifndef DIMS
#error "DIMS must be defined"
#endif

///////////////////////////////////////////////////////////////////////////////
///
/// Copyright (c) 2021-2022 Artyom Beilis artyomtnk@yahoo.com
///
/// MIT License, see LICENSE.TXT
///
///////////////////////////////////////////////////////////////////////////////

typedef struct Shape {
    ulong s[DIMS];
} Shape;

inline Shape get_pos_broadcast(Shape limits)
{
    Shape r;
#if DIMS <= 1
    r.s[0] = get_global_id(0);
#elif DIMS == 2
    r.s[0] = get_global_id(1);
    r.s[1] = get_global_id(0);
#elif DIMS == 3
    r.s[0] = get_global_id(2);
    r.s[1] = get_global_id(1);
    r.s[2] = get_global_id(0);
#elif DIMS == 4
    r.s[0] = get_global_id(2);
    r.s[1] = get_global_id(1);
    r.s[2] = get_global_id(0) / limits.s[3];
    r.s[3] = get_global_id(0) % limits.s[3];
#elif DIMS == 5
    r.s[0] = get_global_id(2);
    r.s[1] = get_global_id(1) / limits.s[2];
    r.s[2] = get_global_id(1) % limits.s[2];
    r.s[3] = get_global_id(0) / limits.s[4];
    r.s[4] = get_global_id(0) % limits.s[4];
#elif DIMS == 6
    r.s[0] = get_global_id(2) / limits.s[1];
    r.s[1] = get_global_id(2) % limits.s[1];
    r.s[2] = get_global_id(1) / limits.s[3];
    r.s[3] = get_global_id(1) % limits.s[3];
    r.s[4] = get_global_id(0) / limits.s[5];
    r.s[5] = get_global_id(0) % limits.s[5];
#elif DIMS == 7
    r.s[0] = get_global_id(2) / limits.s[1];
    r.s[1] = get_global_id(2) % limits.s[1];
    r.s[2] = get_global_id(1) / limits.s[3];
    r.s[3] = get_global_id(1) % limits.s[3];
    r.s[4] = get_global_id(0) / (limits.s[5]*limits.s[6]);
    ulong s56 = get_global_id(0) % (limits.s[5]*limits.s[6]);
    r.s[5] = s56 / limits.s[6];
    r.s[6] = s56 % limits.s[6];
#elif DIMS == 8
    r.s[0] = get_global_id(2) / limits.s[1];
    r.s[1] = get_global_id(2) % limits.s[1];

    r.s[2] = get_global_id(1) / (limits.s[3]*limits.s[4]);
    ulong s34 = get_global_id(1) % (limits.s[3]*limits.s[4]);
    r.s[3] = s34 / limits.s[4];
    r.s[4] = s34 % limits.s[4];

    r.s[5] = get_global_id(0) / (limits.s[6]*limits.s[7]);
    ulong s67 = get_global_id(0) % (limits.s[6]*limits.s[7]);
    r.s[6] = s67 / limits.s[7];
    r.s[7] = s67 % limits.s[7];
#else
#error "Unsupported dim"
#endif
    return r;
}

#define NORMAL_DIMS (DIMS - REDUCE_DIMS)

#if REDUCE_DIMS > DIMS
#error "REDUCE_DIMS must be <= DIMS"
#endif

#if REDUCE_DIMS < 0
#error "Need at least 1 dim for reduction"
#endif

#ifndef SMALL_REDUCTION
#define SMALL_REDUCTION 0
#endif

#ifndef TWO_STAGE_REDUCTION
#define TWO_STAGE_REDUCTION 0
#endif

inline ulong get_base_offset(Shape s,Shape strides,ulong offset)
{
    ulong r = offset;
    #pragma unroll
    for(int i=REDUCE_DIMS;i<DIMS;i++) {
        r+= s.s[i]*strides.s[i];
    }
    return r;
}

inline ulong get_reduce_offset(Shape s,Shape strides)
{
    ulong r = 0;
    #pragma unroll
    for(int i=0;i<REDUCE_DIMS;i++) {
        r+= s.s[i]*strides.s[i];
    }
    return r;
}

void next_pos(Shape limits,Shape *pos)
{
#if REDUCE_DIMS == 0
    /// nothing
#elif REDUCE_DIMS == 1
    pos->s[0] ++;
#elif REDUCE_DIMS == 2
    pos->s[1]++;
    if(pos->s[1] == limits.s[1]) {
        pos->s[1] = 0;
        pos->s[0] ++;
    }
#elif REDUCE_DIMS == 3
    pos->s[2]++;
    if(pos->s[2] == limits.s[2]) {
        pos->s[2] = 0;
        pos->s[1] ++;
        if(pos->s[1] == limits.s[1]) {
            pos->s[1] = 0;
            pos->s[0] ++;
        }
    }
#else
// for total dims limit = 5 shouldn't be more than 3 reduction dims otherwise they will be shrinked
#error "Too many reduction dims"
#endif
}

#if REDUCE_DIMS >= 1
inline Shape get_pos(Shape limits,ulong reduce_item)
{
    Shape r;
#if REDUCE_DIMS == 1
    r.s[0] = reduce_item;
#elif REDUCE_DIMS == 2
    r.s[0] = reduce_item / limits.s[1];
    r.s[1] = reduce_item % limits.s[1];
#elif REDUCE_DIMS == 3
    r.s[2] = reduce_item % limits.s[2];
    ulong ri2 = reduce_item / limits.s[2];
    r.s[1] = ri2 % limits.s[1];
    r.s[0] = ri2 / limits.s[1];
#else
// for total dims limit = 5 shouldn't be more than 3 reduction dims otherwise they will be shrinked
#error "Too many reduction dims"
#endif

#if NORMAL_DIMS == 0
    // nothing
#elif NORMAL_DIMS == 1
    r.s[REDUCE_DIMS + 0] = get_global_id(1);
#elif NORMAL_DIMS == 2
    r.s[REDUCE_DIMS + 0] = get_global_id(2);
    r.s[REDUCE_DIMS + 1] = get_global_id(1);
#elif NORMAL_DIMS == 3
    r.s[REDUCE_DIMS + 0] = get_global_id(2) / limits.s[REDUCE_DIMS+1];
    r.s[REDUCE_DIMS + 1] = get_global_id(2) % limits.s[REDUCE_DIMS+1];
    r.s[REDUCE_DIMS + 2] = get_global_id(1);
#elif NORMAL_DIMS == 4
    r.s[REDUCE_DIMS + 0] = get_global_id(2) / limits.s[REDUCE_DIMS+1];
    r.s[REDUCE_DIMS + 1] = get_global_id(2) % limits.s[REDUCE_DIMS+1];
    r.s[REDUCE_DIMS + 2] = get_global_id(1) / limits.s[REDUCE_DIMS+3];
    r.s[REDUCE_DIMS + 3] = get_global_id(1) % limits.s[REDUCE_DIMS+3];
#else
#error "Unsupported dim"
#endif
    return r;
}
#endif

inline bool valid_save_pos(Shape pos,Shape limits)
{
    #pragma unroll
    for(int i=REDUCE_DIMS;i<DIMS;i++)
        if(pos.s[i] >= limits.s[i])
            return 0;
    return 1;
}

inline bool valid_pos(Shape pos,Shape limits)
{
    #pragma unroll
    for(int i=0;i<DIMS;i++)
        if(pos.s[i] >= limits.s[i])
            return 0;
    return 1;
}

#define PARAM_INPUT(type,I) ,__global type const *px##I,ulong px##I##_offset,Shape xstrides##I

#if TWO_STAGE_REDUCTION == 1
#define PARAM_OUTPUT(type,I) ,__global type *py##I,ulong py##I##_offset,Shape ystrides##I
#else
#define PARAM_OUTPUT(type,I) ,__global type *py##I,ulong py##I##_offset,Shape ystrides##I,type alpha##I,type beta##I
#endif

#define PAPAM_WEIGHT(type,I) ,type w##I

#define PREPARE_LOAD_INPUT(type,I) \
    ulong input_offset_##I = get_base_offset(index,xstrides##I,px##I##_offset); \
    type x##I;

#define LOAD_INPUT(I) x##I = px##I[input_offset_##I + get_reduce_offset(index,xstrides##I)];

#define SAVE_OUTPUT(I) py##I[get_base_offset(index,ystrides##I,py##I##_offset)] = reduce_y##I;

#define my_get_local_wg_id() ((get_local_id(2) * get_local_size(1) * get_local_size(0)) + (get_local_id(1) * get_local_size(0)) + get_local_id(0))

#if SMALL_REDUCTION == 1
#define REDUCE_INIT(type,I) type reduce_y##I,y##I;
#else
#define REDUCE_INIT(type,I) \
    __local type my_reduce_##I[WG_SIZE]; \
    type reduce_y##I,y##I;
#endif

#define SAVE_REDUCE(I) my_reduce_##I[lid] = reduce_y##I;

#define LOAD_REDUCE(I) reduce_y##I = my_reduce_##I[lid]; y##I = my_reduce_##I[nxt];

#if SMALL_REDUCTION == 1
#define LOAD_REDUCED_SAVE_GLOBAL(I) \
    do { \
        py##I += get_base_offset(index,ystrides##I,py##I##_offset); \
        reduce_y##I *= alpha##I; \
        if(beta##I) \
            *py##I = beta##I * *py##I + reduce_y##I; \
        else \
            *py##I = reduce_y##I; \
    }while(0)
#elif TWO_STAGE_REDUCTION == 0
#define LOAD_REDUCED_SAVE_GLOBAL(I) \
    do { \
        y##I = alpha##I * my_reduce_##I[0]; \
        py##I += get_base_offset(index,ystrides##I,py##I##_offset); \
        if(beta##I) \
            *py##I = beta##I * *py##I + y##I; \
        else \
            *py##I = y##I; \
    } while(0)
#else //TWO_STAGE_REDUCTION == 1
#define LOAD_REDUCED_SAVE_GLOBAL(I) \
    do { \
        py##I += py##I##_offset + get_group_id(0); \
        py##I += reduce_stride * get_base_offset(index,ystrides##I,0); \
        *py##I = my_reduce_##I[0]; \
    } while(0)
#endif

__kernel
#if SMALL_REDUCTION == 0
__attribute__((reqd_work_group_size(WG_SIZE,1,1)))
#endif
void exec(Shape limit PARAMS
#if TWO_STAGE_REDUCTION == 1
               ,ulong reduce_stride
#endif
               )
{
#if REDUCE_DIMS == 0
    #if ITEMS_PER_WI > 1
    #error "Invalid Items per wi size"
    #endif
    ulong reduce_item = 0;
    Shape index0 = get_pos_broadcast(limit);
#else
    ulong reduce_item = get_global_id(0) * ITEMS_PER_WI;
    Shape index0 = get_pos(limit,reduce_item);
#endif
    Shape index = index0;
    PREPARE_LOAD_INPUT_ALL
    REDUCE_INIT_ALL

    #pragma unroll(8)
    for(int item=0;item < ITEMS_PER_WI;item++) {
        if(valid_pos(index,limit)) {
            LOAD_INPUT_ALL
            CALC
            REDUCE
        }
#if ITEMS_PER_WI > 1
        next_pos(limit,&index);
        reduce_item ++;
#endif
    }

#if SMALL_REDUCTION == 0
    int lid = get_local_id(0);
    SAVE_REDUCE_ALL
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i= WG_SIZE / 2;i>0; i>>= 1) {
        if(lid < i) {
            int nxt = lid+i;
            LOAD_REDUCE_ALL
            REDUCE
            SAVE_REDUCE_ALL
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0) {
        if(valid_save_pos(index0,limit)) {
            LOAD_REDUCED_SAVE_GLOBAL_ALL
        }
    }
#else
    if(valid_save_pos(index0,limit)) {
        LOAD_REDUCED_SAVE_GLOBAL_ALL
    }
#endif
}

=========================

Sampling:   0%|          | 0/1000 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "C:\deps\pytorch_dlprim\Diffusion\train_mnist.py", line 148, in <module>
    main(args)
  File "C:\deps\pytorch_dlprim\Diffusion\train_mnist.py", line 143, in main
    samples = model_ema.module.sampling(args.n_samples, clipped_reverse_diffusion=not args.no_clip, device=device)
  File "C:\venv\torch\lib\site-packages\torch\autograd\grad_mode.py", line 27, in decorate_context
    return func(*args, **kwargs)
  File "C:\deps\pytorch_dlprim\Diffusion\model.py", line 45, in sampling
    x_t=self._reverse_diffusion_with_clip(x_t,t,noise)
  File "C:\venv\torch\lib\site-packages\torch\autograd\grad_mode.py", line 27, in decorate_context
    return func(*args, **kwargs)
  File "C:\deps\pytorch_dlprim\Diffusion\model.py", line 106, in _reverse_diffusion_with_clip
    if t.min()>0:
RuntimeError: Failed to build program source pointwise_broadcast_reduce with parameters -cl-std=CL2.0 -DREDUCE_DIMS=1 -DSMALL_REDUCTION=1 -DDIMS=1 -DWG_SIZE=0 -DITEMS_PER_WI=36 -DTWO_STAGE_REDUCTION=0 log:
For device: NVIDIA GeForce RTX 2060

:322:5: error: use of undeclared identifier 'LONG'
    REDUCE_INIT_ALL
    ^
:7:13: note: expanded from macro 'REDUCE_INIT_ALL'
    reduce_y0 = LONG;
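If I read the log correctly, the failing build is the int64 min-reduction that `t.min()` triggers in `_reverse_diffusion_with_clip` (model.py line 106). In the dumped source, `REDUCE` keeps a running minimum (`reduce_y0 = y0 < reduce_y0 ? y0 : reduce_y0;`), so `reduce_y0` presumably needs to be initialized with the largest representable `long` value, but the generated `REDUCE_INIT_ALL` header emits the bare token `LONG`, which OpenCL C does not define. On my side I can apparently avoid compiling that kernel by doing the comparison on a CPU copy of the timestep tensor; here is an untested sketch, where `t` stands in for the int64 timestep batch from model.py:

```python
import torch

# Untested workaround sketch: reduce on a CPU copy so the broken int64
# min-reduction kernel is never compiled for the ocl device.
t = torch.randint(0, 1000, (16,))  # stand-in; in model.py t lives on ocl:0
if t.cpu().min() > 0:              # was: if t.min() > 0:
    print("all timesteps are positive")
```

That only sidesteps the sampling loop, though; the kernel generation itself still looks broken for long/int64 reductions.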

Do you know how to fix this?

Here is my code: MNIST Diffusion.zip

Best wishes, Haozhe

@tangjinchuan