RuntimeError: Failed to build program source pooling on mnist.py

I've installed everything following the instructions, using pytorch 1.13 cpu version and I get the following error:

python mnist.py --device ocl:0
/home/pvr/gaussian-splatting/env/lib/pytpython mnist.py --device ocl:0
/home/pvr/gaussian-splatting/env/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension:
  warn(f"Failed to load image Python extension: {e}")
Using device: privateuseone:0
Accessing device #0:PowerVR Rogue GX6250 on PowerVR
OCL 189884: Compilation error
OCL     37: IMG_clBuildProgram: Failed to build program
Failed Program Code:
=========================
///////////////////////////////////////////////////////////////////////////////
///
/// Copyright (c) 2021-2022 Artyom Beilis <artyomtnk@yahoo.com>
///
/// MIT License, see LICENSE.TXT
///
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///
/// Copyright (c) 2021-2022 Artyom Beilis <artyomtnk@yahoo.com>
///
/// MIT License, see LICENSE.TXT
///
///////////////////////////////////////////////////////////////////////////////
#define ACTIVATION_IDENTITY 0
#define ACTIVATION_RELU     1
#define ACTIVATION_TANH     2
#define ACTIVATION_SIGMOID  3
#define ACTIVATION_RELU6    4

#ifndef dtype
#define dtype float
#define dtype2 float2
#define dtype4 float4
#define DTYPE_MAX FLT_MAX
#define DTYPE_MIN FLT_MIN
#endif

#ifndef ACTIVATION
#define ACTIVATION ACTIVATION_IDENTITY
#endif

#if ACTIVATION == ACTIVATION_IDENTITY
#   define ACTIVATION_F(x) (x)
#   define ACTIVATION_FINV(y,dy) (dy)
#   define ACTIVATION_NAME identity
#elif ACTIVATION == ACTIVATION_RELU
#   define ACTIVATION_F(x) (max((x),(dtype)(0)))
#   define ACTIVATION_FINV(y,dy)  ((y>0)?dy:0)
#   define ACTIVATION_NAME relu
#elif ACTIVATION == ACTIVATION_TANH
#   define ACTIVATION_F(x) (tanh((x)))
#   define ACTIVATION_FINV(y,dy) ((1-(y)*(y))*(dy))
#   define ACTIVATION_NAME tanh
#elif ACTIVATION == ACTIVATION_SIGMOID
#   define ACTIVATION_F(x) ((dtype)(1) / ((dtype)(1) + exp(-(x))))
#   define ACTIVATION_FINV(y,dy) ((y)*(1-(y))*(dy))
#   define ACTIVATION_NAME sigmoid
#elif ACTIVATION == ACTIVATION_RELU6
#   define ACTIVATION_F(x) (min(max((x),(dtype)(0)),(dtype)(6)))
#   define ACTIVATION_FINV(y,dy)  ((0<y && y<6)?dy:0)
#   define ACTIVATION_NAME relu6
#else
#   error "Unknown activation"
#endif

///////////////////////////////////////////////////////////////////////////////
///
/// Copyright (c) 2021-2022 Artyom Beilis <artyomtnk@yahoo.com>
///
/// MIT License, see LICENSE.TXT
///
///////////////////////////////////////////////////////////////////////////////
void atomic_addf(__global volatile float *ptr,float v)
{
#if defined(cl_intel_subgroups)
    __global atomic_int *p = (__global atomic_int *)(ptr);
    int prev,newv;
    do {
        prev = atomic_load(p);
        newv = as_int(as_float(prev) + v);
    } while(! atomic_compare_exchange_weak(p,&prev,newv));
#else
    float oldv = *ptr;
    for(;;) {
        float newv = oldv + v;
        int prev = atomic_cmpxchg((__global volatile int *)(ptr),as_int(oldv),as_int(newv));
        if(prev == as_int(oldv))
            return;
        oldv = as_float(prev);
    }
#endif
}

#ifndef itype
#define itype int
#endif

#ifndef POOL_MODE
#define POOL_MODE 0
#endif

#ifndef POOL_H
#define POOL_H 1
#endif
#ifndef POOL_W
#define POOL_W 1
#endif

#ifndef STRIDE_H
#define STRIDE_H 1
#endif

#ifndef STRIDE_W
#define STRIDE_W 1
#endif

#ifndef PAD_H
#define PAD_H 0
#endif

#ifndef PAD_W
#define PAD_W 0
#endif

#ifndef COUNT_INCLUDE_PAD
#define COUNT_INCLUDE_PAD 0
#endif

#if POOL_MODE == 0
# define START_VAL -DTYPE_MAX
# define REDUCE(a,b) max((a),(b))
# define NORMALIZE_FULL(x) (x)
# define NORMALIZE_PARTIAL(x,dr,dc,vdr,vdc) (x)
#elif POOL_MODE == 1
# define START_VAL 0.0f
# define REDUCE(a,b) ((a) + (b))
# define NORMALIZE_FULL(x) ((x) * (1.0f / (POOL_H * POOL_W)))
# if COUNT_INCLUDE_PAD == 0
#  define NORMALIZE_PARTIAL(x,dr,dc,vdr,vdc) ((x) * (1.0f /((dr)*(dc))))
# else
#  define NORMALIZE_PARTIAL(x,dr,dc,vdr,vdc) ((x) * (1.0f /((vdr)*(vdc))))
# endif
#else
#error "Invalid mode"
#endif

#ifndef WG_SIZE
#define WG_SIZE 8
#endif

#if POOL_MODE == 0 && EXPORT_INDEX == 1
#define INDEX_MAX_SRC 1
#else
#define INDEX_MAX_SRC 0
#endif

__kernel
__attribute__((reqd_work_group_size(WG_SIZE,WG_SIZE,1)))
void pooling(int BC,int inp_H,int inp_W,int out_H,int out_W,
             __global const dtype *src,ulong src_offset,
             __global dtype *tgt,ulong tgt_offset
#if INDEX_MAX_SRC == 1
             ,__global itype *indx,ulong indx_offset
#endif

             )
{
    int or = get_global_id(0);
    int oc = get_global_id(1);
    int bc = get_global_id(2);
    if(bc >= BC || or >= out_H || oc >= out_W)
        return;

    int row0 = or * STRIDE_H - PAD_H;
    int col0 = oc * STRIDE_W - PAD_W;
    int row1 = row0 + POOL_H;
    int col1 = col0 + POOL_W;

    tgt += tgt_offset + bc * out_H * out_W;
    src += src_offset + bc * inp_H * inp_W;

    dtype val = START_VAL;
    #if INDEX_MAX_SRC == 1
    itype index = -1;
    indx += indx_offset + bc * out_H * out_W;
    #endif

    if(row0 >= 0 && col0 >= 0 && row1 <= inp_H && col1 <= inp_W) {
        src += row0 * inp_W + col0;
        #pragma unroll
        for(int dr=0;dr<POOL_H;dr++) {
            #pragma unroll
            for(int dc = 0;dc < POOL_W; dc++) {
                #if INDEX_MAX_SRC == 1
                dtype tmp = src[dr * inp_W + dc];
                if(tmp > val) {
                    index = (row0 + dr) * inp_W + col0 + dc;
                    val = tmp;
                }
                #else
                val = REDUCE(val,src[dr * inp_W + dc]);
                #endif
            }
        }
        val = NORMALIZE_FULL(val);
    }
    else {
        #pragma unroll
        for(int r=row0;r<row1;r++) {
            #pragma unroll
            for(int c=col0;c<col1;c++) {
                dtype loaded_val = (r >= 0 && r<inp_H && c>=0 && c<inp_W) ? src[r*inp_W + c] : START_VAL;
                #if INDEX_MAX_SRC == 1
                if(loaded_val > val) {
                    index = r*inp_W + c;
                    val = loaded_val;
                }
                #else
                val = REDUCE(val,loaded_val);
                #endif
            }
        }
        val = NORMALIZE_PARTIAL(val, min(row1,inp_H) - max(row0,0),
                                     min(col1,inp_W) - max(col0,0),
                                     min(row1,inp_H + PAD_H) - max(-PAD_H,row0),
                                     min(col1,inp_W + PAD_W) - max(-PAD_W,col0)
                                     );
    }
    tgt[or * out_W + oc] = val;
    #if INDEX_MAX_SRC == 1
    indx[or * out_W + oc] = index;
    #endif
}

void save_dx(__global dtype *ptr,dtype value)
{
    #if POOL_W <= STRIDE_W && POOL_H <= STRIDE_H
    *ptr = value + *ptr;
    #else
    atomic_addf(ptr,value);
    #endif

}

#if INDEX_MAX_SRC == 1
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,WG_SIZE,1)))
void pooling_bw(int BC,int inp_H,int inp_W,int out_H,int out_W,
             __global dtype *src,ulong src_offset,
             __global const dtype *tgt,ulong tgt_offset,
             __global const itype *indx,ulong indx_offset)
{
    int or = get_global_id(0);
    int oc = get_global_id(1);
    int bc = get_global_id(2);

    if(bc >= BC || or >= out_H || oc >= out_W)
        return;

    int row0 = or * STRIDE_H - PAD_H;
    int col0 = oc * STRIDE_W - PAD_W;
    int row1 = row0 + POOL_H;
    int col1 = col0 + POOL_W;

    tgt  += tgt_offset + bc * out_H * out_W;
    src  += src_offset + bc * inp_H * inp_W;
    indx += indx_offset + bc * out_H * out_W;

    dtype dy  =  tgt[or * out_W + oc];
    itype pos = indx[or * out_W + oc];

    save_dx(src+pos,dy);
}

#elif POOL_MODE == 0 // max pooling
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,WG_SIZE,1)))
void pooling_bw(int BC,int inp_H,int inp_W,int out_H,int out_W,
             __global const dtype *src,ulong src_offset,
             __global const dtype *tgt,ulong tgt_offset,
             __global dtype *dx,ulong dx_offset)
{
    int or = get_global_id(0);
    int oc = get_global_id(1);
    int bc = get_global_id(2);

    if(bc >= BC || or >= out_H || oc >= out_W)
        return;

    int row0 = or * STRIDE_H - PAD_H;
    int col0 = oc * STRIDE_W - PAD_W;
    int row1 = row0 + POOL_H;
    int col1 = col0 + POOL_W;

    tgt += tgt_offset + bc * out_H * out_W;
    src += src_offset + bc * inp_H * inp_W;
    dx  += dx_offset  + bc * inp_H * inp_W;

    dtype val = START_VAL;
    itype index = -1;

    if(row0 >= 0 && col0 >= 0 && row1 <= inp_H && col1 <= inp_W) {
        src += row0 * inp_W + col0;
        #pragma unroll
        for(int dr=0;dr<POOL_H;dr++) {
            #pragma unroll
            for(int dc = 0;dc < POOL_W; dc++) {
                dtype tmp = src[dr * inp_W + dc];
                if(tmp > val) {
                    index = (row0 + dr) * inp_W + col0 + dc;
                    val = tmp;
                }
            }
        }
    }
    else {
        #pragma unroll
        for(int r=row0;r<row1;r++) {
            #pragma unroll
            for(int c=col0;c<col1;c++) {
                dtype loaded_val = (r >= 0 && r<inp_H && c>=0 && c<inp_W) ? src[r*inp_W + c] : START_VAL;
                if(loaded_val > val) {
                    index = r*inp_W + c;
                    val = loaded_val;
                }
            }
        }
    }

    dtype dy = tgt[or * out_W + oc];

    save_dx(dx+index,dy);
}

#elif POOL_MODE == 1
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,WG_SIZE,1)))
void pooling_bw(int BC,int inp_H,int inp_W,int out_H,int out_W,
             __global const dtype *tgt,ulong tgt_offset,
             __global dtype *dx,ulong dx_offset)
{
    int or = get_global_id(0);
    int oc = get_global_id(1);
    int bc = get_global_id(2);
    if(bc >= BC || or >= out_H || oc >= out_W)
        return;

    int row0 = or * STRIDE_H - PAD_H;
    int col0 = oc * STRIDE_W - PAD_W;
    int row1 = row0 + POOL_H;
    int col1 = col0 + POOL_W;

    tgt += tgt_offset + bc * out_H * out_W;
    dx  += dx_offset  + bc * inp_H * inp_W;

    dtype dy = tgt[or * out_W + oc];
    if(row0 >= 0 && col0 >= 0 && row1 <= inp_H && col1 <= inp_W) {
        dtype dy_norm = NORMALIZE_FULL(dy);
        dx += row0 * inp_W + col0;
        #pragma unroll
        for(int dr=0;dr<POOL_H;dr++) {
            #pragma unroll
            for(int dc = 0;dc < POOL_W; dc++) {
                save_dx(dx + dr * inp_W + dc,dy_norm);
            }
        }
    }
    else {
        dtype dy_norm = NORMALIZE_PARTIAL(dy, min(row1,inp_H)-max(row0,0),
                                              min(col1,inp_W) - max(col0,0),
                                              min(row1,inp_H + PAD_H) - max(-PAD_H,row0),
                                              min(col1,inp_W + PAD_W) - max(-PAD_W,col0)
                                            );
        #pragma unroll
        for(int r=row0;r<row1;r++) {
            #pragma unroll
            for(int c=col0;c<col1;c++) {
                if(r >= 0 && r<inp_H && c>=0 && c<inp_W)
                    save_dx(dx + r*inp_W + c,dy_norm);
            }
        }
    }
}
#else
#error "Invalid mode"
#endif

=========================

Traceback (most recent call last):
  File "mnist.py", line 162, in <module>
    main()
  File "mnist.py", line 153, in main
    train(args, model, device, train_loader, optimizer, epoch)
  File "mnist.py", line 53, in train
    output = model(data)
  File "/home/pvr/gaussian-splatting/env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "mnist.py", line 33, in forward
    x = F.max_pool2d(x, 2)
  File "/home/pvr/gaussian-splatting/env/lib/python3.8/site-packages/torch/_jit_internal.py", line 485, in fn
    return if_false(*args, **kwargs)
  File "/home/pvr/gaussian-splatting/env/lib/python3.8/site-packages/torch/nn/functional.py", line 782, in _max_pool2d
    return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
RuntimeError: Failed to build program source pooling with parameters -cl-std=CL2.0 -DWG_SIZE=8 -DPOOL_H=2 -DPOOL_W=2 -DSTRIDE_H=2 -DSTRIDE_W=2 -DPAD_H=0 -DPAD_W=0 -DPOOL_MODE=0 -DCOUNT_INCLUDE_PAD=0 log:
For device: PowerVR Rogue GX6250
BuildGroup_0:163:9: error: expected identifier or '('
    int or = get_global_id(0);
        ^
BuildGroup_0:166:20: error: expected expression
    if(bc >= BC || or >= out_H || oc >= out_W)
                   ^
BuildGroup_0:166:23: error: expected expression
    if(bc >= BC || or >= out_H || oc >= out_W)
                      ^
BuildGroup_0:169:16: error: expected expression
    int row0 = or * STRIDE_H - PAD_H;
               ^
BuildGroup_0:169:19: error: indirection requires pointer operand ('int' invalid)
    int row0 = or * STRIDE_H - PAD_H;
                  ^ ~~~~~~~~
BuildGroup_0:224:9: error: expected expression
    tgt[or * out_W + oc] = val;
        ^
BuildGroup_0:224:12: error: indirection requires pointer operand ('int' invalid)
    tgt[or * out_W + oc] = val;
           ^ ~~~~~
BuildGroup_0:278:9: error: expected identifier or '('
    int or = get_global_id(0);
        ^
BuildGroup_0:282:20: error: expected expression
    if(bc >= BC || or >= out_H || oc >= out_W)
clBuildErrorCode: -11
(204919) PVR:(Warning): OCL_CleanupDriver: Forced flush of remaining commands. Application missing explicit clFlush/clFinish/Synchronisation API call(s) [ global_data.c:1442 ]
(204919) PVR:(Warning): OCL_CleanupDriver: API refcount is not 0. Waiting for deferred threads to complete. [ global_data.c:1451 ]
(204919) PVR:(Warning): OCL_CleanupDriver: API refcount is still not 0 after deferred thread completion. [ global_data.c:1476 ]
(204919) PVR:(Warning): OCL_CleanupDriver: Aborting complete driver cleanup. Application has not called all corresponding clRelease* APIs [ global_data.c:1504 ]
(204919) PVR:(Warning):
        Remaining Objects - Debug App in Descending Order
                cl_event:         0
                cl_kernel:        0
                cl_program:       1
                cl_mem:           21
                cl_svm_alloc:     0
                cl_sampler:       0
                cl_command_queue: 1
                cl_context:       1 [ global_data.c:1506 ]
(204919) PVR:(Fatal): Debug assertion failed! (eErr == PVRSRV_OK) [ deferred_task.c:1088 ]
Aborted
hon3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension:
  warn(f"Failed to load image Python extension: {e}")
Using device: privateuseone:0
Accessing device #0:PowerVR Rogue GX6250 on PowerVR
OCL 189884: Compilation error
OCL     37: IMG_clBuildProgram: Failed to build program
Failed Program Code:
=========================
///////////////////////////////////////////////////////////////////////////////
///
/// Copyright (c) 2021-2022 Artyom Beilis <artyomtnk@yahoo.com>
///
/// MIT License, see LICENSE.TXT
///
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///
/// Copyright (c) 2021-2022 Artyom Beilis <artyomtnk@yahoo.com>
///
/// MIT License, see LICENSE.TXT
///
///////////////////////////////////////////////////////////////////////////////
#define ACTIVATION_IDENTITY 0
#define ACTIVATION_RELU     1
#define ACTIVATION_TANH     2
#define ACTIVATION_SIGMOID  3
#define ACTIVATION_RELU6    4

#ifndef dtype
#define dtype float
#define dtype2 float2
#define dtype4 float4
#define DTYPE_MAX FLT_MAX
#define DTYPE_MIN FLT_MIN
#endif

#ifndef ACTIVATION
#define ACTIVATION ACTIVATION_IDENTITY
#endif

#if ACTIVATION == ACTIVATION_IDENTITY
#   define ACTIVATION_F(x) (x)
#   define ACTIVATION_FINV(y,dy) (dy)
#   define ACTIVATION_NAME identity
#elif ACTIVATION == ACTIVATION_RELU
#   define ACTIVATION_F(x) (max((x),(dtype)(0)))
#   define ACTIVATION_FINV(y,dy)  ((y>0)?dy:0)
#   define ACTIVATION_NAME relu
#elif ACTIVATION == ACTIVATION_TANH
#   define ACTIVATION_F(x) (tanh((x)))
#   define ACTIVATION_FINV(y,dy) ((1-(y)*(y))*(dy))
#   define ACTIVATION_NAME tanh
#elif ACTIVATION == ACTIVATION_SIGMOID
#   define ACTIVATION_F(x) ((dtype)(1) / ((dtype)(1) + exp(-(x))))
#   define ACTIVATION_FINV(y,dy) ((y)*(1-(y))*(dy))
#   define ACTIVATION_NAME sigmoid
#elif ACTIVATION == ACTIVATION_RELU6
#   define ACTIVATION_F(x) (min(max((x),(dtype)(0)),(dtype)(6)))
#   define ACTIVATION_FINV(y,dy)  ((0<y && y<6)?dy:0)
#   define ACTIVATION_NAME relu6
#else
#   error "Unknown activation"
#endif

///////////////////////////////////////////////////////////////////////////////
///
/// Copyright (c) 2021-2022 Artyom Beilis <artyomtnk@yahoo.com>
///
/// MIT License, see LICENSE.TXT
///
///////////////////////////////////////////////////////////////////////////////
void atomic_addf(__global volatile float *ptr,float v)
{
#if defined(cl_intel_subgroups)
    __global atomic_int *p = (__global atomic_int *)(ptr);
    int prev,newv;
    do {
        prev = atomic_load(p);
        newv = as_int(as_float(prev) + v);
    } while(! atomic_compare_exchange_weak(p,&prev,newv));
#else
    float oldv = *ptr;
    for(;;) {
        float newv = oldv + v;
        int prev = atomic_cmpxchg((__global volatile int *)(ptr),as_int(oldv),as_int(newv));
        if(prev == as_int(oldv))
            return;
        oldv = as_float(prev);
    }
#endif
}

#ifndef itype
#define itype int
#endif

#ifndef POOL_MODE
#define POOL_MODE 0
#endif

#ifndef POOL_H
#define POOL_H 1
#endif
#ifndef POOL_W
#define POOL_W 1
#endif

#ifndef STRIDE_H
#define STRIDE_H 1
#endif

#ifndef STRIDE_W
#define STRIDE_W 1
#endif

#ifndef PAD_H
#define PAD_H 0
#endif

#ifndef PAD_W
#define PAD_W 0
#endif

#ifndef COUNT_INCLUDE_PAD
#define COUNT_INCLUDE_PAD 0
#endif

#if POOL_MODE == 0
# define START_VAL -DTYPE_MAX
# define REDUCE(a,b) max((a),(b))
# define NORMALIZE_FULL(x) (x)
# define NORMALIZE_PARTIAL(x,dr,dc,vdr,vdc) (x)
#elif POOL_MODE == 1
# define START_VAL 0.0f
# define REDUCE(a,b) ((a) + (b))
# define NORMALIZE_FULL(x) ((x) * (1.0f / (POOL_H * POOL_W)))
# if COUNT_INCLUDE_PAD == 0
#  define NORMALIZE_PARTIAL(x,dr,dc,vdr,vdc) ((x) * (1.0f /((dr)*(dc))))
# else
#  define NORMALIZE_PARTIAL(x,dr,dc,vdr,vdc) ((x) * (1.0f /((vdr)*(vdc))))
# endif
#else
#error "Invalid mode"
#endif

#ifndef WG_SIZE
#define WG_SIZE 8
#endif

#if POOL_MODE == 0 && EXPORT_INDEX == 1
#define INDEX_MAX_SRC 1
#else
#define INDEX_MAX_SRC 0
#endif

__kernel
__attribute__((reqd_work_group_size(WG_SIZE,WG_SIZE,1)))
void pooling(int BC,int inp_H,int inp_W,int out_H,int out_W,
             __global const dtype *src,ulong src_offset,
             __global dtype *tgt,ulong tgt_offset
#if INDEX_MAX_SRC == 1
             ,__global itype *indx,ulong indx_offset
#endif

             )
{
    int or = get_global_id(0);
    int oc = get_global_id(1);
    int bc = get_global_id(2);
    if(bc >= BC || or >= out_H || oc >= out_W)
        return;

    int row0 = or * STRIDE_H - PAD_H;
    int col0 = oc * STRIDE_W - PAD_W;
    int row1 = row0 + POOL_H;
    int col1 = col0 + POOL_W;

    tgt += tgt_offset + bc * out_H * out_W;
    src += src_offset + bc * inp_H * inp_W;

    dtype val = START_VAL;
    #if INDEX_MAX_SRC == 1
    itype index = -1;
    indx += indx_offset + bc * out_H * out_W;
    #endif

    if(row0 >= 0 && col0 >= 0 && row1 <= inp_H && col1 <= inp_W) {
        src += row0 * inp_W + col0;
        #pragma unroll
        for(int dr=0;dr<POOL_H;dr++) {
            #pragma unroll
            for(int dc = 0;dc < POOL_W; dc++) {
                #if INDEX_MAX_SRC == 1
                dtype tmp = src[dr * inp_W + dc];
                if(tmp > val) {
                    index = (row0 + dr) * inp_W + col0 + dc;
                    val = tmp;
                }
                #else
                val = REDUCE(val,src[dr * inp_W + dc]);
                #endif
            }
        }
        val = NORMALIZE_FULL(val);
    }
    else {
        #pragma unroll
        for(int r=row0;r<row1;r++) {
            #pragma unroll
            for(int c=col0;c<col1;c++) {
                dtype loaded_val = (r >= 0 && r<inp_H && c>=0 && c<inp_W) ? src[r*inp_W + c] : START_VAL;
                #if INDEX_MAX_SRC == 1
                if(loaded_val > val) {
                    index = r*inp_W + c;
                    val = loaded_val;
                }
                #else
                val = REDUCE(val,loaded_val);
                #endif
            }
        }
        val = NORMALIZE_PARTIAL(val, min(row1,inp_H) - max(row0,0),
                                     min(col1,inp_W) - max(col0,0),
                                     min(row1,inp_H + PAD_H) - max(-PAD_H,row0),
                                     min(col1,inp_W + PAD_W) - max(-PAD_W,col0)
                                     );
    }
    tgt[or * out_W + oc] = val;
    #if INDEX_MAX_SRC == 1
    indx[or * out_W + oc] = index;
    #endif
}

void save_dx(__global dtype *ptr,dtype value)
{
    #if POOL_W <= STRIDE_W && POOL_H <= STRIDE_H
    *ptr = value + *ptr;
    #else
    atomic_addf(ptr,value);
    #endif

}

#if INDEX_MAX_SRC == 1
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,WG_SIZE,1)))
void pooling_bw(int BC,int inp_H,int inp_W,int out_H,int out_W,
             __global dtype *src,ulong src_offset,
             __global const dtype *tgt,ulong tgt_offset,
             __global const itype *indx,ulong indx_offset)
{
    int or = get_global_id(0);
    int oc = get_global_id(1);
    int bc = get_global_id(2);

    if(bc >= BC || or >= out_H || oc >= out_W)
        return;

    int row0 = or * STRIDE_H - PAD_H;
    int col0 = oc * STRIDE_W - PAD_W;
    int row1 = row0 + POOL_H;
    int col1 = col0 + POOL_W;

    tgt  += tgt_offset + bc * out_H * out_W;
    src  += src_offset + bc * inp_H * inp_W;
    indx += indx_offset + bc * out_H * out_W;

    dtype dy  =  tgt[or * out_W + oc];
    itype pos = indx[or * out_W + oc];

    save_dx(src+pos,dy);
}

#elif POOL_MODE == 0 // max pooling
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,WG_SIZE,1)))
void pooling_bw(int BC,int inp_H,int inp_W,int out_H,int out_W,
             __global const dtype *src,ulong src_offset,
             __global const dtype *tgt,ulong tgt_offset,
             __global dtype *dx,ulong dx_offset)
{
    int or = get_global_id(0);
    int oc = get_global_id(1);
    int bc = get_global_id(2);

    if(bc >= BC || or >= out_H || oc >= out_W)
        return;

    int row0 = or * STRIDE_H - PAD_H;
    int col0 = oc * STRIDE_W - PAD_W;
    int row1 = row0 + POOL_H;
    int col1 = col0 + POOL_W;

    tgt += tgt_offset + bc * out_H * out_W;
    src += src_offset + bc * inp_H * inp_W;
    dx  += dx_offset  + bc * inp_H * inp_W;

    dtype val = START_VAL;
    itype index = -1;

    if(row0 >= 0 && col0 >= 0 && row1 <= inp_H && col1 <= inp_W) {
        src += row0 * inp_W + col0;
        #pragma unroll
        for(int dr=0;dr<POOL_H;dr++) {
            #pragma unroll
            for(int dc = 0;dc < POOL_W; dc++) {
                dtype tmp = src[dr * inp_W + dc];
                if(tmp > val) {
                    index = (row0 + dr) * inp_W + col0 + dc;
                    val = tmp;
                }
            }
        }
    }
    else {
        #pragma unroll
        for(int r=row0;r<row1;r++) {
            #pragma unroll
            for(int c=col0;c<col1;c++) {
                dtype loaded_val = (r >= 0 && r<inp_H && c>=0 && c<inp_W) ? src[r*inp_W + c] : START_VAL;
                if(loaded_val > val) {
                    index = r*inp_W + c;
                    val = loaded_val;
                }
            }
        }
    }

    dtype dy = tgt[or * out_W + oc];

    save_dx(dx+index,dy);
}

#elif POOL_MODE == 1
__kernel
__attribute__((reqd_work_group_size(WG_SIZE,WG_SIZE,1)))
void pooling_bw(int BC,int inp_H,int inp_W,int out_H,int out_W,
             __global const dtype *tgt,ulong tgt_offset,
             __global dtype *dx,ulong dx_offset)
{
    int or = get_global_id(0);
    int oc = get_global_id(1);
    int bc = get_global_id(2);
    if(bc >= BC || or >= out_H || oc >= out_W)
        return;

    int row0 = or * STRIDE_H - PAD_H;
    int col0 = oc * STRIDE_W - PAD_W;
    int row1 = row0 + POOL_H;
    int col1 = col0 + POOL_W;

    tgt += tgt_offset + bc * out_H * out_W;
    dx  += dx_offset  + bc * inp_H * inp_W;

    dtype dy = tgt[or * out_W + oc];
    if(row0 >= 0 && col0 >= 0 && row1 <= inp_H && col1 <= inp_W) {
        dtype dy_norm = NORMALIZE_FULL(dy);
        dx += row0 * inp_W + col0;
        #pragma unroll
        for(int dr=0;dr<POOL_H;dr++) {
            #pragma unroll
            for(int dc = 0;dc < POOL_W; dc++) {
                save_dx(dx + dr * inp_W + dc,dy_norm);
            }
        }
    }
    else {
        dtype dy_norm = NORMALIZE_PARTIAL(dy, min(row1,inp_H)-max(row0,0),
                                              min(col1,inp_W) - max(col0,0),
                                              min(row1,inp_H + PAD_H) - max(-PAD_H,row0),
                                              min(col1,inp_W + PAD_W) - max(-PAD_W,col0)
                                            );
        #pragma unroll
        for(int r=row0;r<row1;r++) {
            #pragma unroll
            for(int c=col0;c<col1;c++) {
                if(r >= 0 && r<inp_H && c>=0 && c<inp_W)
                    save_dx(dx + r*inp_W + c,dy_norm);
            }
        }
    }
}
#else
#error "Invalid mode"
#endif

=========================

Traceback (most recent call last):
  File "mnist.py", line 162, in <module>
    main()
  File "mnist.py", line 153, in main
    train(args, model, device, train_loader, optimizer, epoch)
  File "mnist.py", line 53, in train
    output = model(data)
  File "/home/pvr/gaussian-splatting/env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "mnist.py", line 33, in forward
    x = F.max_pool2d(x, 2)
  File "/home/pvr/gaussian-splatting/env/lib/python3.8/site-packages/torch/_jit_internal.py", line 485, in fn
    return if_false(*args, **kwargs)
  File "/home/pvr/gaussian-splatting/env/lib/python3.8/site-packages/torch/nn/functional.py", line 782, in _max_pool2d
    return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
RuntimeError: Failed to build program source pooling with parameters -cl-std=CL2.0 -DWG_SIZE=8 -DPOOL_H=2 -DPOOL_W=2 -DSTRIDE_H=2 -DSTRIDE_W=2 -DPAD_H=0 -DPAD_W=0 -DPOOL_MODE=0 -DCOUNT_INCLUDE_PAD=0 log:
For device: PowerVR Rogue GX6250
BuildGroup_0:163:9: error: expected identifier or '('
    int or = get_global_id(0);
        ^
BuildGroup_0:166:20: error: expected expression
    if(bc >= BC || or >= out_H || oc >= out_W)
                   ^
BuildGroup_0:166:23: error: expected expression
    if(bc >= BC || or >= out_H || oc >= out_W)
                      ^
BuildGroup_0:169:16: error: expected expression
    int row0 = or * STRIDE_H - PAD_H;
               ^
BuildGroup_0:169:19: error: indirection requires pointer operand ('int' invalid)
    int row0 = or * STRIDE_H - PAD_H;
                  ^ ~~~~~~~~
BuildGroup_0:224:9: error: expected expression
    tgt[or * out_W + oc] = val;
        ^
BuildGroup_0:224:12: error: indirection requires pointer operand ('int' invalid)
    tgt[or * out_W + oc] = val;
           ^ ~~~~~
BuildGroup_0:278:9: error: expected identifier or '('
    int or = get_global_id(0);
        ^
BuildGroup_0:282:20: error: expected expression
    if(bc >= BC || or >= out_H || oc >= out_W)
clBuildErrorCode: -11
(204919) PVR:(Warning): OCL_CleanupDriver: Forced flush of remaining commands. Application missing explicit clFlush/clFinish/Synchronisation API call(s) [ global_data.c:1442 ]
(204919) PVR:(Warning): OCL_CleanupDriver: API refcount is not 0. Waiting for deferred threads to complete. [ global_data.c:1451 ]
(204919) PVR:(Warning): OCL_CleanupDriver: API refcount is still not 0 after deferred thread completion. [ global_data.c:1476 ]
(204919) PVR:(Warning): OCL_CleanupDriver: Aborting complete driver cleanup. Application has not called all corresponding clRelease* APIs [ global_data.c:1504 ]
(204919) PVR:(Warning):
        Remaining Objects - Debug App in Descending Order
                cl_event:         0
                cl_kernel:        0
                cl_program:       1
                cl_mem:           21
                cl_svm_alloc:     0
                cl_sampler:       0
                cl_command_queue: 1
                cl_context:       1 [ global_data.c:1506 ]
(204919) PVR:(Fatal): Debug assertion failed! (eErr == PVRSRV_OK) [ deferred_task.c:1088 ]
Aborted

Any help would be really appreciated, thanks!

I have the same problem, did you solve it?

/opt/anaconda3/envs/astropy/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'dlopen(/opt/anaconda3/envs/astropy/lib/python3.8/site-packages/torchvision/image.so, 6): Library not loaded: @rpath/libpng16.16.dylib
  Referenced from: /opt/anaconda3/envs/astropy/lib/python3.8/site-packages/torchvision/image.so
  Reason: Incompatible library version: image.so requires version 56.0.0 or later, but libpng16.16.dylib provides version 54.0.0'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
Using device: ocl:0
Accessing device #0:Intel(R) Core(TM) i9-9880H CPU @ 2.30GHz on Apple
/opt/anaconda3/envs/astropy/lib/python3.8/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: 'dlopen(/opt/anaconda3/envs/astropy/lib/python3.8/site-packages/torchvision/image.so, 6): Library not loaded: @rpath/libpng16.16.dylib
  Referenced from: /opt/anaconda3/envs/astropy/lib/python3.8/site-packages/torchvision/image.so
  Reason: Incompatible library version: image.so requires version 56.0.0 or later, but libpng16.16.dylib provides version 54.0.0'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?
  warn(
Failed Program Code:
=========================
///////////////////////////////////////////////////////////////////////////////
///
/// Copyright (c) 2021-2022 Artyom Beilis <artyomtnk@yahoo.com>
///
/// MIT License, see LICENSE.TXT
///
///////////////////////////////////////////////////////////////////////////////
// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
///////////////////////////////////////////////////////////////////////////////
///
/// Copyright (c) 2021-2022 Artyom Beilis <artyomtnk@yahoo.com>
///
/// MIT License, see LICENSE.TXT
///
///////////////////////////////////////////////////////////////////////////////
#define ACTIVATION_IDENTITY 0
#define ACTIVATION_RELU     1
#define ACTIVATION_TANH     2
#define ACTIVATION_SIGMOID  3
#define ACTIVATION_RELU6    4

#ifndef dtype
#define dtype float
#define dtype2 float2
#define dtype4 float4
#define DTYPE_MAX FLT_MAX
#define DTYPE_MIN FLT_MIN
#endif

#ifndef ACTIVATION
#define ACTIVATION ACTIVATION_IDENTITY
#endif

#if ACTIVATION == ACTIVATION_IDENTITY
#   define ACTIVATION_F(x) (x)
#   define ACTIVATION_FINV(y,dy) (dy)
#   define ACTIVATION_NAME identity
#elif ACTIVATION == ACTIVATION_RELU
#   define ACTIVATION_F(x) (max((x),(dtype)(0)))
#   define ACTIVATION_FINV(y,dy)  ((y>0)?dy:0)
#   define ACTIVATION_NAME relu
#elif ACTIVATION == ACTIVATION_TANH
#   define ACTIVATION_F(x) (tanh((x)))
#   define ACTIVATION_FINV(y,dy) ((1-(y)*(y))*(dy))
#   define ACTIVATION_NAME tanh 
#elif ACTIVATION == ACTIVATION_SIGMOID
#   define ACTIVATION_F(x) ((dtype)(1) / ((dtype)(1) + exp(-(x))))
#   define ACTIVATION_FINV(y,dy) ((y)*(1-(y))*(dy))
#   define ACTIVATION_NAME sigmoid
#elif ACTIVATION == ACTIVATION_RELU6
#   define ACTIVATION_F(x) (min(max((x),(dtype)(0)),(dtype)(6)))
#   define ACTIVATION_FINV(y,dy)  ((0<y && y<6)?dy:0)
#   define ACTIVATION_NAME relu6
#else
#   error "Unknown activation"
#endif 

#ifndef TILE_SIZE_M
#define TILE_SIZE_M 128
#endif
#ifndef TILE_SIZE_N
#define TILE_SIZE_N 128
#endif
#ifndef BLOCK_SIZE_N
#define BLOCK_SIZE_N 8
#endif
#ifndef BLOCK_SIZE_M
#define BLOCK_SIZE_M 8
#endif

#ifndef TILE_SIZE_K
#define TILE_SIZE_K 16
#endif 

#ifndef TILE_OFFSET
#define TILE_OFFSET 1
#endif

#ifndef ZORDER
#define ZORDER 0
#endif

#ifndef ATRANS
#define ATRANS 0
#endif

#ifndef BTRANS
#define BTRANS 0
#endif

#ifndef BIAS
#define BIAS 0
#endif

#ifndef GROUPS
#define GROUPS 1
#endif

#define BLOCK_SIZE_NY (BLOCK_SIZE_N*BLOCK_SIZE_M)
#define BLOCKS_IN_TILE_N (TILE_SIZE_N / BLOCK_SIZE_N)
#define BLOCKS_IN_TILE_M (TILE_SIZE_M / BLOCK_SIZE_M)
#define WG_SIZE (BLOCKS_IN_TILE_M * BLOCKS_IN_TILE_N)

#define ALIGN_FLOAT4 __attribute__ ((aligned (16)))

#ifndef CONVGEMM
#define CONVGEMM 0
#endif

#if CONVGEMM == 3 || REDUCE_K > 1
///////////////////////////////////////////////////////////////////////////////
///
/// Copyright (c) 2021-2022 Artyom Beilis <artyomtnk@yahoo.com>
///
/// MIT License, see LICENSE.TXT
///
///////////////////////////////////////////////////////////////////////////////
void atomic_addf(__global volatile float *ptr,float v)
{
#if defined(cl_intel_subgroups)
    __global atomic_int *p = (__global atomic_int *)(ptr);
    int prev,newv;
    do {
        prev = atomic_load(p);
        newv = as_int(as_float(prev) + v);
    } while(! atomic_compare_exchange_weak(p,&prev,newv));
#else
    float oldv = *ptr;
    for(;;) {
        float newv = oldv + v;
        int prev = atomic_cmpxchg((__global volatile int *)(ptr),as_int(oldv),as_int(newv));
        if(prev == as_int(oldv))
            return;
        oldv = as_float(prev);
    }
#endif    
}

#if ACTIVATION != ACTIVATION_IDENTITY
# error "Can't use activation with atomic ops"
#endif
#endif

#if CONVGEMM != 0
    #if CONVGEMM == 3
    void add_img_value(__global float *ptr,int matrix_row,int matrix_col,float dV)
    #else
    float get_img_value(__global float const *ptr,int matrix_row,int matrix_col)
    #endif
    {
#if KERN_W == 1 && KERN_H == 1 && STRIDE_H == 1 && STRIDE_W == 1 && DILATE_H == 1 && DILATE_W == 1 \
        && PAD_H == 0 && PAD_W == 0 && GROUPS == 1 && IMG_COLS == SRC_COLS && IMG_ROWS == SRC_ROWS 
        int channel = matrix_col;
        int b  = matrix_row / (IMG_COLS * IMG_ROWS);
        int rc = matrix_row % (IMG_COLS * IMG_ROWS);

        int address = (b * CHANNELS_IN + channel) * (SRC_ROWS * SRC_COLS) + rc;
        #if CONVGEMM != 3
            return ptr[address];
        #else
            #if REDUCE_K > 1
                atomic_addf(ptr+address,dV);
            #else
                ptr[address] += dV;
            #endif
        #endif
#else
        int channel = matrix_col / (KERN_H * KERN_W);
        int k_index = matrix_col % (KERN_H * KERN_W);

        int dy = k_index / KERN_W;
        int dx = k_index % KERN_W;

        int b  = matrix_row / (IMG_COLS * IMG_ROWS);
        int rc = matrix_row % (IMG_COLS * IMG_ROWS);

        int r  = rc / IMG_COLS;
        int c  = rc % IMG_COLS;

        int y_pos = -PAD_H + r * STRIDE_H;
        int x_pos = -PAD_W + c * STRIDE_W;

        int y = y_pos + dy * DILATE_H;
        int x = x_pos + dx * DILATE_W;

        int address = ((b *  (CHANNELS_IN*GROUPS) + channel) * SRC_ROWS + y) * SRC_COLS + x;

        #if CONVGEMM != 3
            #if PAD_W > 0 || PAD_H > 0
                if(x >= 0 && y >= 0 && x < SRC_COLS && y < SRC_ROWS) {
                    return ptr[address];
                }
                return 0;
            #else
                return ptr[address];
            #endif  
        #else
            #if PAD_W > 0 || PAD_H > 0
                if(x >= 0 && y >= 0 && x < SRC_COLS && y < SRC_ROWS) 
                    atomic_addf(ptr+address,dV);
            #else
                atomic_addf(ptr+address,dV);
            #endif
        #endif
#endif  
    }
#endif

#if CONVGEMM == 0 || CONVGEMM == 3
#  if BTRANS == 0
#    define get_B(r,c) (B[(r)*ldb + (c)])
#  else
#    define get_B(r,c) (B[(c)*ldb + (r)])
#  endif
#else
#  if BTRANS == 0
#    define get_B(r,c) get_img_value(B,r,c)
#  else
#    define get_B(r,c) get_img_value(B,c,r)
#  endif
#endif

#if CONVGEMM  == 0 || CONVGEMM == 1
    #if  ATRANS == 0
        #define get_A(r,c) (A[(r)*lda + (c)])
    #else
        #define get_A(r,c) (A[(c)*lda + (r)])
    #endif
#else
    float get_y_value(int row,int matrix_col,__global float const *A,int ldc,int M)
    {
        int batch = matrix_col / IM2COL_OCHAN;
        int incol = matrix_col % IM2COL_OCHAN;
        int offset = batch * (IM2COL_OCHAN * GROUPS) * M + incol;
        int index =row*IM2COL_OCHAN + offset;
        return A[index];
    }

    #if CONVGEMM == 3
        #define GET_Y_STEP K_src
    #else
        #define GET_Y_STEP M
    #endif
    #if  ATRANS == 0
        #define get_A(r,c) (get_y_value(r,c,A,lda,GET_Y_STEP))
    #else
        #define get_A(r,c) (get_y_value(c,r,A,lda,GET_Y_STEP))
    #endif

#endif

#define lA(x,y) a_tile[(x)][(y) / BLOCK_SIZE_M][(y) % BLOCK_SIZE_M]
#define lB(x,y) b_tile[(x)][(y) / BLOCK_SIZE_N][(y) % BLOCK_SIZE_N]

#if TILE_SIZE_M != TILE_SIZE_N
#error "Unsupported condif"
#endif

#if defined(cl_intel_subgroups)
#define INTEL_PLATFORM 1
#else
#define INTEL_PLATFORM 0
#endif

#define vload1(off,addr) ((addr)[off])
#define vstore1(val,off,addr) ((addr)[off]=(val))

#if BLOCK_SIZE_M == 1
#define vloadM vload1
#define vstoreM vstore1
#define floatM float
#elif BLOCK_SIZE_M == 4
#define vloadM vload4
#define vstoreM vstore4
#define floatM float4
#elif BLOCK_SIZE_M == 8
#define vloadM vload8
#define vstoreM vstore8
#define floatM float8
#elif BLOCK_SIZE_M == 16 
#define vloadM vload16
#define vstoreM vstore16
#define floatM float16
#endif

#if BLOCK_SIZE_N == 1
#define vloadN vload1
#define vstoreN vstore1
#define floatN float
#elif BLOCK_SIZE_N == 4
#define vloadN vload4
#define vstoreN vstore4
#define floatN float4
#elif BLOCK_SIZE_N == 8
#define vloadN vload8
#define vstoreN vstore8
#define floatN float8
#elif BLOCK_SIZE_N == 16 
#define vloadN vload16
#define vstoreN vstore16
#define floatN float16
#endif

#if TILE_SIZE_K == 1
#define vloadK vload1
#define vstoreK vstore1
#define floatK float
#elif TILE_SIZE_K == 4
#define vloadK vload4
#define vstoreK vstore4
#define floatK float4
#elif TILE_SIZE_K == 8
#define floatK float8
#define vloadK vload8
#define vstoreK vstore8
#elif TILE_SIZE_K == 16 
#define vloadK vload16
#define vstoreK vstore16
#define floatK float16
#endif

#ifndef REDUCE_K
#define REDUCE_K 1
#endif

#if GROUPS == 1 && REDUCE_K == 1
#define DIM_M 0
#define DIM_N 1
#define DIM_G 2
#define EXTRA_DIM 0
#else
#define DIM_M 1
#define DIM_N 2
#define DIM_G 0
#define EXTRA_DIM 1
#endif

int zorder_a(int x)
{
    return
          ((x & (1<<0)) >> 0 )
        | ((x & (1<<2)) >> 1 )
        | ((x & (1<<4)) >> 2 )
        | ((x & (1<<6)) >> 3 )
        | ((x & (1<<8)) >> 4 )
        | ((x & (1<<10)) >> 5 )
        | ((x & (1<<12)) >> 6 )
        | ((x & (1<<14)) >> 7 )
        | ((x & (1<<16)) >> 8 )
        | ((x & (1<<18)) >> 9 )
        | ((x & (1<<20)) >> 10 )
        | ((x & (1<<22)) >> 11 )
        | ((x & (1<<24)) >> 12 );
}
int zorder_b(int x)
{
    return zorder_a(x>>1);
}

__kernel 
#if INTEL_PLATFORM == 1
__attribute__((intel_reqd_sub_group_size(8)))
#endif
#if EXTRA_DIM == 0
__attribute__((reqd_work_group_size(BLOCKS_IN_TILE_M, BLOCKS_IN_TILE_N, 1)))
#else
__attribute__((reqd_work_group_size(1,BLOCKS_IN_TILE_M, BLOCKS_IN_TILE_N)))
#endif
void    sgemm(    int M,int N,int K,
        __global const float * restrict A,ulong offset_A,int lda,
        __global const float * restrict B,ulong offset_B,int ldb,
        __global float * restrict C,ulong offset_C,int ldc,
        float beta_factor
#if BIAS != 0
        , __global const float * restrict bias,ulong offset_bias
#endif
        )
{
    A += offset_A;
    B += offset_B;
    C += offset_C;

#if CONVGEMM > 0 && GROUPS > 1
    if(get_global_id(DIM_G) >= REDUCE_K * GROUPS)
        return;
    int group = get_global_id(DIM_G) / REDUCE_K;
    #if CONVGEMM == 1
        A += M*K*group;
        B += SRC_COLS*SRC_ROWS*CHANNELS_IN*group;
        C += (IM2COL_OCHAN) * M *group;
        #if BIAS != 0
        bias += M*group;
        #endif
    #elif CONVGEMM == 2
        // M = channels_out / groups
        int step_g_y = (M*IM2COL_OCHAN);
        int step_g_x = (SRC_COLS*SRC_ROWS) * CHANNELS_IN;
        int step_g_w = M*(CHANNELS_IN*KERN_W*KERN_H);

        A += step_g_y * group;
        B += step_g_x * group;
        C += step_g_w * group;
    #elif CONVGEMM == 3
        // K = channels_out / group
        int step_g_y = (K*IM2COL_OCHAN);
        int step_g_x = (SRC_COLS*SRC_ROWS) * CHANNELS_IN;
        int step_g_w = K*(CHANNELS_IN*KERN_W*KERN_H);

        A += step_g_y * group;
        B += step_g_w * group;
        C += step_g_x * group;
    #else
    #error "Invalid CONVGEMM Value"
    #endif
#endif   

#if ZORDER == 1
    int gr_m = get_group_id(DIM_M);
    int gr_n = get_group_id(DIM_N);
    int gr_size_m = get_num_groups(DIM_M);
    int gr_size_n = get_num_groups(DIM_N);
    if(gr_size_m == gr_size_n && popcount(gr_size_m) == 1) {
        int grs  = gr_n * gr_size_m + gr_m;
        gr_n = zorder_a(grs);
        gr_m = zorder_b(grs);
    }
#else
    int gr_m = get_group_id(DIM_M);
    int gr_n = get_group_id(DIM_N);
#endif
    int tile_row0 = gr_m*TILE_SIZE_M;
    int tile_col0 = gr_n*TILE_SIZE_N;

#if ZORDER == 1
    if(tile_row0 >= M || tile_col0 >= N)
        return;
#endif        

    int row = tile_row0 + get_local_id(DIM_M) * BLOCK_SIZE_M;
    int col = tile_col0 + get_local_id(DIM_N) * BLOCK_SIZE_N;

    int lid0 = get_local_id(DIM_M);
    int lid1 = get_local_id(DIM_N);

    int local_tile_id = lid0 * get_local_size(DIM_N) + lid1;

    #define local_wg_size (BLOCKS_IN_TILE_M * BLOCKS_IN_TILE_N)
    #define load_step (TILE_SIZE_M * TILE_SIZE_K / local_wg_size)

    float c[BLOCK_SIZE_M][BLOCK_SIZE_N] = {{0.0f}};

    int K_src = K;

#if INTEL_PLATFORM == 0
    float ap[BLOCK_SIZE_M];
    float bp[BLOCK_SIZE_N];

    ALIGN_FLOAT4 __local float a_tile[TILE_SIZE_K][BLOCKS_IN_TILE_M][BLOCK_SIZE_M+TILE_OFFSET];
    ALIGN_FLOAT4 __local float b_tile[TILE_SIZE_K][BLOCKS_IN_TILE_N][BLOCK_SIZE_N+TILE_OFFSET];
#else
    #if ATRANS == 1
    float a[TILE_SIZE_K][BLOCK_SIZE_M];
    #define pA(ind1,ind2) (a[(ind2)][(ind1)])
    #else
    float a[BLOCK_SIZE_M][TILE_SIZE_K];
    #define pA(ind1,ind2) (a[(ind1)][(ind2)])
    #endif
#endif    

#if REDUCE_K > 1
    int KS = (K + REDUCE_K - 1) / REDUCE_K;
    int sec = get_global_id(DIM_G) % REDUCE_K;
    int k_start=KS * sec;
    K = min(K_src,KS * (sec + 1));
    int k = k_start;
#else
    int k=0;
#endif

#if TILE_SIZE_N == TILE_SIZE_M && TILE_SIZE_K % load_step  == 0 && load_step <= TILE_SIZE_K
#define LOAD_VARIANT 0
#else
#define LOAD_VARIANT 1
#endif

#if INTEL_PLATFORM == 0

    #if LOAD_VARIANT == 1
    int dM[load_step];
    int dN[load_step];
    int dK [load_step];
    __local float *aP[load_step];
    __local float *bP[load_step];

    for(int i=0,read_pos = local_tile_id;i<load_step;i++,read_pos+=WG_SIZE) {
        int tile_kdir = read_pos / TILE_SIZE_M;
        int tile_tdir = read_pos % TILE_SIZE_M;
        dM[i] = tile_tdir + tile_row0;
        dN[i] = tile_tdir + tile_col0;
        dK[i]  = tile_kdir;
        aP[i] = &lA(tile_kdir,tile_tdir);
        bP[i] = &lB(tile_kdir,tile_tdir);
    }
    #endif

    for(;k<K;k+=TILE_SIZE_K) {

        #if LOAD_VARIANT == 0
        {
            int tile_kdir0 = local_tile_id / TILE_SIZE_M;
            int tile_tdir  = local_tile_id % TILE_SIZE_M;
            int a_row = tile_tdir + tile_row0;
            int b_col = tile_tdir + tile_col0;

            if(a_row >= M) {
                #pragma unroll
                for(int i=0,tile_kdir=tile_kdir0;i<load_step;i++,tile_kdir+=WG_SIZE / TILE_SIZE_M) {
                    lA(tile_kdir,tile_tdir) = 0.0f;
                }
            }
            else {
                if(tile_kdir0 + k <= K - load_step * (WG_SIZE / TILE_SIZE_M)) {
                    #pragma unroll
                    for(int i=0,tile_kdir=tile_kdir0;i<load_step;i++,tile_kdir+=WG_SIZE / TILE_SIZE_M) {
                        int k_rc  = tile_kdir + k;
                        lA(tile_kdir,tile_tdir) = get_A(a_row,k_rc);
                    }
                }
                else {
                    #pragma unroll
                    for(int i=0,tile_kdir=tile_kdir0;i<load_step;i++,tile_kdir+=WG_SIZE / TILE_SIZE_M) {
                        int k_rc  = tile_kdir + k;
                        lA(tile_kdir,tile_tdir) = k_rc < K ? get_A(a_row,k_rc) : 0.0f;
                    }
                }
            }
            if(b_col >= N) {
                #pragma unroll
                for(int i=0,tile_kdir=tile_kdir0;i<load_step;i++,tile_kdir+=WG_SIZE / TILE_SIZE_M) {
                    lB(tile_kdir,tile_tdir) = 0.0f;
                }
            }
            else {
                if(tile_kdir0 + k <= K - load_step * (WG_SIZE / TILE_SIZE_N)) {
                    #pragma unroll
                    for(int i=0,tile_kdir=tile_kdir0;i<load_step;i++,tile_kdir+=WG_SIZE / TILE_SIZE_N) {
                        int k_rc  = tile_kdir + k;
                        lB(tile_kdir,tile_tdir) = get_B(k_rc,b_col);
                    }
                }
                else {
                    #pragma unroll
                    for(int i=0,tile_kdir=tile_kdir0;i<load_step;i++,tile_kdir+=WG_SIZE / TILE_SIZE_N) {
                        int k_rc  = tile_kdir + k;
                        lB(tile_kdir,tile_tdir) = k_rc < K ? get_B(k_rc,b_col) : 0.0f;
                    }
                }
            }

            barrier(CLK_LOCAL_MEM_FENCE);
        }
        #else
        {
            if(tile_row0 + TILE_SIZE_M <= M && k + TILE_SIZE_K <= K) {
                #pragma unroll
                for(int i=0;i<load_step;i++) {
                    int a_row = dM[i];
                    int k_rc  = dK[i] + k;
                    *aP[i] =  get_A(a_row,k_rc);
                }
            }
            else {
                #pragma unroll
                for(int i=0;i<load_step;i++) {
                    int a_row = dM[i];
                    int k_rc  = dK[i] + k;
                    *aP[i] = (a_row < M && k_rc < K) ?  get_A(a_row,k_rc) : 0.0f;
                }
            }
            if(tile_col0 + TILE_SIZE_N <= N && k + TILE_SIZE_K <= K) {
                #pragma unroll
                for(int i=0;i<load_step;i++) {
                    int k_rc  = dK[i]  + k;
                    int b_col = dN[i];
                    *bP[i] = get_B(k_rc,b_col);
                }
            }
            else {
                #pragma unroll
                for(int i=0;i<load_step;i++) {
                    int k_rc  = dK[i]  + k;
                    int b_col = dN[i];
                    *bP[i] = (b_col < N && k_rc < K) ? get_B(k_rc,b_col) : 0.0f;

                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        #endif

        // Mutliplication loop
        #pragma unroll(4)
        for(int dk=0;dk<TILE_SIZE_K;dk++) {
            #pragma unroll
            for(int dr=0;dr<BLOCK_SIZE_M;dr++) {
                ap[dr] = a_tile[dk][lid0][dr];
            }
            #pragma unroll
            for(int dc=0;dc<BLOCK_SIZE_N;dc++) {
                bp[dc] = b_tile[dk][lid1][dc];
            }
            #pragma unroll
            for(int dr=0;dr<BLOCK_SIZE_M;dr++) {
                #pragma unroll
                for(int dc=0;dc<BLOCK_SIZE_N;dc++) {
                    c[dr][dc] = mad(ap[dr],bp[dc],c[dr][dc]);
                }
            }
        }

        barrier(CLK_LOCAL_MEM_FENCE);
    }
#else // INTEL_PLATFORM == 1
    // for intel we don't use local memory
    // we use optimized loads from global memory for A
    // and intel_sub_group_shuffle for optimal loading B
    for(;k<K;k+=TILE_SIZE_K) {
        if(row + BLOCK_SIZE_M - 1 < M && k + TILE_SIZE_K-1 < K) {
            #if CONVGEMM == 2 || CONVGEMM == 3
                #pragma unroll
                for(int dr=0;dr<BLOCK_SIZE_M;dr++){
                    for(int dk=0;dk < TILE_SIZE_K;dk++) {
                        pA(dr,dk)=get_A(row+dr,k+dk);
                    }
                }
            #else
                #if ATRANS == 0
                    #pragma unroll
                    for(int dr=0;dr<BLOCK_SIZE_M;dr++){
                        floatK v=vloadK(0,&get_A(row+dr,k));
                        vstoreK(v,0,a[dr]);
                    }
                #else // ATRANS
                    #pragma unroll
                    for(int dk=0;dk<TILE_SIZE_K;dk++){
                        floatM v=vloadM(0,&get_A(row,k+dk));
                        vstoreM(v,0,a[dk]);
                    }
                #endif
            #endif
        }
        else {
            #pragma unroll
            for(int dr=0;dr<BLOCK_SIZE_M;dr++){
                #pragma unroll
                for(int dk=0;dk < TILE_SIZE_K;dk++) {
                    pA(dr,dk) = (row + dr < M && k+dk < K) ? get_A(row+dr,k+dk): 0;
                }
            }
        }

        #pragma unroll(TILE_SIZE_K)
        for(int dk=0;dk<TILE_SIZE_K;dk++) {
            if(k + dk >= K)
                continue;
            #if BLOCK_SIZE_N == 8
                int mycol = col + get_sub_group_local_id();
                float myv = (mycol < N) ? get_B(k+dk,col + get_sub_group_local_id()) : 0;
                #pragma unroll
                for(int dc=0;dc<BLOCK_SIZE_N;dc++){
                    float b_dc = intel_sub_group_shuffle(myv,dc);
                    for(int dr=0;dr<BLOCK_SIZE_M;dr++) {
                        c[dr][dc] = mad(pA(dr,dk),b_dc,c[dr][dc]);
                    }
                }
            #else
                #pragma unroll
                for(int dc=0;dc<BLOCK_SIZE_N;dc++){
                    float b_dc = (col + dc < N) ? get_B(k+dk,col+dc) : 0;
                    for(int dr=0;dr<BLOCK_SIZE_M;dr++) {
                        c[dr][dc] = mad(pA(dr,dk),b_dc,c[dr][dc]);
                    }
                }
            #endif
        }
    }
#endif // INTEL_PLATFORM = 1

#if BIAS != 0
    bias += offset_bias;
#endif

#if BIAS == 1
    #if REDUCE_K > 1
    if(k_start == 0)
    #endif
    {
        float offset;
        #pragma unroll
        for(int dr=0;dr<BLOCK_SIZE_M;dr++) {
            offset = row + dr < M ? bias[(row+dr)] : 0.0f;
            #pragma unroll
            for(int dc=0;dc<BLOCK_SIZE_N;dc++) {
                c[dr][dc] += offset;
            }
        }
    }
#elif BIAS == 2
    #if REDUCE_K > 1
    if(k_start == 0)
    #endif
    {
        float offset;
        #pragma unroll
        for(int dc=0;dc<BLOCK_SIZE_N;dc++) {
            offset = (col + dc) < N ? bias[(col+dc)] : 0.0f;
            #pragma unroll
            for(int dr=0;dr<BLOCK_SIZE_M;dr++) {
                c[dr][dc] += offset;
            }
        }
    }
#endif    

#if CONVGEMM == 1
    {
        #pragma unroll
        for(int dc=0;dc<BLOCK_SIZE_N;dc++) {
            if(col + dc >= N)
                continue;
            int matrix_col = col + dc;
            int batch = matrix_col / IM2COL_OCHAN;
            int incol = matrix_col % IM2COL_OCHAN;
            int offset = batch * (IM2COL_OCHAN * GROUPS) * M + incol;
            #pragma unroll
            for(int dr=0;dr<BLOCK_SIZE_M;dr++) {
                if(row+dr < M) {
                    int index =(row + dr)*ldc + offset;
                    #if REDUCE_K > 1
                    atomic_addf(C+index,c[dr][dc]);
                    #else
                    if(beta_factor != 0)
                        C[index] = mad(C[index], beta_factor,ACTIVATION_F(c[dr][dc]));
                    else
                        C[index] = ACTIVATION_F(c[dr][dc]);
                    #endif
                }
            }
        }
    }
#else
    {
        #pragma unroll
        for(int dr=0;dr<BLOCK_SIZE_M;dr++) {
            #pragma unroll
            for(int dc=0;dc<BLOCK_SIZE_N;dc++) {
                if(row + dr < M && col+dc < N) {
                    #if CONVGEMM == 3
                        add_img_value(C,row+dr,col+dc,c[dr][dc]);
                    #else
                        int index = (row+dr)*ldc+col+dc;
                        #if REDUCE_K > 1
                        atomic_addf(C+index,ACTIVATION_F(c[dr][dc]));
                        #else
                        if(beta_factor != 0)
                            C[index] = mad(C[index], beta_factor,ACTIVATION_F(c[dr][dc]));
                        else
                            C[index] = ACTIVATION_F(c[dr][dc]);
                        #endif
                    #endif
                }
            }
        }
    }
#endif
}

=========================

Traceback (most recent call last):
  File "mnist.py", line 162, in <module>
    main()
  File "mnist.py", line 153, in main
    train(args, model, device, train_loader, optimizer, epoch)
  File "mnist.py", line 53, in train
    output = model(data)
  File "/opt/anaconda3/envs/astropy/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/anaconda3/envs/astropy/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "mnist.py", line 29, in forward
    x = self.conv1(x)
  File "/opt/anaconda3/envs/astropy/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/anaconda3/envs/astropy/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/anaconda3/envs/astropy/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 460, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File "/opt/anaconda3/envs/astropy/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 456, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Failed to build program source sgemm with parameters -DTILE_SIZE_M=32 -DTILE_SIZE_N=32 -DBLOCK_SIZE_M=4 -DBLOCK_SIZE_N=4 -DTILE_SIZE_K=32 -DTILE_OFFSET=0 -DBIAS=0 -DATRANS=0 -DBTRANS=1 -DIM2COL_OCHAN=676 -DCONVGEMM=1 -DKERN_H=3 -DKERN_W=3 -DDILATE_H=1 -DDILATE_W=1 -DPAD_H=0 -DPAD_W=0 -DSTRIDE_H=1 -DSTRIDE_W=1 -DGROUPS=1 -DCHANNELS_IN=1 -DSRC_COLS=28 -DSRC_ROWS=28 -DIMG_COLS=26 -DIMG_ROWS=26 -DREDUCE_K=1 -DACTIVATION=0 log:
For device: Intel(R) Core(TM) i9-9880H CPU @ 2.30GHz

clBuildErrorCode: -11

artyom-beilis / pytorch_dlprim

RuntimeError: Failed to build program source pooling on mnist.py #39