Use local memory - Githubissues

Early tests are indicating that this might get us 10% speedup on 3x3 stencils. And the expectation is that larger stencils will see more speedup, because each cell is accessed more times (once by each time it forms part of someone's neighborhood).

(An earlier estimate of larger gains was wrong, because I was comparing different stencil sizes. Which is actually a useful insight - the speed difference between a 5-point Laplacian and a 9-point Laplacian is significant.)

A hand-written kernel:

#define LX 8
#define LY 32
#define LZ 1
#define XR 1
#define YR 1
#define ZR 0
__kernel void rd_compute(__global float4 *a_in,__global float4 *b_in,__global float4 *a_out,__global float4 *b_out)
{
    const int index_x = get_global_id(0);
    const int index_y = get_global_id(1);
    const int index_z = get_global_id(2);
    const int local_x = get_local_id(0);
    const int local_y = get_local_id(1);
    const int local_z = get_local_id(2);
    const int X = get_global_size(0);
    const int Y = get_global_size(1);
    const int Z = get_global_size(2);
    const int index_here = X*(Y*index_z + index_y) + index_x;
    float4 a = a_in[index_here];
    float4 b = b_in[index_here];

    // copy local area into local memory
    local float4 local_a[LX+2*XR][LY+2*YR][LZ+2*ZR]; // expanded to allow access to neighbors
    local float4 local_b[LX+2*XR][LY+2*YR][LZ+2*ZR];
    local_a[local_x+XR][local_y+YR][local_z+ZR] = a;
    local_b[local_x+XR][local_y+YR][local_z+ZR] = b;
    if(local_y==0) {
        local_a[local_x+XR][local_y-1+YR][local_z+ZR] = a_in[X*(Y*index_z + ((index_y-1+Y)%Y)) + index_x];
        local_b[local_x+XR][local_y-1+YR][local_z+ZR] = b_in[X*(Y*index_z + ((index_y-1+Y)%Y)) + index_x];
    }
    else if(local_y==LY-1) {
        local_a[local_x+XR][local_y+1+YR][local_z+ZR] = a_in[X*(Y*index_z + ((index_y+1)%Y)) + index_x];
        local_b[local_x+XR][local_y+1+YR][local_z+ZR] = b_in[X*(Y*index_z + ((index_y+1)%Y)) + index_x];
    }
    if(local_x==0) {
        local_a[local_x-1+XR][local_y+YR][local_z+ZR] = a_in[X*(Y*index_z + index_y) + (index_x-1+X)%X];
        local_b[local_x-1+XR][local_y+YR][local_z+ZR] = b_in[X*(Y*index_z + index_y) + (index_x-1+X)%X];
    }
    else if(local_x==LX-1) {
        local_a[local_x+1+XR][local_y+YR][local_z+ZR] = a_in[X*(Y*index_z + index_y) + (index_x+1)%X];
        local_b[local_x+1+XR][local_y+YR][local_z+ZR] = b_in[X*(Y*index_z + index_y) + (index_x+1)%X];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    float4 a_n = local_a[local_x+XR][local_y-1+YR][local_z+ZR];
    float4 a_s = local_a[local_x+XR][local_y+1+YR][local_z+ZR];
    float4 a_e4 = local_a[local_x+1+XR][local_y+YR][local_z+ZR];
    float4 a_w4 = local_a[local_x-1+XR][local_y+YR][local_z+ZR];
    float4 b_n = local_b[local_x+XR][local_y-1+YR][local_z+ZR];
    float4 b_s = local_b[local_x+XR][local_y+1+YR][local_z+ZR];
    float4 b_e4 = local_b[local_x+1+XR][local_y+YR][local_z+ZR];
    float4 b_w4 = local_b[local_x-1+XR][local_y+YR][local_z+ZR];
    float4 a_e = (float4)(a.yzw, a_e4.x);
    float4 a_w = (float4)(a_w4.w, a.xyz);
    float4 b_e = (float4)(b.yzw, b_e4.x);
    float4 b_w = (float4)(b_w4.w, b.xyz);
    float4 laplacian_a = a_n + a_e + a_s + a_w - 4.0f * a;
    float4 laplacian_b = b_n + b_e + b_s + b_w - 4.0f * b;

    float D_a = 0.082f;
    float D_b = 0.041f;
    float F = 0.035f;
    float k = 0.064f;
    float4 delta_a = D_a * laplacian_a - a*b*b + F*(1.0f-a);
    float4 delta_b = D_b * laplacian_b + a*b*b - (F+k)*b;

    float timestep = 1.0f;
    a_out[index_here] = a + delta_a * timestep;
    b_out[index_here] = b + delta_b * timestep;
}

Here XR, YR, ZR are the radii around each cell that we need to retrieve for the stencils (here 1). LX, LY, LZ are the work group size, and must be chosen such that we don't exceed CL_DEVICE_LOCAL_MEM_SIZE or CL_DEVICE_MAX_WORK_GROUP_SIZE.

We can use local memory for formula image rules.

GollyGang / ready

Use local memory #122