ysh329 commented 5 years ago

卷积优化

【ARMv8架构CNN推理性能调优浅析】泰坦四期第二次课程 PPL库在ARM结构下的CNN推理

ysh329 commented 5 years ago

四种卷积方法

直接卷积法，根据定义滑窗滑动计算；
im2col+gemm，参考Caffe/Darknet的计算方法；
FFT，快速傅里叶变换；
Winograd（针对3x3卷积核：F=3、pad=1、stride=1）；
arm平台，类似2的做法，interleave+gemm，参考ArmComputeLibrary/Tengine。

参考

ysh329 commented 5 years ago

1. 直接卷积法

也是滑窗法，最直观最简单的方法，做法可以参考下图（来自CS229课程动图），但该方法在大规模上效果较差。除了特定卷积核大小，通常情况下不采用这种方法。

#include <stdio.h>
#include <stdlib.h>

struct conv_param
{
    // input/output/filter shape dim
    int shape_dim;
    int *input_shape;
    int *output_shape;
    int *filter_shape;

    // filter params(filter nchw defined in filter_shape)
    int pad_h;
    int pad_w;
    int stride;
    int group;
} conv_param_t;

// TODO
#define out(n,c,h,w)    (output[n*OC*OH*OW+m*OH*OW+])
#define in(n,c,h,w)     (input[n*C*H*W+c*H*W+h*W+w])
#define filter(n,c,h,w) (filter[m*D*K*K+])
int conv_direct(float *input, float *output, float *filter, conv_param_t *param)
{
    // filter
    int pad_h = conv_p->pad_h;
    int pad_w = conv_p->pad_w;
    int stride = conv_p->stride;
    int group = conv_p->group;
    // input
    int N = conv_p->input_shape[0];
    int D = conv_p->input_shape[1];
    int H = conv_p->input_shape[2];
    int W = conv_p->input_shape[3];
    int M = conv_p->filter_shape[0];
    int K = conv_p->filter_shape[2];// filter_shape[2]==filter_shape[3]

    // TODO
    for(int h=0; h<H; ++h)
    {
        for(int w=0; w<W; ++w)
        {
            for(int y=0; y<K; ++y)
            {
                for(int x=0; x<K; ++x)
                {
                    for(int m=0; m<M; ++m) // filter num.=output channels num.
                    {
                        for(int d=0; d<D; ++d) // D=input channels num.
                        {
                            out() += in() * filter(); // TODO
                        }
                    }
                }
            }
        }
    }   
    return 0;
}

#define SHAPE_DIM   (4)
int main()
{
    // input
    int input_n = 1;
    int input_c = 3;
    int input_h = 224;
    int input_w = 224;
    int input_shape[SHAPE_DIM] = {input_n,
                                  input_c,
                                  input_h,
                                  input_w};
    float *input = (float*)calloc(input_n*
                                  input_c*
                                  input_h*
                                  input_w, sizeof float);
    // filter without bias
    int filter_n = 32;
    int filter_c = input_c;
    int filter_h = 3;
    int filter_w = filter_h;
    int filter_shape[SHAPE_DIM] = {filter_n, 
                                   filter_c, 
                                   filter_h, 
                                   filter_w};
    float *filter = (float*)calloc(filter_n*
                                   filter_c*
                                   filter_h*
                                   filter_w, sizeof float);
    int pad_h = 1;
    int pad_w = 1;
    int stride = 1;
    int group = 1;

    // output
    int output_n = input_n;
    int output_c = filter_n;
    int output_h = (input_h+2*pad_h-filter_h) / stride + 1;
    int output_w = (input_w+2*pad_w-filter_w) / stride + 1;
    int output_shape[SHAPE_DIM] = {output_n, 
                                   output_c,
                                   output_h,
                                   output_w};
    float *output = (float*)calloc(output_n*
                                   output_c*
                                   output_h*
                                   output_w, sizeof float);

    // conv_param
    conv_param_t *conv_p = (conv_param_t*)calloc(1, sizeof conv_param_t);
    conv_p = {.shape_dim = SHAPE_DIM,
              .input_shape = input_shape,
              .filter_shape = filter_shape,
              .output_shape = output_shape,
              .pad_h = pad_h,
              .pad_w = pad_w,
              .stride = stride,
              .group = group};

    if(!input || !filter || !output)
    {
        printf("[ERROR] failed to allocate memory for input/filter/output\n");
        goto free_memory;
    }

    int conv_status = conv_direct(intput, output, filter, conv_p);
    if(conv_status)
    {
        printf("[ERROR] conv_direct failure\n");
        goto free_memory;
    }

    free_memory:
        if(input)  free(input);
        if(filter) free(output);
        if(filter) free(filter);
        if(conv_p) free(conv_p);
        input = NULL;
        output = NULL;
        filter = NULL;
        conv_p = NULL;

    return 0;
}

ysh329 commented 5 years ago

2. im2col+gemm

目前几乎所有的主流计算框架包括 Caffe、MXNet 等都实现了该方法。该方法把整个卷积过程转化成了矩阵乘法（GEMM），而 GEMM 在各种 BLAS 库中都是被极致优化的，一般来说, 速度较快。

darknet代码位于./src/convolutional_layer.c：

void forward_convolutional_layer(convolutional_layer l, network net)
{
    int i, j;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1); 

    if(l.xnor){
        binarize_weights(l.weights, l.n, l.c/l.groups*l.size*l.size, l.binary_weights);
        swap_binary(&l);
        binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input);
        net.input = l.binary_input;
    }   

    int m = l.n/l.groups;
    int k = l.size*l.size*l.c/l.groups;
    int n = l.out_w*l.out_h;
    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.weights + j*l.nweights/l.groups;
            float *b = net.workspace;
            float *c = l.output + (i*l.groups + j)*n*m;
            float *im =  net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w;

            if (l.size == 1) {
                b = im; 
            } else {
                im2col_cpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b); 
            }   
            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
        }   
    }   

    if(l.batch_normalize){
        forward_batchnorm_layer(l, net);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
    }   

    activate_array(l.output, l.outputs*l.batch, l.activation);
    if(l.binary || l.xnor) swap_binary(&l);
}

// im2col.c
#include "im2col.h"
#include <stdio.h>
float im2col_get_pixel(float *im, int height, int width, int channels,
                        int row, int col, int channel, int pad)
{
    row -= pad;
    col -= pad;

    if (row < 0 || col < 0 ||
        row >= height || col >= width) return 0;
    return im[col + width*(row + height*channel)];
}

//From Berkeley Vision's Caffe!
//https://github.com/BVLC/caffe/blob/master/LICENSE
void im2col_cpu(float* data_im,
     int channels,  int height,  int width,
     int ksize,  int stride, int pad, float* data_col) 
{
    int c,h,w;
    int height_col = (height + 2*pad - ksize) / stride + 1;
    int width_col = (width + 2*pad - ksize) / stride + 1;

    int channels_col = channels * ksize * ksize;
    for (c = 0; c < channels_col; ++c) {
        int w_offset = c % ksize;
        int h_offset = (c / ksize) % ksize;
        int c_im = c / ksize / ksize;
        for (h = 0; h < height_col; ++h) {
            for (w = 0; w < width_col; ++w) {
                int im_row = h_offset + h * stride;
                int im_col = w_offset + w * stride;
                int col_index = (c * height_col + h) * width_col + w;
                data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
                        im_row, im_col, c_im, pad);
            }   
        }   
    }   
}

// gemm.c
void gemm(int TA, int TB, int M, int N, int K, float ALPHA, 
        float *A, int lda, 
        float *B, int ldb,
        float BETA,
        float *C, int ldc)
{
    gemm_cpu( TA,  TB,  M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
}

void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float BETA,
        float *C, int ldc)
{
    //printf("cpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc);
    int i, j;
    for(i = 0; i < M; ++i){
        for(j = 0; j < N; ++j){
            C[i*ldc + j] *= BETA;
        }
    }
    if(!TA && !TB)
        gemm_nn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
    else if(TA && !TB)
        gemm_tn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
    else if(!TA && TB)
        gemm_nt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
    else
        gemm_tt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
}

void gemm_nn(int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float *C, int ldc)
{
    int i,j,k;
    #pragma omp parallel for
    for(i = 0; i < M; ++i){
        for(k = 0; k < K; ++k){
            register float A_PART = ALPHA*A[i*lda+k];
            for(j = 0; j < N; ++j){
                C[i*ldc+j] += A_PART*B[k*ldb+j];
            }
        }
    }
}

ysh329 commented 5 years ago

3. FFT

傅里叶变换和快速傅里叶变化是在经典图像处理里面经常使用的计算方法。但卷积网络中通常不采用，这是因为在卷积网络中，卷积模板通常都比较小，例如 3×3 等。这种情况下，FFT的时间开销反而更大，FFT适用于7x7、9x9、11x11等等这样的大卷积才划算。

ysh329 commented 5 years ago

4. Winograd

Winograd 是存在已久最近被重新发现的方法。在大部分场景中，Winograd 方法都显示和较大的优势。目前 cudnn 中计算卷积就使用了该方法，适合3x3的小卷积核。

参考

论文/博客

代码

ysh329 commented 5 years ago

ACL

gemm interleave+transpose

#define  SUM_VEC4(vec4)  vec4.s0+vec4.s1+vec4.s2+vec4.s3

__kernel void mat_trans_vec4(const int k,
                             const int n,
                             __global const CL_INPUT_TYPE *b,
                             __global       CL_INPUT_TYPE *bT) {
    const int col = get_global_id(0) * VEC_LEN;
    const int row = get_global_id(1);

    CL_ELEM_TYPE bb = vload4(0, b+row*n+col);
    vstore4(bb, 0, bT+(col/VEC_LEN) * (VEC_LEN*k) + row*VEC_LEN);
}

__kernel void mat_interleave_vec4(const int m,
                                  const int k,
                                  __global const CL_INPUT_TYPE *a,
                                  __global       CL_INPUT_TYPE *aI) {
    const int col = get_global_id(0) * VEC_LEN;
    const int row = get_global_id(1) * VEC_LEN;

    CL_ELEM_TYPE aa0 = vload4(0, a + row     * k + col),
                 aa1 = vload4(0, a + (row+1) * k + col),
                 aa2 = vload4(0, a + (row+2) * k + col),
                 aa3 = vload4(0, a + (row+3) * k + col);

    CL_ELEM_TYPE 
    res = (CL_ELEM_TYPE)(aa0.s0, aa1.s0, aa2.s0, aa3.s0);
    vstore4(res, 0, aI+(row/VEC_LEN)*(VEC_LEN*k) + (col*VEC_LEN));

    res = (CL_ELEM_TYPE)(aa0.s1, aa1.s1, aa2.s1, aa3.s1);
    vstore4(res, 0, aI+(row/VEC_LEN)*(VEC_LEN*k) + (col*VEC_LEN+VEC_LEN));

    res = (CL_ELEM_TYPE)(aa0.s2, aa1.s2, aa2.s2, aa3.s2);
    vstore4(res, 0, aI+(row/VEC_LEN)*(VEC_LEN*k) + (col*VEC_LEN+VEC_LEN*2));

    res = (CL_ELEM_TYPE)(aa0.s3, aa1.s3, aa2.s3, aa3.s3);
    vstore4(res, 0, aI+(row/VEC_LEN)*(VEC_LEN*k) + (col*VEC_LEN+VEC_LEN*3));

}

// float4
// [256,256,1] [1,1,1] p+=8  100times 1024x1024x1024 0.220724 s  9.729284 GFLOPS
// [256,256,1] [4,4,1] p+=8  100times 1024x1024x1024 0.081261 s 26.427127 GFLOPS
// [256,256,1] [4,4,1] p+=12 100times 1024x1024x1020 0.075207 s 28.442905 GFLOPS
// lws_calc_fp32_float4_max_256_gls_2048_2048.log:>>> [INFO] 1.00 CL_GPU 2048x2048x2040 {512, 512, 1} {8, 32, 1} 0.590064 s 29.001546 GFLOPS
// half4
// [256,256,1] [1,1,1] p+=8  100times 1024x1024x1024 0.103472 s 20.754221 GFLOPS 
// [256,256,1] [4,4,1] p+=8  100times 1024x1024x1024 0.061210 s 35.084041 GFLOPS
// [256,256,1] [4,4,1] p+=12 100times 1024x1024x1024 0.058183 s 36.765208 GFLOPS

__kernel void gemm_interleaved_transposed_vec4(const int aI_height, // height of aI
                                               const int bT_height, // height of bT
                                               const int aI_width,  // width of aI or bT
                                               __global const CL_INPUT_TYPE *aI,
                                               __global const CL_INPUT_TYPE *bT,
                                               __global       CL_INPUT_TYPE *c) {
#ifndef USE_LOCAL_WOKR_SIZE
    const int col = get_global_id(0); // col of bT: [0, n/4) <==> [0, bT_height)
    const int row = get_global_id(1); // row of aI: [0, m/4) <==> [0, aI_height)
#else
    const int row = get_group_id(1) * VEC_LEN + get_local_id(1);
    const int col = get_group_id(0) * VEC_LEN + get_local_id(0);
#endif

    CL_ELEM_TYPE c00 = 0.0,
                 c10 = 0.0,
                 c20 = 0.0,
                 c30 = 0.0;

    for (int p = 0; p < aI_width; p += 12) {
        CL_ELEM_TYPE
        aa = vload4(0, aI + row * aI_width + p),
        bb = vload4(0, bT + col * aI_width + p);

        c00 += (CL_ELEM_TYPE)aa.s0 * bb;
        c10 += (CL_ELEM_TYPE)aa.s1 * bb; 
        c20 += (CL_ELEM_TYPE)aa.s2 * bb;
        c30 += (CL_ELEM_TYPE)aa.s3 * bb;

        aa = vload4(0, aI + row * aI_width + p+VEC_LEN);
        bb = vload4(0, bT + col * aI_width + p+VEC_LEN);

        c00 += (CL_ELEM_TYPE)aa.s0 * bb;
        c10 += (CL_ELEM_TYPE)aa.s1 * bb; 
        c20 += (CL_ELEM_TYPE)aa.s2 * bb;
        c30 += (CL_ELEM_TYPE)aa.s3 * bb;

        aa = vload4(0, aI + row * aI_width + p+VEC_LEN*2);
        bb = vload4(0, bT + col * aI_width + p+VEC_LEN*2);

        c00 += (CL_ELEM_TYPE)aa.s0 * bb;
        c10 += (CL_ELEM_TYPE)aa.s1 * bb; 
        c20 += (CL_ELEM_TYPE)aa.s2 * bb;
        c30 += (CL_ELEM_TYPE)aa.s3 * bb;
    }

    vstore4(c00, 0, c+(row*VEC_LEN  ) * (bT_height*VEC_LEN) + (col*VEC_LEN));
    vstore4(c10, 0, c+(row*VEC_LEN+1) * (bT_height*VEC_LEN) + (col*VEC_LEN));
    vstore4(c20, 0, c+(row*VEC_LEN+2) * (bT_height*VEC_LEN) + (col*VEC_LEN));
    vstore4(c30, 0, c+(row*VEC_LEN+3) * (bT_height*VEC_LEN) + (col*VEC_LEN));
}

ysh329 commented 5 years ago

andravin/wincnn: Winograd minimal convolution algorithm generator for convolutional neural networks. https://github.com/andravin/wincnn

yuenshome / yuenshome.github.io

卷积 #6

卷积优化

四种卷积方法

参考

1. 直接卷积法

2. im2col+gemm

3. FFT

4. Winograd

参考

论文/博客

代码

ACL