Open ysh329 opened 5 years ago
也是滑窗法,最直观最简单的方法,做法可以参考下图(来自CS229课程动图), 但该方法在大规模上效果较差。除了特定卷积核大小,通常情况下不采用这种方法。
#include <stdio.h>
#include <stdlib.h>
struct conv_param
{
// input/output/filter shape dim
int shape_dim;
int *input_shape;
int *output_shape;
int *filter_shape;
// filter params(filter nchw defined in filter_shape)
int pad_h;
int pad_w;
int stride;
int group;
} conv_param_t;
// TODO
#define out(n,c,h,w) (output[n*OC*OH*OW+m*OH*OW+])
#define in(n,c,h,w) (input[n*C*H*W+c*H*W+h*W+w])
#define filter(n,c,h,w) (filter[m*D*K*K+])
int conv_direct(float *input, float *output, float *filter, conv_param_t *param)
{
// filter
int pad_h = conv_p->pad_h;
int pad_w = conv_p->pad_w;
int stride = conv_p->stride;
int group = conv_p->group;
// input
int N = conv_p->input_shape[0];
int D = conv_p->input_shape[1];
int H = conv_p->input_shape[2];
int W = conv_p->input_shape[3];
int M = conv_p->filter_shape[0];
int K = conv_p->filter_shape[2];// filter_shape[2]==filter_shape[3]
// TODO
for(int h=0; h<H; ++h)
{
for(int w=0; w<W; ++w)
{
for(int y=0; y<K; ++y)
{
for(int x=0; x<K; ++x)
{
for(int m=0; m<M; ++m) // filter num.=output channels num.
{
for(int d=0; d<D; ++d) // D=input channels num.
{
out() += in() * filter(); // TODO
}
}
}
}
}
}
return 0;
}
#define SHAPE_DIM (4)
int main()
{
// input
int input_n = 1;
int input_c = 3;
int input_h = 224;
int input_w = 224;
int input_shape[SHAPE_DIM] = {input_n,
input_c,
input_h,
input_w};
float *input = (float*)calloc(input_n*
input_c*
input_h*
input_w, sizeof float);
// filter without bias
int filter_n = 32;
int filter_c = input_c;
int filter_h = 3;
int filter_w = filter_h;
int filter_shape[SHAPE_DIM] = {filter_n,
filter_c,
filter_h,
filter_w};
float *filter = (float*)calloc(filter_n*
filter_c*
filter_h*
filter_w, sizeof float);
int pad_h = 1;
int pad_w = 1;
int stride = 1;
int group = 1;
// output
int output_n = input_n;
int output_c = filter_n;
int output_h = (input_h+2*pad_h-filter_h) / stride + 1;
int output_w = (input_w+2*pad_w-filter_w) / stride + 1;
int output_shape[SHAPE_DIM] = {output_n,
output_c,
output_h,
output_w};
float *output = (float*)calloc(output_n*
output_c*
output_h*
output_w, sizeof float);
// conv_param
conv_param_t *conv_p = (conv_param_t*)calloc(1, sizeof conv_param_t);
conv_p = {.shape_dim = SHAPE_DIM,
.input_shape = input_shape,
.filter_shape = filter_shape,
.output_shape = output_shape,
.pad_h = pad_h,
.pad_w = pad_w,
.stride = stride,
.group = group};
if(!input || !filter || !output)
{
printf("[ERROR] failed to allocate memory for input/filter/output\n");
goto free_memory;
}
int conv_status = conv_direct(intput, output, filter, conv_p);
if(conv_status)
{
printf("[ERROR] conv_direct failure\n");
goto free_memory;
}
free_memory:
if(input) free(input);
if(filter) free(output);
if(filter) free(filter);
if(conv_p) free(conv_p);
input = NULL;
output = NULL;
filter = NULL;
conv_p = NULL;
return 0;
}
目前几乎所有的主流计算框架包括 Caffe、MXNet 等都实现了该方法。该方法把整个卷积过程转化成了 矩阵乘法(GEMM),而 GEMM 在各种 BLAS 库中都是被极致优化的,一般来说, 速度较快。
darknet代码位于./src/convolutional_layer.c
:
void forward_convolutional_layer(convolutional_layer l, network net)
{
int i, j;
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
if(l.xnor){
binarize_weights(l.weights, l.n, l.c/l.groups*l.size*l.size, l.binary_weights);
swap_binary(&l);
binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input);
net.input = l.binary_input;
}
int m = l.n/l.groups;
int k = l.size*l.size*l.c/l.groups;
int n = l.out_w*l.out_h;
for(i = 0; i < l.batch; ++i){
for(j = 0; j < l.groups; ++j){
float *a = l.weights + j*l.nweights/l.groups;
float *b = net.workspace;
float *c = l.output + (i*l.groups + j)*n*m;
float *im = net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w;
if (l.size == 1) {
b = im;
} else {
im2col_cpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
}
gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
}
}
if(l.batch_normalize){
forward_batchnorm_layer(l, net);
} else {
add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
}
activate_array(l.output, l.outputs*l.batch, l.activation);
if(l.binary || l.xnor) swap_binary(&l);
}
// im2col.c
#include "im2col.h"
#include <stdio.h>
float im2col_get_pixel(float *im, int height, int width, int channels,
int row, int col, int channel, int pad)
{
row -= pad;
col -= pad;
if (row < 0 || col < 0 ||
row >= height || col >= width) return 0;
return im[col + width*(row + height*channel)];
}
//From Berkeley Vision's Caffe!
//https://github.com/BVLC/caffe/blob/master/LICENSE
void im2col_cpu(float* data_im,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col)
{
int c,h,w;
int height_col = (height + 2*pad - ksize) / stride + 1;
int width_col = (width + 2*pad - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
for (c = 0; c < channels_col; ++c) {
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
for (h = 0; h < height_col; ++h) {
for (w = 0; w < width_col; ++w) {
int im_row = h_offset + h * stride;
int im_col = w_offset + w * stride;
int col_index = (c * height_col + h) * width_col + w;
data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
im_row, im_col, c_im, pad);
}
}
}
}
// gemm.c
void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
float *A, int lda,
float *B, int ldb,
float BETA,
float *C, int ldc)
{
gemm_cpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
}
void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
float *A, int lda,
float *B, int ldb,
float BETA,
float *C, int ldc)
{
//printf("cpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc);
int i, j;
for(i = 0; i < M; ++i){
for(j = 0; j < N; ++j){
C[i*ldc + j] *= BETA;
}
}
if(!TA && !TB)
gemm_nn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
else if(TA && !TB)
gemm_tn(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
else if(!TA && TB)
gemm_nt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
else
gemm_tt(M, N, K, ALPHA,A,lda, B, ldb,C,ldc);
}
void gemm_nn(int M, int N, int K, float ALPHA,
float *A, int lda,
float *B, int ldb,
float *C, int ldc)
{
int i,j,k;
#pragma omp parallel for
for(i = 0; i < M; ++i){
for(k = 0; k < K; ++k){
register float A_PART = ALPHA*A[i*lda+k];
for(j = 0; j < N; ++j){
C[i*ldc+j] += A_PART*B[k*ldb+j];
}
}
}
}
傅里叶变换和快速傅里叶变化是在经典图像处理里面经常使用的计算方法。但卷积网络中通常不采用,这是因为在卷积网络中,卷积模板通常都比较小,例如 3×3 等。这种情况下,FFT的时间开销反而更大,FFT适用于7x7、9x9、11x11等等这样的大卷积才划算。
Winograd 是存在已久最近被重新发现的方法。在大部分场景中,Winograd 方法都显示和较大的优势。目前 cudnn 中计算卷积就使用了该方法,适合3x3的小卷积核。
gemm interleave+transpose
#define SUM_VEC4(vec4) vec4.s0+vec4.s1+vec4.s2+vec4.s3
__kernel void mat_trans_vec4(const int k,
const int n,
__global const CL_INPUT_TYPE *b,
__global CL_INPUT_TYPE *bT) {
const int col = get_global_id(0) * VEC_LEN;
const int row = get_global_id(1);
CL_ELEM_TYPE bb = vload4(0, b+row*n+col);
vstore4(bb, 0, bT+(col/VEC_LEN) * (VEC_LEN*k) + row*VEC_LEN);
}
__kernel void mat_interleave_vec4(const int m,
const int k,
__global const CL_INPUT_TYPE *a,
__global CL_INPUT_TYPE *aI) {
const int col = get_global_id(0) * VEC_LEN;
const int row = get_global_id(1) * VEC_LEN;
CL_ELEM_TYPE aa0 = vload4(0, a + row * k + col),
aa1 = vload4(0, a + (row+1) * k + col),
aa2 = vload4(0, a + (row+2) * k + col),
aa3 = vload4(0, a + (row+3) * k + col);
CL_ELEM_TYPE
res = (CL_ELEM_TYPE)(aa0.s0, aa1.s0, aa2.s0, aa3.s0);
vstore4(res, 0, aI+(row/VEC_LEN)*(VEC_LEN*k) + (col*VEC_LEN));
res = (CL_ELEM_TYPE)(aa0.s1, aa1.s1, aa2.s1, aa3.s1);
vstore4(res, 0, aI+(row/VEC_LEN)*(VEC_LEN*k) + (col*VEC_LEN+VEC_LEN));
res = (CL_ELEM_TYPE)(aa0.s2, aa1.s2, aa2.s2, aa3.s2);
vstore4(res, 0, aI+(row/VEC_LEN)*(VEC_LEN*k) + (col*VEC_LEN+VEC_LEN*2));
res = (CL_ELEM_TYPE)(aa0.s3, aa1.s3, aa2.s3, aa3.s3);
vstore4(res, 0, aI+(row/VEC_LEN)*(VEC_LEN*k) + (col*VEC_LEN+VEC_LEN*3));
}
// float4
// [256,256,1] [1,1,1] p+=8 100times 1024x1024x1024 0.220724 s 9.729284 GFLOPS
// [256,256,1] [4,4,1] p+=8 100times 1024x1024x1024 0.081261 s 26.427127 GFLOPS
// [256,256,1] [4,4,1] p+=12 100times 1024x1024x1020 0.075207 s 28.442905 GFLOPS
// lws_calc_fp32_float4_max_256_gls_2048_2048.log:>>> [INFO] 1.00 CL_GPU 2048x2048x2040 {512, 512, 1} {8, 32, 1} 0.590064 s 29.001546 GFLOPS
// half4
// [256,256,1] [1,1,1] p+=8 100times 1024x1024x1024 0.103472 s 20.754221 GFLOPS
// [256,256,1] [4,4,1] p+=8 100times 1024x1024x1024 0.061210 s 35.084041 GFLOPS
// [256,256,1] [4,4,1] p+=12 100times 1024x1024x1024 0.058183 s 36.765208 GFLOPS
__kernel void gemm_interleaved_transposed_vec4(const int aI_height, // height of aI
const int bT_height, // height of bT
const int aI_width, // width of aI or bT
__global const CL_INPUT_TYPE *aI,
__global const CL_INPUT_TYPE *bT,
__global CL_INPUT_TYPE *c) {
#ifndef USE_LOCAL_WOKR_SIZE
const int col = get_global_id(0); // col of bT: [0, n/4) <==> [0, bT_height)
const int row = get_global_id(1); // row of aI: [0, m/4) <==> [0, aI_height)
#else
const int row = get_group_id(1) * VEC_LEN + get_local_id(1);
const int col = get_group_id(0) * VEC_LEN + get_local_id(0);
#endif
CL_ELEM_TYPE c00 = 0.0,
c10 = 0.0,
c20 = 0.0,
c30 = 0.0;
for (int p = 0; p < aI_width; p += 12) {
CL_ELEM_TYPE
aa = vload4(0, aI + row * aI_width + p),
bb = vload4(0, bT + col * aI_width + p);
c00 += (CL_ELEM_TYPE)aa.s0 * bb;
c10 += (CL_ELEM_TYPE)aa.s1 * bb;
c20 += (CL_ELEM_TYPE)aa.s2 * bb;
c30 += (CL_ELEM_TYPE)aa.s3 * bb;
aa = vload4(0, aI + row * aI_width + p+VEC_LEN);
bb = vload4(0, bT + col * aI_width + p+VEC_LEN);
c00 += (CL_ELEM_TYPE)aa.s0 * bb;
c10 += (CL_ELEM_TYPE)aa.s1 * bb;
c20 += (CL_ELEM_TYPE)aa.s2 * bb;
c30 += (CL_ELEM_TYPE)aa.s3 * bb;
aa = vload4(0, aI + row * aI_width + p+VEC_LEN*2);
bb = vload4(0, bT + col * aI_width + p+VEC_LEN*2);
c00 += (CL_ELEM_TYPE)aa.s0 * bb;
c10 += (CL_ELEM_TYPE)aa.s1 * bb;
c20 += (CL_ELEM_TYPE)aa.s2 * bb;
c30 += (CL_ELEM_TYPE)aa.s3 * bb;
}
vstore4(c00, 0, c+(row*VEC_LEN ) * (bT_height*VEC_LEN) + (col*VEC_LEN));
vstore4(c10, 0, c+(row*VEC_LEN+1) * (bT_height*VEC_LEN) + (col*VEC_LEN));
vstore4(c20, 0, c+(row*VEC_LEN+2) * (bT_height*VEC_LEN) + (col*VEC_LEN));
vstore4(c30, 0, c+(row*VEC_LEN+3) * (bT_height*VEC_LEN) + (col*VEC_LEN));
}
andravin/wincnn: Winograd minimal convolution algorithm generator for convolutional neural networks. https://github.com/andravin/wincnn
【caffe,这个代码分析的好】卷积层代码解析 - Atanisi - 博客园
afantideng/caffe_comments: Caffe 源码注释
Caffe: caffe::ConvolutionLayer< Dtype > Class Template Reference
Hardware Accelerators for Machine Learning (CS 217) by cs217
深度学习(四十)——模型压缩与加速, 多任务学习
Convolution optimisation resources · numforge/laser Wiki
caffe中ConvolutionLayer的前向和反向传播解析及源码阅读 - 洗盏更酌 - 博客园
(原)使用intel的ipp库计算卷积及相关 - darkknightzh - 博客园
Convolution in Caffe: a memo · Yangqing/caffe Wiki
卷积优化