chenhy97 / canny_simd

1 stars 0 forks source link

运行环境


使用的SIMD指令集


优化函数介绍


不足之处


优化思路


高斯核生成


    Mat createGaussianKernel2D(int ksize,float sigma){
    // Scalar reference implementation: fills a ksize x ksize CV_32FC1 matrix with
    // Gaussian weights centred on (ksize/2, ksize/2), then divides every weight
    // by the running total accumulated while filling.
    Mat kernel = Mat::zeros(ksize,ksize,CV_32FC1);
    const int center = ksize/2;
    float sum = 0;

    // Pass 1: evaluate the unnormalised Gaussian and accumulate the total.
    for(int row = 0;row < ksize;++row){
        float *line = kernel.ptr<float>(row);
        const int dy = row - center;
        for(int col = 0;col < ksize;++col){
            const int dx = col - center;
            // Integer squared distance, negated, then divided by 2*sigma^2.
            const float exponent = ((-1)*(dy*dy+dx*dx))/(2*sigma*sigma);
            line[col] = (1/(2*PI*sigma*sigma))*exp(exponent);
            sum = sum + line[col];
        }
    }

    // Pass 2: divide every weight by the total (candidate for vectorisation).
    for(int row = 0;row < ksize;++row){
        float *line = kernel.ptr<float>(row);
        for(int col = 0;col < ksize;++col){
            line[col] = line[col]/sum;
        }
    }
    return kernel;
}
// SSE version of createGaussianKernel2D: builds a ksize x ksize CV_32FC1
// Gaussian kernel, four columns per iteration, then divides every entry by the
// accumulated total.
//
// ksize : side length of the square kernel.
// sigma : standard deviation used in the exponent and the scale factor.
// Returns the normalised kernel.
//
// Full groups of four columns go through FastExpSse (defined elsewhere in this
// file); leftover columns use exp(), so the two paths may differ slightly in
// precision.
Mat simd_to_create(int ksize,float sigma){
    Mat kernel = Mat::zeros(ksize,ksize,CV_32FC1);
    const float center = ksize/2;               // integer division, same centre as the scalar version
    const float double_sigma = (2*sigma*sigma);
    const float PI_sigma = 1/(2*PI*sigma*sigma);
    float sum = 0;

    // Per-lane column offsets {0,1,2,3}. They are the same for every group of
    // four columns, so they are loaded once; indexing this array by the column
    // position would read past its four elements.
    __attribute__((aligned(16))) float lane_offsets[4] = {0,1,2,3};
    const __m128 lanes = _mm_load_ps(lane_offsets);
    const __m128 neg_one = _mm_set_ps1(-1);
    const __m128 denom = _mm_set_ps1(double_sigma);
    const __m128 scale = _mm_set_ps1(PI_sigma);

    for(int i = 0;i < ksize;i ++){
        float *data = kernel.ptr<float>(i);
        const float dy = i - center;
        const __m128 dy_vec = _mm_set_ps1(dy);
        const __m128 dy2 = _mm_mul_ps(dy_vec,dy_vec);
        int j = 0;
        // One 128-bit register holds four 32-bit floats: handle columns j..j+3.
        for(;j + 4 <= ksize;j += 4){
            const float dx_base = j - center;
            const __m128 dx = _mm_add_ps(lanes,_mm_set_ps1(dx_base));
            const __m128 dist2 = _mm_add_ps(_mm_mul_ps(dx,dx),dy2);
            __m128 arg = _mm_div_ps(_mm_mul_ps(dist2,neg_one),denom);
            __m128 value = FastExpSse(arg);
            value = _mm_mul_ps(scale,value);
            _mm_storeu_ps(data + j,value);
            sum = sum + data[j] + data[j + 1] + data[j + 2] + data[j + 3];
        }
        // Scalar tail for the columns that do not fill a whole vector.
        for(;j < ksize;j ++){
            float temp_abc = ((-1)*((i - center)*(i - center)+(j - center)*(j - center)))/double_sigma;
            data[j] = PI_sigma*exp(temp_abc);
            sum = sum + data[j];
        }
    }

    // Normalisation: vector division for full groups, scalar for the remainder,
    // so every entry is divided exactly once whatever ksize is.
    const __m128 total = _mm_set_ps1(sum);
    for(int i = 0;i < ksize;i ++){
        float *data = kernel.ptr<float>(i);
        int j = 0;
        for(;j + 4 <= ksize;j += 4){
            __m128 row = _mm_loadu_ps(data + j);
            row = _mm_div_ps(row,total);
            _mm_storeu_ps(data + j,row);
        }
        for(;j < ksize;j ++){
            data[j] = data[j]/sum;
        }
    }
    return kernel;
}

Sobel算子计算梯度


void SobelGradDirection(const Mat src,Mat &sobelx,Mat &sobely,float *pointDirection){
    for(int i =0;i < src.rows - 1;i ++){
        pointDirection[i] = 0;
    }
    sobelx = Mat::zeros(src.size(),CV_32SC1);
    sobely = Mat::zeros(src.size(),CV_32SC1);
    uchar* P = src.data;
    uchar* px = sobelx.data;
    uchar* py = sobely.data;
    int step = src.step;
    int stepXY = sobelx.step;
    long long k = 0;
    int i,j;
    for(i = 1;i < (src.rows - 1);i ++){
        for(j = 1;j <(src.cols - 1);j ++){
            float gradY = P[(i-1)*step + j + 1] + P[i*step + j + 1]*2 + P[(i + 1)*step + j + 1] -  P[(i-1)*step + j - 1] - P[i*step + j - 1]*2 - P[(i + 1)*step + j - 1];
            py[i * stepXY + j * (stepXY/step)] = abs(gradY);//不能直接store,因为不是放在一个连续的空间中。

            float gradX = P[(i+1)*step + j - 1] + P[(i + 1) * step + j] * 2+P[( i + 1) * step+ j + 1]-P[(i-1)*step+j-1]-P[(i-1)*step+j]*2-P[(i-1)*step+j+1];
            px[i * stepXY + j * (stepXY/step)] = abs(gradX);
            if(gradX == 0){
                gradX = 0.0000000001;//防止除法为0
            }
            pointDirection[k] = atan(gradY/gradX)*57.3;
            pointDirection[k] += 90;
            k ++;
        }
    }
    convertScaleAbs(sobelx,sobelx);//对于每个输入数组的元素函数convertScaleAbs 进行三次操作依次是:缩放,得到一个绝对值,转换成无符号8位类型
    convertScaleAbs(sobely,sobely);
}
void simd_SobelGradDirection(const Mat src,Mat &sobelx,Mat &sobely,float *pointDirection){
    for(int i =0;i < src.rows - 1;i ++){
        pointDirection[i] = 0;
    }
    sobelx = Mat::zeros(src.size(),CV_32SC1);
    sobely = Mat::zeros(src.size(),CV_32SC1);
    uchar* P = src.data;
    uchar* px = sobelx.data;
    uchar* py = sobely.data;
    float temp[20];
    float temp1[20];
    uchar for_px[512];//can't be the point ,all can not use the point type change
    uchar for_py[512];
    int step = src.step;
    int stepXY = sobelx.step;
    long long k = 0;
    int i,j;
    __m128i X,Y;

    __m128 floats1,floats2,floats3,floats4,sub_min,mask,mask_low1;
    mask_low1 = _mm_set1_ps(1);
    for(i = 1;i < (src.rows - 1);i ++){
        for(j = 1;j + 4 <(src.cols - 1);j = j + 4){
            X = _mm_loadu_si128((__m128i*)(P + ((i - 1) * step + j + 1)));
            Y = _mm_loadu_si128((__m128i*)(P + i * step + j + 1));
            X = _mm_cvtepu8_epi32(X);//8->32 整形。
            Y = _mm_cvtepu8_epi32(Y);
            floats1 = _mm_cvtepi32_ps(X);//整型-> floats
            floats2 = _mm_cvtepi32_ps(Y);
            floats2 = _mm_add_ps(floats2,floats2);
            floats1 = _mm_add_ps(floats1,floats2);
            X = _mm_loadu_si128((__m128i*)(P + ((i + 1) * step + j + 1)));
            X = _mm_cvtepu8_epi32(X);
            floats2 = _mm_cvtepi32_ps(X);
            floats1 = _mm_add_ps(floats1,floats2);
            X = _mm_loadu_si128((__m128i*)(P + ((i - 1) * step + j - 1)));
            X = _mm_cvtepu8_epi32(X);
            floats2 = _mm_cvtepi32_ps(X);
            floats1 = _mm_sub_ps(floats1,floats2);
            X = _mm_loadu_si128((__m128i*)(P + (i * step + j - 1)));
            X = _mm_cvtepu8_epi32(X);
            floats2 = _mm_cvtepi32_ps(X);
            floats2 = _mm_add_ps(floats2,floats2);
            floats1 = _mm_sub_ps(floats1,floats2);
            X = _mm_loadu_si128((__m128i*)(P + ((i + 1) * step + j - 1)));
            X = _mm_cvtepu8_epi32(X);
            floats2 = _mm_cvtepi32_ps(X);
            floats1 = _mm_sub_ps(floats1,floats2);
            floats3 = floats1;
            //_mm_storeu_ps(temp,floats1);
            //
            //
            floats1 = abs_vec(floats1);
            X = _mm_cvtps_epi32(floats1);    // Convert them to 32-bit ints
            X = _mm_packus_epi32(X, X);        // Pack down to 16 bits
            X = _mm_packus_epi16(X, X);        // Pack down to 8 bits
            *(int *)for_py = _mm_cvtsi128_si32(X); // Store the lower 32 bits
            py[i * stepXY + j * (stepXY/step)] = for_py[0];
            py[i * stepXY + (j + 1) * (stepXY/step)] = for_py[1];
            py[i * stepXY + (j + 2) * (stepXY/step)] = for_py[2];
            py[i * stepXY + (j + 3) * (stepXY/step)] = for_py[3];
            X = _mm_loadu_si128((__m128i*)(P + (i + 1) * step + j - 1));
            Y = _mm_loadu_si128((__m128i*)(P + (i + 1) * step + j ));
            X = _mm_cvtepu8_epi32(X);//8->32 整形。
            Y = _mm_cvtepu8_epi32(Y);
            floats1 = _mm_cvtepi32_ps(X);//整型-> floats
            floats2 = _mm_cvtepi32_ps(Y);
            floats2 = _mm_add_ps(floats2,floats2);
            floats1 = _mm_add_ps(floats1,floats2);
            X = _mm_loadu_si128((__m128i*)(P + ((i + 1) * step + j + 1)));
            X = _mm_cvtepu8_epi32(X);
            floats2 = _mm_cvtepi32_ps(X);
            floats1 = _mm_add_ps(floats1,floats2);
            X = _mm_loadu_si128((__m128i*)(P + ((i - 1) * step + j - 1)));
            X = _mm_cvtepu8_epi32(X);
            floats2 = _mm_cvtepi32_ps(X);
            floats1 = _mm_sub_ps(floats1,floats2);
            X = _mm_loadu_si128((__m128i*)(P + ((i - 1) * step + j)));
            X = _mm_cvtepu8_epi32(X);
            floats2 = _mm_cvtepi32_ps(X);
            floats2 = _mm_add_ps(floats2,floats2);
            floats1 = _mm_sub_ps(floats1,floats2);
            X = _mm_loadu_si128((__m128i*)(P + ((i - 1) * step + j + 1)));
            X = _mm_cvtepu8_epi32(X);
            floats2 = _mm_cvtepi32_ps(X);
            floats1 = _mm_sub_ps(floats1,floats2);
            floats4 = floats1;
            floats1 = abs_vec(floats1);
            //_mm_storeu_ps(temp1,floats4);
            X = _mm_cvtps_epi32(floats1);    // Convert them to 32-bit ints
            X = _mm_packus_epi32(X, X);        // Pack down to 16 bits
            X = _mm_packus_epi16(X, X);        // Pack down to 8 bits
            *(int *)for_px = _mm_cvtsi128_si32(X); // Store the lower 32 bits
            px[i * stepXY + j * (stepXY/step)] = for_px[0];
            px[i * stepXY + (j + 1) * (stepXY/step)] = for_px[1];
            px[i * stepXY + (j + 2) * (stepXY/step)] = for_px[2];
            px[i * stepXY + (j + 3) * (stepXY/step)] = for_px[3];
            sub_min = _mm_sub_ps(floats4,_mm_setzero_ps());
            mask = _mm_cmpeq_ps(sub_min,_mm_setzero_ps());
            mask = _mm_and_ps(mask,mask_low1);//判断gradX是否为0,为0的话,最低位改为1
            floats4 = _mm_or_ps(mask,floats4);
            floats3 = _mm_div_ps(floats3,floats4);
            floats3 = atan_ps(floats3);
            floats4 = _mm_set1_ps(57.3);
            floats3 = _mm_mul_ps(floats3,floats4);
            floats4 = _mm_set1_ps(90);
            floats3 = _mm_add_ps(floats4,floats3);
            _mm_storeu_ps(pointDirection + k,floats3);
            k = k + 4;
        }
        for(;j < (src.cols- 1);j ++){
            float gradY = P[(i-1)*step + j + 1] + P[i*step + j + 1]*2 + P[(i + 1)*step + j + 1] -  P[(i-1)*step + j - 1] - P[i*step + j - 1]*2 - P[(i + 1)*step + j - 1];
            py[i * stepXY + j * (stepXY/step)] = abs(gradY);
            float gradX = P[(i+1)*step + j - 1] + P[(i + 1) * step + j] * 2+P[( i + 1) * step+ j + 1]-P[(i-1)*step+j-1]-P[(i-1)*step+j]*2-P[(i-1)*step+j+1];
            px[i * stepXY + j * (stepXY/step)] = abs(gradX);
            if(gradX == 0){
                gradX = 0.0000000001;//防止除法为0
            }
            pointDirection[k] = atan(gradY/gradX)*57.3;
            pointDirection[k] += 90;
            k ++;
        }
    }

    convertScaleAbs(sobelx,sobelx);//对于每个输入数组的元素函数convertScaleAbs 进行三次操作依次是:缩放,得到一个绝对值,转换成无符号8位类型
    convertScaleAbs(sobely,sobely);
}

Sobel算子计算幅值


void simd_SobelAmplitude(Mat &sobelx,Mat &sobely,Mat &SobelXY){
    SobelXY = Mat::zeros(sobelx.size(),CV_32FC1);
    __m128i X,Y;
    __m128 floats1,floats2;
    for(int i = 0;i < SobelXY.rows;i ++){
        float *data = SobelXY.ptr<float>(i);
        uchar *datax = sobelx.ptr<uchar>(i);
        uchar *datay = sobely.ptr<uchar>(i);
        int j;
        for(j = 0;j + 4 < SobelXY.cols;j = j + 4){
            X = _mm_loadu_si128((__m128i*)(datax + j));
            Y = _mm_loadu_si128((__m128i*)(datay + j));
            X = _mm_cvtepu8_epi32(X);//8->32 整形。
            Y = _mm_cvtepu8_epi32(Y);
            floats1 = _mm_cvtepi32_ps(X);//整型-> floats,得到dataX
            floats2 = _mm_cvtepi32_ps(Y);//得到dataY
            floats1 = _mm_mul_ps(floats1,floats1);
            floats2 = _mm_mul_ps(floats2,floats2);
            floats1 = _mm_add_ps(floats1,floats2);
            floats1 = _mm_sqrt_ps(floats1);
            _mm_storeu_ps(data + j,floats1);
        }
        for(;j < SobelXY.cols;j ++){
            data[j] = sqrt(datax[j] * datax[j] + datay[j]*datay[j]);

        }
    }
    convertScaleAbs(SobelXY,SobelXY);
}

双阈值抑制


// Scalar double threshold, applied in place: every byte above highThreshold
// becomes 255, every byte below LowThreshod becomes 0, and values in between
// are left untouched.
void DoubleThreshold(Mat &Input,uchar LowThreshod,uchar highThreshold){
    for(int row = 0;row < Input.rows;++row){
        uchar *pixel = Input.ptr<uchar>(row);
        uchar *const rowEnd = pixel + Input.cols;
        for(;pixel < rowEnd;++pixel){
            if(*pixel > highThreshold){
                *pixel = 255;
            }
            // A pixel just raised to 255 can never be below an 8-bit threshold,
            // so the two tests are mutually exclusive.
            else if(*pixel < LowThreshod){
                *pixel = 0;
            }
        }
    }
}

sub_max:

-10|-1|-2|3
----|----|----|----

sub_min:

-1|-10|2|-3
----|----|----|----

然后将得到的sub_min 和sub_max与全0比较,如果sub_min>0,则将那一个大于0的8位数置为1,否则设为0,如果sub_max>0,则将那一个大于0的8位数置为1,否则为0.

mask_hi:

0|0|0|1
----|----|----|----

mask_lo:

0|0|1|0
----|----|----|----

然后将mask_hi与原值相或,即可将该值改为255,将mask_lo与原值相与,可将该值设为0.

// SSE double threshold, applied in place, 16 pixels per iteration: bytes above
// highThreshold become 255, bytes below LowThreshod become 0, everything else
// is kept. Produces the same result as DoubleThreshold.
//
// SSE2 only offers a signed 8-bit greater-than, so both the pixels and the
// thresholds are XORed with 0x80 first; that maps the unsigned range 0..255
// onto the signed range -128..127 while preserving order, which makes the
// signed compare equivalent to an unsigned one.
void simd_DoubleThreshold(Mat &Input,uchar LowThreshod,uchar highThreshold){
    const __m128i signFlip = _mm_set1_epi8(static_cast<char>(0x80));
    const __m128i lowBiased = _mm_xor_si128(_mm_set1_epi8(static_cast<char>(LowThreshod)),signFlip);
    const __m128i highBiased = _mm_xor_si128(_mm_set1_epi8(static_cast<char>(highThreshold)),signFlip);
    for(int i = 0;i < Input.rows;i ++){
        uchar *data = Input.ptr<uchar>(i);
        int j = 0;
        for(;j + 16 <= Input.cols;j = j + 16){
            __m128i *slot = reinterpret_cast<__m128i*>(data + j);
            const __m128i pixels = _mm_loadu_si128(slot);
            const __m128i biased = _mm_xor_si128(pixels,signFlip);
            const __m128i aboveHigh = _mm_cmpgt_epi8(biased,highBiased); // all ones where pixel > high
            const __m128i belowLow = _mm_cmpgt_epi8(lowBiased,biased);   // all ones where pixel < low
            // Clear the bytes below the low threshold, then force the bytes
            // above the high threshold to 0xFF.
            const __m128i kept = _mm_andnot_si128(belowLow,pixels);
            _mm_storeu_si128(slot,_mm_or_si128(kept,aboveHigh));
        }
        // Scalar tail for the columns that do not fill a whole vector.
        for(;j < Input.cols;j = j + 1){
            if(data[j] > highThreshold){
                data[j] = 255;
            }
            if(data[j] < LowThreshod){
                data[j] = 0;
            }
        }
     }
}

一些小技巧:


X = _mm_loadu_si128((__m128i*)(datax + j));//加载16个8位的数据
X = _mm_cvtepu8_epi32(X);//8->32 整形。
floats1 = _mm_cvtepi32_ps(X);//整型-> floats
loaded8 = _mm_loadu_si128((__m128i*)(data + j));//加载128位数据,其中有16个8位数据。
//subtract 128 from every 8-bit int
sub_min = _mm_sub_epi8(loaded8, _mm_set1_epi8(LowThreshod));
sub_max = _mm_sub_epi8(loaded8, _mm_set1_epi8(highThreshold));
mask_hi = _mm_cmpgt_epi8(sub_max,_mm_setzero_si128());//submax>0 8位全1,否则全0
mask_lo = _mm_cmpgt_epi8(sub_min,_mm_setzero_si128());//submin<=0,8位全0,否则全1
loaded8 = _mm_and_si128(loaded8, mask_lo);
loaded8 = _mm_or_si128(loaded8,mask_hi);
_mm_storeu_si128((__m128i *)(data + j), loaded8);//存下128位数据

实验结果

=====