DennisLiu1993 / Fastest_Image_Pattern_Matching

C++ implementation of a ScienceDirect paper "An accelerating cpu-based correlation-based image alignment for real-time automatic optical inspection"
BSD 2-Clause "Simplified" License
840 stars 205 forks source link

如何实现带掩膜的快速卷积 #60

Open Roger8 opened 2 months ago

Roger8 commented 2 months ago

注意到代码中不带掩膜的快速卷积,带掩膜的可以类似操作吗,要如何修改

//From ImageShop
// 4個有符號的32位的數據相加的和。
inline int _mm_hsum_epi32 (__m128i V)      // V3 V2 V1 V0
{
    // 實測這個速度要快些,_mm_extract_epi32最慢。
    __m128i T = _mm_add_epi32 (V, _mm_srli_si128 (V, 8));  // V3+V1   V2+V0  V1  V0  
    T = _mm_add_epi32 (T, _mm_srli_si128 (T, 4));    // V3+V1+V2+V0  V2+V0+V1 V1+V0 V0 
    return _mm_cvtsi128_si32 (T);       // 提取低位 
}
// 基於SSE的字節數據的乘法。
// <param name="Kernel">需要卷積的核矩陣。 </param>
// <param name="Conv">卷積矩陣。 </param>
// <param name="Length">矩陣所有元素的長度。 </param>
inline int IM_Conv_SIMD (unsigned char* pCharKernel, unsigned char *pCharConv, int iLength)
{
    const int iBlockSize = 16, Block = iLength / iBlockSize;
    __m128i SumV = _mm_setzero_si128 ();
    __m128i Zero = _mm_setzero_si128 ();
    for (int Y = 0; Y < Block * iBlockSize; Y += iBlockSize)
    {
        __m128i SrcK = _mm_loadu_si128 ((__m128i*)(pCharKernel + Y));
        __m128i SrcC = _mm_loadu_si128 ((__m128i*)(pCharConv + Y));
        __m128i SrcK_L = _mm_unpacklo_epi8 (SrcK, Zero);
        __m128i SrcK_H = _mm_unpackhi_epi8 (SrcK, Zero);
        __m128i SrcC_L = _mm_unpacklo_epi8 (SrcC, Zero);
        __m128i SrcC_H = _mm_unpackhi_epi8 (SrcC, Zero);
        __m128i SumT = _mm_add_epi32 (_mm_madd_epi16 (SrcK_L, SrcC_L), _mm_madd_epi16 (SrcK_H, SrcC_H));
        SumV = _mm_add_epi32 (SumV, SumT);
    }
    int Sum = _mm_hsum_epi32 (SumV);
    for (int Y = Block * iBlockSize; Y < iLength; Y++)
    {
        Sum += pCharKernel[Y] * pCharConv[Y];
    }
    return Sum;
}
//#define ORG

//From ImageShop
matResult.create (matSrc.rows - pTemplData->vecPyramid[iLayer].rows + 1,
    matSrc.cols - pTemplData->vecPyramid[iLayer].cols + 1, CV_32FC1);
matResult.setTo (0);
cv::Mat& matTemplate = pTemplData->vecPyramid[iLayer];

int  t_r_end = matTemplate.rows, t_r = 0;
for (int r = 0; r < matResult.rows; r++)
{
    float* r_matResult = matResult.ptr<float> (r);
    uchar* r_source = matSrc.ptr<uchar> (r);
    uchar* r_template, *r_sub_source;
    for (int c = 0; c < matResult.cols; ++c, ++r_matResult, ++r_source)
    {
        r_template = matTemplate.ptr<uchar> ();
        r_sub_source = r_source;
        for (t_r = 0; t_r < t_r_end; ++t_r, r_sub_source += matSrc.cols, r_template += matTemplate.cols)
        {
            *r_matResult = *r_matResult + IM_Conv_SIMD (r_template, r_sub_source, matTemplate.cols);
        }
    }
}
//From ImageShop
ershat-dl commented 1 month ago

image

根据公式 带mask比较麻烦,相关性计算时要考虑mask,归一化时又考虑mask, 复杂的会提升不少。

Roger8 commented 1 month ago

对, 看看 是否可以借用SIMD 提升速度