ermig1979 / Simd

C++ image processing and machine learning library with using of SIMD: SSE, AVX, AVX-512, AMX for x86/x64, VMX(Altivec) and VSX(Power7) for PowerPC, NEON for ARM.
http://ermig1979.github.io/Simd
MIT License
2.04k stars 407 forks source link

GaussianBlur #134

Closed wangjianhong0574 closed 3 years ago

wangjianhong0574 commented 3 years ago

I used your Gaussian Blur SimdGaussianBlur3x3, It is very fast, thank you. I used SimdGaussianBlur3x3 to enhancement of image sharpness with Gaussian blur,but 3X3 that sigma 1 is too small,The sharpness is not obvious。I know that Gaussian Blur it's used very often,you have plans to increase sigmais 2 ,3 and so on.

int CLIP3(int data, int min, int max) { return (data > max) ? max : ((data < min) ? min : data); if (data > #max) return max; else if (data > min) return data; else return min; } int f_USM(unsigned char srcData, unsigned char dstData, int width, int #height,int radius, int amount, int threshold) { radius = CLIP3(radius, 0, 100); amount = CLIP3(amount, 0, 500); SimdGaussianBlur3x3(srcData, width 4, width, height, 4, dstData, width 4); int i, j, r, g, b, offset; offset = stride - width 4; amount = amount 128 / 100; unsigned char pSrc = srcData; unsigned char pDst = dstData; for (j = 0; j < height; j++) { for (i = 0; i < width; i++) { b = pSrc[0] - pDst[0]; g = pSrc[1] - pDst[1]; r = pSrc[2] - pDst[2];

        b = (pSrc[0] + ((b * amount) >> 7));
        g = (pSrc[1] + ((g * amount) >> 7));
        r = (pSrc[2] + ((r * amount) >> 7));
        pSrc[0] = CLIP3(b, 0, 255);
        pSrc[1] = CLIP3(g, 0, 255);
        pSrc[2] = CLIP3(r, 0, 255);
        pSrc += 4;
        pDst += 4;
    }
    pSrc += offset;
    pDst += offset;
}

}

ermig1979 commented 3 years ago

Current version of Gaussian Blur 3x3 uses integer calculations. I can add GaussianBlur width window 5x5, 7x7 and 9x9. The larger windows will causes 32-bit integer overflow.

wangjianhong0574 commented 3 years ago

thank you! yes the larger windows will causes 32-bit integer overflow. I also have a Gaussian fuzzy algorithm here. but the speed is not fast.

void CalcGaussCof(float Radius, float &B0, float &B1, float &;B2, float &B3) { float Q, B; if (Radius >= 2.5) Q = (double)(0.98711 Radius - 0.96330); else if ((Radius >= 0.5) &&(Radius < 2.5)) Q = (double)(3.97156 - 4.14554 sqrt(1 - 0.26891 * Radius)); else Q = (double)0.1147705018520355224609375;

B = 1.57825 + 2.44413 * Q + 1.4281 * Q * Q + 0.422205 * Q * Q * Q;
B1 = 2.44413 * Q + 2.85619 * Q * Q + 1.26661 * Q * Q * Q;
B2 = -1.4281 * Q * Q - 1.26661 * Q * Q * Q;
B3 = 0.422205 * Q * Q * Q;

B0 = 1.0 - (B1 + B2 + B3) / B;
B1 = B1 / B;
B2 = B2 / B;
B3 = B3 / B;

}

void ConvertBGR8U2BGRAF(unsigned char Src, float Dest, int Width, int Height, int Stride) { //#pragma omp parallel for for (int Y = 0; Y < Height; Y++) { unsigned char LinePS = Src + Y Stride; float LinePD = Dest + Y Width * 4; for (int X = 0; X < Width; X++, LinePS += 4, LinePD += 4) { LinePD[0] = LinePS[0]; LinePD[1] = LinePS[1]; LinePD[2] = LinePS[2]; } } }

void ConvertBGR8U2BGRAF_SSE(unsigned char Src, float Dest, int Width, int Height, int Stride) { const int BlockSize = 4; //int Block = (Width - 2) / BlockSize; int Block = (Width) / BlockSize; m128i Mask = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1); __m128i Zero = _mm_setzero_si128(); for (int Y = 0; Y < Height; Y++) { unsigned char LinePS = Src + Y Stride; float LinePD = Dest + Y Width 4; int X = 0; for (; X < Block BlockSize; X += BlockSize, LinePS += BlockSize 4, LinePD += BlockSize 4) { m128i SrcV = _mm_loadu_si128((const m128i*)LinePS); m128i Src16L = _mm_unpacklo_epi8(SrcV, Zero); __m128i Src16H = _mm_unpackhi_epi8(SrcV, Zero); _mm_store_ps(LinePD + 0, _mm_cvtepi32_ps(_mm_unpacklo_epi16(Src16L, Zero))); _mm_store_ps(LinePD + 4, _mm_cvtepi32_ps(_mm_unpackhi_epi16(Src16L, Zero))); _mm_store_ps(LinePD + 8, _mm_cvtepi32_ps(_mm_unpacklo_epi16(Src16H, Zero))); _mm_store_ps(LinePD + 12, _mm_cvtepi32_ps(_mm_unpackhi_epi16(Src16H, Zero))); } for (; X < Width; X++, LinePS += 4, LinePD += 4) { LinePD[0] = LinePS[0]; LinePD[1] = LinePS[1]; LinePD[2] = LinePS[2]; } } }

void ConvertBGR8U2BGRAF_Line_SSE(unsigned char Src, float Dest, int Width) { const int BlockSize = 4; int Block = (Width) / BlockSize; __m128i Mask = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1); __m128i Zero = _mm_setzero_si128(); //#pragma omp parallel for

unsigned char *LinePS = Src;
float *LinePD = Dest;
int X = 0;
for (; X < Block * BlockSize; X += BlockSize, LinePS += BlockSize * 4, LinePD += BlockSize * 4)
{

    __m128i SrcV = _mm_loadu_si128((const __m128i*)LinePS);
    __m128i Src16L = _mm_unpacklo_epi8(SrcV, Zero);
    __m128i Src16H = _mm_unpackhi_epi8(SrcV, Zero);
    _mm_store_ps(LinePD + 0, _mm_cvtepi32_ps(_mm_unpacklo_epi16(Src16L, Zero)));
    _mm_store_ps(LinePD + 4, _mm_cvtepi32_ps(_mm_unpackhi_epi16(Src16L, Zero)));
    _mm_store_ps(LinePD + 8, _mm_cvtepi32_ps(_mm_unpacklo_epi16(Src16H, Zero)));
    _mm_store_ps(LinePD + 12, _mm_cvtepi32_ps(_mm_unpackhi_epi16(Src16H, Zero)));
}
for (; X < Width; X++, LinePS += 4, LinePD += 4)
{
    LinePD[0] = LinePS[0]; LinePD[1] = LinePS[1]; LinePD[2] = LinePS[2];
}

} void GaussBlurFromLeftToRight(float *Data, int Width, int Height, float B0, float B1, float B2, float B3) {

for (int Y = 0; Y < Height; Y++)
{
    float *LinePD = Data + Y * Width * 4;
    //w[n-1], w[n-2], w[n-3]
    float BS1 = LinePD[0], BS2 = LinePD[0], BS3 = LinePD[0];
    float GS1 = LinePD[1], GS2 = LinePD[1], GS3 = LinePD[1];
    float RS1 = LinePD[2], RS2 = LinePD[2], RS3 = LinePD[2];
    for (int X = 0; X < Width; X++, LinePD += 4)
    {
        LinePD[0] = LinePD[0] * B0 + BS1 * B1 + BS2 * B2 + BS3 * B3;
        LinePD[1] = LinePD[1] * B0 + GS1 * B1 + GS2 * B2 + GS3 * B3;
        LinePD[2] = LinePD[2] * B0 + RS1 * B1 + RS2 * B2 + RS3 * B3;
        BS3 = BS2, BS2 = BS1, BS1 = LinePD[0];
        GS3 = GS2, GS2 = GS1, GS1 = LinePD[1];
        RS3 = RS2, RS2 = RS1, RS1 = LinePD[2];
    }
}

}

void GaussBlurFromLeftToRight_SSE(float Data, int Width, int Height, float B0, float B1, float B2, float B3) { const m128 CofB0 = _mm_set_ps(0, B0, B0, B0); const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); const m128 CofB2 = _mm_set_ps(0, B2, B2, B2); const __m128 CofB3 = _mm_set_ps(0, B3, B3, B3); for (int Y = 0; Y < Height; Y++) { float LinePD = Data + Y Width 4; m128 V1 = _mm_set_ps(LinePD[3], LinePD[2], LinePD[1], LinePD[0]); m128 V2 = V1, V3 = V1; for (int X = 0; X < Width; X++, LinePD += 4) { m128 V0 = _mm_load_ps(LinePD); m128 V01 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V1)); __m128 V23 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V3)); __m128 V = _mm_add_ps(V01, V23); V3 = V2; V2 = V1; V1 = V; _mm_store_ps(LinePD, V); } } }

void GaussBlurLeftToRight_OneLine_SSE(float *Data, int Width, float B0, float B1, float B2, float B3) { const m128 CofB0 = _mm_set_ps(0, B0, B0, B0); const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); const m128 CofB2 = _mm_set_ps(0, B2, B2, B2); const __m128 CofB3 = _mm_set_ps(0, B3, B3, B3);

float *LinePD = Data;
__m128 V1 = _mm_set_ps(LinePD[3], LinePD[2], LinePD[1], LinePD[0]);
__m128 V2 = V1, V3 = V1;
for (int X = 0; X < Width; X++, LinePD += 4) {
    __m128 V0 = _mm_load_ps(LinePD);
    __m128 V01 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V1));
    __m128 V23 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V3));
    __m128 V = _mm_add_ps(V01, V23);
    V3 = V2; V2 = V1; V1 = V;
    _mm_store_ps(LinePD, V);
}

} void GaussBlurLeftToRight_TwoLine_SSE(float *Data, int Width, float B0, float B1, float B2, float B3) { const m128 CofB0 = _mm_set_ps(0, B0, B0, B0); const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); const m128 CofB2 = _mm_set_ps(0, B2, B2, B2); const __m128 CofB3 = _mm_set_ps(0, B3, B3, B3);

float *LinePD1 = Data;
float *LinePD2 = Data + Width * 4;

__m128 V1 = _mm_load_ps(LinePD1);
__m128 V2 = V1, V3 = V1;

__m128 W1 = _mm_load_ps(LinePD2);
__m128 W2 = W1, W3 = W1;
for (int X = 0; X < Width; X++, LinePD1 += 4, LinePD2 += 4) {
    __m128 V0 = _mm_load_ps(LinePD1);
    __m128 W0 = _mm_load_ps(LinePD2);
    __m128 V01 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V1));
    __m128 W01 = _mm_add_ps(_mm_mul_ps(CofB0, W0), _mm_mul_ps(CofB1, W1));
    __m128 V23 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V3));
    __m128 W23 = _mm_add_ps(_mm_mul_ps(CofB2, W2), _mm_mul_ps(CofB3, W3));
    __m128 V = _mm_add_ps(V01, V23);
    __m128 W = _mm_add_ps(W01, W23);
    V3 = V2; V2 = V1; V1 = V;
    W3 = W2; W2 = W1; W1 = W;
    _mm_store_ps(LinePD1, V);
    _mm_store_ps(LinePD2, W);
}

}

// row blur void GaussBlurFromRightToLeft(float Data, int Width, int Height, float B0, float B1, float B2, float B3) { for (int Y = 0; Y < Height; Y++) { //w[n+1], w[n+2], w[n+3] float LinePD = Data + Y Width 4 + (Width 4); float BS1 = LinePD[0], BS2 = LinePD[0], BS3 = LinePD[0];  float GS1 = LinePD[1], GS2 = LinePD[1], GS3 = LinePD[1]; float RS1 = LinePD[2], RS2 = LinePD[2], RS3 = LinePD[2]; for (int X = Width - 1; X >= 0; X--, LinePD -= 4) { LinePD[0] = LinePD[0] B0 + BS3 B1 + BS2 B2 + BS1 B3; LinePD[1] = LinePD[1] B0 + GS3 B1 + GS2 B2 + GS1 B3; LinePD[2] = LinePD[2] B0 + RS3 B1 + RS2 B2 + RS1 B3; BS1 = BS2, BS2 = BS3, BS3 = LinePD[0]; GS1 = GS2, GS2 = GS3, GS3 = LinePD[1]; RS1 = RS2, RS2 = RS3, RS3 = LinePD[2]; } } } //row blur sse void GaussBlurFromRightToLeft_SSE(float Data, int Width, int Height, float B0, float B1, float B2, float B3) { const m128 CofB0 = _mm_set_ps(0, B0, B0, B0); const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); const m128 CofB2 = _mm_set_ps(0, B2, B2, B2); const m128 CofB3 = _mm_set_ps(0, B3, B3, B3); // for (int Y = 0; Y < Height; Y++) { for (int Y = 0; Y < Height; Y += 2) { float LinePD1 = Data + Y Width 4 + (Width 4); float LinePD2 = Data + (Y + 1) Width 4 + (Width 4); // __m128 V1 = _mm_set_ps(LinePD[3], LinePD[2], LinePD[1], LinePD[0]); m128 V1 = _mm_load_ps(LinePD1); m128 W1 = _mm_load_ps(LinePD2); m128 V2 = V1, V3 = V1; m128 W2 = W1, W3 = V1; for (int X = Width - 1; X >= 0; X--, LinePD1 -= 4, LinePD2 -= 4) { __m128 V0 = _mm_load_ps(LinePD1); m128 W0 = _mm_load_ps(LinePD2); m128 V03 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V3)); __m128 W03 = _mm_add_ps(_mm_mul_ps(CofB0, W0), _mm_mul_ps(CofB1, W3)); m128 V12 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V1)); m128 W12 = _mm_add_ps(_mm_mul_ps(CofB2, W2), _mm_mul_ps(CofB3, W1)); __m128 V = _mm_add_ps(V03, V12); m128 W = _mm_add_ps(W03, W12); V1 = V2; V2 = V3; V3 = V; W1 = W2; W2 = W3; W3 = W; _mm_store_ps(LinePD1, V); _mm_store_ps(LinePD2, W); } } } void GaussBlurRightToLeft_OneLine_SSE(float *Data, int Width, float B0, float B1, float B2, float B3) { const m128 CofB0 = _mm_set_ps(0, B0, B0, B0); const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); const m128 CofB2 = _mm_set_ps(0, B2, B2, B2); const __m128 CofB3 = _mm_set_ps(0, B3, B3, B3);

float *LinePD1 = Data + (Width * 4);
__m128 V1 = _mm_load_ps(LinePD1);
__m128 V2 = V1, V3 = V1;
for (int X = Width - 1; X >= 0; X--, LinePD1 -= 4) {
    __m128 V0 = _mm_load_ps(LinePD1);
    __m128 V03 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V3));
    __m128 V12 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V1));
    __m128 V = _mm_add_ps(V03, V12);
    V1 = V2; V2 = V3; V3 = V;
    _mm_store_ps(LinePD1, V);
}

} void GaussBlurRightToLeft_TwoLine_SSE(float *Data, int Width, float B0, float B1, float B2, float B3) { const m128 CofB0 = _mm_set_ps(0, B0, B0, B0); const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); const m128 CofB2 = _mm_set_ps(0, B2, B2, B2); const __m128 CofB3 = _mm_set_ps(0, B3, B3, B3);

float *LinePD1 = Data + (Width * 4);
float *LinePD2 = Data + Width * 4 + (Width * 4);
__m128 V1 = _mm_load_ps(LinePD1);
__m128 W1 = _mm_load_ps(LinePD2);
__m128 V2 = V1, V3 = V1;
__m128 W2 = W1, W3 = V1;
for (int X = Width - 1; X >= 0; X--, LinePD1 -= 4, LinePD2 -= 4) {
    __m128 V0 = _mm_load_ps(LinePD1);
    __m128 W0 = _mm_load_ps(LinePD2);
    __m128 V03 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V3));
    __m128 W03 = _mm_add_ps(_mm_mul_ps(CofB0, W0), _mm_mul_ps(CofB1, W3));
    __m128 V12 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V1));
    __m128 W12 = _mm_add_ps(_mm_mul_ps(CofB2, W2), _mm_mul_ps(CofB3, W1));
    __m128 V = _mm_add_ps(V03, V12);
    __m128 W = _mm_add_ps(W03, W12);
    V1 = V2; V2 = V3; V3 = V;
    W1 = W2; W2 = W3; W3 = W;
    _mm_store_ps(LinePD1, V);
    _mm_store_ps(LinePD2, W);
}

}

//w[n] w[n-1], w[n-2], w[n-3] void GaussBlurFromTopToBottom(float Data, int Width, int Height, float B0, float B1, float B2, float B3) { for (int Y = 0; Y < Height; Y++) { float LinePD3 = Data + (Y + 0) Width 4; float LinePD2 = Data + (Y + 1) Width 4; float LinePD1 = Data + (Y + 2) Width 4; float LinePD0 = Data + (Y + 3) Width 4; for (int X = 0; X < Width; X++, LinePD0 += 4, LinePD1 += 4, LinePD2 += 4, LinePD3 += 4) { LinePD0[0] = LinePD0[0] B0 + LinePD1[0] B1 + LinePD2[0] B2 + LinePD3[0] B3; LinePD0[1] = LinePD0[1] B0 + LinePD1[1] B1 + LinePD2[1] B2 + LinePD3[1] B3; LinePD0[2] = LinePD0[2] B0 + LinePD1[2] B1 + LinePD2[2] B2 + LinePD3[2] * B3; } } }

void GaussBlurFromTopToBottom_SSE(float Data, int Width, int Height, float B0, float B1, float B2, float B3){ const  m128 CofB0 = _mm_set_ps(0, B0, B0, B0); const  __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); const  m128 CofB2 = _mm_set_ps(0, B2, B2, B2); const  __m128 CofB3 = _mm_set_ps(0, B3, B3, B3); for (int Y = 0; Y < Height; Y++) { float LinePS3 = Data + (Y + 0) Width 4; float LinePS2 = Data + (Y + 1) Width 4; float LinePS1 = Data + (Y + 2) Width 4; float LinePS0 = Data + (Y + 3) Width 4; for (int X = 0; X < Width 4; X += 4) { m128 V3 = _mm_load_ps(LinePS3 + X); __m128 V2 = _mm_load_ps(LinePS2 + X); m128 V1 = _mm_load_ps(LinePS1 + X); m128 V0 = _mm_load_ps(LinePS0 + X); __m128 V01 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V1)); m128 V23 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V3)); _mm_store_ps(LinePS0 + X, _mm_add_ps(V01, V23)); } } }

void GaussBlurTopToBottom_LowMemory_SSE(float Line0, float Line1, float Line2, float Line3, int Width, float B0, float B1, float B2, float B3){ const m128 CofB0 = _mm_set_ps(0, B0, B0, B0); const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); const m128 CofB2 = _mm_set_ps(0, B2, B2, B2); const __m128 CofB3 = _mm_set_ps(0, B3, B3, B3);

for (int X = 0; X < Width * 4; X += 4)
{

    __m128 V3 = _mm_load_ps(Line0 + X);
    __m128 V2 = _mm_load_ps(Line1 + X);
    __m128 V1 = _mm_load_ps(Line2 + X);
    __m128 V0 = _mm_load_ps(Line3 + X);
    __m128 V01 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V1));
    __m128 V23 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V3));
    _mm_store_ps(Line3 + X, _mm_add_ps(V01, V23));
}

}

//w[n] w[n+1], w[n+2], w[n+3] void GaussBlurFromBottomToTop(float Data, int Width, int Height, float B0, float B1, float B2, float B3) { for (int Y = Height - 1; Y >= 0; Y--) { float LinePD3 = Data + (Y + 3) Width 4; float LinePD2 = Data + (Y + 2) Width 4; float LinePD1 = Data + (Y + 1) Width 4; float LinePD0 = Data + (Y + 0) Width 4; for (int X = 0; X < Width; X++, LinePD0 += 4, LinePD1 += 4, LinePD2 += 4, LinePD3 += 4) { LinePD0[0] = LinePD0[0] B0 + LinePD1[0] B1 + LinePD2[0] B2 + LinePD3[0] B3; LinePD0[1] = LinePD0[1] B0 + LinePD1[1] B1 + LinePD2[1] B2 + LinePD3[1] B3; LinePD0[2] = LinePD0[2] B0 + LinePD1[2] B1 + LinePD2[2] B2 + LinePD3[2] * B3; } } }

void GaussBlurFromBottomToTop_SSE(float Data, int Width, int Height, float B0, float B1, float B2, float B3) { const m128 CofB0 = _mm_set_ps(0, B0, B0, B0); const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); const m128 CofB2 = _mm_set_ps(0, B2, B2, B2); const __m128 CofB3 = _mm_set_ps(0, B3, B3, B3); for (int Y = Height - 1; Y >= 0; Y--) { float LinePS3 = Data + (Y + 3) Width 4; float LinePS2 = Data + (Y + 2) Width 4; float LinePS1 = Data + (Y + 1) Width 4; float LinePS0 = Data + (Y + 0) Width 4; for (int X = 0; X < Width 4; X += 4) { m128 V3 = _mm_load_ps(LinePS3 + X); __m128 V2 = _mm_load_ps(LinePS2 + X); m128 V1 = _mm_load_ps(LinePS1 + X); m128 V0 = _mm_load_ps(LinePS0 + X); __m128 V01 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V1)); m128 V23 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V3)); _mm_store_ps(LinePS0 + X, _mm_add_ps(V01, V23)); } } }

void GaussBlurBottomToTop_LowMemory_SSE(float Line0, float Line1, float Line2, float Line3, int Width, float B0, float B1, float B2, float B3) { const m128 CofB0 = _mm_set_ps(0, B0, B0, B0); const __m128 CofB1 = _mm_set_ps(0, B1, B1, B1); const m128 CofB2 = _mm_set_ps(0, B2, B2, B2); const __m128 CofB3 = _mm_set_ps(0, B3, B3, B3);

for (int X = 0; X < Width * 4; X += 4) {
    __m128 V3 = _mm_load_ps(Line3 + X);
    __m128 V2 = _mm_load_ps(Line2 + X);
    __m128 V1 = _mm_load_ps(Line1 + X);
    __m128 V0 = _mm_load_ps(Line0 + X);

    __m128 V01 = _mm_add_ps(_mm_mul_ps(CofB0, V0), _mm_mul_ps(CofB1, V1));
    __m128 V23 = _mm_add_ps(_mm_mul_ps(CofB2, V2), _mm_mul_ps(CofB3, V3));
    _mm_store_ps(Line0 + X, _mm_add_ps(V01, V23));
}

} void ConvertBGRAF2BGR8U(float Src, unsigned char Dest, int Width, int Height, int Stride) { //#pragma omp parallel for for (int Y = 0; Y < Height; Y++) { float LinePS = Src + Y Width 4; unsigned char LinePD = Dest + Y * Stride; for (int X = 0; X < Width; X++, LinePS += 4, LinePD += 4) { LinePD[0] = LinePS[0]; LinePD[1] = LinePS[1]; LinePD[2] = LinePS[2]; } } }

void ConvertBGRAF2BGR8U_SSE(float Src, unsigned char Dest, int Width, int Height, int Stride) {

for (int Y = 0; Y < Height; Y++)
{
    float *LinePS = Src + Y * Width * 4;
    unsigned char *LinePD = Dest + Y * Stride;
    for (int X = 0; X < Width; X++, LinePS += 4, LinePD += 4)
    {
        LinePD[0] = LinePS[0];LinePD[1] = LinePS[1];LinePD[2] = LinePS[2];
    }
}

}

void ConvertBGRAF2BGR8U_Line_SSE(float Src, unsigned char Dest, int Width) { float LinePS = Src; unsigned char LinePD = Dest; for (int X = 0; X < Width; X++, LinePS += 4, LinePD += 4) { LinePD[0] = LinePS[0];    LinePD[1] = LinePS[1];    LinePD[2] = LinePS[2]; } }

void GaussBlur(unsigned char Src, unsigned char Dest, int Width, int Height, int Stride, float Radius) { float B0, B1, B2, B3; float Buffer = (float )malloc(Width (Height + 6) sizeof(float) 4); CalcGaussCof(Radius, B0, B1, B2, B3); ConvertBGR8U2BGRAF(Src, Buffer + 3 Width 4, Width, Height, Stride); GaussBlurFromLeftToRight(Buffer + 3 Width 4, Width, Height, B0, B1, B2, B3); GaussBlurFromRightToLeft(Buffer + 3 Width * 4, Width, Height, B0, B1, B2, B3);

memcpy(Buffer + 0 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float));
memcpy(Buffer + 1 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float));
memcpy(Buffer + 2 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float));

GaussBlurFromTopToBottom(Buffer, Width, Height, B0, B1, B2, B3);

memcpy(Buffer + (Height + 3) * Width * 4, Buffer + (Height + 2) * Width * 4, Width * 4 * sizeof(float));
memcpy(Buffer + (Height + 4) * Width * 4, Buffer + (Height + 2) * Width * 4, Width * 4 * sizeof(float));
memcpy(Buffer + (Height + 5) * Width * 4, Buffer + (Height + 2) * Width * 4, Width * 4 * sizeof(float));

GaussBlurFromBottomToTop(Buffer, Width, Height, B0, B1, B2, B3);

ConvertBGRAF2BGR8U(Buffer + 3 * Width * 4, Dest, Width, Height, Stride);

free(Buffer);

}

void GaussBlur_SSE(unsigned char Src, unsigned char Dest, int Width, int Height, int Stride, float Radius) {

// way 1
float B0, B1, B2, B3;
float *Line0, *Line1, *Line2, *Line3, *Temp;
//  float *Buffer = (float *)_mm_malloc(Width * (Height + 6) * sizeof(float)* 4, 16);
float *Buffer = (float *)_mm_malloc(Width * 4 * 4 * sizeof(float), 16);
CalcGaussCof(Radius, B0, B1, B2, B3);
int Y = 0;
for (; Y < Height - 1; Y += 2)
{
    ConvertBGR8U2BGRAF_Line_SSE(Src + (Y + 0) * Stride, Buffer, Width);
    ConvertBGR8U2BGRAF_Line_SSE(Src + (Y + 1) * Stride, Buffer + Width * 4, Width);
    GaussBlurLeftToRight_TwoLine_SSE(Buffer, Width, B0, B1, B2, B3);
    GaussBlurRightToLeft_TwoLine_SSE(Buffer, Width, B0, B1, B2, B3);
    ConvertBGRAF2BGR8U_Line_SSE(Buffer, Dest + (Y + 0) * Stride, Width); float to char
    ConvertBGRAF2BGR8U_Line_SSE(Buffer, Dest + (Y + 1) * Stride, Width);
}
for (; Y < Height; Y++)
{
    ConvertBGR8U2BGRAF_Line_SSE(Src + Y * Stride, Buffer, Width);
    GaussBlurLeftToRight_OneLine_SSE(Buffer, Width, B0, B1, B2, B3);
    GaussBlurRightToLeft_OneLine_SSE(Buffer, Width, B0, B1, B2, B3);
    ConvertBGRAF2BGR8U_Line_SSE(Buffer, Dest + Y * Stride, Width);
}

//
ConvertBGR8U2BGRAF_Line_SSE(Dest, Buffer + 3 * Width * 4, Width);
memcpy(Buffer + 0 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float));&nbsp;
memcpy(Buffer + 1 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float));
memcpy(Buffer + 2 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float));

Line0 = Buffer + 0 * Width * 4;&nbsp; &nbsp; Line1 = Buffer + 1 * Width * 4;
Line2 = Buffer + 2 * Width * 4;&nbsp; &nbsp; Line3 = Buffer + 3 * Width * 4;
for (Y = 0; Y < Height; Y++)
{
    ConvertBGR8U2BGRAF_Line_SSE(Dest + Y * Stride, Line3, Width);
    GaussBlurTopToBottom_LowMemory_SSE(Line0, Line1, Line2, Line3, Width, B0, B1, B2, B3);
    ConvertBGRAF2BGR8U_Line_SSE(Line3, Dest + Y * Stride, Width); 
    Temp = Line0; Line0 = Line1;&nbsp; &nbsp; Line1 = Line2; Line2 = Line3 Line3 = Temp; 
}

ConvertBGR8U2BGRAF_Line_SSE(Dest + (Height - 1) * Stride, Buffer + 3 * Width * 4, Width);
memcpy(Buffer + 0 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float));
memcpy(Buffer + 1 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float));
memcpy(Buffer + 2 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float));

Line0 = Buffer + 0 * Width * 4;&nbsp; &nbsp; Line1 = Buffer + 1 * Width * 4;
Line2 = Buffer + 2 * Width * 4;&nbsp; &nbsp; Line3 = Buffer + 3 * Width * 4;
for (Y = Height - 1; Y &gt; 0; Y--)
{
    ConvertBGR8U2BGRAF_Line_SSE(Dest + Y * Stride, Line0, Width);
    GaussBlurBottomToTop_LowMemory_SSE(Line0, Line1, Line2, Line3, Width, B0, B1, B2, B3);
    ConvertBGRAF2BGR8U_Line_SSE(Line0, Dest + Y * Stride, Width);
    Temp = Line3; Line3 = Line2;Line2 = Line1; Line1 = Line0; Line0 = Temp;
}

if 0 way 2

float B0, B1, B2, B3;
float *Buffer = (float *)_mm_malloc(Width * (Height + 6) * sizeof(float) * 4, 16);

GaussBlurFromLeftToRight_SSE(Buffer + 3 * Width * 4, Width, Height, B0, B1, B2, B3);&nbsp; &nbsp; &nbsp;
GaussBlurFromRightToLeft_SSE(Buffer + 3 * Width * 4, Width, Height, B0, B1, B2, B3);&nbsp;

memcpy(Buffer + 0 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float));
memcpy(Buffer + 1 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float));
memcpy(Buffer + 2 * Width * 4, Buffer + 3 * Width * 4, Width * 4 * sizeof(float));

GaussBlurFromTopToBottom_SSE(Buffer, Width, Height, B0, B1, B2, B3);

memcpy(Buffer + (Height + 3) * Width * 4, Buffer + (Height + 2) * Width * 4, Width * 4 * sizeof(float));
memcpy(Buffer + (Height + 4) * Width * 4, Buffer + (Height + 2) * Width * 4, Width * 4 * sizeof(float));
memcpy(Buffer + (Height + 5) * Width * 4, Buffer + (Height + 2) * Width * 4, Width * 4 * sizeof(float));
GaussBlurFromBottomToTop_SSE(Buffer, Width, Height, B0, B1, B2, B3);
ConvertBGRAF2BGR8U_SSE(Buffer + 3 * Width * 4, Dest, Width, Height, Stride);

endif

_mm_free(Buffer);

}

------------------ 原始邮件 ------------------ 发件人: "Ihar Yermalayeu"<notifications@github.com>; 发送时间: 2020年12月17日(星期四) 下午5:14 收件人: "ermig1979/Simd"<Simd@noreply.github.com>; 抄送: "渔舟唱晚 "<304573288@qq.com>; "Author"<author@noreply.github.com>; 主题: Re: [ermig1979/Simd] GaussianBlur (#134)

Current version of Gaussian Blur 3x3 uses integer calculations. I can add GaussianBlur width window 5x5, 7x7 and 9x9. The larger windows will causes 32-bit integer overflow.

— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub, or unsubscribe.

ermig1979 commented 3 years ago

It is interesting. I will watch your code before make my own implementation.

wangjianhong0574 commented 3 years ago

It is interesting. I will watch your code before make my own implementation.

thank you! CalcGauss is from this paper .https://www.researchgate.net/publication/222453003_Recursive_implementation_of_the_Gaussian_filter

ermig1979 commented 3 years ago

I have added GaussianBlur engine to perform Gaussian blur filter with arbitrary blur radius.