ermig1979 / Simd

C++ image processing and machine learning library using SIMD: SSE, AVX, AVX-512, AMX for x86/x64, VMX (Altivec) and VSX (Power7) for PowerPC, NEON for ARM.
http://ermig1979.github.io/Simd
MIT License
2.03k stars 406 forks source link

Unpremultiply result is often wrong when A=B=G=R (with fix!) #194

Closed rickbrew closed 2 years ago

rickbrew commented 2 years ago

When unpremultiplying a value such as #07070707, the result should be #07FFFFFF, but I'm getting #07FEFEFE. There are several other instances of this, but always when A=B=G=R.

Link to code in question: AVX2: https://github.com/ermig1979/Simd/blob/af41bdb76b69f6414aba4f40053393e91278a035/src/Simd/SimdAvx2AlphaBlending.cpp#L354 SSE4.1: https://github.com/ermig1979/Simd/blob/8ac6b68f7c9f91d16b34b13c6c860d8589c0f40a/src/Simd/SimdSse41AlphaBlending.cpp#L277

I found a fix, however. After the multiply, add a constant "fudge factor" of 0.00001f. Also, the floor is not needed if convert-with-truncation is used (_mm256_cvttps_epi32 and _mm_cvttps_epi32). So, the instruction count remains the same (I haven't yet benchmarked it though).

Here is my working C# code, which should be easy enough to port back to C++:

/// <summary>
/// Converts premultiplied BGRA32 pixels to straight (un-premultiplied) BGRA32, in place.
/// </summary>
/// <param name="pExtent">Pointer to the first pixel of the run.</param>
/// <param name="length">Number of pixels in the run.</param>
/// <param name="unscaleLookup">Lookup table consumed by ByteUtil.FastUnscale in the scalar tail.</param>
/// <remarks>
/// Processes 8 pixels per iteration with AVX2, then 4 with SSE4.1, then a scalar loop for the
/// remainder. Un-premultiplying computes channel * 255 / alpha; dividing by exactly 255.0f can
/// produce values like 254.99999 that truncate to 254 when A == B == G == R (e.g. #07070707
/// became #07FEFEFE instead of #07FFFFFF). The divide therefore uses a slightly biased
/// 255.00001f constant, while the clamp still uses exactly 255.0f, and the result is converted
/// with truncation (no Floor and no per-channel epsilon Add needed). Validated against the
/// known-good scalar path; see https://github.com/ermig1979/Simd/issues/194.
/// </remarks>
private static void ConvertPbgra32ToBgra32Extent(ColorPbgra32* pExtent, nint length, ref byte unscaleLookup)
{
    nint count = length;
    byte* p = (byte*)pExtent;

    if (Avx2.IsSupported && count >= 8)
    {
        // Adapted from: https://github.com/ermig1979/Simd/blob/af41bdb76b69f6414aba4f40053393e91278a035/src/Simd/SimdAvx2AlphaBlending.cpp#L354
        // Copyright(c) 2011 - 2021 Yermalayeu Ihar.
        Vector256<float> _255f = Vector256.Create(255.0f);       // exact clamp limit
        Vector256<float> _scale = Vector256.Create(255.00001f);  // biased numerator; pushes 254.99999-style results past 255 before truncation

        // Each shuffle gathers one channel's byte into the low byte of every 32-bit lane
        // (0xFF selector bytes zero the remaining bytes).
        Vector256<byte> K8_SHUFFLE_BGRA_TO_B = Vector256.Create(
            0xFFFFFF00, 0xFFFFFF04, 0xFFFFFF08, 0xFFFFFF0C,
            0xFFFFFF00, 0xFFFFFF04, 0xFFFFFF08, 0xFFFFFF0C).AsByte();

        Vector256<byte> K8_SHUFFLE_BGRA_TO_G = Vector256.Create(
            0xFFFFFF01, 0xFFFFFF05, 0xFFFFFF09, 0xFFFFFF0D,
            0xFFFFFF01, 0xFFFFFF05, 0xFFFFFF09, 0xFFFFFF0D).AsByte();

        Vector256<byte> K8_SHUFFLE_BGRA_TO_R = Vector256.Create(
            0xFFFFFF02, 0xFFFFFF06, 0xFFFFFF0A, 0xFFFFFF0E,
            0xFFFFFF02, 0xFFFFFF06, 0xFFFFFF0A, 0xFFFFFF0E).AsByte();

        Vector256<byte> K8_SHUFFLE_BGRA_TO_A = Vector256.Create(
            0xFFFFFF03, 0xFFFFFF07, 0xFFFFFF0B, 0xFFFFFF0F,
            0xFFFFFF03, 0xFFFFFF07, 0xFFFFFF0B, 0xFFFFFF0F).AsByte();

        do
        {
            Vector256<byte> pbgra = Avx.LoadVector256(p);

            Vector256<int> bi = Avx2.Shuffle(pbgra, K8_SHUFFLE_BGRA_TO_B).AsInt32();
            Vector256<int> gi = Avx2.Shuffle(pbgra, K8_SHUFFLE_BGRA_TO_G).AsInt32();
            Vector256<int> ri = Avx2.Shuffle(pbgra, K8_SHUFFLE_BGRA_TO_R).AsInt32();
            Vector256<int> ai = Avx2.Shuffle(pbgra, K8_SHUFFLE_BGRA_TO_A).AsInt32();
            Vector256<float> af = Avx.ConvertToVector256Single(ai);

            // k = 255.00001 / a, except k = 0 when a == 0 (blend selects af, which is 0 there).
            Vector256<float> k0 = Avx.Compare(af, Vector256<float>.Zero, FloatComparisonMode.OrderedEqualNonSignaling);
            Vector256<float> k1 = Avx.Divide(_scale, af);
            Vector256<float> k = Avx.BlendVariable(k1, af, k0);

            // channel * k, clamped to exactly 255, then converted with truncation.
            Vector256<float> bf = Avx.ConvertToVector256Single(bi);
            Vector256<int> b = Avx.ConvertToVector256Int32WithTruncation(Avx.Min(Avx.Multiply(bf, k), _255f));

            Vector256<float> gf = Avx.ConvertToVector256Single(gi);
            Vector256<int> g = Avx.ConvertToVector256Int32WithTruncation(Avx.Min(Avx.Multiply(gf, k), _255f));

            Vector256<float> rf = Avx.ConvertToVector256Single(ri);
            Vector256<int> r = Avx.ConvertToVector256Int32WithTruncation(Avx.Min(Avx.Multiply(rf, k), _255f));

            // Re-interleave: each channel sits in the low byte of its 32-bit lane, so byte-lane
            // shifts of 1/2/3 position G/R/A and OR merges them with B.
            Vector256<int> bgra0 = Avx2.Or(b, Avx2.ShiftLeftLogical128BitLane(g, 1));
            Vector256<int> bgra1 = Avx2.Or(bgra0, Avx2.ShiftLeftLogical128BitLane(r, 2));
            Vector256<int> bgra = Avx2.Or(bgra1, Avx2.ShiftLeftLogical128BitLane(ai, 3));

            Avx.Store((int*)p, bgra);

            p += 32;
            count -= 8;

        } while (count >= 8);
    }

    if (Sse41.IsSupported && count >= 4)
    {
        // Adapted from: https://github.com/ermig1979/Simd/blob/8ac6b68f7c9f91d16b34b13c6c860d8589c0f40a/src/Simd/SimdSse41AlphaBlending.cpp#L277
        // Copyright(c) 2011 - 2021 Yermalayeu Ihar.
        Vector128<float> _255f = Vector128.Create(255.0f);       // exact clamp limit
        Vector128<float> _scale = Vector128.Create(255.00001f);  // biased numerator (see AVX2 path above)

        Vector128<byte> K8_SHUFFLE_BGRA_TO_B = Vector128.Create(0xFFFFFF00, 0xFFFFFF04, 0xFFFFFF08, 0xFFFFFF0C).AsByte();
        Vector128<byte> K8_SHUFFLE_BGRA_TO_G = Vector128.Create(0xFFFFFF01, 0xFFFFFF05, 0xFFFFFF09, 0xFFFFFF0D).AsByte();
        Vector128<byte> K8_SHUFFLE_BGRA_TO_R = Vector128.Create(0xFFFFFF02, 0xFFFFFF06, 0xFFFFFF0A, 0xFFFFFF0E).AsByte();
        Vector128<byte> K8_SHUFFLE_BGRA_TO_A = Vector128.Create(0xFFFFFF03, 0xFFFFFF07, 0xFFFFFF0B, 0xFFFFFF0F).AsByte();

        do
        {
            Vector128<byte> pbgra = Sse2.LoadVector128(p);

            Vector128<int> bi = Ssse3.Shuffle(pbgra, K8_SHUFFLE_BGRA_TO_B).AsInt32();
            Vector128<int> gi = Ssse3.Shuffle(pbgra, K8_SHUFFLE_BGRA_TO_G).AsInt32();
            Vector128<int> ri = Ssse3.Shuffle(pbgra, K8_SHUFFLE_BGRA_TO_R).AsInt32();
            Vector128<int> ai = Ssse3.Shuffle(pbgra, K8_SHUFFLE_BGRA_TO_A).AsInt32();
            Vector128<float> af = Sse2.ConvertToVector128Single(ai);

            // k = 255.00001 / a, except k = 0 when a == 0.
            Vector128<float> k0 = Sse.CompareEqual(af, Vector128<float>.Zero);
            Vector128<float> k1 = Sse.Divide(_scale, af);
            Vector128<float> k = Sse41.BlendVariable(k1, af, k0);

            Vector128<float> bf = Sse2.ConvertToVector128Single(bi);
            Vector128<int> b = Sse2.ConvertToVector128Int32WithTruncation(Sse.Min(Sse.Multiply(bf, k), _255f));

            Vector128<float> gf = Sse2.ConvertToVector128Single(gi);
            Vector128<int> g = Sse2.ConvertToVector128Int32WithTruncation(Sse.Min(Sse.Multiply(gf, k), _255f));

            Vector128<float> rf = Sse2.ConvertToVector128Single(ri);
            Vector128<int> r = Sse2.ConvertToVector128Int32WithTruncation(Sse.Min(Sse.Multiply(rf, k), _255f));

            Vector128<int> bgra0 = Sse2.Or(b, Sse2.ShiftLeftLogical128BitLane(g, 1));
            Vector128<int> bgra1 = Sse2.Or(bgra0, Sse2.ShiftLeftLogical128BitLane(r, 2));
            Vector128<int> bgra = Sse2.Or(bgra1, Sse2.ShiftLeftLogical128BitLane(ai, 3));

            Sse2.Store((int*)p, bgra);

            p += 16;
            count -= 4;
        } while (count >= 4);
    }

    // Scalar tail: un-premultiply the final 0-3 pixels via the lookup table.
    while (count > 0)
    {
        byte a = p[3];
        byte b = ByteUtil.FastUnscale(ref unscaleLookup, p[0], a);
        byte g = ByteUtil.FastUnscale(ref unscaleLookup, p[1], a);
        byte r = ByteUtil.FastUnscale(ref unscaleLookup, p[2], a);

        *(uint*)p = ((uint)a << 24) | ((uint)r << 16) | ((uint)g << 8) | (uint)b;

        p += sizeof(ColorPbgra32);
        --count;
    }
}
ermig1979 commented 2 years ago

Hello! Thank you for bug report and for suggested solution!

To fix this bug is enough to change coefficient from 255.0f to 255.00001f (see the last commit).

Anyway, it would be great if you could perform independent checking of this fix.

rickbrew commented 2 years ago

Okay I just tried it out: 1) changed coefficient from 255.0f to 255.00001f, 2) removed the Add(_epsFudge) (while also still excluding the Floor() call). It runs faster (~11%) and passes verification against known-good scalar path. Great!

rickbrew commented 2 years ago

There is one more change I made. The call to Min() should still use 255.0f, not 255.00001f, so two constants are needed. It did pass validation either way, so this may not actually be necessary in practice, and it did not affect my benchmark results.

rickbrew commented 2 years ago

Okay I think we can close this now :)

One last thing I'll add is that @saucecontrol came up with an even faster implementation of UnPremultiply, which I'll include here. It's in C# but should be straightforward to port back to C++ since there's a 1:1 mapping for all the intrinsics. Benchmarking shows this to be about 12% faster for AVX2, 18% faster for SSE4.1. It manages to use floating point only once per pixel group, for the division, and after that it's all integer. It also passes validation against a known-good scalar implementation.

// btw, nint is basically just ssize_t
// btw, nint is basically just ssize_t
/// <summary>
/// Converts premultiplied BGRA32 pixels to straight (un-premultiplied) BGRA32, in place.
/// Integer fixed-point variant: only one floating-point operation (the reciprocal divide)
/// per pixel group; all channel scaling is done with 32-bit integer multiplies.
/// </summary>
/// <param name="pExtent">Pointer to the first pixel of the run.</param>
/// <param name="length">Number of pixels in the run.</param>
public static void ConvertPbgra32ToBgra32Extent(uint* pExtent, nint length)
{
    nint count = length;
    byte* p = (byte*)pExtent;

    if (Avx2.IsSupported && count >= Vector256<uint>.Count)
    {
        // Credit for this goes to Clinton Ingram, https://github.com/saucecontrol
        var vmask0 = Vector256.Create(0x000000ff).AsByte();                  // isolates B in the low byte of each lane
        var vmask1 = Vector256.Create(0x80808001, 0x80808005, 0x80808009, 0x8080800d, 0x80808001, 0x80808005, 0x80808009, 0x8080800d).AsByte(); // G
        var vmask2 = Vector256.Create(0x80808002, 0x80808006, 0x8080800a, 0x8080800e, 0x80808002, 0x80808006, 0x8080800a, 0x8080800e).AsByte(); // R
        var vmask3 = Vector256.Create(0x80808003, 0x80808007, 0x8080800b, 0x8080800f, 0x80808003, 0x80808007, 0x8080800b, 0x8080800f).AsByte(); // A
        var vmaskf = Vector256.Create(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15).AsByte(); // final re-interleave
        var vscale = Vector256.Create((float)0xff00ff);                      // 255 << 16 | 255: fixed-point 255/alpha reciprocal

        do
        {
            var vi = Avx.LoadVector256(p);
            var vi3 = Avx2.Shuffle(vi, vmask3).AsInt32();
            // vim = trunc(0xff00ff / alpha): a 16.16-style fixed-point multiplier.
            var vim = Avx.ConvertToVector256Int32WithTruncation(Avx.Divide(vscale, Avx.ConvertToVector256Single(vi3)));
            // Zero the multiplier where alpha == 0. AndNot (vpandn) replaces the earlier
            // BlendVariable-with-zero: same latency, smaller encoding, and avoids RyuJIT
            // emitting two xorps for the zero-vector constants.
            vim = Avx2.AndNot(Avx2.CompareEqual(vi3, Vector256<int>.Zero), vim);

            var vi2 = Avx2.Shuffle(vi, vmask2).AsInt32();
            var vi1 = Avx2.Shuffle(vi, vmask1).AsInt32();
            var vi0 = Avx2.And(vi, vmask0).AsInt32();

            // channel * (0xff00ff / alpha) >> 16 == channel * 255 / alpha.
            vi0 = Avx2.MultiplyLow(vi0, vim);
            vi1 = Avx2.MultiplyLow(vi1, vim);
            vi2 = Avx2.MultiplyLow(vi2, vim);

            vi0 = Avx2.ShiftRightLogical(vi0, 16);
            vi1 = Avx2.ShiftRightLogical(vi1, 16);
            vi2 = Avx2.ShiftRightLogical(vi2, 16);

            // Narrow the four int planes (B, G, R, original A) back down to bytes.
            var vil = Avx2.PackSignedSaturate(vi0, vi1);
            var vih = Avx2.PackSignedSaturate(vi2, vi3);
            vi = Avx2.PackUnsignedSaturate(vil, vih);

            Avx.Store(p, Avx2.Shuffle(vi, vmaskf));

            p += Vector256<byte>.Count;
            count -= Vector256<uint>.Count;
        } while (count >= Vector256<uint>.Count);
    }

    if (Sse41.IsSupported && count >= 4)
    {
        // Credit for this goes to Clinton Ingram, https://github.com/saucecontrol
        var vmask0 = Vector128.Create(0x000000ff).AsByte();
        var vmask1 = Vector128.Create(0x80808001, 0x80808005, 0x80808009, 0x8080800d).AsByte();
        var vmask2 = Vector128.Create(0x80808002, 0x80808006, 0x8080800a, 0x8080800e).AsByte();
        var vmask3 = Vector128.Create(0x80808003, 0x80808007, 0x8080800b, 0x8080800f).AsByte();
        var vmaskf = Vector128.Create(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15).AsByte();
        var vscale = Vector128.Create((float)0xff00ff);

        do
        {
            var vi = Sse2.LoadVector128(p);
            var vi3 = Ssse3.Shuffle(vi, vmask3).AsInt32();
            var vim = Sse2.ConvertToVector128Int32WithTruncation(Sse.Divide(vscale, Sse2.ConvertToVector128Single(vi3)));
            // pandn in place of blendvb — see the AVX2 path above.
            vim = Sse2.AndNot(Sse2.CompareEqual(vi3, Vector128<int>.Zero), vim);

            var vi2 = Ssse3.Shuffle(vi, vmask2).AsInt32();
            var vi1 = Ssse3.Shuffle(vi, vmask1).AsInt32();
            var vi0 = Sse2.And(vi, vmask0).AsInt32();

            vi0 = Sse41.MultiplyLow(vi0, vim);
            vi1 = Sse41.MultiplyLow(vi1, vim);
            vi2 = Sse41.MultiplyLow(vi2, vim);

            vi0 = Sse2.ShiftRightLogical(vi0, 16);
            vi1 = Sse2.ShiftRightLogical(vi1, 16);
            vi2 = Sse2.ShiftRightLogical(vi2, 16);

            var vil = Sse2.PackSignedSaturate(vi0, vi1);
            var vih = Sse2.PackSignedSaturate(vi2, vi3);
            vi = Sse2.PackUnsignedSaturate(vil, vih);

            Sse2.Store(p, Ssse3.Shuffle(vi, vmaskf));

            p += Vector128<byte>.Count;
            count -= Vector128<uint>.Count;
        } while (count >= 4);
    }

    while (count > 0) { ... scalar implementation for the last few pixels ... }
}
saucecontrol commented 2 years ago

@rickbrew looks like you missed the last update to my gist, which had one extra micro-optimization 😄

https://gist.github.com/saucecontrol/77efe86bc8ab96d70b9045e71db7447e#file-unpremultiplybench-cs-L200

That's just swapping a blendvb for a pandn, which is same latency but smaller encoding (moreso on .NET since RyuJit emits 2 xorpss for the two zero vector constants in the blend version).

Disassembly for the inner loop:

 vmovdqu     ymm6,ymmword ptr [rcx]  
 vpshufb     ymm7,ymm6,ymm3  
 vcvtdq2ps   ymm8,ymm7  
 vdivps      ymm8,ymm5,ymm8  
 vcvttps2dq  ymm8,ymm8  
 vxorps      ymm9,ymm9,ymm9  
 vpcmpeqd    ymm9,ymm7,ymm9  
 vpandn      ymm8,ymm9,ymm8  
 vpshufb     ymm9,ymm6,ymm2  
 vpshufb     ymm10,ymm6,ymm1  
 vpand       ymm6,ymm6,ymm0  
 vpmulld     ymm6,ymm6,ymm8  
 vpmulld     ymm10,ymm10,ymm8  
 vpmulld     ymm9,ymm9,ymm8  
 vpsrld      ymm6,ymm6,10h  
 vpsrld      ymm10,ymm10,10h  
 vpsrld      ymm9,ymm9,10h  
 vpackssdw   ymm6,ymm6,ymm10  
 vpackssdw   ymm7,ymm9,ymm7  
 vpackuswb   ymm6,ymm6,ymm7  
 vpshufb     ymm6,ymm6,ymm4  
 vmovdqu     ymmword ptr [rcx],ymm6  
rickbrew commented 2 years ago

@saucecontrol oh nice! I just confirmed correctness vs. known-good scalar code, and it's just a smidge faster than before :)