JuliaSIMD / LoopVectorization.jl

Macro(s) for vectorizing loops.
MIT License
731 stars 65 forks source link

[bug] incorrect result #138

Open kose-y opened 3 years ago

kose-y commented 3 years ago
function _snparray_ax_additive!(out, s::Matrix{UInt8}, v)
    fill!(out, zero(UInt8))
    k = size(s, 1)
    @avx for j ∈ eachindex(v)
        for l in 1:k
            block = s[(j-1)*size(s, 1) + (l-1) + 1]

            # i = 4 * (l-1) + 1
            Aij = block & 3
            i = 4 * (l-1) + 1
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 2
            Aij = (block >> 2) & 3
            i = 4 * (l-1) + 2
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 3
            Aij = (block >> 4) & 3
            i = 4 * (l-1) + 3
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 4
            Aij = (block >> 6) & 3
            i = 4 * (l-1) + 4
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

        end
    end
    out
end

The result is different from the code without @avx.

chriselrod commented 3 years ago

Hi, mind giving code I can copy and paste to run the example (and perhaps eventually add to the tests)?

kose-y commented 3 years ago
using LoopVectorization
function _snparray_ax_additive!(out, s::Matrix{UInt8}, v)
    fill!(out, zero(UInt8))
    k = size(s, 1)
    @avx for j ∈ eachindex(v)
        for l in 1:k
            block = s[(j-1)*size(s, 1) + (l-1) + 1]

            # i = 4 * (l-1) + 1
            Aij = block & 3
            i = 4 * (l-1) + 1
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 2
            Aij = (block >> 2) & 3
            i = 4 * (l-1) + 2
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 3
            Aij = (block >> 4) & 3
            i = 4 * (l-1) + 3
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 4
            Aij = (block >> 6) & 3
            i = 4 * (l-1) + 4
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

        end
    end
    out
end

function _snparray_ax_additive_noavx!(out, s::Matrix{UInt8}, v)
    fill!(out, zero(UInt8))
    k = size(s, 1)
    for j ∈ eachindex(v)
        for l in 1:k
            block = s[(j-1)*size(s, 1) + (l-1) + 1]

            # i = 4 * (l-1) + 1
            Aij = block & 3
            i = 4 * (l-1) + 1
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 2
            Aij = (block >> 2) & 3
            i = 4 * (l-1) + 2
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 3
            Aij = (block >> 4) & 3
            i = 4 * (l-1) + 3
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 4
            Aij = (block >> 6) & 3
            i = 4 * (l-1) + 4
            out[i] += ((Aij >= 2) + (Aij >= 3)) * v[j]

        end
    end
    out
end

using Random, Test
a = rand(UInt8, 512, 512)
v1 = zeros(512 << 2)
v1_ = zeros(512 << 2)

v2 = rand(Float64, 512)
_snparray_ax_additive!(v1, a, v2)
_snparray_ax_additive_noavx!(v1_, a, v2)

@test isapprox(v1, v1_)
kose-y commented 3 years ago

It works correctly without the intermediate variable i:

function _snparray_ax_additive!(out, s::Matrix{UInt8}, v)
    fill!(out, zero(UInt8))
    k = size(s, 1)
    @avx for j ∈ eachindex(v)
        for l in 1:k
            block = s[(j-1)*size(s, 1) + (l-1) + 1]

            # i = 4 * (l-1) + 1
            Aij = block & 3
            out[4*(l-1)+1] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 2
            Aij = (block >> 2) & 3
            i = 4 * (l-1) + 2
            out[4*(l-1)+2] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 3
            Aij = (block >> 4) & 3
            out[4*(l-1)+3] += ((Aij >= 2) + (Aij >= 3)) * v[j]

            # i = 4 * (l-1) + 4
            Aij = (block >> 6) & 3
            out[4*l] += ((Aij >= 2) + (Aij >= 3)) * v[j]

        end
    end
    out
end
chriselrod commented 3 years ago

The problem is that it doesn't un-alias the names. That is, creating two distinct variables with the same name (i or Aij) is likely to cause problems. It's more likely to cause problems for variables used for indexing (like i), because indexes are going to be reordered more heavily. This passes the tests:

function _snparray_ax_additive!(out, s::Matrix{UInt8}, v)
    fill!(out, zero(UInt8))
    k = size(s, 1)
    @avx for j ∈ eachindex(v)
        for l in 1:k
            block = s[(j-1)*size(s, 1) + (l-1) + 1]

            # i = 4 * (l-1) + 1
            Aij_1 = block & 3
            i_1 = 4 * (l-1) + 1
            out[i_1] += ((Aij_1 >= 2) + (Aij_1 >= 3)) * v[j]

            # i = 4 * (l-1) + 2
            Aij_2 = (block >> 2) & 3
            i_2 = 4 * (l-1) + 2
            out[i_2] += ((Aij_2 >= 2) + (Aij_2 >= 3)) * v[j]

            # i = 4 * (l-1) + 3
            Aij_3 = (block >> 4) & 3
            i_3 = 4 * (l-1) + 3
            out[i_3] += ((Aij_3 >= 2) + (Aij_3 >= 3)) * v[j]

            # i = 4 * (l-1) + 4
            Aij_4 = (block >> 6) & 3
            i_4 = 4 * (l-1) + 4
            out[i_4] += ((Aij_4 >= 2) + (Aij_4 >= 3)) * v[j]

        end
    end
    out
end

I'll add something to change the names of the repeated variables.

However, I'd also like to point out that LoopVectorization only SIMDs across iterations of the loops. That means that writing:

out[4*(l-1)+1] += ...
out[4*(l-1)+2] += ...
out[4*(l-1)+3] += ...
out[4*l] += ...

Will force it to load elements like:

out[4*((l .+ 0:7)-1)+1] .+= ...
out[4*((l .+ 0:7)-1)+2] .+= ...
out[4*((l .+ 0:7)-1)+3] .+= ...
out[4*(l .+ 0:7)] .+= ...

This requires using gather and scatter intrinsics. These are slow, and if you don't have AVX512, scatter has to be emulated with repeated extractions and scalar stores.

It would be faster if you could define

v1__ = copy(reshape(v1_, (4, length(v1_) >> 2))'); # make it a matrix

function _snparray_ax_additive_contig!(out, s::Matrix{UInt8}, v)
    fill!(out, zero(UInt8))
    k = size(s, 1)
    # manually set unroll, seems to choose something a little faster
    @avx unroll=(2,2) for j ∈ eachindex(v)
        for l in 1:k
            block = s[l,j]

            Aij_1 = block & 3
            out[l,1] += ((Aij_1 >= 2) + (Aij_1 >= 3)) * v[j]

            Aij_2 = (block >> 2) & 3
            out[l,2] += ((Aij_2 >= 2) + (Aij_2 >= 3)) * v[j]

            Aij_3 = (block >> 4) & 3
            out[l,3] += ((Aij_3 >= 2) + (Aij_3 >= 3)) * v[j]

            Aij_4 = (block >> 6) & 3
            out[l,4] += ((Aij_4 >= 2) + (Aij_4 >= 3)) * v[j]

    end
    out
end
_snparray_ax_additive_contig!(v1__, a, v2);

@test isapprox(vec(v1__'), v1_)

This doesn't actually make much difference for me, because it swaps the order of the loops to make l the outer loop, so that it only has to gather/scatter once per for j in eachindex.

kose-y commented 3 years ago

Thanks for the information!