ATISLabs / TinyML.jl

A set of ML algorithms focused on low-end hardware, built on bit (binary) neural networks.

ccall and bittensor performance #7

[Open] nickolasrm opened this issue 3 years ago

nickolasrm commented 3 years ago

Performance analysis: try the methods below and report which one performs best.

If ccall turns out to be the fastest, find a multiplatform solution for compiling it.

nickolasrm commented 3 years ago

Flux implements the Dense output with a matrix (dot) operation. A benchmark also showed that BitDense achieves only a 10.7x speedup over Flux's Dense in a scenario where about a 32x speedup was expected. Investigating this, I noticed in the system monitor that Julia uses multiple cores for the Dense operation even when JULIA_NUM_THREADS is set to 1 (BLAS threading is configured separately from Julia's threads), so the performance difference may come from multicore optimization.

This thread focuses on finding a better-performing alternative, since BitDense currently has no optimizations other than the gcc -O3 flag.
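A quick way to test that hypothesis (a sketch of mine, not code from the repo; layer sizes taken from the benchmarks below): pin BLAS to one thread and re-run the Flux baseline. If the gap closes, the difference was BLAS multithreading.

using Flux, LinearAlgebra, BenchmarkTools

BLAS.set_num_threads(1)   # BLAS thread count is independent of JULIA_NUM_THREADS
d = Dense(640, 64)        # same layer shape as the tests below
x = rand(Float32, 640)
@benchmark $d($x)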

nickolasrm commented 3 years ago

First try: a Julia-side multicore ccall. It splits the work by dividing the number of neurons by the number of threads and then makes a separate ccall for each part.

Benchmark:

BenchmarkTools.Trial: 
  memory estimate:  1.68 KiB
  allocs estimate:  12
  --------------
  minimum time:     3.671 μs (0.00% GC)
  median time:      4.888 μs (0.00% GC)
  mean time:        5.330 μs (1.80% GC)
  maximum time:     495.731 μs (97.55% GC)
  --------------
  samples:          10000
  evals/sample:     8

Julia-side

function bitDenseBitExecute(l::BitDense, x::BitArray{1})
    out = BitArray{1}(undef, size(l.W, 1))

    # One task per thread; each computes a contiguous slice of the neurons.
    # The loop variable is unused: the slice is derived from threadid().
    Threads.@threads for i in 1:Threads.nthreads()
        rows = size(l.W.chunks, 1)
        loopStart = round(Int, (Threads.threadid() - 1) * rows /
            Threads.nthreads())
        loopEnd = round(Int, Threads.threadid() * rows /
            Threads.nthreads())
        ccall((:binaryBitOutDotProduct, binarynetlib), Cvoid,
            (Ptr{UInt64}, Ptr{UInt64}, Ptr{UInt64}, Ptr{UInt64}, Cint, Cint, Cint, Cint),
            x.chunks, l.W.chunks, l.b.chunks, out.chunks, loopStart, loopEnd,
            size(l.W, 1), size(l.W, 2))
    end

    return out
end
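Spelling out the split arithmetic with concrete numbers (my own illustration): with 64 rows and 4 threads, each thread gets a contiguous quarter of the work.

# Hypothetical walk-through of the partition formula used above:
rows, nthreads = 64, 4
for t in 1:nthreads
    lo = round(Int, (t - 1) * rows / nthreads)   # 0, 16, 32, 48
    hi = round(Int, t * rows / nthreads)         # 16, 32, 48, 64
    println("thread $t -> half-open range [$lo, $hi)")
end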

C-side

void binaryBitOutDotProduct(uint64_t* in, uint64_t* weights, uint64_t* bias,
    uint64_t* out, int neuronStart, int neuronEnd, int neuronNumber, int wNumber)
{
    /* Number of 64-bit chunks per neuron's weight row. */
    int wNumberChunks = ceil(wNumber / 64.0);

    if(wNumberChunks > 1)
    {
        int nLim = neuronEnd - neuronStart;
        int tempOut[nLim];
        int iLim = wNumberChunks - 1;

        /* Weights are stored chunk-major: chunk c of neuron n lives at
           weights[c * neuronNumber + n], so only the weight pointer is
           advanced; bias and out stay bit-indexed by absolute neuron id.
           Note setBit is a read-modify-write on shared words, so thread
           partitions should align to 64-neuron boundaries to avoid races. */
        weights += neuronStart;

        uint64_t firstIn = in[0];
        for(int i = 0; i < nLim; i++)
            tempOut[i] = __builtin_popcountl(firstIn ^ weights[i]);
        weights += neuronNumber;

        for(int i = 1; i < iLim; i++)
        {
            for(int j = 0; j < nLim; j++)
                tempOut[j] += __builtin_popcountl(in[i] ^ weights[j]);
            weights += neuronNumber;
        }

        uint64_t lastIn = in[iLim];
        for(int i = 0; i < nLim; i++)
        {
            int b = getBit(bias, neuronStart + i);
            tempOut[i] += __builtin_popcountl(lastIn ^ weights[i]);
            setBit(out, neuronStart + i,
                ((tempOut[i] + b) > ((wNumber + !b) - tempOut[i])));
        }
    }
    else
    {
        for(int i = neuronStart; i < neuronEnd; i++)
        {
            int b = getBit(bias, i);
            int temp = __builtin_popcountl(in[0] ^ weights[i]);
            setBit(out, i, ((temp + b) > ((wNumber + !b) - temp)));
        }
    }
}
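For reference, the ccall above loads binarynetlib at runtime, so the C side has to be compiled as a shared library first. A hypothetical build sketch (file and library names are my assumption):

# -fPIC is required for shared objects; -O3 is the flag mentioned earlier.
const binarynetlib = joinpath(@__DIR__, "libbinarynet.so")
run(`gcc -O3 -fPIC -shared binarynet.c -o $binarynetlib`)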
nickolasrm commented 3 years ago

All tests were performed using Dense layers with 640 input values and 64 neurons. Flux benchmark:

BenchmarkTools.Trial: 
  memory estimate:  3.28 KiB
  allocs estimate:  3
  --------------
  minimum time:     5.440 μs (0.00% GC)
  median time:      8.124 μs (0.00% GC)
  mean time:        9.134 μs (1.97% GC)
  maximum time:     577.807 μs (95.47% GC)
  --------------
  samples:          10000
  evals/sample:     7
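For reproducibility, the harness implied by these numbers looks roughly like this (the BitDense constructor signature is an assumption on my part):

using BenchmarkTools, Flux

d = Dense(640, 64)                # Flux baseline
x = rand(Float32, 640)
@benchmark $d($x)

l = BitDense(640, 64)             # assumed TinyML.jl constructor
xb = BitArray(rand(Bool, 640))
@benchmark bitDenseBitExecute($l, $xb)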
nickolasrm commented 3 years ago

Second try: broadcast (dot) operations. Benchmark:

BenchmarkTools.Trial: 
  memory estimate:  6.06 KiB
  allocs estimate:  8
  --------------
  minimum time:     1.313 μs (0.00% GC)
  median time:      1.524 μs (0.00% GC)
  mean time:        1.869 μs (10.91% GC)
  maximum time:     176.946 μs (95.32% GC)
  --------------
  samples:          10000
  evals/sample:     10

Changes

@inline _bitOperation(w::Int, b::Bool, nWeights::Int) =
    (w + b) > (!b + nWeights - w)

function bitDenseBitExecute(l::BitDense, x::BitArray{1})
    # XOR every weight chunk against the matching input chunk, popcount the
    # mismatches, sum over the chunks of each neuron, then apply the
    # threshold test above, all as fused broadcasts.
    return BitArray(
            _bitOperation.(
                reshape(
                    sum(
                        count_ones.(
                            (transpose(l.W.chunks) .⊻ x.chunks)
                        )
                    , dims=1)
                , :)
            , l.b, length(x))
        )
end
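To make the _bitOperation threshold concrete (my own illustration): w is the XOR popcount, i.e. the number of mismatching bits, so the comparison is a majority test over the nWeights bits, with the bias bit breaking ties.

# With exactly half of the 640 bits mismatching, the bias decides the output:
w, nWeights = 320, 640
(w + true)  > (!true  + nWeights - w)   # b = 1: 321 > 320 -> true
(w + false) > (!false + nWeights - w)   # b = 0: 320 > 321 -> false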
nickolasrm commented 3 years ago

Regular BitDense benchmark:

BenchmarkTools.Trial: 
  memory estimate:  128 bytes
  allocs estimate:  2
  --------------
  minimum time:     863.136 ns (0.00% GC)
  median time:      1.010 μs (0.00% GC)
  mean time:        1.036 μs (0.98% GC)
  maximum time:     102.533 μs (98.70% GC)
  --------------
  samples:          10000
  evals/sample:     59
nickolasrm commented 3 years ago

Third try: OpenMP parallelization. Benchmark (4 threads):

BenchmarkTools.Trial: 
  memory estimate:  128 bytes
  allocs estimate:  2
  --------------
  minimum time:     1.524 μs (0.00% GC)
  median time:      1.704 μs (0.00% GC)
  mean time:        1.768 μs (0.00% GC)
  maximum time:     8.275 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     10

Julia-side

function bitDenseBitExecute(l::BitDense, x::BitArray{1})
    out = BitArray{1}(undef, size(l.W, 1))

    ccall((:binaryBitOutDotProduct, binarynetlib), Cvoid, 
        (Ptr{UInt64}, Ptr{UInt64}, Ptr{UInt64}, Ptr{UInt64}, Cint, Cint, Cint), 
        x.chunks, l.W.chunks, l.b.chunks, out.chunks, 
        size(l.W, 1), size(l.W, 2), Threads.nthreads())

    return out
end

C-side

void binaryBitOutDotProduct(uint64_t* in, uint64_t* weights, uint64_t* bias,
    uint64_t* out, int neuronNumber, int wNumber, int nthreads)
{
    omp_set_num_threads(nthreads);
    int wNumberChunks = ceil(wNumber / 64.0);

    if(wNumberChunks > 1)
    {
        int tempOut[neuronNumber];
        int iLim = wNumberChunks - 1;

        uint64_t *baseWeights = weights;

        uint64_t firstIn = in[0];
        for(int i = 0; i < neuronNumber; i++)
            tempOut[i] = __builtin_popcountl(firstIn ^ weights[i]);

        /* Accumulate the middle chunks in parallel. A reduction over the
           whole array is required here: a firstprivate copy would be
           discarded at the end of the parallel region, losing the sums. */
        #pragma omp parallel for reduction(+:tempOut[:neuronNumber])
        for(int i = 1; i < iLim; i++)
        {
            uint64_t *w = baseWeights + (size_t)i * neuronNumber;
            for(int j = 0; j < neuronNumber; j++)
                tempOut[j] += __builtin_popcountl(in[i] ^ w[j]);
        }
        weights = iLim * neuronNumber + baseWeights;

        uint64_t lastIn = in[iLim];
        for(int i = 0; i < neuronNumber; i++)
        {
            int b = getBit(bias, i);
            tempOut[i] += __builtin_popcountl(lastIn ^ weights[i]);
            setBit(out, i, ((tempOut[i] + b) > ((wNumber + !b) - tempOut[i])));
        }
    }
    else
    {
        for(int i = 0; i < neuronNumber; i++)
        {
            int b = getBit(bias, i);
            int temp = __builtin_popcountl(in[0] ^ weights[i]);
            setBit(out, i, ((temp + b) > ((wNumber + !b) - temp)));
        }
    }
}
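Note that this variant only works if the library is built with OpenMP enabled; extending the earlier hypothetical build sketch:

# -fopenmp enables the pragmas and links libgomp
run(`gcc -O3 -fPIC -shared -fopenmp binarynet.c -o libbinarynet.so`)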
nickolasrm commented 3 years ago

Fourth try: a plain Julia for loop. Benchmark:

BenchmarkTools.Trial: 
  memory estimate:  128 bytes
  allocs estimate:  2
  --------------
  minimum time:     684.178 ns (0.00% GC)
  median time:      830.253 ns (0.00% GC)
  mean time:        855.372 ns (1.65% GC)
  maximum time:     37.776 μs (97.62% GC)
  --------------
  samples:          10000
  evals/sample:     146

Changes:

function bitDenseBitExecute(l::BitDense, x::BitArray{1})
    W, b = l.W.chunks, l.b

    ilim, jlim, wcnt = size(W, 1), size(W, 2), length(x)
    out = BitArray{1}(undef, ilim)

    Wx = x.chunks

    # Accumulate the XOR popcount chunk by chunk for each neuron, then
    # apply the same majority test as before.
    for i in 1:ilim
        temp = 0
        for j in 1:jlim
            temp += count_ones(W[i, j] ⊻ Wx[j])
        end
        @inbounds out[i] = (temp + b[i]) > (!b[i] + wcnt - temp)
    end

    return out
end
nickolasrm commented 3 years ago

Fifth try: a Julia for loop over a transposed chunk matrix. Benchmark:

BenchmarkTools.Trial: 
  memory estimate:  192 bytes
  allocs estimate:  2
  --------------
  minimum time:     610.212 ns (0.00% GC)
  median time:      646.159 ns (0.00% GC)
  mean time:        665.056 ns (1.07% GC)
  maximum time:     12.925 μs (94.52% GC)
  --------------
  samples:          10000
  evals/sample:     179

Changes: BitDense.jl

function bitDenseBitExecute(l::BitDense, x::BitArray{1})
    Wc, b, Xc = l.W.chunks, l.b, x.chunks
    neuronNumber, weightsNumber = size(Wc, 2), size(l.W, 2)
    wChunksNumber = ceil(Int, weightsNumber / 64.0)

    out = BitArray{1}(undef, neuronNumber)

    # Chunks are stored (chunk, neuron), so the inner loop walks a column
    # of Wc, which is contiguous in Julia's column-major layout.
    for i in 1:neuronNumber
        temp = 0
        for j in 1:wChunksNumber
            temp += count_ones(Wc[j,i] ⊻ Xc[j])
        end
        @inbounds out[i] = (temp + b[i]) > (!b[i] + weightsNumber - temp)
    end

    return out
end
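The gain over the fourth try comes from access order: Julia arrays are column-major, so with the (chunk × neuron) chunk matrix the inner j loop touches consecutive memory. A toy sketch of that access pattern (my illustration, not package code):

# j (the chunk index) varies fastest, so A[j, i] is a sequential scan:
function chunkScan(A::Matrix{UInt64})
    s = 0
    for i in axes(A, 2), j in axes(A, 1)
        s += count_ones(A[j, i])
    end
    return s
end

chunkScan(rand(UInt64, 10, 64))   # (chunks × neurons), like Wc above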

BitTensor.jl

function BitTensor{2}(value, rows::Int, cols::Int)
    #inverted to be cache friendly when iterating over rows
    chunkDims = (ceil(Int, cols / 64), rows)
    if value == undef
        chunks = rand(UInt64, chunkDims)
    elseif value == true
        chunks = fill(typemax(UInt64), chunkDims)
    else
        chunks = zeros(UInt64, chunkDims)
    end

    # Zero the padding bits beyond `cols` in each row's last chunk.
    if value != false
        rest = 64 - (cols % 64)
        if rest < 64
            for i in 1:rows
                chunks[end,i] = (chunks[end,i] << rest) >> rest
            end
        end
    end

    len = rows * cols
    bmat = new(chunks, len)
    bmat.dims = (rows, cols)

    return bmat
end

@inline function Base.getindex(mat::BitTensor{2}, i::Int, j::Int)
    jMod = (j - 1) % 64
    # UInt64 masks avoid signed overflow when jMod == 63 (an Int 1 << 63
    # goes negative and the mixed signed/unsigned op throws)
    return Bool((mat.chunks[ceil(Int, j / 64), i] & (UInt64(1) << jMod)) >> jMod)
end

@inline function Base.setindex!(mat::BitTensor{2}, value::Bool, i::Int, j::Int)
    jMod = (j - 1) % 64
    mat.chunks[ceil(Int, j / 64), i] &= ~(UInt64(1) << jMod)
    mat.chunks[ceil(Int, j / 64), i] |= UInt64(value) << jMod
end