proposal for multidimensional in-place circshift!

JKrehl commented 7 years ago

Based on #20696 (modified for improved readability) and by extent #19923 and #16032 I developed the following algorithm for the n-dimensional truly in-place circshift! using the three-reversions-method (which in n dimensions can be reorganised into an operation which splits each axis and reverses the parts of this axis).

Any thoughts?

using Base.Cartesian

function _reversing_loop(ex, half, ix, irx, be, en, hex=nothing)
    ixen = half ? :(div($be+$en-1,2)) : en
    quote
        $irx = $en
        for $ix in $be:$ixen
            $ex
            $irx -= 1
        end
        if $half && isodd($en-$be-1)
           $ix = $irx = $ixen+1
           $hex
        end
    end
end

@generated function splitreverse!{T,N,I<:Integer}(dest::AbstractArray{T,N}, src::AbstractArray{T,N}, split::NTuple{N, I})
    xs = ((Symbol("x_", i) for i in 1:N)...)
    rxs = ((Symbol("rx_", i) for i in 1:N)...)

    hex = ex = :((dest[$(xs...)], dest[$(rxs...)]) = (src[$(rxs...)], src[$(xs...)]))

    for i in 1:N
        hex = quote
            $(_reversing_loop(ex, true, xs[i], rxs[i], 1, :(split[$i]), hex))
            $(_reversing_loop(ex, true, xs[i], rxs[i], :(split[$i]+1), :(size(src, $i)), hex))
        end

        ex = quote
            $(_reversing_loop(ex, false, xs[i], rxs[i], 1, :(split[$i])))
            $(_reversing_loop(ex, false, xs[i], rxs[i], :(split[$i]+1), :(size(src, $i))))
        end
    end

    quote
        @inbounds $hex
        dest
    end
end

function circshift!{T,N,I<:Integer}(dest::AbstractArray{T,N}, src::AbstractArray{T,N}, shifts::NTuple{N, I})
    @assert size(dest) == size(src)

    splits = map(mod, map(-, shifts), size(src))

    splitreverse!(dest, src, splits)
    splitreverse!(dest, dest, size(src))
end

circshift!{I<:Integer}(dest::AbstractVector, src::AbstractVector, shifts::I) =  circshift!(dest, src, (shifts,))

It passes the following tests:

using Base.Test
@testset begin
    @test begin A = collect(1:6); circshift!(A, A, 0) == Int[1, 2, 3, 4, 5, 6] end
    @test begin A = collect(1:5); circshift!(A, A, 2) == Int[4, 5, 1, 2, 3] end
    @test begin A = collect(1:5); circshift!(A, A, 7) == Int[4, 5, 1, 2, 3] end
    @test begin A = collect(1:7); circshift!(A, A, -4) == Int[5, 6, 7, 1, 2, 3, 4] end
    begin
        A = reshape(collect(1:30), (5,6))
        circshift!(A, A, (2,4))
        @test A== [14 19 24 29 4 9;
                           15 20 25 30 5 10;
                            11 16 21 26 1 6;
                            12 17 22 27 2 7;
                            13 18 23 28 3 8]
        A = reshape(collect(1:90), (5,6,3))
        @test circshift!(copy(A), A, (1,2,2)) == circshift!(A, A, (1,2,2))
        A = reshape(collect(1:90), (5,6,3))
        B = copy(A)
        @test circshift!(B, A, (3,3,1)) == circshift!(A, A, (3,3,1))
        B = copy(A)
        circshift!(A, A, (0,0,1))
        circshift!(A, A, (4,0,0))
        circshift!(A, A, (0,1,-1))
        @test A == circshift!(B, B, (4,1,0))
    end
end;

stevengj commented 7 years ago

Presumably we would only want to use this when dest === src? Wouldn't the existing code be faster for the out-of-place dest !== src case?

JKrehl commented 7 years ago

@stevengj In 1D the current cirshift! is indeed faster but in 2D and 3D my new one is. But the benchmarks below are all I did and may be not conclusive (maybe in regards to more extreme cases).

Profiling (systemwide Intel Performance Counters) shows a considerably increased L2 Hit Ratio for the new algorithm in the 2D in 3D cases, where the L2HR is quite poor (50%). For what 5 minutes of profiling are worth.

In [6]: @benchmark circshift!(A, A, (1,2,3)) setup=(A = rand(Int, (64,64,64)))
Out[6]: BenchmarkTools.Trial: 
  memory estimate:  80 bytes
  allocs estimate:  3
  mean time:        228.192 μs (0.00% GC)
  maximum time:     941.883 μs (0.00% GC)
In [8]: @benchmark Base.circshift!(B, A, (1,2,3)) setup=(A = rand(Int, (64,64,64)); B = copy(A))
Out[8]: BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  mean time:        1.179 ms (0.00% GC)
  maximum time:     2.093 ms (0.00% GC)

In[17]: @benchmark circshift!(A, A, (17,12)) setup=(A = rand(Int, (64,64)))
Out[17]: BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  mean time:        2.831 μs (0.00% GC)
  maximum time:     7.548 μs (0.00% GC)
In [16]: @benchmark Base.circshift!(B, A, (17,12)) setup=(A = rand(Int, (64,64)); B = copy(A))
Out[16]: BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  mean time:        5.516 μs (0.00% GC)
  maximum time:     17.720 μs (0.00% GC)
In [12]: @benchmark circshift!(A, A, (17,)) setup=(A = rand(Int, 64^2))
Out[12]: BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  mean time:        2.826 μs (0.00% GC)
  maximum time:     9.344 μs (0.00% GC)
In [13]: @benchmark Base.circshift!(B, A, (17,)) setup=(A = rand(Int, 64^2); B = copy(A))
Out[13]: BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  mean time:        2.199 μs (0.00% GC)
  maximum time:     6.105 μs (0.00% GC)

Some lines are removed in order to save precious vertical space.

JKrehl commented 7 years ago

@stevengj

The performance problem of the existing circshift! is the copy mechanism using the cartesian ranges, if the elements are copied by hand the copy based circshift! is twice as fast as the three-reversion based one in the 3D benchmark.

timholy commented 7 years ago

Wouldn't it be better to fix the copy! function, then? The one using two arrays is a lot easier to understand, and if it can be made faster...

A good thing to try would be to split out the first dimension for special treatment, somewhat similar to https://github.com/JuliaLang/julia/blob/c716e638bba74f52114c7adedfe4a653e42c40d0/base/reducedim.jl#L191-L198.

JKrehl commented 7 years ago

@timholy For the circshift! between different arrays maybe, although I do not immediately see what is wrong with thecopy! using CartesianRanges. This can, however, not handle the case of circshift!ing in one array. As my use-case is memory-critical the extra allocation is prohibitive beyond speed concerns. The proposed algorithm could mybe be simplified in some ways (e.g. shifting axes after each other at the cost of speed, or nesting the loops outside-in instead of inside-out).

timholy commented 7 years ago

As my use-case is memory-critical the extra allocation is prohibitive beyond speed concerns.

Understood, and in some cases this might be a convincing argument.

But because this change would affect the maintainability of Julia's code base, allow me push back on you a bit. This is a factor of 2 problem: any problem that can be solved with the in-place algorithm can also be solved out-of-place if you have twice the RAM. To me it seems better to keep an algorithm that's fast (or could be) and easy to understand than to replace it with one that's slower and quite opaque. Yes, the extra RAM might cost a couple hundred bucks, but source code is vastly more expensive and it affects everyone.

JKrehl commented 7 years ago

@timholy So, there is no interest in a truly in-place circshift!? Why wasn't that brought up in the previous discussions (#16032, #19923)?

Factor 2 is probably near the theoretical difference in speed (two runs of exchanges (two reads two writes) over half the index space versus one run of copy (1r 1w) over all the index space) without resorting to discontiguous memory access patterns.

stevengj commented 7 years ago

@timholy, I think the main concern is probably the repeated memory allocation and gc pressure in situations where one is doing circshift over and over in a loop... but I agree that there is a significant price being paid here in term of the code complexity.

timholy commented 7 years ago

Sorry @JKrehl, I didn't look into any of the previous discussions; I know that's frustrating for you but there are only so many hours of the day. Note that my viewpoint is a personal opinion rather than some kind of official policy, as evinced by the fact that others didn't suggest it. I may be the only one who thinks this way, and if so I'd not stand in anyone's way.

Obviously if you have an inplace version you don't technically need the out-of-place algorithm, and I've been assuming you'd get rid of the out-of-place one, or make it not called by default. These would trigger a stronger objection from me. If we keep the current version and add the inplace one as an option for sophisticated users, I'd have fewer objections.

I still think it's mildly crazy to not use RAM when it makes your life better. That's the whole point of having a good GC. @stevengj, if you're doing this a lot in a loop, why not just allocate a temporary buffer once at the beginning? You'll get considerably better performance that way.

JKrehl commented 7 years ago

@timholy Well, I'm only a tiny bit grumpy and that is not your fault because your point is valid. And Julia's code base is complicated enough as it is, although that does not seem to stop anyone, as far as I can tell.

Maybe there is a way to integrate bot algorithms into a single (still human-readable) form, since their iteration patterns are very similar...

timholy commented 7 years ago

If by complex you mostly mean the Base.Cartesian algorithms, over time I've replaced a few with the more modern and readable CartesianIndex/CartesianRange framework. But obviously there are still a lot left.

JKrehl commented 7 years ago

@timholy Not "mostly", but "also". However, being clever and being concise justifies a lot. In my mind.

It is possbile to merge both algorithms into one and it is not even unreadable: https://gist.github.com/JKrehl/a697c17d00a927766b9631c5a68bb8db

RainerHeintzmann commented 4 months ago

Great Algorithm. Here is a version updated to more modern Versions of Julia:

# code modified from https://github.com/JuliaLang/julia/issues/20849

function _reversing_loop(ex, half, ix, irx, be, en, hex=nothing)
    ixen = half ? :(div($be+$en-1,2)) : en
    quote
        $irx = $en
        for $ix in $be:$ixen
            $ex
            $irx -= 1
        end
        if $half && isodd($en-$be-1)
           $ix = $irx = $ixen+1
           $hex
        end
    end
end

@generated function splitreverse!(dest::AbstractArray{T,N}, src::AbstractArray{T,N}, split::NTuple{N, I}) where {T,N,I<:Integer}
    xs = ntuple((i)->Symbol("x_", i) , N)
    rxs = ntuple((i)->Symbol("rx_", i), N)

    hex = ex = :((dest[$(xs...)], dest[$(rxs...)]) = (src[$(rxs...)], src[$(xs...)]))

    for i in 1:N
        hex = quote
            $(_reversing_loop(ex, true, xs[i], rxs[i], 1, :(split[$i]), hex))
            $(_reversing_loop(ex, true, xs[i], rxs[i], :(split[$i]+1), :(size(src, $i)), hex))
        end

        ex = quote
            $(_reversing_loop(ex, false, xs[i], rxs[i], 1, :(split[$i])))
            $(_reversing_loop(ex, false, xs[i], rxs[i], :(split[$i]+1), :(size(src, $i))))
        end
    end

    quote
        @inbounds $hex
        dest
    end
end

function circshift_in_place!(dest::AbstractArray{T,N}, src::AbstractArray{T,N}, shifts::NTuple{N, I}) where {T,N,I<:Integer}
    @assert size(dest) == size(src)

    splits = map(mod, map(-, shifts), size(src))

    splitreverse!(dest, src, splits)
    splitreverse!(dest, dest, size(src))
end

circshift_in_place!(dest::AbstractVector, src::AbstractVector, shifts::I) where {I<:Integer} =  circshift_in_place!(dest, src, (shifts,))

.... just to for future reference.

henricryden commented 2 months ago

Why no chop data inplace instead, i.e. negate every other element?

JuliaLang / julia

proposal for multidimensional in-place circshift! #20849