JuliaLang / julia

The Julia Programming Language
https://julialang.org/
MIT License
45.43k stars 5.45k forks source link

`copyto!` does not vectorize for views with `UnitRange` indices #53430

Open jishnub opened 6 months ago

jishnub commented 6 months ago
julia> a = rand(100,100); b = similar(a); av = view(a, axes(a)...); bv = view(b, axes(b)...); bv2 = view(b, UnitRange.(axes(b))...);

julia> @btime copyto!($bv, $av); # fast, indices are Base.OneTos
  2.333 μs (0 allocations: 0 bytes)

julia> @btime copyto!($bv2, $av); # slow, indices are UnitRanges
  24.093 μs (0 allocations: 0 bytes)

This performance difference appears to arise from a lack of vectorization in indexing. In the first case, the output of

@code_llvm Base.copyto_unaliased!(IndexCartesian(), bv, IndexCartesian(), av);

contains

vector.body:                                      ; preds = %vector.body, %vector.ph
    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; └└
;  @ abstractarray.jl:1105 within `copyto_unaliased!`
; ┌ @ abstractarray.jl:1335 within `getindex`
; │┌ @ abstractarray.jl:1381 within `_getindex`
; ││┌ @ subarray.jl:317 within `getindex` @ array.jl:924 @ essentials.jl:892
     %26 = add i64 %index, %20
     %27 = getelementptr inbounds double, ptr %12, i64 %26
     %wide.load = load <4 x double>, ptr %27, align 8
     %28 = getelementptr inbounds double, ptr %27, i64 4
     %wide.load1001 = load <4 x double>, ptr %28, align 8
     %29 = getelementptr inbounds double, ptr %27, i64 8
     %wide.load1002 = load <4 x double>, ptr %29, align 8
     %30 = getelementptr inbounds double, ptr %27, i64 12
     %wide.load1003 = load <4 x double>, ptr %30, align 8
; └└└
; ┌ @ abstractarray.jl:1436 within `setindex!`
; │┌ @ abstractarray.jl:1466 within `_setindex!`
; ││┌ @ subarray.jl:369 within `setindex!` @ array.jl:988
     %31 = add i64 %index, %21
     %32 = getelementptr inbounds double, ptr %13, i64 %31
     store <4 x double> %wide.load, ptr %32, align 8
     %33 = getelementptr inbounds double, ptr %32, i64 4
     store <4 x double> %wide.load1001, ptr %33, align 8
     %34 = getelementptr inbounds double, ptr %32, i64 8
     store <4 x double> %wide.load1002, ptr %34, align 8
     %35 = getelementptr inbounds double, ptr %32, i64 12
     store <4 x double> %wide.load1003, ptr %35, align 8
     %index.next = add nuw i64 %index, 16
     %36 = icmp eq i64 %index.next, %n.vec
     br i1 %36, label %middle.block, label %vector.body

whereas

@code_llvm Base.copyto_unaliased!(IndexCartesian(), bv2, IndexCartesian(), av);

contains

;  @ abstractarray.jl:1105 within `copyto_unaliased!`
; ┌ @ abstractarray.jl:1335 within `getindex`
; │┌ @ abstractarray.jl:1381 within `_getindex`
; ││┌ @ subarray.jl:317 within `getindex` @ array.jl:924
; │││┌ @ abstractarray.jl:1370 within `_to_linear_index`
; ││││┌ @ abstractarray.jl:3080 within `_sub2ind` @ abstractarray.jl:3096
; │││││┌ @ abstractarray.jl:3112 within `_sub2ind_recurse` @ abstractarray.jl:3112
; ││││││┌ @ abstractarray.jl:3119 within `offsetin`
; │││││││┌ @ int.jl:86 within `-`
          %19 = add i64 %value_phi71, -1
; ││││││└└
; ││││││┌ @ int.jl:88 within `*`
         %20 = mul i64 %"src::SubArray.parent190.size209.sroa.0.0.copyload", %19
; ││││││└
; ││││││┌ @ int.jl:87 within `+`
         %21 = add i64 %value_phi70, -1
; │││└└└└
; │││ @ subarray.jl:317 within `getindex` @ array.jl:924 @ essentials.jl:892
     %22 = add i64 %21, %20
     %23 = getelementptr inbounds double, ptr %16, i64 %22
     %24 = load double, ptr %23, align 8
; └└└
; ┌ @ abstractarray.jl:1436 within `setindex!`
; │┌ @ abstractarray.jl:1466 within `_setindex!`
; ││┌ @ subarray.jl:369 within `setindex!`
; │││┌ @ subarray.jl:295 within `reindex` @ subarray.jl:295
; ││││┌ @ views.jl:150 within `maybeview`
; │││││┌ @ array.jl:3067 within `getindex`
; ││││││┌ @ range.jl:938 within `_getindex`
; │││││││┌ @ int.jl:87 within `+`
          %25 = add i64 %value_phi71, -2
; │││└└└└└
; │││ @ subarray.jl:369 within `setindex!` @ array.jl:988
; │││┌ @ abstractarray.jl:1370 within `_to_linear_index`
; ││││┌ @ abstractarray.jl:3080 within `_sub2ind` @ abstractarray.jl:3096
; │││││┌ @ abstractarray.jl:3112 within `_sub2ind_recurse` @ abstractarray.jl:3112
; ││││││┌ @ abstractarray.jl:3119 within `offsetin`
; │││││││┌ @ int.jl:86 within `-`
          %26 = add i64 %25, %"dest::SubArray.indices_ptr[2]_ptr.start_ptr.unbox"
; ││││││└└
; ││││││┌ @ int.jl:88 within `*`
         %27 = mul i64 %"dest::SubArray.parent.size298.sroa.0.0.copyload", %26
; │││└└└└
; │││ @ subarray.jl:369 within `setindex!`
; │││┌ @ subarray.jl:295 within `reindex`
; ││││┌ @ views.jl:150 within `maybeview`
; │││││┌ @ array.jl:3067 within `getindex`
; ││││││┌ @ range.jl:938 within `_getindex`
; │││││││┌ @ int.jl:87 within `+`
          %28 = add i64 %value_phi70, -2
; │││└└└└└
; │││ @ subarray.jl:369 within `setindex!` @ array.jl:988
; │││┌ @ abstractarray.jl:1370 within `_to_linear_index`
; ││││┌ @ abstractarray.jl:3080 within `_sub2ind` @ abstractarray.jl:3096
; │││││┌ @ abstractarray.jl:3112 within `_sub2ind_recurse` @ abstractarray.jl:3112
; ││││││┌ @ int.jl:87 within `+`
         %29 = add i64 %28, %"dest::SubArray.indices_ptr[1]_ptr.start_ptr.unbox"
; │││└└└└
     %30 = add i64 %29, %27
     %31 = getelementptr inbounds double, ptr %17, i64 %30
     store double %24, ptr %31, align 8
; └└└

In particular, if I add a @simd declaration, this appears to improve performance considerably:

julia> function mycopyto!(dest::AbstractArray, src::AbstractArray)
           iterdest = eachindex(dest)
           itersrc = eachindex(src)
           if iterdest == itersrc
               @inbounds @simd for I in iterdest
                   if isassigned(src, I)
                       dest[I] = src[I]
                   else
                       Base._unsetindex!(dest, I)
                   end
               end
           else
               @invoke copyto!(dest::AbstractArray, src::AbstractArray)
           end
           return dest
       end
mycopyto! (generic function with 1 method)

julia> @btime mycopyto!($bv2, $av);
  2.276 μs (0 allocations: 0 bytes)

Versioninfo:

julia> versioninfo()
Julia Version 1.12.0-DEV.50
Commit 8425b0ea03* (2024-02-21 20:33 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 8 × 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz
  WORD_SIZE: 64
  LLVM: libLLVM-16.0.6 (ORCJIT, tigerlake)
Threads: 1 default, 0 interactive, 1 GC (on 8 virtual cores)
Environment:
  LD_LIBRARY_PATH = :/usr/lib/x86_64-linux-gnu/gtk-3.0/modules
  JULIA_EDITOR = subl
jishnub commented 6 months ago

This also appears to be a regression from v1.10.1, where both the operations are equally performant:

julia> a = rand(100,100); b = similar(a); av = view(a, axes(a)...); bv = view(b, axes(b)...); bv2 = view(b, UnitRange.(axes(b))...);

julia> @btime copyto!($bv, $av);
  9.646 μs (0 allocations: 0 bytes)

julia> @btime copyto!($bv2, $av);
  9.560 μs (0 allocations: 0 bytes)

julia> versioninfo()
Julia Version 1.10.1
Commit 7790d6f0641 (2024-02-13 20:41 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 8 × 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-15.0.7 (ORCJIT, tigerlake)
Threads: 1 default, 0 interactive, 1 GC (on 8 virtual cores)
Environment:
  LD_LIBRARY_PATH = :/usr/lib/x86_64-linux-gnu/gtk-3.0/modules
  JULIA_EDITOR = subl
jishnub commented 6 months ago

Bisected the regression to #51760

commit f0a28e9a45a34f1c524b3cf02cbbac1351f1da81 (HEAD)
Author: Jameson Nash <jameson@juliacomputing.com>
Date:   Tue Oct 24 16:07:08 2023 -0500

    add unsetindex support to more copyto methods (#51760)

On df39cee3d8723d04b8ba73b13e59d45d195e8f0b,

julia> a = rand(100,100); b = similar(a); av = view(a, axes(a)...); bv2 = view(b, UnitRange.(axes(b))...);

julia> @btime copyto!($bv2, $av);
  2.428 μs (0 allocations: 0 bytes)

whereas on f0a28e9a45:

julia> @btime copyto!($bv2, $av);
  20.690 μs (0 allocations: 0 bytes)
KristofferC commented 4 months ago

Even with the revert of https://github.com/JuliaLang/julia/pull/51760 there still seem to be some slowdown:

1.10:

julia> @btime copyto!($bv, $av); # fast, indices are Base.OneTos
  3.695 μs (0 allocations: 0 bytes)

julia> @btime copyto!($bv2, $av); # slow, indices are UnitRanges
  3.522 μs (0 allocations: 0 bytes)

1.11:

julia> @btime copyto!($bv, $av); # fast, indices are Base.OneTos
  1.158 μs (0 allocations: 0 bytes)

julia> @btime copyto!($bv2, $av); # slow, indices are UnitRanges
  7.991 μs (0 allocations: 0 bytes)

But I am not sure it is bad enough to require a milestone...

jishnub commented 3 months ago

Oddly, for me, the performance regression persists on a recently nightly:

julia> @btime copyto!($bv, $av); # fast, indices are Base.OneTos
  2.375 μs (0 allocations: 0 bytes)

julia> @btime copyto!($bv2, $av); # slow, indices are UnitRanges
  23.745 μs (0 allocations: 0 bytes)

julia> VERSION
v"1.12.0-DEV.560"

I also see a similar issue on the backports-release-1.11 branch. I'm unsure why the issue still persists, but perhaps we should add the milestone back.

julia> @btime copyto!($bv, $av);
  2.597 μs (0 allocations: 0 bytes)

julia> @btime copyto!($bv2, $av);
  24.499 μs (0 allocations: 0 bytes)

julia> versioninfo()
Julia Version 1.11.0-beta2.2
Commit 862f863e0f* (2024-05-29 10:49 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 8 × 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz
  WORD_SIZE: 64
  LLVM: libLLVM-16.0.6 (ORCJIT, tigerlake)
Threads: 1 default, 0 interactive, 1 GC (on 8 virtual cores)
Environment:
  LD_LIBRARY_PATH = :/usr/lib/x86_64-linux-gnu/gtk-3.0/modules
  JULIA_EDITOR = subl

@KristofferC Did the performance improve for you after reverting the PR?