JuliaArrays / FixedSizeArrays.jl

Fixed-size multidimensional arrays. An Array-like type with less indirection at the cost of resizing capability.
MIT License
18 stars 2 forks source link

Memory stores aren't vectorised in a `for` loop unless an explicit `@inbounds` is used #70

Open giordano opened 4 hours ago

giordano commented 4 hours ago

Follow up from https://github.com/JuliaArrays/FixedSizeArrays.jl/pull/68#issuecomment-2398328623:

julia> code_llvm((FixedSizeVector{Float64,Memory{Float64}},)) do v
           for idx in eachindex(v)
               v[idx] = idx
           end
       end
LLVM IR
```llvm
; Function Signature: var"#23"(FixedSizeArrays.FixedSizeArray{Float64, 1, Memory{Float64}})
;  @ REPL[16]:2 within `#23`
define void @"julia_#23_3098"(ptr nocapture noundef nonnull readonly align 8 dereferenceable(16) %"v::FixedSizeArray", ptr nocapture readonly %.roots.v) #0 {
top:
; ┌ @ abstractarray.jl:321 within `eachindex`
; │┌ @ abstractarray.jl:137 within `axes1`
; ││┌ @ abstractarray.jl:98 within `axes`
; │││┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:64 within `size`
; ││││┌ @ Base.jl:49 within `getproperty`
       %0 = getelementptr inbounds i8, ptr %"v::FixedSizeArray", i64 8
; └└└└└
; ┌ @ range.jl:911 within `iterate`
; │┌ @ range.jl:688 within `isempty`
; ││┌ @ operators.jl:425 within `>`
; │││┌ @ int.jl:83 within `<`
      %.unbox = load i64, ptr %0, align 8
      %1 = icmp slt i64 %.unbox, 1
; └└└└
  br i1 %1, label %L48, label %L13.preheader16

L13.preheader16:                                  ; preds = %top
  %memoryref_mem = load ptr, ptr %.roots.v, align 8
  %memory_data_ptr = getelementptr inbounds { i64, ptr }, ptr %memoryref_mem, i64 0, i32 1
;  @ REPL[16]:3 within `#23`
; ┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:240
; │┌ @ boot.jl:544 within `memoryref`
    %memoryref_data.pre = load ptr, ptr %memory_data_ptr, align 8
; │└
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:59 within `setindex!`
   br label %L29

L29:                                              ; preds = %L29, %L13.preheader16
   %value_phi3 = phi i64 [ %4, %L29 ], [ 1, %L13.preheader16 ]
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:239
; │┌ @ number.jl:7 within `convert`
; ││┌ @ float.jl:239 within `Float64`
     %2 = sitofp i64 %value_phi3 to double
; │└└
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:240
   %memoryref_offset = shl i64 %value_phi3, 3
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:241
   %3 = getelementptr i8, ptr %memoryref_data.pre, i64 %memoryref_offset
   %memoryref_data6 = getelementptr i8, ptr %3, i64 -8
   store double %2, ptr %memoryref_data6, align 8
; └
;  @ REPL[16]:4 within `#23`
; ┌ @ range.jl:915 within `iterate`
   %4 = add nuw i64 %value_phi3, 1
; └
  %5 = icmp ult i64 %value_phi3, %.unbox
  br i1 %5, label %L29, label %L48

L48:                                              ; preds = %L29, %top
  ret void
}
```

Compare with

julia> code_llvm((FixedSizeVector{Float64,Memory{Float64}},)) do v
           for idx in eachindex(v)
               @inbounds v[idx] = idx
           end
       end
LLVM IR
```llvm
; Function Signature: var"#26"(FixedSizeArrays.FixedSizeArray{Float64, 1, Memory{Float64}})
;  @ REPL[17]:2 within `#26`
define void @"julia_#26_3146"(ptr nocapture noundef nonnull readonly align 8 dereferenceable(16) %"v::FixedSizeArray", ptr nocapture readonly %.roots.v) #0 {
top:
; ┌ @ abstractarray.jl:321 within `eachindex`
; │┌ @ abstractarray.jl:137 within `axes1`
; ││┌ @ abstractarray.jl:98 within `axes`
; │││┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:64 within `size`
; ││││┌ @ Base.jl:49 within `getproperty`
       %0 = getelementptr inbounds i8, ptr %"v::FixedSizeArray", i64 8
; └└└└└
; ┌ @ range.jl:911 within `iterate`
; │┌ @ range.jl:688 within `isempty`
; ││┌ @ operators.jl:425 within `>`
; │││┌ @ int.jl:83 within `<`
      %.unbox = load i64, ptr %0, align 8
      %1 = icmp slt i64 %.unbox, 1
; └└└└
  br i1 %1, label %L48, label %L13.preheader

L13.preheader:                                    ; preds = %top
  %memoryref_mem = load ptr, ptr %.roots.v, align 8
  %memory_data_ptr = getelementptr inbounds { i64, ptr }, ptr %memoryref_mem, i64 0, i32 1
  %memoryref_data = load ptr, ptr %memory_data_ptr, align 8
;  @ REPL[17]:4 within `#26`
  %invariant.gep = getelementptr i8, ptr %memoryref_data, i64 -8
  %min.iters.check = icmp ult i64 %.unbox, 8
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %L13.preheader
  %n.vec = and i64 %.unbox, 9223372036854775800
  %ind.end = or disjoint i64 %n.vec, 1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <2 x i64> [ <i64 1, i64 2>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %step.add = add <2 x i64> %vec.ind, <i64 2, i64 2>
  %step.add13 = add <2 x i64> %vec.ind, <i64 4, i64 4>
  %step.add14 = add <2 x i64> %vec.ind, <i64 6, i64 6>
;  @ REPL[17]:3 within `#26`
; ┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:239
; │┌ @ number.jl:7 within `convert`
; ││┌ @ float.jl:239 within `Float64`
     %2 = sitofp <2 x i64> %vec.ind to <2 x double>
     %3 = sitofp <2 x i64> %step.add to <2 x double>
     %4 = sitofp <2 x i64> %step.add13 to <2 x double>
     %5 = sitofp <2 x i64> %step.add14 to <2 x double>
; │└└
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:240
   %offset.idx = shl i64 %index, 3
   %6 = or disjoint i64 %offset.idx, 8
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:241
   %7 = getelementptr i8, ptr %invariant.gep, i64 %6
   %8 = getelementptr double, ptr %7, i64 2
   %9 = getelementptr double, ptr %7, i64 4
   %10 = getelementptr double, ptr %7, i64 6
   store <2 x double> %2, ptr %7, align 8
   store <2 x double> %3, ptr %8, align 8
   store <2 x double> %4, ptr %9, align 8
   store <2 x double> %5, ptr %10, align 8
   %index.next = add nuw i64 %index, 8
   %vec.ind.next = add <2 x i64> %vec.ind, <i64 8, i64 8>
   %11 = icmp eq i64 %index.next, %n.vec
   br i1 %11, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
; └
;  @ REPL[17]:4 within `#26`
  %cmp.n = icmp eq i64 %.unbox, %n.vec
  br i1 %cmp.n, label %L48, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %L13.preheader
  %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L13.preheader ]
  br label %L13

L13:                                              ; preds = %L13, %scalar.ph
  %value_phi3 = phi i64 [ %13, %L13 ], [ %bc.resume.val, %scalar.ph ]
;  @ REPL[17]:3 within `#26`
; ┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:239
; │┌ @ number.jl:7 within `convert`
; ││┌ @ float.jl:239 within `Float64`
     %12 = sitofp i64 %value_phi3 to double
; │└└
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:240
   %memoryref_offset = shl i64 %value_phi3, 3
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:241
   %gep = getelementptr i8, ptr %invariant.gep, i64 %memoryref_offset
   store double %12, ptr %gep, align 8
; └
;  @ REPL[17]:4 within `#26`
; ┌ @ range.jl:915 within `iterate`
; │┌ @ promotion.jl:639 within `==`
    %.not.not = icmp eq i64 %value_phi3, %.unbox
; │└
   %13 = add nuw i64 %value_phi3, 1
; └
  br i1 %.not.not, label %L48, label %L13

L48:                                              ; preds = %L13, %middle.block, %top
  ret void
}
```

I believe this is entirely due to the fact that we're using `Memory` instead of `MemoryRef`, and `MemoryRef` is somehow better optimised for stores (it doesn't need to call `memoryrefnew(mem)`). As a proof of concept (don't take this code seriously!), with this patch:

diff --git a/src/FixedSizeArrays.jl b/src/FixedSizeArrays.jl
index 6aeac57..3fd1506 100644
--- a/src/FixedSizeArrays.jl
+++ b/src/FixedSizeArrays.jl
@@ -13,9 +13,11 @@ struct Internal end

 struct FixedSizeArray{T,N,Mem<:GenericMemory{<:Any,T}} <: DenseArray{T,N}
     mem::Mem
+    ref::MemoryRef{T}
     size::NTuple{N,Int}
     function FixedSizeArray{T,N,M}(::Internal, mem::M, size::NTuple{N,Int}) where {T,N,M<:GenericMemory{<:Any,T}}
-        new{T,N,M}(mem, size)
+        ref = Base.memoryref(mem)
+        new{T,N,M}(mem, ref, size)
     end
 end

@@ -57,7 +59,7 @@ Base.IndexStyle(::Type{<:FixedSizeArray}) = IndexLinear()
 Base.@propagate_inbounds Base.getindex(A::FixedSizeArray, i::Int) = A.mem[i]
 Base.@propagate_inbounds Base.@assume_effects :noub_if_noinbounds function Base.setindex!(A::FixedSizeArray{T}, x, i::Int) where {T}
     @boundscheck checkbounds(A, i)
-    @inbounds A.mem[i] = x
+    Base.memoryrefset!(Base.memoryrefnew(A.ref, i, false), x isa T ? x : convert(T,x)::T, :not_atomic, false)
     return A
 end

I get

julia> code_llvm((FixedSizeVector{Float64,Memory{Float64}},)) do v
           for idx in eachindex(v)
               v[idx] = 1.0
           end
       end
; Function Signature: var"#2"(FixedSizeArrays.FixedSizeArray{Float64, 1, Memory{Float64}})
;  @ REPL[2]:2 within `#2`
define void @"julia_#2_2000"(ptr nocapture noundef nonnull readonly align 8 dereferenceable(32) %"v::FixedSizeArray", ptr nocapture readonly %.roots.v) #0 {
top:
; ┌ @ abstractarray.jl:321 within `eachindex`
; │┌ @ abstractarray.jl:137 within `axes1`
; ││┌ @ abstractarray.jl:98 within `axes`
; │││┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:66 within `size`
; ││││┌ @ Base.jl:49 within `getproperty`
       %0 = getelementptr inbounds i8, ptr %"v::FixedSizeArray", i64 24
; └└└└└
; ┌ @ range.jl:911 within `iterate`
; │┌ @ range.jl:688 within `isempty`
; ││┌ @ operators.jl:425 within `>`
; │││┌ @ int.jl:83 within `<`
      %.unbox = load i64, ptr %0, align 8
      %1 = icmp slt i64 %.unbox, 1
; └└└└
  br i1 %1, label %L45, label %L13.preheader14

L13.preheader14:                                  ; preds = %top
  %2 = getelementptr inbounds i8, ptr %"v::FixedSizeArray", i64 8
  %memoryref_data = load ptr, ptr %2, align 8
;  @ REPL[2]:3 within `#2`
; ┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:61 within `setindex!`
; │┌ @ abstractarray.jl:699 within `checkbounds`
    %invariant.gep = getelementptr i8, ptr %memoryref_data, i64 -8
    %min.iters.check = icmp ult i64 %.unbox, 8
    br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %L13.preheader14
    %n.vec = and i64 %.unbox, 9223372036854775800
    %ind.end = or disjoint i64 %n.vec, 1
    br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; │└
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:62 within `setindex!`
   %offset.idx = shl i64 %index, 3
   %3 = or disjoint i64 %offset.idx, 8
   %4 = getelementptr i8, ptr %invariant.gep, i64 %3
   %5 = getelementptr i64, ptr %4, i64 2
   %6 = getelementptr i64, ptr %4, i64 4
   %7 = getelementptr i64, ptr %4, i64 6
   store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %4, align 8
   store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %5, align 8
   store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %6, align 8
   store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %7, align 8
   %index.next = add nuw i64 %index, 8
   %8 = icmp eq i64 %index.next, %n.vec
   br i1 %8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
; └
;  @ REPL[2]:4 within `#2`
  %cmp.n = icmp eq i64 %.unbox, %n.vec
  br i1 %cmp.n, label %L45, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %L13.preheader14
  %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L13.preheader14 ]
;  @ REPL[2]:3 within `#2`
; ┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:61 within `setindex!`
; │┌ @ abstractarray.jl:699 within `checkbounds`
    br label %L30

L30:                                              ; preds = %L30, %scalar.ph
    %value_phi3 = phi i64 [ %9, %L30 ], [ %bc.resume.val, %scalar.ph ]
; │└
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:62 within `setindex!`
   %memoryref_offset = shl i64 %value_phi3, 3
   %gep = getelementptr i8, ptr %invariant.gep, i64 %memoryref_offset
   store i64 4607182418800017408, ptr %gep, align 8
; └
;  @ REPL[2]:4 within `#2`
; ┌ @ range.jl:915 within `iterate`
   %9 = add nuw i64 %value_phi3, 1
; └
  %10 = icmp ult i64 %value_phi3, %.unbox
  br i1 %10, label %L30, label %L45

L45:                                              ; preds = %L30, %middle.block, %top
  ret void
}

which is pretty much what you'd get for `Vector`s. It looks like using `Memory` instead of `MemoryRef` adds an extra layer of indirection; I have no idea where to go from here. CC: @oscardssmith, who may have opinions about this (I already checked: https://github.com/JuliaLang/julia/pull/55913 doesn't change anything here).

giordano commented 2 hours ago

I'm so confused. On https://github.com/JuliaLang/julia/commit/dc344285d5be2bfdf4ead01effa95643b7babc8b I get:

julia> code_llvm((Memory{Float64},)) do v
           for idx in eachindex(v)
               v[idx] = 1.0
           end
       end
; Function Signature: var"#2"(Memory{Float64})
;  @ REPL[1]:2 within `#2`
define void @"julia_#2_1650"(ptr noundef nonnull align 8 dereferenceable(16) %"v::GenericMemory") #0 {
top:
  %pgcstack = call ptr inttoptr (i64 4335582988 to ptr)(i64 4335583024) #9
; ┌ @ range.jl:911 within `iterate`
; │┌ @ range.jl:688 within `isempty`
; ││┌ @ operators.jl:425 within `>`
; │││┌ @ int.jl:83 within `<`
      %.unbox = load i64, ptr %"v::GenericMemory", align 8
      %0 = icmp slt i64 %.unbox, 1
; └└└└
  br i1 %0, label %L29, label %mainloop

mainloop:                                         ; preds = %top
  %memory_data_ptr = getelementptr inbounds { i64, ptr }, ptr %"v::GenericMemory", i64 0, i32 1
  %memoryref_data = load ptr, ptr %memory_data_ptr, align 8
  %1 = shl nuw nsw i64 %.unbox, 1
  %memoryref_bytelen = shl i64 %.unbox, 3
;  @ REPL[1]:3 within `#2`
; ┌ @ genericmemory.jl:240 within `setindex!`
   %smin26 = call i64 @llvm.smin.i64(i64 %.unbox, i64 %1)
   %2 = sub nsw i64 %1, %smin26
   %exit.mainloop.at = call i64 @llvm.umin.i64(i64 %.unbox, i64 %2)
   %.not = icmp eq i64 %1, %smin26
   br i1 %.not, label %postloop, label %L11.preheader

L11.preheader:                                    ; preds = %mainloop
   %3 = and i64 %.unbox, 2305843009213693951
   %umax = call i64 @llvm.umax.i64(i64 %exit.mainloop.at, i64 1)
   %4 = add nsw i64 %umax, -1
   %umin = call i64 @llvm.umin.i64(i64 %3, i64 %4)
   %min.iters.check = icmp ult i64 %umin, 8
   br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %L11.preheader
   %5 = add nuw nsw i64 %umin, 1
   %n.mod.vf = and i64 %5, 7
   %6 = icmp eq i64 %n.mod.vf, 0
   %7 = select i1 %6, i64 8, i64 %n.mod.vf
   %n.vec = sub nsw i64 %5, %7
   %ind.end = add nsw i64 %n.vec, 1
   br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
   %8 = phi i64 [ 1, %vector.ph ], [ %15, %vector.body ]
; │ @ genericmemory.jl:241 within `setindex!`
   %9 = shl i64 %8, 3
   %10 = getelementptr i8, ptr %memoryref_data, i64 %9
   %11 = getelementptr i8, ptr %10, i64 -8
   %12 = getelementptr i8, ptr %10, i64 8
   %13 = getelementptr i8, ptr %10, i64 24
   %14 = getelementptr i8, ptr %10, i64 40
   store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %11, align 8
   store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %12, align 8
   store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %13, align 8
   store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %14, align 8
   %index.next = add nuw i64 %index, 8
   %15 = add i64 %8, 8
   %16 = icmp eq i64 %index.next, %n.vec
   br i1 %16, label %scalar.ph, label %vector.body

scalar.ph:                                        ; preds = %vector.body, %L11.preheader
   %bc.resume.val = phi i64 [ 1, %L11.preheader ], [ %ind.end, %vector.body ]
; │ @ genericmemory.jl:240 within `setindex!`
   br label %L11

L11:                                              ; preds = %idxend, %scalar.ph
   %value_phi3 = phi i64 [ %17, %idxend ], [ %bc.resume.val, %scalar.ph ]
   %memoryref_offset = shl i64 %value_phi3, 3
   %memoryref_byteoffset = add i64 %memoryref_offset, -8
   %memoryref_isinbounds = icmp ult i64 %memoryref_byteoffset, %memoryref_bytelen
   br i1 %memoryref_isinbounds, label %idxend, label %oob

L29:                                              ; preds = %idxend.postloop, %main.exit.selector, %top
; └
;  @ REPL[1]:4 within `#2`
  ret void

oob:                                              ; preds = %L11.postloop, %L11
  %value_phi3.lcssa = phi i64 [ %value_phi3.postloop, %L11.postloop ], [ %value_phi3, %L11 ]
;  @ REPL[1]:3 within `#2`
; ┌ @ genericmemory.jl:240 within `setindex!`
   %ptls_field = getelementptr inbounds i8, ptr %pgcstack, i64 16
   %ptls_load = load ptr, ptr %ptls_field, align 8
   %"box::GenericMemoryRef" = call noalias nonnull align 8 dereferenceable(32) ptr @ijl_gc_small_alloc(ptr %ptls_load, i32 616, i32 32, i64 4689350496) #8
   %"box::GenericMemoryRef.tag_addr" = getelementptr inbounds i64, ptr %"box::GenericMemoryRef", i64 -1
   store atomic i64 4689350496, ptr %"box::GenericMemoryRef.tag_addr" unordered, align 8
   store ptr %memoryref_data, ptr %"box::GenericMemoryRef", align 8
   %.repack17 = getelementptr inbounds { ptr, ptr }, ptr %"box::GenericMemoryRef", i64 0, i32 1
   store ptr %"v::GenericMemory", ptr %.repack17, align 8
   call void @ijl_bounds_error_int(ptr nonnull %"box::GenericMemoryRef", i64 %value_phi3.lcssa)
   unreachable

idxend:                                           ; preds = %L11
; │ @ genericmemory.jl:241 within `setindex!`
   %memoryref_data11 = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset
   store i64 4607182418800017408, ptr %memoryref_data11, align 8
; └
;  @ REPL[1]:4 within `#2`
; ┌ @ range.jl:915 within `iterate`
   %17 = add nuw nsw i64 %value_phi3, 1
; └
  %.not35 = icmp ult i64 %value_phi3, %exit.mainloop.at
  br i1 %.not35, label %L11, label %main.exit.selector

main.exit.selector:                               ; preds = %idxend
  %18 = icmp ult i64 %value_phi3, %.unbox
  br i1 %18, label %postloop, label %L29

postloop:                                         ; preds = %main.exit.selector, %mainloop
  %value_phi3.copy = phi i64 [ 1, %mainloop ], [ %17, %main.exit.selector ]
  br label %L11.postloop

L11.postloop:                                     ; preds = %idxend.postloop, %postloop
  %value_phi3.postloop = phi i64 [ %20, %idxend.postloop ], [ %value_phi3.copy, %postloop ]
;  @ REPL[1]:3 within `#2`
; ┌ @ genericmemory.jl:240 within `setindex!`
   %memoryref_offset.postloop = add nsw i64 %value_phi3.postloop, -1
   %19 = add nuw nsw i64 %.unbox, %memoryref_offset.postloop
   %memoryref_ovflw.not.postloop = icmp ult i64 %19, %1
   %memoryref_byteoffset.postloop = shl i64 %memoryref_offset.postloop, 3
   %memoryref_isinbounds.postloop = icmp ult i64 %memoryref_byteoffset.postloop, %memoryref_bytelen
   %"memoryref_isinbounds&notovflw.postloop" = and i1 %memoryref_ovflw.not.postloop, %memoryref_isinbounds.postloop
   br i1 %"memoryref_isinbounds&notovflw.postloop", label %idxend.postloop, label %oob

idxend.postloop:                                  ; preds = %L11.postloop
; │ @ genericmemory.jl:241 within `setindex!`
   %memoryref_data11.postloop = getelementptr inbounds i8, ptr %memoryref_data, i64 %memoryref_byteoffset.postloop
   store i64 4607182418800017408, ptr %memoryref_data11.postloop, align 8
; └
;  @ REPL[1]:4 within `#2`
; ┌ @ range.jl:915 within `iterate`
; │┌ @ promotion.jl:639 within `==`
    %.not.not.postloop = icmp eq i64 %value_phi3.postloop, %.unbox
; │└
   %20 = add nuw nsw i64 %value_phi3.postloop, 1
; └
  br i1 %.not.not.postloop, label %L29, label %L11.postloop
}

For a plain `Memory`, stores are vectorised (and bounds checks aren't even removed; I think it could get a similar treatment to #68), so why can't a thin wrapper around a `Memory` object do the same??!?

giordano commented 2 hours ago

This issue is fixed by... reverting #68, and the solution was already in the LLVM IR shown at https://github.com/JuliaArrays/FixedSizeArrays.jl/issues/65#issue-2571121680 😕 It sounds like we can't have both bounds-check elision and vectorised stores.