JuliaSmoothOptimizers / PartitionedVectors.jl


Allocation free broadcasted methods WIP #17

Closed · paraynaud closed this 2 years ago

paraynaud commented 2 years ago

First try at making element-wise operations allocation-free. The test uses a handmade structure TypeA, which makes a lot of allocations whenever similar is called.

using ProfileSVG, BenchmarkTools

mutable struct TypeA{T}
  value::T          # scalar value the arithmetic acts on
  compteur::Int     # counts how many times the instance is used in an operation
  n::Int
  rd_vec::Vector{T} # 500-element payload that makes every construction allocate
end

typeA(val::T) where T <: Number = TypeA{T}(val, 0, 500, rand(T, 500))
# similar builds a brand-new TypeA, allocating a fresh 500-element vector
Base.similar(::TypeA{T}) where T <: Number = TypeA{T}(T(-1), 0, 500, rand(T, 500))

v1 = typeA(1.)
v2 = typeA(2.)
vres = typeA(-1.)

import Base.+
# The optional res keyword lets the caller supply a preallocated destination;
# the default similar(v1) reproduces the usual allocating behaviour.
function (+)(v1::TypeA{T}, v2::TypeA{T}; res=similar(v1)) where T <: Number
  res.value = v1.value + v2.value
  v1.compteur += 1
  v2.compteur += 1
  res.compteur += 1
  return res
end

@benchmark v1 + v2
# BenchmarkTools.Trial: 10000 samples with 206 evaluations.
#  Range (min … max):  472.330 ns … 48.035 μs  ┊ GC (min … max):  0.00% … 97.47%
#  Time  (median):     591.262 ns              ┊ GC (median):     0.00%
#  Time  (mean ± σ):   970.916 ns ±  1.827 μs  ┊ GC (mean ± σ):  21.73% ± 12.12%
#   █▅▅▄▃▃▂▁                                                     ▁
#   ███████████▇▇▆▅▅▅▅▄▁▃▃▁▃▃▃▃▁▁▁▃▁▄▄▁▁▃▃▄▁▄▄▃▁▅▅▅▄▆▅▅▆▅▆▅▅▁▄▅▅ █
#   472 ns        Histogram: log(frequency) by time      11.1 μs <
#  Memory estimate: 4.11 KiB, allocs estimate: 2.
@benchmark +(v1, v2; res=vres)
# BenchmarkTools.Trial: 10000 samples with 849 evaluations.
#  Range (min … max):  141.696 ns …  1.973 μs  ┊ GC (min … max): 0.00% … 0.00%
#  Time  (median):     150.766 ns              ┊ GC (median):    0.00%
#  Time  (mean ± σ):   175.039 ns ± 64.511 ns  ┊ GC (mean ± σ):  0.43% ± 2.08%
#   ▅█▆▅▁▂▃▃▂  ▁▂▂▁▁▂▂▁▁     ▁▁                                  ▁
#   ██████████████████████▇██████████▇█▇▇▇▇▇▇▅▆▆▆▆▆▇▅▄▅▄▅▄▅▄▅▅▅▄ █
#   142 ns        Histogram: log(frequency) by time       390 ns <
#  Memory estimate: 32 bytes, allocs estimate: 2.

import Base.*
function (*)(v1::TypeA{T}, v2::Y; res=similar(v1)) where {T <: Number, Y <: Number}
  res.value = v1.value * v2
  v1.compteur += 1
  res.compteur += 1
  return res
end

@benchmark v1 * 2
# BenchmarkTools.Trial: 10000 samples with 208 evaluations.
#  Range (min … max):  468.750 ns … 39.772 μs  ┊ GC (min … max):  0.00% … 93.51%
#  Time  (median):     578.365 ns              ┊ GC (median):     0.00%
#  Time  (mean ± σ):   955.533 ns ±  1.769 μs  ┊ GC (mean ± σ):  21.72% ± 12.26%
#   █▅▄▄▃▃▂▁▁                                                    ▁
#   ██████████▇▇▇▆▆▄▅▄▅▄▃▄▁▁▁▄▁▁▃▁▁▁▃▁▁▁▃▁▃▃▁▄▃▃▃▄▄▄▄▅▄▆▅▆▅▆▅▆▆▆ █
#   469 ns        Histogram: log(frequency) by time      10.5 μs <
#  Memory estimate: 4.11 KiB, allocs estimate: 2.

@benchmark *(v1, 2, res=vres)
# BenchmarkTools.Trial: 10000 samples with 832 evaluations.
#  Range (min … max):  142.188 ns …  2.481 μs  ┊ GC (min … max): 0.00% … 89.60%
#  Time  (median):     152.644 ns              ┊ GC (median):    0.00%
#  Time  (mean ± σ):   179.654 ns ± 69.919 ns  ┊ GC (mean ± σ):  0.39% ±  1.91%
#   ▄█▆▃▂▃▄▃▁▁▁▂▂▁▂▃▂▁▁ ▁ ▁    ▁                                 ▁
#   ████████████████████████████████▇██▇▇▆▇▇▆▆▆▅▅▅▆▅▅▅▅▅▅▆▅▅▄▄▃▅ █
#   142 ns        Histogram: log(frequency) by time       422 ns <
#  Memory estimate: 32 bytes, allocs estimate: 2.

import Base.==
# Equality compares only the wrapped values, ignoring the counters.
function (==)(v1::TypeA{T}, v2::TypeA{T}) where {T <: Number}
  return v1.value == v2.value
end

vec1 = rand(5)
vec2 = rand(5)
bc = Base.broadcasted(+, vec1, Base.broadcasted(*, vec2, 2))
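
For reference, Base.broadcasted only builds a lazy Broadcasted tree; the manual calls below work by reading its f and args fields directly and substituting the TypeA instances for the vectors. Inspecting the object built above (continuing the same session):

bc.f            # + (the outer operation)
bc.args[1]      # vec1
bc.args[2]      # nested lazy Broadcasted wrapping * with arguments (vec2, 2)
bc.args[2].f    # *
bc.args[2].args # (vec2, 2)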

# v1 + v2
# +(v1, v2; res=vres)
# ProfileSVG.@profview @benchmark v1 + v2
# ProfileSVG.@profview @benchmark +(v1, v2; res=vres)

@benchmark bc.f(v1, bc.args[2].f(v2, 2; res=vres); res=vres)
# BenchmarkTools.Trial: 10000 samples with 121 evaluations.
#  Range (min … max):  750.413 ns …  28.322 μs  ┊ GC (min … max): 0.00% … 95.81%
#  Time  (median):     776.860 ns               ┊ GC (median):    0.00%
#  Time  (mean ± σ):   965.603 ns ± 623.786 ns  ┊ GC (mean ± σ):  1.25% ±  2.31%
#   █▄▄▂▃▃▃▂▂▁ ▁▁▂▁▁▁▁▁▁      ▁                                   ▁
#   ████████████████████████████▇▇▇██▇███▇▇▇▇▆▅▆▆▆▆▅▅▆▅▅▄▄▅▅▅▄▄▃▅ █
#   750 ns        Histogram: log(frequency) by time        2.4 μs <
#  Memory estimate: 256 bytes, allocs estimate: 10.
@benchmark bc.f(v1, bc.args[2].f(v2, 2))
# BenchmarkTools.Trial: 10000 samples with 10 evaluations.
#  Range (min … max):  1.060 μs … 551.860 μs  ┊ GC (min … max):  0.00% … 97.86%
#  Time  (median):     1.270 μs               ┊ GC (median):     0.00%
#  Time  (mean ± σ):   2.205 μs ±  10.057 μs  ┊ GC (mean ± σ):  17.62% ±  4.13%
#   ██▄▄ ▃▄▇▅▃▃▁                                                ▂
#   ████████████▇▇▆▅▆▆▆▆▆▆▅▅▄▅▅▅▆▆▆▆▆▅▆▆▆▆▅▅▆█▇█▇▆▆▅▅▄▂▄▃▃▄▄▃▂▄ █
#   1.06 μs      Histogram: log(frequency) by time       8.9 μs <
#  Memory estimate: 8.38 KiB, allocs estimate: 9.

res = bc.f(v1, bc.args[2].f(v2, 2)) 
bc.f(v1, bc.args[2].f(v2, 2; res=vres); res=vres)

res == vres # true: only the inner result aliases vres, so reusing a single destination is safe here

bc = Base.broadcasted(+, Base.broadcasted(*, vec1, 3), Base.broadcasted(*, vec2, 2))
res = bc.f(bc.args[1].f(v1, 3), bc.args[2].f(v2, 2)) 
bc.f(bc.args[1].f(v1, 3; res=vres), bc.args[2].f(v2, 2; res=vres); res=vres)

res == vres # false
# res.value == 7
# vres.value == 8: it computes 4 + 4, because both inner results were written into
# the same vres, so the outer + only sees the value of the last one (v2 * 2 == 4)

# After swapping the two operands, the last value written into vres is v1 * 3 == 3
bc.f(bc.args[2].f(v2, 2; res=vres), bc.args[1].f(v1, 3; res=vres); res=vres)
# vres.value == 6, i.e. 3 + 3
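
A possible workaround, sketched below assuming one extra preallocated buffer (a hypothetical vtmp) is acceptable, is to give each inner operation its own destination so that the intermediate results no longer alias:

vtmp = typeA(-1.)  # hypothetical second preallocated buffer

# Each inner * writes into its own destination, so the outer + sees 3 and 4
# instead of twice the last value written into a shared buffer.
bc.f(bc.args[1].f(v1, 3; res=vres), bc.args[2].f(v2, 2; res=vtmp); res=vres)
# vres.value == 7, matching the allocating evaluation above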

# Can the flattened function receive the optional res argument?
# bcf = Broadcast.flatten(bc)

# @allocated bcf.f(vec1,vec2,3) # 1+2*3 
# @allocated bcf.f(vec1[1],vec2[1],3) # 1+2*3 

# @allocated bcf.f(v1,v2,3)
# @allocated bcf.f(v1,v2,3;res=vres) # not working
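
For reference, a minimal sketch using the current bc (the one with two nested products, whose flattened form takes four positional arguments); the keyword behaviour matches the "not working" note above:

bcf = Broadcast.flatten(bc)  # flattened arguments: (vec1, 3, vec2, 2)

# Positional calls dispatch to the TypeA methods defined above, but each inner
# call falls back to the default res = similar(...), so it still allocates.
bcf.f(v1, 3, v2, 2)          # TypeA with value 1*3 + 2*2 == 7

# The closure generated by flatten only accepts positional arguments, so the
# res keyword cannot be forwarded to the inner + and * methods.
# bcf.f(v1, 3, v2, 2; res=vres)  # errors: unsupported keyword argument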
paraynaud commented 2 years ago

19