# First attempt at making element-wise operations allocation-free.
# The test uses a hand-made structure, `TypeA`, which allocates a lot whenever `similar` is called.
using ProfileSVG, BenchmarkTools
"""
    TypeA{T}

Mutable test structure whose `similar` is deliberately expensive because it has to
allocate a fresh random vector.

# Fields
- `value::T`: the scalar the arithmetic methods operate on.
- `compteur::Int`: counter incremented by the arithmetic methods each time the object
  takes part in an operation.
- `n::Int`: number of elements requested for `rd_vec`.
- `rd_vec::Vector{T}`: random payload; it only exists to make allocation costly.
"""
mutable struct TypeA{T}
    value::T
    compteur::Int
    n::Int
    rd_vec::Vector{T}
end

"""
    typeA(val::T, n::Integer=500) where {T<:Number}

Build a `TypeA{T}` holding `val`, a counter set to zero and a payload of `n` random
values of type `T`.

# Throws
- `ArgumentError`: if `n` is negative.
"""
function typeA(val::T, n::Integer=500) where {T<:Number}
    n >= 0 || throw(ArgumentError("n must be non-negative, got $n"))
    len = Int(n)
    return TypeA{T}(val, 0, len, rand(T, len))
end

"""
    similar(x::TypeA{T}) where {T<:Number}

Return a new `TypeA{T}` shaped like `x`: same `n`, a freshly allocated random payload
of the same length as `x.rd_vec`, `value` set to `T(-1)` and `compteur` set to zero.
"""
function Base.similar(x::TypeA{T}) where {T<:Number}
    # Mirror the source's size instead of assuming the default of 500, so objects
    # built with a different payload length get a result of matching shape.
    return TypeA{T}(T(-1), 0, x.n, rand(T, length(x.rd_vec)))
end
# Shared operands (`v1`, `v2`) and the reusable result buffer (`vres`) used by the
# benchmarks and experiments in this script.
v1, v2, vres = map(typeA, (1.0, 2.0, -1.0))
import Base.+
"""
    +(v1::TypeA{T}, v2::TypeA{T}; res=similar(v1)) where {T<:Number}

Store `v1.value + v2.value` in `res.value` and return `res`.

The `compteur` field of `v1`, `v2` and `res` is incremented by one, so the call
mutates all three objects.

# Keywords
- `res`: destination object. Defaults to `similar(v1)`, which allocates; pass an
  existing object with `value` and `compteur` fields to reuse it instead.
"""
function Base.:+(v1::TypeA{T}, v2::TypeA{T}; res=similar(v1)) where {T<:Number}
    total = v1.value + v2.value
    res.value = total
    v1.compteur = v1.compteur + 1
    v2.compteur = v2.compteur + 1
    res.compteur = res.compteur + 1
    return res
end
# The globals are interpolated with `$` so that access to non-constant globals is not
# part of the measurement. The output recorded below predates this change.
@benchmark $v1 + $v2
# BenchmarkTools.Trial: 10000 samples with 206 evaluations.
# Range (min … max): 472.330 ns … 48.035 μs ┊ GC (min … max): 0.00% … 97.47%
# Time (median): 591.262 ns ┊ GC (median): 0.00%
# Time (mean ± σ): 970.916 ns ± 1.827 μs ┊ GC (mean ± σ): 21.73% ± 12.12%
# █▅▅▄▃▃▂▁ ▁
# ███████████▇▇▆▅▅▅▅▄▁▃▃▁▃▃▃▃▁▁▁▃▁▄▄▁▁▃▃▄▁▄▄▃▁▅▅▅▄▆▅▅▆▅▆▅▅▁▄▅▅ █
# 472 ns Histogram: log(frequency) by time 11.1 μs <
# Memory estimate: 4.11 KiB, allocs estimate: 2.
# The globals are interpolated with `$` so that access to non-constant globals is not
# part of the measurement. The output recorded below predates this change.
@benchmark +($v1, $v2; res=$vres)
# BenchmarkTools.Trial: 10000 samples with 849 evaluations.
# Range (min … max): 141.696 ns … 1.973 μs ┊ GC (min … max): 0.00% … 0.00%
# Time (median): 150.766 ns ┊ GC (median): 0.00%
# Time (mean ± σ): 175.039 ns ± 64.511 ns ┊ GC (mean ± σ): 0.43% ± 2.08%
# ▅█▆▅▁▂▃▃▂ ▁▂▂▁▁▂▂▁▁ ▁▁ ▁
# ██████████████████████▇██████████▇█▇▇▇▇▇▇▅▆▆▆▆▆▇▅▄▅▄▅▄▅▄▅▅▅▄ █
# 142 ns Histogram: log(frequency) by time 390 ns <
# Memory estimate: 32 bytes, allocs estimate: 2.
import Base.*
"""
    *(v1::TypeA{T}, v2::Y; res=similar(v1)) where {T<:Number,Y<:Number}

Store `v1.value * v2` in `res.value` and return `res`.

The `compteur` field of `v1` and `res` is incremented by one, so the call mutates
both objects.

# Keywords
- `res`: destination object. Defaults to `similar(v1)`, which allocates; pass an
  existing object with `value` and `compteur` fields to reuse it instead.
"""
function Base.:*(v1::TypeA{T}, v2::Y; res=similar(v1)) where {T<:Number,Y<:Number}
    scaled = v1.value * v2
    res.value = scaled
    v1.compteur = v1.compteur + 1
    res.compteur = res.compteur + 1
    return res
end
# The global is interpolated with `$` so that access to a non-constant global is not
# part of the measurement. The output recorded below predates this change.
@benchmark $v1 * 2
# BenchmarkTools.Trial: 10000 samples with 208 evaluations.
# Range (min … max): 468.750 ns … 39.772 μs ┊ GC (min … max): 0.00% … 93.51%
# Time (median): 578.365 ns ┊ GC (median): 0.00%
# Time (mean ± σ): 955.533 ns ± 1.769 μs ┊ GC (mean ± σ): 21.72% ± 12.26%
# █▅▄▄▃▃▂▁▁ ▁
# ██████████▇▇▇▆▆▄▅▄▅▄▃▄▁▁▁▄▁▁▃▁▁▁▃▁▁▁▃▁▃▃▁▄▃▃▃▄▄▄▄▅▄▆▅▆▅▆▅▆▆▆ █
# 469 ns Histogram: log(frequency) by time 10.5 μs <
# Memory estimate: 4.11 KiB, allocs estimate: 2.
# The globals are interpolated with `$` so that access to non-constant globals is not
# part of the measurement. The output recorded below predates this change.
@benchmark *($v1, 2, res=$vres)
# BenchmarkTools.Trial: 10000 samples with 832 evaluations.
# Range (min … max): 142.188 ns … 2.481 μs ┊ GC (min … max): 0.00% … 89.60%
# Time (median): 152.644 ns ┊ GC (median): 0.00%
# Time (mean ± σ): 179.654 ns ± 69.919 ns ┊ GC (mean ± σ): 0.39% ± 1.91%
# ▄█▆▃▂▃▄▃▁▁▁▂▂▁▂▃▂▁▁ ▁ ▁ ▁ ▁
# ████████████████████████████████▇██▇▇▆▇▇▆▆▆▅▅▅▆▅▅▅▅▅▅▆▅▅▄▄▃▅ █
# 142 ns Histogram: log(frequency) by time 422 ns <
# Memory estimate: 32 bytes, allocs estimate: 2.
import Base.==
"""
    ==(v1::TypeA{T}, v2::TypeA{T}) where {T<:Number}

Return `true` when both objects carry equal `value` fields. The `compteur`, `n` and
`rd_vec` fields are ignored.
"""
function Base.:(==)(v1::TypeA{T}, v2::TypeA{T}) where {T<:Number}
    return v1.value == v2.value
end

# Keep `hash` consistent with the value-based `==` above, so that objects comparing
# equal also collide in `Dict`, `Set` and `unique`.
Base.hash(v::TypeA{T}, h::UInt) where {T<:Number} = hash(v.value, hash(:TypeA, h))
# Lazy broadcast tree for `vec1 .+ vec2 .* 2`. Only its structure (`f` and `args`) is
# reused afterwards to drive the `TypeA` operators by hand.
vec1, vec2 = rand(5), rand(5)
bc = Broadcast.broadcasted(+, vec1, Broadcast.broadcasted(*, vec2, 2))
# v1 + v2
# +(v1, v2; res=vres)
# ProfileSVG.@profview @benchmark v1 + v2
# ProfileSVG.@profview @benchmark +(v1, v2; res=vres)
# The globals are interpolated with `$` so that access to non-constant globals is not
# part of the measurement. The output recorded below predates this change.
@benchmark $(bc.f)($v1, $(bc.args[2].f)($v2, 2; res=$vres); res=$vres)
# BenchmarkTools.Trial: 10000 samples with 121 evaluations.
# Range (min … max): 750.413 ns … 28.322 μs ┊ GC (min … max): 0.00% … 95.81%
# Time (median): 776.860 ns ┊ GC (median): 0.00%
# Time (mean ± σ): 965.603 ns ± 623.786 ns ┊ GC (mean ± σ): 1.25% ± 2.31%
# █▄▄▂▃▃▃▂▂▁ ▁▁▂▁▁▁▁▁▁ ▁ ▁
# ████████████████████████████▇▇▇██▇███▇▇▇▇▆▅▆▆▆▆▅▅▆▅▅▄▄▅▅▅▄▄▃▅ █
# 750 ns Histogram: log(frequency) by time 2.4 μs <
# Memory estimate: 256 bytes, allocs estimate: 10.
# The globals are interpolated with `$` so that access to non-constant globals is not
# part of the measurement. The output recorded below predates this change.
@benchmark $(bc.f)($v1, $(bc.args[2].f)($v2, 2))
# BenchmarkTools.Trial: 10000 samples with 10 evaluations.
# Range (min … max): 1.060 μs … 551.860 μs ┊ GC (min … max): 0.00% … 97.86%
# Time (median): 1.270 μs ┊ GC (median): 0.00%
# Time (mean ± σ): 2.205 μs ± 10.057 μs ┊ GC (mean ± σ): 17.62% ± 4.13%
# ██▄▄ ▃▄▇▅▃▃▁ ▂
# ████████████▇▇▆▅▆▆▆▆▆▆▅▅▄▅▅▅▆▆▆▆▆▅▆▆▆▆▅▅▆█▇█▇▆▆▅▅▄▂▄▃▃▄▄▃▂▄ █
# 1.06 μs Histogram: log(frequency) by time 8.9 μs <
# Memory estimate: 8.38 KiB, allocs estimate: 9.
# Case 1: `v1 + v2 * 2`. Reference result from the allocating path.
res = bc.f(v1, bc.args[2].f(v2, 2))
# Same expression with `vres` reused for the intermediate product and the final sum.
# There is a single intermediate, so `vres` is read before it is overwritten.
bc.f(v1, bc.args[2].f(v2, 2; res=vres); res=vres)
# Compares the `value` fields of the allocating and in-place results.
res == vres
# Case 2: `v1 * 3 + v2 * 2`, i.e. both operands of `+` are intermediates.
bc = Base.broadcasted(+, Base.broadcasted(*, vec1, 3), Base.broadcasted(*, vec2, 2))
# Reference result from the allocating path: 1 * 3 + 2 * 2.
res = bc.f(bc.args[1].f(v1, 3), bc.args[2].f(v2, 2))
# Both products are written into the same `vres`, so the second product overwrites the
# first before `+` runs, and `+` receives the same object for both operands.
bc.f(bc.args[1].f(v1, 3; res=vres), bc.args[2].f(v2, 2; res=vres); res=vres)
res == vres # false
# res.value == 7
# vres.value == 8, i.e. it computes 4 + 4, 4 being the result of the last-evaluated operand
# After swapping the two operands, the last-evaluated product is 1 * 3 instead:
bc.f(bc.args[2].f(v2, 2; res=vres), bc.args[1].f(v1, 3; res=vres); res=vres)
# vres.value == 6, i.e. 3 + 3
# The flatten function can receive the optional argument
# bcf = Broadcast.flatten(bc)
# @allocated bcf.f(vec1,vec2,3) # 1+2*3
# @allocated bcf.f(vec1[1],vec2[1],3) # 1+2*3
# @allocated bcf.f(v1,v2,3)
# @allocated bcf.f(v1,v2,3;res=vres) # not working
# First attempt at making element-wise operations allocation-free. The test uses a hand-made structure, `TypeA`, which allocates a lot whenever `similar` is called.