FastBroadcast.jl exports the macro `@..`, which compiles broadcast expressions
into loops that are easier for the compiler to optimize.
julia> using FastBroadcast
julia> function fast_foo9(a, b, c, d, e, f, g, h, i)
@.. a = b + 0.1 * (0.2c + 0.3d + 0.4e + 0.5f + 0.6g + 0.6h + 0.6i)
nothing
end
fast_foo9 (generic function with 1 method)
julia> function foo9(a, b, c, d, e, f, g, h, i)
@. a = b + 0.1 * (0.2c + 0.3d + 0.4e + 0.5f + 0.6g + 0.6h + 0.6i)
nothing
end
foo9 (generic function with 1 method)
julia> a, b, c, d, e, f, g, h, i = [rand(100, 100, 2) for i in 1:9];
julia> using BenchmarkTools
julia> @btime fast_foo9($a, $b, $c, $d, $e, $f, $g, $h, $i);
19.902 μs (0 allocations: 0 bytes)
julia> @btime foo9($a, $b, $c, $d, $e, $f, $g, $h, $i);
81.457 μs (0 allocations: 0 bytes)
It's important to note that FastBroadcast doesn't speed up "dynamic broadcast", i.e. broadcasts whose arguments are neither scalars nor arrays with equal axes. For example, dynamic broadcast happens when singleton dimensions have to be expanded. As the timings below show, `@..` can even be slower than `@.` in that case:
julia> b = [1.0];
julia> @btime foo9($a, $b, $c, $d, $e, $f, $g, $h, $i);
70.634 μs (0 allocations: 0 bytes)
julia> @btime fast_foo9($a, $b, $c, $d, $e, $f, $g, $h, $i);
131.470 μs (0 allocations: 0 bytes)
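The macro `@..` of FastBroadcast.jl accepts a keyword argument `thread` that
determines whether the broadcast call should use threading (disabled by
default). The value can be a literal `true`/`false` or a variable; in the
example below the variable is passed as `FastBroadcast.True()` or
`FastBroadcast.False()`. You can use it as follows (after starting Julia with
multiple threads):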
julia> using FastBroadcast
julia> function foo_serial!(dest, src)
@.. thread=false dest = log(src)
end
foo_serial! (generic function with 1 method)
julia> function foo_parallel!(dest, src)
@.. thread=true dest = log(src)
end
foo_parallel! (generic function with 1 method)
julia> function foo_maybe_parallel!(dest, src, thread)
@.. thread=thread dest = log(src)
end
foo_maybe_parallel! (generic function with 1 method)
julia> src = rand(10^4); dest = similar(src);
julia> @btime foo_serial!($dest, $src);
50.860 μs (0 allocations: 0 bytes)
julia> @btime foo_parallel!($dest, $src);
17.245 μs (1 allocation: 48 bytes)
julia> @btime foo_maybe_parallel!($dest, $src, $FastBroadcast.False());
51.682 μs (0 allocations: 0 bytes)
julia> @btime foo_maybe_parallel!($dest, $src, $FastBroadcast.True());
17.360 μs (1 allocation: 48 bytes)