JuliaArrays / StaticArrays.jl

Statically sized arrays for Julia

3x3 matrix multiply could potentially be faster #512

Open KristofferC opened 5 years ago

KristofferC commented 5 years ago

This issue is mostly to document that there is a potential speedup to be gained in e.g. 3x3 matrix multiply (and possibly other operations where the matrix size is "odd").

Current StaticArrays.jl:

julia> for n in (2,3,4)
           s = Ref(rand(SMatrix{n,n}))
           @btime $(s)[] * $(s)[]
       end
  2.709 ns (0 allocations: 0 bytes)
  10.274 ns (0 allocations: 0 bytes)
  6.059 ns (0 allocations: 0 bytes)

3x3 is quite slow (the Ref shenanigans are there to prevent spurious benchmark optimizations).

Handcoding something with SIMD.jl:

using StaticArrays
import SIMD
const SVec{N, T} = SIMD.Vec{N, T}

# load given range of linear indices into SVec
@generated function tosimd(D::NTuple{N, T}, ::Type{Val{strt}}, ::Type{Val{stp}}) where {N, T, strt, stp}
    expr = Expr(:tuple, [:(D[$i]) for i in strt:stp]...)
    M = length(expr.args)
    return quote
        $(Expr(:meta, :inline))
        @inbounds return SVec{$M, T}($expr)
    end
end

# construct an SMatrix from several SVecs
@generated function (::Type{SMatrix{dim, dim}})(r::NTuple{M, SVec{N}}) where {dim, M, N}
    return quote
        $(Expr(:meta, :inline))
        @inbounds return SMatrix{$dim, $dim}($(Expr(:tuple, [:(r[$j][$i]) for i in 1:N, j in 1:M]...)))
    end
end

function matmul3x3(a::SMatrix, b::SMatrix)
    D1 = a.data; D2 = b.data
    SV11 = tosimd(D1, Val{1}, Val{3})
    SV12 = tosimd(D1, Val{4}, Val{6})
    SV13 = tosimd(D1, Val{7}, Val{9})
    r1 = muladd(SV13, D2[3], muladd(SV12, D2[2], SV11 * D2[1]))
    r2 = muladd(SV13, D2[6], muladd(SV12, D2[5], SV11 * D2[4]))
    r3 = muladd(SV13, D2[9], muladd(SV12, D2[8], SV11 * D2[7]))
    return SMatrix{3,3}((r1, r2, r3))
end
julia> s = rand(SMatrix{3,3});

julia> @btime matmul3x3($(Ref(s))[], $(Ref(s))[])
  4.391 ns (0 allocations: 0 bytes)

julia> matmul3x3(s,s) - s*s
3×3 SArray{Tuple{3,3},Float64,2,9}:
 0.0  0.0   0.0        
 0.0  0.0  -1.11022e-16
 0.0  0.0   0.0        

which is a ~2.5x speedup.

The generated code for the handcoded version is:

julia> @code_native matmul3x3(s,s)
    .section    __TEXT,__text,regular,pure_instructions
    vmovupd (%rsi), %xmm0
    vmovsd  16(%rsi), %xmm1         ## xmm1 = mem[0],zero
    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    vmovupd 24(%rsi), %xmm1
    vmovsd  40(%rsi), %xmm2         ## xmm2 = mem[0],zero
    vinsertf128 $1, %xmm2, %ymm1, %ymm1
    vmovupd 48(%rsi), %xmm2
    vmovsd  64(%rsi), %xmm3         ## xmm3 = mem[0],zero
    vinsertf128 $1, %xmm3, %ymm2, %ymm2
    vbroadcastsd    (%rdx), %ymm3
    vmulpd  %ymm3, %ymm0, %ymm3
    vbroadcastsd    8(%rdx), %ymm4
    vfmadd213pd %ymm3, %ymm1, %ymm4
    vbroadcastsd    16(%rdx), %ymm3
    vfmadd213pd %ymm4, %ymm2, %ymm3
    vbroadcastsd    24(%rdx), %ymm4
    vmulpd  %ymm4, %ymm0, %ymm4
    vbroadcastsd    32(%rdx), %ymm5
    vfmadd213pd %ymm4, %ymm1, %ymm5
    vbroadcastsd    40(%rdx), %ymm4
    vfmadd213pd %ymm5, %ymm2, %ymm4
    vbroadcastsd    48(%rdx), %ymm5
    vmulpd  %ymm5, %ymm0, %ymm0
    vbroadcastsd    56(%rdx), %ymm5
    vfmadd213pd %ymm0, %ymm1, %ymm5
    vbroadcastsd    64(%rdx), %ymm0
    vfmadd213pd %ymm5, %ymm2, %ymm0
    vbroadcastsd    %xmm4, %ymm1
    vblendpd    $8, %ymm1, %ymm3, %ymm1 ## ymm1 = ymm3[0,1,2],ymm1[3]
    vmovupd %ymm1, (%rdi)
    vextractf128    $1, %ymm0, %xmm1
    vinsertf128 $1, %xmm0, %ymm0, %ymm0
    vpermpd $233, %ymm4, %ymm2      ## ymm2 = ymm4[1,2,2,3]
    vblendpd    $12, %ymm0, %ymm2, %ymm0 ## ymm0 = ymm2[0,1],ymm0[2,3]
    vmovupd %ymm0, 32(%rdi)
    vmovlpd %xmm1, 64(%rdi)
    movq    %rdi, %rax
    vzeroupper
    retq
    nopw    %cs:(%rax,%rax)
;}

while for the standard * it is:

julia> @code_native s*s
    .section    __TEXT,__text,regular,pure_instructions
    vmovsd  (%rdx), %xmm0           ## xmm0 = mem[0],zero
    vmovsd  8(%rdx), %xmm7          ## xmm7 = mem[0],zero
    vmovsd  16(%rdx), %xmm6         ## xmm6 = mem[0],zero
    vmovsd  16(%rsi), %xmm11        ## xmm11 = mem[0],zero
    vmovsd  40(%rsi), %xmm12        ## xmm12 = mem[0],zero
    vmovsd  64(%rsi), %xmm9         ## xmm9 = mem[0],zero
    vmovsd  24(%rdx), %xmm3         ## xmm3 = mem[0],zero
    vmovupd (%rsi), %xmm4
    vmovupd 8(%rsi), %xmm10
    vmovhpd (%rsi), %xmm11, %xmm5   ## xmm5 = xmm11[0],mem[0]
    vinsertf128 $1, %xmm5, %ymm4, %ymm5
    vunpcklpd   %xmm3, %xmm0, %xmm1 ## xmm1 = xmm0[0],xmm3[0]
    vmovddup    %xmm0, %xmm0    ## xmm0 = xmm0[0,0]
    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    vmulpd  %ymm0, %ymm5, %ymm0
    vmovsd  32(%rdx), %xmm5         ## xmm5 = mem[0],zero
    vmovupd 24(%rsi), %xmm8
    vmovhpd 24(%rsi), %xmm12, %xmm1 ## xmm1 = xmm12[0],mem[0]
    vinsertf128 $1, %xmm1, %ymm8, %ymm1
    vunpcklpd   %xmm5, %xmm7, %xmm2 ## xmm2 = xmm7[0],xmm5[0]
    vmovddup    %xmm7, %xmm7    ## xmm7 = xmm7[0,0]
    vinsertf128 $1, %xmm2, %ymm7, %ymm2
    vmulpd  %ymm2, %ymm1, %ymm1
    vaddpd  %ymm1, %ymm0, %ymm1
    vmovsd  40(%rdx), %xmm0         ## xmm0 = mem[0],zero
    vmovhpd 48(%rsi), %xmm9, %xmm2  ## xmm2 = xmm9[0],mem[0]
    vmovupd 48(%rsi), %xmm13
    vinsertf128 $1, %xmm2, %ymm13, %ymm2
    vunpcklpd   %xmm0, %xmm6, %xmm7 ## xmm7 = xmm6[0],xmm0[0]
    vmovddup    %xmm6, %xmm6    ## xmm6 = xmm6[0,0]
    vinsertf128 $1, %xmm7, %ymm6, %ymm6
    vmulpd  %ymm6, %ymm2, %ymm2
    vaddpd  %ymm2, %ymm1, %ymm14
    vmovsd  48(%rdx), %xmm1         ## xmm1 = mem[0],zero
    vmovsd  56(%rdx), %xmm2         ## xmm2 = mem[0],zero
    vmovsd  64(%rdx), %xmm6         ## xmm6 = mem[0],zero
    vinsertf128 $1, %xmm4, %ymm10, %ymm4
    vmovddup    %xmm3, %xmm3    ## xmm3 = xmm3[0,0]
    vmovddup    %xmm1, %xmm7    ## xmm7 = xmm1[0,0]
    vinsertf128 $1, %xmm7, %ymm3, %ymm3
    vmulpd  %ymm3, %ymm4, %ymm3
    vmovupd 32(%rsi), %xmm4
    vinsertf128 $1, %xmm8, %ymm4, %ymm4
    vmovddup    %xmm5, %xmm5    ## xmm5 = xmm5[0,0]
    vmovddup    %xmm2, %xmm7    ## xmm7 = xmm2[0,0]
    vinsertf128 $1, %xmm7, %ymm5, %ymm5
    vmulpd  %ymm5, %ymm4, %ymm4
    vaddpd  %ymm4, %ymm3, %ymm3
    vmovupd 56(%rsi), %xmm4
    vinsertf128 $1, %xmm13, %ymm4, %ymm4
    vmovddup    %xmm0, %xmm0    ## xmm0 = xmm0[0,0]
    vmovddup    %xmm6, %xmm5    ## xmm5 = xmm6[0,0]
    vinsertf128 $1, %xmm5, %ymm0, %ymm0
    vmulpd  %ymm0, %ymm4, %ymm0
    vaddpd  %ymm0, %ymm3, %ymm0
    vmulsd  %xmm1, %xmm11, %xmm1
    vmulsd  %xmm2, %xmm12, %xmm2
    vaddsd  %xmm2, %xmm1, %xmm1
    vmulsd  %xmm6, %xmm9, %xmm2
    vaddsd  %xmm2, %xmm1, %xmm1
    vmovupd %ymm14, (%rdi)
    vmovupd %ymm0, 32(%rdi)
    vmovsd  %xmm1, 64(%rdi)
    movq    %rdi, %rax
    vzeroupper
    retq
    nop
;}

We can see how much the xmm registers are used here compared to the SIMD version. Comparing to the 4x4 case:

julia> s = rand(SMatrix{4,4})

julia> @code_native s*s
    .section    __TEXT,__text,regular,pure_instructions
    vbroadcastsd    (%rdx), %ymm4
    vmovupd (%rsi), %ymm3
    vmovupd 32(%rsi), %ymm2
    vmovupd 64(%rsi), %ymm1
    vmovupd 96(%rsi), %ymm0
    vmulpd  %ymm4, %ymm3, %ymm4
    vbroadcastsd    8(%rdx), %ymm5
    vmulpd  %ymm5, %ymm2, %ymm5
    vaddpd  %ymm5, %ymm4, %ymm4
        ...
    vbroadcastsd    96(%rdx), %ymm7
    vmulpd  %ymm7, %ymm3, %ymm3
    vbroadcastsd    104(%rdx), %ymm7
    vmulpd  %ymm7, %ymm2, %ymm2
    vaddpd  %ymm2, %ymm3, %ymm2
    vbroadcastsd    112(%rdx), %ymm3
    vmulpd  %ymm3, %ymm1, %ymm1
    vaddpd  %ymm1, %ymm2, %ymm1
    vbroadcastsd    120(%rdx), %ymm2
    vmulpd  %ymm2, %ymm0, %ymm0
    vaddpd  %ymm0, %ymm1, %ymm0
    vmovupd %ymm4, (%rdi)
    vmovupd %ymm5, 32(%rdi)
    vmovupd %ymm6, 64(%rdi)
    vmovupd %ymm0, 96(%rdi)
    movq    %rdi, %rax
    vzeroupper
    retq
    nopl    (%rax)

we can now see that everything is using the ymm registers, which is what we want.

andyferris commented 5 years ago

Very nice!

Do you think this is something we should import SIMD.jl for, or could we rather specialize the 3x3 matrix multiplication case? What happens with SIMD and large matrices (when SIMD.jl was brand new, I heard LLVM used to crash for large odd LLVM vector sizes like 7... is that fixed)? What about when the eltype is not a SIMD type?

Also, is the 3x3 matrix * 3 vector code we generate optimal, or does it suffer from similar issues?

KristofferC commented 5 years ago

Not sure what the best thing to do is. FWIW, this just seems to be the SLP vectorizer doing a bad job on the code; it would be interesting to write the loop in C++ and see what code is generated.

We should only dispatch to SIMD-stuff if the eltype is one of the native LLVM types.
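
A minimal sketch of what that gate could look like (hypothetical helper, not an existing StaticArrays method), reusing matmul3x3 from above:

const HWEltypes = Union{Float32, Float64}    # assumption: hardware-native LLVM SIMD types

simd_eligible(::Type{T}) where {T} = T <: HWEltypes

# route eligible 3x3 multiplies to the SIMD kernel, everything else to the generic *
mul3x3(a::SMatrix{3,3,T}, b::SMatrix{3,3,T}) where {T} =
    simd_eligible(T) ? matmul3x3(a, b) : a * b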

Regarding matvec, it currently does not get vectorized at all and forcing it to do so doesn't seem to give any real benefit.

I'm not sure if opening the SIMD.jl door is a good idea or not. Would be cool to see where more performance can be squeezed out but explicit simd is arguably a bit messy.

chriselrod commented 5 years ago

We can do much better at many different sizes! I benchmarked padded statically sized arrays vs StaticArrays.jl here. Padding allowed applying optimized matrix multiplication kernels, which are much faster.

Now, as the next step, SIMD.jl provides masked load and store operations. I stole all the code from SIMD.jl, and also switched it to simply use NTuple{N,Core.VecElement{T}} rather than wrapping it in a single-field struct. This has the advantage of being easier on LLVM: SIMD.jl results in redundant mov operations that do not always get eliminated by the compiler, and it also results in more segmentation faults when used with libraries like SLEEF. The disadvantage is that extending any Base methods would be type piracy. Therefore SIMDPirates.jl does not extend any Base methods, instead defining functions like vmul for multiplication. I added an @pirate macro that will replace expressions like a * b with vmul(a, b), in case someone prefers the nice overloaded syntax of SIMD.jl. It is untested, so I won't promise it works.
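
As a rough illustration of the difference in use (hedged; vmul on tuples of Core.VecElement is the same call used in the kernel further down):

using SIMDPirates

va = (Core.VecElement(1.0), Core.VecElement(2.0), Core.VecElement(3.0), Core.VecElement(4.0))
vb = (Core.VecElement(5.0), Core.VecElement(6.0), Core.VecElement(7.0), Core.VecElement(8.0))
vc = SIMDPirates.vmul(va, vb)   # named function instead of overloading Base.:*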

I would propose an eventual PR incorporating some of what follows, or at least its consideration.

Sorry for the messiness of the following code, but while working on jBLAS.jl I wanted a single code generator to produce the desired kernel for dense matrix multiplication. There, it optionally prefetches the following kernel, decides between initializing the product (D = A*X) or updating it (D += A*X), etc. Much of that machinery can be trimmed away, and odds are the code could be simplified regardless.

I also only wrapped a direct call to the kernel here. Note that this only works so long as at least one register is left unoccupied after the entire product matrix and a full column of matrix A have been placed in the CPU's registers. If the product matrix is larger than that, you'd have to start looping over the kernel and applying it in chunks. I'm not doing that here for simplicity, meaning 8x6 products are the maximum size for which this code is fast on avx2 machines, and 16x14 products are the maximum for avx-512.

I would however recommend looping over the kernel, and only falling back to BLAS for matrices > 100x100. On that note, lots of other performance improvements are possible for MMatrices. That is, simply use new() to initialize undefined MMatrices, and then fill them with a loop. Also, replace the getindex call with an unsafe_load. These are all far faster to compile for large MMatrices; a rough sketch follows.
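
A hedged sketch of those MMatrix suggestions (it assumes the MMatrix undef constructor and pointer, both of which are used later in this thread):

using StaticArrays

# fill an uninitialized MMatrix through its pointer instead of setindex!/getindex
function fill_mmatrix!(f, A::MMatrix{M,N,T}) where {M,N,T}
    p = pointer(A)
    GC.@preserve A begin
        for i in 1:M*N
            unsafe_store!(p, convert(T, f(i)), i)
        end
    end
    A
end

A = MMatrix{8,8,Float64}(undef)
fill_mmatrix!(i -> Float64(i), A)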

Without further ado,

using StaticArrays, BenchmarkTools, SIMDPirates, LinearAlgebra, Base.Cartesian, CpuId
using Core.Intrinsics: llvmcall

# These constants can be defined in a build script, so that CpuId is only a build dependency.
const REGISTER_SIZE = simdbytes()
const CACHELINE_SIZE = cachelinesize()

struct PrefetchA
    A::Int
end
struct PrefetchX
    X::Int
end
struct PrefetchAX
    A::Int
    X::Int
end
prefetch_A(::Any)    = (nothing, nothing)
prefetch_X(::Any)    = (nothing, nothing, nothing)
function prefetch_A(::Type{PF}) where {PF <: Union{PrefetchA, PrefetchAX}}
    (
        :(@nexprs $Qₚ q -> prefetch(pA + pf.A + $CACHELINE_SIZE*(q-1), Val(3), Val(0))),
        :(@nexprs $Qₚ q -> prefetch(pA + pf.A + n*$AD_stride + $CACHELINE_SIZE*(q-1), Val(3), Val(0)))
    )
end
function prefetch_X(::Type{PrefetchX})
    (
        :(@nexprs $Pₖ p -> prefetch(pX + pf.X + (p-1)*$X_stride, Val(3), Val(0))),
        :(@nexprs $Pₖ p -> prefetch(pX + pf.X + n₁*$T_size + (p-1)*$X_stride, Val(3), Val(0))),
        :(prefetch(pX + pf.X + $(N*T_size) + (p-1)*$X_stride, Val(3), Val(0)))
    )
end
@generated function prefetch(address, ::Val{Locality} = Val(1), ::Val{RorW} = Val(0)) where {Locality, RorW}
    prefetch_call_string = """%addr = inttoptr i64 %0 to i8*
    call void @llvm.prefetch(i8* %addr, i32 $RorW, i32 $Locality, i32 1)
    ret void"""
    quote
        $(Expr(:meta, :inline))
        llvmcall(("declare void @llvm.prefetch(i8* , i32 , i32 , i32 )",
        $prefetch_call_string), Cvoid, Tuple{Ptr{Cvoid}}, address)
    end
end
mask_expr(W, r) = :($(Expr(:tuple, [i > r ? Core.VecElement{Bool}(false) : Core.VecElement{Bool}(true) for i ∈ 1:W]...)))

function mulinit(V, WT, Q, Pₖ, X_stride, r, mask_expr, inline_expr, pfA_1)
    if r == 0
        q_load_expr = :(@nexprs $Q q -> vA_q = vload($V, pA + $WT*(q-1)))
    else
        q_load_expr = quote
            @nexprs $(Q-1) q -> vA_q = vload($V, pA + $WT*(q-1))
          $(Symbol(:vA_, Q)) = vload($V, pA + $(WT*(Q-1)),$mask_expr)
      end
    end

    quote
        $inline_expr
        $q_load_expr
        @nexprs $Pₖ p -> begin
            vX = vbroadcast($V, unsafe_load(pX + (p-1)*$X_stride))
            @nexprs $Q q -> Dx_p_q = SIMDPirates.vmul(vA_q, vX)
        end
        $pfA_1
    end
end
function gemminit(V, WT, Q, Pₖ, AD_stride, r, mask_expr, inline_expr)
    if r == 0
        q = quote
            $inline_expr
            @nexprs $Pₖ p -> @nexprs $Q q -> Dx_p_q = vload($V, pD + $WT*(q-1) + $AD_stride*(p-1))
        end
    else
        q = quote
            $inline_expr
        end
        for p ∈ 1:Pₖ
            for q ∈ 1:Q-1
                push!(q.args, :($(Symbol(:Dx_,p,:_,q)) = vload($V, pD + $(WT*(q-1) + AD_stride*(p-1)))))
            end
            push!(q.args, :($(Symbol(:Dx_,p,:_,Q)) = vload($V, pD + $(WT*(Q-1) + AD_stride*(p-1)),$mask_expr)))
        end
    end
    q
end

function kernel_quote(Mₖ,Pₖ,stride_AD,stride_X,N,T,init,inline = false, pf = nothing)
    T_size = sizeof(T)
    AD_stride = stride_AD * T_size
    X_stride = stride_X * T_size
    W = REGISTER_SIZE ÷ T_size
    while W > 2Mₖ
        W >>= 1
    end
    WT = W * T_size
    Q, r = divrem(Mₖ, W) #Assuming Mₖ is a multiple of W
    V = Vec{W,T}
    if r == 0
        mask = :()
        A_load_expr = :(@nexprs $Q q -> vA_q = vload($V, pA + n*$AD_stride + $WT*(q-1)))
        D_store = :(@nexprs $Q q -> vstore(Dx_p_q, pD + $WT*(q-1) + $AD_stride*(p-1)))
    else
        mask = mask_expr(W, r)
        if Q == 0
            Q = 1
            A_load_expr = :($(Symbol(:vA_, Q)) = vload($V, pA + n*$AD_stride + $(WT*(Q-1)), $mask))
        else
            A_load_expr = quote
                @nexprs $Q q -> vA_q = vload($V, pA + n*$AD_stride + $WT*(q-1))
            end
            Q += 1
            push!(A_load_expr.args, :($(Symbol(:vA_, Q)) = vload($V, pA + n*$AD_stride + $(WT*(Q-1)), $mask)))
        end
        D_store = quote
                    @nexprs $(Q-1) q -> vstore(Dx_p_q, pD + $WT*(q-1) + $AD_stride*(p-1))
                    vstore($(Symbol(:Dx_p_, Q)), pD + $(WT*(Q-1)) + $AD_stride*(p-1), $mask)
                end
    end
    C = CACHELINE_SIZE ÷ T_size
    Qₚ = cld(Mₖ, C)
    # Check whether we are prefetching A and/or X.
    pfA_1, pfA_2 = prefetch_A(pf)
    pfX_1, pfX_2, pfX_3 = prefetch_X(pf)
    inline_expr = inline ? Expr(:meta, :inline) : :(nothing)
    if init
        q = mulinit(V, WT, Q, Pₖ, X_stride, r, mask, inline_expr, pfA_1)
    else
        q = gemminit(V, WT, Q, Pₖ, AD_stride, r, mask, inline_expr)
    end

    if pfX_1 == nothing
        push!(q.args,
        quote
            for n ∈ $(init ? 1 : 0):$(N-1)
                $A_load_expr
                @nexprs $Pₖ p -> begin
                    vX = vbroadcast($V, unsafe_load(pX + n*$T_size + (p-1)*$X_stride))
                    @nexprs $Q q -> Dx_p_q = vfma(vA_q, vX, Dx_p_q)
                end
                $pfA_2
                # @nexprs $Qₚ q -> prefetch(pA + pf.A + n*$AD_stride + $CACHELINE_SIZE*(q-1), Val(3), Val(0))
            end
            @nexprs $Pₖ p -> $D_store
            nothing
        end)
    else
        push!(q.args,
        quote
            # @nexprs $Qₚ q -> prefetch(pA + pf.A + $CACHELINE_SIZE*(q-1), Val(3), Val(0))
            for n ∈ $(init ? 1 : 0):$(C-1)
                $A_load_expr
                @nexprs $Pₖ p -> begin
                    vX = vbroadcast($V, unsafe_load(pX + n*$T_size + (p-1)*$X_stride))
                    @nexprs $Q q -> Dx_p_q = vfma(vA_q, vX, Dx_p_q)
                end
                $pfA_2
                # @nexprs $Qₚ q -> prefetch(pA + pf.A + n*$AD_stride + $CACHELINE_SIZE*(q-1), Val(3), Val(0))
            end
            # @nexprs $Pₖ p -> prefetch(pX + pf.X + (p-1)*$X_stride, Val(3), Val(0))
            $pfX_1

            for n₁ ∈ $C:$C:$(N-C)
                for n ∈ n₁:n₁+$(C-1)
                    $A_load_expr
                    @nexprs $Pₖ p -> begin
                        vX = vbroadcast($V, unsafe_load(pX + n*$T_size + (p-1)*$X_stride))
                        @nexprs $Q q -> Dx_p_q = vfma(vA_q, vX, Dx_p_q)
                    end
                    $pfA_2
                    # @nexprs $Qₚ q -> prefetch(pA + pf.A + n*$AD_stride + $CACHELINE_SIZE*(q-1), Val(3), Val(0))
                end
                # @nexprs $Pₖ p -> prefetch(pX + pf.X + n₁*$T_size + (p-1)*$X_stride, Val(3), Val(0))
                $pfX_2
            end
            for n ∈ $(N - (N % C)):$(N-1)
                $A_load_expr
                @nexprs $Pₖ p -> begin
                    vX = vbroadcast($V, unsafe_load(pX + n*$T_size + (p-1)*$X_stride))
                    @nexprs $Q q -> Dx_p_q = vfma(vA_q, vX, Dx_p_q)
                end
                $pfA_2
                # @nexprs $Qₚ q -> prefetch(pA + pf.A + n*$AD_stride + $CACHELINE_SIZE*(q-1), Val(3), Val(0))
            end
            @nexprs $Pₖ p -> begin
                # prefetch(pX + pf.X + $(N*T_size) + (p-1)*$X_stride, Val(3), Val(0))
                $pfX_3
                $D_store
            end
            nothing
        end)
    end
    q
end
@generated function fastmul!(D::MMatrix{M,P,T},A::MMatrix{M,N,T},X::MMatrix{N,P,T}) where {M,N,P,T}
    quote
        pD, pA, pX = pointer(D), pointer(A), pointer(X)
        # Mₖ,Pₖ,stride_AD,stride_X,N,T,init
        $(kernel_quote(M,P,M,N,N,T,true,true))
    end
end

max_cols = REGISTER_SIZE == 32 ? 6 : 14
for n ∈ 2:(REGISTER_SIZE == 32 ? 8 : 16)
    m1 = @MMatrix randn(n,n)
    m2 = @MMatrix randn(n,min(n,max_cols))
    m3 = @MMatrix randn(n,min(n,max_cols))
    @show size(m3), size(m1), size(m2)
    println("MMatrix Multiplication:")
    @btime mul!($m3, $m1, $m2)
    println("SMatrix Multiplication:")
    s3 = @btime $m1 * $m2
    println("fastmul!:")
    @btime fastmul!($m3, $m1, $m2)
    @show m3 - s3
    println("")
end

On a computer with avx-512, this yields:

(size(m3), size(m1), size(m2)) = ((2, 2), (2, 2), (2, 2))
MMatrix Multiplication:
  3.135 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  1.584 ns (0 allocations: 0 bytes)
fastmul!:
  1.896 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0; 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((3, 3), (3, 3), (3, 3))
MMatrix Multiplication:
  7.383 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  8.274 ns (0 allocations: 0 bytes)
fastmul!:
  3.237 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((4, 4), (4, 4), (4, 4))
MMatrix Multiplication:
  10.524 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  3.528 ns (0 allocations: 0 bytes)
fastmul!:
  4.154 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((5, 5), (5, 5), (5, 5))
MMatrix Multiplication:
  19.038 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  22.561 ns (0 allocations: 0 bytes)
fastmul!:
  5.894 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((6, 6), (6, 6), (6, 6))
MMatrix Multiplication:
  30.316 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  39.103 ns (0 allocations: 0 bytes)
fastmul!:
  7.837 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((7, 7), (7, 7), (7, 7))
MMatrix Multiplication:
  51.105 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  62.420 ns (0 allocations: 0 bytes)
fastmul!:
  11.871 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((8, 8), (8, 8), (8, 8))
MMatrix Multiplication:
  36.552 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  12.940 ns (0 allocations: 0 bytes)
fastmul!:
  12.794 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((9, 9), (9, 9), (9, 9))
MMatrix Multiplication:
  68.433 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  73.896 ns (0 allocations: 0 bytes)
fastmul!:
  24.042 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((10, 10), (10, 10), (10, 10))
MMatrix Multiplication:
  106.568 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  109.546 ns (0 allocations: 0 bytes)
fastmul!:
  31.296 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((11, 11), (11, 11), (11, 11))
MMatrix Multiplication:
  161.298 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  151.703 ns (0 allocations: 0 bytes)
fastmul!:
  38.405 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((12, 12), (12, 12), (12, 12))
MMatrix Multiplication:
  210.829 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  216.941 ns (0 allocations: 0 bytes)
fastmul!:
  47.986 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((13, 13), (13, 13), (13, 13))
MMatrix Multiplication:
  315.835 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  302.625 ns (0 allocations: 0 bytes)
fastmul!:
  65.856 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((14, 14), (14, 14), (14, 14))
MMatrix Multiplication:
  466.087 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  396.795 ns (0 allocations: 0 bytes)
fastmul!:
  66.755 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((15, 14), (15, 15), (15, 14))
MMatrix Multiplication:
  548.775 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  450.668 ns (0 allocations: 0 bytes)
fastmul!:
  67.804 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((16, 14), (16, 16), (16, 14))
MMatrix Multiplication:
  545.207 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  83.752 ns (0 allocations: 0 bytes)
fastmul!:
  65.457 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

EDIT: Unfortunately, I'm not sure how we could actually use the masked store instructions with SArrays, since they aren't heap allocated, meaning we can't get pointers. Still, this could make using preallocating MArrays extremely fast.

Anyone more familiar with LLVM know how?

Also, FWIW:

julia> n = 3;

julia> m1 = @MMatrix randn(n,n);

julia> m2 = @MMatrix randn(n,n);

julia> m3 = @MMatrix randn(n,n);

julia> @code_native fastmul!(m3, m1, m2)
    .text
; Function fastmul! {
; Location: REPL[17]:2
    movq    %rsi, -8(%rsp)
    movq    8(%rsi), %rax
    movq    16(%rsi), %rcx
; Function macro expansion; {
; Location: REPL[17]:5
; Function macro expansion; {
; Location: REPL[14]:6
; Function vload; {
; Location: memory.jl:80
; Function vload; {
; Location: memory.jl:80
; Function macro expansion; {
; Location: memory.jl:109
    movb    $7, %dl
    kmovd   %edx, %k1
    vmovupd (%rax), %ymm0 {%k1} {z}
;}}}}
; Function macro expansion; {
; Location: llvmwrap.jl:81
    vmulpd  48(%rcx){1to4}, %ymm0, %ymm1
    vmulpd  24(%rcx){1to4}, %ymm0, %ymm2
    vmulpd  (%rcx){1to4}, %ymm0, %ymm0
    movq    (%rsi), %rdx
;}
; Function macro expansion; {
; Location: REPL[14]:16
; Function macro expansion; {
; Location: REPL[16]:48
; Function vload; {
; Location: memory.jl:80
; Function vload; {
; Location: memory.jl:80
; Function macro expansion; {
; Location: memory.jl:109
    vmovupd 24(%rax), %ymm3 {%k1} {z}
;}}}}
; Function macro expansion; {
; Location: llvmwrap.jl:170
    vfmadd231pd 8(%rcx){1to4}, %ymm3, %ymm0
    vfmadd231pd 32(%rcx){1to4}, %ymm3, %ymm2
    vfmadd231pd 56(%rcx){1to4}, %ymm3, %ymm1
;}
; Function macro expansion; {
; Location: REPL[16]:48
; Function vload; {
; Location: memory.jl:80
; Function vload; {
; Location: memory.jl:80
; Function macro expansion; {
; Location: memory.jl:109
    vmovupd 48(%rax), %ymm3 {%k1} {z}
;}}}
; Location: REPL[16]:51
; Function vfma; {
; Location: floating_point_arithmetic.jl:48
; Function llvmwrap; {
; Location: llvmwrap.jl:148
; Function llvmwrap; {
; Location: llvmwrap.jl:148
; Function macro expansion; {
; Location: llvmwrap.jl:170
    vfmadd231pd 16(%rcx){1to4}, %ymm3, %ymm0
    vfmadd231pd 40(%rcx){1to4}, %ymm3, %ymm2
    vfmadd231pd 64(%rcx){1to4}, %ymm3, %ymm1
;}}}}
; Location: REPL[16]:30
; Function vstore; {
; Location: memory.jl:179
; Function vstore; {
; Location: memory.jl:179
; Function macro expansion; {
; Location: memory.jl:207
    vmovupd %ymm0, (%rdx) {%k1}
    vmovupd %ymm2, 24(%rdx) {%k1}
    vmovupd %ymm1, 48(%rdx) {%k1}
;}}}}}
    movabsq $139651999485960, %rax  # imm = 0x7F0343D25008
    vzeroupper
    retq
;}}

Worth pointing out that a bit of thermal throttling is obvious with these benchmarks. Maybe I should add sleep(X) into the for loop between each mul? For example, the minimum time for 14x14 was 66.755 ns, but when I rerun just that benchmark after letting my computer rest for a few seconds, I get:

julia> n = 14;

julia> m1 = @MMatrix randn(n,n);

julia> m2 = @MMatrix randn(n,n);

julia> m3 = @MMatrix randn(n,n);

julia> @benchmark fastmul!($m3, $m1, $m2)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     61.836 ns (0.00% GC)
  median time:      61.975 ns (0.00% GC)
  mean time:        63.593 ns (0.00% GC)
  maximum time:     102.706 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     981

Not sure what constitutes fair benchmarking -- thermal throttling is something we'd be concerned with in practice. But the function ordered last is likely to suffer more than the others.

EDIT: Interestingly, KristofferC's method is faster, but I can't replicate the performance with MMatrices:

julia> import SIMD

julia> const SVec{N, T} = SIMD.Vec{N, T}
SIMD.Vec{N,T} where T where N

julia> # load given range of linear indices into SVec
       @generated function tosimd(D::NTuple{N, T}, ::Type{Val{strt}}, ::Type{Val{stp}}) where {N, T, strt, stp}
           expr = Expr(:tuple, [:(D[$i]) for i in strt:stp]...)
           M = length(expr.args)
           return quote
               $(Expr(:meta, :inline))
               @inbounds return SVec{$M, T}($expr)
           end
       end
tosimd (generic function with 1 method)

julia> # constructor SMatrix from several SVecs
       @generated function (::Type{SMatrix{dim, dim}})(r::NTuple{M, SVec{N}}) where {dim, M, N}
           return quote
               $(Expr(:meta, :inline))
               @inbounds return SMatrix{$dim, $dim}($(Expr(:tuple, [:(r[$j][$i]) for i in 1:N, j in 1:M]...)))
           end
       end

julia> n = 3;

julia> m1 = @MMatrix randn(n,n);

julia> m2 = @MMatrix randn(n,n);

julia> m3 = @MMatrix randn(n,n);

julia> function matmul3x3(a::MMatrix{3,3,T}, b::MMatrix{3,3,T}) where T
          D1 = a.data; D2 = b.data
          SV11 = tosimd(D1, Val{1}, Val{3})
          SV12 = tosimd(D1, Val{4}, Val{6})
          SV13 = tosimd(D1, Val{7}, Val{9})
          r1 = muladd(SV13, D2[3], muladd(SV12, D2[2], SV11 * D2[1]))
          r2 = muladd(SV13, D2[6], muladd(SV12, D2[5], SV11 * D2[4]))
          r3 = muladd(SV13, D2[9], muladd(SV12, D2[8], SV11 * D2[7]))
          return SMatrix{3,3}((r1, r2, r3))
       end
matmul3x3 (generic function with 2 methods)

julia> function matmul3x3_vstore!(C::MMatrix{3,3,T}, A::MMatrix{3,3,T}, B::MMatrix{3,3,T}) where T
          D1 = A.data; D2 = B.data
          SV11 = tosimd(D1, Val{1}, Val{3})
          SV12 = tosimd(D1, Val{4}, Val{6})
          SV13 = tosimd(D1, Val{7}, Val{9})
          pC = pointer(C)
          SIMD.vstore(muladd(SV13, D2[3], muladd(SV12, D2[2], SV11 * D2[1])), pC)
          SIMD.vstore(muladd(SV13, D2[6], muladd(SV12, D2[5], SV11 * D2[4])), pC + 3sizeof(T))
          SIMD.vstore(muladd(SV13, D2[9], muladd(SV12, D2[8], SV11 * D2[7])), pC + 6sizeof(T))
          return C
       end
matmul3x3_vstore! (generic function with 1 method)

julia> function matmul3x3_unsafe_store!(C::MMatrix{3,3,T}, A::MMatrix{3,3,T}, B::MMatrix{3,3,T}) where T
          D1 = A.data; D2 = B.data
          SV11 = tosimd(D1, Val{1}, Val{3})
          SV12 = tosimd(D1, Val{4}, Val{6})
          SV13 = tosimd(D1, Val{7}, Val{9})
          r1 = muladd(SV13, D2[3], muladd(SV12, D2[2], SV11 * D2[1]))
          r2 = muladd(SV13, D2[6], muladd(SV12, D2[5], SV11 * D2[4]))
          r3 = muladd(SV13, D2[9], muladd(SV12, D2[8], SV11 * D2[7]))
          S = SMatrix{3,3}((r1, r2, r3))
          unsafe_store!(Base.unsafe_convert(Ptr{typeof(S)}, pointer(C)), S)
          return C
       end
matmul3x3_unsafe_store! (generic function with 1 method)

julia> @btime fastmul!($m3, $m1, $m2);
  4.031 ns (0 allocations: 0 bytes)

julia> @btime matmul3x3_vstore!($m3, $m1, $m2);
  12.027 ns (0 allocations: 0 bytes)

julia> @btime matmul3x3($m1, $m2); #KristofferC's function
  2.527 ns (0 allocations: 0 bytes)

julia> @btime matmul3x3_unsafe_store!($m3, $m1, $m2);
  13.783 ns (0 allocations: 0 bytes)

This implies to me that we can do better than the masked load/stores I demonstrated in this post.

KristofferC commented 5 years ago

Seems clang kinda barfs on e.g. a 3x3 pattern as well: https://godbolt.org/z/TGCusg.

It is interesting to note that when the sizes correspond to the width of the SIMD registers, the SLP vectorizer does just as well as the hand-written version.

(size(m3), size(m1), size(m2)) = ((8, 8), (8, 8), (8, 8))
MMatrix Multiplication:
  36.552 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  12.940 ns (0 allocations: 0 bytes)
fastmul!:
  12.794 ns (0 allocations: 0 bytes)

A nice wish would be for LLVM to just improve how it generates code for these odd matrix sizes (or implement something like http://lists.llvm.org/pipermail/llvm-dev/2018-October/126871.html, which presumably will be fast if we emit it).

tkoolen commented 5 years ago

To what extent are these speedups due to using muladd? StaticArrays currently doesn't use muladd at all, and using it constitutes a tradeoff.

ChrisRackauckas commented 5 years ago

and using it constitutes a tradeoff.

How so? At least from my understanding of it, on setups where it can be done on hardware, it is a quicker operation and has less numerical error. Where it can't be done on hardware it just defaults back to the normal way.

tkoolen commented 5 years ago

From ?muladd:

The result can be different on different machines and can also be different on the same machine due to constant propagation or other optimizations.
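
A small illustration of the single- vs double-rounding difference (fma forces the fused form; muladd may compile to either, which is the docstring's point):

julia> x = 1.0 + 2.0^-27; y = 1.0 - 2.0^-27;

julia> x * y - 1.0         # two roundings: x*y rounds to 1.0 first
0.0

julia> fma(x, y, -1.0)     # one rounding keeps the -2^-54 cross term
-5.551115123125783e-17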

chriselrod commented 5 years ago

IIRC, I actually started that Julia session with --math-mode=fast, to let StaticArrays use muladd.

The benefit in my examples was due to better vectorization. LLVM often did a good job for StaticArrays (but not MArrays) whenever the left matrix's row count was an integer multiple of the SIMD vector width. For other sizes, it could be several times slower or more.

KristofferC commented 5 years ago

I always saw not using muladd in StaticArrays as just a missed optimization. Are you saying it is an intended choice? Seems quite out of spirit with other choices made in this package.

tkoolen commented 5 years ago

I don't know if it was intended. I tend to agree.

c42f commented 5 years ago

Highly unlikely that muladd was avoided on purpose (I didn't write that code but I was sitting next to @andyferris when he originally wrote the package :-) )

Given that it's faster and has less rounding error on newer CPUs, a patch to use muladd would be very appropriate for this package, I think.
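
For instance (a hedged sketch, not the package's actual kernel code), the unrolled inner products could emit muladd chains instead of separate * and +, mirroring what matmul3x3 above does with SIMD vectors:

# one entry of a 3x3 product, written as a muladd chain
dot3(a1, a2, a3, b1, b2, b3) = muladd(a3, b3, muladd(a2, b2, a1 * b1))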

chriselrod commented 5 years ago

Here are three fairly simple possibilities. The first two are more or less identical, except one wraps items in VecElements. The third unsuccessfully tries to encourage the compiler to use masked load/stores.

@generated function Base.:*(A::SMatrix{M,N,T}, B::SMatrix{N,P,T}) where {M,N,P,T}
    outtup = Vector{Expr}(undef, M*P)
    i = 0
    for p ∈ 1:P, m ∈ 1:M
        i += 1
        outtup[i] = :(@inbounds $(Symbol(:C_, p))[$m] )
    end
    quote
        $(Expr(:meta, :inline))
        A_col = $(Expr(:tuple, [:(@inbounds A[$m,1]) for m ∈ 1:M]...))
        Base.Cartesian.@nexprs $P p -> begin
            @inbounds B_p = B[1,p]
            C_p = $(Expr(:tuple, [:(@inbounds A_col[$n] * B_p) for n ∈ 1:N]...))
        end
        @fastmath @inbounds for n ∈ 2:$N
            A_col = $(Expr(:tuple, [:(@inbounds A[$m,n]) for m ∈ 1:M]...))
            Base.Cartesian.@nexprs $P p -> begin
                @inbounds B_p = B[n,p]
                C_p = $(Expr(:tuple, [:(@inbounds A_col[$n] * B_p + C_p[$n]) for n ∈ 1:N]...))
            end
        end
        SMatrix{$M,$P,$T}(
            $(Expr(:tuple, outtup...))
        )
    end
end

@generated function Base.:*(A::SMatrix{M,N,T}, B::SMatrix{N,P,T}) where {M,N,P,T}
    outtup = Vector{Expr}(undef, M*P)
    i = 0
    for p ∈ 1:P, m ∈ 1:M
        i += 1
        outtup[i] = :(@inbounds $(Symbol(:C_, p))[$m].value )
    end
    quote
        $(Expr(:meta, :inline))
        A_col = $(Expr(:tuple, [:(@inbounds Core.VecElement(A[$m,1])) for m ∈ 1:M]...))
        Base.Cartesian.@nexprs $P p -> begin
            @inbounds B_p = B[1,p]
            C_p = $(Expr(:tuple, [:(@inbounds Core.VecElement(A_col[$n].value * B_p)) for n ∈ 1:N]...))
        end
        @fastmath @inbounds for n ∈ 2:$N
            A_col = $(Expr(:tuple, [:(@inbounds Core.VecElement(A[$m,n])) for m ∈ 1:M]...))
            Base.Cartesian.@nexprs $P p -> begin
                @inbounds B_p = B[n,p]
                C_p = $(Expr(:tuple, [:(@inbounds Core.VecElement(A_col[$n].value * B_p + C_p[$n].value)) for n ∈ 1:N]...))
            end
        end
        SMatrix{$M,$P,$T}(
            $(Expr(:tuple, outtup...))
        )
    end
end

@generated function Base.:*(A::SMatrix{M,N,T}, B::SMatrix{N,P,T}) where {M,N,P,T}
    outtup = Vector{Expr}(undef, M*P)
    Mpow2 = cld(M,2) * 2
    Mrem = Mpow2 - M
    remaining_zeros = [Core.VecElement(zero(T)) for i ∈ 1:Mrem]
    i = 0
    for p ∈ 1:P, m ∈ 1:M
        i += 1
        outtup[i] = :(@inbounds $(Symbol(:C_, p))[$(m+Mrem)].value )
    end
    quote
        $(Expr(:meta, :inline))
        Adata = A.data
        A_col = $(Expr(:tuple, remaining_zeros..., [:(@inbounds Core.VecElement(Adata[$m])) for m ∈ 1:M]...))
        Base.Cartesian.@nexprs $P p -> begin
            @inbounds B_p = B[1,p]
            C_p = $(Expr(:tuple, [:(@inbounds Core.VecElement(A_col[$m].value * B_p)) for m ∈ 1:Mpow2]...))
        end
        @fastmath @inbounds for n ∈ 2:$N
            A_col = $(Expr(:tuple, [:(@inbounds Core.VecElement(Adata[$(m - Mpow2) + n*$M])) for m ∈ 1:Mpow2]...))
            Base.Cartesian.@nexprs $P p -> begin
                @inbounds B_p = B[n,p]
                C_p = $(Expr(:tuple, [:(@inbounds Core.VecElement(A_col[$m].value * B_p + C_p[$m].value)) for m ∈ 1:Mpow2]...))
            end
        end
        SMatrix{$M,$P,$T}(
            $(Expr(:tuple, outtup...))
        )
    end
end

However, they are all pretty bad whenever M is not a multiple of SIMD vector width. Trying the third version:

julia> s = @SMatrix randn(3,3);

julia> t = @SMatrix randn(3,3);

julia> @btime matmul3x3($s,$t)
  2.525 ns (0 allocations: 0 bytes)
3×3 SArray{Tuple{3,3},Float64,2,9}:
 0.600566  -1.94865   0.100573
 1.0341    -1.76276  -1.42636 
 0.34147   -3.58425   0.46392 

julia> @btime $s * $t
  8.270 ns (0 allocations: 0 bytes)
3×3 SArray{Tuple{3,3},Float64,2,9}:
 0.600566  -1.94865   0.100573
 1.0341    -1.76276  -1.42636 
 0.34147   -3.58425   0.46392 

Or, looking at vector width x vector width (try 4x4 if your computer has avx2; this one has avx512):

julia> A = @SMatrix randn(8,8);

julia> B = @SMatrix randn(8,8);

julia> C = @SMatrix randn(7,7);

julia> D = @SMatrix randn(7,7);

julia> @benchmark $A * $B
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     11.009 ns (0.00% GC)
  median time:      11.172 ns (0.00% GC)
  mean time:        11.552 ns (0.00% GC)
  maximum time:     42.911 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     999

julia> @benchmark $C * $D
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     66.681 ns (0.00% GC)
  median time:      66.690 ns (0.00% GC)
  mean time:        68.901 ns (0.00% GC)
  maximum time:     103.893 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     978

For 7x7, this is actually worse than the current method:

julia> A = @SMatrix randn(8,8);

julia> B = @SMatrix randn(8,8);

julia> C = @SMatrix randn(7,7);

julia> D = @SMatrix randn(7,7);

julia> @benchmark $A * $B
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     16.281 ns (0.00% GC)
  median time:      16.390 ns (0.00% GC)
  mean time:        16.973 ns (0.00% GC)
  maximum time:     63.614 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     998

julia> @benchmark $C * $D
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     61.545 ns (0.00% GC)
  median time:      61.663 ns (0.00% GC)
  mean time:        63.357 ns (0.00% GC)
  maximum time:     107.715 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     981

The assembly looks really good though in the 8x8 case:

julia> @code_native A * B
    .text
; ┌ @ REPL[88]:2 within `*'
; │┌ @ REPL[88]:17 within `macro expansion'
; ││┌ @ REPL[88]:2 within `*'
    vmovupd (%rsi), %zmm7
    vmulpd  (%rdx){1to8}, %zmm7, %zmm0
    vmulpd  64(%rdx){1to8}, %zmm7, %zmm1
    vmulpd  128(%rdx){1to8}, %zmm7, %zmm2
    vmulpd  192(%rdx){1to8}, %zmm7, %zmm3
    vmulpd  256(%rdx){1to8}, %zmm7, %zmm4
    vmulpd  320(%rdx){1to8}, %zmm7, %zmm5
    vmulpd  384(%rdx){1to8}, %zmm7, %zmm6
    vmulpd  448(%rdx){1to8}, %zmm7, %zmm7
    movq    $-56, %rax
    nopw    %cs:(%rax,%rax)
; │└└
; │┌ @ fastmath.jl:163 within `macro expansion'
L80:
    vmovupd 512(%rsi,%rax,8), %zmm8
; │└
; │┌ @ REPL[88]:23 within `macro expansion'
; ││┌ @ fastmath.jl:161 within `add_fast'
    vfmadd231pd 64(%rdx,%rax){1to8}, %zmm8, %zmm0
    vfmadd231pd 128(%rdx,%rax){1to8}, %zmm8, %zmm1
    vfmadd231pd 192(%rdx,%rax){1to8}, %zmm8, %zmm2
    vfmadd231pd 256(%rdx,%rax){1to8}, %zmm8, %zmm3
    vfmadd231pd 320(%rdx,%rax){1to8}, %zmm8, %zmm4
    vfmadd231pd 384(%rdx,%rax){1to8}, %zmm8, %zmm5
    vfmadd231pd 448(%rdx,%rax){1to8}, %zmm8, %zmm6
    vfmadd231pd 512(%rdx,%rax){1to8}, %zmm8, %zmm7
; ││└
; ││┌ @ range.jl:595 within `iterate'
; │││┌ @ promotion.jl:403 within `=='
    addq    $8, %rax
; ││└└
    jne L80
; ││ @ REPL[88]:26 within `macro expansion'
    vmovupd %zmm0, (%rdi)
    vmovupd %zmm1, 64(%rdi)
    vmovupd %zmm2, 128(%rdi)
    vmovupd %zmm3, 192(%rdi)
    vmovupd %zmm4, 256(%rdi)
    vmovupd %zmm5, 320(%rdi)
    vmovupd %zmm6, 384(%rdi)
    vmovupd %zmm7, 448(%rdi)
    movq    %rdi, %rax
    vzeroupper
    retq
    nopl    (%rax)
; └└

so it may be worth checking if M is a multiple of two in the generated function.
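
That check could be as simple as the following hedged sketch (widths assume Float64: 8 doubles per zmm register, 4 per ymm):

simd_width(::Type{Float64}) = 8   # assumption: avx-512; 4 on avx2-only machines
rows_fill_vectors(M, ::Type{T}) where {T} = M % simd_width(T) == 0
# take the loop-based kernel when rows_fill_vectors(M, T), otherwise a fallback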

I spent a couple more hours experimenting, trying to get the compiler to emit masked loads for SArrays. No luck.

Masked loads and stores would be a great option for MArrays, though.

For StaticArrays, I think the best solution is still to pad them.

julia> using SIMDArrays, BenchmarkTools

julia> A = @Static randn(8,8);

julia> B = @Static randn(8,8);

julia> C = @Static randn(7,7);

julia> D = @Static randn(7,7);

julia> @benchmark $A * $B
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     11.665 ns (0.00% GC)
  median time:      11.735 ns (0.00% GC)
  mean time:        12.215 ns (0.00% GC)
  maximum time:     43.882 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     999

julia> @benchmark $C * $D
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     8.550 ns (0.00% GC)
  median time:      8.591 ns (0.00% GC)
  mean time:        8.900 ns (0.00% GC)
  maximum time:     39.067 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     999

julia> Array(A) * Array(B) .≈ A * B
8×8 BitArray{2}:
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1

julia> Array(C) * Array(D) .≈ C * D
7×7 BitArray{2}:
 1  1  1  1  1  1  1
 1  1  1  1  1  1  1
 1  1  1  1  1  1  1
 1  1  1  1  1  1  1
 1  1  1  1  1  1  1
 1  1  1  1  1  1  1
 1  1  1  1  1  1  1

The trick is simply to pad the matrices with a few extra rows (fewer than the SIMD vector width) so that each column fills whole SIMD vectors; see the sketch below.
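
The padding rule is roughly the following (hedged sketch; W is whatever vector width is chosen for that size, e.g. 8 for the 7x7 case and 4 for the 3x3 case below):

padded_rows(M, W) = cld(M, W) * W   # round the row count up to whole vectors

padded_rows(7, 8)   # == 8, so the 7x7 matrix stores 8*7 = 56 elements
padded_rows(3, 4)   # == 4, so the 3x3 matrix stores 4*3 = 12 elements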

julia> length(C.data), length(C)
(56, 49)

julia> C
7×7 SIMDArrays.StaticSIMDArray{Tuple{7,7},Float64,2,8,56}:
  0.319496    0.428686  -1.11775   -1.3375    -0.279712   1.66079     0.401079
 -0.0888366  -0.776635   0.656009  -0.632125  -0.354105   2.75417    -0.299715
 -0.496308   -0.825651  -1.23162    0.996198   0.574873  -0.0737101   1.29759 
  1.30891     0.904274  -0.64274    1.07277   -0.222821   0.778396   -1.35605 
  0.783055   -1.40736    0.26168    1.54089   -1.21921    0.69172    -0.628125
  0.96238    -0.950408  -0.684878   0.272511  -0.188624  -0.0396921   0.494139
 -0.814892    0.769325   0.673766  -0.402637   1.01501   -1.94723     0.770561

julia> C.data[1:10]
(0.31949627322535246, -0.0888366159816679, -0.49630848286450396, 1.3089137129729653, 0.7830545118498307, 0.9623798624797774, -0.8148924827876292, 0.18360020071865166, 0.42868648749010024, -0.776634962264874)

C is a 7x7 matrix, but it has 56 elements with a stride of 8. Notice that elements 7 through 9 of the underlying tuple are -0.8148924827876292, 0.18360020071865166, 0.42868648749010024: the last element of the first column, a padding element, and then the first element of the second column. Because of this, we get efficient asm:

julia> @code_native C * D
    .text
; ┌ @ blas.jl:117 within `*'
; │┌ @ blas.jl:117 within `macro expansion'
    vmovupd (%rsi), %zmm6
; ││ @ blas.jl:23 within `macro expansion'
; ││┌ @ floating_point_arithmetic.jl:182 within `evmul'
; │││┌ @ llvmwrap.jl:62 within `llvmwrap' @ llvmwrap.jl:62
; ││││┌ @ llvmwrap.jl:81 within `macro expansion'
    vmulpd  (%rdx){1to8}, %zmm6, %zmm0
    vmulpd  64(%rdx){1to8}, %zmm6, %zmm1
    vmulpd  128(%rdx){1to8}, %zmm6, %zmm2
    vmulpd  192(%rdx){1to8}, %zmm6, %zmm3
    vmulpd  256(%rdx){1to8}, %zmm6, %zmm4
    vmulpd  320(%rdx){1to8}, %zmm6, %zmm5
    vmulpd  384(%rdx){1to8}, %zmm6, %zmm6
    movq    $-48, %rax
    nopl    (%rax)
; ││└└└
; ││ @ blas.jl:28 within `macro expansion'
L64:
    vmovupd 448(%rsi,%rax,8), %zmm7
; ││ @ blas.jl:31 within `macro expansion'
; ││┌ @ SIMDPirates.jl:33 within `vfma'
; │││┌ @ floating_point_arithmetic.jl:427 within `macro expansion'
; ││││┌ @ SIMDPirates.jl:33 within `vfma'
; │││││┌ @ floating_point_arithmetic.jl:349 within `macro expansion'
; ││││││┌ @ llvmwrap.jl:148 within `llvmwrap' @ llvmwrap.jl:148
; │││││││┌ @ llvmwrap.jl:170 within `macro expansion'
    vfmadd231pd 56(%rdx,%rax){1to8}, %zmm7, %zmm0
    vfmadd231pd 120(%rdx,%rax){1to8}, %zmm7, %zmm1
    vfmadd231pd 184(%rdx,%rax){1to8}, %zmm7, %zmm2
    vfmadd231pd 248(%rdx,%rax){1to8}, %zmm7, %zmm3
    vfmadd231pd 312(%rdx,%rax){1to8}, %zmm7, %zmm4
    vfmadd231pd 376(%rdx,%rax){1to8}, %zmm7, %zmm5
    vfmadd231pd 440(%rdx,%rax){1to8}, %zmm7, %zmm6
; │└└└└└└└
; │┌ @ promotion.jl:403 within `macro expansion'
    addq    $8, %rax
; │└
; │ @ blas.jl:31 within `*'
    jne L64
; │ @ blas.jl:36 within `*'
    vmovupd %zmm0, (%rdi)
    vmovupd %zmm1, 64(%rdi)
    vmovupd %zmm2, 128(%rdi)
    vmovupd %zmm3, 192(%rdi)
    vmovupd %zmm4, 256(%rdi)
    vmovupd %zmm5, 320(%rdi)
    vmovupd %zmm6, 384(%rdi)
    movq    %rdi, %rax
    vzeroupper
    retq
    nopl    (%rax)
; └

Or, for the 3x3 case:

julia> S = @Static randn(3,3)
3×3 SIMDArrays.StaticSIMDArray{Tuple{3,3},Float64,2,4,12}:
 -1.72684   -0.053865   -0.541157
  0.020725  -0.0593686  -0.1204  
  0.587405  -0.667932    0.623411

julia> T = @Static randn(3,3)
3×3 SIMDArrays.StaticSIMDArray{Tuple{3,3},Float64,2,4,12}:
 -0.00492318   1.76432  -2.19455 
 -0.359605     1.9209    0.167068
  0.722771    -1.35129   0.520166

julia> @benchmark $S * $T
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.849 ns (0.00% GC)
  median time:      1.873 ns (0.00% GC)
  mean time:        1.950 ns (0.00% GC)
  maximum time:     18.867 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000

The downside is that IndexStyle must now be IndexCartesian, to avoid breaking code where these arrays interact with other matrix types. However, they are allowed to act like IndexLinear while interacting with each other. I've not messed with custom broadcasting much, but it's probably possible to handle that there.
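
Concretely, that choice amounts to something like this (sketch only, using the type name shown in the output above):

Base.IndexStyle(::Type{<:SIMDArrays.StaticSIMDArray}) = IndexCartesian()
# linear indexing would otherwise have to skip over the padding elements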

EDIT: I'd like to make another comment about manual unrolling. My code above just uses for loops, because they're at least as fast to run for bigger matrices, while compiling far more quickly. For smaller matrices, LLVM will automatically unroll for you anyway:

julia> W = @Static randn(4,4)
4×4 SIMDArrays.StaticSIMDArray{Tuple{4,4},Float64,2,4,16}:
 -1.13708   -0.480141  -0.330682   0.820593
  0.482933  -1.05726    0.536663   0.344132
  0.322102   1.18024    0.930196  -0.58379 
  0.272023   0.567929  -0.504169   1.01676 

julia> Y = @Static randn(4,4)
4×4 SIMDArrays.StaticSIMDArray{Tuple{4,4},Float64,2,4,16}:
  0.492714  1.19693    1.33252    0.0144647
  0.901112  1.52111   -0.507832  -1.94051  
 -0.265069  1.10911    0.862128  -0.739434 
 -0.498172  0.293976  -1.07335    0.623734 

julia> @code_native W * Y
    .text
; ┌ @ blas.jl:117 within `*'
; │┌ @ blas.jl:117 within `macro expansion'
    vmovupd (%rsi), %ymm0
; ││ @ blas.jl:28 within `macro expansion'
    vmovupd 32(%rsi), %ymm1
    vmovupd 64(%rsi), %ymm2
    vmovupd 96(%rsi), %ymm3
; ││ @ blas.jl:23 within `macro expansion'
; ││┌ @ floating_point_arithmetic.jl:182 within `evmul'
; │││┌ @ llvmwrap.jl:62 within `llvmwrap' @ llvmwrap.jl:62
; ││││┌ @ llvmwrap.jl:81 within `macro expansion'
    vmulpd  96(%rdx){1to4}, %ymm0, %ymm4
    vmulpd  64(%rdx){1to4}, %ymm0, %ymm5
    vmulpd  32(%rdx){1to4}, %ymm0, %ymm6
    vmulpd  (%rdx){1to4}, %ymm0, %ymm0
; │└└└└
; │┌ @ llvmwrap.jl:170 within `macro expansion'
    vfmadd231pd 8(%rdx){1to4}, %ymm1, %ymm0
    vfmadd231pd 40(%rdx){1to4}, %ymm1, %ymm6
    vfmadd231pd 72(%rdx){1to4}, %ymm1, %ymm5
    vfmadd231pd 104(%rdx){1to4}, %ymm1, %ymm4
    vfmadd231pd 16(%rdx){1to4}, %ymm2, %ymm0
    vfmadd231pd 48(%rdx){1to4}, %ymm2, %ymm6
    vfmadd231pd 80(%rdx){1to4}, %ymm2, %ymm5
    vfmadd231pd 112(%rdx){1to4}, %ymm2, %ymm4
    vfmadd231pd 24(%rdx){1to4}, %ymm3, %ymm0
    vfmadd231pd 56(%rdx){1to4}, %ymm3, %ymm6
    vfmadd231pd 88(%rdx){1to4}, %ymm3, %ymm5
    vfmadd231pd 120(%rdx){1to4}, %ymm3, %ymm4
; │└
; │ @ blas.jl:36 within `*'
    vmovupd %ymm0, (%rdi)
    vmovupd %ymm6, 32(%rdi)
    vmovupd %ymm5, 64(%rdi)
    vmovupd %ymm4, 96(%rdi)
    movq    %rdi, %rax
    vzeroupper
    retq
    nopl    (%rax)
; └

I'm inclined to just trust LLVM on whether or not to unroll the loop.

c42f commented 5 years ago

Regarding unrolling, it's important to keep in mind interactions between unrolling and type inference. In particular, any type inference opportunities which are exposed by unrolling will be invisible to the julia compiler if we leave it to the LLVM layer. This can be a problem in highly generic code.

That's not to say we don't want to remove unrolling from this package: I'd love it if we could remove all unrolling-related generated functions. At the time StaticArrays was started (julia-0.5 era) that definitely wasn't an option, but the compiler has improved fantastically since then.
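
A small hedged example of that kind of inference opportunity: a runtime loop over a heterogeneous tuple leaves the element type abstract, while unrolling at the Julia level makes every index a constant:

function sum_loop(t::Tuple)
    s = 0.0
    for i in eachindex(t)
        s += t[i]            # for mixed tuples, t[i] infers as a Union at best
    end
    s
end

# unrolled at the Julia level: each t[$i] has a concrete, inferable type
@generated sum_unrolled(t::Tuple) = :(+($([:(t[$i]) for i in 1:length(t.parameters)]...)))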

c42f commented 5 years ago

Regarding padding - adding any would completely change the memory layout of Vector{SVector{...}} which is somewhat questionable in my mind. But more importantly the memory overhead would be excessive for small vectors and matrices. And what about objects which are not floats or ints? You should be able to have an SVector{2,BigFloat} and the implementation of this should not be burdened with SIMD-related stuff.

So I feel this is not a task for StaticArrays itself, but rather a task for an array wrapper in the spirit of https://github.com/piever/StructArrays.jl (see also StructsOfArrays.jl) which can present a SIMD-friendly data layout internally, while also presenting a human-friendly API as a simple array of structs. For more on how this type of technique can decouple API from memory layout in a really powerful way, see @Keno's part of the Celeste presentation from juliacon 2017: https://www.youtube.com/watch?v=uecdcADM3hY

chriselrod commented 5 years ago

For 7x7 matrices, the padded version requires 1.14 times more memory (8 rows / 7 rows), but multiplying two of them is 7 times faster (61 ns / 8.6 ns). The 8x7 matrix actually requires fewer registers than the 7x7, and registers are the most precious memory. The 7x7 matrix will either be stored in pieces (e.g., 2 xmm and a ymm), or assembling/disassembling it into the larger registers takes time and temporarily occupies several of them.

Regarding Vector{SVector{...}}, because Vectors are heap allocated, you can use masked load and store operations (for the architectures that support them). This would allow a 7 x N matrix to be converted to a Vector{SVector{7}}, with a getindex function defined on Arrays of StaticArrays so that a masked load is used to produce a SVector{7} that has 8 elements under the hood.

The generated function determining the padding size would already have to use sizeof(T) to determine the padding. It would be easy to add something like

function determine_padding(T)
    isbitstype(T) || return 0
    ...
end

Although odds are that for most types which aren't floats or ints, you'd either just want to return 0 (because they're big), or you'd prefer some other fancier scheme (like StructArrays?).
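
As a hedged sketch of one possible rule (my own illustration, not SIMDArrays' actual code; the 32-byte AVX2 register width is an assumption that a build-time query would supply in practice):

# Round the per-column length up to a multiple of the SIMD width for native bits
# types, and leave everything else unpadded.
function determine_padded_length(N::Int, ::Type{T}) where {T}
    isbitstype(T) || return N
    simd_bytes = 32                      # assumed AVX2 register width
    W = max(1, simd_bytes ÷ sizeof(T))   # lanes per register for this element type
    return W * cld(N, W)                 # round N up to a multiple of W
end

determine_padded_length(7, Float64)   # == 8
determine_padded_length(7, BigFloat)  # == 7 (no padding for non-bits types)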

For me, it's easier to just use SIMDArrays for the padding and other functionality. But I haven't gotten around to writing unit tests or documentation for any of my libraries, and it is unlikely I'll find the time to do so before I graduate (hopefully after). I'd like to see some of it registered/upstreamed or otherwise somehow made official, eventually. As is, when I start work in an exploratory fashion I rely on well supported and robust libraries like StaticArrays, but when I feel like optimizing code, I switch to my hacked-together and undocumented junk.

I'll have to look at that StructArrays library more, but by the looks of it, it doesn't support StaticArrays (eg, storing a vector of SVector{4,Float64} under the hood as 4 vectors of Float64). I wrote ScatteredArrays to do that, because often it is simple matrix-type data structures (with indices, not fields) that I want to break up under the hood in struct-of-arrays fashion.

julia> using ScatteredArrays, StaticArrays, Random

julia> sm = ScatteredMatrix{Float64,2,SMatrix{3,4,Float64,12}}(undef, 5, 6);

julia> Random.randn!(sm.data); typeof(sm.data)
Array{Float64,3}

julia> size(sm.data)
(5, 6, 12)

julia> sm[1,1]
3×4 SArray{Tuple{3,4},Float64,2,12}:
 -0.410485  0.14147   -0.612619    0.767951 
 -1.48007   0.907609  -0.0631815   0.20726  
 -0.104371  0.812684  -0.296792   -0.0752005

julia> sm.data[1,1,:]
12-element Array{Float64,1}:
 -0.41048499089682966
 -1.480068431968409  
 -0.10437062724429408
  0.14146994042866523
  0.9076085592863015 
  0.8126842410603605 
 -0.6126191772222825 
 -0.06318146709474118
 -0.29679219500032633
  0.7679508044942372 
  0.20725992583359093
 -0.07520049352014255

An N-dimensional ScatteredArray is stored under the hood as an N+1 dimensional array, where the last dimension runs over the (linearized) elements of the isbits static array object.

Similarly, SIMDArray's padding also occurs "under the hood", obscured from the user.

Also, Julia's compiler actually seems rather bad at the type of vectorization StructArrays and ScatteredArrays enable. For example, this thread on discourse featured a fairly simple problem that Julia failed to vectorize, but Fortran compilers did. I discussed a slightly more complicated problem on the Rust discourse, where I also posted benchmarks and code + assembly for a list of languages/compilers: C++ (GNU, Clang, icpc), Rust, Fortran (GNU, Flang, ifort), and ispc. Clang did the best, at 2 microseconds. icpc, Flang, and ispc were close behind, at around 2.4 microseconds. Then the GNU compilers at around 4.2, and Rust at about 4.6. ifort took about 16 microseconds.

And Julia took about 20 microseconds in all the comparable permutations. "Cheating", via a macro that forcibly vectorizes a loop by applying SIMD intrinsics, I got 2.9 microseconds (at the time I last updated the comment on the Rust forum it was about 3.5 microseconds; I also reported a 2.6 microsecond Julia time in that post via the macro, but it is invalid due to losing accuracy).

This benchmark was dominated by LLVM-based compilers (Clang, Flang, and ispc are all LLVM front ends), with Rust also at least managing to vectorize the code. It was striking that Julia could not. Could I file a "missed optimization" issue on Julia's github about this sort of thing? Ideally, all it would take would be a simple @simd or @fastmath, and not a macro from a library:

function juliatest3!(X::AbstractMatrix{T}, Data::AbstractMatrix{T}) where T
     @inbounds @fastmath for i ∈ 1:size(Data,1)

        x1,x2,x3 = Data[i,1],Data[i,2],Data[i,3]
        S11,S12,S22,S13,S23,S33 = Data[i,5],Data[i,6],Data[i,7],Data[i,8],Data[i,9],Data[i,10]

        Ui33 = 1 / sqrt(S33)
        U13 = S13 * Ui33
        U23 = S23 * Ui33
        Ui22 = 1 / sqrt(S22 - U23*U23)
        U12 = (S12 - U13*U23) * Ui22

        Ui33x3 = Ui33 * x3

        Ui11 = 1 / sqrt(S11 - U12*U12 - U13*U13)
        Ui12 = - U12 * Ui11 * Ui22
        Ui13x3 = - (U13 * Ui11 + U23 * Ui12) * Ui33x3
        Ui23x3 = - U23 * Ui22 * Ui33x3

        X[i,1] = Ui11*x1 + Ui12*x2 + Ui13x3
        X[i,2] = Ui22*x2 + Ui23x3
        X[i,3] = Ui33x3
    end
end

@inline function pdbacksolve(x1,x2,x3,S11,S12,S22,S13,S23,S33)
    @pirate begin
        Ui33 = rsqrt(S33)
        U13 = S13 * Ui33
        U23 = S23 * Ui33
        Ui22 = rsqrt(S22 - U23*U23)
        U12 = (S12 - U13*U23) * Ui22

        Ui33x3 = Ui33 * x3

        Ui11 = rsqrt(S11 - U12*U12 - U13*U13)
        Ui12 = - U12 * Ui11 * Ui22
        Ui13x3 = - (U13 * Ui11 + U23 * Ui12) * Ui33x3
        Ui23x3 = - U23 * Ui22 * Ui33x3

        (
            Ui11*x1 + Ui12*x2 + Ui13x3,
            Ui22*x2 + Ui23x3,
            Ui33x3
        )
    end
end

@generated function juliatest!(X::AbstractMatrix{T}, Data::AbstractMatrix{T}) where T
    quote
        @vectorize $T for i ∈ 1:size(Data,1)
            X[i,:] .= pdbacksolve(
                Data[i,1],Data[i,2],Data[i,3],
                Data[i,5],Data[i,6],Data[i,7],Data[i,8],Data[i,9],Data[i,10]
            )
        end
    end
end

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     19.657 μs (0.00% GC)
  median time:      20.168 μs (0.00% GC)
  mean time:        20.803 μs (0.00% GC)
  maximum time:     54.302 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark juliatest!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.845 μs (0.00% GC)
  median time:      2.888 μs (0.00% GC)
  mean time:        3.020 μs (0.00% GC)
  maximum time:     6.644 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> @benchmark clangtest($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.033 μs (0.00% GC)
  median time:      2.053 μs (0.00% GC)
  mean time:        2.157 μs (0.00% GC)
  maximum time:     5.858 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> (@macroexpand @vectorize Float32 for i ∈ 1:size(Data,1)
                   X[i,:] .= pdbacksolve(
                       Data[i,1],Data[i,2],Data[i,3],
                       Data[i,5],Data[i,6],Data[i,7],Data[i,8],Data[i,9],Data[i,10]
                   )
               end) |> striplines
quote
    ##N#442 = size(Data, 1)
    (Q, r) = (LoopVectorization.VectorizationBase).size_loads(Data, 1, Val{16}())
    begin
        ##stride#447 = LoopVectorization.stride_row(X)
        ##stride#449 = LoopVectorization.stride_row(Data)
    end
    ##pData#448 = vectorizable(Data)
    ##pX#443 = vectorizable(X)
    begin
        for ##i#441 = 1:16:Q * 16
            ##iter#440 = ##i#441
            begin
                ####pData#448_i#450 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 0##stride#449)
                ####pData#448_i#451 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 1##stride#449)
                ####pData#448_i#452 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 2##stride#449)
                ####pData#448_i#453 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 4##stride#449)
                ####pData#448_i#454 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 5##stride#449)
                ####pData#448_i#455 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 6##stride#449)
                ####pData#448_i#456 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 7##stride#449)
                ####pData#448_i#457 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 8##stride#449)
                ####pData#448_i#458 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 9##stride#449)
                begin
                    ##B#444 = LoopVectorization.extract_data.(pdbacksolve(####pData#448_i#450, ####pData#448_i#451, ####pData#448_i#452, ####pData#448_i#453, ####pData#448_i#454, ####pData#448_i#455, ####pData#448_i#456, ####pData#448_i#457, ####pData#448_i#458))
                    for ##j#446 = 0:SIMDPirates.vsub(length(##B#444), 1)
                        begin
                            $(Expr(:inbounds, true))
                            local #119#val = SIMDPirates.vstore(getindex(##B#444, SIMDPirates.vadd(1, ##j#446)), ##pX#443, SIMDPirates.vadd(##iter#440, SIMDPirates.vmul(##stride#447, ##j#446)))
                            $(Expr(:inbounds, :pop))
                            #119#val
                        end
                    end
                end
            end
        end
    end
    begin
        if r > 0
            mask = SIMDPirates.vless_or_equal(SIMDPirates.vsub((Core.VecElement{Int32}(1), Core.VecElement{Int32}(2), Core.VecElement{Int32}(3), Core.VecElement{Int32}(4), Core.VecElement{Int32}(5), Core.VecElement{Int32}(6), Core.VecElement{Int32}(7), Core.VecElement{Int32}(8), Core.VecElement{Int32}(9), Core.VecElement{Int32}(10), Core.VecElement{Int32}(11), Core.VecElement{Int32}(12), Core.VecElement{Int32}(13), Core.VecElement{Int32}(14), Core.VecElement{Int32}(15), Core.VecElement{Int32}(16)), unsafe_trunc(Int32, r)), zero(Int32))
            ##i#441 = (##N#442 - r) + 1
            ##iter#440 = ##i#441
            begin
                ####pData#448_i#450 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 0##stride#449, mask)
                ####pData#448_i#451 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 1##stride#449, mask)
                ####pData#448_i#452 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 2##stride#449, mask)
                ####pData#448_i#453 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 4##stride#449, mask)
                ####pData#448_i#454 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 5##stride#449, mask)
                ####pData#448_i#455 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 6##stride#449, mask)
                ####pData#448_i#456 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 7##stride#449, mask)
                ####pData#448_i#457 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 8##stride#449, mask)
                ####pData#448_i#458 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 9##stride#449, mask)
                begin
                    ##B#444 = LoopVectorization.extract_data.(pdbacksolve(####pData#448_i#450, ####pData#448_i#451, ####pData#448_i#452, ####pData#448_i#453, ####pData#448_i#454, ####pData#448_i#455, ####pData#448_i#456, ####pData#448_i#457, ####pData#448_i#458))
                    for ##j#446 = 0:SIMDPirates.vsub(length(##B#444), 1)
                        begin
                            $(Expr(:inbounds, true))
                            local #120#val = SIMDPirates.vstore(getindex(##B#444, SIMDPirates.vadd(1, ##j#446)), ##pX#443, SIMDPirates.vadd(##iter#440, SIMDPirates.vmul(##stride#447, ##j#446)), mask)
                            $(Expr(:inbounds, :pop))
                            #120#val
                        end
                    end
                end
            end
        end
    end
end

Gensyms aren't the easiest thing to read. [EDIT: The mask calculation can be improved.]

But I really like the "StructArrays"-style thinking when it comes to vectorization. Similarly, the SPMD-style, which recognizes that often the best way to vectorize is across loop iterations, not within (on that note, I enjoyed this series of blog posts on the history of ispc ).

PS. Can't believe I didn't see that video until now. Incredible stuff! I did think Newton's Trust Region was the "coolest" method from Optim, and found it normally needed far fewer iterations. What surprised me, however, is that a second order method can scale better than a first order method -- despite needing the explicit Hessians -- due to different scaling in the number of needed iterations.

The ILP stuff was interesting: viewing code regions in terms of the number of instructions that can be run in parallel, to figure out how far you have to "step back" to vectorize it, which then leads naturally to things like StructArrays / managing memory layout to get good vectorization.

c42f commented 5 years ago

@chriselrod The detailed investigation you're doing here is great! Don't let me discourage you, it would be really great if StaticArrays would vectorize more effectively, and I am by no means an expert (nor have the time right now to become one).

What I'm pointing out are some of the constraints of a generic small matrices library which I feel make adding padding difficult right now and possibly undesirable in general.

A technical API-related reason why adding padding is difficult with the current state of things is that the size of the storage .data member of SArray cannot be computed from the shape automatically, and users are already forced to manually deal with the total data size and number of dimensions in some circumstances. For example, the concrete type of an SMatrix{2,3,Int} is SArray{Tuple{2,3},Int,2,6}. On the occasion that users are forced to write down the concrete type, at least the 6 is simply 2x3 right now. With padding it wouldn't even be fixed and might vary per architecture! At the very least we need the compiler to improve so that these details can be computed by the library, or change the library so that users never write this concrete type by hand. The desire to write the curly bracket notation is strong though and Core.apply_type can't be overloaded. For extensive discussion, see https://github.com/JuliaLang/julia/issues/18466.
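
For concreteness, a quick REPL sketch of the point above (printed in the format of the StaticArrays versions discussed in this thread; newer releases print the SMatrix alias instead):

julia> using StaticArrays

julia> typeof(SMatrix{2,3}(1, 2, 3, 4, 5, 6))
SArray{Tuple{2,3},Int64,2,6}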

This would allow a 7 x N matrix to be converted to a Vector{SVector{7}}, with a getindex function defined on Arrays of StaticArrays so that a masked load is used to produce a SVector{7} that has 8 elements under the hood.

The memory layout of Vector{SVector{7}} is not independent of the padding which goes in a SVector{7}. If our type SVector{7} has padding, that padding will end up in the Vector{SVector{7}}. This is true of any other heap allocated data structures as well.

This is why I think adding padding might be undesirable: it makes the ABI of anything using StaticArrays architecture-dependent, and makes the memory usage unpredictable and surprising for users. A lot of people will be using StaticArrays for 3x3 matrices or length-3 vectors, in which case the overhead of an additional 1/3 (say) seems considerable to me. In the case that you need a SIMD-friendly heap layout, you can use some kind of SimdFriendlyVector <: AbstractVector{<:StaticArray} to achieve that in the spirit of StructArrays et al.
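
As a minimal sketch of that idea (all names here are hypothetical): the padding lives in the container rather than in SVector, the backing storage keeps each length-3 element on a 4-lane boundary, and the public API is still an ordinary vector of static vectors.

using StaticArrays

struct PaddedVec3s <: AbstractVector{SVector{3,Float64}}
    data::Matrix{Float64}        # 4 rows per element: 3 payload + 1 padding lane
end
PaddedVec3s(n::Integer) = PaddedVec3s(zeros(4, n))

Base.size(v::PaddedVec3s) = (size(v.data, 2),)
Base.@propagate_inbounds Base.getindex(v::PaddedVec3s, i::Int) =
    SVector(v.data[1, i], v.data[2, i], v.data[3, i])
Base.@propagate_inbounds function Base.setindex!(v::PaddedVec3s, x::SVector{3}, i::Int)
    v.data[1, i] = x[1]; v.data[2, i] = x[2]; v.data[3, i] = x[3]
    return v
end

Iterating or broadcasting over such a container behaves like a plain Vector{SVector{3,Float64}}, while a SIMD kernel could operate directly on the padded columns of .data.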

Now, avoiding padding also means that StaticArrays allocated on the stack have the wrong layout for SIMD. But on the other hand, reorganizing this purely local storage layout via compiler transformations seems plausible at the LLVM level (at least to someone slightly naive like myself ;-)) and doesn't interfere with the user-visible API or ABI.

So the path forward that I'm suggesting is some combination of:

  1. Improve the code emitted by StaticArrays (eg, unrolling strategies) or use of the likes of @simd so that the julia LLVM passes can make something useful of it
  2. Improve the LLVM optimization passes in the julia compiler as required to support (1)
  3. Implement SIMD-friendly containers for heap data which are API-compatible with more naive dense storage.

Does that make sense? Is there something fundamental I'm overlooking here?

chriselrod commented 5 years ago

The memory layout of Vector{SVector{7}} is not independent of the padding which goes in a SVector{7}. If our type SVector{7} has padding, that padding will end up in the Vector{SVector{7}}. This is true of any other heap allocated data structures as well.

What I was trying to say there only makes this point worse:

With padding it wouldn't even be fixed and might vary per architecture!

However... Let SVector{N,T,L} be a static vector of length N, type T, and total length (with padding) of L. What I was essentially suggesting is that you reinterpret a 7 x P matrix of Float64 to get a P-length Vector{SVector{7,Float64,7}}, and that getindex(::Vector{<:SArray}, I...) is overloaded so that it returns an SVector{7,Float64,8}. This can be achieved using masked load instructions, so that loading from the end of the vector neither segfaults nor loads performance-killing subnormals. Masked store instructions would also be used to avoid segfaults.

If the arrays have enough extra memory allocated at the end to avoid segfaults, and if subnormals aren't a problem, regular load/stores could be used. However, the first seems hard to guarantee, and for the latter, I wouldn't want to change global state via a set_zero_subnormals. [Unfortunately, I don't think masked operations are fast on all architectures?]

This would avoid the memory overhead of storing extra data, while yielding considerable performance boosts. The 3x3 padded matrix matmul took about 2 nanoseconds, versus the > 8 ns on my machine currently for the 3x3. Although the 2.5 ns from Kristoffer's method is much better, so it'd be great to at least see that added. I think 3 rows is one of the most common use cases (3d points).

However, I do think your points are reasonable. It may be unwise to add features, especially ones that can be relatively opaque, by default. Instead, adding tools and documentation to let people opt in to the ones that would help would probably serve people better, via SimdFriendlyVector, etc.

Now, avoiding padding also means that StaticArrays allocated on the stack have the wrong layout for SIMD. But on the other hand, reorganizing this purely local storage layout via compiler transformations seems plausible at the LLVM level (at least to someone slightly naive like myself ;-)) and doesn't interfere with the user-visible API or ABI.

I don't know either, but because LLVM can already figure out that it wants to assemble SIMD vectors, and (right now) does so with very inefficient code, it seems superficially plausible that in the future it could do so "earlier" by representing them differently. Maybe that doesn't involve much more than a vectorization pass after eliminating all the higher-level static array machinery. But I'm no compiler expert! It would probably still only apply when the arrays don't escape function boundaries. I've seen compilers emit mask instructions for things like vectorized square roots; I wish I saw them do it more often for loads/stores.

Improve the code emitted by StaticArrays (eg, unrolling strategies) or use of the likes of @simd so that the julia LLVM passes can make something useful of it

@fastmath and @simd both give permission to use fma instructions, unroll loops, etc. But @fastmath is more generally applicable (ie, it doesn't need a loop).
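
A tiny illustration of that (my addition, not from the thread): both the macro and muladd work on straight-line scalar code, no loop required.

f(a, b, c) = a * b + c              # plain mul + add; no contraction permitted
g(a, b, c) = @fastmath a * b + c    # LLVM may contract this into an fma
h(a, b, c) = muladd(a, b, c)        # explicitly requests the fused operation

# @code_native g(1.0, 2.0, 3.0) should show a vfmadd* instruction on an FMA-capable x86 CPU.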

Does that make sense? Is there something fundamental I'm overlooking here?

Yes. While I do want padded stack matrices to be well supported somewhere, I think I agree with your suggested approach.

c42f commented 5 years ago

What I was essentially suggesting is that you reinterpret to 7 x P matrix of Float64 to get a P-length Vector{SVector{7,Foat64,7}} and that getindex(::Vector{<:Sarray}, I...) is overloaded, so that it will return a SVector{7,Float64,8}.

Oh I'd completely missed this: you're suggesting that the L parameter of SArray might be a feature rather than an irritating inconvenience forced on us by the compiler. Clever idea!

We could possibly use this if it could be made transparent to the average user.

andyferris commented 5 years ago

Yes, that is interesting. What I feel we should really be using is something like SArray{axes, T, N, L} where, for example, axes = (OneTo(3),) for a standard length-3 static vector. (However, we've chosen the Tuple{...} "hack" to make a nicer SVector and so on.)

The assertions about L could be made just to check that it's at least large enough for the length of the array - this doesn't seem problematic to me. I guess you might want strides and so on in the most general case.

More broadly (and sorry I've been super duper busy this month), @c42f is generally on the mark here about the intent of the code. The code here was originally written for a much older version of Julia, and complete unrolling was the only way to get success - just like for FixedSizeArrays.jl and ImmutableArrays.jl. More importantly, a major design goal was to be generic with respect to the eltype - for example, the arrays are small, so there shouldn't be too many problems with unrolling them even when the elements are very complex (note that we don't/can't force + or * on the elements to inline). I will admit to laziness with respect to Float32 and Float64; in my opinion Julia in general and LLVM in particular should be well tuned for doing things like geometry with these eltypes, and the auto-vectorizer is sometimes quite pleasing.

Basically - if you want to support SIMD better, sure, go ahead; of course we'd all love that. But keep in mind that (a) the optimal approach depends on the user's architecture (which is why I liked leaving this to LLVM to sort out) and (b) such methods will need to be for specific element types, not the generic versions.

chriselrod commented 5 years ago

@fastmath lets LLVM use fma instructions when the host architecture has them, and gives it a little more license to reorder floating point computations.

For my own libraries, I have VectorizationBase.jl which has a build script that depends on CpuId to create a file with some useful info, such as simdbytes(). Then the appropriate vector length can be calculated for Float32 and Float64. But this is not very generic; I wouldn't use it for anything other than the native float or integer types. This is also useful for knowing the number of registers / how much data can be held in registers before spilling.

For example, with avx2, an 8x8 * 8x8 matrix multiplication does not fit in the registers. Therefore, it is computed blockwise (unlike the earlier posts, this comment is from a chip without avx512, only avx2):

julia> using BenchmarkTools, SIMDArrays, StaticArrays

julia> A = @Static randn(8,8);

julia> B = @Static randn(8,8);

julia> C = @SMatrix randn(8,8);

julia> D = @SMatrix randn(8,8);

julia> @benchmark $A * $B
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     37.530 ns (0.00% GC)
  median time:      40.827 ns (0.00% GC)
  mean time:        41.024 ns (0.00% GC)
  maximum time:     67.167 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     992

julia> @benchmark $C * $D
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     60.743 ns (0.00% GC)
  median time:      63.097 ns (0.00% GC)
  mean time:        63.510 ns (0.00% GC)
  maximum time:     246.020 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     982

The code isn't totally optimal. After computing each block, LLVM moves part of the computed block onto the stack. Then at the end, it moves it back off the stack, into registers, and back again onto the stack:

    vmovups -56(%rsp), %ymm0
    vmovups -88(%rsp), %ymm2
    vmovups -120(%rsp), %ymm1
    movq    %rdi, %rax
    vmovups %ymm0, (%rdi)
    vmovups %ymm2, 32(%rdi)
    vmovups %ymm1, 64(%rdi)
    vmovupd %ymm3, 96(%rdi)
    vmovupd %ymm4, 128(%rdi)
    vmovupd %ymm5, 160(%rdi)
    vmovupd %ymm7, 192(%rdi)

Not the end of the world. Still faster than my heap-allocated array function for some reason:

julia> using LinearAlgebra

julia> X = @Sized randn(8,8);

julia> Y = @Sized randn(8,8);

julia> Z = @Sized randn(8,8);

julia> @benchmark mul!($Z, $X, $Y)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     46.849 ns (0.00% GC)
  median time:      50.203 ns (0.00% GC)
  mean time:        51.597 ns (0.00% GC)
  maximum time:     64.032 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     987

So I'll have to look into that.

The reduced copying becomes more important as we start to get a little larger:

julia> X = @Sized randn(16,16);

julia> Y = @Sized randn(16,16);

julia> Z = @Sized randn(16,16);

julia> @benchmark mul!($Z, $X, $Y)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     299.849 ns (0.00% GC)
  median time:      314.733 ns (0.00% GC)
  mean time:        321.455 ns (0.00% GC)
  maximum time:     528.492 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     258

julia> A = @Static randn(16,16);

julia> B = @Static randn(16,16);

julia> C = @SMatrix randn(16,16);

julia> D = @SMatrix randn(16,16);

julia> @benchmark $A * $B
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     442.076 ns (0.00% GC)
  median time:      462.576 ns (0.00% GC)
  mean time:        473.638 ns (0.00% GC)
  maximum time:     841.722 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     198

julia> @benchmark $C * $D
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     568.087 ns (0.00% GC)
  median time:      585.087 ns (0.00% GC)
  mean time:        602.290 ns (0.00% GC)
  maximum time:     1.098 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     183

But at these small sizes, they are all still faster than OpenBLAS:

julia> aA = Array(A); aB = Array(B); AB = similar(aA);

julia> @benchmark mul!($AB, $aA, $aB)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     745.484 ns (0.00% GC)
  median time:      770.881 ns (0.00% GC)
  mean time:        790.861 ns (0.00% GC)
  maximum time:     1.837 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     126

So I think the size range over which StaticArrays is more efficient than OpenBLAS can be made much, much larger. Although there will come a point where you wouldn't want the arrays to be parameterized by size anymore (ie, where you wouldn't want to recompile things for each new size).

These blocking patterns are of course architecture dependent. Have any suggestions on a good way to handle that?

These, along with optimal vector widths, are the sort of things we'd like to know to really take advantage of "L as a feature". So if you're on board with that plan, should I create a PR? Do you want to update the sizing from Tuple{M,N,P} to Tuple{OneTo(M),OneTo(N),OneTo(P)}, or what do you have in mind there? I more or less copied StaticArrays in SIMDArrays, except that I added another "stride" parameter:

julia> A = @Static randn(15,16);

julia> typeof(A) # Tuple{# rows,# columns}, T, dim, stride, length of underlying tuple
SIMDArrays.StaticSIMDArray{Tuple{15,16},Float64,2,16,256}

KristofferC commented 4 years ago

Apparently, LLVM has matrix multiplication intrinsics now:

https://llvm.org/docs/LangRef.html#llvm-matrix-multiply-intrinsic

chriselrod commented 4 years ago

I look forward to trying them. So far, I haven't been able to successfully build Julia with LLVM 10.

c42f commented 4 years ago

Yes we should try those out when we get the chance. More immediately (julia 1.4), we can try the idea of copying into an appropriately-shaped container of VecElement and seeing whether that helps things. XRef https://discourse.julialang.org/t/ann-loopvectorization/32843/66?u=chris_foster
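
A minimal sketch of that experiment (hypothetical names): an NTuple of VecElement maps onto an LLVM vector type, so padding a length-3 column out to 4 lanes gives a layout LLVM is happy to keep in a single register.

struct Vec3Col
    data::NTuple{4,Core.VecElement{Float64}}   # 3 payload lanes + 1 padding lane
end

vec3col(x, y, z) = Vec3Col((Core.VecElement(x), Core.VecElement(y),
                            Core.VecElement(z), Core.VecElement(0.0)))

# Copying each column of an SMatrix{3,3} into a Vec3Col before multiplying is
# the kind of experiment suggested above.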

andyferris commented 4 years ago

For reference, IIRC the last time I thought about using VecElement, LLVM would crash for odd lengths 7 and up, and the fact that 5 didn't crash was brand new. It didn't seem reliable for generic code.

chriselrod commented 4 years ago

This should be fixed on Julia master: https://github.com/JuliaLang/julia/pull/34473 Julia and LLVM didn't always agree on alignment of structs containing tuples of VecElements, which would often cause crashes (due to an assertion) and data corruption before the assertions were turned on (that data corruption also often caused crashes).

This PR is also important: https://github.com/JuliaLang/julia/pull/34490 Without it, llvmcall-defined methods would only work for those supported lengths (other lengths would be llvm-arrays instead of llvm-vectors).

KristofferC commented 4 years ago

Yeah, both LLVM and Julia have gotten significantly better at LLVM Vectors.

hyrodium commented 2 years ago

Is this issue still active? The current benchmark shows:

julia> for n in (2,3,4,5,6,7)
           s = Ref(rand(SMatrix{n,n}))
           @btime $(s)[] * $(s)[]
       end
  3.907 ns (0 allocations: 0 bytes)
  6.091 ns (0 allocations: 0 bytes)
  10.159 ns (0 allocations: 0 bytes)
  24.795 ns (0 allocations: 0 bytes)
  42.373 ns (0 allocations: 0 bytes)
  84.822 ns (0 allocations: 0 bytes)

(@v1.7) pkg> st StaticArrays
      Status `~/.julia/environments/v1.7/Project.toml`
  [90137ffa] StaticArrays v1.5.2 `~/.julia/dev/StaticArrays`

julia> versioninfo()
Julia Version 1.7.3
Commit 742b9abb4d (2022-05-06 12:58 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: AMD Ryzen 7 2700X Eight-Core Processor
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-12.0.1 (ORCJIT, znver1)

chriselrod commented 2 years ago

Is this issue still active? The current benchmark shows:

Yes.

julia> @benchmark ($(Ref(A3))[]*$(Ref(A3))[])
BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
 Range (min … max):  4.645 ns … 16.075 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     4.662 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   4.670 ns ±  0.149 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

                   ▁▄ ▅▅ █ ▇█ ▇▆ ▄▃ ▁ ▁
  ▂▂▁▂▂▁▂▂▁▃▄▁▄▁▆▇▁██▁██▁█▁██▁██▁██▁█▁█▇▁▇▅▁▄▄▁▄▁▄▃▁▃▃▁▃▃▁▂▂ ▄
  4.64 ns        Histogram: frequency by time        4.68 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark matmul3x3($(Ref(A3))[],$(Ref(A3))[])
BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
 Range (min … max):  3.160 ns … 17.475 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     3.205 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.212 ns ±  0.212 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

                                          ▁▁▁    ▂▃▆▇█▆▆▂
  ▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▁▂▂▂▃▃▃▃▂▂▂▁▃▃▅▆████▇▇█▁████████▇▄▃ ▄
  3.16 ns        Histogram: frequency by time        3.21 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

Where matmul3x3 is as defined in the issue + @inline.