Experiment with ordering of dimensions (excluding listener and keys computation parts)

"""
    fdtb(m::LAS, Hs, ψhs, maxT = 8size(Hs, 2))

Decoding loop for values `Hs` stored as D × T × B.

The batch size is read from `size(Hs, 3)` and the default number of steps is
`8size(Hs, 2)`. Every step derives a query from the current decoder state, scores it
against each key in `ψhs`, turns the scores into attention weights, builds the attended
context from `Hs`, and then updates the prediction and the decoder state.

# Arguments
- `m`: model whose `state.decoding`, `state.prediction` and `state.context` are
  overwritten on every step.
- `Hs`: values for the attention layer.
- `ψhs`: vector of key matrices that the query is multiplied with.
- `maxT`: number of decoding steps.

# Returns
A vector with the value of `m.state.prediction` after each step. `reset!(m)` is called
just before returning.
"""
function fdtb(m::LAS{M}, Hs::DenseArray{R,3}, ψhs::Vector{M}, maxT::Integer = 8size(Hs,2))::Vector{M} where {R <: Real, M <: DenseMatrix{R}}
   nbatch = size(Hs, 3)
   # initial decoder state for the batch: the stacked state is added to a zero matrix
   # with `nbatch` columns (presumably to widen it to the batch size) before `m.spell`
   stacked = vcat(m.state.decoding, m.state.prediction, m.state.context)
   m.state.decoding = m.spell(stacked .+ gpu(zeros(R, m.state.dim, nbatch)))

   function decode_step(_)
      # query ϕ(sᵢ), adjointed so it can left-multiply each key
      queryᵀ = adjoint(m.attention_ϕ(m.state.decoding))
      # energies: the full query–key product is formed and only its diagonal is kept
      energies = diag.((queryᵀ,) .* ψhs)
      # attention weights: one column per key after `hcat`, adjointed so that softmax
      # (default `dims = 1`) normalises across the keys.
      # Commented-out alternatives from the original experiment:
      #   softmax(hcat(energies...); dims=2), softmax(reduce(hcat, energies); dims=2),
      #   softmax(reduce(hcat, energies)'), softmax(vcat(energies'...)),
      #   softmax(reduce(vcat, energies'))
      scores = hcat(energies...)
      weights = softmax(adjoint(scores))
      # attended context, contextᵢ = Σᵤ αᵢᵤ hᵤ: weights are reshaped to 1 × T × B so
      # they broadcast against Hs, and the second dimension is summed away
      weighted = reshape(weights, 1, :, nbatch) .* Hs
      m.state.context = dropdims(sum(weighted; dims=2); dims=2)
      # probability distribution over the character alphabet
      m.state.prediction = m.infer(vcat(m.state.decoding, m.state.context))
      # next decoder state
      m.state.decoding = m.spell(vcat(m.state.decoding, m.state.prediction, m.state.context))
      return m.state.prediction
   end

   predictions = broadcast(decode_step, 1:maxT)
   reset!(m)
   return predictions
end
"""
    fdbt(m::LAS, Hs, ψhs, maxT = 8size(Hs, 3))

Decoding loop for values `Hs` stored as D × B × T.

The batch size is read from `size(Hs, 2)` and the default number of steps is
`8size(Hs, 3)`. Every step derives a query from the current decoder state, scores it
against each key in `ψhs`, turns the scores into attention weights, builds the attended
context from `Hs`, and then updates the prediction and the decoder state.

# Arguments
- `m`: model whose `state.decoding`, `state.prediction` and `state.context` are
  overwritten on every step.
- `Hs`: values for the attention layer.
- `ψhs`: vector of key matrices that the query is multiplied with.
- `maxT`: number of decoding steps.

# Returns
A vector with the value of `m.state.prediction` after each step. `reset!(m)` is called
just before returning.
"""
function fdbt(m::LAS{M}, Hs::DenseArray{R,3}, ψhs::Vector{M}, maxT::Integer = 8size(Hs,3))::Vector{M} where {R <: Real, M <: DenseMatrix{R}}
   nbatch = size(Hs, 2)
   # initial decoder state for the batch: the stacked state is added to a zero matrix
   # with `nbatch` columns (presumably to widen it to the batch size) before `m.spell`
   stacked = vcat(m.state.decoding, m.state.prediction, m.state.context)
   m.state.decoding = m.spell(stacked .+ gpu(zeros(R, m.state.dim, nbatch)))

   function decode_step(_)
      # query ϕ(sᵢ), adjointed so it can left-multiply each key
      queryᵀ = adjoint(m.attention_ϕ(m.state.decoding))
      # energies: the full query–key product is formed and only its diagonal is kept
      energies = diag.((queryᵀ,) .* ψhs)
      # attention weights: one column per key after `hcat`, normalised along `dims = 2`.
      # Commented-out alternatives from the original experiment:
      #   softmax(hcat(energies...)'), softmax(reduce(hcat, energies); dims=2),
      #   softmax(reduce(hcat, energies)'), softmax(vcat(energies'...)),
      #   softmax(reduce(vcat, energies'))
      scores = hcat(energies...)
      weights = softmax(scores; dims=2)
      # attended context, contextᵢ = Σᵤ αᵢᵤ hᵤ: weights are reshaped to 1 × B × T so
      # they broadcast against Hs, and the third dimension is summed away
      weighted = reshape(weights, 1, nbatch, :) .* Hs
      m.state.context = dropdims(sum(weighted; dims=3); dims=3)
      # probability distribution over the character alphabet
      m.state.prediction = m.infer(vcat(m.state.decoding, m.state.context))
      # next decoder state
      m.state.decoding = m.spell(vcat(m.state.decoding, m.state.prediction, m.state.context))
      return m.state.prediction
   end

   predictions = broadcast(decode_step, 1:maxT)
   reset!(m)
   return predictions
end
"""
    ftdb(m::LAS, Hs, ψhs, maxT = 8size(Hs, 1))

Decoding loop for values `Hs` stored as T × D × B.

The batch size is read from `size(Hs, 3)` and the default number of steps is
`8size(Hs, 1)`. Every step derives a query from the current decoder state, scores it
against each key in `ψhs`, turns the scores into attention weights, builds the attended
context from `Hs`, and then updates the prediction and the decoder state.

# Arguments
- `m`: model whose `state.decoding`, `state.prediction` and `state.context` are
  overwritten on every step.
- `Hs`: values for the attention layer.
- `ψhs`: vector of key matrices that the query is multiplied with.
- `maxT`: number of decoding steps.

# Returns
A vector with the value of `m.state.prediction` after each step. `reset!(m)` is called
just before returning.
"""
function ftdb(m::LAS{M}, Hs::DenseArray{R,3}, ψhs::Vector{M}, maxT::Integer = 8size(Hs,1))::Vector{M} where {R <: Real, M <: DenseMatrix{R}}
   nbatch = size(Hs, 3)
   # initial decoder state for the batch: the stacked state is added to a zero matrix
   # with `nbatch` columns (presumably to widen it to the batch size) before `m.spell`
   stacked = vcat(m.state.decoding, m.state.prediction, m.state.context)
   m.state.decoding = m.spell(stacked .+ gpu(zeros(R, m.state.dim, nbatch)))

   function decode_step(_)
      # query ϕ(sᵢ), adjointed so it can left-multiply each key
      queryᵀ = adjoint(m.attention_ϕ(m.state.decoding))
      # energies: the full query–key product is formed and only its diagonal is kept
      energies = diag.((queryᵀ,) .* ψhs)
      # attention weights: one column per key after `hcat`, adjointed so that softmax
      # (default `dims = 1`) normalises across the keys.
      # Commented-out alternatives from the original experiment:
      #   softmax(hcat(energies...); dims=2), softmax(reduce(hcat, energies); dims=2),
      #   softmax(reduce(hcat, energies)'), softmax(vcat(energies'...)),
      #   softmax(reduce(vcat, energies'))
      scores = hcat(energies...)
      weights = softmax(adjoint(scores))
      # attended context, contextᵢ = Σᵤ αᵢᵤ hᵤ: weights are reshaped to T × 1 × B so
      # they broadcast against Hs, and the first dimension is summed away
      weighted = reshape(weights, :, 1, nbatch) .* Hs
      m.state.context = dropdims(sum(weighted; dims=1); dims=1)
      # probability distribution over the character alphabet
      m.state.prediction = m.infer(vcat(m.state.decoding, m.state.context))
      # next decoder state
      m.state.decoding = m.spell(vcat(m.state.decoding, m.state.prediction, m.state.context))
      return m.state.prediction
   end

   predictions = broadcast(decode_step, 1:maxT)
   reset!(m)
   return predictions
end
"""
    fbtd(m::LAS, Hs, ψhs, maxT = 8size(Hs, 2))

Decoding loop for values `Hs` stored as B × T × D.

The batch size is read from `size(Hs, 1)` and the default number of steps is
`8size(Hs, 2)`. Every step derives a query from the current decoder state, scores it
against each key in `ψhs`, turns the scores into attention weights, builds the attended
context from `Hs`, and then updates the prediction and the decoder state.

# Arguments
- `m`: model whose `state.decoding`, `state.prediction` and `state.context` are
  overwritten on every step.
- `Hs`: values for the attention layer.
- `ψhs`: vector of key matrices that the query is multiplied with.
- `maxT`: number of decoding steps.

# Returns
A vector with the value of `m.state.prediction` after each step. `reset!(m)` is called
just before returning.
"""
function fbtd(m::LAS{M}, Hs::DenseArray{R,3}, ψhs::Vector{M}, maxT::Integer = 8size(Hs,2))::Vector{M} where {R <: Real, M <: DenseMatrix{R}}
   nbatch = size(Hs, 1)
   # initial decoder state for the batch: the stacked state is added to a zero matrix
   # with `nbatch` columns (presumably to widen it to the batch size) before `m.spell`
   stacked = vcat(m.state.decoding, m.state.prediction, m.state.context)
   m.state.decoding = m.spell(stacked .+ gpu(zeros(R, m.state.dim, nbatch)))

   function decode_step(_)
      # query ϕ(sᵢ), adjointed so it can left-multiply each key
      queryᵀ = adjoint(m.attention_ϕ(m.state.decoding))
      # energies: the full query–key product is formed and only its diagonal is kept
      energies = diag.((queryᵀ,) .* ψhs)
      # attention weights: one column per key after `hcat`, normalised along `dims = 2`.
      # Commented-out alternatives from the original experiment:
      #   softmax(hcat(energies...)'), softmax(reduce(hcat, energies); dims=2),
      #   softmax(reduce(hcat, energies)'), softmax(vcat(energies'...)),
      #   softmax(reduce(vcat, energies'))
      scores = hcat(energies...)
      weights = softmax(scores; dims=2)
      # attended context, contextᵢ = Σᵤ αᵢᵤ hᵤ: the weight matrix broadcasts over the
      # trailing dimension of Hs without a reshape; the second dimension is summed away
      # and the result is adjointed before it is stored
      summed = dropdims(sum(weights .* Hs; dims=2); dims=2)
      m.state.context = adjoint(summed)
      # probability distribution over the character alphabet
      m.state.prediction = m.infer(vcat(m.state.decoding, m.state.context))
      # next decoder state
      m.state.decoding = m.spell(vcat(m.state.decoding, m.state.prediction, m.state.context))
      return m.state.prediction
   end

   predictions = broadcast(decode_step, 1:maxT)
   reset!(m)
   return predictions
end
"""
    ftbd(m::LAS, Hs, ψhs, maxT = 8size(Hs, 1))

Decoding loop for values `Hs` stored as T × B × D.

The batch size is read from `size(Hs, 2)` and the default number of steps is
`8size(Hs, 1)`. Every step derives a query from the current decoder state, scores it
against each key in `ψhs`, turns the scores into attention weights, builds the attended
context from `Hs`, and then updates the prediction and the decoder state.

# Arguments
- `m`: model whose `state.decoding`, `state.prediction` and `state.context` are
  overwritten on every step.
- `Hs`: values for the attention layer.
- `ψhs`: vector of key matrices that the query is multiplied with.
- `maxT`: number of decoding steps.

# Returns
A vector with the value of `m.state.prediction` after each step. `reset!(m)` is called
just before returning.
"""
function ftbd(m::LAS{M}, Hs::DenseArray{R,3}, ψhs::Vector{M}, maxT::Integer = 8size(Hs,1))::Vector{M} where {R <: Real, M <: DenseMatrix{R}}
   nbatch = size(Hs, 2)
   # initial decoder state for the batch: the stacked state is added to a zero matrix
   # with `nbatch` columns (presumably to widen it to the batch size) before `m.spell`
   stacked = vcat(m.state.decoding, m.state.prediction, m.state.context)
   m.state.decoding = m.spell(stacked .+ gpu(zeros(R, m.state.dim, nbatch)))

   function decode_step(_)
      # query ϕ(sᵢ), adjointed so it can left-multiply each key
      queryᵀ = adjoint(m.attention_ϕ(m.state.decoding))
      # energies: the full query–key product is formed and only its diagonal is kept
      energies = diag.((queryᵀ,) .* ψhs)
      # attention weights: one column per key after `hcat`, adjointed so that softmax
      # (default `dims = 1`) normalises across the keys.
      # Commented-out alternatives from the original experiment:
      #   softmax(hcat(energies...); dims=2), softmax(reduce(hcat, energies); dims=2),
      #   softmax(reduce(hcat, energies)'), softmax(vcat(energies'...)),
      #   softmax(reduce(vcat, energies'))
      scores = hcat(energies...)
      weights = softmax(adjoint(scores))
      # attended context, contextᵢ = Σᵤ αᵢᵤ hᵤ: the weight matrix broadcasts over the
      # trailing dimension of Hs without a reshape; the first dimension is summed away
      # and the result is adjointed before it is stored
      summed = dropdims(sum(weights .* Hs; dims=1); dims=1)
      m.state.context = adjoint(summed)
      # probability distribution over the character alphabet
      m.state.prediction = m.infer(vcat(m.state.decoding, m.state.context))
      # next decoder state
      m.state.decoding = m.spell(vcat(m.state.decoding, m.state.prediction, m.state.context))
      return m.state.prediction
   end

   predictions = broadcast(decode_step, 1:maxT)
   reset!(m)
   return predictions
end

"""
    gfdtb(m, Hs, ψhs, θ)

Return `gradient` of the summed `fdtb` predictions with respect to `θ`.
"""
function gfdtb(m, Hs, ψhs, θ)
   # scalar objective: add the per-step predictions together, then sum the entries
   objective() = sum(sum(fdtb(m, Hs, ψhs)))
   return gradient(objective, θ)
end
"""
    gfdbt(m, Hs, ψhs, θ)

Return `gradient` of the summed `fdbt` predictions with respect to `θ`.
"""
function gfdbt(m, Hs, ψhs, θ)
   # scalar objective: add the per-step predictions together, then sum the entries
   objective() = sum(sum(fdbt(m, Hs, ψhs)))
   return gradient(objective, θ)
end
"""
    gftdb(m, Hs, ψhs, θ)

Return `gradient` of the summed `ftdb` predictions with respect to `θ`.
"""
function gftdb(m, Hs, ψhs, θ)
   # scalar objective: add the per-step predictions together, then sum the entries
   objective() = sum(sum(ftdb(m, Hs, ψhs)))
   return gradient(objective, θ)
end
"""
    gfbtd(m, Hs, ψhs, θ)

Return `gradient` of the summed `fbtd` predictions with respect to `θ`.
"""
function gfbtd(m, Hs, ψhs, θ)
   # scalar objective: add the per-step predictions together, then sum the entries
   objective() = sum(sum(fbtd(m, Hs, ψhs)))
   return gradient(objective, θ)
end
"""
    gftbd(m, Hs, ψhs, θ)

Return `gradient` of the summed `ftbd` predictions with respect to `θ`.
"""
function gftbd(m, Hs, ψhs, θ)
   # scalar objective: add the per-step predictions together, then sum the entries
   objective() = sum(sum(ftbd(m, Hs, ψhs)))
   return gradient(objective, θ)
end

# parameters the gradient benchmarks differentiate with respect to
θ = Flux.params(m.state, m.attention_ϕ, m.spell, m.infer)
# input encoding; it also serves as the values of the attention layer
Hs = m.listen(Xs)
# keys ψ(H), precomputed once: one key matrix per index along the second dimension of Hs
ψhs = [m.attention_ψ(Hs[:, t, :]) for t in axes(Hs, 2)]

# Forward-pass benchmarks, one per layout. Before each run the model state is reset and
# Hs is permuted into the layout the benchmarked function expects
# (assumes Hs from `m.listen` is D × T × B — TODO confirm).
# NOTE(review): the trailing `;` after `@benchmark` suppresses the result display at the
# REPL; the transcripts recorded further down were produced without it.
# D × T × B (identity permutation)
reset!(m); Hs′ = permutedims(Hs, [1,2,3]);
@benchmark fdtb($m, $Hs′, $ψhs);
# D × B × T
reset!(m); Hs′ = permutedims(Hs, [1,3,2]);
@benchmark fdbt($m, $Hs′, $ψhs);
# T × D × B
reset!(m); Hs′ = permutedims(Hs, [2,1,3]);
@benchmark ftdb($m, $Hs′, $ψhs);
# B × T × D
reset!(m); Hs′ = permutedims(Hs, [3,2,1]);
@benchmark fbtd($m, $Hs′, $ψhs);
# T × B × D
reset!(m); Hs′ = permutedims(Hs, [2,3,1]);
@benchmark ftbd($m, $Hs′, $ψhs);

# Gradient benchmarks: same layouts and permutations as above, differentiating the summed
# predictions with respect to θ.
# D × T × B (identity permutation)
reset!(m); Hs′ = permutedims(Hs, [1,2,3]);
@benchmark gfdtb($m, $Hs′, $ψhs, $θ);
# D × B × T
reset!(m); Hs′ = permutedims(Hs, [1,3,2]);
@benchmark gfdbt($m, $Hs′, $ψhs, $θ);
# T × D × B
reset!(m); Hs′ = permutedims(Hs, [2,1,3]);
@benchmark gftdb($m, $Hs′, $ψhs, $θ);
# B × T × D
reset!(m); Hs′ = permutedims(Hs, [3,2,1]);
@benchmark gfbtd($m, $Hs′, $ψhs, $θ);
# T × B × D
reset!(m); Hs′ = permutedims(Hs, [2,3,1]);
@benchmark gftbd($m, $Hs′, $ψhs, $θ);

Benchmarking results:

julia> reset!(m); Hs′ = permutedims(Hs, [1,2,3]);

julia> @btime fdtb($m, $Hs′, $ψhs);
  9.325 s (246315 allocations: 7.81 GiB)

julia> reset!(m); Hs′ = permutedims(Hs, [1,3,2]);

julia> @btime fdbt($m, $Hs′, $ψhs);
  9.794 s (257715 allocations: 7.81 GiB)

julia> reset!(m); Hs′ = permutedims(Hs, [2,1,3]);

julia> @btime ftdb($m, $Hs′, $ψhs);
  9.746 s (238715 allocations: 7.81 GiB)

julia> reset!(m); Hs′ = permutedims(Hs, [3,2,1]);

julia> @btime fbtd($m, $Hs′, $ψhs);
  9.827 s (259235 allocations: 7.87 GiB)

julia> reset!(m); Hs′ = permutedims(Hs, [2,3,1]);

julia> @btime ftbd($m, $Hs′, $ψhs);
  9.679 s (238715 allocations: 7.87 GiB)

julia> reset!(m); Hs′ = permutedims(Hs, [1,2,3]);

julia> @btime gfdtb($m, $Hs′, $ψhs, $θ);
  167.128 s (2365907 allocations: 74.90 GiB)

julia> reset!(m); Hs′ = permutedims(Hs, [1,3,2]);

julia> @btime gfdbt($m, $Hs′, $ψhs, $θ);
  166.594 s (2393266 allocations: 74.90 GiB)

julia> reset!(m); Hs′ = permutedims(Hs, [2,1,3]);

julia> @btime gftdb($m, $Hs′, $ψhs, $θ);
  165.616 s (2358306 allocations: 74.90 GiB)

julia> reset!(m); Hs′ = permutedims(Hs, [3,2,1]);

julia> @btime gfbtd($m, $Hs′, $ψhs, $θ); # did not complete after more than an hour of runtime

Results on a small neural net with the following dimensions:

# layer sizes of the small model; the BLSTM input size is taken from the length of the
# first element of the first element of Xs
encoder_dims = (
   blstm       = (in = length(first(first(Xs))), out = 64),
   pblstms_out = (64, 64, 64)
)
attention_dim = 64
decoder_out_dims = (128, 64)
m = LAS(encoder_dims, attention_dim, decoder_out_dims, out_dim)

for xs = last(Xs_train); Xs = vecofmats2tensor(xs)

julia> reset!(m); Hs′ = permutedims(Hs, [1,2,3]);

julia> @benchmark fdtb($m, $Hs′, $ψhs)
BenchmarkTools.Trial: 
  memory estimate:  2.32 GiB
  allocs estimate:  237186
  --------------
  minimum time:     2.056 s (10.21% GC)
  median time:      2.061 s (11.02% GC)
  mean time:        2.076 s (10.89% GC)
  maximum time:     2.110 s (11.43% GC)
  --------------
  samples:          3
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [1,3,2]);

julia> @benchmark fdbt($m, $Hs′, $ψhs)
BenchmarkTools.Trial: 
  memory estimate:  2.32 GiB
  allocs estimate:  248585
  --------------
  minimum time:     2.211 s (9.53% GC)
  median time:      2.217 s (9.76% GC)
  mean time:        2.255 s (9.85% GC)
  maximum time:     2.338 s (10.24% GC)
  --------------
  samples:          3
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [2,1,3]);

julia> @benchmark ftdb($m, $Hs′, $ψhs)
BenchmarkTools.Trial: 
  memory estimate:  2.32 GiB
  allocs estimate:  229585
  --------------
  minimum time:     2.280 s (10.04% GC)
  median time:      2.319 s (10.19% GC)
  mean time:        2.313 s (10.30% GC)
  maximum time:     2.339 s (10.65% GC)
  --------------
  samples:          3
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [3,2,1]);

julia> @benchmark fbtd($m, $Hs′, $ψhs)
BenchmarkTools.Trial: 
  memory estimate:  2.33 GiB
  allocs estimate:  249345
  --------------
  minimum time:     2.295 s (9.64% GC)
  median time:      2.328 s (10.05% GC)
  mean time:        2.319 s (10.03% GC)
  maximum time:     2.334 s (10.02% GC)
  --------------
  samples:          3
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [2,3,1]);

julia> @benchmark ftbd($m, $Hs′, $ψhs)
BenchmarkTools.Trial: 
  memory estimate:  2.33 GiB
  allocs estimate:  228825
  --------------
  minimum time:     2.227 s (10.22% GC)
  median time:      2.243 s (10.14% GC)
  mean time:        2.331 s (9.88% GC)
  maximum time:     2.523 s (9.48% GC)
  --------------
  samples:          3
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [1,2,3]);

julia> @benchmark gfdtb($m, $Hs′, $ψhs, $θ)
BenchmarkTools.Trial: 
  memory estimate:  13.72 GiB
  allocs estimate:  2049836
  --------------
  minimum time:     20.062 s (58.75% GC)
  median time:      20.062 s (58.75% GC)
  mean time:        20.062 s (58.75% GC)
  maximum time:     20.062 s (58.75% GC)
  --------------
  samples:          1
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [1,3,2]);

julia> @benchmark gfdbt($m, $Hs′, $ψhs, $θ)
BenchmarkTools.Trial: 
  memory estimate:  13.72 GiB
  allocs estimate:  2077195
  --------------
  minimum time:     19.861 s (58.31% GC)
  median time:      19.861 s (58.31% GC)
  mean time:        19.861 s (58.31% GC)
  maximum time:     19.861 s (58.31% GC)
  --------------
  samples:          1
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [2,1,3]);

julia> @benchmark gftdb($m, $Hs′, $ψhs, $θ)
BenchmarkTools.Trial: 
  memory estimate:  13.72 GiB
  allocs estimate:  2042235
  --------------
  minimum time:     20.234 s (57.38% GC)
  median time:      20.234 s (57.38% GC)
  mean time:        20.234 s (57.38% GC)
  maximum time:     20.234 s (57.38% GC)
  --------------
  samples:          1
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [3,2,1]);

julia> @benchmark gfbtd($m, $Hs′, $ψhs, $θ)
BenchmarkTools.Trial: 
  memory estimate:  13.75 GiB
  allocs estimate:  2074915
  --------------
  minimum time:     20.462 s (57.27% GC)
  median time:      20.462 s (57.27% GC)
  mean time:        20.462 s (57.27% GC)
  maximum time:     20.462 s (57.27% GC)
  --------------
  samples:          1
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [2,3,1]);

julia> @benchmark gftbd($m, $Hs′, $ψhs, $θ)
BenchmarkTools.Trial: 
  memory estimate:  13.75 GiB
  allocs estimate:  2038435
  --------------
  minimum time:     20.262 s (57.72% GC)
  median time:      20.262 s (57.72% GC)
  mean time:        20.262 s (57.72% GC)
  maximum time:     20.262 s (57.72% GC)
  --------------
  samples:          1
  evals/sample:     1

for xs = first(Xs_train); Xs = vecofmats2tensor(xs)

julia> reset!(m); Hs′ = permutedims(Hs, [1,2,3]);

julia> @benchmark fdtb($m, $Hs′, $ψhs)
BenchmarkTools.Trial: 
  memory estimate:  1.15 GiB
  allocs estimate:  33844
  --------------
  minimum time:     1.339 s (7.84% GC)
  median time:      1.364 s (8.91% GC)
  mean time:        1.360 s (8.79% GC)
  maximum time:     1.371 s (8.49% GC)
  --------------
  samples:          4
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [1,3,2]);

julia> @benchmark fdbt($m, $Hs′, $ψhs)
BenchmarkTools.Trial: 
  memory estimate:  1.15 GiB
  allocs estimate:  36363
  --------------
  minimum time:     1.339 s (7.94% GC)
  median time:      1.373 s (9.06% GC)
  mean time:        1.369 s (8.90% GC)
  maximum time:     1.391 s (8.63% GC)
  --------------
  samples:          4
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [2,1,3]);

julia> @benchmark ftdb($m, $Hs′, $ψhs)
BenchmarkTools.Trial: 
  memory estimate:  1.15 GiB
  allocs estimate:  32163
  --------------
  minimum time:     1.471 s (7.80% GC)
  median time:      1.482 s (8.55% GC)
  mean time:        1.489 s (8.60% GC)
  maximum time:     1.524 s (9.47% GC)
  --------------
  samples:          4
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [3,2,1]);

julia> @benchmark fbtd($m, $Hs′, $ψhs)
BenchmarkTools.Trial: 
  memory estimate:  1.16 GiB
  allocs estimate:  36531
  --------------
  minimum time:     1.371 s (8.18% GC)
  median time:      1.417 s (9.05% GC)
  mean time:        1.412 s (9.05% GC)
  maximum time:     1.444 s (9.88% GC)
  --------------
  samples:          4
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [2,3,1]);

julia> @benchmark ftbd($m, $Hs′, $ψhs)
BenchmarkTools.Trial: 
  memory estimate:  1.16 GiB
  allocs estimate:  31995
  --------------
  minimum time:     1.420 s (7.78% GC)
  median time:      1.455 s (8.21% GC)
  mean time:        1.448 s (8.38% GC)
  maximum time:     1.461 s (8.31% GC)
  --------------
  samples:          4
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [1,2,3]);

julia> @benchmark gfdtb($m, $Hs′, $ψhs, $θ)
BenchmarkTools.Trial: 
  memory estimate:  4.39 GiB
  allocs estimate:  280749
  --------------
  minimum time:     4.613 s (17.69% GC)
  median time:      4.658 s (18.04% GC)
  mean time:        4.658 s (18.04% GC)
  maximum time:     4.702 s (18.38% GC)
  --------------
  samples:          2
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [1,3,2]);

julia> @benchmark gfdbt($m, $Hs′, $ψhs, $θ)
BenchmarkTools.Trial: 
  memory estimate:  4.39 GiB
  allocs estimate:  286796
  --------------
  minimum time:     4.632 s (17.63% GC)
  median time:      4.668 s (18.03% GC)
  mean time:        4.668 s (18.03% GC)
  maximum time:     4.704 s (18.43% GC)
  --------------
  samples:          2
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [2,1,3]);

julia> @benchmark gftdb($m, $Hs′, $ψhs, $θ)
BenchmarkTools.Trial: 
  memory estimate:  4.39 GiB
  allocs estimate:  279068
  --------------
  minimum time:     4.798 s (15.06% GC)
  median time:      4.959 s (17.10% GC)
  mean time:        4.959 s (17.10% GC)
  maximum time:     5.120 s (19.01% GC)
  --------------
  samples:          2
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [3,2,1]);

julia> @benchmark gfbtd($m, $Hs′, $ψhs, $θ)
BenchmarkTools.Trial: 
  memory estimate:  4.41 GiB
  allocs estimate:  286292
  --------------
  minimum time:     4.584 s (15.84% GC)
  median time:      4.705 s (18.05% GC)
  mean time:        4.705 s (18.05% GC)
  maximum time:     4.827 s (20.14% GC)
  --------------
  samples:          2
  evals/sample:     1

julia> reset!(m); Hs′ = permutedims(Hs, [2,3,1]);

julia> @benchmark gftbd($m, $Hs′, $ψhs, $θ)
BenchmarkTools.Trial: 
  memory estimate:  4.41 GiB
  allocs estimate:  278228
  --------------
  minimum time:     4.809 s (15.06% GC)
  median time:      4.969 s (17.10% GC)
  mean time:        4.969 s (17.10% GC)
  maximum time:     5.129 s (19.02% GC)
  --------------
  samples:          2
  evals/sample:     1

AzamatB / ListenAttendSpell.jl

Experiment with ordering of dimensions (excluding listener and keys computation parts) #1