Closed jlchan closed 3 years ago
julia> @time matmul!(b,A,x,1.0,1.0)
9.448104 seconds (10.53 M allocations: 619.813 MiB, 1.38% gc time, 100.00% compilation time)
5-element Vector{Float64}:
-1.455691725448457
1.5943697910139107
-2.2440219803017865
-3.300358562850599
0.9222580802381243
Which version?
Weird - the MWE works on a fresh REPL session for me too. We're observing that the 5-arg matmul!
with vector arguments hangs within Trixi.jl, so maybe something is going wrong in the process of running a Trixi solver.
I'm working on making an MWE which actually reproduces this behavior. In the meantime, would a stacktrace be useful?
I'm working on making an MWE which actually reproduces this behavior. In the meantime, would a stacktrace be useful?
Great. Yes, a stack trace would probably be helpful.
Hm, it might be something related to VSCode too. My code runs fine in the REPL, but hangs when running through VSCode. The stacktrace for VSCode is
ERROR: LoadError: InterruptException:
Stacktrace:
[1] try_yieldto(undo::typeof(Base.ensure_rescheduled))
@ Base ./task.jl:710
[2] wait
@ ./task.jl:769 [inlined]
[3] yield()
@ Base ./task.jl:662
[4] wait
@ ~/.julia/packages/ThreadingUtilities/pkz6e/src/threadtasks.jl:62 [inlined]
[5] macro expansion
@ ~/.julia/packages/LoopVectorization/O1tX1/src/codegen/lower_threads.jl:437 [inlined]
[6] macro expansion
@ ~/.julia/packages/LoopVectorization/O1tX1/src/reconstruct_loopset.jl:706 [inlined]
[7] _turbo_!(::Val{(false, 0, 0, false, 4, 32, 15, 64, 32768, 262144, 16777216, 0x0000000000000008)}, ::ERROR: MethodError: no method matching namemap(::Type{LoopVectorization.OperationType})
The applicable method may be too new: running in world age 29642, while current world is 30025.
Closest candidates are:
namemap(::Type{LoopVectorization.OperationType}) at Enums.jl:195 (method too new to be called from this world context.)
namemap(::Type{Tokenize.Tokens.TokenError}) at Enums.jl:195
namemap(::Type{MPI.MPIImpl}) at Enums.jl:195 (method too new to be called from this world context.)
...
Stacktrace:
[1] Symbol(x::LoopVectorization.OperationType)
@ Base.Enums ./Enums.jl:26
[2] show(io::IOContext{IOBuffer}, x::LoopVectorization.OperationType)
@ Base.Enums ./Enums.jl:31
[3] _show_default(io::IOContext{IOBuffer}, x::Any)
@ Base ./show.jl:412
[4] show_default
@ ./show.jl:395 [inlined]
[5] show
@ ./show.jl:390 [inlined]
[6] show_delim_array(io::IOContext{IOBuffer}, itr::Tuple{Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct}, op::Char, delim::Char, cl::Char, delim_one::Bool, i1::Int64, n::Int64)
@ Base ./show.jl:1124
[7] show_delim_array
@ ./show.jl:1109 [inlined]
[8] show(io::IOContext{IOBuffer}, t::Tuple{Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct, Symbol, Symbol, LoopVectorization.OperationStruct})
@ Base ./show.jl:1142
[9] show_datatype(io::IOContext{IOBuffer}, x::DataType)
@ Base ./show.jl:928
[10] show(io::IOContext{IOBuffer}, x::Type)
@ Base ./show.jl:819
[11] sprint(f::Function, args::Type; context::IOContext{Base.TTY}, sizehint::Int64)
@ Base ./strings/io.jl:103
[12] print_type_stacktrace(io::IOContext{Base.TTY}, type::Type; color::Symbol)
@ Base ./show.jl:2262
[13] print_type_stacktrace(io::IOContext{Base.TTY}, type::Type)
@ Base ./show.jl:2262
[14] show_tuple_as_call(io::IOContext{Base.TTY}, name::Symbol, sig::Type, demangle::Bool, kwargs::Nothing, argnames::Vector{Symbol}, qualified::Bool)
@ Base ./show.jl:2243
[15] show_tuple_as_call(io::IOContext{Base.TTY}, name::Symbol, sig::Type, demangle::Bool, kwargs::Nothing, argnames::Vector{Symbol})
@ Base ./show.jl:2220
[16] show_spec_linfo(io::IOContext{Base.TTY}, frame::Base.StackTraces.StackFrame)
@ Base.StackTraces ./stacktraces.jl:241
[17] print_stackframe(io::IOContext{Base.TTY}, i::Int64, frame::Base.StackTraces.StackFrame, n::Int64, digit_align_width::Int64, modulecolor::Symbol)
@ Base ./errorshow.jl:712
[18] print_stackframe(io::IOContext{Base.TTY}, i::Int64, frame::Base.StackTraces.StackFrame, n::Int64, digit_align_width::Int64, modulecolordict::IdDict{Module, Symbol}, modulecolorcycler::Base.Iterators.Stateful{Base.Iterators.Cycle{Vector{Symbol}}, Union{Nothing, Tuple{Symbol, Int64}}})
@ Base ./errorshow.jl:689
[19] show_full_backtrace(io::IOContext{Base.TTY}, trace::Vector{Any}; print_linebreaks::Bool)
@ Base ./errorshow.jl:578
[20] show_backtrace(io::IOContext{Base.TTY}, t::Vector{Base.StackTraces.StackFrame})
@ Base ./errorshow.jl:778
[21] showerror(io::IOContext{Base.TTY}, ex::InterruptException, bt::Vector{Base.StackTraces.StackFrame}; backtrace::Bool)
@ Base ./errorshow.jl:90
[22] showerror(io::IOContext{Base.TTY}, ex::LoadError, bt::Vector{Base.StackTraces.StackFrame}; backtrace::Bool)
@ Base ./errorshow.jl:96
[23] showerror(io::IOContext{Base.TTY}, ex::LoadError, bt::Vector{Base.StackTraces.StackFrame})
@ Base ./errorshow.jl:95
[24] display_repl_error(io::Base.TTY, stack::VSCodeServer.EvalErrorStack)
@ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.2.5/scripts/packages/VSCodeServer/src/repl.jl:166
[25] evalrepl(m::Module, line::Expr, repl::REPL.LineEditREPL, main_mode::REPL.LineEdit.Prompt)
@ VSCodeServer ~/.vscode/extensions/julialang.language-julia-1.2.5/scripts/packages/VSCodeServer/src/repl.jl:122
[26] top-level scope
@ ~/.vscode/extensions/julialang.language-julia-1.2.5/scripts/packages/VSCodeServer/src/repl.jl:100
Mind trying on this branch? https://github.com/JuliaSIMD/ThreadingUtilities.jl/pull/29
Sure - how I make sure I'm using that branch within Octavian? I tried adding Octavian, then adding ThreadingUtilities#manualmemoryandyieldtotask
- will that do it?
It should. If you get a new stacktrace, mind sharing it?
If I run with the manualmemoryandyieldtotask
branch, I get the following error
WARNING: Workqueue inconsistency detected: popfirst!(Workqueue).state != :runnable
ERROR: LoadError: cannot switch to task running on another thread
Stacktrace:
[1] try_yieldto(undo::typeof(Base.ensure_rescheduled))
@ Base ./task.jl:700
[2] yield
@ ./task.jl:681 [inlined]
[3] yield
@ ./task.jl:678 [inlined]
[4] wait
@ ~/.julia/packages/ThreadingUtilities/KZkj9/src/threadtasks.jl:61 [inlined]
[5] wait
@ ~/.julia/packages/ThreadingUtilities/KZkj9/src/threadtasks.jl:55 [inlined]
[6] macro expansion
@ ~/.julia/packages/Polyester/0DPCU/src/batch.jl:89 [inlined]
[7] _batch_no_reserve
@ ~/.julia/packages/Polyester/0DPCU/src/batch.jl:53 [inlined]
[8] batch
@ ~/.julia/packages/Polyester/0DPCU/src/batch.jl:195 [inlined]
[9] macro expansion
@ ~/.julia/packages/Polyester/0DPCU/src/closure.jl:164 [inlined]
[10] macro expansion
@ ~/.julia/dev/Trixi/src/auxiliary/auxiliary.jl:181 [inlined]
Ah, yeah, of course that wouldn't work.
OK - I think I have a more minimal example:
using Polyester, Octavian
using StructArrays, StaticArrays
using LinearAlgebra
mul_by_accum!(A) = let A = A
@inline (out, x)->matmul!(out, A, x, one(eltype(out)), one(eltype(out)))
end
function foo()
A = randn(10,10)
du = StructArray{SVector{2,Float64}}((zeros(10,100),zeros(10,100)))
@batch for col in 1:size(du,2)
x = StructArray{SVector{2,Float64}}((randn(10),randn(10)))
StructArrays.foreachfield(mul_by_accum!(A), view(du, :, col), x)
end
end
This gives the stacktrace
julia> foo()
^C
ERROR: InterruptException:
Stacktrace:
[1] try_yieldto(undo::typeof(Base.ensure_rescheduled))
@ Base ./task.jl:710
[2] wait
@ ./task.jl:769 [inlined]
[3] yield()
@ Base ./task.jl:662
[4] wait
@ ~/.julia/packages/ThreadingUtilities/pkz6e/src/threadtasks.jl:62 [inlined]
[5] macro expansion
@ ~/.julia/packages/LoopVectorization/O1tX1/src/codegen/lower_threads.jl:437 [inlined]
[6] macro expansion
@ ~/.julia/packages/LoopVectorization/O1tX1/src/reconstruct_loopset.jl:706 [inlined]
[7] _turbo_!(::Val{(false, 0, 0, false, 4, 32, 15, 64, 32768, 262144, 16777216, 0x0000000000000008)}, ::Val{(:numericconstant, Symbol("###zero###7###"), LoopVectorization.OperationStruct(0x00000000000000000000000000000001, 0x00000000000000000000000000000000, 0x00000000000000000000000000000002, 0x00000000000000000000000000000000, LoopVectorization.constant, 0x00, 0x01), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x00000000000000000000000000000012, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memload, 0x01, 0x02), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x00000000000000000000000000000002, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memload, 0x02, 0x03), :LoopVectorization, :vfmadd_fast, LoopVectorization.OperationStruct(0x00000000000000000000000000000012, 0x00000000000000000000000000000002, 0x00000000000000000000000000000000, 0x00000000000000000000000000020301, LoopVectorization.compute, 0x00, 0x01), :LoopVectorization, :identity, LoopVectorization.OperationStruct(0x00000000000000000000000000000001, 0x00000000000000000000000000000002, 0x00000000000000000000000000000000, 0x00000000000000000000000000000004, LoopVectorization.compute, 0x00, 0x01), :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.constant, 0x00, 0x04), :LoopVectorization, :LOOPCONSTANTINSTRUCTION, LoopVectorization.OperationStruct(0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.constant, 0x00, 0x05), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x00000000000000000000000000000001, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memload, 0x03, 0x06), :LoopVectorization, :mul_fast, LoopVectorization.OperationStruct(0x00000000000000000000000000000001, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000708, LoopVectorization.compute, 0x00, 0x07), :LoopVectorization, :vfmadd_fast, LoopVectorization.OperationStruct(0x00000000000000000000000000000001, 0x00000000000000000000000000000002, 0x00000000000000000000000000000000, 0x00000000000000000000000000060509, LoopVectorization.compute, 0x00, 0x08), :LoopVectorization, :setindex!, LoopVectorization.OperationStruct(0x00000000000000000000000000000001, 0x00000000000000000000000000000002, 0x00000000000000000000000000000000, 0x0000000000000000000000000000000a, LoopVectorization.memstore, 0x03, 0x09))}, ::Val{(LoopVectorization.ArrayRefStruct{:A, Symbol("##vptr##_A")}(0x00000000000000000000000000000101, 0x00000000000000000000000000000102, 0x00000000000000000000000000000000, 0x00000000000000000000000000000101), LoopVectorization.ArrayRefStruct{:x, Symbol("##vptr##_x")}(0x00000000000000000000000000000001, 0x00000000000000000000000000000002, 0x00000000000000000000000000000000, 0x00000000000000000000000000000001), LoopVectorization.ArrayRefStruct{:y, Symbol("##vptr##_y")}(0x00000000000000000000000000000001, 0x00000000000000000000000000000001, 0x00000000000000000000000000000000, 0x00000000000000000000000000000001))}, ::Val{(0, (), (6, 7), (), (), ((1, LoopVectorization.IntOrFloat),), ())}, ::Val{(:m, :n)}, ::Val{Tuple{Tuple{StrideArraysCore.CloseOpen{StaticInt{0}, Int64}, StrideArraysCore.CloseOpen{StaticInt{0}, Int64}}, Tuple{VectorizationBase.GroupedStridedPointers{Tuple{Ptr{Float64}, Ptr{Float64}, Ptr{Float64}}, (1, 1, 1), (0, 0, 0), ((1, 2), (1,), (1,)), ((1, 2), (1,), (1,)), Tuple{StaticInt{8}, Int64}, Tuple{StaticInt{0}, StaticInt{0}}}, Float64, Float64}}}, ::Int64, ::Int64, ::Ptr{Float64}, ::Ptr{Float64}, ::Ptr{Float64}, ::Int64, ::Float64, ::Float64)
@ LoopVectorization ~/.julia/packages/LoopVectorization/O1tX1/src/reconstruct_loopset.jl:706
[8] _matmul!(y::SubArray{Float64, 1, Matrix{Float64}, Tuple{Base.Slice{Base.OneTo{Int64}}, Int64}, true}, A::Matrix{Float64}, x::Vector{Float64}, α::Float64, β::Float64, MKN::Nothing, contig_axis::Nothing)
@ Octavian ~/.julia/packages/Octavian/ZKFHc/src/matmul.jl:508
[9] matmul!
@ ~/.julia/packages/Octavian/ZKFHc/src/matmul.jl:256 [inlined]
[10] matmul!
@ ~/.julia/packages/Octavian/ZKFHc/src/matmul.jl:242 [inlined]
...
An even more minimal example:
using Polyester, Octavian
using LinearAlgebra
function foo()
A = randn(10,10)
du = zeros(10,100)
@batch for col in 1:size(du,2)
x = randn(10)
matmul!(view(du,:,col),A,x)
end
end
The stacktrace is
julia> foo()
^C
ERROR: InterruptException:
Stacktrace:
[1] try_yieldto(undo::typeof(Base.ensure_rescheduled))
@ Base ./task.jl:710
[2] wait
@ ./task.jl:769 [inlined]
[3] yield()
@ Base ./task.jl:662
[4] wait
@ ~/.julia/packages/ThreadingUtilities/pkz6e/src/threadtasks.jl:62 [inlined]
[5] macro expansion
@ ~/.julia/packages/LoopVectorization/O1tX1/src/codegen/lower_threads.jl:437 [inlined]
[6] macro expansion
@ ~/.julia/packages/LoopVectorization/O1tX1/src/reconstruct_loopset.jl:706 [inlined]
[7] _turbo_!(::
...
The terminal unfortunately froze so I couldn't copy and paste any additional stacktrace.
Also, as before, the code works in the REPL but not in VSCode.
Should be fixed on LoopVectorization master for code @tturbo
, which Octavian.matmul!(::AbstractVector, ...)
does.
Octavian.matmul
with matrices will need a PR there.
Thanks! Works like a charm in Trixi now.
The following code hangs
When
x
,b
are matrix arguments, the code runs fine.For reference, I'm running on