Open felixcremer opened 1 year ago
I now got a full failure of the mapCube call with this EOFError and I get this stacktrace:
julia> @time rqatrendvh = RQADeforestation.rqatrend(cubevh,thresh=4)
Progress: 11%|█████████████████▌ | ETA: 6:55:29Worker 38 terminated.
Unhandled Task ERROR: EOFError: read end of file
Stacktrace:
[1] (::Base.var"#wait_locked#680")(s::Sockets.TCPSocket, buf::IOBuffer, nb::Int64)
@ Base ./stream.jl:945
[2] unsafe_read(s::Sockets.TCPSocket, p::Ptr{UInt8}, nb::UInt64)
@ Base ./stream.jl:953
[3] unsafe_read
@ ./io.jl:759 [inlined]
[4] unsafe_read(s::Sockets.TCPSocket, p::Base.RefValue{NTuple{4, Int64}}, n::Int64)
@ Base ./io.jl:758
[5] read!
@ ./io.jl:760 [inlined]
[6] deserialize_hdr_raw
@ ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/messages.jl:167 [inlined]
[7] message_handler_loop(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool)
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:172
[8] process_tcp_streams(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool)
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:133
[9] (::Distributed.var"#103#104"{Sockets.TCPSocket, Sockets.TCPSocket, Bool})()
@ Distributed ./task.jl:484
Progress: 11%|█████████████████▉ | ETA: 7:34:34ERROR: ProcessExitedException(38)
Stacktrace:
[1] try_yieldto(undo::typeof(Base.ensure_rescheduled))
@ Base ./task.jl:871
[2] wait()
@ Base ./task.jl:931
[3] wait(c::Base.GenericCondition{ReentrantLock})
@ Base ./condition.jl:124
[4] take_buffered(c::Channel{Any})
@ Base ./channels.jl:416
[5] take!(c::Channel{Any})
@ Base ./channels.jl:410
[6] take!(::Distributed.RemoteValue)
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:726
[7] remotecall_fetch(f::Function, w::Distributed.Worker, args::Tuple{UnitRange{Int64}, UnitRange{Int64}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:461
[8] remotecall_fetch(f::Function, w::Distributed.Worker, args::Tuple{UnitRange{Int64}, UnitRange{Int64}})
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:454
[9] remotecall_fetch(f::Function, id::Int64, args::Tuple{UnitRange{Int64}, UnitRange{Int64}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:492
[10] remotecall_fetch
@ ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:492 [inlined]
[11] remotecall_pool(rc_f::typeof(remotecall_fetch), f::Function, pool::WorkerPool, args::Tuple{UnitRange{Int64}, UnitRange{Int64}}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/workerpool.jl:123
[12] remotecall_pool
@ ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/workerpool.jl:120 [inlined]
[13] #remotecall_fetch#200
@ ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/workerpool.jl:229 [inlined]
[14] remotecall_fetch
@ ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/workerpool.jl:229 [inlined]
[15] #208#209
@ ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/workerpool.jl:274 [inlined]
[16] #208
@ ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/workerpool.jl:274 [inlined]
[17] (::Base.var"#929#934"{Distributed.var"#208#210"{Distributed.var"#208#209#211"{WorkerPool, ProgressMeter.var"#56#59"{RemoteChannel{Channel{Bool}}, YAXArrays.DAT.var"#fnew#99"{YAXArrays.DAT.var"#107#111"{YAXArrays.DAT.DATConfig{1, 1}}, Future}}}}})(r::Base.RefValue{Any}, args::Tuple{Tuple{UnitRange{Int64}, UnitRange{Int64}}})
@ Base ./asyncmap.jl:100
[18] macro expansion
@ ./asyncmap.jl:234 [inlined]
[19] (::Base.var"#945#946"{Base.var"#929#934"{Distributed.var"#208#210"{Distributed.var"#208#209#211"{WorkerPool, ProgressMeter.var"#56#59"{RemoteChannel{Channel{Bool}}, YAXArrays.DAT.var"#fnew#99"{YAXArrays.DAT.var"#107#111"{YAXArrays.DAT.DATConfig{1, 1}}, Future}}}}}, Channel{Any}, Nothing})()
@ Base ./task.jl:484
Stacktrace:
[1] (::Base.var"#939#941")(x::Task)
@ Base ./asyncmap.jl:177
[2] foreach(f::Base.var"#939#941", itr::Vector{Any})
@ Base ./abstractarray.jl:2774
[3] maptwice(wrapped_f::Function, chnl::Channel{Any}, worker_tasks::Vector{Any}, c::DiskArrays.GridChunks{2})
@ Base ./asyncmap.jl:177
[4] wrap_n_exec_twice
@ ./asyncmap.jl:153 [inlined]
[5] #async_usemap#924
@ ./asyncmap.jl:103 [inlined]
[6] #asyncmap#923
@ ./asyncmap.jl:81 [inlined]
[7] pmap(f::Function, p::WorkerPool, c::DiskArrays.GridChunks{2}; distributed::Bool, batch_size::Int64, on_error::Nothing, retry_delays::Vector{Any}, retry_check::Nothing)
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/pmap.jl:126
[8] pmap(f::Function, p::WorkerPool, c::DiskArrays.GridChunks{2})
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/pmap.jl:99
[9] macro expansion
@ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1015 [inlined]
[10] macro expansion
@ ./task.jl:454 [inlined]
[11] macro expansion
@ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1014 [inlined]
[12] macro expansion
@ ./task.jl:454 [inlined]
[13] progress_map(::Function, ::Vararg{Any}; mapfun::typeof(pmap), progress::ProgressMeter.Progress, channel_bufflen::Int64, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ ProgressMeter ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1007
[14] #progress_pmap#60
@ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1032 [inlined]
[15] pmap_with_data(f::Function, p::WorkerPool, c::DiskArrays.GridChunks{2}; initfunc::Function, progress::ProgressMeter.Progress, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:668
[16] pmap_with_data(f::Function, c::DiskArrays.GridChunks{2}; initfunc::Function, kwargs::Base.Pairs{Symbol, ProgressMeter.Progress, Tuple{Symbol}, NamedTuple{(:progress,), Tuple{ProgressMeter.Progress}}})
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:673
[17] runLoop(dc::YAXArrays.DAT.DATConfig{1, 1}, showprog::Bool)
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:698
[18] mapCube(fu::typeof(RQADeforestation.rqatrend), cdata::Tuple{YAXArray{Union{Missing, Float16}, 3, DiskArrayTools.CFDiskArray{Float16, 3, Int16, DiskArrayTools.DiskArrayStack{Int16, 3, RQADeforestation.BufferGDALBand{Int16}, 1}}, Vector{RangeAxis}}}, addargs::Int64; max_cache::Float64, indims::InDims, outdims::OutDims, inplace::Bool, ispar::Bool, debug::Bool, include_loopvars::Bool, showprog::Bool, irregular_loopranges::Bool, nthreads::Dict{Int64, Int64}, loopchunksize::Dict{Any, Any}, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:475
[19] #mapCube#36
@ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:303 [inlined]
[20] #rqatrend#7
@ ~/RQADeforestation/src/analysis.jl:47 [inlined]
[21] top-level scope
@ ./timing.jl:262 [inlined]
[22] top-level scope
@ ~/RQADeforestation/scripts/run_rqatrend.jl:0
@meggart do you have any hint as to how I could debug this?
I just started the analysis with 64 threads and without workers, to see whether it would run through and how long it might take.
Running the whole analysis with 64 threads leads to an estimated duration of four and a half days. It seems that even partial use of Distributed would be beneficial, so I would like to fix this bug.
julia> @time rqatrendvh = RQADeforestation.rqatrend(cubevh,thresh=4)
^CProgress: 0%|▍ | ETA: 4 days, 12:48:54
After the process failed with the EOFError, trying it again gives the following error:
julia> rqatrendvh = RQADeforestation.rqatrend(cubevh,thresh=4)
ERROR: ProcessExitedException(2)
Stacktrace:
[1] worker_from_id(pg::Distributed.ProcessGroup, i::Int64)
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/cluster.jl:1093
[2] worker_from_id
@ ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/cluster.jl:1090 [inlined]
[3] remotecall(::Function, ::Int64; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:447
[4] remotecall
@ ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:447 [inlined]
[5] #95
@ ./none:0 [inlined]
[6] iterate
@ ./generator.jl:47 [inlined]
[7] _all(f::Base.var"#343#345", itr::Base.Generator{Vector{Int64}, YAXArrays.DAT.var"#95#97"{YAXArrays.DAT.var"#106#110"{Future}}}, #unused#::Colon)
@ Base ./reduce.jl:1260
[8] all
@ ./reduce.jl:1246 [inlined]
[9] Dict(kv::Base.Generator{Vector{Int64}, YAXArrays.DAT.var"#95#97"{YAXArrays.DAT.var"#106#110"{Future}}})
@ Base ./dict.jl:131
[10] pmap_with_data(f::Function, p::WorkerPool, c::DiskArrays.GridChunks{2}; initfunc::Function, progress::ProgressMeter.Progress, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ YAXArrays.DAT ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:660
[11] pmap_with_data(f::Function, c::DiskArrays.GridChunks{2}; initfunc::Function, kwargs::Base.Pairs{Symbol, ProgressMeter.Progress, Tuple{Symbol}, NamedTuple{(:progress,), Tuple{ProgressMeter.Progress}}})
@ YAXArrays.DAT ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:676
[12] runLoop(dc::YAXArrays.DAT.DATConfig{1, 1}, showprog::Bool)
@ YAXArrays.DAT ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:701
[13] mapCube(fu::typeof(RQADeforestation.rqatrend), cdata::Tuple{YAXArray{Union{Missing, Float16}, 3, DiskArrayTools.CFDiskArray{Float16, 3, Int16, DiskArrayTools.DiskArrayStack{Int16, 3, RQADeforestation.BufferGDALBand{Int16}, 1}}, Vector{RangeAxis}}}, addargs::Int64; max_cache::Float64, indims::InDims, outdims::OutDims, inplace::Bool, ispar::Bool, debug::Bool, include_loopvars::Bool, showprog::Bool, irregular_loopranges::Bool, nthreads::Dict{Int64, Int64}, loopchunksize::Dict{Any, Any}, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ YAXArrays.DAT ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:475
[14] #mapCube#36
@ ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:303 [inlined]
[15] #rqatrend#7
@ ~/RQADeforestation/src/analysis.jl:47 [inlined]
[16] top-level scope
@ REPL[34]:1
caused by: ProcessExitedException(2)
Stacktrace:
[1] worker_from_id(pg::Distributed.ProcessGroup, i::Int64)
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/cluster.jl:1093
[2] worker_from_id
@ ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/cluster.jl:1090 [inlined]
[3] remotecall(::Function, ::Int64; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:447
[4] remotecall
@ ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:447 [inlined]
[5] #95
@ ./none:0 [inlined]
[6] iterate
@ ./generator.jl:47 [inlined]
[7] Dict{Int64, Future}(kv::Base.Generator{Vector{Int64}, YAXArrays.DAT.var"#95#97"{YAXArrays.DAT.var"#106#110"{Future}}})
@ Base ./dict.jl:105
[8] dict_with_eltype
@ ./abstractdict.jl:575 [inlined]
[9] dict_with_eltype
@ ./abstractdict.jl:582 [inlined]
[10] Dict(kv::Base.Generator{Vector{Int64}, YAXArrays.DAT.var"#95#97"{YAXArrays.DAT.var"#106#110"{Future}}})
@ Base ./dict.jl:129
[11] pmap_with_data(f::Function, p::WorkerPool, c::DiskArrays.GridChunks{2}; initfunc::Function, progress::ProgressMeter.Progress, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ YAXArrays.DAT ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:660
[12] pmap_with_data(f::Function, c::DiskArrays.GridChunks{2}; initfunc::Function, kwargs::Base.Pairs{Symbol, ProgressMeter.Progress, Tuple{Symbol}, NamedTuple{(:progress,), Tuple{ProgressMeter.Progress}}})
@ YAXArrays.DAT ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:676
[13] runLoop(dc::YAXArrays.DAT.DATConfig{1, 1}, showprog::Bool)
@ YAXArrays.DAT ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:701
[14] mapCube(fu::typeof(RQADeforestation.rqatrend), cdata::Tuple{YAXArray{Union{Missing, Float16}, 3, DiskArrayTools.CFDiskArray{Float16, 3, Int16, DiskArrayTools.DiskArrayStack{Int16, 3, RQADeforestation.BufferGDALBand{Int16}, 1}}, Vector{RangeAxis}}}, addargs::Int64; max_cache::Float64, indims::InDims, outdims::OutDims, inplace::Bool, ispar::Bool, debug::Bool, include_loopvars::Bool, showprog::Bool, irregular_loopranges::Bool, nthreads::Dict{Int64, Int64}, loopchunksize::Dict{Any, Any}, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ YAXArrays.DAT ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:475
[15] #mapCube#36
@ ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:303 [inlined]
[16] #rqatrend#7
@ ~/RQADeforestation/src/analysis.jl:47 [inlined]
[17] top-level scope
@ REPL[34]:1
@ Base ./asyncmap.jl:177
[4] wrap_n_exec_twice
@ ./asyncmap.jl:153 [inlined]
[5] #async_usemap#924
@ ./asyncmap.jl:103 [inlined]
[6] #asyncmap#923
@ ./asyncmap.jl:81 [inlined]
[7] pmap(f::Function, p::CachingPool, c::DiskArrays.GridChunks{2}; distributed::Bool, batch_size::Int64, on_error::Nothing, retry_delays::Vector{Any}, retry_check::Nothing)
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/pmap.jl:126
[8] pmap(f::Function, p::CachingPool, c::DiskArrays.GridChunks{2})
@ Distributed ~/.julia/juliaup/julia-1.8.5+0.x64.linux.gnu/share/julia/stdlib/v1.8/Distributed/src/pmap.jl:99
[9] macro expansion
@ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1015 [inlined]
[10] macro expansion
@ ./task.jl:454 [inlined]
[11] macro expansion
@ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1014 [inlined]
[12] macro expansion
@ ./task.jl:454 [inlined]
[13] progress_map(::Function, ::Vararg{Any}; mapfun::typeof(pmap), progress::ProgressMeter.Progress, channel_bufflen::Int64, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ ProgressMeter ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1007
[14] #progress_pmap#60
@ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1032 [inlined]
[15] progress_pmap
@ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1032 [inlined]
[16] runLoop(dc::YAXArrays.DAT.DATConfig{1, 1}, showprog::Bool)
@ YAXArrays.DAT ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:706
[17] mapCube(fu::typeof(RQADeforestation.rqatrend), cdata::Tuple{YAXArray{Union{Missing, Float16}, 3, DiskArrayTools.CFDiskArray{Float16, 3, Int16, DiskArrayTools.DiskArrayStack{Int16, 3, RQADeforestation.BufferGDALBand{Int16}, 1}}, Vector{RangeAxis}}}, addargs::Int64; max_cache::Float64, indims::InDims, outdims::OutDims, inplace::Bool, ispar::Bool, debug::Bool, include_loopvars::Bool, showprog::Bool, irregular_loopranges::Bool, nthreads::Dict{Int64, Int64}, loopchunksize::Dict{Any, Any}, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ YAXArrays.DAT ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:476
[18] #mapCube#36
@ ~/.julia/dev/YAXArrays/src/DAT/DAT.jl:304 [inlined]
[19] #rqatrend#7
@ ~/RQADeforestation/src/analysis.jl:47 [inlined]
[20] macro expansion
@ ./timing.jl:262 [inlined]
[21] (::var"#1#2")()
@ Main ~/RQADeforestation/scripts/run_rqatrend.jl:42
[22] redirect_stdio(f::var"#1#2"; stdin::Nothing, stderr::String, stdout::String)
@ Base ./stream.jl:1411
[23] top-level scope
@ ~/RQADeforestation/scripts/run_rqatrend.jl:39
in expression starting at /home/ubuntu/RQADeforestation/scripts/run_rqatrend.jl:39
error: <inline asm>:1:2: invalid character in input
I just got the following error on one of the workers, and I have no idea where this came from, but I want to capture it, if it happens again: