slimgroup / JUDI.jl

Julia Devito inversion.
https://slimgroup.github.io/JUDI.jl
MIT License
94 stars 29 forks source link

TypeError("cannot pickle 'traceback' object") #260

Closed kerim371 closed 1 week ago

kerim371 commented 1 week ago

Hi,

When running FWI on a CentOS 7 cluster, I intermittently get the following error:

      From worker 5:    Building forward operator
      From worker 6:    Building forward operator
      From worker 3:    Building forward operator
      From worker 4:    Building forward operator
      From worker 2:    Building forward operator
      From worker 2:    Trying to allocate more memory for symbol us_u than available on physical device, this will start swapping
      From worker 2:    Trying to allocate more memory for symbol us_u than available on physical device, this will start swapping
      From worker 5:    Operator `forward` ran in 29.85 s
      From worker 5:    Building adjoint born operator
      From worker 3:    Operator `forward` ran in 42.70 s
      From worker 3:    Building adjoint born operator
      From worker 4:    Operator `forward` ran in 52.22 s
      From worker 4:    Building adjoint born operator
      From worker 6:    Operator `forward` ran in 78.56 s
      From worker 6:    Building adjoint born operator
      From worker 5:    Operator `forward` ran in 29.58 s
      From worker 3:    Operator `forward` ran in 42.38 s
      From worker 4:    Operator `forward` ran in 52.38 s
      From worker 6:    Operator `forward` ran in 78.33 s
      From worker 5:    ┌ Error: Fatal error on process 5
      From worker 5:    │   exception =
      From worker 5:    │    PyError ($(Expr(:escape, :(ccall(#= /home/kerim/.julia/packages/PyCall/1gn3u/src/pyfncall.jl:43 =# @pysym(:PyObject_Call), PyPtr, (PyPtr, PyPtr, PyPtr), o, pyargsptr, kw))))) <class 'TypeError'>
      From worker 5:    │    TypeError("cannot pickle 'traceback' object")
      From worker 5:    │    
      From worker 5:    │    Stacktrace:
      From worker 5:    │      [1] pyerr_check
      From worker 5:    │        @ ~/.julia/packages/PyCall/1gn3u/src/exception.jl:75 [inlined]
      From worker 5:    │      [2] pyerr_check
      From worker 5:    │        @ ~/.julia/packages/PyCall/1gn3u/src/exception.jl:79 [inlined]
      From worker 5:    │      [3] _handle_error(msg::String)
      From worker 5:    │        @ PyCall ~/.julia/packages/PyCall/1gn3u/src/exception.jl:96
      From worker 5:    │      [4] macro expansion
      From worker 5:    │        @ ~/.julia/packages/PyCall/1gn3u/src/exception.jl:110 [inlined]
      From worker 5:    │      [5] #107
      From worker 5:    │        @ ~/.julia/packages/PyCall/1gn3u/src/pyfncall.jl:43 [inlined]
      From worker 5:    │      [6] disable_sigint
      From worker 5:    │        @ ./c.jl:473 [inlined]
      From worker 5:    │      [7] __pycall!
      From worker 5:    │        @ ~/.julia/packages/PyCall/1gn3u/src/pyfncall.jl:42 [inlined]
      From worker 5:    │      [8] _pycall!(ret::PyCall.PyObject, o::PyCall.PyObject, args::Tuple{PyCall.PyObject}, nargs::Int64, kw::Ptr{Nothing})
      From worker 5:    │        @ PyCall ~/.julia/packages/PyCall/1gn3u/src/pyfncall.jl:29
      From worker 5:    │      [9] _pycall!
      From worker 5:    │        @ ~/.julia/packages/PyCall/1gn3u/src/pyfncall.jl:11 [inlined]
      From worker 5:    │     [10] #pycall#112
      From worker 5:    │        @ ~/.julia/packages/PyCall/1gn3u/src/pyfncall.jl:80 [inlined]
      From worker 5:    │     [11] pycall
      From worker 5:    │        @ ~/.julia/packages/PyCall/1gn3u/src/pyfncall.jl:80 [inlined]
      From worker 5:    │     [12] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, pyo::PyCall.PyObject)
      From worker 5:    │        @ PyCall ~/.julia/packages/PyCall/1gn3u/src/serialize.jl:14
      From worker 5:    │     [13] serialize_any(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
      From worker 5:    │        @ Serialization ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:678
      From worker 5:    │     [14] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
      From worker 5:    │        @ Serialization ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:657
      From worker 5:    │     [15] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, ex::CapturedException)
      From worker 5:    │        @ Distributed ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/clusterserialize.jl:192
      From worker 5:    │     [16] serialize_any(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
      From worker 5:    │        @ Serialization ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:678
      From worker 5:    │     [17] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
      From worker 5:    │        @ Serialization ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:657
      From worker 5:    │     [18] serialize_msg(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, o::Distributed.ResultMsg)
      From worker 5:    │        @ Distributed ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/messages.jl:78
      From worker 5:    │     [19] #invokelatest#2
      From worker 5:    │        @ ./essentials.jl:819 [inlined]
      From worker 5:    │     [20] invokelatest
      From worker 5:    │        @ ./essentials.jl:816 [inlined]
      From worker 5:    │     [21] send_msg_(w::Distributed.Worker, header::Distributed.MsgHeader, msg::Distributed.ResultMsg, now::Bool)
      From worker 5:    │        @ Distributed ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/messages.jl:181
      From worker 5:    │     [22] send_msg_now
      From worker 5:    │        @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/messages.jl:118 [inlined]
      From worker 5:    │     [23] send_msg_now(s::Sockets.TCPSocket, header::Distributed.MsgHeader, msg::Distributed.ResultMsg)
      From worker 5:    │        @ Distributed ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/messages.jl:113
      From worker 5:    │     [24] deliver_result(sock::Sockets.TCPSocket, msg::Symbol, oid::Distributed.RRID, value::RemoteException)
      From worker 5:    │        @ Distributed ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:102
      From worker 5:    │     [25] macro expansion
      From worker 5:    │        @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:293 [inlined]
      From worker 5:    │     [26] (::Distributed.var"#109#111"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})()
      From worker 5:    │        @ Distributed ./task.jl:514
      From worker 5:    └ @ Distributed ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:106
ERROR: LoadError: TaskFailedException

    nested task error: On worker 6:
    ProcessExitedException(5)
    Stacktrace:
      [1] try_yieldto
        @ ./task.jl:920
      [2] wait
        @ ./task.jl:984
      [3] #wait#621
        @ ./condition.jl:130
      [4] wait
        @ ./condition.jl:125 [inlined]
      [5] take_buffered
        @ ./channels.jl:457
      [6] take!
        @ ./channels.jl:451
      [7] take!
        @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:726
      [8] #remotecall_fetch#159
        @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:461
      [9] remotecall_fetch
        @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:454
     [10] #remotecall_fetch#162
        @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:492 [inlined]
     [11] remotecall_fetch
        @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:492 [inlined]
     [12] local_reduce!
        @ ~/.julia/packages/JUDI/26Aci/src/TimeModeling/Modeling/distributed.jl:38
     [13] #invokelatest#2
        @ ./essentials.jl:819
     [14] invokelatest
        @ ./essentials.jl:816
     [15] #114
        @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:301
     [16] run_work_thunk
        @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:70
     [17] run_work_thunk
        @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:79
     [18] #100
        @ ./task.jl:514
    Stacktrace:
     [1] remotecall_wait(::Function, ::Distributed.Worker, ::Future, ::Vararg{Future}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
       @ Distributed ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:507
     [2] remotecall_wait
       @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:497 [inlined]
     [3] #remotecall_wait#167
       @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:519 [inlined]
     [4] remotecall_wait
       @ ~/shared_app/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:519 [inlined]
     [5] (::JUDI.var"#293#294"{Vector{Future}, Int64})()
       @ JUDI ./task.jl:514
Stacktrace:
  [1] sync_end(c::Channel{Any})
    @ Base ./task.jl:445
  [2] macro expansion
    @ ./task.jl:477 [inlined]
  [3] reduce_level!(futures::Vector{Future}, nleaf::Int64)
    @ JUDI ~/.julia/packages/JUDI/26Aci/src/TimeModeling/Modeling/distributed.jl:53
  [4] reduce!(futures::Vector{Future})
    @ JUDI ~/.julia/packages/JUDI/26Aci/src/TimeModeling/Modeling/distributed.jl:75
  [5] run_and_reduce(func::Function, pool::WorkerPool, nsrc::Int64, arg_func::JUDI.var"#337#340"{JUDIOptions, JUDI.IsoModel{Float32, 3}, judiVector{Float32, Matrix{Float32}}, JUDI.LazyMul{Float32}, Nothing}; kw::JUDI.var"#338#341"{Base.Pairs{Symbol, Any, Tuple{Symbol, Symbol, Symbol}, NamedTuple{(:nlind, :lin, :misfit), Tuple{Bool, Bool, typeof(myloss)}}}, Bool})
    @ JUDI ~/.julia/packages/JUDI/26Aci/src/TimeModeling/Modeling/propagation.jl:33
  [6] multi_src_fg!(G::PhysicalParameter{Float32, 3}, model::JUDI.IsoModel{Float32, 3}, q::judiVector{Float32, Matrix{Float32}}, dobs::JUDI.LazyMul{Float32}, dm::Nothing; options::JUDIOptions, kw::Base.Pairs{Symbol, Any, Tuple{Symbol, Symbol, Symbol}, NamedTuple{(:nlind, :lin, :misfit), Tuple{Bool, Bool, typeof(myloss)}}})
    @ JUDI ~/.julia/packages/JUDI/26Aci/src/TimeModeling/Modeling/propagation.jl:114
  [7] multi_src_fg!
    @ ~/.julia/packages/JUDI/26Aci/src/TimeModeling/Modeling/propagation.jl:105 [inlined]
  [8] #multi_exp_fg!#314
    @ ~/.julia/packages/JUDI/26Aci/src/TimeModeling/Modeling/misfit_fg.jl:203 [inlined]
  [9] multi_exp_fg!
    @ ~/.julia/packages/JUDI/26Aci/src/TimeModeling/Modeling/misfit_fg.jl:203 [inlined]
 [10] fwi_objective!(G::PhysicalParameter{Float32, 3}, model::JUDI.IsoModel{Float32, 3}, q::judiVector{Float32, Matrix{Float32}}, dobs::JUDI.LazyMul{Float32}; options::JUDIOptions, kw::Base.Pairs{Symbol, typeof(myloss), Tuple{Symbol}, NamedTuple{(:misfit,), Tuple{typeof(myloss)}}})
    @ JUDI ~/.julia/packages/JUDI/26Aci/src/TimeModeling/Modeling/misfit_fg.jl:184
 [11] fwi_objective(model::JUDI.IsoModel{Float32, 3}, q::judiVector{Float32, Matrix{Float32}}, dobs::JUDI.LazyMul{Float32}; options::JUDIOptions, kw::Base.Pairs{Symbol, typeof(myloss), Tuple{Symbol}, NamedTuple{(:misfit,), Tuple{typeof(myloss)}}})
    @ JUDI ~/.julia/packages/JUDI/26Aci/src/TimeModeling/Modeling/misfit_fg.jl:149
 [12] objective_function(m_update::Array{Float64, 3})
    @ Main ~/shared/phys_model_inversion/inversion/fwi_spg.jl:274
 [13] (::SlimOptim.var"#objgrad!#11"{typeof(objective_function), result{Float32}})(g::Array{Float32, 3}, x::Array{Float64, 3})
    @ SlimOptim ~/.julia/packages/SlimOptim/jaZj3/src/SPGSlim.jl:99
 [14] _spg(obj::Function, grad!::SlimOptim.var"#grad!#10"{typeof(objective_function), result{Float32}}, objgrad!::SlimOptim.var"#objgrad!#11"{typeof(objective_function), result{Float32}}, projection::SlimOptim.var"#projection#9"{typeof(proj), result{Float32}}, x::Array{Float32, 3}, g::Array{Float32, 3}, sol::result{Float32}, ls::Nothing, options::SlimOptim.SPG_params; callback::typeof(SlimOptim.noop_callback))
    @ SlimOptim ~/.julia/packages/SlimOptim/jaZj3/src/SPGSlim.jl:172
 [15] spg(funObj::typeof(objective_function), x::Array{Float32, 3}, funProj::typeof(proj), options::SlimOptim.SPG_params; ls::Nothing, callback::Function)
    @ SlimOptim ~/.julia/packages/SlimOptim/jaZj3/src/SPGSlim.jl:103
 [16] spg(funObj::Function, x::Array{Float32, 3}, funProj::Function, options::SlimOptim.SPG_params)
    @ SlimOptim ~/.julia/packages/SlimOptim/jaZj3/src/SPGSlim.jl:89
 [17] top-level scope
    @ ~/shared/phys_model_inversion/inversion/fwi_spg.jl:322
in expression starting at /home/kerim/shared/phys_model_inversion/inversion/fwi_spg.jl:322

Options structure:

jopt = JUDI.Options(
    IC = "fwi",
    limit_m = true,
    buffer_size = buffer_size,
    optimal_checkpointing=false,
    subsampling_factor=2,
    free_surface=true,  
    space_order=16) 

Any ideas what might be causing this behaviour?

JUDI: v3.4.4 Devito: 4.8.8

kerim371 commented 1 week ago

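Oh, I guess this is probably caused by insufficient RAM. The log warns that allocating the symbol `us_u` needs more memory than is available on the physical device and will start swapping, and that swapping may be the reason for the failure.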