avik-pal / Wandb.jl

Unofficial Julia bindings for logging experiments to wandb.ai
https://avik-pal.github.io/Wandb.jl/stable/
MIT License
80 stars 10 forks source link

Ctrl+C during Training tends to kill the Wandb Process #19

Open awadell1 opened 2 years ago

awadell1 commented 2 years ago

This is admittedly not shocking, but with a ~10-minute startup time, I'd really like to avoid having to start a new session.

Problem at: (unknown file) 0 (unknown function)
Traceback (most recent call last):
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_init.py", line 954, in init
    run = wi.init()
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_init.py", line 489, in init
    tel.feature.init_return_run = True
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/lib/telemetry.py", line 43, in __exit__
    self._run._telemetry_callback(self._obj)
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 470, in _telemetry_callback
    self._telemetry_flush()
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 481, in _telemetry_flush
    self._backend.interface._publish_telemetry(self._telemetry_obj)
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 73, in _publish_telemetry
    self._publish(rec)
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/interface/interface_queue.py", line 49, in _publish
    raise Exception("The wandb backend process has shutdown")
Exception: The wandb backend process has shutdown
ERROR: LoadError: PyError ($(Expr(:escape, :(ccall(#= /home/awadell/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:43 =# @pysym(:PyObject_Call), PyPtr, (PyPtr, PyPtr, PyPtr), o, pyargsptr, kw))))) <class 'Exception'>
Exception('problem')
wandb: ERROR Abnormal program exit
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_init.py", line 992, in init
    six.raise_from(Exception("problem"), error_seen)
  File "<string>", line 3, in raise_from

Stacktrace:
  [1] pyerr_check
    @ ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:62 [inlined]
  [2] pyerr_check
    @ ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:66 [inlined]
  [3] _handle_error(msg::String)
    @ PyCall ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:83
  [4] macro expansion
    @ ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:97 [inlined]
  [5] #107
    @ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:43 [inlined]
  [6] disable_sigint
    @ ./c.jl:458 [inlined]
  [7] __pycall!
    @ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:42 [inlined]
  [8] _pycall!(ret::PyCall.PyObject, o::PyCall.PyObject, args::Tuple{}, nargs::Int64, kw::PyCall.PyObject)
    @ PyCall ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:29
  [9] _pycall!(ret::PyCall.PyObject, o::PyCall.PyObject, args::Tuple{}, kwargs::Base.Pairs{Symbol, Any, Tuple{Symbol, Symbol, Symbol}, NamedTuple{(:project, :name, :config), Tuple{String, Nothing, Dict{String, Any}}}})
    @ PyCall ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:11
 [10] #_#114
    @ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:86 [inlined]
 [11] WandbLogger(; project::String, name::Nothing, min_level::Base.CoreLogging.LogLevel, step_increment::Int64, start_step::Int64, kwargs::Base.Pairs{Symbol, Dict{String, Any}, Tuple{Symbol}, NamedTuple{(:config,), Tuple{Dict{String, Any}}}})
    @ Wandb ~/project/.julia/packages/Wandb/8Eio5/src/main.jl:19
avik-pal commented 2 years ago

Why would you have to restart the session? You could call Wandb.close(lg) and then restart the logger with the same parameters.

awadell1 commented 2 years ago

I had been including a script like this, so I didn't have access to the logger.

function train()
lg = WandbLogger(...)

# Training stuff
end

train()

But even after switching to this, once the wandb backend shuts down, all calls to Wandb error out:

function train(lg)
...
end

lg = WandbLogger(...)
train(lg)
Wandb.close(lg)

Specifically, after pressing Ctrl+C in the training loop, calling Wandb.close(lg) gives:

julia> Wandb.close(lg)
ERROR: PyError ($(Expr(:escape, :(ccall(#= /home/awadell/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:43 =# @pysym(:PyObject_Call), PyPtr, (PyPtr, PyPtr, PyPtr), o, pyargsptr, kw))))) <class 'Exception'>
Exception('The wandb backend process has shutdown')
Exception in thread NetStatThr:
    raise Exception("The wandb backend process has shutdown")
Traceback (most recent call last):
Exception: The wandb backend process has shutdown
  File "/home/awadell/project/.spack-env/._view/nz3aetbopueaqzrd7hqzg2w3h5busd3r/lib/python3.9/threading.py", line 973, in _bootstrap_inner
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 1538, in finish
    tel.feature.finish = True
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/lib/telemetry.py", line 43, in __exit__
    self._run._telemetry_callback(self._obj)
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 470, in _telemetry_callback
    self._telemetry_flush()
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 481, in _telemetry_flush
    self._backend.interface._publish_telemetry(self._telemetry_obj)
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 73, in _publish_telemetry
    self._publish(rec)
  File "/home/awadell/project/.spack-env/view/lib/python3.9/site-packages/wandb/sdk/interface/interface_queue.py", line 49, in _publish
    raise Exception("The wandb backend process has shutdown")

Stacktrace:
  [1] pyerr_check
    @ ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:62 [inlined]
  [2] pyerr_check
    @ ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:66 [inlined]
  [3] _handle_error(msg::String)
    @ PyCall ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:83
  [4] macro expansion
    @ ~/project/.julia/packages/PyCall/7a7w0/src/exception.jl:97 [inlined]
  [5] #107
    @ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:43 [inlined]
  [6] disable_sigint
    @ ./c.jl:458 [inlined]
  [7] __pycall!
    @ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:42 [inlined]
  [8] _pycall!(ret::PyCall.PyObject, o::PyCall.PyObject, args::Tuple{}, nargs::Int64, kw::Ptr{Nothing})
    @ PyCall ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:29
  [9] _pycall!
    @ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:11 [inlined]
 [10] #_#114
    @ ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:86 [inlined]
 [11] (::PyCall.PyObject)()
    @ PyCall ~/project/.julia/packages/PyCall/7a7w0/src/pyfncall.jl:86
 [12] close(lg::WandbLogger; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Wandb ~/project/.julia/packages/Wandb/8Eio5/src/main.jl:46
 [13] close(lg::WandbLogger)
    @ Wandb ~/project/.julia/packages/Wandb/8Eio5/src/main.jl:46
 [14] top-level scope
    @ REPL[3]:1
 [15] top-level scope
    @ ~/project/.julia/packages/CUDA/5jdFl/src/initialization.jl:52

So if I want to keep using Wandb to log things (I do, great package btw), I need to restart Julia. Or at least that's the only strategy I've found that works.