JuliaParallel / ClusterManagers.jl

Other
232 stars 74 forks source link

SGE fails in rmprocs #163

Open aquaresima opened 3 years ago

aquaresima commented 3 years ago

Hi,

after adding and using 4 processors, ClusterManagers fails when it's time to remove the procs. Below is an MWE:

using ClusterManagers
using Distributed

# Number of workers to request from the scheduler.
N_JOBS = 4
# Launch the workers through ClusterManagers' SGE launcher on the "single.q" queue.
addprocs_sge(N_JOBS; queue="single.q")

@everywhere using Dates
# Toy workload: waits 10 seconds, then returns `id`, the seconds component of the
# current time, and `info` as a tuple. Declared with `@everywhere` so the
# definition is evaluated on every process, not only on the master.
@everywhere  function test(id, info)
    sleep(10)
    # println(second(now()))
    return id, second(now()), info
end

# Collects the handles returned by `@spawnat` so the results can be fetched later.
futures = []
#works
for i  in workers()
    global futures
    ## fetch them inline
    # Blocks until worker `i` has reported its pid, then prints it with the current second.
    println("start: ",fetch( @spawnat i getpid() ), " ", second(now()) )
    ## fetch them later
    # Starts `test` on worker `i` without waiting for it; only the handle is kept.
    result = @spawnat i test(getpid(), "pull results" )
    push!(futures, result)
end

# works
# Waits for each pending result in turn and prints the fields of the returned tuple.
for i  in futures
    data = fetch(i)
    println("end: ", data...)
end

# This is the call that fails: per the reported stack trace, `kill(::SGEManager, ...)`
# at ClusterManagers' qsub.jl:119 raises
# `MethodError: no method matching get(::Base.Process)`.
rmprocs(workers())

The error is the following:

ERROR: LoadError: MethodError: no method matching get(::Base.Process)
Closest candidates are:
  get(::IO, ::Any, ::Any) at show.jl:339
  get(::Base.EnvDict, ::AbstractString, ::Any) at env.jl:80
  get(::Base.TTY, ::Symbol, ::Any) at ttyhascolor.jl:27
  ...
Stacktrace:
 [1] kill(::SGEManager, ::Int64, ::WorkerConfig) at /home/alequa/.julia/packages/ClusterManagers/Mq0H0/src/qsub.jl:119
 [2] _rmprocs(::Array{Int64,1}, ::Int64) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1035
 [3] rmprocs(::Array{Int64,1}; waitfor::Int64) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1018
 [4] rmprocs(::Array{Int64,1}) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1010
 [5] top-level scope at /home/alequa/Documents/Research/phd_project/simulations/tripod_network/cluster/sge_test.jl:29
 [6] include(::Function, ::Module, ::String) at ./Base.jl:380
 [7] include(::Module, ::String) at ./Base.jl:368
 [8] exec_options(::Base.JLOptions) at ./client.jl:296
 [9] _start() at ./client.jl:506
in expression starting at /home/alequa/Documents/Research/phd_project/simulations/tripod_network/cluster/sge_test.jl:29
┌ Warning: Forcibly interrupting busy workers
│   exception =
│    MethodError: no method matching get(::Base.Process)
│    Closest candidates are:
│      get(::IO, ::Any, ::Any) at show.jl:339
│      get(::Base.EnvDict, ::AbstractString, ::Any) at env.jl:80
│      get(::Base.TTY, ::Symbol, ::Any) at ttyhascolor.jl:27
│      ...
└ @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1234
┌ Error: Unable to terminate all workers
│   exception =
│    MethodError: no method matching get(::Base.Process)
│    Closest candidates are:
│      get(::IO, ::Any, ::Any) at show.jl:339
│      get(::Base.EnvDict, ::AbstractString, ::Any) at env.jl:80
│      get(::Base.TTY, ::Symbol, ::Any) at ttyhascolor.jl:27
│      ...
│    Stacktrace:
│     [1] kill(::SGEManager, ::Int64, ::WorkerConfig) at /home/alequa/.julia/packages/ClusterManagers/Mq0H0/src/qsub.jl:119
│     [2] _rmprocs(::Array{Int64,1}, ::Float64) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1035
│     [3] rmprocs(::Array{Int64,1}; waitfor::Float64) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1018
│     [4] terminate_all_workers() at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1238
│     [5] _atexit() at ./initdefs.jl:316
│     [6] exit at ./initdefs.jl:28 [inlined]
│     [7] exec_options(::Base.JLOptions) at ./client.jl:300
│     [8] _start() at ./client.jl:506
└ @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1240

Thanks for your help

PS.

It gives a similar error if I end the program without running rmprocs:

┌ Warning: Forcibly interrupting busy workers
│   exception =
│    MethodError: no method matching get(::Base.Process)
│    Closest candidates are:
│      get(::IO, ::Any, ::Any) at show.jl:339
│      get(::Base.EnvDict, ::AbstractString, ::Any) at env.jl:80
│      get(::Base.TTY, ::Symbol, ::Any) at ttyhascolor.jl:27
│      ...
└ @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1234
┌ Error: Unable to terminate all workers
│   exception =
│    MethodError: no method matching get(::Base.Process)
│    Closest candidates are:
│      get(::IO, ::Any, ::Any) at show.jl:339
│      get(::Base.EnvDict, ::AbstractString, ::Any) at env.jl:80
│      get(::Base.TTY, ::Symbol, ::Any) at ttyhascolor.jl:27
│      ...
│    Stacktrace:
│     [1] kill(::SGEManager, ::Int64, ::WorkerConfig) at /home/alequa/.julia/packages/ClusterManagers/Mq0H0/src/qsub.jl:119
│     [2] _rmprocs(::Array{Int64,1}, ::Float64) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1035
│     [3] rmprocs(::Array{Int64,1}; waitfor::Float64) at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1018
│     [4] terminate_all_workers() at /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1238
│     [5] _atexit() at ./initdefs.jl:316
└ @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.5/Distributed/src/cluster.jl:1240