JuliaParallel / DTables.jl

Distributed table structures and data manipulation operations built on top of Dagger.jl
MIT License
73 stars 4 forks source link

Disk caching; mixed compute config; single machine #23

Open krynju opened 1 year ago

krynju commented 1 year ago

A simple DTable create-and-reduce fails when caching is on (without caching the failure doesn't appear). Configuration: Processes: 3; Threads: 2; Caching: on; Platform: Linux, but also observed elsewhere (macOS and Windows).

Julia 1.8.5; Dagger 0.16.3; DTables 0.2.1

Notes: The process got stuck after this error appeared. The error appears randomly and is not easily reproducible.

["1", "1", "true", "3"]
Used threads: 2
Used processes: 3
┌ Error: Fatal error on process 1
│   exception =
│    attempt to send to unknown socket
│    Stacktrace:
│     [1] error(s::String)
│       @ Base ./error.jl:35
│     [2] send_msg_unknown(s::Sockets.TCPSocket, header::Distributed.MsgHeader, msg::Distributed.ResultMsg)
│       @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/messages.jl:99
│     [3] send_msg_now(s::Sockets.TCPSocket, header::Distributed.MsgHeader, msg::Distributed.ResultMsg)
│       @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/messages.jl:115
│     [4] deliver_result(sock::Sockets.TCPSocket, msg::Symbol, oid::Distributed.RRID, value::Tuple{Bool, Dagger.ThunkFailedException{Dagger.ThunkFailedException{RemoteException}}})
│       @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:102
│     [5] macro expansion
│       @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:293 [inlined]
│     [6] (::Distributed.var"#109#111"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})()
│       @ Distributed ./task.jl:484
└ @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:106
      From worker 3:    ┌ Error: Error on 3 while connecting to peer 2, exitingError in sending dynamic request:
no process with id 3 exists
Stacktrace:
 [1] error(s::String)
   @ Base ./error.jl:35
 [2] worker_from_id(pg::Distributed.ProcessGroup, i::Int64)
   @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/cluster.jl:1098
 [3] worker_from_id
   @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/cluster.jl:1090 [inlined]
 [4] #remotecall_fetch#162
   @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:492 [inlined]
 [5] remotecall_fetch
   @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:492 [inlined]
 [6] call_on_owner
   @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:565 [inlined]
 [7] take!
   @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:761 [inlined]
 [8] macro expansion
   @ ~/.julia/packages/Dagger/vNUsP/src/sch/dynamic.jl:50 [inlined]
 [9] (::Dagger.Sch.var"#32#36"{Dagger.Context, Dagger.Sch.ComputeState, RemoteChannel{Channel{Any}}, RemoteChannel{Channel{Any}}, Task})()
   @ Dagger.Sch ./task.jl:484

Worker 3 terminated.┌ Error: Fatal error on process 1
│   exception =
│    attempt to send to unknown socket
│    Stacktrace:
│     [1] error(s::String)
│       @ Base ./error.jl:35
│     [2] send_msg_unknown(s::Sockets.TCPSocket, header::Distributed.MsgHeader, msg::Distributed.ResultMsg)
│       @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/messages.jl:99
│     [3] send_msg_now(s::Sockets.TCPSocket, header::Distributed.MsgHeader, msg::Distributed.ResultMsg)
│       @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/messages.jl:115
│     [4] deliver_result(sock::Sockets.TCPSocket, msg::Symbol, oid::Distributed.RRID, value::Nothing)
│       @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:102
│     [5] macro expansion
│       @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:302 [inlined]
│     [6] (::Distributed.var"#113#115"{Distributed.CallWaitMsg, Distributed.MsgHeader, Sockets.TCPSocket})()
│       @ Distributed ./task.jl:484
└ @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:106
      From worker 3:    │   exception =

      From worker 3:    │    ConcurrencyViolationError("lock must be held")
Unhandled Task ERROR: EOFError: read end of file
Stacktrace:
 [1] (::Base.var"#wait_locked#680")(s::Sockets.TCPSocket, buf::IOBuffer, nb::Int64)
   @ Base ./stream.jl:945
 [2] unsafe_read(s::Sockets.TCPSocket, p::Ptr{UInt8}, nb::UInt64)
   @ Base ./stream.jl:953
 [3] unsafe_read
   @ ./io.jl:759 [inlined]
 [4] unsafe_read(s::Sockets.TCPSocket, p::Base.RefValue{NTuple{4, Int64}}, n::Int64)
   @ Base ./io.jl:758
 [5] read!
   @ ./io.jl:760 [inlined]
 [6] deserialize_hdr_raw
   @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/messages.jl:167 [inlined]
 [7] message_handler_loop(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool)
   @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:172
 [8] process_tcp_streams(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool)
   @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:133
 [9] (::Distributed.var"#103#104"{Sockets.TCPSocket, Sockets.TCPSocket, Bool})()
   @ Distributed ./task.jl:484
      From worker 3:    │    Stacktrace:Error in eager scheduler:
TaskFailedException

    nested task error: no process with id 3 exists
    Stacktrace:
     [1] error(s::String)
       @ Base ./error.jl:35
     [2] worker_from_id(pg::Distributed.ProcessGroup, i::Int64)
       @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/cluster.jl:1098
     [3] worker_from_id
       @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/cluster.jl:1090 [inlined]
     [4] remote_do(::Function, ::Int64, ::TimespanLogging.NoOpLog, ::Vararg{Any}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
       @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:557
     [5] remote_do(::Function, ::Int64, ::TimespanLogging.NoOpLog, ::Vararg{Any})
       @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:557
     [6] (::Dagger.Sch.var"#112#114"{Dagger.Context, Set{Dagger.Chunk}, Int64})()
       @ Dagger.Sch ./task.jl:484
Stacktrace:
  [1] sync_end(c::Channel{Any})
    @ Base ./task.jl:436
  [2] macro expansion
    @ ./task.jl:455 [inlined]
  [3] evict_all_chunks!(ctx::Dagger.Context, to_evict::Set{Dagger.Chunk})
    @ Dagger.Sch ~/.julia/packages/Dagger/vNUsP/src/sch/Sch.jl:836
  [4] finish_task!(ctx::Dagger.Context, state::Dagger.Sch.ComputeState, node::Dagger.Thunk, thunk_failed::Bool)
    @ Dagger.Sch ~/.julia/packages/Dagger/vNUsP/src/sch/Sch.jl:831
  [5] (::Dagger.Sch.var"#89#90"{Dagger.Context, Dagger.Sch.ComputeState, Dagger.OSProc, NamedTuple{(:time_pressure, :storage_pressure, :storage_capacity, :loadavg, :threadtime, :gc_allocd, :transfer_rate), Tuple{UInt64, UInt64, UInt64, Tuple{Float64, Float64, Float64}, UInt64, Int64, UInt64}}, RemoteException, Int64, Dagger.ThreadProc, Int64})()
    @ Dagger.Sch ~/.julia/packages/Dagger/vNUsP/src/sch/Sch.jl:537
  [6] lock(f::Dagger.Sch.var"#89#90"{Dagger.Context, Dagger.Sch.ComputeState, Dagger.OSProc, NamedTuple{(:time_pressure, :storage_pressure, :storage_capacity, :loadavg, :threadtime, :gc_allocd, :transfer_rate), Tuple{UInt64, UInt64, UInt64, Tuple{Float64, Float64, Float64}, UInt64, Int64, UInt64}}, RemoteException, Int64, Dagger.ThreadProc, Int64}, l::ReentrantLock)
    @ Base ./lock.jl:185
  [7] scheduler_run(ctx::Dagger.Context, state::Dagger.Sch.ComputeState, d::Dagger.Thunk, options::Dagger.Sch.SchedulerOptions)
    @ Dagger.Sch ~/.julia/packages/Dagger/vNUsP/src/sch/Sch.jl:488
  [8] compute_dag(ctx::Dagger.Context, d::Dagger.Thunk; options::Dagger.Sch.SchedulerOptions)
    @ Dagger.Sch ~/.julia/packages/Dagger/vNUsP/src/sch/Sch.jl:416
  [9] compute(ctx::Dagger.Context, d::Dagger.Thunk; options::Dagger.Sch.SchedulerOptions)
    @ Dagger ~/.julia/packages/Dagger/vNUsP/src/compute.jl:31
 [10] (::Dagger.Sch.var"#53#55"{Dagger.Context})()
    @ Dagger.Sch ./task.jl:484

      From worker 3:    │      [1] concurrency_violation()
      From worker 3:    │        @ Base ./condition.jl:8
      From worker 3:    │      [2] assert_havelock
      From worker 3:    │        @ ./condition.jl:25 [inlined]
      From worker 3:    │      [3] assert_havelock
      From worker 3:    │        @ ./condition.jl:48 [inlined]
      From worker 3:    │      [4] assert_havelock
      From worker 3:    │        @ ./condition.jl:72 [inlined]
      From worker 3:    │      [5] notify(c::Condition, arg::Any, all::Bool, error::Bool)
      From worker 3:    │        @ Base ./condition.jl:144
      From worker 3:    │      [6] #notify#586
      From worker 3:    │        @ ./condition.jl:142 [inlined]
      From worker 3:    │      [7] set_worker_state
      From worker 3:    │        @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/cluster.jl:148 [inlined]
      From worker 3:    │      [8] Distributed.Worker(id::Int64, r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, manager::Distributed.DefaultClusterManager; version::Nothing, config::WorkerConfig)
      From worker 3:    │        @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/cluster.jl:126
      From worker 3:    │      [9] connect_to_peer(manager::Distributed.DefaultClusterManager, rpid::Int64, wconfig::WorkerConfig)
      From worker 3:    │        @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:363
      From worker 3:    │     [10] (::Distributed.var"#121#123"{Int64, WorkerConfig})()
      From worker 3:    │        @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:349
      From worker 3:    │     [11] exec_conn_func(w::Distributed.Worker)
      From worker 3:    │        @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/cluster.jl:181
      From worker 3:    │     [12] (::Distributed.var"#21#24"{Distributed.Worker})()
      From worker 3:    │        @ Distributed ./task.jl:484
      From worker 3:    └ @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:369
      From worker 2:    ErrorException("Cookie read failed. Connection closed by peer.")CapturedException(ErrorException("Cookie read failed. Connection closed by peer."), Any[(error(s::String) at error.jl:35, 1), (process_hdr(s::Sockets.TCPSocket, validate_cookie::Bool) at process_messages.jl:258, 1), (message_handler_loop(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool) at process_messages.jl:158, 1), (process_tcp_streams(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool) at process_messages.jl:133, 1), ((::Distributed.var"#103#104"{Sockets.TCPSocket, Sockets.TCPSocket, Bool})() at task.jl:484, 1)])
      From worker 2:    Process(2) - Unknown remote, closing connection.

signal (15): Terminated
in expression starting at /home/krynju/Downloads/bench1/dtables_caching_test_tuple.jl:29
unknown function (ip: 0x7fbddce89117)
pthread_cond_wait at /usr/lib/libc.so.6 (unknown line)
uv_cond_wait at /workspace/srcdir/libuv/src/unix/thread.c:883
ijl_task_get_next at /home/krynju/julia/src/partr.c:596
poptask at ./task.jl:921
wait at ./task.jl:930
task_done_hook at ./task.jl:634
jfptr_task_done_hook_26224 at /home/krynju/julia/usr/lib/julia/sys.so (unknown line)
jl_apply at /home/krynju/julia/src/julia.h:1843 [inlined]
jl_finish_task at /home/krynju/julia/src/task.c:254
start_task at /home/krynju/julia/src/task.c:942
unknown function (ip: (nil))
epoll_wait at /usr/lib/libc.so.6 (unknown line)
uv__io_poll at /workspace/srcdir/libuv/src/unix/epoll.c:236
uv_run at /workspace/srcdir/libuv/src/unix/core.c:400
ijl_task_get_next at /home/krynju/julia/src/partr.c:565
poptask at ./task.jl:921
wait at ./task.jl:930
wait at ./condition.jl:124
put_buffered at ./channels.jl:343
put! at ./channels.jl:321
unknown function (ip: 0x7fbdd6f37e99)
put! at /home/krynju/julia/usr/share/julia/stdlib/v1.8/Distributed/src/remotecall.jl:703
#54 at ./task.jl:688
unknown function (ip: 0x7fbdd6f38d1f)
jl_apply at /home/krynju/julia/src/julia.h:1843 [inlined]
start_task at /home/krynju/julia/src/task.c:931
unknown function (ip: (nil))
Allocations: 33782883 (Pool: 33770379; Big: 12504); GC: 20
schedule: Task not runnable
atexit hook threw an error: ErrorException("task switch not allowed from inside gc finalizer")
ijl_error at /home/krynju/julia/src/rtutils.c:41
ijl_switch at /home/krynju/julia/src/task.c:530
try_yieldto at ./task.jl:861
wait at ./task.jl:931
uv_write at ./stream.jl:1046
unsafe_write at ./stream.jl:1118
write at ./strings/io.jl:244 [inlined]
print at ./strings/io.jl:246
jfptr_print_45001 at /home/krynju/julia/usr/lib/julia/sys.so (unknown line)
showerror at ./errorshow.jl:144
unknown function (ip: 0x7fbdc5f61881)
_atexit at ./initdefs.jl:374
jfptr__atexit_48368 at /home/krynju/julia/usr/lib/julia/sys.so (unknown line)
jl_apply at /home/krynju/julia/src/julia.h:1843 [inlined]
ijl_atexit_hook at /home/krynju/julia/src/init.c:219
ijl_exit at /home/krynju/julia/src/jl_uv.c:640
jl_exit_thread0_cb at /home/krynju/julia/src/signals-unix.c:428
krynju commented 1 year ago

Ah, it looks like a GC finalizer issue again; I will need to catch the process with gdb.

krynju commented 1 year ago

Another one

┌ Error: Fatal error on process 1
│   exception =
│    ArgumentError: Cannot serialize a Thunk
│    Stacktrace:
│      [1] serialize(io::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::Dagger.Thunk)
│        @ Dagger ~/.julia/packages/Dagger/vNUsP/src/thunk.jl:96
│      [2] serialize_any(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
│        @ Serialization ~/julia/usr/share/julia/stdlib/v1.8/Serialization/src/Serialization.jl:675
│      [3] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
│        @ Serialization ~/julia/usr/share/julia/stdlib/v1.8/Serialization/src/Serialization.jl:654
│      [4] serialize_any(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
│        @ Serialization ~/julia/usr/share/julia/stdlib/v1.8/Serialization/src/Serialization.jl:675
│      [5] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
│        @ Serialization ~/julia/usr/share/julia/stdlib/v1.8/Serialization/src/Serialization.jl:654
│      [6] serialize_any(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any)
│        @ Serialization ~/julia/usr/share/julia/stdlib/v1.8/Serialization/src/Serialization.jl:675
│      [7] serialize
│        @ ~/julia/usr/share/julia/stdlib/v1.8/Serialization/src/Serialization.jl:654 [inlined]
│      [8] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::Tuple{Bool, Dagger.ThunkFailedException{Dagger.ThunkFailedException{RemoteException}}})
│        @ Serialization ~/julia/usr/share/julia/stdlib/v1.8/Serialization/src/Serialization.jl:205
│      [9] serialize_msg(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, o::Distributed.ResultMsg)
│        @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/messages.jl:78
│     [10] #invokelatest#2
│        @ ./essentials.jl:729 [inlined]
│     [11] invokelatest
│        @ ./essentials.jl:726 [inlined]
│     [12] send_msg_(w::Distributed.Worker, header::Distributed.MsgHeader, msg::Distributed.ResultMsg, now::Bool)
│        @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/messages.jl:181
│     [13] send_msg_now
│        @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/messages.jl:118 [inlined]
│     [14] send_msg_now(s::Sockets.TCPSocket, header::Distributed.MsgHeader, msg::Distributed.ResultMsg)
│        @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/messages.jl:113
│     [15] deliver_result(sock::Sockets.TCPSocket, msg::Symbol, oid::Distributed.RRID, value::Tuple{Bool, Dagger.ThunkFailedException{Dagger.ThunkFailedException{RemoteException}}})
│        @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:102
│     [16] macro expansion
│        @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:293 [inlined]
│     [17] (::Distributed.var"#109#111"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})()
│        @ Distributed ./task.jl:484
└ @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:106
Worker 3 terminated.┌ Warning: Worker 3 died, rescheduling work
└ @ Dagger.Sch ~/.julia/packages/Dagger/vNUsP/src/sch/Sch.jl:492

Unhandled Task ERROR: EOFError: read end of file
Stacktrace:
 [1] (::Base.var"#wait_locked#680")(s::Sockets.TCPSocket, buf::IOBuffer, nb::Int64)
   @ Base ./stream.jl:944
 [2] unsafe_read(s::Sockets.TCPSocket, p::Ptr{UInt8}, nb::UInt64)
   @ Base ./stream.jl:953
 [3] unsafe_read
   @ ./io.jl:759 [inlined]
 [4] unsafe_read(s::Sockets.TCPSocket, p::Base.RefValue{NTuple{4, Int64}}, n::Int64)
   @ Base ./io.jl:758
 [5] read!
   @ ./io.jl:760 [inlined]
 [6] deserialize_hdr_raw
   @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/messages.jl:167 [inlined]
 [7] message_handler_loop(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool)
   @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:172
 [8] process_tcp_streams(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool)
   @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:133
 [9] (::Distributed.var"#103#104"{Sockets.TCPSocket, Sockets.TCPSocket, Bool})()
   @ Distributed ./task.jl:484
      From worker 3:    ┌ Error: Fatal error on process 3
      From worker 3:    │   exception =
      From worker 3:    │    EOFError: read end of file
      From worker 3:    │    Stacktrace:
      From worker 3:    │     [1] (::Base.var"#wait_locked#680")(s::Sockets.TCPSocket, buf::IOBuffer, nb::Int64)
      From worker 3:    │       @ Base ./stream.jl:945
      From worker 3:    │     [2] unsafe_read(s::Sockets.TCPSocket, p::Ptr{UInt8}, nb::UInt64)
      From worker 3:    │       @ Base ./stream.jl:953
      From worker 3:    │     [3] unsafe_read
      From worker 3:    │       @ ./io.jl:759 [inlined]
      From worker 3:    │     [4] unsafe_read(s::Sockets.TCPSocket, p::Base.RefValue{NTuple{4, Int64}}, n::Int64)
      From worker 3:    │       @ Base ./io.jl:758
      From worker 3:    │     [5] read!
      From worker 3:    │       @ ./io.jl:760 [inlined]
      From worker 3:    │     [6] deserialize_hdr_raw
      From worker 3:    │       @ ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/messages.jl:167 [inlined]
      From worker 3:    │     [7] message_handler_loop(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool)
      From worker 3:    │       @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:172
      From worker 3:    │     [8] process_tcp_streams(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool)
      From worker 3:    │       @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:133
      From worker 3:    │     [9] (::Distributed.var"#103#104"{Sockets.TCPSocket, Sockets.TCPSocket, Bool})()
      From worker 3:    │       @ Distributed ./task.jl:484
      From worker 3:    └ @ Distributed ~/julia/usr/share/julia/stdlib/v1.8/Distributed/src/process_messages.jl:229
      From worker 2:
      From worker 2:    signal (15): Terminated
      From worker 2:    in expression starting at none:0
      From worker 2:    unknown function (ip: 0x7f30f2c89117)
      From worker 2:    pthread_cond_wait at /usr/lib/libc.so.6 (unknown line)
      From worker 2:    uv_cond_wait at /workspace/srcdir/libuv/src/unix/thread.c:883
      From worker 2:    ijl_task_get_next at /home/krynju/julia/src/partr.c:596
      From worker 2:    poptask at ./task.jl:921
      From worker 2:    wait at ./task.jl:930
      From worker 2:    task_done_hook at ./task.jl:634
      From worker 2:    jfptr_task_done_hook_26224 at /home/krynju/julia/usr/lib/julia/sys.so (unknown line)
      From worker 2:    jl_apply at /home/krynju/julia/src/julia.h:1843 [inlined]
      From worker 2:    jl_finish_task at /home/krynju/julia/src/task.c:254
      From worker 2:    start_task at /home/krynju/julia/src/task.c:942
      From worker 2:    unknown function (ip: (nil))
      From worker 2:    unknown function (ip: 0x7f30f2c89117)
      From worker 2:    pthread_cond_wait at /usr/lib/libc.so.6 (unknown line)
      From worker 2:    uv_cond_wait at /workspace/srcdir/libuv/src/unix/thread.c:883
      From worker 2:    ijl_task_get_next at /home/krynju/julia/src/partr.c:596
      From worker 2:    poptask at ./task.jl:921
      From worker 2:    wait at ./task.jl:930
      From worker 2:    task_done_hook at ./task.jl:634
      From worker 2:    jfptr_task_done_hook_26224 at /home/krynju/julia/usr/lib/julia/sys.so (unknown line)
      From worker 2:    jl_apply at /home/krynju/julia/src/julia.h:1843 [inlined]
      From worker 2:    jl_finish_task at /home/krynju/julia/src/task.c:254
      From worker 2:    start_task at /home/krynju/julia/src/task.c:942
      From worker 2:    unknown function (ip: (nil))
      From worker 2:    unknown function (ip: 0x7f30f2c89117)
      From worker 2:    pthread_cond_wait at /usr/lib/libc.so.6 (unknown line)
      From worker 2:    uv_cond_wait at /workspace/srcdir/libuv/src/unix/thread.c:883
      From worker 2:    ijl_task_get_next at /home/krynju/julia/src/partr.c:596
      From worker 2:    poptask at ./task.jl:921
      From worker 2:    wait at ./task.jl:930
      From worker 2:    task_done_hook at ./task.jl:634
      From worker 2:    jfptr_task_done_hook_26224 at /home/krynju/julia/usr/lib/julia/sys.so (unknown line)
      From worker 2:    jl_apply at /home/krynju/julia/src/julia.h:1843 [inlined]
      From worker 2:    jl_finish_task at /home/krynju/julia/src/task.c:254
      From worker 2:    start_task at /home/krynju/julia/src/task.c:942
      From worker 2:    unknown function (ip: (nil))
      From worker 2:    epoll_wait at /usr/lib/libc.so.6 (unknown line)
      From worker 2:    uv__io_poll at /workspace/srcdir/libuv/src/unix/epoll.c:236
      From worker 2:    uv_run at /workspace/srcdir/libuv/src/unix/core.c:400
      From worker 2:    ijl_task_get_next at /home/krynju/julia/src/partr.c:565
      From worker 2:    poptask at ./task.jl:921
      From worker 2:    wait at ./task.jl:930
      From worker 2:    task_done_hook at ./task.jl:634
      From worker 2:    jfptr_task_done_hook_26224 at /home/krynju/julia/usr/lib/julia/sys.so (unknown line)
      From worker 2:    jl_apply at /home/krynju/julia/src/julia.h:1843 [inlined]
      From worker 2:    jl_finish_task at /home/krynju/julia/src/task.c:254
      From worker 2:    start_task at /home/krynju/julia/src/task.c:942
      From worker 2:    unknown function (ip: (nil))
      From worker 2:    Allocations: 26411673 (Pool: 26402943; Big: 8730); GC: 31

signal (15): Terminated
in expression starting at /home/krynju/Downloads/bench1/dtables_caching_test_tuple.jl:29
unknown function (ip: 0x7f4507689117)
pthread_cond_wait at /usr/lib/libc.so.6 (unknown line)
uv_cond_wait at /workspace/srcdir/libuv/src/unix/thread.c:883
ijl_task_get_next at /home/krynju/julia/src/partr.c:596
poptask at ./task.jl:921
wait at ./task.jl:930
task_done_hook at ./task.jl:634
jfptr_task_done_hook_26224 at /home/krynju/julia/usr/lib/julia/sys.so (unknown line)
jl_apply at /home/krynju/julia/src/julia.h:1843 [inlined]
jl_finish_task at /home/krynju/julia/src/task.c:254
start_task at /home/krynju/julia/src/task.c:942
unknown function (ip: (nil))
unknown function (ip: 0x7f4507689117)
pthread_cond_wait at /usr/lib/libc.so.6 (unknown line)
uv_cond_wait at /workspace/srcdir/libuv/src/unix/thread.c:883
ijl_task_get_next at /home/krynju/julia/src/partr.c:596
poptask at ./task.jl:921
wait at ./task.jl:930
task_done_hook at ./task.jl:634
jfptr_task_done_hook_26224 at /home/krynju/julia/usr/lib/julia/sys.so (unknown line)
jl_apply at /home/krynju/julia/src/julia.h:1843 [inlined]
jl_finish_task at /home/krynju/julia/src/task.c:254
start_task at /home/krynju/julia/src/task.c:942
unknown function (ip: (nil))
unknown function (ip: 0x7f4507689117)
pthread_cond_wait at /usr/lib/libc.so.6 (unknown line)
uv_cond_wait at /workspace/srcdir/libuv/src/unix/thread.c:883
ijl_task_get_next at /home/krynju/julia/src/partr.c:596
poptask at ./task.jl:921
wait at ./task.jl:930
task_done_hook at ./task.jl:634
jfptr_task_done_hook_26224 at /home/krynju/julia/usr/lib/julia/sys.so (unknown line)
jl_apply at /home/krynju/julia/src/julia.h:1843 [inlined]
jl_finish_task at /home/krynju/julia/src/task.c:254
start_task at /home/krynju/julia/src/task.c:942
unknown function (ip: (nil))
epoll_wait at /usr/lib/libc.so.6 (unknown line)
uv__io_poll at /workspace/srcdir/libuv/src/unix/epoll.c:236
uv_run at /workspace/srcdir/libuv/src/unix/core.c:400
ijl_task_get_next at /home/krynju/julia/src/partr.c:565
poptask at ./task.jl:921
wait at ./task.jl:930
wait at ./condition.jl:124
#readuntil#681 at ./stream.jl:1012
readuntil##kw at ./stream.jl:996 [inlined]
#readline#397 at ./io.jl:543
readline at ./io.jl:542 [inlined]
macro expansion at /home/krynju/julia/usr/share/julia/stdlib/v1.8/Distributed/src/cluster.jl:283 [inlined]
#37 at ./task.jl:484
unknown function (ip: 0x7f44f0180fdf)
jl_apply at /home/krynju/julia/src/julia.h:1843 [inlined]
start_task at /home/krynju/julia/src/task.c:931
unknown function (ip: (nil))
Allocations: 51978913 (Pool: 51960682; Big: 18231); GC: 20
schedule: Task not runnable
atexit hook threw an error: ErrorException("task switch not allowed from inside gc finalizer")
ijl_error at /home/krynju/julia/src/rtutils.c:41
ijl_switch at /home/krynju/julia/src/task.c:530
try_yieldto at ./task.jl:861
wait at ./task.jl:931
uv_write at ./stream.jl:1046
unsafe_write at ./stream.jl:1118
write at ./strings/io.jl:244 [inlined]
print at ./strings/io.jl:246
jfptr_print_45001 at /home/krynju/julia/usr/lib/julia/sys.so (unknown line)
showerror at ./errorshow.jl:144
unknown function (ip: 0x7f44df7693f1)
_atexit at ./initdefs.jl:374
jfptr__atexit_48368 at /home/krynju/julia/usr/lib/julia/sys.so (unknown line)
jl_apply at /home/krynju/julia/src/julia.h:1843 [inlined]
ijl_atexit_hook at /home/krynju/julia/src/init.c:219
ijl_exit at /home/krynju/julia/src/jl_uv.c:640
jl_exit_thread0_cb at /home/krynju/julia/src/signals-unix.c:428
krynju commented 1 year ago

Lead: It seems that the frequently appearing concurrency violation happens on Worker.c_state and may be connected to the lazy way of initializing worker connections. TODO: check whether the issue appears when forcing a non-lazy Distributed setup.