JuliaParallel / DTables.jl

Distributed table structures and data manipulation operations built on top of Dagger.jl
MIT License

`MemPool.datastore` memory utilization keeps increasing when using DTables with multiple processes #60

Open StevenWhitaker opened 11 months ago

StevenWhitaker commented 11 months ago

I have the following setup:

The actual query involves loading a table from a .csv file into a DTable (with a DataFrame table type). Operations include selecting columns, fetching the table into a DataFrame to add/remove rows and columns and do other processing as needed, and re-wrapping the result in a DTable to be processed further later. At the end of processing, the result is returned as a DataFrame.

The .csv file contains a table with 233930 rows and 102 columns: 1 column of InlineStrings.String15, 2 columns of InlineStrings.String1, 45 columns of Int64, and 54 columns of Float64.
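
For concreteness, here is a minimal sketch of that round trip, following the DTable constructor and fetch usage from the MWE later in this thread (the added column below is just a placeholder, not the actual processing):

```
using DTables, DataFrames, CSV

# Load the CSV into a DTable backed by DataFrame partitions.
dt = DTable(x -> CSV.File(x), ["file.csv"]; tabletype = DataFrame)

# Fetch into a DataFrame, mutate it, then re-wrap it in a DTable for later stages.
df = fetch(dt)
df.NEW = rand(nrow(df))   # placeholder for the real add/remove-column processing
dt2 = DTable(df)

# The final result comes back as a DataFrame.
result = fetch(dt2)
```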

The issue: I noticed that if I keep running the same query repeatedly, the MemPool.datastore on worker 2 consumes more and more memory, as determined by

remotecall_fetch(2) do
    Base.summarysize(MyPackage.Dagger.MemPool.datastore)
end

Eventually, the memory usage grows enough to cause my WSL 2 Linux OOM manager to kill worker 2, crashing my program.

Notably, I do not observe this growth in memory usage in the following scenarios:

I do observe this growth in memory usage in the following additional scenarios:

I'm posting this issue in DTables.jl in case there's something DTables.jl is doing that somehow causes the MemPool.jl data store to keep references around longer than expected, but of course please transfer this issue to Dagger.jl or MemPool.jl as needed.

Please let me know if there is any other information that would help with finding the root cause of this issue.

jpsamaroo commented 11 months ago

Do you have a reproducer for this one, just to help me debug it reliably?

StevenWhitaker commented 10 months ago

I am working on a better reproducer, but I believe the behavior I pointed out in JuliaParallel/Dagger.jl#445 (related to JuliaParallel/Dagger.jl#438) is essentially the same as what I am reporting here: all those CPURAMDevices needed to be evicted when Julia closed because they were not being evicted earlier. It's just that before, I wasn't running my code enough times to reach the point where the OOM manager killed one of my processes.

StevenWhitaker commented 10 months ago

@jpsamaroo Here's an MWE that shows the ever-growing memory utilization. It took about 120 runs of main to get the OOM manager to kill worker 2 (I have 32 GB RAM). Let me know if you need the data as well (i.e., "file.csv").

using Distributed
addprocs(5 - nprocs(); exeflags = "--heap-size-hint=3G")

@everywhere using DTables, DataFrames, CSV

@everywhere const DT = Ref{DTable}()

@everywhere mutable struct DTableCols
    key_names
    value_names
    keys
    values
end

function main()

    remotecall_fetch(query, 2)

end

@everywhere function query()

    dt1 = load_dt()
    dt2 = add_value_col!(dt1)
    dt3 = update_value_col!(dt2)
    @info "" length(dt3)
    dt4 = calc_value_cols(dt3)
    dt5 = select(dt4, [6; 12; 103:113]...; copycols = false)
    dt_agg = aggregate_dt(dt5)
    return fetch(dt_agg)

end

@everywhere function load_dt()

    isassigned(DT) && return DT[]
    file = "file.csv"
    GC.enable(false)
    dt = DTable(x -> CSV.File(x), [file]; tabletype = DataFrame)
    GC.enable(true)
    DT[] = dt
    return dt

end

@everywhere function add_value_col!(dt)

    dt_cols = create_dt_cols(dt, 1:48, 49:102)
    dt_cols.value_names = [dt_cols.value_names; "RAND"]
    dt_cols.values = (dt_cols.values..., rand(length(dt_cols.values[1])))
    return create_dt_from_cols(dt_cols; is_sorted = true)

end

@everywhere function create_dt_cols(dt, key_cols, value_cols)

    df = fetch(dt)
    key_names = names(df)[key_cols]
    value_names = names(df)[value_cols]
    keys = [df[!, i] for i in key_cols]
    values = [df[!, i] for i in value_cols]
    return DTableCols(key_names, value_names, keys, values)

end

@everywhere function create_dt_from_cols(dt_cols; is_sorted = false)

    df = DataFrame(
        (dt_cols.key_names .=> dt_cols.keys)...,
        (dt_cols.value_names .=> dt_cols.values)...;
        copycols = false,
    )
    is_sorted || sort!(df)
    return DTable(df)

end

@everywhere function update_value_col!(dt)

    dt_cols = create_dt_cols(dt, 1:48, 49:103)
    dt_cols.values = (
        dt_cols.values[1:10]...,
        rand(length(dt_cols.values[1])),
        dt_cols.values[12:end]...,
    )
    return create_dt_from_cols(dt_cols; is_sorted = true)

end

@everywhere function calc_value_cols(dt)

    newvals = Vector{Float64}[]
    for i = 1:10
        v = calc_new_value(dt, i)
        push!(newvals, v)
    end
    return append_value_cols(dt, newvals)

end

@everywhere function calc_new_value(dt, i)

    dt_cols = create_dt_cols(dt, 1:48, 49:103)
    return abs.(dt_cols.values[i])

end

@everywhere function append_value_cols(dt, newvals)

    df = fetch(dt)
    for (i, v) in enumerate(newvals)
        setproperty!(df, "NEW$i", v)
    end
    return DTable(df)

end

@everywhere function aggregate_dt(dt)

    key_names = [Symbol("6"), Symbol("12")]
    gdt = groupby(fetch(dt), key_names)
    gkeys = sort!(collect(keys(gdt)))
    key_pairs = key_names .=> invert(gkeys)
    value_names = [[Symbol("RAND")]; Symbol.("NEW", 1:10)]
    sums = fetch(reduce(+, gdt; cols = value_names))
    sorted = sortperm(invert(sums[key_names]))
    value_pairs = map(value_names) do value
        value => sums[Symbol(:result_, value)][sorted]
    end
    return DTable(DataFrame(key_pairs..., value_pairs...))

end

@everywhere invert(x) = [[x[j][i] for j = 1:length(x)] for i = 1:length(x[1])]

@everywhere function Base.reduce(f, df::DataFrames.AbstractDataFrame; cols)

    NamedTuple(col => reduce(f, df[!, col]) for col in cols)

end

@everywhere function Base.reduce(f, gdt::DataFrames.GroupedDataFrame; cols)

    gkeys = keys(gdt)
    dims = keys(gkeys[1])
    merge(
        NamedTuple(dim => getproperty.(gkeys, dim) for dim in dims),
        NamedTuple(
            Symbol(:result_, col) => [reduce(f, gdt[k]; cols = [col])[col] for k in gkeys]
            for col in cols
        ),
    )

end
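
For reference, a sketch that generates a synthetic "file.csv" with the same shape as the data described above (233930 rows; 1 String15 column, 2 String1 columns, 45 Int64 columns, 54 Float64 columns), for anyone who wants to run the MWE without the original data. The numeric column names and the value distributions here are made-up placeholders, not the real data:

```
using CSV, DataFrames, InlineStrings, Random

# Synthetic stand-in for "file.csv": 233930 rows, 102 columns named "1".."102",
# laid out so that columns 1:48 hold keys and 49:102 hold Float64 values, as the MWE assumes.
n = 233930
df = DataFrame()
df[!, "1"] = [String15(randstring(10)) for _ in 1:n]                  # 1 String15 column
for i in 2:3
    df[!, string(i)] = [String1(string(rand('A':'Z'))) for _ in 1:n]  # 2 String1 columns
end
for i in 4:48
    df[!, string(i)] = rand(1:100, n)                                 # 45 Int64 columns
end
for i in 49:102
    df[!, string(i)] = rand(n)                                        # 54 Float64 columns
end
CSV.write("file.csv", df)
```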

krynju commented 10 months ago

One thing I remembered: when I was benchmarking DTables.jl around release time, I had a really bad time running it in WSL 2. Due to the weird memory management WSL does, I could barely get to a quarter of the table size that I could run successfully on native Linux.

Let's keep this in mind when looking at this; Linux will behave differently for sure. I'll try to have a look at it this week.

StevenWhitaker commented 10 months ago

I just ran the exact same code as in #61 (which in turn is the same as the MWE above but with a call to enable_disk_caching!(50, 2^10 * 20)). This time, instead of a BoundsError, I got AssertionError: Failed to migrate 183.839 MiB for ref 1624:

julia> include("mwe.jl"); for i = 1:100 (i % 10 == 0 && @show(i)); main() end
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
i = 10
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
i = 20
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
i = 30
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
      From worker 2:    ┌ Info:
      From worker 2:    └   length(dt3) = 233930
ERROR: On worker 2:
AssertionError: Failed to migrate 183.839 MiB for ref 1624
Stacktrace:
  [1] #105
    @ ~/.julia/packages/MemPool/l9nLj/src/storage.jl:887
  [2] with_lock
    @ ~/.julia/packages/MemPool/l9nLj/src/lock.jl:80
  [3] #sra_migrate!#103
    @ ~/.julia/packages/MemPool/l9nLj/src/storage.jl:849
  [4] sra_migrate!
    @ ~/.julia/packages/MemPool/l9nLj/src/storage.jl:826 [inlined]
  [5] write_to_device!
    @ ~/.julia/packages/MemPool/l9nLj/src/storage.jl:817
  [6] #poolset#160
    @ ~/.julia/packages/MemPool/l9nLj/src/datastore.jl:386
  [7] #tochunk#139
    @ ~/.julia/packages/Dagger/M13n0/src/chunks.jl:267
  [8] tochunk (repeats 2 times)
    @ ~/.julia/packages/Dagger/M13n0/src/chunks.jl:259 [inlined]
  [9] #DTable#1
    @ ~/.julia/packages/DTables/BjdY2/src/table/dtable.jl:38
 [10] DTable
    @ ~/.julia/packages/DTables/BjdY2/src/table/dtable.jl:28
 [11] #create_dt_from_cols#9
    @ ~/tmp/mwe.jl:76
 [12] create_dt_from_cols
    @ ~/tmp/mwe.jl:68 [inlined]
 [13] add_value_col!
    @ ~/tmp/mwe.jl:53
 [14] query
    @ ~/tmp/mwe.jl:26
 [15] #invokelatest#2
    @ ./essentials.jl:819 [inlined]
 [16] invokelatest
    @ ./essentials.jl:816
 [17] #110
    @ ~/programs/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:285
 [18] run_work_thunk
    @ ~/programs/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:70
 [19] macro expansion
    @ ~/programs/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:285 [inlined]
 [20] #109
    @ ./task.jl:514
Stacktrace:
 [1] remotecall_fetch(::Function, ::Distributed.Worker; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
   @ Distributed ~/programs/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:465
 [2] remotecall_fetch(::Function, ::Distributed.Worker)
   @ Distributed ~/programs/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:454
 [3] #remotecall_fetch#162
   @ ~/programs/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:492 [inlined]
 [4] remotecall_fetch
   @ ~/programs/julia/julia-1.9.3/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:492 [inlined]
 [5] main
   @ ~/tmp/mwe.jl:19 [inlined]
 [6] top-level scope
   @ ./REPL[1]:1

I wonder if this is essentially the same issue as the OP, where data is being kept around longer than it should be. Just in this case, instead of a process getting killed, MemPool errors because we end up exceeding the 20 GB of disk space I said MemPool could use. If it is the same issue, then WSL 2 memory management shouldn't have anything to do with this.

StevenWhitaker commented 10 months ago

I ran the MWE (with and without enabling disk caching) on Windows (not WSL 2).

So there definitely is a difference in behavior between WSL 2 and Windows.

krynju commented 10 months ago

I did get it reproduced twice with the MemPool fix from the other issue (5 processes, no threads):

julia> d = DTable((a=rand(Int, N1),), N1 ÷ 100)
ERROR: AssertionError: Failed to migrate 10.240 MiB for ref 5646
Stacktrace:
  [1] (::MemPool.var"#105#113"{Bool, MemPool.SimpleRecencyAllocator, MemPool.RefState, Int64})()
    @ MemPool C:\Users\krynjupc\.julia\dev\MemPool\src\storage.jl:920
  [2] with_lock(f::MemPool.var"#105#113"{Bool, MemPool.SimpleRecencyAllocator, MemPool.RefState, Int64}, lock::MemPool.NonReentrantLock, cond::Bool)
    @ MemPool C:\Users\krynjupc\.julia\dev\MemPool\src\lock.jl:80
  [3] sra_migrate!(sra::MemPool.SimpleRecencyAllocator, state::MemPool.RefState, ref_id::Int64, to_mem::Missing; read::Bool, locked::Bool)
    @ MemPool C:\Users\krynjupc\.julia\dev\MemPool\src\storage.jl:882
  [4] sra_migrate!(sra::MemPool.SimpleRecencyAllocator, state::MemPool.RefState, ref_id::Int64, to_mem::Missing)
    @ MemPool C:\Users\krynjupc\.julia\dev\MemPool\src\storage.jl:859 [inlined]
  [5] write_to_device!(sra::MemPool.SimpleRecencyAllocator, state::MemPool.RefState, ref_id::Int64)
    @ MemPool C:\Users\krynjupc\.julia\dev\MemPool\src\storage.jl:850
  [6]
    @ MemPool C:\Users\krynjupc\.julia\dev\MemPool\src\datastore.jl:386
  [7] tochunk(x::@NamedTuple{a::Vector{Int64}}, proc::OSProc, scope::AnyScope; persist::Bool, cache::Bool, device::Nothing, kwargs::@Kwargs{})
    @ Dagger C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\chunks.jl:267
  [8] tochunk
    @ Dagger C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\chunks.jl:259 [inlined]
  [9] tochunk(x::@NamedTuple{a::Vector{Int64}})
    @ Dagger C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\chunks.jl:259
 [10] DTable(table::@NamedTuple{a::Vector{Int64}}, chunksize::Int64; tabletype::Nothing, interpartition_merges::Bool)
    @ DTables C:\Users\krynjupc\.julia\packages\DTables\BjdY2\src\table\dtable.jl:122
 [11] DTable(table::@NamedTuple{a::Vector{Int64}}, chunksize::Int64)
    @ DTables C:\Users\krynjupc\.julia\packages\DTables\BjdY2\src\table\dtable.jl:61
 [12] top-level scope
    @ REPL[56]:1
Some type information was truncated. Use `show(err)` to see complete types.

julia> map(x -> (r=x.a + 1,), d) |> fetch
ERROR: ThunkFailedException:
  Root Exception Type: CapturedException
  Root Exception:
AssertionError: Failed to migrate 10.240 MiB for ref 5051
Stacktrace:
  [1] #105
    @ C:\Users\krynjupc\.julia\dev\MemPool\src\storage.jl:920
  [2] with_lock
    @ C:\Users\krynjupc\.julia\dev\MemPool\src\lock.jl:83
  [3] #sra_migrate!#103
    @ C:\Users\krynjupc\.julia\dev\MemPool\src\storage.jl:882
  [4] #120
    @ C:\Users\krynjupc\.julia\dev\MemPool\src\storage.jl:1001
  [5] with_lock
    @ C:\Users\krynjupc\.julia\dev\MemPool\src\lock.jl:80
  [6] with_lock
    @ C:\Users\krynjupc\.julia\dev\MemPool\src\lock.jl:78
  [7] read_from_device
    @ C:\Users\krynjupc\.julia\dev\MemPool\src\storage.jl:991 [inlined]
  [8] _getlocal
    @ C:\Users\krynjupc\.julia\dev\MemPool\src\datastore.jl:433
  [9] #174
    @ C:\Users\krynjupc\.julia\dev\MemPool\src\datastore.jl:425
 [10] #invokelatest#2
    @ .\essentials.jl:899
 [11] invokelatest
    @ .\essentials.jl:896
 [12] #110
    @ C:\Users\krynjupc\AppData\Local\Programs\Julia-1.11.0-DEV\share\julia\stdlib\v1.11\Distributed\src\process_messages.jl:286
 [13] run_work_thunk
    @ C:\Users\krynjupc\AppData\Local\Programs\Julia-1.11.0-DEV\share\julia\stdlib\v1.11\Distributed\src\process_messages.jl:70
 [14] #109
    @ C:\Users\krynjupc\AppData\Local\Programs\Julia-1.11.0-DEV\share\julia\stdlib\v1.11\Distributed\src\process_messages.jl:286
Stacktrace:
  [1] #remotecall_fetch#159
    @ C:\Users\krynjupc\AppData\Local\Programs\Julia-1.11.0-DEV\share\julia\stdlib\v1.11\Distributed\src\remotecall.jl:465
  [2] remotecall_fetch
    @ C:\Users\krynjupc\AppData\Local\Programs\Julia-1.11.0-DEV\share\julia\stdlib\v1.11\Distributed\src\remotecall.jl:454
  [3] remotecall_fetch
    @ C:\Users\krynjupc\AppData\Local\Programs\Julia-1.11.0-DEV\share\julia\stdlib\v1.11\Distributed\src\remotecall.jl:492 [inlined]
  [4] #173
    @ C:\Users\krynjupc\.julia\dev\MemPool\src\datastore.jl:424 [inlined]
  [5] forwardkeyerror
    @ C:\Users\krynjupc\.julia\dev\MemPool\src\datastore.jl:409
  [6] poolget
    @ C:\Users\krynjupc\.julia\dev\MemPool\src\datastore.jl:423
  [7] move
    @ C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\chunks.jl:98
  [8] move
    @ C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\chunks.jl:96
  [9] #invokelatest#2
    @ .\essentials.jl:899 [inlined]
 [10] invokelatest
    @ .\essentials.jl:896 [inlined]
 [11] #154
    @ C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\sch\Sch.jl:1475
Stacktrace:
 [1] wait
   @ .\task.jl:354 [inlined]
 [2] fetch
   @ .\task.jl:374 [inlined]
 [3] fetch_report
   @ C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\sch\util.jl:241
 [4] do_task
   @ C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\sch\Sch.jl:1502
 [5] #132
   @ C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\sch\Sch.jl:1243
  Root Thunk:  Thunk(id=2224, #38(Dagger.WeakChunk(1, 5051, WeakRef(Dagger.Chunk{@NamedTuple{a::Vector{Int64}}, DRef, OSProc, AnyScope}(@NamedTuple{a::Vector{Int64}}, 
UnitDomain(), DRef(1, 5051, 0x0000000000a3d738), OSProc(1), AnyScope(), false))), #129))
  Inner Thunk: Thunk(id=2325, isnonempty(Thunk[2224](#38, Any[Dagger.WeakChunk(1, 5051, WeakRef(Dagger.Chunk{@NamedTuple{a::Vector{Int64}}, DRef, OSProc, AnyScope}(@NamedTuple{a::Vector{Int64}}, UnitDomain(), DRef(1, 5051, 0x0000000000a3d738), OSProc(1), AnyScope(), false))), var"#129#130"()])))
  This Thunk:  Thunk(id=2325, isnonempty(Thunk[2224](#38, Any[Dagger.WeakChunk(1, 5051, WeakRef(Dagger.Chunk{@NamedTuple{a::Vector{Int64}}, DRef, OSProc, AnyScope}(@NamedTuple{a::Vector{Int64}}, UnitDomain(), DRef(1, 5051, 0x0000000000a3d738), OSProc(1), AnyScope(), false))), var"#129#130"()])))
Stacktrace:
  [1] fetch(t::Dagger.ThunkFuture; proc::OSProc, raw::Bool)
    @ Dagger C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\eager_thunk.jl:16
  [2] fetch
    @ C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\eager_thunk.jl:11 [inlined]
  [3] #fetch#75
    @ C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\eager_thunk.jl:58 [inlined]
  [4] fetch
    @ C:\Users\krynjupc\.julia\packages\Dagger\M13n0\src\eager_thunk.jl:54 [inlined]
  [5] #10
    @ C:\Users\krynjupc\.julia\packages\DTables\BjdY2\src\table\dtable.jl:233 [inlined]
  [6] filter(f::DTables.var"#10#13"{Vector{Dagger.EagerThunk}}, a::Vector{Tuple{Int64, Union{Dagger.EagerThunk, Dagger.Chunk}}})
    @ Base .\array.jl:2673
  [7] trim!(d::DTable)
    @ DTables C:\Users\krynjupc\.julia\packages\DTables\BjdY2\src\table\dtable.jl:233
  [8] trim(d::DTable)
    @ DTables C:\Users\krynjupc\.julia\packages\DTables\BjdY2\src\table\dtable.jl:242
  [9] retrieve_partitions
    @ C:\Users\krynjupc\.julia\packages\DTables\BjdY2\src\table\dtable.jl:179 [inlined]
 [10] fetch(d::DTable)
    @ DTables C:\Users\krynjupc\.julia\packages\DTables\BjdY2\src\table\dtable.jl:167
 [11] |>(x::DTable, f::typeof(fetch))
    @ Base .\operators.jl:917
 [12] top-level scope
    @ REPL[57]:1

krynju commented 10 months ago

Reproducer (no files needed): just run julia -p4 and then run the last 5 lines over and over until the error appears (after which any further call will generate the error again).

ENV["JULIA_MEMPOOL_EXPERIMENTAL_FANCY_ALLOCATOR"] = "true"
ENV["JULIA_MEMPOOL_EXPERIMENTAL_MEMORY_BOUND"] = string(2 * (2^30)) # 2GB
# ENV["JULIA_MEMPOOL_EXPERIMENTAL_DISK_CACHE"] = "C:\\Users\\krynjupc\\.mempool\\demo_session_$(rand(Int))"

using Distributed

@info(
    "Execution environment details",
    julia_version=VERSION,
    n_workers=Distributed.nworkers(),
    n_procs=Distributed.nprocs(),
    n_threads=Threads.nthreads(),
)

function view_cache()
    !isdir(ENV["JULIA_MEMPOOL_EXPERIMENTAL_DISK_CACHE"]) && return []
    map(
        x -> (basename(x), round(filesize(x) / 2^20, digits=2)),
        readdir(ENV["JULIA_MEMPOOL_EXPERIMENTAL_DISK_CACHE"], join=true)
    )
end

using DTables
DTables.enable_disk_caching!()

using MemPool
using Dagger

N1 = 2^27 # 1GB
d = DTable((a=rand(Int, N1),), N1 ÷ 100)
map(x -> (r=x.a + 1,), d) |> fetch
MemPool.GLOBAL_DEVICE[]
view_cache()

krynju commented 10 months ago

Can't reproduce with the fix https://github.com/JuliaData/MemPool.jl/pull/74. I stressed it really hard and didn't get any errors.

Will cut a release soon

StevenWhitaker commented 10 months ago

I just tested the new releases of DTables.jl/Dagger.jl/MemPool.jl using the reproducer I mentioned above.

Without disk caching enabled:

With enable_disk_caching!(50, 2^10 * 20):

So, it looks like the issue is not entirely resolved yet.

jpsamaroo commented 10 months ago

@StevenWhitaker when this occurs, how much does Base.format_bytes(MemPool.GLOBAL_DEVICE[].device_size) report, and what amount of memory is reported by the OS as being used by Julia (out of the total system RAM)? If MemPool's LRU truly thinks it's running out of memory, then this will trigger (I probably should make the error more informative).

StevenWhitaker commented 10 months ago

@jpsamaroo I ran the following code to grab the info you requested. Let me know if you need any other info.

julia> include("mwe.jl"); for i = 1:200
           totalmem = Base.format_bytes(Sys.total_physical_memory())
           memusage = map(procs()) do id
               remotecall_fetch(id) do
                   parse(Int, split(read(`ps -p $(getpid()) -o rss`, String), "\n")[end-1]) * 1000
               end
           end
           totalmemusage = Base.format_bytes(sum(memusage))
           worker_memusage = Base.format_bytes(memusage[2])
           device_size = remotecall_fetch(2) do
               DTables.Dagger.MemPool.GLOBAL_DEVICE[].device_size[]
           end |> Base.format_bytes # Remove `device_size` when disk caching not enabled
           @info "" i device_size worker_memusage totalmemusage totalmem
           main()
       end

Without disk caching enabled on WSL 2:

With enable_disk_caching!(50, 2^10 * 20) on WSL 2:

So, it looks like the device is running out of memory, but why?

jpsamaroo commented 10 months ago

Can you try throwing in some @everywhere GC.gc() calls every iteration and see if that delays or eliminates the OOM situation? If so, then it means that we should start automatically calling the GC to reduce memory usage when we start hitting RAM limits.
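
For example, a sketch of that in the driver loop used earlier in this thread (loop bound arbitrary):

```
include("mwe.jl")
for i = 1:200
    # Force a full collection on every process before each query, per the suggestion above.
    @everywhere GC.gc()
    main()
end
```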

StevenWhitaker commented 10 months ago

I added @everywhere GC.gc() at the start of each iteration. It delayed the OOM issue until iteration 300, but the Julia process still was killed.

StevenWhitaker commented 10 months ago

Any updates yet on this front?

jpsamaroo commented 10 months ago

@StevenWhitaker can you try out https://github.com/JuliaData/MemPool.jl/pull/75 and see if it delays the OOM further or eliminates it? You'll probably want to fiddle with the MemPool.MEM_RESERVED[] value to see which avoids the OOM best while minimizing overhead - it's set to 512MB to be conservative, but 2GB works on my machine better (since I run earlyoom which is more eager than the Linux kernel's OOM killer).
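
For example, a sketch of bumping that value on every process (whether a change made after startup is picked up may depend on the PR, and 2 GB is just the example figure mentioned above):

```
using Distributed, MemPool

# MEM_RESERVED is a Ref holding a byte count (512 MiB by default in the PR);
# raise it to 2 GiB on all processes.
@everywhere MemPool.MEM_RESERVED[] = 2 * 2^30
```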

StevenWhitaker commented 10 months ago

@jpsamaroo I tried using the default value of MemPool.MEM_RESERVED[], 2 GB, and 10 GB, and in all cases I got the same error (which looks different from what I reported earlier):

Error message

``` From worker 4: ┌ Error: Error on 4 while connecting to peer 3, exiting From worker 4: │ exception = From worker 4: │ ConcurrencyViolationError("lock must be held") From worker 4: │ Stacktrace: From worker 4: │ [1] concurrency_violation() From worker 4: │ @ Base ./condition.jl:8 From worker 4: │ [2] assert_havelock From worker 4: │ @ ./condition.jl:25 [inlined] From worker 4: │ [3] assert_havelock From worker 4: │ @ ./condition.jl:48 [inlined] From worker 4: │ [4] assert_havelock From worker 4: │ @ ./condition.jl:72 [inlined] From worker 4: │ [5] notify(c::Condition, arg::Any, all::Bool, error::Bool) From worker 4: │ @ Base ./condition.jl:150 From worker 4: │ [6] #notify#622 From worker 4: │ @ ./condition.jl:148 [inlined] From worker 4: │ [7] notify (repeats 2 times) From worker 4: │ @ ./condition.jl:148 [inlined] From worker 4: │ [8] set_worker_state From worker 4: │ @ ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/cluster.jl:148 [inlined] From worker 4: │ [9] Distributed.Worker(id::Int64, r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, manager::Distributed.DefaultClusterManager; version::Nothing, config::WorkerConfig) From worker 4: │ @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/cluster.jl:126 From worker 4: │ [10] Worker From worker 4: │ @ ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/cluster.jl:116 [inlined] From worker 4: │ [11] connect_to_peer(manager::Distributed.DefaultClusterManager, rpid::Int64, wconfig::WorkerConfig) From worker 4: │ @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:364 From worker 4: │ [12] (::Distributed.var"#121#123"{Int64, WorkerConfig})() From worker 4: │ @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:350 From worker 4: │ [13] exec_conn_func(w::Distributed.Worker) From worker 4: │ @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/cluster.jl:181 From worker 4: │ [14] (::Distributed.var"#21#24"{Distributed.Worker})() From worker 4: │ @ Distributed ./task.jl:514 From worker 4: └ @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:370 ┌ Error: Error when tearing down scheduler │ exception = │ TaskFailedException │ │ nested task error: no process with id 4 exists │ Stacktrace: │ [1] error(s::String) │ @ Base ./error.jl:35 │ [2] worker_from_id(pg::Distributed.ProcessGroup, i::Int64) │ @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/cluster.jl:1098 │ [3] worker_from_id │ @ ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/cluster.jl:1090 [inlined] │ [4] #remote_do#170 │ @ ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:557 [inlined] │ [5] remote_do │ @ ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:557 [inlined] │ [6] cleanup_proc(state::Dagger.Sch.ComputeState, p::Dagger.OSProc, log_sink::TimespanLogging.NoOpLog) │ @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:409 │ [7] (::Dagger.Sch.var"#105#108"{Dagger.Context, Dagger.Sch.ComputeState, Dagger.OSProc})() │ @ Dagger.Sch ./task.jl:514 │ Stacktrace: │ [1] sync_end(c::Channel{Any}) │ @ Base ./task.jl:445 │ [2] macro expansion │ @ ./task.jl:477 [inlined] │ [3] scheduler_exit(ctx::Dagger.Context, state::Dagger.Sch.ComputeState, options::Dagger.Sch.SchedulerOptions) │ @ Dagger.Sch 
~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:615 │ [4] compute_dag(ctx::Dagger.Context, d::Dagger.Thunk; options::Dagger.Sch.SchedulerOptions) │ @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:471 │ [5] compute_dag │ @ ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:431 [inlined] │ [6] compute(ctx::Dagger.Context, d::Dagger.Thunk; options::Dagger.Sch.SchedulerOptions) │ @ Dagger ~/.julia/packages/Dagger/lhyAj/src/compute.jl:23 │ [7] compute │ @ ~/.julia/packages/Dagger/lhyAj/src/compute.jl:22 [inlined] │ [8] macro expansion │ @ ~/.julia/packages/Dagger/lhyAj/src/sch/eager.jl:27 [inlined] │ [9] (::Dagger.Sch.var"#58#60"{Dagger.Context})() │ @ Dagger.Sch ./threadingconstructs.jl:416 └ @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:473 Error in eager scheduler: Cannot serialize a WeakChunk Stacktrace: [1] error(s::String) @ Base ./error.jl:35 [2] serialize(io::Distributed.ClusterSerializer{Sockets.TCPSocket}, wc::Dagger.WeakChunk) @ Dagger ~/.julia/packages/Dagger/lhyAj/src/chunks.jl:298 [3] serialize_any(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any) @ Serialization ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:678 [4] serialize @ ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:657 [inlined] [5] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, a::Vector{Pair{Union{Nothing, Symbol}, Any}}) @ Serialization ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:277 [6] serialize_any(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any) @ Serialization ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:678 [7] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any) @ Serialization ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:657 [8] serialize_any(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, x::Any) @ Serialization ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:678 [9] serialize @ ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:657 [inlined] [10] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::Tuple{Bool, Dagger.ThunkFailedException{RemoteException}}) @ Serialization ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:205 [11] serialize(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, t::Tuple{Distributed.RRID, Tuple{Bool, Dagger.ThunkFailedException{RemoteException}}, Int64}) @ Serialization ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Serialization/src/Serialization.jl:205 [12] serialize_msg(s::Distributed.ClusterSerializer{Sockets.TCPSocket}, o::Distributed.CallMsg{:call_fetch}) @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/messages.jl:78 [13] #invokelatest#2 @ ./essentials.jl:819 [inlined] [14] invokelatest @ ./essentials.jl:816 [inlined] [15] send_msg_(w::Distributed.Worker, header::Distributed.MsgHeader, msg::Distributed.CallMsg{:call_fetch}, now::Bool) @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/messages.jl:181 [16] send_msg @ ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/messages.jl:122 [inlined] [17] remotecall_fetch(::Function, ::Distributed.Worker, ::Distributed.RRID, ::Vararg{Any}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ Distributed 
~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:460 [18] remotecall_fetch(::Function, ::Distributed.Worker, ::Distributed.RRID, ::Vararg{Any}) @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:454 [19] remotecall_fetch(::Function, ::Int64, ::Distributed.RRID, ::Vararg{Any}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:492 [20] remotecall_fetch(::Function, ::Int64, ::Distributed.RRID, ::Vararg{Any}) @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:492 [21] call_on_owner(::Function, ::Future, ::Tuple{Bool, Dagger.ThunkFailedException{RemoteException}}, ::Vararg{Any}) @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:565 [22] macro expansion @ ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:681 [inlined] [23] macro expansion @ ./lock.jl:267 [inlined] [24] put!(r::Future, v::Tuple{Bool, Dagger.ThunkFailedException{RemoteException}}) @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/remotecall.jl:680 [25] #put!#71 @ ~/.julia/packages/Dagger/lhyAj/src/eager_thunk.jl:24 [inlined] [26] fill_registered_futures!(state::Dagger.Sch.ComputeState, node::Dagger.Thunk, failed::Bool) @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/util.jl:59 [27] finish_failed!(state::Dagger.Sch.ComputeState, thunk::Dagger.Thunk, origin::Dagger.Thunk) @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/util.jl:183 [28] set_failed!(state::Dagger.Sch.ComputeState, origin::Dagger.Thunk, thunk::Dagger.Thunk) @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/util.jl:180 [29] set_failed! 
@ ~/.julia/packages/Dagger/lhyAj/src/sch/util.jl:177 [inlined] [30] finish_task!(ctx::Dagger.Context, state::Dagger.Sch.ComputeState, node::Dagger.Thunk, thunk_failed::Bool) @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:915 [31] (::Dagger.Sch.var"#103#104"{Dagger.Context, Dagger.Sch.ComputeState, Dagger.OSProc, NamedTuple{(:time_pressure, :storage_pressure, :storage_capacity, :loadavg, :threadtime, :gc_allocd, :transfer_rate), Tuple{UInt64, UInt64, UInt64, Tuple{Float64, Float64, Float64}, UInt64, Int64, UInt64}}, RemoteException, Int64, Dagger.ThreadProc, Int64})() @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:591 [32] lock(f::Dagger.Sch.var"#103#104"{Dagger.Context, Dagger.Sch.ComputeState, Dagger.OSProc, NamedTuple{(:time_pressure, :storage_pressure, :storage_capacity, :loadavg, :threadtime, :gc_allocd, :transfer_rate), Tuple{UInt64, UInt64, UInt64, Tuple{Float64, Float64, Float64}, UInt64, Int64, UInt64}}, RemoteException, Int64, Dagger.ThreadProc, Int64}, l::ReentrantLock) @ Base ./lock.jl:229 [33] scheduler_run(ctx::Dagger.Context, state::Dagger.Sch.ComputeState, d::Dagger.Thunk, options::Dagger.Sch.SchedulerOptions) @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:542 [34] compute_dag(ctx::Dagger.Context, d::Dagger.Thunk; options::Dagger.Sch.SchedulerOptions) @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:466 [35] compute_dag @ ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:431 [inlined] [36] compute(ctx::Dagger.Context, d::Dagger.Thunk; options::Dagger.Sch.SchedulerOptions) @ Dagger ~/.julia/packages/Dagger/lhyAj/src/compute.jl:23 [37] compute @ ~/.julia/packages/Dagger/lhyAj/src/compute.jl:22 [inlined] [38] macro expansion @ ~/.julia/packages/Dagger/lhyAj/src/sch/eager.jl:27 [inlined] [39] (::Dagger.Sch.var"#58#60"{Dagger.Context})() @ Dagger.Sch ./threadingconstructs.jl:416 From worker 3: UNHANDLED TASK ERROR: AssertionError: id > 0Worker 4 terminated. From worker 3: Stacktrace: Unhandled Task ERROR: EOFError: read end of file Stacktrace: [1] (::Base.var"#wait_locked#715")(s::Sockets.TCPSocket, buf::IOBuffer, nb::Int64) @ Base ./stream.jl:947 [2] unsafe_read(s::Sockets.TCPSocket, p::Ptr{UInt8}, nb::UInt64) @ Base ./stream.jl:955 [3] unsafe_read @ ./io.jl:761 [inlined] [4] unsafe_read(s::Sockets.TCPSocket, p::Base.RefValue{NTuple{4, Int64}}, n::Int64) @ Base ./io.jl:760 [5] read! 
@ ./io.jl:762 [inlined] [6] deserialize_hdr_raw @ ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/messages.jl:167 [inlined] [7] message_handler_loop(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool) @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:172 [8] process_tcp_streams(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool) @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:133 [9] (::Distributed.var"#103#104"{Sockets.TCPSocket, Sockets.TCPSocket, Bool})() @ Distributed ./task.jl:514 From worker 3: [1] Distributed.Worker(id::Int64, conn_func::Nothing) From worker 3: @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/cluster.jl:133 From worker 3: [2] Worker From worker 3: @ ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/cluster.jl:131 [inlined] From worker 3: [3] worker_from_id(pg::Distributed.ProcessGroup, i::Int64) From worker 3: @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/cluster.jl:1100 From worker 3: [4] worker_from_id From worker 3: @ ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/cluster.jl:1090 [inlined] From worker 3: [5] message_handler_loop(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool) From worker 3: @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:213 From worker 3: [6] process_tcp_streams(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool) From worker 3: @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:133 From worker 3: [7] (::Distributed.var"#103#104"{Sockets.TCPSocket, Sockets.TCPSocket, Bool})() From worker 3: @ Distributed ./task.jl:514 From worker 3: From worker 3: caused by: Cookie read failed. Connection closed by peer. From worker 3: Stacktrace: From worker 3: [1] error(s::String) From worker 3: @ Base ./error.jl:35 From worker 3: [2] process_hdr(s::Sockets.TCPSocket, validate_cookie::Bool) From worker 3: @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:259 From worker 3: [3] message_handler_loop(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool) From worker 3: @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:158 From worker 3: [4] process_tcp_streams(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool) From worker 3: @ Distributed ~/programs/julia/julia-1.9.4/share/julia/stdlib/v1.9/Distributed/src/process_messages.jl:133 From worker 3: [5] (::Distributed.var"#103#104"{Sockets.TCPSocket, Sockets.TCPSocket, Bool})() From worker 3: @ Distributed ./task.jl:514 ```

The error looks different, but it is probably the same underlying issue (i.e., the OOM manager kills one of my Julia processes), because I noticed one process is missing after the error. Oh, and now I see the Worker 4 terminated. in the stacktrace.

Note that the error occurred sooner with 10 GB reserved.

This is with disk caching disabled; I'm assuming JuliaData/MemPool.jl#75 does nothing about the disk caching errors I've also been experiencing.

jpsamaroo commented 10 months ago

The error shown should be fixed by https://github.com/JuliaParallel/Dagger.jl/issues/450#issuecomment-1812894331; you need to be using Julia 1.11 with my Distributed PR at all times for reliable distributed computing.

And yes, it doesn't yet handle disk caching, but I can add support for that (we'll nicely ask the LRU to swap more stuff to disk if it can).

StevenWhitaker commented 10 months ago

Oh, I thought JuliaLang/Distributed.jl#4 was only relevant if running code with multiple threads, but I'm running julia with -t1. But I will try again tomorrow with that fix.

With disk caching enabled, as far as I can tell, my issue isn't running out of RAM, it's running out of allotted disk space. So the LRU needs to delete stuff that is no longer needed, but it seems like that isn't happening.

StevenWhitaker commented 9 months ago

My reproducer still fails.

With JuliaLang/Distributed.jl#4 and JuliaData/MemPool.jl#75 and the most recent nightly build of Julia:

`MemPool.MEM_RESERVED[]`: Default

``` $ ~/programs/julia/julia-9fc1b653c4/bin/julia --project -t1 --heap-size-hint=3G _ _ _ _(_)_ | Documentation: https://docs.julialang.org (_) | (_) (_) | _ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help. | | | | | | |/ _` | | | | |_| | | | (_| | | Version 1.11.0-DEV.1105 (2023-12-15) _/ |\__'_|_|_|\__'_| | Commit 9fc1b653c43 (0 days old master) |__/ | julia> include("mwe.jl"); @show(Base.format_bytes(DTables.Dagger.MemPool.MEM_RESERVED[])); for i = 1:300 (i % 10 == 0 && @show(i)); main() end Base.format_bytes(DTables.Dagger.MemPool.MEM_RESERVED[]) = "512.000 MiB" From worker 2: ┌ Info: From worker 2: └ length(dt3) = 233930 From worker 2: ┌ Info: From worker 2: └ length(dt3) = 233930 From worker 2: ┌ Info: From worker 2: └ length(dt3) = 233930 From worker 2: ┌ Info: From worker 2: └ length(dt3) = 233930 From worker 2: GC error (probable corruption) From worker 2: Allocations: 191540514 (Pool: 191538428; Big: 2086); GC: 3793 From worker 2:

```

`MemPool.MEM_RESERVED[]`: 2 GB

``` $ JULIA_MEMPOOL_MEMORY_RESERVED=2000000000 ~/programs/julia/julia-9fc1b653c4/bin/julia --project -t 1 --heap-size-hint=3G _ _ _ _(_)_ | Documentation: https://docs.julialang.org (_) | (_) (_) | _ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help. | | | | | | |/ _` | | | | |_| | | | (_| | | Version 1.11.0-DEV.1105 (2023-12-15) _/ |\__'_|_|_|\__'_| | Commit 9fc1b653c43 (0 days old master) |__/ | julia> include("mwe.jl"); @show(Base.format_bytes(DTables.Dagger.MemPool.MEM_RESERVED[])); for i = 1:300 (i % 10 == 0 && @show(i)); main() end Base.format_bytes(DTables.Dagger.MemPool.MEM_RESERVED[]) = "1.863 GiB" From worker 2: ┌ Info: From worker 2: └ length(dt3) = 233930 From worker 2: ┌ Info: From worker 2: └ length(dt3) = 233930 From worker 2: ┌ Info: From worker 2: └ length(dt3) = 233930 From worker 2: ┌ Info: From worker 2: └ length(dt3) = 233930 From worker 2: GC error (probable corruption) From worker 2: Allocations: 191545919 (Pool: 191543833; Big: 2086); GC: 3751 From worker 2:

```

`MemPool.MEM_RESERVED[]`: 10 GB

``` $ JULIA_MEMPOOL_MEMORY_RESERVED=10000000000 ~/programs/julia/julia-9fc1b653c4/bin/julia --project -t1 --heap-size-hint=3G _ _ _ _(_)_ | Documentation: https://docs.julialang.org (_) | (_) (_) | _ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help. | | | | | | |/ _` | | | | |_| | | | (_| | | Version 1.11.0-DEV.1105 (2023-12-15) _/ |\__'_|_|_|\__'_| | Commit 9fc1b653c43 (0 days old master) |__/ | julia> include("mwe.jl"); @show(Base.format_bytes(DTables.Dagger.MemPool.MEM_RESERVED[])); for i = 1:300 (i % 10 == 0 && @show(i)); main() end Base.format_bytes(DTables.Dagger.MemPool.MEM_RESERVED[]) = "9.313 GiB" From worker 2: ┌ Info: From worker 2: └ length(dt3) = 233930 From worker 2: ┌ Info: From worker 2: └ length(dt3) = 233930 From worker 2: ┌ Info: From worker 2: └ length(dt3) = 233930 From worker 2: ┌ Info: From worker 2: └ length(dt3) = 233930 From worker 2: From worker 2: [5814] signal 11 (1): Segmentation fault From worker 2: in expression starting at none:0 From worker 2: Allocations: 191516083 (Pool: 191513997; Big: 2086); GC: 6403 ERROR: ┌ Warning: Worker 2 died, rescheduling work └ @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:546 Worker 2 terminated.ProcessExitedException┌ Warning: Worker 2 died, rescheduling work └ @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:546 (┌ Error: Error assigning workers │ exception = │ ProcessExitedException(2) │ Stacktrace: │ [1] worker_from_id(pg::Distributed.ProcessGroup, i::Int64) │ @ Distributed ~/tmp/distributed_fix/Distributed.jl/src/cluster.jl:1121 │ [2] worker_from_id(pg::Distributed.ProcessGroup, i::Int64) │ @ Distributed ~/tmp/distributed_fix/Distributed.jl/src/cluster.jl:1118 [inlined] │ [3] remote_do │ @ ~/tmp/distributed_fix/Distributed.jl/src/remotecall.jl:557 [inlined] │ [4] cleanup_proc(state::Dagger.Sch.ComputeState, p::Dagger.OSProc, log_sink::TimespanLogging.NoOpLog) │ @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:409 │ [5] monitor_procs_changed!(ctx::Dagger.Context, state::Dagger.Sch.ComputeState) │ @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:887 │ [6] (::Dagger.Sch.var"#100#102"{Dagger.Context, Dagger.Sch.ComputeState})() │ @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:509 └ @ Dagger.Sch ~/.julia/packages/Dagger/lhyAj/src/sch/Sch.jl:511 2Unhandled Task ERROR: EOFError: read end of file Stacktrace: [1] (::Base.var"#wait_locked#797")(s::Sockets.TCPSocket, buf::IOBuffer, nb::Int64) @ Base ./stream.jl:950 [2] unsafe_read(s::Sockets.TCPSocket, p::Ptr{UInt8}, nb::UInt64) @ Base ./stream.jl:958 [3] unsafe_read @ ./io.jl:882 [inlined] [4] unsafe_read(s::Sockets.TCPSocket, p::Base.RefValue{NTuple{4, Int64}}, n::Int64) @ Base ./io.jl:881 [5] read! 
@ ./io.jl:886 [inlined] [6] deserialize_hdr_raw @ ~/tmp/distributed_fix/Distributed.jl/src/messages.jl:167 [inlined] [7] message_handler_loop(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool) @ Distributed ~/tmp/distributed_fix/Distributed.jl/src/process_messages.jl:172 [8] process_tcp_streams(r_stream::Sockets.TCPSocket, w_stream::Sockets.TCPSocket, incoming::Bool) @ Distributed ~/tmp/distributed_fix/Distributed.jl/src/process_messages.jl:133 [9] (::Distributed.var"#113#114"{Sockets.TCPSocket, Sockets.TCPSocket, Bool})() @ Distributed ~/tmp/distributed_fix/Distributed.jl/src/process_messages.jl:121 ) Stacktrace: [1] try_yieldto(undo::typeof(Base.ensure_rescheduled)) @ Base ./task.jl:944 [2] wait() @ Base ./task.jl:1008 [3] wait(c::Base.GenericCondition{ReentrantLock}; first::Bool) @ Base ./condition.jl:130 [4] wait @ Base ./condition.jl:125 [inlined] [5] take_buffered(c::Channel{Any}) @ Base ./channels.jl:477 [6] take!(c::Channel{Any}) @ Base ./channels.jl:471 [7] take!(::Distributed.RemoteValue) @ Distributed ~/tmp/distributed_fix/Distributed.jl/src/remotecall.jl:726 [8] remotecall_fetch(::Function, ::Distributed.Worker; kwargs::@Kwargs{}) @ Distributed ~/tmp/distributed_fix/Distributed.jl/src/remotecall.jl:461 [9] remotecall_fetch(::Function, ::Distributed.Worker) @ Distributed ~/tmp/distributed_fix/Distributed.jl/src/remotecall.jl:454 [10] remotecall_fetch @ ~/tmp/distributed_fix/Distributed.jl/src/remotecall.jl:492 [inlined] [11] main @ ~/tmp/distributed_fix/mwe.jl:19 [inlined] [12] top-level scope @ ./REPL[1]:1 ```

I can try with a different Julia commit if you think the issue here is the particular commit I used. (And let me know if you have a commit in mind.)

StevenWhitaker commented 9 months ago

@jpsamaroo Happy Holidays! Any updates on this front?

And to reiterate some questions I had:

Oh, I thought https://github.com/JuliaLang/Distributed.jl/pull/4 was only relevant if running code with multiple threads, but I'm running julia with -t1.

Is my assumption wrong, i.e., even with one thread we need that Distributed.jl fix?

With disk caching enabled, as far as I can tell, my issue isn't running out of RAM, it's running out of allotted disk space. So the LRU needs to delete stuff that is no longer needed, but it seems like that isn't happening.

Any thoughts on this?

StevenWhitaker commented 9 months ago

I tried a couple of variations of my reproducer to collect a few more data points in case it helps with this issue:

I realized that I accidentally had been using multiple threads on the worker processes (because I didn't realize the -t 1 option doesn't automatically propagate to workers added with addprocs). So, I updated my reproducer to include "-t 1" in the exeflags kwarg of addprocs (the updated call is sketched at the end of this comment). I then ran the reproducer both with and without disk caching, as before:

I then decided to remove the two GC.enable lines in load_dt:

I then tried wrapping the call to query in main with Dagger.with_options(query; scope = ProcessScope(myid())):

There are no memory or disk caching problems when I don't add any processes, regardless of whether or not multiple threads are used.

TL;DR Everything I tried still resulted in failure, except removing processes altogether.
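
The updated addprocs call mentioned in the first variation above, sketched out; the Cmd form of exeflags is used here so both flags are passed to the workers as separate arguments:

```
using Distributed

# Propagate single-threading to the workers explicitly; -t 1 is not inherited from the driver.
addprocs(5 - nprocs(); exeflags = `--heap-size-hint=3G -t 1`)
```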

jpsamaroo commented 9 months ago

Happy Holidays!

Is my assumption wrong, i.e., even with one thread we need that Distributed.jl fix?

Probably yes, as it may still be a race with multiple async tasks. I haven't really experienced that, but it can probably still occur.

Any thoughts on this?

I still need to implement this in https://github.com/JuliaData/MemPool.jl/pull/75 - I have a long TODO list from before the holidays, so I'm slowly working down it. Thanks for your patience :smile:

W.r.t. the non-disk-caching OOMs, I've put together https://github.com/JuliaData/MemPool.jl/pull/76, which together with https://github.com/JuliaData/MemPool.jl/pull/75 and https://github.com/JuliaParallel/Dagger.jl/tree/jps/chained-dtors significantly reduces the amount of memory that Dagger keeps around, and also forces GC calls when we're running out of memory (tunable with the new JULIA_MEMPOOL_MEMORY_RESERVED env var, or MemPool.MEM_RESERVED[]). I haven't tested it heavily with DTables yet, but I've seen some solid improvements with DArray operations. At least for single-worker tests, I see very consistent, much lower memory usage, rather than the wild sawtooth pattern that we're used to seeing.

I'll keep at this, but thank you for the detailed updates and patience while I work through these issues!

StevenWhitaker commented 8 months ago

Thanks for your work, and no worries about having a long TODO list!

I just tried adding JuliaData/MemPool.jl#75 and https://github.com/JuliaParallel/Dagger.jl/tree/jps/chained-dtors to my environment, in conjunction with DTables v0.4.3, and DTables failed to precompile (segfault). I saw this on Julia 1.9.4 and 1.10.0. (But it precompiled fine on 1.11 (2024-01-11 nightly), even without your Distributed fix.) Any thoughts on why DTables would fail to precompile?

jpsamaroo commented 8 months ago

Odd, can you provide a stacktrace of the segfault?

StevenWhitaker commented 8 months ago

In Julia 1.10.0, but it looked the same in Julia 1.9.4.

This is the log when precompiling after updating packages (it's very long, so I had to truncate):

After updating

``` Precompiling project... ✗ DTables 10 dependencies successfully precompiled in 24 seconds. 43 already precompiled. 1 dependency had output during precompilation: ┌ Dagger │ Task │ Task(next=Task(next=Task(next=Task(next=Task(next=Task(next=Task(next=Task(next=Task(next=Task(next=nothing, queue=Base.IntrusiveLinkedList{Task}(head=, tail=), storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x08edfc4b43ca9be3, rngState1=0x50dc04a389826c0e, rngState2=0x18229d209795b919, rngState3=0xfcb77865399e629b, rngState4=0xe4a6980409ad6218, _state=0x00, sticky=false, _isexception=false, priority=0x0000), tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x08edfc4b43ca9be3, rngState1=0x50dc04a389826c0e, rngState2=0x18229d209795b919, rngState3=0xfcb77865399e629b, rngState4=0xe4a6980409ad6218, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Dagger.Sch.var"#1#4"{Task}(t=), rngState0=0xa610950288d1a62b, rngState1=0x69c678e269ea747a, rngState2=0x0b3844ba33ace071, rngState3=0x079455409e71f8ba, rngState4=0x060e91f40887cffb, _state=0x00, sticky=false, _isexception=false, priority=0x0000), queue=Base.IntrusiveLinkedList{Task}(head=, tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x08edfc4b43ca9be3, rngState1=0x50dc04a389826c0e, rngState2=0x18229d209795b919, rngState3=0xfcb77865399e629b, rngState4=0xe4a6980409ad6218, _state=0x00, sticky=false, _isexception=false, priority=0x0000), tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x08edfc4b43ca9be3, rngState1=0x50dc04a389826c0e, rngState2=0x18229d209795b919, rngState3=0xfcb77865399e629b, rngState4=0xe4a6980409ad6218, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Dagger.Sch.var"#1#4"{Task}(t=), rngState0=0xa610950288d1a62b, rngState1=0x69c678e269ea747a, rngState2=0x0b3844ba33ace071, rngState3=0x079455409e71f8ba, rngState4=0x060e91f40887cffb, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), 
lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x8beae08b01e92a62, rngState1=0xbf7c41e876765331, rngState2=0x6eafabeca88db3b5, rngState3=0xb7b76922816e3418, rngState4=0x0a3920fe1bb73992, _state=0x00, sticky=false, _isexception=false, priority=0x0000), tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x8beae08b01e92a62, rngState1=0xbf7c41e876765331, rngState2=0x6eafabeca88db3b5, rngState3=0xb7b76922816e3418, rngState4=0x0a3920fe1bb73992, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Dagger.var"#101#102"{Dagger.UnrefThunkByUser}(unref=Dagger.UnrefThunkByUser(thunk=Dagger.Thunk(f=Base.:(+), inputs=Array{Pair{Union{Nothing, Symbol}, Any}, (2,)}[Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1), Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1)], syncdeps=Base.Set{Any}(dict=Base.Dict{Any, Nothing}(slots=Array{UInt8, (16,)}[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00], keys=Array{Any, (16,)}[#, #, #, #, #, #, #, #, #, #, #, #, #, #, #, #], vals=Array{Nothing, (16,)}[nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing], ndel=0, count=0, age=0x0000000000000000, idxfloor=16, maxprobe=0)), id=2, get_result=false, meta=false, persist=false, cache=false, cache_ref=nothing, affinity=nothing, eager_ref=MemPool.DRef(owner=1, id=0, size=0x0000000000000008), options=Dagger.Sch.ThunkOptions(single=nothing, proclist=nothing, time_util=nothing, alloc_util=nothing, occupancy=nothing, allow_errors=nothing, checkpoint=nothing, restore=nothing, storage=nothing, storage_root_tag=nothing, storage_leaf_tag=nothing, storage_retain=false), propagates=()))), rngState0=0xe697ff881da65821, rngState1=0x9d4dfe9c7d07f5ca, rngState2=0x6356cc58f183f300, rngState3=0x52aaf13dd576c4c6, rngState4=0x5cd84792ebe5130d, _state=0x00, sticky=false, _isexception=false, priority=0x0000), queue=Base.IntrusiveLinkedList{Task}(head=, tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x08edfc4b43ca9be3, rngState1=0x50dc04a389826c0e, rngState2=0x18229d209795b919, rngState3=0xfcb77865399e629b, rngState4=0xe4a6980409ad6218, _state=0x00, sticky=false, _isexception=false, priority=0x0000), tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x08edfc4b43ca9be3, rngState1=0x50dc04a389826c0e, rngState2=0x18229d209795b919, rngState3=0xfcb77865399e629b, rngState4=0xe4a6980409ad6218, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), lock=Base.Threads.SpinLock(owned=0)), 
result=nothing, logstate=nothing, code=Dagger.Sch.var"#1#4"{Task}(t=Task(next=, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x8beae08b01e92a62, rngState1=0xbf7c41e876765331, rngState2=0x6eafabeca88db3b5, rngState3=0xb7b76922816e3418, rngState4=0x0a3920fe1bb73992, _state=0x00, sticky=false, _isexception=false, priority=0x0000), tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x8beae08b01e92a62, rngState1=0xbf7c41e876765331, rngState2=0x6eafabeca88db3b5, rngState3=0xb7b76922816e3418, rngState4=0x0a3920fe1bb73992, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Dagger.var"#101#102"{Dagger.UnrefThunkByUser}(unref=Dagger.UnrefThunkByUser(thunk=Dagger.Thunk(f=Base.:(+), inputs=Array{Pair{Union{Nothing, Symbol}, Any}, (2,)}[Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1), Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1)], syncdeps=Base.Set{Any}(dict=Base.Dict{Any, Nothing}(slots=Array{UInt8, (16,)}[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00], keys=Array{Any, (16,)}[#, #, #, #, #, #, #, #, #, #, #, #, #, #, #, #], vals=Array{Nothing, (16,)}[nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing], ndel=0, count=0, age=0x0000000000000000, idxfloor=16, maxprobe=0)), id=2, get_result=false, meta=false, persist=false, cache=false, cache_ref=nothing, affinity=nothing, eager_ref=MemPool.DRef(owner=1, id=0, size=0x0000000000000008), options=Dagger.Sch.ThunkOptions(single=nothing, proclist=nothing, time_util=nothing, alloc_util=nothing, occupancy=nothing, allow_errors=nothing, checkpoint=nothing, restore=nothing, storage=nothing, storage_root_tag=nothing, storage_leaf_tag=nothing, storage_retain=false), propagates=()))), rngState0=0xe697ff881da65821, rngState1=0x9d4dfe9c7d07f5ca, rngState2=0x6356cc58f183f300, rngState3=0x52aaf13dd576c4c6, rngState4=0x5cd84792ebe5130d, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), rngState0=0xa610950288d1a62b, rngState1=0x69c678e269ea747a, rngState2=0x0b3844ba33ace071, rngState3=0x079455409e71f8ba, rngState4=0x060e91f40887cffb, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x57ac2d299a0c4979, rngState1=0xdb6d9cc8bb1ff127, rngState2=0x8578e29541ccd5c8, rngState3=0xe04def6ef7584a19, rngState4=0x137b48b63ff2b3dc, _state=0x00, sticky=false, _isexception=false, priority=0x0000), tail=Task(next=nothing, 
queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x57ac2d299a0c4979, rngState1=0xdb6d9cc8bb1ff127, rngState2=0x8578e29541ccd5c8, rngState3=0xe04def6ef7584a19, rngState4=0x137b48b63ff2b3dc, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=MemPool.var"#187#188"{Int64, MemPool.RefState}(id=2, state=MemPool.RefState(storage=MemPool.StorageState(data=Base.Some{Any}(value=Dagger.Thunk(f=Base.:(+), inputs=Array{Pair{Union{Nothing, Symbol}, Any}, (2,)}[Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1), Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1)], syncdeps=Base.Set{Any}(dict=Base.Dict{Any, Nothing}(slots=Array{UInt8, (16,)}[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00], keys=Array{Any, (16,)}[#, #, #, #, #, #, #, #, #, #, #, #, #, #, #, #], vals=Array{Nothing, (16,)}[nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing], ndel=0, count=0, age=0x0000000000000000, idxfloor=16, maxprobe=0)), id=2, get_result=false, meta=false, persist=false, cache=false, cache_ref=nothing, affinity=nothing, eager_ref=MemPool.DRef(owner=1, id=0, size=0x0000000000000008), options=Dagger.Sch.ThunkOptions(single=nothing, proclist=nothing, time_util=nothing, alloc_util=nothing, occupancy=nothing, allow_errors=nothing, checkpoint=nothing, restore=nothing, storage=nothing, storage_root_tag=nothing, storage_leaf_tag=nothing, storage_retain=false), propagates=())), leaves=Array{MemPool.StorageLeaf, (0,)}[], root=MemPool.CPURAMDevice(), ready=Base.Event(notify=Base.GenericCondition{Base.ReentrantLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.ReentrantLock(locked_by=nothing, reentrancy_cnt=0x00000000, havelock=0x00, cond_wait=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), _=(0, 139637977253888, 4294967297))), autoreset=false, set=true)), size=0x0000000000000040, tag=nothing, leaf_tag=MemPool.Tag(tags=Base.Dict{Type, Any}(slots=Array{UInt8, (16,)}[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00], keys=Array{Type, (16,)}[#, #, #, #, #, #, #, #, #, #, #, #, #, #, #, #], vals=Array{Any, (16,)}[#, #, #, #, #, #, #, #, #, #, #, #, #, #, #, #], ndel=0, count=0, age=0x0000000000000000, idxfloor=16, maxprobe=0)), destructor=Dagger.UnrefThunkByUser(thunk=Dagger.Thunk(f=Base.:(+), inputs=Array{Pair{Union{Nothing, Symbol}, Any}, (2,)}[Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1), Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1)], syncdeps=Base.Set{Any}(dict=Base.Dict{Any, Nothing}(slots=Array{UInt8, (16,)}[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00], keys=Array{Any, (16,)}[#, #, #, #, #, #, #, #, #, #, #, #, #, #, #, #], vals=Array{Nothing, (16,)}[nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing], ndel=0, count=0, age=0x0000000000000000, idxfloor=16, maxprobe=0)), id=2, get_result=false, meta=false, persist=false, cache=false, 
cache_ref=nothing, affinity=nothing, eager_ref=MemPool.DRef(owner=1, id=0, size=0x0000000000000008), options=Dagger.Sch.ThunkOptions(single=nothing, proclist=nothing, time_util=nothing, alloc_util=nothing, occupancy=nothing, allow_errors=nothing, checkpoint=nothing, restore=nothing, storage=nothing, storage_root_tag=nothing, storage_leaf_tag=nothing, storage_retain=false), propagates=())))), rngState0=0x1964c79885361d8e, rngState1=0xc6b0cf0864f19a3f, rngState2=0x1b522b8d6540deb2, rngState3=0x1a2beff1be0852b4, rngState4=0xde98a337cd85f9af, _state=0x00, sticky=false, _isexception=false, priority=0x0000), queue=Base.IntrusiveLinkedList{Task}(head=, tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x08edfc4b43ca9be3, rngState1=0x50dc04a389826c0e, rngState2=0x18229d209795b919, rngState3=0xfcb77865399e629b, rngState4=0xe4a6980409ad6218, _state=0x00, sticky=false, _isexception=false, priority=0x0000), tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x08edfc4b43ca9be3, rngState1=0x50dc04a389826c0e, rngState2=0x18229d209795b919, rngState3=0xfcb77865399e629b, rngState4=0xe4a6980409ad6218, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Dagger.Sch.var"#1#4"{Task}(t=Task(next=, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x8beae08b01e92a62, rngState1=0xbf7c41e876765331, rngState2=0x6eafabeca88db3b5, rngState3=0xb7b76922816e3418, rngState4=0x0a3920fe1bb73992, _state=0x00, sticky=false, _isexception=false, priority=0x0000), tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x8beae08b01e92a62, rngState1=0xbf7c41e876765331, rngState2=0x6eafabeca88db3b5, rngState3=0xb7b76922816e3418, rngState4=0x0a3920fe1bb73992, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Dagger.var"#101#102"{Dagger.UnrefThunkByUser}(unref=Dagger.UnrefThunkByUser(thunk=Dagger.Thunk(f=Base.:(+), inputs=Array{Pair{Union{Nothing, Symbol}, Any}, (2,)}[Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1), Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1)], syncdeps=Base.Set{Any}(dict=Base.Dict{Any, Nothing}(slots=Array{UInt8, (16,)}[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00], 
keys=Array{Any, (16,)}[#, #, #, #, #, #, #, #, #, #, #, #, #, #, #, #], vals=Array{Nothing, (16,)}[nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing], ndel=0, count=0, age=0x0000000000000000, idxfloor=16, maxprobe=0)), id=2, get_result=false, meta=false, persist=false, cache=false, cache_ref=nothing, affinity=nothing, eager_ref=MemPool.DRef(owner=1, id=0, size=0x0000000000000008), options=Dagger.Sch.ThunkOptions(single=nothing, proclist=nothing, time_util=nothing, alloc_util=nothing, occupancy=nothing, allow_errors=nothing, checkpoint=nothing, restore=nothing, storage=nothing, storage_root_tag=nothing, storage_leaf_tag=nothing, storage_retain=false), propagates=()))), rngState0=0xe697ff881da65821, rngState1=0x9d4dfe9c7d07f5ca, rngState2=0x6356cc58f183f300, rngState3=0x52aaf13dd576c4c6, rngState4=0x5cd84792ebe5130d, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), rngState0=0xa610950288d1a62b, rngState1=0x69c678e269ea747a, rngState2=0x0b3844ba33ace071, rngState3=0x079455409e71f8ba, rngState4=0x060e91f40887cffb, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x069d764105eea6ff, rngState1=0x3e39b45e868c94e2, rngState2=0xbc3c1f09676bfede, rngState3=0xd7a8052e37358a6a, rngState4=0x8d78746561669f76, _state=0x00, sticky=false, _isexception=false, priority=0x0000), tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x069d764105eea6ff, rngState1=0x3e39b45e868c94e2, rngState2=0xbc3c1f09676bfede, rngState3=0xd7a8052e37358a6a, rngState4=0x8d78746561669f76, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Dagger.Sch.var"#1#4"{Task}(t=), rngState0=0xec0667b380aa7ce4, rngState1=0xb561d121a0d7dd98, rngState2=0xb53004b94cecfd90, rngState3=0x3ca6f165ed7c2e7b, rngState4=0xe0a08a73fc8c7861, _state=0x00, sticky=false, _isexception=false, priority=0x0000), queue=Base.IntrusiveLinkedList{Task}(head=, tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x08edfc4b43ca9be3, rngState1=0x50dc04a389826c0e, rngState2=0x18229d209795b919, rngState3=0xfcb77865399e629b, rngState4=0xe4a6980409ad6218, _state=0x00, sticky=false, _isexception=false, priority=0x0000), tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, 
code=Base.var"#671#672"{Task}(t=), rngState0=0x08edfc4b43ca9be3, rngState1=0x50dc04a389826c0e, rngState2=0x18229d209795b919, rngState3=0xfcb77865399e629b, rngState4=0xe4a6980409ad6218, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Dagger.Sch.var"#1#4"{Task}(t=Task(next=, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x8beae08b01e92a62, rngState1=0xbf7c41e876765331, rngState2=0x6eafabeca88db3b5, rngState3=0xb7b76922816e3418, rngState4=0x0a3920fe1bb73992, _state=0x00, sticky=false, _isexception=false, priority=0x0000), tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x8beae08b01e92a62, rngState1=0xbf7c41e876765331, rngState2=0x6eafabeca88db3b5, rngState3=0xb7b76922816e3418, rngState4=0x0a3920fe1bb73992, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Dagger.var"#101#102"{Dagger.UnrefThunkByUser}(unref=Dagger.UnrefThunkByUser(thunk=Dagger.Thunk(f=Base.:(+), inputs=Array{Pair{Union{Nothing, Symbol}, Any}, (2,)}[Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1), Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1)], syncdeps=Base.Set{Any}(dict=Base.Dict{Any, Nothing}(slots=Array{UInt8, (16,)}[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00], keys=Array{Any, (16,)}[#, #, #, #, #, #, #, #, #, #, #, #, #, #, #, #], vals=Array{Nothing, (16,)}[nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing], ndel=0, count=0, age=0x0000000000000000, idxfloor=16, maxprobe=0)), id=2, get_result=false, meta=false, persist=false, cache=false, cache_ref=nothing, affinity=nothing, eager_ref=MemPool.DRef(owner=1, id=0, size=0x0000000000000008), options=Dagger.Sch.ThunkOptions(single=nothing, proclist=nothing, time_util=nothing, alloc_util=nothing, occupancy=nothing, allow_errors=nothing, checkpoint=nothing, restore=nothing, storage=nothing, storage_root_tag=nothing, storage_leaf_tag=nothing, storage_retain=false), propagates=()))), rngState0=0xe697ff881da65821, rngState1=0x9d4dfe9c7d07f5ca, rngState2=0x6356cc58f183f300, rngState3=0x52aaf13dd576c4c6, rngState4=0x5cd84792ebe5130d, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), rngState0=0xa610950288d1a62b, rngState1=0x69c678e269ea747a, rngState2=0x0b3844ba33ace071, rngState3=0x079455409e71f8ba, rngState4=0x060e91f40887cffb, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, 
logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x49e6972f8b3ff2c2, rngState1=0xe7a6d748b1245761, rngState2=0x0764377583e7a4cc, rngState3=0xb2b082bf6c7468f6, rngState4=0x34c8bea741f1fee0, _state=0x00, sticky=false, _isexception=false, priority=0x0000), tail=Task(next=nothing, queue=, storage=nothing, donenotify=Base.GenericCondition{Base.Threads.SpinLock}(waitq=Base.IntrusiveLinkedList{Task}(head=nothing, tail=nothing), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Base.var"#671#672"{Task}(t=), rngState0=0x49e6972f8b3ff2c2, rngState1=0xe7a6d748b1245761, rngState2=0x0764377583e7a4cc, rngState3=0xb2b082bf6c7468f6, rngState4=0x34c8bea741f1fee0, _state=0x00, sticky=false, _isexception=false, priority=0x0000)), lock=Base.Threads.SpinLock(owned=0)), result=nothing, logstate=nothing, code=Dagger.var"#101#102"{Dagger.UnrefThunkByUser}(unref=Dagger.UnrefThunkByUser(thunk=Dagger.Thunk(f=Base.:(+), inputs=Array{Pair{Union{Nothing, Symbol}, Any}, (2,)}[Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1), Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=Dagger.WeakThunk(x=WeakRef(value=Dagger.Thunk(f=Base.:(+), inputs=Array{Pair{Union{Nothing, Symbol}, Any}, (2,)}[Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1), Pair{Union{Nothing, Symbol}, Any}(first=nothing, second=1)], syncdeps=Base.Set{Any}(dict=Base.Dict{Any, Nothing}(slots=Array{UInt8, (16,)}[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00], keys=Array{Any, (16,)}[#, #, #, #, #, #, #, #, #, #, #, #, #, #, #, #], vals=Array{Nothing, (16,)}[nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing, nothing], ndel=0, count=0, age=0x0000000000000000, idxfloor=16, maxprobe=0)), id=2, get_result=false, meta=false, persist=false, cache=false, cache_ref=nothing, affinity=nothing, eager_ref=MemPool.DRef(owner=1, id=0, size=0x0000000000000008), options=Dagger.Sch.ThunkOptions(single=nothing, proclist=nothing, time_util=nothing, alloc_util=nothing, occupancy=nothing, allow_errors=nothing, checkpoint=nothing, restore=nothing, storage=nothing, storage_root_tag=nothing, storage_leaf_tag=nothing, storage_retain=false), propagates=()))))], syncdeps=Base.Set{Any}(dict=Base.Dict{Any, Nothing}(slots=Array{UInt8, (16,)}[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00], keys=Array{Any, (16,)}[ │ #, │ #, │ #, │ #, │ #, │ #, │ #, │ #, │ Dagger.WeakThunk(x=WeakRef(value=Dagger.Thunk(f=Base.:(+), inputs=Array{Pair{Union{Nothing, Symbol}, Any}, (2,)} │ ... └ 1 dependency errored. For a report of the errors see `julia> err`. To retry use `pkg> precompile` ```

And here's the segfault that occurs when calling `precompile`:

Segfault

```
(chained-dtors) pkg> precompile
Precompiling project...
  ✗ DTables
  0 dependencies successfully precompiled in 1 seconds. 53 already precompiled.

ERROR: The following 1 direct dependency failed to precompile:

DTables [20c56dc6-594c-4682-91cf-1d46875b1eba]

Failed to precompile DTables [20c56dc6-594c-4682-91cf-1d46875b1eba] to "/home/steven/.julia/compiled/v1.10/DTables/jl_f6HpNj".

[1629] signal (6.-6): Aborted
in expression starting at /home/steven/.julia/packages/DTables/EiSy4/src/DTables.jl:7
pthread_kill at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
raise at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
abort at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
get_item_for_reloc at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/staticdata.c:1798 [inlined]
jl_read_reloclist at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/staticdata.c:1874
jl_restore_system_image_from_stream_ at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/staticdata.c:2996
jl_restore_package_image_from_stream at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/staticdata.c:3418
jl_restore_incremental_from_buf at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/staticdata.c:3465
ijl_restore_package_image_from_file at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/staticdata.c:3549
_include_from_serialized at ./loading.jl:1052
_require_search_from_serialized at ./loading.jl:1575
_require at ./loading.jl:1932
__require_prelocked at ./loading.jl:1806
jfptr___require_prelocked_80742.1 at /home/steven/programs/julia/julia-1.10.0/lib/julia/sys.so (unknown line)
_jl_invoke at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/gf.c:2894 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/gf.c:3076
jl_apply at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
jl_f__call_in_world at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/builtins.c:831
#invoke_in_world#3 at ./essentials.jl:921 [inlined]
invoke_in_world at ./essentials.jl:918 [inlined]
_require_prelocked at ./loading.jl:1797
macro expansion at ./loading.jl:1784 [inlined]
macro expansion at ./lock.jl:267 [inlined]
__require at ./loading.jl:1747
jfptr___require_80707.1 at /home/steven/programs/julia/julia-1.10.0/lib/julia/sys.so (unknown line)
_jl_invoke at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/gf.c:2894 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/gf.c:3076
jl_apply at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
jl_f__call_in_world at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/builtins.c:831
#invoke_in_world#3 at ./essentials.jl:921 [inlined]
invoke_in_world at ./essentials.jl:918 [inlined]
require at ./loading.jl:1740
jfptr_require_80704.1 at /home/steven/programs/julia/julia-1.10.0/lib/julia/sys.so (unknown line)
_jl_invoke at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/gf.c:2894 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/gf.c:3076
jl_apply at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
call_require at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/toplevel.c:481 [inlined]
eval_import_path at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/toplevel.c:518
eval_import_from at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/toplevel.c:635 [inlined]
eval_import_from at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/toplevel.c:626
jl_toplevel_eval_flex at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/toplevel.c:742
jl_eval_module_expr at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/toplevel.c:215 [inlined]
jl_toplevel_eval_flex at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/toplevel.c:736
jl_toplevel_eval_flex at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/toplevel.c:877
ijl_toplevel_eval_in at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/toplevel.c:985
eval at ./boot.jl:385 [inlined]
include_string at ./loading.jl:2070
_jl_invoke at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/gf.c:2894 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/gf.c:3076
_include at ./loading.jl:2130
include at ./Base.jl:495 [inlined]
include_package_for_output at ./loading.jl:2216
jfptr_include_package_for_output_80987.1 at /home/steven/programs/julia/julia-1.10.0/lib/julia/sys.so (unknown line)
_jl_invoke at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/gf.c:2894 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/gf.c:3076
jl_apply at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
do_call at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/interpreter.c:126
eval_value at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/interpreter.c:223
eval_stmt_value at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/interpreter.c:174 [inlined]
eval_body at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/interpreter.c:617
jl_interpret_toplevel_thunk at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/interpreter.c:775
jl_toplevel_eval_flex at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/toplevel.c:934
jl_toplevel_eval_flex at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/toplevel.c:877
ijl_toplevel_eval_in at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/toplevel.c:985
eval at ./boot.jl:385 [inlined]
include_string at ./loading.jl:2070
include_string at ./loading.jl:2080 [inlined]
exec_options at ./client.jl:316
_start at ./client.jl:552
jfptr__start_82703.1 at /home/steven/programs/julia/julia-1.10.0/lib/julia/sys.so (unknown line)
_jl_invoke at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/gf.c:2894 [inlined]
ijl_apply_generic at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/gf.c:3076
jl_apply at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/julia.h:1982 [inlined]
true_main at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/jlapi.c:582
jl_repl_entrypoint at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/src/jlapi.c:731
main at /cache/build/builder-amdci4-6/julialang/julia-release-1-dot-10/cli/loader_exe.c:58
unknown function (ip: 0x7f524eabcd8f)
__libc_start_main at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
unknown function (ip: 0x4010b8)
Allocations: 2906 (Pool: 2897; Big: 9); GC:
```

jpsamaroo commented 8 months ago

Yeah I'm dealing with these errors too now that I've opened the PR. I'll get those fixed and then let you know when you can try it.

StevenWhitaker commented 8 months ago

@jpsamaroo I just saw JuliaLang/julia#40626. Does that issue affect Dagger.jl at all, i.e., could it be the cause of what I'm seeing, where memory never seems to get freed?

jpsamaroo commented 8 months ago

Huh, I hadn't thought of that, but now that you mention it, every Dagger task does rely on fetching the result of a Julia Task to get the Dagger task's return value (https://github.com/JuliaParallel/Dagger.jl/blob/2110d621212591b6652cfeb8d6ba668255358d10/src/processor.jl#L164-L172). That could certainly cause over-preservation of task results. I'll investigate this week and get back to you!
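
As a minimal sketch of the plain-Julia behavior being discussed (illustrative only, not Dagger's actual code path): a completed `Task` keeps a reference to its return value, so anything that keeps the `Task` object alive also keeps the result alive.

```julia
# Hypothetical illustration: the Task object itself retains its result.
t = Threads.@spawn rand(10^7)   # ~80 MB return value
result = fetch(t)               # value is now reachable via `result` *and* via `t`
result = nothing
GC.gc()
# As long as `t` stays reachable (e.g. stored in a queue or captured by a
# closure), the array it returned cannot be collected, even though user code
# has dropped its own reference to it.
```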

Also, the jps/chained-dtors branch has been merged into Dagger master (with fixes for the precompile errors), so please give that a try when you get a chance!
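
(For anyone following along: one way to test the unreleased changes is to add Dagger at its `master` revision; the exact command below is just a suggestion.)

```julia
using Pkg
Pkg.add(name = "Dagger", rev = "master")  # equivalent to `pkg> add Dagger#master`
```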

StevenWhitaker commented 8 months ago

@jpsamaroo I've been trying several different things, so here's a report of my findings.

Comparison to MemPool.jl

`dtables.jl` (no significant changes to the original MWE I posted)

```julia
@everywhere using DTables, DataFrames, CSV

if @isdefined(USE_DISK_CACHING) && USE_DISK_CACHING
    @info "disk caching enabled"
    enable_disk_caching!(8, 20 * 2^10) # 8% max memory to lead to ~512 MiB per process to match custom code
else
    @info "no disk caching"
end

@everywhere const DT = Ref{DTable}()

@everywhere mutable struct DTableCols
    key_names
    value_names
    keys
    values
end

function main()
    remotecall_fetch(query, 2)
end

@everywhere function query()
    dt1 = load_dt()
    dt2 = add_value_col!(dt1)
    dt3 = update_value_col!(dt2)
    @info "" length(dt3)
    dt4 = calc_value_cols(dt3)
    dt5 = select(dt4, [6; 12; 103:113]...; copycols = false)
    dt_agg = aggregate_dt(dt5)
    return fetch(dt_agg)
end

@everywhere function load_dt()
    isassigned(DT) && return DT[]
    file = "file.csv"
    dt = DTable(x -> CSV.File(x), [file]; tabletype = DataFrame)
    DT[] = dt
    return dt
end

@everywhere function add_value_col!(dt)
    dt_cols = create_dt_cols(dt, 1:48, 49:102)
    dt_cols.value_names = [dt_cols.value_names; "RAND"]
    dt_cols.values = (dt_cols.values..., rand(length(dt_cols.values[1])))
    return create_dt_from_cols(dt_cols; is_sorted = true)
end

@everywhere function create_dt_cols(dt, key_cols, value_cols)
    df = fetch(dt)
    key_names = names(df)[key_cols]
    value_names = names(df)[value_cols]
    keys = [df[!, i] for i in key_cols]
    values = [df[!, i] for i in value_cols]
    return DTableCols(key_names, value_names, keys, values)
end

@everywhere function create_dt_from_cols(dt_cols; is_sorted = false)
    df = DataFrame(
        (dt_cols.key_names .=> dt_cols.keys)...,
        (dt_cols.value_names .=> dt_cols.values)...;
        copycols = false,
    )
    is_sorted || sort!(df)
    return DTable(df)
end

@everywhere function update_value_col!(dt)
    dt_cols = create_dt_cols(dt, 1:48, 49:103)
    dt_cols.values = (
        dt_cols.values[1:10]...,
        rand(length(dt_cols.values[1])),
        dt_cols.values[12:end]...,
    )
    return create_dt_from_cols(dt_cols; is_sorted = true)
end

@everywhere function calc_value_cols(dt)
    newvals = Vector{Float64}[]
    for i = 1:10
        v = calc_new_value(dt, i)
        push!(newvals, v)
    end
    return append_value_cols(dt, newvals)
end

@everywhere function calc_new_value(dt, i)
    dt_cols = create_dt_cols(dt, 1:48, 49:103)
    return abs.(dt_cols.values[i])
end

@everywhere function append_value_cols(dt, newvals)
    df = fetch(dt)
    for (i, v) in enumerate(newvals)
        setproperty!(df, "NEW$i", v)
    end
    return DTable(df)
end

@everywhere function aggregate_dt(dt)
    key_names = [Symbol("6"), Symbol("12")]
    gdt = groupby(fetch(dt), key_names)
    gkeys = sort!(collect(keys(gdt)))
    key_pairs = key_names .=> invert(gkeys)
    value_names = [[Symbol("RAND")]; Symbol.("NEW", 1:10)]
    sums = fetch(reduce(+, gdt; cols = value_names))
    sorted = sortperm(invert(sums[key_names]))
    value_pairs = map(value_names) do value
        value => sums[Symbol(:result_, value)][sorted]
    end
    return DTable(DataFrame(key_pairs..., value_pairs...))
end

@everywhere invert(x) = [[x[j][i] for j = 1:length(x)] for i = 1:length(x[1])]

@everywhere function Base.reduce(f, df::DataFrames.AbstractDataFrame; cols)
    NamedTuple(col => reduce(f, df[!, col]) for col in cols)
end

@everywhere function Base.reduce(f, gdt::DataFrames.GroupedDataFrame; cols)
    gkeys = keys(gdt)
    dims = keys(gkeys[1])
    merge(
        NamedTuple(dim => getproperty.(gkeys, dim) for dim in dims),
        NamedTuple(
            Symbol(:result_, col) => [reduce(f, gdt[k]; cols = [col])[col] for k in gkeys]
            for col in cols
        ),
    )
end
```

`custom.jl` (same as `dtables.jl`, except `DTable` is replaced by a custom `DCTable` struct that stores its `DataFrame` directly in MemPool)

```julia
@everywhere using MemPool, DataFrames, CSV

if @isdefined(USE_DISK_CACHING) && USE_DISK_CACHING
    @info "disk caching enabled"
    @everywhere let total_mem = Sys.total_memory() ÷ 2
        # mem_per_proc = Int(total_mem ÷ nprocs()) # This is too much memory for testing!
        mem_per_proc = 512 * 2^20
        config = MemPool.DiskCacheConfig(; toggle = true, membound = mem_per_proc, diskbound = 20 * 2^30)
        MemPool.setup_global_device!(config)
    end
else
    @info "no disk caching"
end

@everywhere struct DCTable
    ref::DRef
    DCTable(df::DataFrame) = new(poolset(df))
end

# Call `copy` to be a fairer comparison to DTables.jl's `fetch`.
# Code is much faster without `copy`!
@everywhere Base.fetch(dt::DCTable) = copy(poolget(dt.ref))

@everywhere Base.length(dt::DCTable) = nrow(fetch(dt))

@everywhere function DataFrames.select(dt::DCTable, args...; kwargs...)
    df = fetch(dt)
    selected = select(df, args...; kwargs...)
    DCTable(selected)
end

@everywhere const DT = Ref{DCTable}()

@everywhere mutable struct DCTableCols
    key_names
    value_names
    keys
    values
end

function main()
    remotecall_fetch(query, 2)
end

@everywhere function query()
    dt1 = load_dt()
    dt2 = add_value_col!(dt1)
    dt3 = update_value_col!(dt2)
    @info "" length(dt3)
    dt4 = calc_value_cols(dt3)
    dt5 = select(dt4, [6; 12; 103:113]...; copycols = false)
    dt_agg = aggregate_dt(dt5)
    return fetch(dt_agg)
end

@everywhere function load_dt()
    isassigned(DT) && return DT[]
    file = "file.csv"
    df = CSV.read(file, DataFrame)
    dt = DCTable(df)
    DT[] = dt
    return dt
end

@everywhere function add_value_col!(dt)
    dt_cols = create_dt_cols(dt, 1:48, 49:102)
    dt_cols.value_names = [dt_cols.value_names; "RAND"]
    dt_cols.values = (dt_cols.values..., rand(length(dt_cols.values[1])))
    return create_dt_from_cols(dt_cols; is_sorted = true)
end

@everywhere function create_dt_cols(dt, key_cols, value_cols)
    df = fetch(dt)
    key_names = names(df)[key_cols]
    value_names = names(df)[value_cols]
    keys = [df[!, i] for i in key_cols]
    values = [df[!, i] for i in value_cols]
    return DCTableCols(key_names, value_names, keys, values)
end

@everywhere function create_dt_from_cols(dt_cols; is_sorted = false)
    df = DataFrame(
        (dt_cols.key_names .=> dt_cols.keys)...,
        (dt_cols.value_names .=> dt_cols.values)...;
        copycols = false,
    )
    is_sorted || sort!(df)
    return DCTable(df)
end

@everywhere function update_value_col!(dt)
    dt_cols = create_dt_cols(dt, 1:48, 49:103)
    dt_cols.values = (
        dt_cols.values[1:10]...,
        rand(length(dt_cols.values[1])),
        dt_cols.values[12:end]...,
    )
    return create_dt_from_cols(dt_cols; is_sorted = true)
end

@everywhere function calc_value_cols(dt)
    newvals = Vector{Float64}[]
    for i = 1:10
        v = calc_new_value(dt, i)
        push!(newvals, v)
    end
    return append_value_cols(dt, newvals)
end

@everywhere function calc_new_value(dt, i)
    dt_cols = create_dt_cols(dt, 1:48, 49:103)
    return abs.(dt_cols.values[i])
end

@everywhere function append_value_cols(dt, newvals)
    df = fetch(dt)
    for (i, v) in enumerate(newvals)
        setproperty!(df, "NEW$i", v)
    end
    return DCTable(df)
end

@everywhere function aggregate_dt(dt)
    key_names = [Symbol("6"), Symbol("12")]
    gdt = groupby(fetch(dt), key_names)
    gkeys = sort!(collect(keys(gdt)))
    key_pairs = key_names .=> invert(gkeys)
    value_names = [[Symbol("RAND")]; Symbol.("NEW", 1:10)]
    sums = fetch(reduce(+, gdt; cols = value_names))
    sorted = sortperm(invert(sums[key_names]))
    value_pairs = map(value_names) do value
        value => sums[Symbol(:result_, value)][sorted]
    end
    return DCTable(DataFrame(key_pairs..., value_pairs...))
end

@everywhere invert(x) = [[x[j][i] for j = 1:length(x)] for i = 1:length(x[1])]

@everywhere function Base.reduce(f, df::DataFrames.AbstractDataFrame; cols)
    NamedTuple(col => reduce(f, df[!, col]) for col in cols)
end

@everywhere function Base.reduce(f, gdt::DataFrames.GroupedDataFrame; cols)
    gkeys = keys(gdt)
    dims = keys(gkeys[1])
    merge(
        NamedTuple(dim => getproperty.(gkeys, dim) for dim in dims),
        NamedTuple(
            Symbol(:result_, col) => [reduce(f, gdt[k]; cols = [col])[col] for k in gkeys]
            for col in cols
        ),
    )
end
```

Results: For each of the following I started Julia 1.9.4 with `julia --project -p 4 -t 1 --heap-size-hint=3G`.

`dtables.jl`:

julia> include("dtables.jl"); @time for i = 1:50
           memusage = map(procs()) do id
               remotecall_fetch(id) do
                   parse(Int, split(read(`ps -p $(getpid()) -o rss`, String), "\n")[end-1]) * 1000
               end
           end
           totalmemusage = Base.format_bytes(sum(memusage))
           worker_memusage = Base.format_bytes(memusage[2])
           @info "$i" worker_memusage totalmemusage
           main()
       end
⋮
┌ Info: 50
│   worker_memusage = "24.741 GiB"
└   totalmemusage = "28.720 GiB"
129.672974 seconds (9.29 M allocations: 590.653 MiB, 0.12% gc time, 2.99% compilation time: 22% of which was recompilation)

`custom.jl`:

julia> include("custom.jl"); @time for i = 1:50
           memusage = map(procs()) do id
               remotecall_fetch(id) do
                   parse(Int, split(read(`ps -p $(getpid()) -o rss`, String), "\n")[end-1]) * 1000
               end
           end
           totalmemusage = Base.format_bytes(sum(memusage))
           worker_memusage = Base.format_bytes(memusage[2])
           @info "$i" worker_memusage totalmemusage
           main()
       end
⋮
┌ Info: 50
│   worker_memusage = "9.517 GiB"
└   totalmemusage = "11.166 GiB"
 19.154547 seconds (882.60 k allocations: 61.428 MiB, 2.22% compilation time)

`dtables.jl` with disk caching:

julia> USE_DISK_CACHING = true; include("dtables.jl"); @time for i = 1:50
           memusage = map(procs()) do id
               remotecall_fetch(id) do
                   parse(Int, split(read(`ps -p $(getpid()) -o rss`, String), "\n")[end-1]) * 1000
               end
           end
           totalmemusage = Base.format_bytes(sum(memusage))
           worker_memusage = Base.format_bytes(memusage[2])
           device_size = remotecall_fetch(2) do
               DTables.Dagger.MemPool.GLOBAL_DEVICE[].device_size[]
           end |> Base.format_bytes
           @info "$i" device_size worker_memusage totalmemusage
           main()
       end
⋮
┌ Info: 50
│   device_size = "10.812 GiB"
│   worker_memusage = "8.309 GiB"
└   totalmemusage = "12.879 GiB"
130.644326 seconds (9.75 M allocations: 621.888 MiB, 0.16% gc time, 3.08% compilation time: 21% of which was recompilation)

`custom.jl` with disk caching:

julia> USE_DISK_CACHING = true; include("custom.jl"); @time for i = 1:50
           memusage = map(procs()) do id
               remotecall_fetch(id) do
                   parse(Int, split(read(`ps -p $(getpid()) -o rss`, String), "\n")[end-1]) * 1000
               end
           end
           totalmemusage = Base.format_bytes(sum(memusage))
           worker_memusage = Base.format_bytes(memusage[2])
           device_size = remotecall_fetch(2) do
               MemPool.GLOBAL_DEVICE[].device_size[]
           end |> Base.format_bytes
           @info "$i" device_size worker_memusage totalmemusage
           main()
       end
⋮
┌ Info: 50
│   device_size = "8.436 GiB"
│   worker_memusage = "6.284 GiB"
└   totalmemusage = "7.925 GiB"
 39.369563 seconds (921.19 k allocations: 63.999 MiB, 0.03% gc time, 1.27% compilation time)
Project

```
[336ed68f] CSV v0.10.12
[20c56dc6] DTables v0.4.3
[a93c6f00] DataFrames v1.6.1
[f9f48841] MemPool v0.4.6
[8ba89e20] Distributed
```

jpsamaroo commented 7 months ago

I'd like to try to reproduce this locally so I can figure out where Dagger is adding overwhelming overhead (which, in general, it should not when working with sufficiently large files). Can you post your `file.csv` somewhere I can download it, or provide a script that generates a compatible equivalent? The one I have locally is apparently not in the correct format.

StevenWhitaker commented 7 months ago

Here's a script that generates a .csv file that can reproduce the above behavior:

using CSV, DataFrames, InlineStrings, Random

const NROWS = 233930
const NCOLS = 102

const ELTYPE = [Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, InlineStrings.String1, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, InlineStrings.String15, Int64, Int64, Int64, Int64, InlineStrings.String1, Int64, Int64, Int64, Int64, Int64]

const NUM_UNIQUE = [12, 1, 1, 1, 1, 3, 12, 2, 4, 1, 12, 9, 13, 32, 1292, 13, 32, 493, 2, 3, 3, 3, 3, 2, 3, 2, 367, 462, 8, 369, 192, 28, 28, 193, 43, 243, 243, 48871, 4, 8, 10, 2, 3, 3, 5, 3, 3, 5]

function generate()

    Random.seed!(0)

    input = map(1:NCOLS) do i
        name = string(i)
        if i <= length(ELTYPE)
            if ELTYPE[i] isa AbstractString
                # Oops, this branch is never taken, but the resulting file still reproduces the issue.
                col = string.(rand(1:NUM_UNIQUE[i]-1, NROWS))
                col[1:12] .= " "
                return name => ELTYPE[i].(col)
            else
                col = rand(1:NUM_UNIQUE[i], NROWS)
                return name => col
            end
        else
            col = rand(NROWS)
            return name => col
        end
    end
    df = DataFrame(input)
    sort!(@view(df[13:end, :]), 1:length(ELTYPE))
    CSV.write("file.csv", df)

    return df

end

generate()