m3g / CellListMap.jl

Flexible implementation of cell lists to map the calculations of particle-pair dependent functions, such as forces, energies, neighbor lists, etc.
https://m3g.github.io/CellListMap.jl/
MIT License
87 stars 4 forks source link

CellListMap allocating too much memory #94

Closed dforero0896 closed 10 months ago

dforero0896 commented 10 months ago

Hi, I am using CellListMap to apply a couple of functions to a catalog of ~16M objects. The general idea is as follows:

using Random
using CellListMap
mutable struct Catalog{T}
    pos::AbstractMatrix{T}
    vel::AbstractMatrix{T}
    r_min::AbstractVector{T}
    param_1
    param_2
end #struct

function update!(sat_pos, cen_pos, sat_id, cen_id, d2, catalog::Catalog{T}) where T
    param_1 = catalog.param_1
    param_2 = catalog.param_2

    catalog.pos[1, sat_id] = cen_pos[1] + randn(param_1 * param_2)
    catalog.pos[2, sat_id] = cen_pos[2] + randn(param_1 * param_2)
    catalog.pos[3, sat_id] = cen_pos[3] + randn(param_1 * param_2)

    for ax = 1:3
        catalog.vel[ax, sat_id] = catalog.vel[ax, sat_id] + param_2
    end #for
    catalog
end #func

function inner_1!(cen_pos, sat_pos, cen_id, sat_id, d2, catalog::Catalog{T}) where T<:Real
            catalog = update!(sat_pos, cen_pos, sat_id, cen_id, d2, catalog::Catalog{T})
            catalog.r_min[sat_id] = d2
    catalog
end #func

function inner_2!(cen_pos, sat_pos, cen_id, sat_id, d2, catalog::Catalog{T}) where T<:Real
            catalog = update!(sat_pos, cen_pos, sat_id, cen_id, d2, catalog)
            catalog.r_min[sat_id] = d2
    catalog
end #func

function apply(catalog::Catalog{T},
        params::AbstractVector,
        box_size::AbstractVector;
        show_progress = true) where T
    #catalog = copy_output(catalog)
    box = Box(box_size, params[1])
    cl = CellList(catalog.pos, catalog.pos, box)
    catalog.param_1 = params[3]
    catalog.param_2 = params[end]
    CellListMap.map_pairwise!((x, y, i, j, d2, catalog) -> inner_1!(x, y, i, j, d2, catalog),
                              catalog, box, cl, parallel=false, show_progress=show_progress)
   # UP TO THIS POINT THE CODE RUNS EXTREME ALLOCATION SEEMS TO BE FROM HERE ON
    box = Box(box_size, params[2])
    cl = CellList(catalog.pos, catalog.pos, box)
    catalog.param_1 = params[4]
    CellListMap.map_pairwise!((x, y, i, j, d2, catalog) -> inner_2!(x, y, i, j, d2, catalog),
                               catalog, box, cl, parallel=false, show_progress=show_progress)

    catalog

end #func

function main()
    Nobjects = 16000000
    cat = Catalog(2000f0 * rand(Float32, (3, Nobjects)), 2000f0 * randn(Float32, (3, Nobjects)), 2000f0 * rand(Float32, Nobjects), 0., 0.)
    box_size = [2000. for _ = 1:3]
    params = [1., 1., 1., 1., 1.]
    apply(cat::Catalog{Float32},
        params::AbstractVector,
        box_size::AbstractVector;
        show_progress = true)

end #func

main()

The catalog on disk occupies 1.4 Gb and while I use other large arrays, I wouldn't expect the code to need >300Gb. And it seems that it fails when updating the cell list, I just get Killed most of the time

lmiq commented 10 months ago

What immediately pops out is that this structure has abstract fields:

mutable struct Catalog{T}
    pos::AbstractMatrix{T}
    vel::AbstractMatrix{T}
    r_min::AbstractVector{T}
    param_1
    param_2
end #struct

You probably should use something like:

mutable struct Catalog{T}
    pos::Matrix{T}
    vel::Matrix{T}
    r_min::Vector{T}
    param_1::T
    param_2::T
end #struct

(or the appropriate types for the param values).

lmiq commented 10 months ago

But the actual issue is here:

catalog.pos[1, sat_id] = cen_pos[1] + randn(param_1 * param_2)

randn(10), for example, creates an array of 10 random numbers. You probably want randn(T) there, to create one random number. Thus with that randn(param_1 * param_2) you are creating an array of random numbers on each call of that function, inside your update function.

lmiq commented 10 months ago

Ah, there is another problem, that more related to CellListMap:

If you have such a large box, with side 2000, a cutoff of 1.0 implies that you have 2000^3 cells. That can explode the memory. For using cell lists for this, you probably will have to choose a larger cutoff, even if that implies some cost.

In my machine, if you put a cutoff of 10.0 instead of 1.0 the memory is fine.

lmiq commented 10 months ago

A good rule of thumb is that it does not make sense to have more cells than particles (this is strictly correct for homogeneous systems, for inhomogeneous systems things get more complicated). Then, you could choose your cutoff with:

julia> cutoff = ceil(2000 / 16000000^(1/3))
8.0

with which I can run your example fine here:

julia> box = Box(box_size, 8.0)
Box{OrthorhombicCell, 3}
  unit cell matrix = [ 2000.0 0.0 0.0; 0.0 2000.0 0.0; 0.0 0.0 2000.0 ]
  cutoff = 8.0
  number of computing cells on each dimension = [253, 253, 253]
  computing cell sizes = [8.0, 8.0, 8.0] (lcell: 1)
  Total number of cells = 16194277

julia> cl = CellList(catalog.pos, catalog.pos, box)
CellListMap.CellListPair{Vector{StaticArraysCore.SVector{3, Float64}}, 3, Float64, CellListMap.NotSwapped}
   1600 particles in the reference vector.
   1598 cells with real particles of target vector.
lmiq commented 10 months ago

At the end, this is the code I can run here. Concerning the content, I'll just warn you by the fact that I have changed the randn call inside the update function, and you should check if that is what you actually want.

using Random
using CellListMap
mutable struct Catalog{T}
    pos::Matrix{T}
    vel::Matrix{T}
    r_min::Vector{T}
    param_1::T
    param_2::T
end #struct

function update!(sat_pos, cen_pos, sat_id, cen_id, d2, catalog::Catalog{T}) where T
    param_1 = catalog.param_1
    param_2 = catalog.param_2

    catalog.pos[1, sat_id] = cen_pos[1] + randn(T)
    catalog.pos[2, sat_id] = cen_pos[2] + randn(T)
    catalog.pos[3, sat_id] = cen_pos[3] + randn(T)

    for ax = 1:3
        catalog.vel[ax, sat_id] = catalog.vel[ax, sat_id] + param_2
    end #for
    catalog
end #func

function inner_1!(cen_pos, sat_pos, cen_id, sat_id, d2, catalog::Catalog{T}) where T<:Real
            catalog = update!(sat_pos, cen_pos, sat_id, cen_id, d2, catalog::Catalog{T})
            catalog.r_min[sat_id] = d2
    catalog
end #func

function inner_2!(cen_pos, sat_pos, cen_id, sat_id, d2, catalog::Catalog{T}) where T<:Real
            catalog = update!(sat_pos, cen_pos, sat_id, cen_id, d2, catalog)
            catalog.r_min[sat_id] = d2
    catalog
end #func

function apply(catalog::Catalog{T},
        params::AbstractVector,
        box_size::AbstractVector;
        show_progress = true) where T
    #catalog = copy_output(catalog)
    box = Box(box_size, 8.0)
    cl = CellList(catalog.pos, catalog.pos, box)
    catalog.param_1 = params[3]
    catalog.param_2 = params[end]
    CellListMap.map_pairwise!((x, y, i, j, d2, catalog) -> inner_1!(x, y, i, j, d2, catalog),
                              catalog, box, cl, parallel=false, show_progress=show_progress)
   # UP TO THIS POINT THE CODE RUNS EXTREME ALLOCATION SEEMS TO BE FROM HERE ON
    box = Box(box_size, 8.0)
    cl = CellList(catalog.pos, catalog.pos, box)
    catalog.param_1 = params[4]
    CellListMap.map_pairwise!((x, y, i, j, d2, catalog) -> inner_2!(x, y, i, j, d2, catalog),
                               catalog, box, cl, parallel=false, show_progress=show_progress)

    catalog

end #func

function main()
    Nobjects = 16000000
    cat = Catalog(2000f0 * rand(Float32, (3, Nobjects)), 2000f0 * randn(Float32, (3, Nobjects)), 2000f0 * rand(Float32, Nobjects), 0.f0, 0.f0)
    box_size = [2000. for _ = 1:3]
    params = [1., 1., 1., 1., 1.]
    apply(cat::Catalog{Float32},
        params::AbstractVector,
        box_size::AbstractVector;
        show_progress = true)

end #func

@time main()
dforero0896 commented 10 months ago

Hi, thanks for pointing these out! Actually why do the abstract types pose a problem for the structure? Also, in the random part I agree. This was an attempt at a MWE so I actually meant param_1 * param_2 * rand(). In my actual code I should not be allocating that many arrays and I only add a single number., so I don;t think that is the actual issue. The actual issue seems to be the number of cells. I agree the cutoff is small and it is actually a parameter that can change on each run of apply! so I can;t always choose it. I would expect that if the cutoff is too small, CellListMap would choose a "sensible" cell size and do a brute force search within since no other cells would need to be explored (something similar to the leaf size in a KDTree), is it possible to have this? Or would I be able to choose another cell size (I think I read once this was possible in the docs)?

lmiq commented 10 months ago

Well, the problem is the that the systems can be inhomogenous. Thus, if we do not think about memory problems, the smaller the cell size the better, if we increase the cutoff and the system is inhomogeous that will just compute too many interactions. Thus, I cannot make that decision to the user, unless I kept track of the free memory and opted in for an alternative construction scheme if noticing that the memory will explode. But that, I think, is outside the scope of what this package should provide.

Cell lists are good for rather homogeneous systems. For highly heterogeneous ones KDTrees are better. For homogeneous systems, if one has so many particles that the arrays that we have to store are close to the limit of the memory of the computer, one will have problems. CellListMap requires a memory which is only a few times the memory required for the array of coordinates (given a number of cells not larger than the number of particles), so if the computer can store with ease the coordinates, it should be able to run celllistmap.

The problem with the abstract structure is more a general Julia issue: you will have a struct with abstract types, which will cause type instabilities that an make everything very slow. Thus, you want to have concrete types. I think this was not clearly a problem here because you where passing the values of the fields of the struct (which are concrete) to the inner functions, creating function barriers. But anyway you should avoid that pattern. If you were passing the complete struct to the functions, you would have problems.

lmiq commented 10 months ago

Continuing (I'm running from one place to the other, so the multiple messages):

You have in the manual the possibility of using smaller cell sizes, by increasing the lcell parameter upon box construction. But this is not what you want here. That is useful for very dense systems, to avoid even more spurious (farther than the cutoff) computations. (this would be the corresponding manual section).

Another non-related comment: while CellListMap does not have a safeguard for dealing with possible memory issues, it does use much less memory than a KDTree algorithm. The cutoff you can define just before the Box construction, given the problem data, and that may be the best solution here. Whenever you use a cutoff greater than the actual one desired, you should not have correctness problems.

dforero0896 commented 10 months ago

Thanks a lot! That's very informative. I don;t expect my data to be very heterogeneous, at least for my current tests (but may be in the future). I may also try the NearestNeighbors tree in that case, which I thought used very little memory if the data wasn't reordered, I haven't tested it myself but why is it that you find it uses much more memory? For now I think I will have to limit the minimum cell size for this problem.

lmiq commented 10 months ago

That of course will depend on the implementation of the tree. However, as far as I know, one big problem is that you will have to first build the list and then compute the property, while in CellListMap you are not saving explicitly the list. For a large problem saving the list of pairs is prohibitive.

One example:

julia> using CellListMap, NearestNeighbors

julia> function cl(x, y, cutoff)
           box = Box(limits(x,y), cutoff)
           cl = CellList(x, y, box)
           map_pairwise((x,y,i,j,d2,out) -> out += d2, 0.0, box, cl)
       end
cl (generic function with 2 methods)

julia> function nb(x, y, cutoff)
           tree = balltree = BallTree(y)
           list = NearestNeighbors.inrange(balltree, x, cutoff, true)
           out = 0.0
           for i in eachindex(list)
               for j in eachindex(list[i])
                   out += sum(abs2, x[i] - y[list[i][j]])
               end
           end
           return out
       end
nb (generic function with 1 method)

julia> x, box = CellListMap.xgalactic(10^6);

julia> y = [ box.computing_box[1] .+ rand.() .* box.computing_box[2] for _ in 1:10^3 ];

julia> cl(x, y, box.cutoff) ≈ nb(x, y, box.cutoff)
true

julia> @time cl(x, y, box.cutoff)
  0.419813 seconds (2.13 k allocations: 23.523 MiB)
6.122889375594708e7

julia> @time nb(x, y, box.cutoff)
  1.125405 seconds (2.01 M allocations: 152.340 MiB)
6.1228893755947076e7

The function xgalatic resturns a set of coordinates and a box with density typical of galaxies (I use that for testing, there is a xatomic function as well). Note that, used naively (I don't know an alternative, actually), the NearestNeighbors method uses about 7-8 times more memory.

Also I don't know how to deal with periodic boundary conditions in that case (I think in the case of orthorhombic you can use some distance from Distances.jl).

lmiq commented 10 months ago

ps: That was not running in parallel, in parallel we get:

julia> @time cl(x, y, box.cutoff)
  0.140854 seconds (5.03 k allocations: 24.739 MiB, 14.46% gc time)
6.089914259555441e7

julia> @time nb(x, y, box.cutoff)
  1.141484 seconds (2.01 M allocations: 152.591 MiB, 2.52% gc time)
6.089914259555665e7

(nb is not parallel).

But running CellListMap in parallel requires proportionally more memory, and that can get limiting for you.

dforero0896 commented 10 months ago

Thanks for the tests! Indeed having to build the list of neighbors seems a bit wasteful compared to being able to just apply a function to each pair. I think the issue is solved. Thanks again for your help!

lmiq commented 10 months ago

I've added a note here, for the future: https://m3g.github.io/CellListMap.jl/stable/LowLevel/#Optimizing-the-cell-grid