JuliaFEM / JuliaFEM.jl

The JuliaFEM software library is a framework that allows for the distributed processing of large Finite Element Models across clusters of computers using simple programming models. It is designed to scale up from single servers to thousands of machines, each offering local computation and storage.
http://juliafem.github.io/JuliaFEM.jl/latest/
MIT License

Threading of FE assemble operation #103

Open ahojukka5 opened 7 years ago

ahojukka5 commented 7 years ago

We need to get threading working for the FE assembler. Some preliminary results follow.

Big model: 4189147 nodes, 2475306 elements. With 3 dofs per node that is about 12.5 million dofs, so we need to be able to assemble maybe 5 million elements in a reasonable time. A simple performance test: create a model with 10000 elements and assemble it:

using JuliaFEM
using JuliaFEM.Preprocess
using JuliaFEM.Testing
using Base.Threads

info("number of threads: ", nthreads())

""" Return test model with N elements. """
function get_model(N=10000)
    problem = Problem(Elasticity, "tet10", 3)
    for i=1:N
        # every element shares the same connectivity and coordinates; good enough for an assembly benchmark
        el = Element(Tet10, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        el["youngs modulus"] = 480.0
        el["poissons ratio"] = 1/3
        x1 = [2.0, 3.0, 4.0]
        x2 = [6.0, 3.0, 2.0]
        x3 = [2.0, 5.0, 1.0]
        x4 = [4.0, 3.0, 6.0]
        x5 = 0.5*(x1+x2)
        x6 = 0.5*(x2+x3)
        x7 = 0.5*(x3+x1)
        x8 = 0.5*(x1+x4)
        x9 = 0.5*(x2+x4)
        x10 = 0.5*(x3+x4)
        X = Dict{Int64, Vector{Float64}}(
            1 => x1, 2 => x2, 3 => x3, 4 => x4, 5 => x5,
            6 => x6, 7 => x7, 8 => x8, 9 => x9, 10 => x10)
        u = Dict{Int64, Vector{Float64}}()
        for j=1:10
            u[j] = [0.0, 0.0, 0.0]
        end
        update!(el, "geometry", X)
        update!(el, "displacement", u)
        push!(problem.elements, el)
    end
    return problem
end

function case1()
    problem = get_model(10000)
    N = length(problem.elements)

    t0 = time()
    assemble!(problem, 0.0)
    t1 = time()
    dt = t1-t0
    kps = N/dt
    info("Assembled $N stiffness matrices in $(round(dt, 3)) seconds, $(round(kps, 3)) stiffness matrices per second.")
end

function case2()
    problems = [get_model(10000) for i=1:nthreads()]
    N = sum([length(problem.elements) for problem in problems])
    t0 = time()
    @threads for i=1:length(problems)
        assemble!(problems[i], 0.0)
    end
    t1 = time()
    dt = t1-t0
    kps = N/dt
    info("Assembled $N stiffness matrices in $(round(dt, 3)) seconds, $(round(kps, 3)) stiffness matrices per second.")
end

case1()
case2()
27-Mar 07:46:56:INFO:root:Assembled 10000 stiffness matrices in 18.422 seconds, 542.827 stiffness matrices per second.
27-Mar 07:50:04:INFO:root:Assembled 160000 stiffness matrices in 183.772 seconds, 870.642 stiffness matrices per second.

We are not getting the expected scaling: the threaded run above (16 copies of the 10000-element model, one per thread) reaches only about 870 elements per second, roughly 1.6x the ~540 elements per second of the serial run. Here is a test code to study the problems that arise when using multithreading:

# This file is a part of JuliaFEM.
# License is MIT: see https://github.com/JuliaFEM/JuliaFEM.jl/blob/master/LICENSE.md

using JuliaFEM
using JuliaFEM.Preprocess
using JuliaFEM.Testing
using TimerOutputs
using Base.Threads

const to = TimerOutput() # for detailed timing with TimerOutputs (not used in the code below)

BLAS.set_num_threads(1) # keep BLAS single-threaded so it does not compete with Julia threads

""" Return test model with N elements. """
function get_model(N=10000)
    problem = Problem(Elasticity, "tet10", 3)
    for i=1:N
        # every element shares the same connectivity and coordinates; good enough for an assembly benchmark
        el = Element(Tet10, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        el["youngs modulus"] = 480.0
        el["poissons ratio"] = 1/3
        x1 = [2.0, 3.0, 4.0]
        x2 = [6.0, 3.0, 2.0]
        x3 = [2.0, 5.0, 1.0]
        x4 = [4.0, 3.0, 6.0]
        x5 = 0.5*(x1+x2)
        x6 = 0.5*(x2+x3)
        x7 = 0.5*(x3+x1)
        x8 = 0.5*(x1+x4)
        x9 = 0.5*(x2+x4)
        x10 = 0.5*(x3+x4)
        X = Dict{Int64, Vector{Float64}}(
            1 => x1, 2 => x2, 3 => x3, 4 => x4, 5 => x5,
            6 => x6, 7 => x7, 8 => x8, 9 => x9, 10 => x10)
        u = Dict{Int64, Vector{Float64}}()
        for j=1:10
            u[j] = [0.0, 0.0, 0.0]
        end
        update!(el, "geometry", X)
        update!(el, "displacement", u)
        push!(problem.elements, el)
    end
    return problem
end

# Recursive Fibonacci: a cheap, allocation-free way to generate a CPU-bound workload.
rfib(n) = n < 2 ? n : rfib(n-1) + rfib(n-2)

function operation(assembly, problem, element, time)
    #gdofs = get_gdofs(problem, element)
    v = rfib(22) # stand-in for a heavy element-level calculation
    #for j in gdofs
    #    for i in gdofs
    #        push!(assembly.K.I, i)
    #        push!(assembly.K.J, j)
    #        push!(assembly.K.V, v)
    #    end
    #end
    #K = zeros(3*length(element), 3*length(element))
    #add!(assembly.K, gdofs, gdofs, K)
    #rfib(22)
    return
end

function measure(N, M; use_threads=true)

    problems = [get_model(M) for i=1:N]
    n_elements = sum([length(problem.elements) for problem in problems])

    info("warmup")
    tic()
    info(operation(problems[1].assembly, problems[1], problems[1].elements[1], 0.0))
    toc()

    chunks(a, n) = [a[i:n:end] for i=1:n] # split a into n interleaved chunks, one per thread

    t0 = time()
    if use_threads
        for problem in problems
            n_chunks = nthreads()
            info("Threading ON, using $n_chunks threads.")
            sub_assemblies = [Assembly() for i=1:n_chunks]
            sub_elements = chunks(problem.elements, n_chunks)
            sub_problems = [Problem(Elasticity, "tet10", 3) for i=1:n_chunks]
            @threads for i=1:n_chunks
                for element in sub_elements[i]
                    operation(sub_assemblies[i], sub_problems[i], element, 0.0)
                end
            end
            for sub_assembly in sub_assemblies
                append!(problem.assembly, sub_assembly)
            end
        end
    else
        for problem in problems
            for element in problem.elements
                operation(problem.assembly, problem, element, 0.0)
            end
        end
    end
    t1 = time()
    dt = t1-t0
    kps = n_elements/dt
    if use_threads
        info("Using $(nthreads()) threads and $(nworkers()) workers: $N problems, $M elements/problem -> $n_elements operations in $(round(dt, 3)) seconds, $(round(kps, 3)) operations per second.")
    else
        info("$N problems, $M elements/problem -> $n_elements operations in $(round(dt, 3)) seconds, $(round(kps, 3)) operations per second.")
    end
    return dt
end

N = 1
M = 200000
dt1 = measure(N, M; use_threads=false)
dt2 = measure(N, M; use_threads=true)
info("Speedup: $(dt1/dt2) x")
27-Mar 07:58:52:INFO:root:warmup
27-Mar 07:58:52:INFO:root:nothing
elapsed time: 0.003918508 seconds
27-Mar 07:59:16:INFO:root:1 problems, 200000 elements/problem -> 200000 operations in 23.66 seconds, 8453.206 operations per second.
27-Mar 07:59:19:INFO:root:warmup
27-Mar 07:59:19:INFO:root:nothing
elapsed time: 0.000167442 seconds
27-Mar 07:59:19:INFO:root:Threading ON, using 2 threads.
27-Mar 07:59:31:INFO:root:Using 2 threads and 1 workers: 1 problems, 200000 elements/problem -> 200000 operations in 12.176 seconds, 16425.863 operations per second.
27-Mar 07:59:31:INFO:root:Speedup: 1.9431518923913702 x
27-Mar 08:59:57:INFO:root:warmup
27-Mar 08:59:57:INFO:root:nothing
elapsed time: 0.003972694 seconds
27-Mar 09:00:23:INFO:root:1 problems, 200000 elements/problem -> 200000 operations in 25.895 seconds, 7723.477 operations per second.
27-Mar 09:00:24:INFO:root:warmup
27-Mar 09:00:24:INFO:root:nothing
elapsed time: 0.000201575 seconds
27-Mar 09:00:24:INFO:root:Threading ON, using 12 threads.
27-Mar 09:00:27:INFO:root:Using 12 threads and 1 workers: 1 problems, 200000 elements/problem -> 200000 operations in 2.501 seconds, 79981.511 operations per second.
27-Mar 09:00:27:INFO:root:Speedup: 10.355635255826938 x
27-Mar 09:01:07:INFO:root:warmup
27-Mar 09:01:07:INFO:root:nothing
elapsed time: 0.003751095 seconds
27-Mar 09:01:30:INFO:root:1 problems, 200000 elements/problem -> 200000 operations in 23.397 seconds, 8548.165 operations per second.
27-Mar 09:01:31:INFO:root:warmup
27-Mar 09:01:31:INFO:root:nothing
elapsed time: 0.000184833 seconds
27-Mar 09:01:31:INFO:root:Threading ON, using 16 threads.
27-Mar 09:01:33:INFO:root:Using 16 threads and 1 workers: 1 problems, 200000 elements/problem -> 200000 operations in 1.904 seconds, 105041.633 operations per second.
27-Mar 09:01:33:INFO:root:Speedup: 12.288207689872657 x

Lessons learned so far:

So basically we need to check the scalability of every elementary operation the assembler performs; a minimal pattern for such a check is sketched below.
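
For example, a micro-benchmark of a single operation could look roughly like the sketch below (written in the same 0.6-era style as the snippets above; check_scalability and some_operation are just placeholder names, not JuliaFEM API):

using Base.Threads

""" Compare serial vs. threaded throughput of one elementary operation. """
function check_scalability(some_operation, n=200000)
    # serial baseline
    t0 = time()
    for i=1:n
        some_operation()
    end
    dt_serial = time() - t0
    # threaded version doing the same total amount of work
    t0 = time()
    @threads for i=1:n
        some_operation()
    end
    dt_threads = time() - t0
    info("serial: $(round(dt_serial, 3)) s, threaded: $(round(dt_threads, 3)) s, ",
         "speedup: $(round(dt_serial/dt_threads, 2)) x on $(nthreads()) threads")
end

check_scalability(() -> rfib(22))

An operation that allocates or locks will already show its poor scaling here, before it is buried inside the full assembly loop.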

ahojukka5 commented 7 years ago

Interesting findings: how the integration points are returned matters for the threaded scaling. Compare a tuple-based version with a Vector{Float64}-based one:

function get_integration_points_test_1(element::Element{Tet10}, ::Type{Val{3}})
    weights = (-2.0/15.0, 3.0/40.0, 3.0/40.0, 3.0/40.0, 3.0/40.0)
    coords = (
              (1.0/4.0, 1.0/4.0, 1.0/4.0),
              (1.0/6.0, 1.0/6.0, 1.0/6.0),
              (1.0/6.0, 1.0/6.0, 1.0/2.0),
              (1.0/6.0, 1.0/2.0, 1.0/6.0),
              (1.0/2.0, 1.0/6.0, 1.0/6.0)
             )
    return zip(weights, coords)
end
27-Mar 10:36:08:INFO:root:1 problems, 200000 elements/problem -> 200000 operations in 27.406 seconds, 7297.772 operations per second.
27-Mar 10:36:10:INFO:root:Threading ON, using 16 threads.
27-Mar 10:36:12:INFO:root:Using 16 threads and 1 workers: 1 problems, 200000 elements/problem -> 200000 operations in 2.104 seconds, 95071.725 operations per second.
27-Mar 10:36:12:INFO:root:Speedup: 13.02749921459311 x
function get_integration_points_test_2(element::Element{Tet10}, ::Type{Val{3}})
    weights = [-2.0/15.0, 3.0/40.0, 3.0/40.0, 3.0/40.0, 3.0/40.0]
    coords = Vector{Float64}[
                             [1.0/4.0, 1.0/4.0, 1.0/4.0],
                             [1.0/6.0, 1.0/6.0, 1.0/6.0],
                             [1.0/6.0, 1.0/6.0, 1.0/2.0],
                             [1.0/6.0, 1.0/2.0, 1.0/6.0],
                             [1.0/2.0, 1.0/6.0, 1.0/6.0]]
    return zip(weights, coords)
end
27-Mar 10:39:07:INFO:root:1 problems, 200000 elements/problem -> 200000 operations in 27.088 seconds, 7383.312 operations per second.
27-Mar 10:39:08:INFO:root:Threading ON, using 16 threads.
27-Mar 10:39:12:INFO:root:Using 16 threads and 1 workers: 1 problems, 200000 elements/problem -> 200000 operations in 3.344 seconds, 59813.263 operations per second.
27-Mar 10:39:12:INFO:root:Speedup: 8.101142401863232 x
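
My reading of the numbers above (an interpretation, not something profiled inside JuliaFEM): the tuple version is isbits and allocates nothing per call, while the Vector version heap-allocates several small arrays on every call, and under @threads that allocation and the resulting GC pressure is shared by all threads. A standalone sketch to confirm the per-call difference, using plain Julia values instead of the real Element{Tet10} argument:

ips_tuple() = zip((-2.0/15.0, 3.0/40.0),
                  ((1.0/4.0, 1.0/4.0, 1.0/4.0), (1.0/6.0, 1.0/6.0, 1.0/6.0)))

ips_vector() = zip([-2.0/15.0, 3.0/40.0],
                   Vector{Float64}[[1.0/4.0, 1.0/4.0, 1.0/4.0], [1.0/6.0, 1.0/6.0, 1.0/6.0]])

ips_tuple(); ips_vector()  # warm up so @allocated measures only the calls themselves
info("tuple version:  ", @allocated ips_tuple())   # expected: (close to) 0 bytes
info("vector version: ", @allocated ips_vector())  # expected: a few hundred bytes per call

If that is indeed the difference, keeping integration points and other small per-element constants in non-allocating form should pay off in every threaded hot loop, not just this one.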

ahojukka5 commented 7 years ago

In the beginning, 1-core assembly takes

 ──────────────────────────────────────────────────────────────────────────────
                                       Time                   Allocations
                               ──────────────────────   ───────────────────────
       Tot / % measured:           12555s / 99.9%           1664GiB / 100%

 Section               ncalls     time   %tot     avg     alloc   %tot      avg
 ──────────────────────────────────────────────────────────────────────────────
 assemble problem           1   12348s  98.4%  12348s   1633GiB  98.2%  1633GiB
 read mesh from disk        1     119s  0.95%    119s   14.5GiB  0.87%  14.5GiB
 create problem             1    79.0s  0.63%   79.0s   16.2GiB  0.97%  16.2GiB
 ──────────────────────────────────────────────────────────────────────────────

Now,

 ──────────────────────────────────────────────────────────────────────────────
                                       Time                   Allocations
                               ──────────────────────   ───────────────────────
       Tot / % measured:             779s / 99.9%            101GiB / 100%

 Section               ncalls     time   %tot     avg     alloc   %tot      avg
 ──────────────────────────────────────────────────────────────────────────────
 assemble problem           1     607s  78.0%    607s   78.0GiB  77.4%  78.0GiB
 read mesh from disk        1     101s  13.0%    101s   13.5GiB  13.4%  13.5GiB
 create problem             1    69.8s  8.98%   69.8s   9.30GiB  9.22%  9.30GiB
 ──────────────────────────────────────────────────────────────────────────────

It is now 4077 elements / second on 1 core, a 20x speedup. Also take a look at the memory allocations: allocation during assembly dropped by a factor of 20.93, while the speedup is 20.34x. Coincidence? SparseMatrixCOO stores three vectors, and for 4950612 elements the needed space is 66.4 GB without taking symmetry into account. So by exploiting symmetry we might gain another 2x in assembly time, i.e. roughly -30 GB and -5 minutes for the 12.5 million dof model.
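
A minimal sketch of what exploiting symmetry could look like at the COO level (assuming a symmetric element matrix Ke and the K.I/K.J/K.V fields used earlier in this thread; this is not the current JuliaFEM implementation):

""" Add only the upper triangle of a symmetric element matrix to COO storage. """
function add_symmetric!(K, gdofs, Ke)
    for (j, gj) in enumerate(gdofs)
        for (i, gi) in enumerate(gdofs)
            i > j && continue          # skip the lower triangle, Ke[i,j] == Ke[j,i]
            push!(K.I, gi)
            push!(K.J, gj)
            push!(K.V, Ke[i,j])
        end
    end
end

# When converting to CSC, mirror the stored triangle (0.6-era syntax):
# A = sparse(K.I, K.J, K.V, ndofs, ndofs)
# K_full = A + A' - spdiagm(diag(A))

This halves the length of the I/J/V vectors at the cost of one extra transpose-and-add when the final matrix is built.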

ahojukka5 commented 7 years ago

Pre-allocating the whole sparse matrix at the beginning and making the assembly more type stable (checked with @code_warntype) gives:

 ──────────────────────────────────────────────────────────────────────────────
                                       Time                   Allocations
                               ──────────────────────   ───────────────────────
       Tot / % measured:             492s / 99.9%            101GiB / 100%

 Section               ncalls     time   %tot     avg     alloc   %tot      avg
 ──────────────────────────────────────────────────────────────────────────────
 assemble problem           1     318s  64.6%    318s   77.9GiB  77.3%  77.9GiB
 read mesh from disk        1     102s  20.7%    102s   13.5GiB  13.4%  13.5GiB
 create problem             1    72.1s  14.7%   72.1s   9.30GiB  9.23%  9.30GiB
 ──────────────────────────────────────────────────────────────────────────────
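
For reference, the kind of change meant here, as a sketch rather than the actual commit: reserve space in the COO vectors up front and run @code_warntype over the assembly call so no abstractly-typed variables remain in the hot loop (the 900-entries-per-element estimate assumes Tet10 elasticity with 30 dofs per element):

problem = get_model(10000)

# Each Tet10 elasticity element contributes 30x30 = 900 stiffness entries,
# so reserve that much space in the COO vectors once, before assembly.
n_entries = 900 * length(problem.elements)
sizehint!(problem.assembly.K.I, n_entries)
sizehint!(problem.assembly.K.J, n_entries)
sizehint!(problem.assembly.K.V, n_entries)

# Type-stability check; in practice one would point this at the innermost
# element assembly routine rather than the top-level call.
@code_warntype assemble!(problem, 0.0)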

Test assembly of 100000 elements:

3.755976 seconds (900.21 k allocations: 83.201 MB, 2.04% gc time)

Still a lot of allocations.
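
That is about 9 allocations per assembled element (900210 / 100000), much better than before but still not zero. Two generic ways to hunt down what is left (plain Julia tooling, nothing JuliaFEM-specific; the script name below is a placeholder):

# Time a second, already-compiled run so the numbers are not polluted by compilation:
problem = get_model(100000)
@time assemble!(problem, 0.0)            # first call includes JIT compilation
problem = get_model(100000)
@time assemble!(problem, 0.0)            # steady-state time and allocation count

# Or run with allocation tracking and inspect the generated *.mem files:
#   julia --track-allocation=user assembly_test.jl
# Any line that still allocates inside the element loop is the next target;
# temporary arrays in shape function / Jacobian evaluation are typical suspects.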