Lattice-Boltzmann, CUDA and GPU

LaurieGustavo commented 5 years ago

Hello,

I'm implementing the Lattice-Boltzmann method (for fluid flows) in Futhark. My goal is to execute it on GPU using CUDA. It works but it's very slow, many times slower than the translated sequential C code.

So i'd like to ask if there is any obvious mistake or misuse of Futhark in my code that would lead to inefficient parallel code. Here's a few things i'm not sure of the impact :

the in-place updates loop in the "propagation" function
the fact that at every iteration an entire matrix of tuples is calculated

I'm also looking with my university if i'm not simply misusing the gpu environment they gave me

-- Program for Lattice-Boltzmann in Futhark

import "vector"

------------------------------------
----- INITIALIZATION FUNCTIONS -----
------------------------------------

-- Initilization of an array of length nx with value x
-- parametric polymorphism
let init_array 't (nx: i32)(x: t): [nx]t =
    map( \_ -> x) (0..<nx)

-- Initialization of a matrix nx * ny with value x
-- parametric polymorphism
let init_matrix 't (nx: i32)(ny: i32)(x: t): [nx][ny]t =
    map( \_ -> init_array(ny)(x)) (0..<nx)

-- Initialization of mask
entry init_mask(nx: i32)(ny: i32): [nx][ny]u32 =
    map( \(x) -> 
        map( \(y) ->
            if y > x && x > (ny-y) && y < (3*ny)/4 then
                1u32
            else
                0u32
        ) (0..<ny)
    ) (0..<nx)

entry init_rho(nx: i32)(ny: i32): [nx][ny]f64 =
    init_matrix(nx)(ny)(1f64)

entry init_u(nx: i32)(ny: i32): [nx][ny]vector =
    init_matrix(nx)(ny)((0f64, 0f64))

------------------------------------
------------ CONSTANTS -------------
------------------------------------

let c: [9]vector        =[  ( 0,  0),
                            (-1,  1),
                            (-1,  0),
                            (-1, -1),
                            ( 0, -1),
                            ( 1, -1),
                            ( 1,  0),
                            ( 1,  1),
                            ( 0,  1) ]

let opposite: [9]i32    = [0, 5, 6, 7, 8, 1, 2, 3, 4]

let w: [9]f64           = [4/9, 1/36, 1/9, 1/36, 1/9, 1/36, 1/9, 1/36, 1/9]

------------------------------------
------ CALCULATION FUNCTIONS -------
------------------------------------

-- calculates rho at some point
let rho_calcul(f_in: []f64)(mask: u32): f64 =
    if mask > 0 then 0 else reduce (+) 0 f_in

-- updates rho at every point
let rho_update [nx][ny] (f_in: [nx][ny][9]f64)(mask: [nx][ny]u32): [nx][ny]f64 =
    map2(\fx mx ->
        map2(\fxy mxy ->
            rho_calcul(fxy)(mxy)
        ) fx mx
    ) f_in mask

-- calculates u at some point
let u_calcul(f_in: []f64)(rho: f64)(mask: u32): vector =
    if mask > 0 then
        (0,0)
    else
        let res = reduce add (0,0) (map2 mult f_in c)
        in div(rho)(res)

-- updates u at every point
let u_update [nx][ny](f_in: [nx][ny][9]f64)(rho: [nx][ny]f64)(mask: [nx][ny]u32): [nx][ny]vector =
    map3 (\fx rx mx ->
        map3 ( \fxy rxy mxy ->
            u_calcul(fxy)(rxy)(mxy)
        ) fx rx mx
    ) f_in rho mask

-- j(x,y) = u[x][y] + Tau*g
let j(u: vector)(tau: f64)(g: vector): vector = 
    let a = mult tau g
    in add a u

-- f_eq
let f_eq(i: i32)(rho: f64)(u: vector)(g: vector)(tau: f64): f64 =
    let j_val   = j(u)(tau)(g)
    let dot_val = dotprod(c[i])(j_val)
    in w[i]*rho*(1 + 3*dot_val + (9f64/2)*dot_val*dot_val - (3f64/2)*square(j_val))

-- Initialization of f_in
entry init_f_in (nx: i32)(ny: i32)(g_x: f64)(g_y: f64)(tau: f64): [nx][ny][9]f64 =
    let g: vector = (g_x, g_y)
    -- Initialize u
    let u           = init_u(nx)(ny)
    -- Initialize rho
    let rho         = init_rho(nx)(ny)

    in map2(\rx ux ->
        map2(\rxy uxy ->
            map(\(i) ->
                f_eq(i)(rxy)(uxy)(g)(tau)
            ) (0..<9)
        ) rx ux
    ) rho u

-- calculates f_out at point (x,y,i)
let f_out_calcul(f_in: []f64)(i: i32)(rho: f64)(u: vector)(g: vector)(tau: f64)(mask: u32): f64 =   

    if mask > 0 then
        f_in[opposite[i]]
    else
        f_in[i] - (f_in[i] - f_eq(i)(rho)(u)(g)(tau))/tau 

-- updates f_out at every point
let f_out_update [nx][ny] (f_in: [nx][ny][9]f64)(rho: [nx][ny]f64)(u: [nx][ny]vector)(tau: f64)(g: vector)(mask: [nx][ny]u32): [nx][ny][9]f64 =
    map4(\fx rx ux mx ->
        map4(\fxy rxy uxy mxy ->
            map( \(i) ->
                f_out_calcul(fxy)(i)(rxy)(uxy)(g)(tau)(mxy)
            ) (0..<9)
        ) fx rx ux mx
    ) f_in rho u mask

-- collision phase 
let collision [nx][ny] (f_in: [nx][ny][9]f64)(g: vector)(tau: f64)(mask: [nx][ny]u32): ([nx][ny][9]f64, [nx][ny]f64, [nx][ny]vector) =
    let rho     = rho_update(f_in)(mask)
    let u       = u_update(f_in)(rho)(mask)
    let f_out   = f_out_update(f_in)(rho)(u)(tau)(g)(mask)
    in (f_out, rho, u)

-- propagation phase
let propagation [nx][ny] (f_out: [nx][ny][9]f64): [nx][ny][9]f64 =
    let f_in = init_matrix(nx)(ny)(init_array(9)(0f64))
    in loop (f_in) for x < nx do
        loop (f_in) for y < ny do
            loop (f_in) for i < 9 do 
                f_in with [(x+(i32.f64 c[i].1)+nx)%nx,(y+(i32.f64 c[i].2)+ny)%ny,i] = f_out[x,y,i]

let norm_u [nx][ny] (u: [nx][ny]vector): [nx][ny]f64 =
    map (\(u_x) ->
        map (\(u_x_y) ->
            norm(u_x_y)
        ) u_x
    ) u

-- executes one iteration of Lattice-Boltzmann
-- which means it takes an f_in and updates it
-- also returns the norm of speed matrix u
entry lattice_boltzmann_iteration [nx][ny] (f_in: [nx][ny][9]f64)(mask: [nx][ny]u32)(g_x: f64)(g_y: f64)(tau: f64): ([nx][ny][9]f64, [nx][ny]f64) =
    let (f_out, _ , u)  = collision(f_in)(g_x,g_y)(tau)(mask)
    let new_f_in        = propagation(f_out)
    let speed           = norm_u(u)
    in (new_f_in, speed)

-- executes Lattice-Boltzmann it times and
let lattice_boltzmann [nx][ny] (g: vector)(tau: f64)(mask: [nx][ny]u32)(it: i32): [nx][ny]f64 =

    let speed       = init_matrix(nx)(ny)(0f64)
    let f_in        = init_f_in(nx)(ny)(g.1)(g.2)(tau)
    let(_, u_speed) = loop (f_in, speed) for _i < it do
        lattice_boltzmann_iteration(f_in)(mask)(g.1)(g.2)(tau)
    in u_speed

let main(nx: i32)(ny: i32)(g_x: f64)(g_y: f64)(tau: f64)(it: i32): [nx][ny]f64 =

    -- Constantes
    let g: vector   = (g_x, g_y)
    let mask        = init_mask(nx)(ny)

    in lattice_boltzmann(g)(tau)(mask)(it)

-- VECTOR file --

import "futlib/math"

-- type vector
type vector = (f64,f64)

------------ FUNCTIONS ------------

-- addition of two vectors
let add(v1: vector)(v2: vector): vector =
    let(a,b) = v1
    let(c,d) = v2
    in (a+c, b+d)

-- multiply a vector by a float
let mult(f: f64)(v: vector): vector =
    let (a,b) = v
    in (f*a, f*b)

-- divide a vector by a float
let div(f: f64)(v: vector): vector =
    let(a,b) = v
    in (a/f, b/f)

-- dotprod of two vectors
let dotprod(v1: vector)(v2: vector): f64 =
    let(a,b) = v1
    let(c,d) = v2
    in a*c + b*d

-- square of a vector
let square(v: vector): f64 =
    dotprod(v)(v)

-- norm of a vector
let norm(v: vector): f64 =
    f64.sqrt(square(v))

athas commented 5 years ago

The problem is in the propagation function, which is fully sequential (runs on the CPU). This is extra bad in Futhark, because when using the CUDA backend, all arrays are currently naively stored on the GPU, even if they are only used on the CPU. This adds orders of magnitude of overhead to the memory access latency.

I think the best way to rewrite propagation is to perform the complicated indexing on f_out, such that the result can be computed with a map nest. However, I don't know Lattice-Boltzman methods well enough to say whether this is reasonable. If you do need complicated in-place updates on the result, then the only way is to use scatter, which operates on a flat array:

let propagation [nx][ny] (f_out: [nx][ny][9]f64): [nx][ny][9]f64 =
  let f_in = replicate (nx*ny*9) 0
  let indices = tabulate_3d nx ny 9 (\x y i -> ((x+(i32.f64 c[i].1)+nx)%nx) * ny * 9 +
                                               (((y+(i32.f64 c[i].2)+ny)%ny)*9) +
                                               i)
                |> flatten_3d
  in scatter f_in indices (flatten_3d f_out) |> unflatten_3d nx ny 9

Notoe the nasty index computations to turn the (conceptually) multi-dimensional index into a flat row-major index. The above will be fully parallel.

LaurieGustavo commented 5 years ago

Thank you very much, i'll give it a try

athas commented 5 years ago

Here is an even better approach, without nasty flattening:

let propagation [nx][ny] (f_out: [nx][ny][9]f64): [nx][ny][9]f64 =
  tabulate_3d nx ny 9 (\x y i -> f_out[(x-(i32.f64 c[i].1))%nx,
                                       (y-(i32.f64 c[i].2))%ny,
                                       i])

You may need an unsafe to avoid bounds checking.

LaurieGustavo commented 5 years ago

May the glory of your name be sung from here to the very gates of heaven and beyond (a.k.a thanks)

athas commented 5 years ago

Does it work? Can we close this issue?

LaurieGustavo commented 5 years ago

It does work. Now i'm basically trying to do some optimizations. I can't guarantee that i won't have any new questions about this specific code and Futhark until 11 July, when my work is due. Can we keep this issue opened until then ?

athas commented 5 years ago

Sure, no problem.

diku-dk / futhark

Lattice-Boltzmann, CUDA and GPU #776