diku-dk / futhark

:boom::computer::boom: A data-parallel functional programming language
http://futhark-lang.org
ISC License
2.37k stars 164 forks source link

With OpenCL function returns 0xCDCDCDCD #1207

Closed Sirius902 closed 3 years ago

Sirius902 commented 3 years ago

Hello, when trying to run this code I noticed some oddities with OpenCL. When compiling with futhark c, I get the expected result of 0x0047FF53 but when compiling with futhark opencl I get 0xCDCDCDCD instead.

module JavaRandom = {
    type gen = { seed: u64 }

    let multiplier: u64 = 0x5DEECE66D
    let addend: u64 = 0xB
    let mask: u64 = (1 << 48) - 1

    let init(seed: u64): gen = { seed = (seed ^ multiplier) & mask }

    let next (bits: u64) (self: gen): (gen, i32) =
        let nextseed = (self.seed * multiplier + addend) & mask
        in ({ seed = nextseed }, i32.u64 (nextseed >> (48 - bits)))

    let int = next 32

    let int_less_than (bound: i32) (self: gen): (gen, i32) =
        let (g, r) = next 31 self
        let m = bound - 1
        in if bound & m == 0 then
            (g, i32.i64 ((i64.i32 bound * i64.i32 r) >> 31))
        else
            let (g', n, _) =
                loop (g', r', u) = (g, r % bound, r) while u - r' + m < 0 do
                    let (g'', u') = next 31 g'
                    in (g'', u % bound, u')
            in (g', n)

    let float = next 24 >-> (\(g, n) -> (g, f32.i32 n / f32.i32 (1 << 24)))
}

type slot = #first | #second | #third

-- represents a cast from int to long in Java
let long_cast(n: u32): u64 =
    let n' = u64.u32 n
    in if n & (1 << (32 - 1)) > 0 then
        n' | (0xFFFFFFFF << 32)
    else
        n'

let level_cost (bookshelves: i32) (s: slot) (g: JavaRandom.gen): (JavaRandom.gen, i32) =
    let power = i32.min bookshelves 15
    let (g', r1) = JavaRandom.int_less_than 8 g
    let (g'', r2) = JavaRandom.int_less_than (power + 1) g'
    let j = r1 + 1 + (power >> 1) + r2
    let cost =
        match s
            case #first -> i32.max (j / 3) 1
            case #second -> j * 2 / 3 + 1
            case #third -> i32.max j (power * 2)
    in (g'', cost)

let enchant_levels (bookshelves: i32) (xp_seed: u32): [3]i32 =
    let g = JavaRandom.init (long_cast xp_seed)
    let (g', f) = level_cost bookshelves #first g
    let (g'', s) = level_cost bookshelves #second g'
    let (_, t) = level_cost bookshelves #third g''
    in zip (iota 3) [f, s, t]
        |> map (\(i, l) -> if i64.i32 l < i + 1 then 0 else l)

let crack_xp_seed: u32 =
    let bookshelves = 15
    let levels = [9, 20, 30]
    in iota 0x100000000
        |> map u32.i64
        |> filter (enchant_levels bookshelves >-> (==levels))
        |> length >-> u32.i64

let main(_: i32): u32 = crack_xp_seed
athas commented 3 years ago

What kind of GPU are you running this on? This program requires dozens of GiB of memory.

athas commented 3 years ago

You should also read this, which will make your code faster anyway.

Sirius902 commented 3 years ago

Thank you for taking a look! I should probably clarify: I am running this on an NVIDIA RTX 2080 Ti and I don't intend on actually returning the length of the filtered list, I was only doing that for testing. I intend on serializing and returning the list of potential seeds to the caller. I have noticed that returning the head of the list instead of the length also results in the same 0xCDCDCDCD. Do you think I could be hitting the memory limit on my GPU?

athas commented 3 years ago

An RTX 2080 Ti does not have enough memory to run this program, and I'm surprised you don't get an out-of-memory crash. I do on my RTX 2080 Ti.

Instead of computing all 2**32 values immediately, I suggest you do multiple passes of fewer elements, and concatenate the results:

let crack_xp_seed: u32 =
    let bookshelves = 15
    let levels = [9, 20, 30]
    let log_num_chunks = 3
    let num_chunks = 1<<log_num_chunks
    let chunk_size = 0x100000000 >> log_num_chunks
    let res =
      loop acc = [] for i < num_chunks do
        acc ++
            (iota chunk_size
             |> map (+(chunk_size*i))
             |> map u32.i64
             |> filter (enchant_levels bookshelves >-> (==levels)))
    in u32.i64 (length res)

You'll still need to have enough memory to fit all the results on the GPU, of course, but that looks like an array with less than five million elements.

Sirius902 commented 3 years ago

Thanks! I tried out your suggestion and managed to get it to work with log_num_chunks = 4.