JuliaGPU / CUDA.jl

CUDA programming in Julia.
https://juliagpu.org/cuda/
Other
1.21k stars 221 forks source link

device-side rand() are not random between successive kernel launches #1633

Closed zpeng2 closed 6 months ago

zpeng2 commented 2 years ago

Describe the bug

device-side rand() are not random between successive kernel launches

To reproduce

The Minimal Working Example (MWE) for this bug:

using CUDA, LinearAlgebra, Random

function kernel_test(out, n)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    @inbounds for i = index:stride:n
        val = rand(Float64)
        if i == 100
            @cuprintln "val=" val
        end
        out[i] = val
    end
    return nothing
end
f(x) = 2x 
N = 1000
C = CUDA.rand(Float64, N)
g = CUDA.zeros(Float64, N)
out = similar(g)
A = CUDA.ones(Float64, N, N)
julia> for i in 1:10
           @. g = f(C)
           A * g # this operation screws up rand in cuda kernel?
           @cuda threads = N blocks = 1 kernel_test(out, N)
       end
val=0.923766
val=0.923766
val=0.923766
val=0.923766
val=0.923766
val=0.923766
val=0.923766
val=0.923766
val=0.923766
val=0.923766
Manifest.toml # This file is machine-generated - editing it directly is not advised julia_version = "1.7.3" manifest_format = "2.0" [[deps.AbstractFFTs]] deps = ["ChainRulesCore", "LinearAlgebra"] git-tree-sha1 = "69f7020bd72f069c219b5e8c236c1fa90d2cb409" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" version = "1.2.1" [[deps.Adapt]] deps = ["LinearAlgebra"] git-tree-sha1 = "195c5505521008abea5aee4f96930717958eac6f" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" version = "3.4.0" [[deps.ArgParse]] deps = ["Logging", "TextWrap"] git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" version = "1.1.4" [[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" [[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" [[deps.BFloat16s]] deps = ["LinearAlgebra", "Printf", "Random", "Test"] git-tree-sha1 = "a598ecb0d717092b5539dbbe890c98bac842b072" uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" version = "0.2.0" [[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" [[deps.BenchmarkTools]] deps = ["JSON", "Logging", "Printf", "Profile", "Statistics", "UUIDs"] git-tree-sha1 = "4c10eee4af024676200bc7752e536f858c6b8f93" uuid = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" version = "1.3.1" [[deps.CEnum]] git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" version = "0.4.2" [[deps.CUDA]] deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "SpecialFunctions", "TimerOutputs"] git-tree-sha1 = "49549e2c28ffb9cc77b3689dc10e46e6271e9452" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" version = "3.12.0" [[deps.ChainRulesCore]] deps = ["Compat", "LinearAlgebra", "SparseArrays"] git-tree-sha1 = "80ca332f6dcb2508adba68f22f551adb2d00a624" uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" version = "1.15.3" [[deps.ChangesOfVariables]] deps = ["ChainRulesCore", "LinearAlgebra", "Test"] git-tree-sha1 = "38f7a08f19d8810338d4f5085211c7dfa5d5bdd8" uuid = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" version = "0.1.4" [[deps.Compat]] deps = ["Dates", "LinearAlgebra", "UUIDs"] git-tree-sha1 = "5856d3031cdb1f3b2b6340dfdc66b6d9a149a374" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" version = "4.2.0" [[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" [[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" [[deps.DocStringExtensions]] deps = ["LibGit2"] git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" version = "0.8.6" [[deps.Downloads]] deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" [[deps.ExprTools]] git-tree-sha1 = "56559bbef6ca5ea0c0818fa5c90320398a6fbf8d" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" version = "0.1.8" [[deps.FileWatching]] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" [[deps.GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] git-tree-sha1 = "45d7deaf05cbb44116ba785d147c518ab46352d7" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" version = "8.5.0" [[deps.GPUArraysCore]] deps = ["Adapt"] git-tree-sha1 = "6872f5ec8fd1a38880f027a26739d42dcda6691f" uuid = "46192b85-c4d5-4398-a991-12ede77f4527" version = "0.1.2" [[deps.GPUCompiler]] deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "TimerOutputs", "UUIDs"] git-tree-sha1 = "122d7bcc92abf94cf1a86281ad7a4d0e838ab9e0" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" version = "0.16.3" [[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[deps.InverseFunctions]] deps = ["Test"] git-tree-sha1 = "b3364212fb5d870f724876ffcd34dd8ec6d98918" uuid = "3587e190-3f89-42d0-90ee-14403ec27112" version = "0.1.7" [[deps.IrrationalConstants]] git-tree-sha1 = "7fd44fd4ff43fc60815f8e764c0f352b83c49151" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" version = "0.1.1" [[deps.JLLWrappers]] deps = ["Preferences"] git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" version = "1.4.1" [[deps.JSON]] deps = ["Dates", "Mmap", "Parsers", "Unicode"] git-tree-sha1 = "3c837543ddb02250ef42f4738347454f95079d4e" uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" version = "0.21.3" [[deps.LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] git-tree-sha1 = "e7e9184b0bf0158ac4e4aa9daf00041b5909bf1a" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" version = "4.14.0" [[deps.LLVMExtra_jll]] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg", "TOML"] git-tree-sha1 = "771bfe376249626d3ca12bcd58ba243d3f961576" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" version = "0.0.16+0" [[deps.LazyArtifacts]] deps = ["Artifacts", "Pkg"] uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" [[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" [[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" [[deps.LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" [[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" [[deps.LinearAlgebra]] deps = ["Libdl", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[deps.LogExpFunctions]] deps = ["ChainRulesCore", "ChangesOfVariables", "DocStringExtensions", "InverseFunctions", "IrrationalConstants", "LinearAlgebra"] git-tree-sha1 = "94d9c52ca447e23eac0c0f074effbcd38830deb5" uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" version = "0.3.18" [[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" [[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" [[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" [[deps.Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" [[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" [[deps.OpenBLAS_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" [[deps.OpenLibm_jll]] deps = ["Artifacts", "Libdl"] uuid = "05823500-19ac-5b8b-9628-191a04bc5112" [[deps.OpenSpecFun_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" version = "0.5.5+0" [[deps.Parsers]] deps = ["Dates"] git-tree-sha1 = "3d5bf43e3e8b412656404ed9466f1dcbf7c50269" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" version = "2.4.0" [[deps.Pkg]] deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[deps.Preferences]] deps = ["TOML"] git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d" uuid = "21216c6a-2e73-6563-6e65-726566657250" version = "1.3.0" [[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" [[deps.Profile]] deps = ["Printf"] uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79" [[deps.REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[deps.Random]] deps = ["SHA", "Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[deps.Random123]] deps = ["Random", "RandomNumbers"] git-tree-sha1 = "7a1a306b72cfa60634f03a911405f4e64d1b718b" uuid = "74087812-796a-5b5d-8853-05524746bad3" version = "1.6.0" [[deps.RandomNumbers]] deps = ["Random", "Requires"] git-tree-sha1 = "043da614cc7e95c703498a491e2c21f58a2b8111" uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.5.3" [[deps.Reexport]] git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.2.2" [[deps.Requires]] deps = ["UUIDs"] git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" uuid = "ae029012-a4dd-5104-9daa-d747884805df" version = "1.3.0" [[deps.SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" [[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" [[deps.SparseArrays]] deps = ["LinearAlgebra", "Random"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[deps.SpecialFunctions]] deps = ["ChainRulesCore", "IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] git-tree-sha1 = "d75bda01f8c31ebb72df80a46c88b25d1c79c56d" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" version = "2.1.7" [[deps.StaticArrays]] deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"] git-tree-sha1 = "2d4e51cfad63d2d34acde558027acbc66700349b" uuid = "90137ffa-7385-5640-81b9-e52037218182" version = "1.5.3" [[deps.StaticArraysCore]] git-tree-sha1 = "ec2bd695e905a3c755b33026954b119ea17f2d22" uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" version = "1.3.0" [[deps.Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [[deps.Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" [[deps.Test]] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[deps.TextWrap]] git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" uuid = "b718987f-49a8-5099-9789-dcd902bef87d" version = "1.0.1" [[deps.TimerOutputs]] deps = ["ExprTools", "Printf"] git-tree-sha1 = "9dfcb767e17b0849d6aaf85997c98a5aea292513" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" version = "0.5.21" [[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" [[deps.libblastrampoline_jll]] deps = ["Artifacts", "Libdl", "OpenBLAS_jll"] uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" [[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" [[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"

``` Paste your Manifest.toml here, or accurately describe which version of CUDA.jl and its dependencies (GPUArrays.jl, GPUCompiler.jl, LLVM.jl) you are using. ```

Expected behavior

rand() in successive kernel launches should be "random"

Version info

Details on Julia:

# please post the output of:
versioninfo()
Julia Version 1.7.1
Commit ac5cc99908 (2021-12-22 19:35 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Core(TM) i7-8700 CPU @ 3.20GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-12.0.1 (ORCJIT, skylake)
Environment:
  JULIA_NUM_THREADS = 8

Details on CUDA:

# please post the output of:
CUDA.versioninfo()
CUDA toolkit 11.7, artifact installation
NVIDIA driver 510.39.1, for CUDA 11.6
CUDA driver 11.6

Libraries:
- CUBLAS: 11.10.1
- CURAND: 10.2.10
- CUFFT: 10.7.2
- CUSOLVER: 11.3.5
- CUSPARSE: 11.7.3
- CUPTI: 17.0.0
- NVML: 11.0.0+510.39.1
- CUDNN: 8.30.2 (for CUDA 11.5.0)
- CUTENSOR: 1.4.0 (for CUDA 11.5.0)

Toolchain:
- Julia: 1.7.1
- LLVM: 12.0.1
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0
- Device capability support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80

1 device:
  0: NVIDIA GeForce RTX 2080 (sm_75, 7.778 GiB / 8.000 GiB available)

Additional context

N/A

maleadt commented 2 years ago

rand() in successive kernel launches should be "random"

Not necessarily, as that would require copying random state back to the CPU after each kernel, which is prohibitively expensive. What we do, is store that state in shared memory, which might be reset after a kernel finishes or might retain the state (it's undefined behavior). In the latter case we'll generate new random numbers, in the former case we should re-initialize the seed based on the clock() instruction which should give us a new seed unless you're launching kernels in quick succession.

It'd be interesting to see why that isn't happening here, but in general, you need to seed yourself if you want to guarantee different random numbers in different kernels.

Alternatively, we could use the new kernel state to set a random seed on kernel launch, but I'm not entirely sure that wouldn't result in loss of reproducibility though.

zpeng2 commented 2 years ago

When simulating stochastic dynamics (in my case I simulate the Brownian motion of particles), it is typical to launch kernels in quick succession. At each time step, first, some computation is done and then a kernel is launched to update the positions of all particles. In the entire simulation, millions of kernels will be launched that require random numbers.

I tried to seed the generator at each launch based on the time index:

function kernel_test(out, n, seed)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = blockDim().x * gridDim().x
    Random.seed!(seed)
    @inbounds for i = index:stride:n
        val = rand(Float64)
        if i == 100
            @cuprintln "val=" val
        end
        out[i] = val
    end
    return nothing
end
julia> for i in 1:10
           @. g = f(C)
           A * g 
           @cuda threads = N blocks = 1 kernel_test(out, N, i) # seed the rng with current time index?
       end
val=0.273348
val=0.391978
val=0.373867
val=0.531302
val=0.343881
val=0.245345
val=0.169952
val=0.833296
val=0.340522
val=0.087738

This seems to be working, not sure if there is a better or correct way to seed it if one wishes to launch millions of kernels in succession.

Yangyang-Tan commented 1 year ago

For the Brownian motion simulation, I recommend to use the SDE solver in DifferentialEquations.jl which can implement Brownian motion simulation on GPU in any dimensions.

zpeng2 commented 1 year ago

For the Brownian motion simulation, I recommend to use the SDE solver in DifferentialEquations.jl which can implement Brownian motion simulation on GPU in any dimensions.

Thanks for the note; I have a more involved Brownian dynamics simulation in which particles have finite size and interact sterically and/or hydrodynamically.

maleadt commented 6 months ago

This is fixed now:

julia> for i in 1:10
           @. g = f(C)
           A * g # this operation screws up rand in cuda kernel?
           @cuda threads = N blocks = 1 kernel_test(out, N)
       end

julia> val=0.934863
val=0.713337
val=0.738163
val=0.026299
val=0.279074
val=0.992566
val=0.564650
val=0.135331
val=0.522691
val=0.884230