JuliaGPU / CUDA.jl

CUDA programming in Julia.
https://juliagpu.org/cuda/
Other
1.22k stars 223 forks source link

CUSPARSE error when multiplying sparse matrix by a vector #2555

Open lpawela opened 1 week ago

lpawela commented 1 week ago

Describe the bug For sparse matrices with a relatively large number of nonzero elements I get the following error

ERROR: LoadError: CUSPARSEError: internal error (code 7, CUSPARSE_STATUS_INTERNAL_ERROR)
Stacktrace:
  [1] throw_api_error(res::CUDA.CUSPARSE.cusparseStatus_t)
    @ CUDA.CUSPARSE ~/.julia/packages/CUDA/2kjXI/lib/cusparse/libcusparse.jl:14
  [2] check
    @ ~/.julia/packages/CUDA/2kjXI/lib/cusparse/libcusparse.jl:27 [inlined]
  [3] cusparseSpMV
    @ ~/.julia/packages/CUDA/2kjXI/lib/utils/call.jl:34 [inlined]
  [4] (::CUDA.CUSPARSE.var"#1293#1295"{Bool, Bool, CUDA.CUSPARSE.cusparseSpMVAlg_t, DataType, CUDA.CUSPARSE.CuDenseVectorDescriptor, CUDA.CUSPARSE.CuDenseVectorDescriptor})(buffer::CuArray{UInt8, 1, CUDA.DeviceMemory})
    @ CUDA.CUSPARSE ~/.julia/packages/CUDA/2kjXI/lib/cusparse/generic.jl:188
  [5] with_workspaces(f::CUDA.CUSPARSE.var"#1293#1295"{Bool, Bool, CUDA.CUSPARSE.cusparseSpMVAlg_t, DataType, CUDA.CUSPARSE.CuDenseVectorDescriptor, CUDA.CUSPARSE.CuDenseVectorDescriptor}, cache_gpu::Nothing, cache_cpu::Nothing, size_gpu::CUDA.CUSPARSE.var"#bufferSize#1294"{Bool, Bool, CUDA.CUSPARSE.cusparseSpMVAlg_t, DataType, CUDA.CUSPARSE.CuDenseVectorDescriptor, CUDA.CUSPARSE.CuDenseVectorDescriptor}, size_cpu::Int64)
    @ CUDA.APIUtils ~/.julia/packages/CUDA/2kjXI/lib/utils/call.jl:131
  [6] with_workspace
    @ ~/.julia/packages/CUDA/2kjXI/lib/utils/call.jl:67 [inlined]
  [7] mv!(transa::Char, alpha::Bool, A::CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int64}, X::CuArray{Float32, 1, CUDA.DeviceMemory}, beta::Bool, Y::CuArray{Float32, 1, CUDA.DeviceMemory}, index::Char, algo::CUDA.CUSPARSE.cusparseSpMVAlg_t)
    @ CUDA.CUSPARSE ~/.julia/packages/CUDA/2kjXI/lib/cusparse/generic.jl:187
  [8] mv!
    @ ~/.julia/packages/CUDA/2kjXI/lib/cusparse/generic.jl:143 [inlined]
  [9] mv_wrapper
    @ ~/.julia/packages/CUDA/2kjXI/lib/cusparse/interfaces.jl:34 [inlined]
 [10] generic_matvecmul!
    @ ~/.julia/packages/CUDA/2kjXI/lib/cusparse/interfaces.jl:74 [inlined]
 [11] generic_matvecmul!
    @ ~/.julia/packages/CUDA/2kjXI/lib/cusparse/interfaces.jl:65 [inlined]
 [12] _mul!
    @ ~/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/share/julia/stdlib/v1.11/LinearAlgebra/src/matmul.jl:73 [inlined]
 [13] mul!
    @ ~/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/share/julia/stdlib/v1.11/LinearAlgebra/src/matmul.jl:70 [inlined]
 [14] mul!
    @ ~/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/share/julia/stdlib/v1.11/LinearAlgebra/src/matmul.jl:253 [inlined]
 [15] *(A::CUDA.CUSPARSE.CuSparseMatrixCSR{Float32, Int64}, x::CuArray{Float32, 1, CUDA.DeviceMemory})
    @ LinearAlgebra ~/.julia/juliaup/julia-1.11.1+0.x64.linux.gnu/share/julia/stdlib/v1.11/LinearAlgebra/src/matmul.jl:60
 [16] top-level scope
    @ ~/ttt/mwe.jl:35
in expression starting at /home/lpawela/ttt/mwe.jl:18

To reproduce

The Minimal Working Example (MWE) for this bug. If I change nz = 2^31 - 2^7 the code completes without error. This seems to be independent of dim. Tested for a few choices of dim between 10^6 and 10^7.

using CUDA, SparseArrays

nz = 2^31-2^7+1
dim = 10^7
a = spzeros(dim, dim)
v = rand(nz)
a[1:nz] = v

@assert nnz(a) == nz #sanity check
J2 = CUSPARSE.CuSparseMatrixCSR{Float32,Int64}(
               CuVector{Int64}(a.colptr),
               CuVector{Int64}(a.rowval),
               CuVector{Float32}(a.nzval),
               size(a),
           ) # control index and nzval types
x = CUDA.rand(size(J2, 1))

CUDA.@sync J2 * x
Manifest.toml

``` # This file is machine-generated - editing it directly is not advised julia_version = "1.11.1" manifest_format = "2.0" project_hash = "bb90c52fd56d684edfdbaad07e216a548307fa02" [[deps.AbstractFFTs]] deps = ["LinearAlgebra"] git-tree-sha1 = "d92ad398961a3ed262d8bf04a1a2b8340f915fef" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" version = "1.5.0" [deps.AbstractFFTs.extensions] AbstractFFTsChainRulesCoreExt = "ChainRulesCore" AbstractFFTsTestExt = "Test" [deps.AbstractFFTs.weakdeps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[deps.Adapt]] deps = ["LinearAlgebra", "Requires"] git-tree-sha1 = "50c3c56a52972d78e8be9fd135bfb91c9574c140" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" version = "4.1.1" weakdeps = ["StaticArrays"] [deps.Adapt.extensions] AdaptStaticArraysExt = "StaticArrays" [[deps.ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" version = "1.1.2" [[deps.Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" version = "1.11.0" [[deps.Atomix]] deps = ["UnsafeAtomics"] git-tree-sha1 = "c06a868224ecba914baa6942988e2f2aade419be" uuid = "a9b6321e-bd34-4604-b9c9-b65b8de01458" version = "0.1.0" [[deps.BFloat16s]] deps = ["LinearAlgebra", "Printf", "Random", "Test"] git-tree-sha1 = "2c7cc21e8678eff479978a0a2ef5ce2f51b63dff" uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" version = "0.5.0" [[deps.Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" version = "1.11.0" [[deps.CEnum]] git-tree-sha1 = "389ad5c84de1ae7cf0e28e381131c98ea87d54fc" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" version = "0.5.0" [[deps.CUDA]] deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CUDA_Driver_jll", "CUDA_Runtime_Discovery", "CUDA_Runtime_jll", "Crayons", "DataFrames", "ExprTools", "GPUArrays", "GPUCompiler", "KernelAbstractions", "LLVM", "LLVMLoopInfo", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "NVTX", "Preferences", "PrettyTables", "Printf", "Random", "Random123", "RandomNumbers", "Reexport", "Requires", "SparseArrays", "StaticArrays", "Statistics", "demumble_jll"] git-tree-sha1 = "e0725a467822697171af4dae15cec10b4fc19053" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" version = "5.5.2" [deps.CUDA.extensions] ChainRulesCoreExt = "ChainRulesCore" EnzymeCoreExt = "EnzymeCore" SpecialFunctionsExt = "SpecialFunctions" [deps.CUDA.weakdeps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" [[deps.CUDA_Driver_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "ccd1e54610c222fadfd4737dac66bff786f63656" uuid = "4ee394cb-3365-5eb0-8335-949819d2adfc" version = "0.10.3+0" [[deps.CUDA_Runtime_Discovery]] deps = ["Libdl"] git-tree-sha1 = "33576c7c1b2500f8e7e6baa082e04563203b3a45" uuid = "1af6417a-86b4-443c-805f-a4643ffb695f" version = "0.3.5" [[deps.CUDA_Runtime_jll]] deps = ["Artifacts", "CUDA_Driver_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] git-tree-sha1 = "ed8a056a88f5b852df94046060f6770a57334728" uuid = "76a88914-d11a-5bdc-97e0-2f5a05c973a2" version = "0.15.4+0" [[deps.ColorTypes]] deps = ["FixedPointNumbers", "Random"] git-tree-sha1 = "b10d0b65641d57b8b4d5e234446582de5047050d" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" version = "0.11.5" [[deps.Colors]] deps = ["ColorTypes", "FixedPointNumbers", "Reexport"] git-tree-sha1 = "362a287c3aa50601b0bc359053d5c2468f0e7ce0" uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" version = "0.12.11" [[deps.Compat]] deps = ["TOML", "UUIDs"] git-tree-sha1 = "8ae8d32e09f0dcf42a36b90d4e17f5dd2e4c4215" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" version = "4.16.0" weakdeps = ["Dates", "LinearAlgebra"] [deps.Compat.extensions] CompatLinearAlgebraExt = "LinearAlgebra" [[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" version = "1.1.1+0" [[deps.Crayons]] git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" version = "4.1.1" [[deps.DataAPI]] git-tree-sha1 = "abe83f3a2f1b857aac70ef8b269080af17764bbe" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" version = "1.16.0" [[deps.DataFrames]] deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] git-tree-sha1 = "fb61b4812c49343d7ef0b533ba982c46021938a6" uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" version = "1.7.0" [[deps.DataStructures]] deps = ["Compat", "InteractiveUtils", "OrderedCollections"] git-tree-sha1 = "1d0a14036acb104d9e89698bd408f63ab58cdc82" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" version = "0.18.20" [[deps.DataValueInterfaces]] git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" version = "1.0.0" [[deps.Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" version = "1.11.0" [[deps.Distributed]] deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" version = "1.11.0" [[deps.Downloads]] deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" version = "1.6.0" [[deps.ExprTools]] git-tree-sha1 = "27415f162e6028e81c72b82ef756bf321213b6ec" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" version = "0.1.10" [[deps.FileWatching]] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" version = "1.11.0" [[deps.FixedPointNumbers]] deps = ["Statistics"] git-tree-sha1 = "05882d6995ae5c12bb5f36dd2ed3f61c98cbb172" uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" version = "0.8.5" [[deps.Future]] deps = ["Random"] uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" version = "1.11.0" [[deps.GPUArrays]] deps = ["Adapt", "GPUArraysCore", "LLVM", "LinearAlgebra", "Printf", "Random", "Reexport", "Serialization", "Statistics"] git-tree-sha1 = "62ee71528cca49be797076a76bdc654a170a523e" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" version = "10.3.1" [[deps.GPUArraysCore]] deps = ["Adapt"] git-tree-sha1 = "ec632f177c0d990e64d955ccc1b8c04c485a0950" uuid = "46192b85-c4d5-4398-a991-12ede77f4527" version = "0.1.6" [[deps.GPUCompiler]] deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "PrecompileTools", "Preferences", "Scratch", "Serialization", "TOML", "TimerOutputs", "UUIDs"] git-tree-sha1 = "1d6f290a5eb1201cd63574fbc4440c788d5cb38f" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" version = "0.27.8" [[deps.InlineStrings]] git-tree-sha1 = "45521d31238e87ee9f9732561bfee12d4eebd52d" uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" version = "1.4.2" [deps.InlineStrings.extensions] ArrowTypesExt = "ArrowTypes" ParsersExt = "Parsers" [deps.InlineStrings.weakdeps] ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd" Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" [[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" version = "1.11.0" [[deps.InvertedIndices]] git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" version = "1.3.0" [[deps.IteratorInterfaceExtensions]] git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" uuid = "82899510-4779-5014-852e-03e436cf321d" version = "1.0.0" [[deps.JLLWrappers]] deps = ["Artifacts", "Preferences"] git-tree-sha1 = "be3dc50a92e5a386872a493a10050136d4703f9b" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" version = "1.6.1" [[deps.JuliaNVTXCallbacks_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "af433a10f3942e882d3c671aacb203e006a5808f" uuid = "9c1d0b0a-7046-5b2e-a33f-ea22f176ac7e" version = "0.2.1+0" [[deps.KernelAbstractions]] deps = ["Adapt", "Atomix", "InteractiveUtils", "MacroTools", "PrecompileTools", "Requires", "StaticArrays", "UUIDs", "UnsafeAtomics", "UnsafeAtomicsLLVM"] git-tree-sha1 = "e73a077abc7fe798fe940deabe30ef6c66bdde52" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" version = "0.9.29" [deps.KernelAbstractions.extensions] EnzymeExt = "EnzymeCore" LinearAlgebraExt = "LinearAlgebra" SparseArraysExt = "SparseArrays" [deps.KernelAbstractions.weakdeps] EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[deps.LLVM]] deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Preferences", "Printf", "Unicode"] git-tree-sha1 = "d422dfd9707bec6617335dc2ea3c5172a87d5908" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" version = "9.1.3" weakdeps = ["BFloat16s"] [deps.LLVM.extensions] BFloat16sExt = "BFloat16s" [[deps.LLVMExtra_jll]] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] git-tree-sha1 = "05a8bd5a42309a9ec82f700876903abce1017dd3" uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" version = "0.0.34+0" [[deps.LLVMLoopInfo]] git-tree-sha1 = "2e5c102cfc41f48ae4740c7eca7743cc7e7b75ea" uuid = "8b046642-f1f6-4319-8d3c-209ddc03c586" version = "1.0.0" [[deps.LaTeXStrings]] git-tree-sha1 = "dda21b8cbd6a6c40d9d02a73230f9d70fed6918c" uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" version = "1.4.0" [[deps.LazyArtifacts]] deps = ["Artifacts", "Pkg"] uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" version = "1.11.0" [[deps.LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" version = "0.6.4" [[deps.LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" version = "8.6.0+0" [[deps.LibGit2]] deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" version = "1.11.0" [[deps.LibGit2_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"] uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5" version = "1.7.2+0" [[deps.LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" version = "1.11.0+1" [[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" version = "1.11.0" [[deps.LinearAlgebra]] deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" version = "1.11.0" [[deps.Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" version = "1.11.0" [[deps.MacroTools]] deps = ["Markdown", "Random"] git-tree-sha1 = "2fa9ee3e63fd3a4f7a9a4f4744a52f4856de82df" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" version = "0.5.13" [[deps.Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" version = "1.11.0" [[deps.MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" version = "2.28.6+0" [[deps.Missings]] deps = ["DataAPI"] git-tree-sha1 = "ec4f7fbeab05d7747bdf98eb74d130a2a2ed298d" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" version = "1.2.0" [[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" version = "2023.12.12" [[deps.NVTX]] deps = ["Colors", "JuliaNVTXCallbacks_jll", "Libdl", "NVTX_jll"] git-tree-sha1 = "53046f0483375e3ed78e49190f1154fa0a4083a1" uuid = "5da4648a-3479-48b8-97b9-01cb529c0a1f" version = "0.3.4" [[deps.NVTX_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "ce3269ed42816bf18d500c9f63418d4b0d9f5a3b" uuid = "e98f9f5b-d649-5603-91fd-7774390e6439" version = "3.1.0+2" [[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" version = "1.2.0" [[deps.OpenBLAS_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" version = "0.3.27+1" [[deps.OrderedCollections]] git-tree-sha1 = "dfdf5519f235516220579f949664f1bf44e741c5" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.6.3" [[deps.Pkg]] deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "Random", "SHA", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" version = "1.11.0" [deps.Pkg.extensions] REPLExt = "REPL" [deps.Pkg.weakdeps] REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[deps.PooledArrays]] deps = ["DataAPI", "Future"] git-tree-sha1 = "36d8b4b899628fb92c2749eb488d884a926614d3" uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" version = "1.4.3" [[deps.PrecompileTools]] deps = ["Preferences"] git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f" uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" version = "1.2.1" [[deps.Preferences]] deps = ["TOML"] git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6" uuid = "21216c6a-2e73-6563-6e65-726566657250" version = "1.4.3" [[deps.PrettyTables]] deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "Reexport", "StringManipulation", "Tables"] git-tree-sha1 = "1101cd475833706e4d0e7b122218257178f48f34" uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" version = "2.4.0" [[deps.Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" version = "1.11.0" [[deps.ProgressMeter]] deps = ["Distributed", "Printf"] git-tree-sha1 = "8f6bc219586aef8baf0ff9a5fe16ee9c70cb65e4" uuid = "92933f4c-e287-5a05-a399-4b506db050ca" version = "1.10.2" [[deps.Random]] deps = ["SHA"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" version = "1.11.0" [[deps.Random123]] deps = ["Random", "RandomNumbers"] git-tree-sha1 = "4743b43e5a9c4a2ede372de7061eed81795b12e7" uuid = "74087812-796a-5b5d-8853-05524746bad3" version = "1.7.0" [[deps.RandomNumbers]] deps = ["Random"] git-tree-sha1 = "c6ec94d2aaba1ab2ff983052cf6a606ca5985902" uuid = "e6cf234a-135c-5ec9-84dd-332b85af5143" version = "1.6.0" [[deps.Reexport]] git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.2.2" [[deps.Requires]] deps = ["UUIDs"] git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" uuid = "ae029012-a4dd-5104-9daa-d747884805df" version = "1.3.0" [[deps.SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" version = "0.7.0" [[deps.Scratch]] deps = ["Dates"] git-tree-sha1 = "3bac05bc7e74a75fd9cba4295cde4045d9fe2386" uuid = "6c6a2e73-6563-6170-7368-637461726353" version = "1.2.1" [[deps.SentinelArrays]] deps = ["Dates", "Random"] git-tree-sha1 = "d0553ce4031a081cc42387a9b9c8441b7d99f32d" uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" version = "1.4.7" [[deps.Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" version = "1.11.0" [[deps.Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" version = "1.11.0" [[deps.SortingAlgorithms]] deps = ["DataStructures"] git-tree-sha1 = "66e0a8e672a0bdfca2c3f5937efb8538b9ddc085" uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" version = "1.2.1" [[deps.SparseArrays]] deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" version = "1.11.0" [[deps.StaticArrays]] deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"] git-tree-sha1 = "777657803913ffc7e8cc20f0fd04b634f871af8f" uuid = "90137ffa-7385-5640-81b9-e52037218182" version = "1.9.8" [deps.StaticArrays.extensions] StaticArraysChainRulesCoreExt = "ChainRulesCore" StaticArraysStatisticsExt = "Statistics" [deps.StaticArrays.weakdeps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[deps.StaticArraysCore]] git-tree-sha1 = "192954ef1208c7019899fbf8049e717f92959682" uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" version = "1.4.3" [[deps.Statistics]] deps = ["LinearAlgebra"] git-tree-sha1 = "ae3bb1eb3bba077cd276bc5cfc337cc65c3075c0" uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" version = "1.11.1" weakdeps = ["SparseArrays"] [deps.Statistics.extensions] SparseArraysExt = ["SparseArrays"] [[deps.StringManipulation]] deps = ["PrecompileTools"] git-tree-sha1 = "a6b1675a536c5ad1a60e5a5153e1fee12eb146e3" uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" version = "0.4.0" [[deps.SuiteSparse_jll]] deps = ["Artifacts", "Libdl", "libblastrampoline_jll"] uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" version = "7.7.0+0" [[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" version = "1.0.3" [[deps.TableTraits]] deps = ["IteratorInterfaceExtensions"] git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39" uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" version = "1.0.1" [[deps.Tables]] deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "OrderedCollections", "TableTraits"] git-tree-sha1 = "598cd7c1f68d1e205689b1c2fe65a9f85846f297" uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" version = "1.12.0" [[deps.Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" version = "1.10.0" [[deps.Test]] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" version = "1.11.0" [[deps.TimerOutputs]] deps = ["ExprTools", "Printf"] git-tree-sha1 = "3a6f063d690135f5c1ba351412c82bae4d1402bf" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" version = "0.5.25" [[deps.UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" version = "1.11.0" [[deps.Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" version = "1.11.0" [[deps.UnsafeAtomics]] git-tree-sha1 = "6331ac3440856ea1988316b46045303bef658278" uuid = "013be700-e6cd-48c3-b4a1-df204f14c38f" version = "0.2.1" [[deps.UnsafeAtomicsLLVM]] deps = ["LLVM", "UnsafeAtomics"] git-tree-sha1 = "2d17fabcd17e67d7625ce9c531fb9f40b7c42ce4" uuid = "d80eeb9a-aca5-4d75-85e5-170c8b632249" version = "0.2.1" [[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" version = "1.2.13+1" [[deps.demumble_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] git-tree-sha1 = "6498e3581023f8e530f34760d18f75a69e3a4ea8" uuid = "1e29f10c-031c-5a83-9565-69cddfc27673" version = "1.3.0+0" [[deps.libblastrampoline_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" version = "5.11.0+0" [[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" version = "1.59.0+0" [[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" version = "17.4.0+2" ```

Expected behavior

The code completes as for nz = 2^31 - 2^7.

Version info

Details on Julia:

# please post the output of:
versioninfo()

Julia Version 1.11.1
Commit 8f5b7ca12ad (2024-10-16 10:53 UTC)
Build Info:
  Official https://julialang.org/ release
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 128 × Intel(R) Xeon(R) Platinum 8462Y+
  WORD_SIZE: 64
  LLVM: libLLVM-16.0.6 (ORCJIT, sapphirerapids)
Threads: 1 default, 0 interactive, 1 GC (on 128 virtual cores)

Details on CUDA:

# please post the output of:
CUDA.versioninfo()

julia> CUDA.versioninfo()
CUDA runtime 12.6, artifact installation
CUDA driver 12.6
NVIDIA driver 550.54.15

CUDA libraries: 
- CUBLAS: 12.6.3
- CURAND: 10.3.7
- CUFFT: 11.3.0
- CUSOLVER: 11.7.1
- CUSPARSE: 12.5.4
- CUPTI: 2024.3.2 (API 24.0.0)
- NVML: 12.0.0+550.54.15

Julia packages: 
- CUDA: 5.5.2
- CUDA_Driver_jll: 0.10.3+0
- CUDA_Runtime_jll: 0.15.4+0

Toolchain:
- Julia: 1.11.1
- LLVM: 16.0.6

4 devices:
  0: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
  1: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
  2: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
  3: NVIDIA H100 (sm_90, 92.999 GiB / 93.584 GiB available)
pawel-tarasiuk-quantumz commented 1 week ago

I was able to confirm the same behavior without Julia / CUDA.jl. Roughly similar (using arbitrary, regular element positions in sparse matrix) complete example that shows difference between 2**31 - 128 and 2**31 - 128 + 1 elements is included below. It was tested on NVIDIA H100 as well, with CUDA 12.4.

CUSPARSE_STATUS_INTERNAL_ERROR status is returned by cusparseSpMM, which is likely equivalent to what J2 * x line calls in @lpawela's example.

#include <cuda.h>
#include <cusparse_v2.h>

void cudaCall(cudaError_t const &err, char const *fname, int const &line) {
    if (err != cudaSuccess) {
         printf("CUDA error at %s:%d: %s\n", fname, line, cudaGetErrorString(err));
         exit(1);
    }
}

#define CUDA_CALL(x) cudaCall((x), __FILE__, __LINE__)

char const * cusparseGetStatusName(cusparseStatus_t const &err) {
    switch (err) {
        case CUSPARSE_STATUS_SUCCESS:
            return "CUSPARSE_STATUS_SUCCESS";
        case CUSPARSE_STATUS_NOT_INITIALIZED:
            return "CUSPARSE_STATUS_NOT_INITIALIZED";
        case CUSPARSE_STATUS_ALLOC_FAILED:
            return "CUSPARSE_STATUS_ALLOC_FAILED";
        case CUSPARSE_STATUS_INVALID_VALUE:
            return "CUSPARSE_STATUS_INVALID_VALUE";
        case CUSPARSE_STATUS_ARCH_MISMATCH:
            return "CUSPARSE_STATUS_ARCH_MISMATCH";
        case CUSPARSE_STATUS_MAPPING_ERROR:
            return "CUSPARSE_STATUS_MAPPING_ERROR";
        case CUSPARSE_STATUS_EXECUTION_FAILED:
            return "CUSPARSE_STATUS_EXECUTION_FAILED";
        case CUSPARSE_STATUS_INTERNAL_ERROR:
            return "CUSPARSE_STATUS_INTERNAL_ERROR";
        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
            return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        case CUSPARSE_STATUS_ZERO_PIVOT:
            return "CUSPARSE_STATUS_ZERO_PIVOT";
        case CUSPARSE_STATUS_NOT_SUPPORTED:
            return "CUSPARSE_STATUS_NOT_SUPPORTED";
        case CUSPARSE_STATUS_INSUFFICIENT_RESOURCES:
            return "CUSPARSE_STATUS_INSUFFICIENT_RESOURCES";
    }

    return "<unknown>";
}

void cusparseCall( cusparseStatus_t const &err, char const *fname, int const &line) {
    if (err != CUSPARSE_STATUS_SUCCESS) {
         printf("CUSPARSE error at %s:%d: status %s\n", fname, line, cusparseGetStatusName(err));
         exit(1);
    }
}

#define CUSPARSE_CALL(x) cusparseCall((x), __FILE__, __LINE__)

cusparseIndexType_t const indType = CUSPARSE_INDEX_64I;
typedef int64_t idx_t;

cudaDataType const valueType = CUDA_R_32F;  // float
typedef float val_t;

void mwe(int64_t N, int64_t C) {
    printf("Init CUSPARSE\n");
    cusparseHandle_t cusparseHandle;
    CUSPARSE_CALL(cusparseCreate(&cusparseHandle));

    printf("Prepare CSR (host)\n");
    idx_t *csrRowOffsetsHost = (idx_t *)(malloc(N * sizeof(*csrRowOffsetsHost)));
    idx_t *csrColIndHost = (idx_t *)(malloc(C * sizeof(*csrColIndHost)));
    val_t *csrValuesHost  = (val_t *)(malloc(C * sizeof(*csrValuesHost)));
    if (! (csrRowOffsetsHost && csrColIndHost && csrValuesHost)) {
        perror("Host malloc failed\n");
        exit(1);
    }
    idx_t pos = 0;
    idx_t row = 0;
    size_t step = (N * N) / C;
    for (; pos < C; ++pos) {
        idx_t i = (pos * step) / N;
        idx_t j = (pos * step) % N;
        val_t v = 1.0;
        for (idx_t r = row; r <= i; ++r) {
            csrRowOffsetsHost[row++] = pos;
        }
        csrColIndHost[pos] = j;
        csrValuesHost[pos] = v;
        ++pos;
    }

    printf("Allocate CSR data (device)\n");
    idx_t *csrRowOffsetsDev; CUDA_CALL(cudaMalloc((void **)(&csrRowOffsetsDev), N * sizeof(*csrRowOffsetsDev)));
    idx_t *csrColIndDev; CUDA_CALL(cudaMalloc((void **)(&csrColIndDev), C * sizeof(*csrColIndDev)));
    val_t *csrValuesDev; CUDA_CALL(cudaMalloc((void **)(&csrValuesDev), C * sizeof(*csrValuesDev)));
    cudaDeviceSynchronize();

    printf("CSR data: host -> device\n");
    CUDA_CALL(cudaMemcpy(csrRowOffsetsDev, csrRowOffsetsHost, N * sizeof(*csrRowOffsetsDev), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(csrColIndDev, csrColIndHost, C * sizeof(*csrColIndDev), cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(csrValuesDev, csrValuesHost, C * sizeof(*csrValuesDev), cudaMemcpyHostToDevice));

    free(csrRowOffsetsHost); csrRowOffsetsHost = nullptr;
    free(csrColIndHost); csrColIndHost = nullptr;
    free(csrValuesHost); csrValuesHost = nullptr;

    printf("CUSPARSE: Create CSR descriptor\n");
    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
    cusparseSpMatDescr_t spMat;
    CUSPARSE_CALL(cusparseCreateCsr(&spMat, N, N, C, csrRowOffsetsDev, csrColIndDev, csrValuesDev, indType, indType, idxBase, valueType));

    printf("CUSPARSE: Prepare data for SpMM\n");
    val_t alpha = 1.0;
    val_t beta = 0.0;

    val_t *dnMatValues; CUDA_CALL(cudaMalloc((void **)(&dnMatValues), N * sizeof(*dnMatValues)));
    val_t *dnMatCValues; CUDA_CALL(cudaMalloc((void **)(&dnMatCValues), N * sizeof(*dnMatCValues)));
    val_t *out; CUDA_CALL(cudaMalloc((void **)(&out), N * sizeof(*out)));
    cudaDeviceSynchronize();

    cuMemsetD32(CUdeviceptr(dnMatValues), 0x3F800000, N);  // very float-specific
    CUDA_CALL(cudaMemset(dnMatCValues, 0, N * sizeof(*dnMatValues)));

    cusparseConstDnMatDescr_t dnMat;
    CUSPARSE_CALL(cusparseCreateConstDnMat(&dnMat, N, 1, N, dnMatValues, valueType, CUSPARSE_ORDER_ROW));
    cusparseDnMatDescr_t dnMatC;
    CUSPARSE_CALL(cusparseCreateDnMat(&dnMatC, N, 1, N, dnMatCValues, valueType, CUSPARSE_ORDER_ROW));

    printf("CUSPARSE: SpMM\n");
    CUSPARSE_CALL(cusparseSpMM(
        cusparseHandle,
        CUSPARSE_OPERATION_NON_TRANSPOSE,
        CUSPARSE_OPERATION_NON_TRANSPOSE,
        &alpha,
        spMat,
        dnMat,
        &beta,
        dnMatC,
        valueType,
        CUSPARSE_SPMM_CSR_ALG1,
        out
    ));

    printf("Cleanup\n");
    CUSPARSE_CALL(cusparseDestroySpMat(spMat));
    CUSPARSE_CALL(cusparseDestroyDnMat(dnMat));
    CUSPARSE_CALL(cusparseDestroyDnMat(dnMatC));

    CUSPARSE_CALL(cusparseDestroy(cusparseHandle));
}

int main() {
    // mwe(1000000, (INT64_C(1) << 31) - 128);  // this one works
    mwe(1000000, (INT64_C(1) << 31) - 128 + 1);

    return 0;
}
pawel-tarasiuk-quantumz commented 1 week ago

Reported upstream as https://developer.nvidia.com/bugs/4966852

In response, I have received confirmation that it can be reproduced and it was routed to CUSPARSE team.

maleadt commented 1 week ago

Thanks for looking into this!