JuliaGPU / CUDA.jl

CUDA programming in Julia.
https://juliagpu.org/cuda/

Ballot intrinsics should use .sync variety #711

Closed JonasIsensee closed 3 years ago

JonasIsensee commented 3 years ago

Describe the bug

Hi, I just installed CUDA.jl for the first time in a clean Julia environment on Julia v1.6-rc1, and `] test CUDA` fails.
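The failing `device/intrinsics` tests exercise the warp-vote intrinsics. As a rough illustration of what the issue title refers to, here is a minimal reproducer sketch; it is not the actual test from `test/device/intrinsics.jl`, and the kernel body plus the `vote_ballot` name are assumptions based on the exported API of this CUDA.jl version. The point is that the intrinsic lowers to the plain `vote.ballot.b32` PTX instruction, which is deprecated and no longer accepted for sm_70+ targets, so the A100's driver JIT rejects it with `ERROR_INVALID_PTX`.

```julia
# Hedged reproducer sketch (not the actual test in test/device/intrinsics.jl).
# Assumes the warp-vote intrinsic is exposed as `CUDA.vote_ballot`; it lowers
# to the non-.sync `vote.ballot.b32` PTX instruction, which the driver JIT
# rejects on sm_70+ devices.
using CUDA

function ballot_kernel(results)
    # every thread votes; lane 1 records the resulting 32-bit ballot mask
    mask = CUDA.vote_ballot(threadIdx().x % 2 == 0)
    if threadIdx().x == 1
        results[1] = mask
    end
    return
end

results = CUDA.zeros(UInt32, 1)
# On an A100 (sm_80) this fails at link time with
# "CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX)".
@cuda threads=32 ballot_kernel(results)
```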

Manifest.toml

``` [[AbstractFFTs]] deps = ["LinearAlgebra"] git-tree-sha1 = "8ed9de2f1b1a9b1dee48582ad477c6e67b83eb2c" uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" version = "1.0.0" [[Adapt]] deps = ["LinearAlgebra"] git-tree-sha1 = "ffcfa2d345aaee0ef3d8346a073d5dd03c983ebe" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" version = "3.2.0" [[ArgTools]] uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" [[ArrayInterface]] deps = ["IfElse", "LinearAlgebra", "Requires", "SparseArrays"] git-tree-sha1 = "ee07ae00e3cc277dcfa5507ce25be522313ecc3e" uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" version = "3.1.1" [[Artifacts]] uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" [[AxisAlgorithms]] deps = ["LinearAlgebra", "Random", "SparseArrays", "WoodburyMatrices"] git-tree-sha1 = "a4d07a1c313392a77042855df46c5f534076fab9" uuid = "13072b0f-2c55-5437-9ae7-d433b7a33950" version = "1.0.0" [[AxisArrays]] deps = ["Dates", "IntervalSets", "IterTools", "RangeArrays"] git-tree-sha1 = "f31f50712cbdf40ee8287f0443b57503e34122ef" uuid = "39de3d68-74b9-583c-8d2d-e117c070f3a9" version = "0.4.3" [[BFloat16s]] deps = ["LinearAlgebra", "Test"] git-tree-sha1 = "4af69e205efc343068dc8722b8dfec1ade89254a" uuid = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" version = "0.1.0" [[Base64]] uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" [[BenchmarkTools]] deps = ["JSON", "Logging", "Printf", "Statistics", "UUIDs"] git-tree-sha1 = "9e62e66db34540a0c919d72172cc2f642ac71260" uuid = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" version = "0.5.0" [[CEnum]] git-tree-sha1 = "215a9aa4a1f23fbd05b92769fdd62559488d70e9" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" version = "0.4.1" [[CUDA]] deps = ["AbstractFFTs", "Adapt", "BFloat16s", "CEnum", "CompilerSupportLibraries_jll", "DataStructures", "ExprTools", "GPUArrays", "GPUCompiler", "LLVM", "LazyArtifacts", "Libdl", "LinearAlgebra", "Logging", "MacroTools", "Memoize", "NNlib", "Printf", "Random", "Reexport", "Requires", "SparseArrays", "Statistics", "TimerOutputs"] path = "/home/isensee/.julia/dev/CUDA" uuid = "052768ef-5323-5732-b1bb-66c8b64840ba" version = "2.6.0" [[CUDATutorials]] deps = ["CUDA", "Documenter", "IJulia", "Images", "InteractiveUtils", "Literate", "Pkg"] path = "/home/isensee/.julia/dev/CUDATutorials" uuid = "8d564016-0a89-4fc6-a3a0-4211001cf922" version = "0.1.0" [[CatIndices]] deps = ["CustomUnitRanges", "OffsetArrays"] git-tree-sha1 = "a0f80a09780eed9b1d106a1bf62041c2efc995bc" uuid = "aafaddc9-749c-510e-ac4f-586e18779b91" version = "0.2.2" [[ChainRulesCore]] deps = ["Compat", "LinearAlgebra", "SparseArrays"] git-tree-sha1 = "d3d0a4e0d5bc03a6c97f4d249c8a471fc20a2f33" uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" version = "0.9.28" [[ColorTypes]] deps = ["FixedPointNumbers", "Random"] git-tree-sha1 = "4bffea7ed1a9f0f3d1a131bbcd4b925548d75288" uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" version = "0.10.9" [[ColorVectorSpace]] deps = ["ColorTypes", "Colors", "FixedPointNumbers", "LinearAlgebra", "SpecialFunctions", "Statistics", "StatsBase"] git-tree-sha1 = "4d17724e99f357bfd32afa0a9e2dda2af31a9aea" uuid = "c3611d14-8923-5661-9e6a-0046d554d3a4" version = "0.8.7" [[Colors]] deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"] git-tree-sha1 = "ac5f2213e56ed8a34a3dd2f681f4df1166b34929" uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" version = "0.12.6" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", 
"SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] git-tree-sha1 = "919c7f3151e79ff196add81d7f4e45d91bbf420b" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" version = "3.25.0" [[CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" [[ComputationalResources]] git-tree-sha1 = "52cb3ec90e8a8bea0e62e275ba577ad0f74821f7" uuid = "ed09eef8-17a6-5b46-8889-db040fac31e3" version = "0.3.2" [[Conda]] deps = ["JSON", "VersionParsing"] git-tree-sha1 = "c0647249d785f1d5139c0cc96db8f6b32f7ec416" uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d" version = "1.5.0" [[CoordinateTransformations]] deps = ["LinearAlgebra", "StaticArrays"] git-tree-sha1 = "6d1c23e740a586955645500bbec662476204a52c" uuid = "150eb455-5306-5404-9cee-2592286d6298" version = "0.6.1" [[CustomUnitRanges]] git-tree-sha1 = "537c988076d001469093945f3bd0b300b8d3a7f3" uuid = "dc8bdbbb-1ca9-579f-8c36-e416f6a65cce" version = "1.0.1" [[DataAPI]] git-tree-sha1 = "8ab70b4de35bb3b8cc19654f6b893cf5164f8ee8" uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" version = "1.5.1" [[DataStructures]] deps = ["Compat", "InteractiveUtils", "OrderedCollections"] git-tree-sha1 = "4437b64df1e0adccc3e5d1adbc3ac741095e4677" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" version = "0.18.9" [[Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" [[DelimitedFiles]] deps = ["Mmap"] uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" [[Distances]] deps = ["LinearAlgebra", "Statistics"] git-tree-sha1 = "366715149014943abd71aa647a07a43314158b2d" uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" version = "0.10.2" [[Distributed]] deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" [[DocStringExtensions]] deps = ["LibGit2", "Markdown", "Pkg", "Test"] git-tree-sha1 = "50ddf44c53698f5e784bbebb3f4b21c5807401b1" uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" version = "0.8.3" [[Documenter]] deps = ["Base64", "Dates", "DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] git-tree-sha1 = "b7715ae18be02110a8cf9cc8ed2ccdb1e3e3aba2" uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" version = "0.26.1" [[Downloads]] deps = ["ArgTools", "LibCURL", "NetworkOptions"] uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" [[EllipsisNotation]] deps = ["ArrayInterface"] git-tree-sha1 = "8041575f021cba5a099a456b4163c9a08b566a02" uuid = "da5c29d0-fa7d-589e-88eb-ea29b0a81949" version = "1.1.0" [[ExprTools]] git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" version = "0.1.3" [[FFTViews]] deps = ["CustomUnitRanges", "FFTW"] git-tree-sha1 = "70a0cfd9b1c86b0209e38fbfe6d8231fd606eeaf" uuid = "4f61f5a4-77b1-5117-aa51-3ab5ef4ef0cd" version = "0.3.1" [[FFTW]] deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"] git-tree-sha1 = "1b48dbde42f307e48685fa9213d8b9f8c0d87594" uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" version = "1.3.2" [[FFTW_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "5a0d4b6a22a34d17d53543bd124f4b08ed78e8b0" uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a" version = "3.3.9+7" [[FileIO]] deps = ["Pkg"] git-tree-sha1 = "fee8955b9dfa7bec67117ef48085fb2b559b9c22" uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" version = "1.4.5" [[FileWatching]] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" [[FixedPointNumbers]] deps = ["Statistics"] git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc" uuid = 
"53c48c17-4a7d-5ca2-90c5-79b7896eea93" version = "0.8.4" [[GPUArrays]] deps = ["AbstractFFTs", "Adapt", "LinearAlgebra", "Printf", "Random", "Serialization"] git-tree-sha1 = "f99a25fe0313121f2f9627002734c7d63b4dd3bd" uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" version = "6.2.0" [[GPUCompiler]] deps = ["DataStructures", "ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "Serialization", "TimerOutputs", "UUIDs"] git-tree-sha1 = "ef2839b063e158672583b9c09d2cf4876a8d3d55" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" version = "0.10.0" [[Graphics]] deps = ["Colors", "LinearAlgebra", "NaNMath"] git-tree-sha1 = "2c1cf4df419938ece72de17f368a021ee162762e" uuid = "a2bd30eb-e257-5431-a919-1863eab51364" version = "1.1.0" [[IJulia]] deps = ["Base64", "Conda", "Dates", "InteractiveUtils", "JSON", "Markdown", "MbedTLS", "Pkg", "Printf", "REPL", "Random", "SoftGlobalScope", "Test", "UUIDs", "ZMQ"] git-tree-sha1 = "0862f73c51b49d80168e75b141a26d1cbb9a7295" uuid = "7073ff75-c697-5162-941a-fcdaad2a7d2a" version = "1.23.1" [[IOCapture]] deps = ["Logging"] git-tree-sha1 = "377252859f740c217b936cebcd918a44f9b53b59" uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" version = "0.1.1" [[IdentityRanges]] deps = ["OffsetArrays"] git-tree-sha1 = "be8fcd695c4da16a1d6d0cd213cb88090a150e3b" uuid = "bbac6d45-d8f3-5730-bfe4-7a449cd117ca" version = "0.3.1" [[IfElse]] git-tree-sha1 = "28e837ff3e7a6c3cdb252ce49fb412c8eb3caeef" uuid = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" version = "0.1.0" [[ImageAxes]] deps = ["AxisArrays", "ImageCore", "Reexport", "SimpleTraits"] git-tree-sha1 = "1592c7fd668ac9cdcef73f704ca457ccdaac2933" uuid = "2803e5a7-5153-5ecf-9a86-9b4c37f5f5ac" version = "0.6.8" [[ImageContrastAdjustment]] deps = ["ColorVectorSpace", "ImageCore", "ImageTransformations", "Parameters"] git-tree-sha1 = "210f8fb370d4b97fa12d65322c62df06f3e5563b" uuid = "f332f351-ec65-5f6a-b3d1-319c6670881a" version = "0.3.6" [[ImageCore]] deps = ["AbstractFFTs", "Colors", "FixedPointNumbers", "Graphics", "MappedArrays", "MosaicViews", "OffsetArrays", "PaddedViews", "Reexport"] git-tree-sha1 = "79badd979fbee9b8980cd995cd5a86a9e93b8ad7" uuid = "a09fc81d-aa75-5fe9-8630-4744c3626534" version = "0.8.20" [[ImageDistances]] deps = ["ColorVectorSpace", "Distances", "ImageCore", "ImageMorphology", "LinearAlgebra", "Statistics"] git-tree-sha1 = "159e24b4313d9197eef900e97fbd7365986f2844" uuid = "51556ac3-7006-55f5-8cb3-34580c88182d" version = "0.2.10" [[ImageFiltering]] deps = ["CatIndices", "ColorVectorSpace", "ComputationalResources", "DataStructures", "FFTViews", "FFTW", "ImageCore", "ImageMetadata", "LinearAlgebra", "OffsetArrays", "Requires", "SparseArrays", "StaticArrays", "Statistics", "TiledIteration"] git-tree-sha1 = "f82a52fa2e684d4ed69028b16188852ff94b3f75" uuid = "6a3955dd-da59-5b1f-98d4-e7296123deb5" version = "0.6.19" [[ImageMetadata]] deps = ["AxisArrays", "ColorVectorSpace", "ImageAxes", "ImageCore", "IndirectArrays"] git-tree-sha1 = "ff77c7f234e7d8a618958fcf23b6959f2cbef2c6" uuid = "bc367c6b-8a6b-528e-b4bd-a4b897500b49" version = "0.9.4" [[ImageMorphology]] deps = ["ColorVectorSpace", "ImageCore", "LinearAlgebra", "TiledIteration"] git-tree-sha1 = "113df7743f1e18da5f5ea5f98eb59ceb77092734" uuid = "787d08f9-d448-5407-9aad-5290dd7ab264" version = "0.2.9" [[ImageQualityIndexes]] deps = ["ColorVectorSpace", "ImageCore", "ImageDistances", "ImageFiltering", "OffsetArrays", "Statistics"] git-tree-sha1 = "80484f9e1beae36860ed8022f195d04c751cfec6" uuid = "2996bd0c-7a13-11e9-2da2-2f5ce47296a9" version = "0.2.1" 
[[ImageShow]] deps = ["Base64", "FileIO", "ImageCore", "Requires"] git-tree-sha1 = "c9df184bc7c2e665f971079174aabb7d18f1845f" uuid = "4e3cecfd-b093-5904-9786-8bbb286a6a31" version = "0.2.3" [[ImageTransformations]] deps = ["AxisAlgorithms", "ColorVectorSpace", "CoordinateTransformations", "IdentityRanges", "ImageCore", "Interpolations", "OffsetArrays", "Rotations", "StaticArrays"] git-tree-sha1 = "0426a62ca1a23f3b1ee75cc0e47320d859abd6ae" uuid = "02fcd773-0e25-5acc-982a-7f6622650795" version = "0.8.9" [[Images]] deps = ["AxisArrays", "Base64", "ColorVectorSpace", "FileIO", "Graphics", "ImageAxes", "ImageContrastAdjustment", "ImageCore", "ImageDistances", "ImageFiltering", "ImageMetadata", "ImageMorphology", "ImageQualityIndexes", "ImageShow", "ImageTransformations", "IndirectArrays", "OffsetArrays", "Random", "Reexport", "SparseArrays", "StaticArrays", "Statistics", "StatsBase", "TiledIteration"] git-tree-sha1 = "535bcaae047f017f4fd7331ee859b75f2b27e505" uuid = "916415d5-f1e6-5110-898d-aaa5f9f070e0" version = "0.23.3" [[IndirectArrays]] git-tree-sha1 = "c2a145a145dc03a7620af1444e0264ef907bd44f" uuid = "9b13fd28-a010-5f03-acff-a1bbcff69959" version = "0.5.1" [[IntelOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "d979e54b71da82f3a65b62553da4fc3d18c9004c" uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0" version = "2018.0.3+2" [[InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[Interpolations]] deps = ["AxisAlgorithms", "LinearAlgebra", "OffsetArrays", "Random", "Ratios", "SharedArrays", "SparseArrays", "StaticArrays", "WoodburyMatrices"] git-tree-sha1 = "eb1dd6d5b2275faaaa18533e0fc5f9171cec25fa" uuid = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" version = "0.13.1" [[IntervalSets]] deps = ["Dates", "EllipsisNotation", "Statistics"] git-tree-sha1 = "93a6d78525feb0d3ee2a2ae83a7d04db1db5663f" uuid = "8197267c-284f-5f27-9208-e0e47529a953" version = "0.5.2" [[IterTools]] git-tree-sha1 = "05110a2ab1fc5f932622ffea2a003221f4782c18" uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" version = "1.3.0" [[JLLWrappers]] git-tree-sha1 = "a431f5f2ca3f4feef3bd7a5e94b8b8d4f2f647a0" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" version = "1.2.0" [[JSON]] deps = ["Dates", "Mmap", "Parsers", "Unicode"] git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4" uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" version = "0.21.1" [[LLVM]] deps = ["CEnum", "Libdl", "Printf", "Unicode"] git-tree-sha1 = "b616937c31337576360cb9fb872ec7633af7b194" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" version = "3.6.0" [[LazyArtifacts]] deps = ["Artifacts", "Pkg"] uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" [[LibCURL]] deps = ["LibCURL_jll", "MozillaCACerts_jll"] uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" [[LibCURL_jll]] deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" [[LibGit2]] deps = ["Base64", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[LibSSH2_jll]] deps = ["Artifacts", "Libdl", "MbedTLS_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" [[Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" [[LinearAlgebra]] deps = ["Libdl"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" [[Literate]] deps = ["Base64", "JSON", "REPL"] git-tree-sha1 = "32b517d4d8219d3bbab199de3416ace45010bdb3" uuid = "98b081ad-f1c9-55d3-8b20-4c87d4299306" version = "2.8.0" [[Logging]] uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" [[MKL_jll]] deps = ["Artifacts", 
"IntelOpenMP_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "Pkg"] git-tree-sha1 = "c253236b0ed414624b083e6b72bfe891fbd2c7af" uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7" version = "2021.1.1+1" [[MacroTools]] deps = ["Markdown", "Random"] git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0" uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" version = "0.5.6" [[MappedArrays]] deps = ["FixedPointNumbers"] git-tree-sha1 = "b92bd220c95a8bbe89af28f11201fd080e0e3fe7" uuid = "dbb5928d-eab1-5f90-85c2-b9b0edb7c900" version = "0.3.0" [[Markdown]] deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" [[MbedTLS]] deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"] git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe" uuid = "739be429-bea8-5141-9913-cc70e7f3736d" version = "1.0.3" [[MbedTLS_jll]] deps = ["Artifacts", "Libdl"] uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" [[Memoize]] deps = ["MacroTools"] git-tree-sha1 = "2b1dfcba103de714d31c033b5dacc2e4a12c7caa" uuid = "c03570c3-d221-55d1-a50c-7939bbd78826" version = "0.4.4" [[Missings]] deps = ["DataAPI"] git-tree-sha1 = "f8c673ccc215eb50fcadb285f522420e29e69e1c" uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" version = "0.4.5" [[Mmap]] uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[MosaicViews]] deps = ["MappedArrays", "OffsetArrays", "PaddedViews"] git-tree-sha1 = "614e8d77264d20c1db83661daadfab38e8e4b77e" uuid = "e94cdb99-869f-56ef-bcf0-1ae2bcbe0389" version = "0.2.4" [[MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" [[NNlib]] deps = ["ChainRulesCore", "Compat", "LinearAlgebra", "Pkg", "Requires", "Statistics"] git-tree-sha1 = "df42d0816edfc24f5b82a728f46381613c4dff79" uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" version = "0.7.14" [[NaNMath]] git-tree-sha1 = "bfe47e760d60b82b66b61d2d44128b62e3a369fb" uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" version = "0.3.5" [[NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" [[OffsetArrays]] deps = ["Adapt"] git-tree-sha1 = "76622f08645764e040b4d7e86d0ff471fd126ae4" uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" version = "1.5.3" [[OpenSpecFun_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] git-tree-sha1 = "9db77584158d0ab52307f8c04f8e7c08ca76b5b3" uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" version = "0.5.3+4" [[OrderedCollections]] git-tree-sha1 = "d45739abcfc03b51f6a42712894a593f74c80a23" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" version = "1.3.3" [[PaddedViews]] deps = ["OffsetArrays"] git-tree-sha1 = "0fa5e78929aebc3f6b56e1a88cf505bb00a354c4" uuid = "5432bcbf-9aad-5242-b902-cca2824c8663" version = "0.5.8" [[Parameters]] deps = ["OrderedCollections", "UnPack"] git-tree-sha1 = "2276ac65f1e236e0a6ea70baff3f62ad4c625345" uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" version = "0.12.2" [[Parsers]] deps = ["Dates"] git-tree-sha1 = "50c9a9ed8c714945e01cd53a21007ed3865ed714" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" version = "1.0.15" [[Pkg]] deps = ["Artifacts", "Dates", "Downloads", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] deps = ["Unicode"] uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" [[REPL]] deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" [[Random]] deps = ["Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [[RangeArrays]] git-tree-sha1 = "b9039e93773ddcfc828f12aadf7115b4b4d225f5" uuid = 
"b3c3ace0-ae52-54e7-9d0b-2c1406fd6b9d" version = "0.3.2" [[Ratios]] git-tree-sha1 = "37d210f612d70f3f7d57d488cb3b6eff56ad4e41" uuid = "c84ed2f1-dad5-54f0-aa8e-dbefe2724439" version = "0.4.0" [[Reexport]] git-tree-sha1 = "57d8440b0c7d98fc4f889e478e80f268d534c9d5" uuid = "189a3867-3050-52da-a836-e630ba90ab69" version = "1.0.0" [[Requires]] deps = ["UUIDs"] git-tree-sha1 = "cfbac6c1ed70c002ec6361e7fd334f02820d6419" uuid = "ae029012-a4dd-5104-9daa-d747884805df" version = "1.1.2" [[Rotations]] deps = ["LinearAlgebra", "StaticArrays", "Statistics"] git-tree-sha1 = "2ed8d8a16d703f900168822d83699b8c3c1a5cd8" uuid = "6038ab10-8711-5258-84ad-4b1120ba62dc" version = "1.0.2" [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" [[Scratch]] deps = ["Dates"] git-tree-sha1 = "ad4b278adb62d185bbcb6864dc24959ab0627bf6" uuid = "6c6a2e73-6563-6170-7368-637461726353" version = "1.0.3" [[Serialization]] uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" [[SharedArrays]] deps = ["Distributed", "Mmap", "Random", "Serialization"] uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" [[SimpleTraits]] deps = ["InteractiveUtils", "MacroTools"] git-tree-sha1 = "daf7aec3fe3acb2131388f93a4c409b8c7f62226" uuid = "699a6c99-e7fa-54fc-8d76-47d257e15c1d" version = "0.9.3" [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" [[SoftGlobalScope]] deps = ["REPL"] git-tree-sha1 = "986ec2b6162ccb95de5892ed17832f95badf770c" uuid = "b85f4697-e234-5449-a836-ec8e2f98b302" version = "1.1.0" [[SortingAlgorithms]] deps = ["DataStructures", "Random", "Test"] git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" version = "0.3.1" [[SparseArrays]] deps = ["LinearAlgebra", "Random"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[SpecialFunctions]] deps = ["ChainRulesCore", "OpenSpecFun_jll"] git-tree-sha1 = "75394dbe2bd346beeed750fb02baa6445487b862" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" version = "1.2.1" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] git-tree-sha1 = "9da72ed50e94dbff92036da395275ed114e04d49" uuid = "90137ffa-7385-5640-81b9-e52037218182" version = "1.0.1" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [[StatsBase]] deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] git-tree-sha1 = "7bab7d4eb46b225b35179632852b595a3162cb61" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" version = "0.33.2" [[TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" [[Tar]] deps = ["ArgTools", "SHA"] uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" [[Test]] deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[TiledIteration]] deps = ["OffsetArrays"] git-tree-sha1 = "05f74c5b3c00d5336bc109416df2df907e3bd91d" uuid = "06e1c1a7-607b-532d-9fad-de7d9aa2abac" version = "0.2.5" [[TimerOutputs]] deps = ["Printf"] git-tree-sha1 = "3318281dd4121ecf9713ce1383b9ace7d7476fdd" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" version = "0.5.7" [[UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [[UnPack]] git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" version = "1.0.2" [[Unicode]] uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [[VersionParsing]] git-tree-sha1 = "80229be1f670524750d905f8fc8148e5a8c4537f" uuid = "81def892-9a0e-5fdd-b105-ffc91e053289" version = "1.2.0" [[WoodburyMatrices]] deps = 
["LinearAlgebra", "SparseArrays"] git-tree-sha1 = "59e2ad8fd1591ea019a5259bd012d7aee15f995c" uuid = "efce3f68-66dc-5838-9240-27a6d6f5f9b6" version = "0.5.3" [[ZMQ]] deps = ["FileWatching", "Sockets", "ZeroMQ_jll"] git-tree-sha1 = "fc68e8a3719166950a0f3e390a14c7302c48f8de" uuid = "c2297ded-f4af-51ae-bb23-16f91089e4e1" version = "1.2.1" [[ZeroMQ_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "libsodium_jll"] git-tree-sha1 = "74a74a3896b63980734cc876da8a103454559fe8" uuid = "8f1865be-045e-5c20-9c9f-bfbfb0764568" version = "4.3.2+6" [[Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" [[libsodium_jll]] deps = ["Libdl", "Pkg"] git-tree-sha1 = "7127f5f40332ccfa43ee07dcd0c4d81a27d9bb23" uuid = "a9144af2-ca23-56d9-984f-0d03f7b5ccf8" version = "1.0.18+1" [[nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" ```

Version info

Details on Julia:

```
julia> versioninfo()
Julia Version 1.6.0-rc1
Commit a58bdd9010 (2021-02-06 15:49 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: Intel(R) Xeon(R) Platinum 8268 CPU @ 2.90GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-11.0.1 (ORCJIT, cascadelake)
```

Details on CUDA:

```
julia> CUDA.versioninfo()
CUDA toolkit 11.2.0, artifact installation
CUDA driver 11.2.0
NVIDIA driver 460.32.3

Libraries:
- CUBLAS: 11.3.1
- CURAND: 10.2.3
- CUFFT: 10.4.0
- CUSOLVER: 11.0.2
- CUSPARSE: 11.3.1
- CUPTI: 14.0.0
- NVML: 11.0.0+460.32.3
- CUDNN: 8.10.0 (for CUDA 11.2.0)
- CUTENSOR: 1.2.2 (for CUDA 11.1.0)

Toolchain:
- Julia: 1.6.0-rc1
- LLVM: 11.0.1
- PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0
- Device support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80

2 devices:
  0: A100-PCIE-40GB (sm_80, 38.916 GiB / 39.586 GiB available)
  1: A100-PCIE-40GB (sm_80, 39.583 GiB / 39.586 GiB available)
```
`] test CUDA`

``` (@v1.6) pkg> test CUDA Testing CUDA Status `/tmp/jl_PxOBCI/Project.toml` [79e6a3ab] Adapt v3.2.0 [ab4f0b2a] BFloat16s v0.1.0 [052768ef] CUDA v2.6.0 `~/.julia/dev/CUDA` [864edb3b] DataStructures v0.18.9 [7a1cc6ca] FFTW v1.3.2 [1a297f60] FillArrays v0.11.2 [f6369f11] ForwardDiff v0.10.16 [0c68f7d7] GPUArrays v6.2.0 [a98d9a8b] Interpolations v0.13.1 [872c559c] NNlib v0.7.14 [ade2ca70] Dates `@stdlib/Dates` [8ba89e20] Distributed `@stdlib/Distributed` [37e2e46d] LinearAlgebra `@stdlib/LinearAlgebra` [de0858da] Printf `@stdlib/Printf` [3fa0cd96] REPL `@stdlib/REPL` [9a3f8284] Random `@stdlib/Random` [2f01184e] SparseArrays `@stdlib/SparseArrays` [10745b16] Statistics `@stdlib/Statistics` [8dfed614] Test `@stdlib/Test` Status `/tmp/jl_PxOBCI/Manifest.toml` [621f4979] AbstractFFTs v1.0.0 [79e6a3ab] Adapt v3.2.0 [13072b0f] AxisAlgorithms v1.0.0 [ab4f0b2a] BFloat16s v0.1.0 [fa961155] CEnum v0.4.1 [052768ef] CUDA v2.6.0 `~/.julia/dev/CUDA` [d360d2e6] ChainRulesCore v0.9.28 [bbf7d656] CommonSubexpressions v0.3.0 [34da2185] Compat v3.25.0 [864edb3b] DataStructures v0.18.9 [163ba53b] DiffResults v1.0.3 [b552c78f] DiffRules v1.0.2 [e2ba6199] ExprTools v0.1.3 [7a1cc6ca] FFTW v1.3.2 [1a297f60] FillArrays v0.11.2 [f6369f11] ForwardDiff v0.10.16 [0c68f7d7] GPUArrays v6.2.0 [61eb1bfa] GPUCompiler v0.10.0 [a98d9a8b] Interpolations v0.13.1 [692b3bcd] JLLWrappers v1.2.0 [929cbde3] LLVM v3.6.0 [1914dd2f] MacroTools v0.5.6 [c03570c3] Memoize v0.4.4 [872c559c] NNlib v0.7.14 [77ba4419] NaNMath v0.3.5 [6fe1bfb0] OffsetArrays v1.5.3 [bac558e1] OrderedCollections v1.3.3 [c84ed2f1] Ratios v0.4.0 [189a3867] Reexport v1.0.0 [ae029012] Requires v1.1.2 [6c6a2e73] Scratch v1.0.3 [276daf66] SpecialFunctions v1.2.1 [90137ffa] StaticArrays v1.0.1 [a759f4b9] TimerOutputs v0.5.7 [efce3f68] WoodburyMatrices v0.5.3 [f5851436] FFTW_jll v3.3.9+7 [1d5cc7b8] IntelOpenMP_jll v2018.0.3+2 [856f044c] MKL_jll v2021.1.1+1 [efe28fd5] OpenSpecFun_jll v0.5.3+4 [0dad84c5] ArgTools `@stdlib/ArgTools` [56f22d72] Artifacts `@stdlib/Artifacts` [2a0f44e3] Base64 `@stdlib/Base64` [ade2ca70] Dates `@stdlib/Dates` [8bb1440f] DelimitedFiles `@stdlib/DelimitedFiles` [8ba89e20] Distributed `@stdlib/Distributed` [f43a241f] Downloads `@stdlib/Downloads` [b77e0a4c] InteractiveUtils `@stdlib/InteractiveUtils` [4af54fe1] LazyArtifacts `@stdlib/LazyArtifacts` [b27032c2] LibCURL `@stdlib/LibCURL` [76f85450] LibGit2 `@stdlib/LibGit2` [8f399da3] Libdl `@stdlib/Libdl` [37e2e46d] LinearAlgebra `@stdlib/LinearAlgebra` [56ddb016] Logging `@stdlib/Logging` [d6f4376e] Markdown `@stdlib/Markdown` [a63ad114] Mmap `@stdlib/Mmap` [ca575930] NetworkOptions `@stdlib/NetworkOptions` [44cfe95a] Pkg `@stdlib/Pkg` [de0858da] Printf `@stdlib/Printf` [3fa0cd96] REPL `@stdlib/REPL` [9a3f8284] Random `@stdlib/Random` [ea8e919c] SHA `@stdlib/SHA` [9e88b42a] Serialization `@stdlib/Serialization` [1a1011a3] SharedArrays `@stdlib/SharedArrays` [6462fe0b] Sockets `@stdlib/Sockets` [2f01184e] SparseArrays `@stdlib/SparseArrays` [10745b16] Statistics `@stdlib/Statistics` [fa267f1f] TOML `@stdlib/TOML` [a4e569a6] Tar `@stdlib/Tar` [8dfed614] Test `@stdlib/Test` [cf7118a7] UUIDs `@stdlib/UUIDs` [4ec0a83e] Unicode `@stdlib/Unicode` [e66e0078] CompilerSupportLibraries_jll `@stdlib/CompilerSupportLibraries_jll` [deac9b47] LibCURL_jll `@stdlib/LibCURL_jll` [29816b5a] LibSSH2_jll `@stdlib/LibSSH2_jll` [c8ffd9c3] MbedTLS_jll `@stdlib/MbedTLS_jll` [14a3606d] MozillaCACerts_jll `@stdlib/MozillaCACerts_jll` [83775a58] Zlib_jll `@stdlib/Zlib_jll` [8e850ede] nghttp2_jll `@stdlib/nghttp2_jll` Testing 
Running tests... ┌ Info: System information: │ CUDA toolkit 11.2.0, artifact installation │ CUDA driver 11.2.0 │ NVIDIA driver 460.32.3 │ │ Libraries: │ - CUBLAS: 11.3.1 │ - CURAND: 10.2.3 │ - CUFFT: 10.4.0 │ - CUSOLVER: 11.0.2 │ - CUSPARSE: 11.3.1 │ - CUPTI: 14.0.0 │ - NVML: 11.0.0+460.32.3 │ - CUDNN: 8.10.0 (for CUDA 11.2.0) │ - CUTENSOR: 1.2.2 (for CUDA 11.1.0) │ │ Toolchain: │ - Julia: 1.6.0-rc1 │ - LLVM: 11.0.1 │ - PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0 │ - Device support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80 │ │ 2 devices: │ 0: A100-PCIE-40GB (sm_80, 38.249 GiB / 39.586 GiB available) └ 1: A100-PCIE-40GB (sm_80, 39.583 GiB / 39.586 GiB available) [ Info: Testing using 1 device(s): 2. A100-PCIE-40GB (UUID 541fcd36-c901-0c21-1712-9456bc3e0548) | | ---------------- GPU ---------------- | ---------------- CPU ---------------- | Test (Worker) | Time (s) | GC (s) | GC % | Alloc (MB) | RSS (MB) | GC (s) | GC % | Alloc (MB) | RSS (MB) | initialization (2) | 2.49 | 0.00 | 0.0 | 0.00 | 413.00 | 0.04 | 1.5 | 153.64 | 1457.43 | apiutils (2) | 0.18 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 1.81 | 1457.43 | array (2) | 132.70 | 0.04 | 0.0 | 5.29 | 487.00 | 6.68 | 5.0 | 14060.00 | 1457.43 | broadcast (2) | 42.97 | 0.00 | 0.0 | 0.00 | 465.00 | 1.65 | 3.8 | 3786.76 | 1457.43 | codegen (2) | 12.05 | 0.00 | 0.0 | 0.00 | 659.00 | 0.52 | 4.3 | 1230.63 | 1457.43 | cublas (2) | 112.56 | 0.02 | 0.0 | 14.50 | 753.00 | 6.22 | 5.5 | 14173.74 | 2747.73 | cufft (2) | 25.49 | 0.01 | 0.0 | 151.26 | 631.00 | 1.20 | 4.7 | 2609.79 | 2897.05 | curand (2) | 0.09 | 0.00 | 0.0 | 0.00 | 421.00 | 0.00 | 0.0 | 6.49 | 2897.05 | cusolver (2) | 85.58 | 0.04 | 0.0 | 1294.29 | 1113.00 | 4.32 | 5.0 | 10129.60 | 3967.79 | cusparse (2) | 35.48 | 0.01 | 0.0 | 8.45 | 615.00 | 1.19 | 3.4 | 2632.25 | 4022.89 | examples (2) | 142.28 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 29.09 | 4022.89 | exceptions (2) | 86.58 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 38.95 | 4022.89 | From worker 2: WARNING: Method definition #5848#kernel(Any) in module Main at /home/isensee/.julia/dev/CUDA/test/execution.jl:316 overwritten at /home/isensee/.julia/dev/CUDA/test/execution.jl:324. 
execution (2) | 88.67 | 0.00 | 0.0 | 1.28 | 697.00 | 4.58 | 5.2 | 8970.64 | 4088.87 | forwarddiff (2) | 111.86 | 0.00 | 0.0 | 0.00 | 469.00 | 3.37 | 3.0 | 7338.76 | 4088.87 | iterator (2) | 2.63 | 0.00 | 0.0 | 1.16 | 415.00 | 0.23 | 8.8 | 375.89 | 4088.87 | nnlib (2) | 17.92 | 0.00 | 0.0 | 0.01 | 803.00 | 0.91 | 5.1 | 1892.80 | 4105.64 | nvml (2) | 0.39 | 0.00 | 0.0 | 0.00 | 413.00 | 0.06 | 15.6 | 27.80 | 4105.64 | nvtx (2) | 0.21 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 23.39 | 4105.64 | pointer (2) | 0.26 | 0.00 | 0.1 | 0.00 | 415.00 | 0.00 | 0.0 | 13.50 | 4105.64 | pool (2) | 3.57 | 0.00 | 0.0 | 0.00 | 413.00 | 1.15 | 32.2 | 561.82 | 4105.64 | random (2) | 14.80 | 0.00 | 0.0 | 0.02 | 465.00 | 0.61 | 4.1 | 1232.32 | 4105.64 | sorting (2) | 77.09 | 0.01 | 0.0 | 259.47 | 6353.00 | 3.95 | 5.1 | 9818.27 | 4294.57 | statistics (2) | 20.42 | 0.00 | 0.0 | 0.00 | 463.00 | 0.81 | 4.0 | 1786.25 | 4294.57 | texture (2) | 70.35 | 0.00 | 0.0 | 0.09 | 473.00 | 3.24 | 4.6 | 7007.08 | 4294.57 | threading (2) | 3.42 | 0.00 | 0.1 | 10.94 | 739.00 | 0.16 | 4.7 | 304.46 | 4622.27 | utils (2) | 0.91 | 0.00 | 0.0 | 0.00 | 475.00 | 0.00 | 0.0 | 88.99 | 4622.43 | cudadrv/context (2) | 0.52 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 6.26 | 4778.20 | cudadrv/devices (2) | 0.38 | 0.00 | 0.0 | 0.00 | 413.00 | 0.09 | 22.7 | 27.96 | 4778.20 | cudadrv/errors (2) | 0.12 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 10.67 | 4778.20 | cudadrv/events (2) | 0.11 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 4.63 | 4778.20 | cudadrv/execution (2) | 0.68 | 0.00 | 0.0 | 0.00 | 415.00 | 0.00 | 0.0 | 49.14 | 4778.20 | cudadrv/memory (2) | 1.43 | 0.00 | 0.0 | 0.00 | 415.00 | 0.08 | 5.5 | 93.57 | 4778.20 | cudadrv/module (2) | 0.29 | 0.00 | 0.0 | 0.00 | 415.00 | 0.00 | 0.0 | 29.88 | 4778.20 | cudadrv/occupancy (2) | 0.12 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 6.46 | 4778.20 | cudadrv/profile (2) | 0.28 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 53.04 | 4778.20 | cudadrv/stream (2) | 0.09 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 5.44 | 4778.20 | cudadrv/version (2) | 0.01 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 0.07 | 4778.20 | cudnn/activation (2) | 1.96 | 0.00 | 0.0 | 0.00 | 677.00 | 0.09 | 4.5 | 173.11 | 4863.32 | cudnn/convolution (2) | 10.31 | 0.00 | 0.0 | 8.07 | 1035.00 | 0.48 | 4.6 | 1065.38 | 6042.48 | ┌ Warning: CUDA.CUDNN.cudnnDropoutSeed[] >= 0: dropout operations will be deterministic but 40x more expensive └ @ CUDA.CUDNN ~/.julia/dev/CUDA/lib/cudnn/dropout.jl:40 cudnn/dropout (2) | 2.83 | 0.00 | 0.0 | 5.08 | 733.00 | 0.10 | 3.6 | 136.94 | 6101.38 | cudnn/inplace (2) | 1.02 | 0.00 | 0.0 | 0.01 | 681.00 | 0.00 | 0.0 | 53.76 | 6101.38 | cudnn/multiheadattn (2) | 14.92 | 0.00 | 0.0 | 0.14 | 843.00 | 0.64 | 4.3 | 1377.51 | 6101.38 | ┌ Warning: ∇softmax(dy,x) should be deprecated, please use ∇softmax(dy,x,y) └ @ CUDA.CUDNN ~/.julia/dev/CUDA/lib/cudnn/nnlib.jl:44 ┌ Warning: ∇logsoftmax(dy,x) should be deprecated, please use ∇logsoftmax(dy,x,y) └ @ CUDA.CUDNN ~/.julia/dev/CUDA/lib/cudnn/nnlib.jl:54 cudnn/nnlib (2) | 99.09 | 0.01 | 0.0 | 29.77 | 1131.00 | 4.75 | 4.8 | 10621.38 | 6766.15 | cudnn/normalization (2) | 19.33 | 0.00 | 0.0 | 0.11 | 801.00 | 0.72 | 3.7 | 1423.72 | 6912.87 | cudnn/optensor (2) | 1.54 | 0.00 | 0.0 | 0.00 | 677.00 | 0.10 | 6.5 | 135.45 | 6912.87 | cudnn/pooling (2) | 5.13 | 0.00 | 0.0 | 0.06 | 677.00 | 0.28 | 5.5 | 532.68 | 6912.87 | cudnn/reduce (2) | 2.65 | 0.00 | 0.0 | 0.02 | 677.00 | 0.11 | 4.0 | 299.10 | 6912.87 | cudnn/rnn (2) | 8.69 | 0.00 | 0.0 | 5.81 | 851.00 | 0.46 | 5.3 | 1010.02 
| 6912.87 | cudnn/softmax (2) | 1.67 | 0.00 | 0.0 | 0.01 | 677.00 | 0.09 | 5.5 | 112.22 | 6912.87 | cudnn/tensor (2) | 1.01 | 0.00 | 0.0 | 0.00 | 421.00 | 0.68 | 68.0 | 22.76 | 6912.87 | cusolver/cusparse (2) | 8.47 | 0.00 | 0.0 | 0.19 | 885.00 | 0.23 | 2.7 | 543.51 | 6912.87 | cusparse/interfaces (2) | 17.29 | 0.00 | 0.0 | 0.24 | 551.00 | 0.68 | 3.9 | 1485.97 | 6912.87 | cutensor/base (2) | 0.12 | 0.00 | 0.2 | 1.11 | 415.00 | 0.00 | 0.0 | 11.47 | 6912.87 | cutensor/contractions (2) | 47.89 | 0.02 | 0.0 | 32033.95 | 837.00 | 2.95 | 6.2 | 7029.74 | 6912.87 | cutensor/elementwise_binary (2) | 21.35 | 0.01 | 0.0 | 54.99 | 573.00 | 0.93 | 4.4 | 2669.10 | 6912.87 | cutensor/elementwise_trinary (2) | 27.20 | 0.00 | 0.0 | 24.44 | 533.00 | 1.25 | 4.6 | 3625.92 | 6912.87 | cutensor/permutations (2) | 3.57 | 0.00 | 0.1 | 12.22 | 513.00 | 0.15 | 4.1 | 494.71 | 6912.87 | cutensor/reductions (2) | 15.02 | 0.00 | 0.0 | 41.72 | 503.00 | 0.64 | 4.2 | 1571.97 | 6912.87 | device/array (2) | 3.85 | 0.00 | 0.0 | 0.00 | 463.00 | 0.20 | 5.1 | 378.14 | 6912.87 | device/intrinsics (2) | failed at 2021-02-12T13:39:35.215 device/ldg (3) | 26.72 | 0.06 | 0.2 | 0.00 | 463.00 | 0.92 | 3.4 | 2668.76 | 1493.88 | device/wmma (3) | 76.65 | 0.00 | 0.0 | 0.38 | 465.00 | 2.41 | 3.1 | 7923.70 | 1493.88 | gpuarrays/math (3) | 5.38 | 0.00 | 0.0 | 0.00 | 463.00 | 0.28 | 5.3 | 828.95 | 1493.88 | gpuarrays/indexing scalar (3) | 15.15 | 0.00 | 0.0 | 0.00 | 463.00 | 0.75 | 4.9 | 1716.44 | 1493.88 | gpuarrays/input output (3) | 1.74 | 0.00 | 0.0 | 0.00 | 415.00 | 0.08 | 4.4 | 193.43 | 1493.88 | gpuarrays/value constructors (3) | 19.77 | 0.00 | 0.0 | 0.00 | 465.00 | 0.98 | 5.0 | 1999.22 | 1493.88 | gpuarrays/indexing multidimensional (3) | 41.44 | 0.00 | 0.0 | 0.69 | 465.00 | 1.92 | 4.6 | 4605.96 | 1493.88 | gpuarrays/interface (3) | 7.26 | 0.00 | 0.0 | 0.00 | 463.00 | 0.35 | 4.8 | 749.15 | 1493.88 | gpuarrays/iterator constructors (3) | 3.14 | 0.00 | 0.0 | 0.02 | 463.00 | 0.11 | 3.5 | 258.27 | 1493.88 | gpuarrays/uniformscaling (3) | 15.15 | 0.00 | 0.0 | 0.01 | 463.00 | 0.62 | 4.1 | 1411.81 | 1493.88 | gpuarrays/linear algebra (3) | 164.27 | 0.01 | 0.0 | 1.24 | 759.00 | 6.55 | 4.0 | 17657.14 | 2358.24 | gpuarrays/conversions (3) | 3.09 | 0.00 | 0.0 | 0.01 | 415.00 | 0.17 | 5.7 | 397.65 | 2358.24 | gpuarrays/constructors (3) | 1.53 | 0.00 | 0.1 | 0.03 | 415.00 | 0.04 | 2.4 | 99.54 | 2358.24 | gpuarrays/random (3) | 35.20 | 0.00 | 0.0 | 0.03 | 467.00 | 1.35 | 3.8 | 3154.19 | 2358.24 | gpuarrays/base (3) | 35.72 | 0.00 | 0.0 | 17.44 | 497.00 | 2.06 | 5.8 | 4047.80 | 2365.50 | gpuarrays/mapreduce essentials (3) | 192.99 | 0.01 | 0.0 | 3.19 | 489.00 | 8.49 | 4.4 | 18717.08 | 2365.50 | gpuarrays/broadcasting (3) | 141.36 | 0.00 | 0.0 | 1.19 | 481.00 | 7.24 | 5.1 | 14564.80 | 2413.45 | gpuarrays/mapreduce derivatives (3) | 336.98 | 0.01 | 0.0 | 3.06 | 509.00 | 12.48 | 3.7 | 26924.39 | 3406.84 | Worker 2 failed running test device/intrinsics: Some tests did not pass: 256 passed, 0 failed, 3 errored, 0 broken. 
device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:836 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) Stacktrace: [1] throw_api_error(res::CUDA.cudaError_enum) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/error.jl:85 [2] macro expansion @ ~/.julia/dev/CUDA/lib/cudadrv/error.jl:92 [inlined] [3] cuLinkAddData_v2(state::CuLink, type::CUDA.CUjitInputType_enum, data::Ptr{Int8}, size::Int64, name::String, numOptions::Int64, options::Ptr{Nothing}, optionValues::Ptr{Nothing}) @ CUDA ~/.julia/dev/CUDA/lib/utils/call.jl:26 [4] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:74 [5] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [6] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [7] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:840 [inlined] [8] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#kernel#976", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [9] cufunction(f::var"#kernel#976", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [10] cufunction(f::var"#kernel#976", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [11] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [12] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:849 [inlined] [13] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [14] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:837 [inlined] [15] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [16] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [17] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [18] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [19] include(fname::String) @ Base.MainInclude ./client.jl:444 [20] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [22] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [23] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [24] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [25] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [26] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [27] eval @ ./boot.jl:360 [inlined] [28] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [29] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed 
/buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [30] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [31] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [32] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:854 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) Stacktrace: [1] throw_api_error(res::CUDA.cudaError_enum) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/error.jl:85 [2] macro expansion @ ~/.julia/dev/CUDA/lib/cudadrv/error.jl:92 [inlined] [3] cuLinkAddData_v2(state::CuLink, type::CUDA.CUjitInputType_enum, data::Ptr{Int8}, size::Int64, name::String, numOptions::Int64, options::Ptr{Nothing}, optionValues::Ptr{Nothing}) @ CUDA ~/.julia/dev/CUDA/lib/utils/call.jl:26 [4] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:74 [5] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [6] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [7] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:858 [inlined] [8] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#9840#kernel#977", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [9] cufunction(f::var"#9840#kernel#977", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [10] cufunction(f::var"#9840#kernel#977", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [11] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [12] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:865 [inlined] [13] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [14] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:855 [inlined] [15] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [16] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [17] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [18] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [19] include(fname::String) @ Base.MainInclude ./client.jl:444 [20] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [22] macro expansion @ 
/buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [23] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [24] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [25] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [26] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [27] eval @ ./boot.jl:360 [inlined] [28] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [29] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [30] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [31] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [32] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:875 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) Stacktrace: [1] throw_api_error(res::CUDA.cudaError_enum) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/error.jl:85 [2] macro expansion @ ~/.julia/dev/CUDA/lib/cudadrv/error.jl:92 [inlined] [3] cuLinkAddData_v2(state::CuLink, type::CUDA.CUjitInputType_enum, data::Ptr{Int8}, size::Int64, name::String, numOptions::Int64, options::Ptr{Nothing}, optionValues::Ptr{Nothing}) @ CUDA ~/.julia/dev/CUDA/lib/utils/call.jl:26 [4] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:74 [5] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [6] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [7] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:879 [inlined] [8] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#9842#kernel#978", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [9] cufunction(f::var"#9842#kernel#978", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [10] cufunction(f::var"#9842#kernel#978", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [11] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [12] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:886 [inlined] [13] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [14] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:876 [inlined] [15] macro expansion @ 
/buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [16] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [17] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [18] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [19] include(fname::String) @ Base.MainInclude ./client.jl:444 [20] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [22] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [23] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [24] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [25] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [26] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [27] eval @ ./boot.jl:360 [inlined] [28] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [29] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [30] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [31] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [32] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 Test Summary: | Pass Error Broken Total Overall | 13997 3 5 14005 initialization | 34 34 apiutils | 9 9 array | 205 205 broadcast | 29 29 codegen | 10 10 cublas | 2239 2239 cufft | 175 175 curand | 1 1 cusolver | 1580 1580 cusparse | 822 822 examples | 7 7 exceptions | 17 17 execution | 69 69 forwarddiff | 107 107 iterator | 30 30 nnlib | 21 21 nvml | 7 7 nvtx | No tests pointer | 35 35 pool | 10 10 random | 101 101 sorting | 123 123 statistics | 18 18 texture | 38 4 42 threading | No tests utils | 5 5 cudadrv/context | 12 12 cudadrv/devices | 6 6 cudadrv/errors | 6 6 cudadrv/events | 4 4 cudadrv/execution | 15 15 cudadrv/memory | 48 1 49 cudadrv/module | 12 12 cudadrv/occupancy | 1 1 cudadrv/profile | 2 2 cudadrv/stream | 7 7 cudadrv/version | 3 3 cudnn/activation | 13 13 cudnn/convolution | 29 29 cudnn/dropout | 7 7 cudnn/inplace | 12 12 cudnn/multiheadattn | 23 23 cudnn/nnlib | 233 233 cudnn/normalization | 16 16 cudnn/optensor | 13 13 cudnn/pooling | 12 12 cudnn/reduce | 15 15 cudnn/rnn | 28 28 cudnn/softmax | 8 8 cudnn/tensor | 10 10 cusolver/cusparse | 84 84 cusparse/interfaces | 84 84 cutensor/base | 8 8 cutensor/contractions | 3321 3321 cutensor/elementwise_binary | 260 260 cutensor/elementwise_trinary | 340 340 cutensor/permutations | 80 80 cutensor/reductions | 280 280 device/array | 18 18 device/intrinsics | 256 3 259 device/ldg | 22 22 device/wmma | 210 210 gpuarrays/math | 8 8 gpuarrays/indexing scalar | 249 249 gpuarrays/input output | 5 5 gpuarrays/value constructors | 36 36 gpuarrays/indexing multidimensional | 34 34 gpuarrays/interface | 7 7 gpuarrays/iterator constructors | 24 24 gpuarrays/uniformscaling | 56 56 gpuarrays/linear algebra | 389 389 gpuarrays/conversions | 72 72 
gpuarrays/constructors | 335 335 gpuarrays/random | 46 46 gpuarrays/base | 42 42 gpuarrays/mapreduce essentials | 522 522 gpuarrays/broadcasting | 155 155 gpuarrays/mapreduce derivatives | 827 827 FAILURE Error in testset device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:836 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) Stacktrace: [1] throw_api_error(res::CUDA.cudaError_enum) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/error.jl:85 [2] macro expansion @ ~/.julia/dev/CUDA/lib/cudadrv/error.jl:92 [inlined] [3] cuLinkAddData_v2(state::CuLink, type::CUDA.CUjitInputType_enum, data::Ptr{Int8}, size::Int64, name::String, numOptions::Int64, options::Ptr{Nothing}, optionValues::Ptr{Nothing}) @ CUDA ~/.julia/dev/CUDA/lib/utils/call.jl:26 [4] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:74 [5] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [6] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [7] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:840 [inlined] [8] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#kernel#976", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [9] cufunction(f::var"#kernel#976", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [10] cufunction(f::var"#kernel#976", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [11] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [12] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:849 [inlined] [13] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [14] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:837 [inlined] [15] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [16] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [17] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [18] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [19] include(fname::String) @ Base.MainInclude ./client.jl:444 [20] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [22] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [23] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [24] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [25] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [26] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [27] eval @ ./boot.jl:360 [inlined] [28] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main 
~/.julia/dev/CUDA/test/setup.jl:67 [29] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [30] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [31] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [32] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 Error in testset device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:854 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) Stacktrace: [1] throw_api_error(res::CUDA.cudaError_enum) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/error.jl:85 [2] macro expansion @ ~/.julia/dev/CUDA/lib/cudadrv/error.jl:92 [inlined] [3] cuLinkAddData_v2(state::CuLink, type::CUDA.CUjitInputType_enum, data::Ptr{Int8}, size::Int64, name::String, numOptions::Int64, options::Ptr{Nothing}, optionValues::Ptr{Nothing}) @ CUDA ~/.julia/dev/CUDA/lib/utils/call.jl:26 [4] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:74 [5] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [6] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [7] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:858 [inlined] [8] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#9840#kernel#977", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [9] cufunction(f::var"#9840#kernel#977", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [10] cufunction(f::var"#9840#kernel#977", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [11] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [12] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:865 [inlined] [13] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [14] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:855 [inlined] [15] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [16] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [17] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [18] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [19] include(fname::String) @ Base.MainInclude ./client.jl:444 [20] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [21] macro expansion @ 
~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [22] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [23] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [24] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [25] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [26] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [27] eval @ ./boot.jl:360 [inlined] [28] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [29] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [30] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [31] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [32] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 Error in testset device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:875 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) Stacktrace: [1] throw_api_error(res::CUDA.cudaError_enum) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/error.jl:85 [2] macro expansion @ ~/.julia/dev/CUDA/lib/cudadrv/error.jl:92 [inlined] [3] cuLinkAddData_v2(state::CuLink, type::CUDA.CUjitInputType_enum, data::Ptr{Int8}, size::Int64, name::String, numOptions::Int64, options::Ptr{Nothing}, optionValues::Ptr{Nothing}) @ CUDA ~/.julia/dev/CUDA/lib/utils/call.jl:26 [4] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:74 [5] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [6] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [7] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:879 [inlined] [8] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#9842#kernel#978", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [9] cufunction(f::var"#9842#kernel#978", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [10] cufunction(f::var"#9842#kernel#978", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [11] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [12] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:886 [inlined] [13] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [14] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:876 
[inlined] [15] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [16] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [17] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [18] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [19] include(fname::String) @ Base.MainInclude ./client.jl:444 [20] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [22] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [23] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [24] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [25] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [26] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [27] eval @ ./boot.jl:360 [inlined] [28] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [29] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [30] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [31] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [32] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 ERROR: LoadError: Test run finished with errors in expression starting at /home/isensee/.julia/dev/CUDA/test/runtests.jl:487 ERROR: Package CUDA errored during testing (@v1.6) pkg> ```

Best, Jonas

maleadt commented 3 years ago

Looks like we aren't properly reporting linker errors. Could you re-run with https://github.com/JuliaGPU/CUDA.jl/pull/712? It should be easier to diagnose then.
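
For reference, a minimal CUDA C sketch of the general mechanism for "reporting linker errors" at the driver level (this uses the plain CUDA driver API and is only an illustration, not what the PR actually does inside CUDA.jl): the JIT linker can be given an error-log buffer, so a failure in `cuLinkAddData` can print the ptxas diagnostics instead of a bare `ERROR_INVALID_PTX`.

```
// Sketch only: assumes cuInit() and a current context were set up by the caller.
#include <cuda.h>
#include <cstdio>
#include <cstring>

CUresult link_ptx_with_log(const char *ptx) {
    char error_log[8192] = {0};
    CUjit_option opts[] = { CU_JIT_ERROR_LOG_BUFFER,
                            CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES };
    void *vals[]        = { error_log, (void *)(size_t)sizeof(error_log) };

    CUlinkState state;
    cuLinkCreate(2, opts, vals, &state);

    // This is the driver call that add_data! in the stack trace wraps.
    CUresult res = cuLinkAddData(state, CU_JIT_INPUT_PTX,
                                 (void *)ptx, strlen(ptx) + 1, "kernel.ptx",
                                 0, NULL, NULL);
    if (res != CUDA_SUCCESS)
        fprintf(stderr, "ptxas error log:\n%s\n", error_log);  // surface the message

    cuLinkDestroy(state);
    return res;
}
```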

JonasIsensee commented 3 years ago
Here are the test results using your PR:
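
(For context on the failure reported in the log below: on sm_70 and higher, with PTX ISA 6.4 or newer, ptxas rejects the old unqualified warp-vote instructions, and the `.sync` variants with an explicit member mask must be emitted instead. A minimal CUDA C sketch of the two forms, shown only to illustrate the ptxas message; this is the CUDA C API, not CUDA.jl's implementation:)

```
// Sketch: the deprecated vs. the .sync warp-vote intrinsic in CUDA C.
#include <cstdio>

__global__ void ballot_kernel(unsigned int *out, int n) {
    int i = threadIdx.x;
    bool pred = i < n;

    // Deprecated form, emits `vote.ballot.b32` and is rejected on sm_70+:
    //   unsigned int ballot = __ballot(pred);

    // .sync form, emits `vote.sync.ballot.b32` with an explicit member mask:
    unsigned int ballot = __ballot_sync(__activemask(), pred);

    if (i == 0) out[0] = ballot;
}

int main() {
    unsigned int *out;
    cudaMallocManaged(&out, sizeof(unsigned int));
    ballot_kernel<<<1, 32>>>(out, 20);
    cudaDeviceSynchronize();
    printf("ballot mask: 0x%08x\n", out[0]);  // expected 0x000fffff for n = 20
    cudaFree(out);
    return 0;
}
```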

``` (@v1.6) pkg> test CUDA Testing CUDA Status `/tmp/jl_O1CZbv/Project.toml` [79e6a3ab] Adapt v3.2.0 [ab4f0b2a] BFloat16s v0.1.0 [052768ef] CUDA v2.6.0 `~/.julia/dev/CUDA` [864edb3b] DataStructures v0.18.9 [7a1cc6ca] FFTW v1.3.2 [1a297f60] FillArrays v0.11.2 [f6369f11] ForwardDiff v0.10.16 [0c68f7d7] GPUArrays v6.2.0 [a98d9a8b] Interpolations v0.13.1 [872c559c] NNlib v0.7.14 [ade2ca70] Dates `@stdlib/Dates` [8ba89e20] Distributed `@stdlib/Distributed` [37e2e46d] LinearAlgebra `@stdlib/LinearAlgebra` [de0858da] Printf `@stdlib/Printf` [3fa0cd96] REPL `@stdlib/REPL` [9a3f8284] Random `@stdlib/Random` [2f01184e] SparseArrays `@stdlib/SparseArrays` [10745b16] Statistics `@stdlib/Statistics` [8dfed614] Test `@stdlib/Test` Status `/tmp/jl_O1CZbv/Manifest.toml` [621f4979] AbstractFFTs v1.0.0 [79e6a3ab] Adapt v3.2.0 [13072b0f] AxisAlgorithms v1.0.0 [ab4f0b2a] BFloat16s v0.1.0 [fa961155] CEnum v0.4.1 [052768ef] CUDA v2.6.0 `~/.julia/dev/CUDA` [d360d2e6] ChainRulesCore v0.9.28 [bbf7d656] CommonSubexpressions v0.3.0 [34da2185] Compat v3.25.0 [864edb3b] DataStructures v0.18.9 [163ba53b] DiffResults v1.0.3 [b552c78f] DiffRules v1.0.2 [e2ba6199] ExprTools v0.1.3 [7a1cc6ca] FFTW v1.3.2 [1a297f60] FillArrays v0.11.2 [f6369f11] ForwardDiff v0.10.16 [0c68f7d7] GPUArrays v6.2.0 [61eb1bfa] GPUCompiler v0.10.0 [a98d9a8b] Interpolations v0.13.1 [692b3bcd] JLLWrappers v1.2.0 [929cbde3] LLVM v3.6.0 [1914dd2f] MacroTools v0.5.6 [c03570c3] Memoize v0.4.4 [872c559c] NNlib v0.7.14 [77ba4419] NaNMath v0.3.5 [6fe1bfb0] OffsetArrays v1.5.3 [bac558e1] OrderedCollections v1.3.3 [c84ed2f1] Ratios v0.4.0 [189a3867] Reexport v1.0.0 [ae029012] Requires v1.1.2 [6c6a2e73] Scratch v1.0.3 [276daf66] SpecialFunctions v1.2.1 [90137ffa] StaticArrays v1.0.1 [a759f4b9] TimerOutputs v0.5.7 [efce3f68] WoodburyMatrices v0.5.3 [f5851436] FFTW_jll v3.3.9+7 [1d5cc7b8] IntelOpenMP_jll v2018.0.3+2 [856f044c] MKL_jll v2021.1.1+1 [efe28fd5] OpenSpecFun_jll v0.5.3+4 [0dad84c5] ArgTools `@stdlib/ArgTools` [56f22d72] Artifacts `@stdlib/Artifacts` [2a0f44e3] Base64 `@stdlib/Base64` [ade2ca70] Dates `@stdlib/Dates` [8bb1440f] DelimitedFiles `@stdlib/DelimitedFiles` [8ba89e20] Distributed `@stdlib/Distributed` [f43a241f] Downloads `@stdlib/Downloads` [b77e0a4c] InteractiveUtils `@stdlib/InteractiveUtils` [4af54fe1] LazyArtifacts `@stdlib/LazyArtifacts` [b27032c2] LibCURL `@stdlib/LibCURL` [76f85450] LibGit2 `@stdlib/LibGit2` [8f399da3] Libdl `@stdlib/Libdl` [37e2e46d] LinearAlgebra `@stdlib/LinearAlgebra` [56ddb016] Logging `@stdlib/Logging` [d6f4376e] Markdown `@stdlib/Markdown` [a63ad114] Mmap `@stdlib/Mmap` [ca575930] NetworkOptions `@stdlib/NetworkOptions` [44cfe95a] Pkg `@stdlib/Pkg` [de0858da] Printf `@stdlib/Printf` [3fa0cd96] REPL `@stdlib/REPL` [9a3f8284] Random `@stdlib/Random` [ea8e919c] SHA `@stdlib/SHA` [9e88b42a] Serialization `@stdlib/Serialization` [1a1011a3] SharedArrays `@stdlib/SharedArrays` [6462fe0b] Sockets `@stdlib/Sockets` [2f01184e] SparseArrays `@stdlib/SparseArrays` [10745b16] Statistics `@stdlib/Statistics` [fa267f1f] TOML `@stdlib/TOML` [a4e569a6] Tar `@stdlib/Tar` [8dfed614] Test `@stdlib/Test` [cf7118a7] UUIDs `@stdlib/UUIDs` [4ec0a83e] Unicode `@stdlib/Unicode` [e66e0078] CompilerSupportLibraries_jll `@stdlib/CompilerSupportLibraries_jll` [deac9b47] LibCURL_jll `@stdlib/LibCURL_jll` [29816b5a] LibSSH2_jll `@stdlib/LibSSH2_jll` [c8ffd9c3] MbedTLS_jll `@stdlib/MbedTLS_jll` [14a3606d] MozillaCACerts_jll `@stdlib/MozillaCACerts_jll` [83775a58] Zlib_jll `@stdlib/Zlib_jll` [8e850ede] nghttp2_jll `@stdlib/nghttp2_jll` Testing 
Running tests... ┌ Info: System information: │ CUDA toolkit 11.2.0, artifact installation │ CUDA driver 11.2.0 │ NVIDIA driver 460.32.3 │ │ Libraries: │ - CUBLAS: 11.3.1 │ - CURAND: 10.2.3 │ - CUFFT: 10.4.0 │ - CUSOLVER: 11.0.2 │ - CUSPARSE: 11.3.1 │ - CUPTI: 14.0.0 │ - NVML: 11.0.0+460.32.3 │ - CUDNN: 8.10.0 (for CUDA 11.2.0) │ - CUTENSOR: 1.2.2 (for CUDA 11.1.0) │ │ Toolchain: │ - Julia: 1.6.0-rc1 │ - LLVM: 11.0.1 │ - PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0 │ - Device support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80 │ │ 2 devices: │ 0: A100-PCIE-40GB (sm_80, 38.916 GiB / 39.586 GiB available) └ 1: A100-PCIE-40GB (sm_80, 39.583 GiB / 39.586 GiB available) [ Info: Testing using 1 device(s): 2. A100-PCIE-40GB (UUID 541fcd36-c901-0c21-1712-9456bc3e0548) | | ---------------- GPU ---------------- | ---------------- CPU ---------------- | Test (Worker) | Time (s) | GC (s) | GC % | Alloc (MB) | RSS (MB) | GC (s) | GC % | Alloc (MB) | RSS (MB) | initialization (2) | 2.70 | 0.00 | 0.0 | 0.00 | 413.00 | 0.01 | 0.5 | 153.64 | 1463.37 | apiutils (2) | 0.18 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 1.81 | 1463.37 | array (2) | 129.38 | 0.04 | 0.0 | 5.29 | 487.00 | 6.30 | 4.9 | 14064.29 | 1463.37 | broadcast (2) | 46.46 | 0.00 | 0.0 | 0.00 | 465.00 | 2.04 | 4.4 | 3797.22 | 1463.37 | codegen (2) | 11.88 | 0.18 | 1.5 | 0.00 | 659.00 | 0.51 | 4.3 | 1229.78 | 1463.37 | cublas (2) | 112.84 | 0.02 | 0.0 | 14.50 | 751.00 | 6.31 | 5.6 | 14164.54 | 2846.77 | cufft (2) | 26.43 | 0.01 | 0.0 | 151.26 | 629.00 | 1.20 | 4.5 | 2607.97 | 2882.43 | curand (2) | 0.10 | 0.00 | 0.0 | 0.00 | 421.00 | 0.00 | 0.0 | 6.49 | 2882.43 | cusolver (2) | 90.69 | 0.05 | 0.0 | 1294.29 | 1113.00 | 4.55 | 5.0 | 10129.40 | 3939.22 | cusparse (2) | 35.18 | 0.01 | 0.0 | 8.45 | 613.00 | 1.13 | 3.2 | 2636.79 | 4019.42 | examples (2) | 136.37 | 0.00 | 0.0 | 0.00 | 413.00 | 0.05 | 0.0 | 29.10 | 4019.42 | exceptions (2) | 83.53 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 38.95 | 4019.42 | From worker 2: WARNING: Method definition #5848#kernel(Any) in module Main at /home/isensee/.julia/dev/CUDA/test/execution.jl:316 overwritten at /home/isensee/.julia/dev/CUDA/test/execution.jl:324. 
execution (2) | 88.57 | 0.00 | 0.0 | 1.28 | 697.00 | 4.11 | 4.6 | 8966.23 | 4070.70 | forwarddiff (2) | 111.68 | 0.00 | 0.0 | 0.00 | 469.00 | 3.25 | 2.9 | 7353.32 | 4186.96 | iterator (2) | 2.64 | 0.06 | 2.1 | 1.16 | 415.00 | 0.18 | 6.9 | 376.05 | 4186.96 | nnlib (2) | 15.75 | 0.00 | 0.0 | 0.01 | 803.00 | 1.46 | 9.3 | 1895.57 | 4186.96 | nvml (2) | 0.56 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 27.82 | 4186.96 | nvtx (2) | 0.27 | 0.00 | 0.0 | 0.00 | 413.00 | 0.06 | 22.7 | 23.39 | 4186.96 | pointer (2) | 0.26 | 0.00 | 0.1 | 0.00 | 415.00 | 0.00 | 0.0 | 13.50 | 4186.96 | pool (2) | 3.47 | 0.00 | 0.0 | 0.00 | 413.00 | 1.05 | 30.4 | 561.95 | 4186.96 | random (2) | 13.25 | 0.00 | 0.0 | 0.02 | 465.00 | 0.60 | 4.6 | 1232.57 | 4186.96 | sorting (2) | 76.24 | 0.26 | 0.3 | 259.47 | 6353.00 | 3.87 | 5.1 | 9818.30 | 4230.59 | statistics (2) | 19.32 | 0.00 | 0.0 | 0.00 | 463.00 | 0.81 | 4.2 | 1786.35 | 4238.26 | texture (2) | 71.72 | 0.00 | 0.0 | 0.09 | 473.00 | 3.31 | 4.6 | 7006.52 | 4269.76 | threading (2) | 3.58 | 0.00 | 0.1 | 10.94 | 741.00 | 0.10 | 2.8 | 304.19 | 4620.46 | utils (2) | 0.91 | 0.00 | 0.0 | 0.00 | 475.00 | 0.00 | 0.0 | 89.00 | 4620.46 | cudadrv/context (2) | 0.76 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 6.26 | 4787.21 | cudadrv/devices (2) | 0.29 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 27.71 | 4787.21 | cudadrv/errors (2) | 0.12 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 10.61 | 4787.21 | cudadrv/events (2) | 0.11 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 4.63 | 4787.21 | cudadrv/execution (2) | 0.68 | 0.00 | 0.0 | 0.00 | 415.00 | 0.00 | 0.0 | 49.14 | 4787.21 | cudadrv/memory (2) | 1.67 | 0.00 | 0.0 | 0.00 | 415.00 | 0.09 | 5.6 | 93.55 | 4787.21 | cudadrv/module (2) | 0.42 | 0.00 | 0.0 | 0.00 | 415.00 | 0.00 | 0.0 | 32.54 | 4787.21 | cudadrv/occupancy (2) | 0.12 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 6.46 | 4787.21 | cudadrv/profile (2) | 0.28 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 53.03 | 4787.21 | cudadrv/stream (2) | 0.09 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 5.44 | 4787.21 | cudadrv/version (2) | 0.01 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 0.07 | 4787.21 | cudnn/activation (2) | 1.97 | 0.00 | 0.0 | 0.00 | 677.00 | 0.08 | 4.2 | 171.77 | 4854.97 | cudnn/convolution (2) | 10.95 | 0.00 | 0.0 | 8.07 | 1035.00 | 0.57 | 5.2 | 1090.44 | 6031.16 | ┌ Warning: CUDA.CUDNN.cudnnDropoutSeed[] >= 0: dropout operations will be deterministic but 40x more expensive └ @ CUDA.CUDNN ~/.julia/dev/CUDA/lib/cudnn/dropout.jl:40 cudnn/dropout (2) | 3.36 | 0.00 | 0.0 | 5.08 | 733.00 | 0.00 | 0.0 | 135.44 | 6031.55 | cudnn/inplace (2) | 1.02 | 0.00 | 0.0 | 0.01 | 681.00 | 0.00 | 0.0 | 54.01 | 6031.55 | cudnn/multiheadattn (2) | 14.14 | 0.00 | 0.0 | 0.14 | 843.00 | 0.47 | 3.3 | 1393.50 | 6244.75 | ┌ Warning: ∇softmax(dy,x) should be deprecated, please use ∇softmax(dy,x,y) └ @ CUDA.CUDNN ~/.julia/dev/CUDA/lib/cudnn/nnlib.jl:44 ┌ Warning: ∇logsoftmax(dy,x) should be deprecated, please use ∇logsoftmax(dy,x,y) └ @ CUDA.CUDNN ~/.julia/dev/CUDA/lib/cudnn/nnlib.jl:54 cudnn/nnlib (2) | 97.53 | 0.01 | 0.0 | 29.77 | 1131.00 | 4.47 | 4.6 | 10618.42 | 6794.52 | cudnn/normalization (2) | 17.69 | 0.00 | 0.0 | 0.11 | 801.00 | 0.83 | 4.7 | 1467.71 | 6797.00 | cudnn/optensor (2) | 1.69 | 0.00 | 0.0 | 0.00 | 677.00 | 0.00 | 0.0 | 133.05 | 6797.00 | cudnn/pooling (2) | 5.12 | 0.00 | 0.0 | 0.06 | 677.00 | 0.28 | 5.5 | 534.97 | 6798.84 | cudnn/reduce (2) | 2.98 | 0.00 | 0.0 | 0.02 | 677.00 | 0.10 | 3.2 | 302.84 | 6798.84 | cudnn/rnn (2) | 7.70 | 0.00 | 0.0 | 5.81 | 851.00 | 0.32 | 4.2 | 1022.93 | 
7117.21 | cudnn/softmax (2) | 1.36 | 0.00 | 0.0 | 0.01 | 677.00 | 0.00 | 0.0 | 78.74 | 7117.21 | cudnn/tensor (2) | 1.02 | 0.00 | 0.0 | 0.00 | 421.00 | 0.69 | 68.4 | 22.74 | 7117.21 | cusolver/cusparse (2) | 8.47 | 0.00 | 0.0 | 0.19 | 885.00 | 0.21 | 2.5 | 543.53 | 7117.21 | cusparse/interfaces (2) | 17.28 | 0.00 | 0.0 | 0.24 | 551.00 | 0.53 | 3.1 | 1567.40 | 7117.21 | cutensor/base (2) | 0.13 | 0.00 | 0.2 | 1.11 | 415.00 | 0.00 | 0.0 | 11.46 | 7117.21 | cutensor/contractions (2) | 50.99 | 0.02 | 0.0 | 32033.95 | 833.00 | 2.72 | 5.3 | 7037.06 | 7117.21 | cutensor/elementwise_binary (2) | 22.44 | 0.25 | 1.1 | 54.99 | 575.00 | 1.03 | 4.6 | 2669.21 | 7117.21 | cutensor/elementwise_trinary (2) | 29.81 | 0.00 | 0.0 | 24.44 | 533.00 | 1.37 | 4.6 | 3626.17 | 7117.21 | cutensor/permutations (2) | 3.93 | 0.00 | 0.1 | 12.22 | 513.00 | 0.16 | 4.1 | 494.74 | 7117.21 | cutensor/reductions (2) | 16.33 | 0.20 | 1.2 | 41.72 | 503.00 | 0.75 | 4.6 | 1572.13 | 7117.21 | device/array (2) | 4.91 | 0.00 | 0.0 | 0.00 | 463.00 | 0.22 | 4.4 | 378.11 | 7117.21 | device/intrinsics (2) | failed at 2021-02-12T16:33:01.583 device/ldg (3) | 26.58 | 0.21 | 0.8 | 0.00 | 463.00 | 0.94 | 3.5 | 2672.46 | 1491.97 | device/wmma (3) | 79.63 | 0.01 | 0.0 | 0.38 | 465.00 | 2.40 | 3.0 | 7926.10 | 1491.97 | gpuarrays/math (3) | 5.48 | 0.00 | 0.0 | 0.00 | 463.00 | 0.28 | 5.1 | 829.05 | 1491.97 | gpuarrays/indexing scalar (3) | 15.54 | 0.00 | 0.0 | 0.00 | 463.00 | 0.69 | 4.5 | 1716.66 | 1491.97 | gpuarrays/input output (3) | 1.79 | 0.00 | 0.0 | 0.00 | 415.00 | 0.10 | 5.5 | 193.43 | 1491.97 | gpuarrays/value constructors (3) | 18.84 | 0.00 | 0.0 | 0.00 | 465.00 | 1.09 | 5.8 | 1999.30 | 1491.97 | gpuarrays/indexing multidimensional (3) | 41.48 | 0.00 | 0.0 | 0.69 | 465.00 | 1.88 | 4.5 | 4605.83 | 1491.97 | gpuarrays/interface (3) | 8.05 | 0.00 | 0.0 | 0.00 | 463.00 | 0.34 | 4.2 | 748.56 | 1491.97 | gpuarrays/iterator constructors (3) | 4.32 | 0.00 | 0.0 | 0.02 | 463.00 | 0.12 | 2.8 | 258.49 | 1491.97 | gpuarrays/uniformscaling (3) | 15.82 | 0.00 | 0.0 | 0.01 | 463.00 | 0.92 | 5.8 | 1419.39 | 1491.97 | gpuarrays/linear algebra (3) | 164.67 | 0.01 | 0.0 | 1.24 | 759.00 | 6.63 | 4.0 | 18772.27 | 2341.61 | gpuarrays/conversions (3) | 3.04 | 0.00 | 0.0 | 0.01 | 415.00 | 0.14 | 4.6 | 397.63 | 2345.10 | gpuarrays/constructors (3) | 1.55 | 0.00 | 0.1 | 0.03 | 415.00 | 0.08 | 5.1 | 99.55 | 2380.36 | gpuarrays/random (3) | 34.43 | 0.00 | 0.0 | 0.03 | 467.00 | 1.31 | 3.8 | 3154.05 | 2380.36 | gpuarrays/base (3) | 33.56 | 0.05 | 0.1 | 17.44 | 497.00 | 2.28 | 6.8 | 4060.77 | 2380.36 | gpuarrays/mapreduce essentials (3) | 194.08 | 0.01 | 0.0 | 3.19 | 489.00 | 7.62 | 3.9 | 18715.84 | 2380.36 | gpuarrays/broadcasting (3) | 141.12 | 0.00 | 0.0 | 1.19 | 481.00 | 6.98 | 4.9 | 14563.45 | 2454.40 | gpuarrays/mapreduce derivatives (3) | 335.14 | 0.01 | 0.0 | 3.06 | 509.00 | 11.75 | 3.5 | 26923.24 | 3560.47 | Worker 2 failed running test device/intrinsics: Some tests did not pass: 256 passed, 0 failed, 3 errored, 0 broken. 
device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:836 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) ptxas application ptx input, line 49; error : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4 ptxas fatal : Ptx assembly aborted due to errors Stacktrace: [1] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:79 [2] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [3] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [4] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:840 [inlined] [5] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#kernel#976", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [6] cufunction(f::var"#kernel#976", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [7] cufunction(f::var"#kernel#976", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [8] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [9] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:849 [inlined] [10] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [11] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:837 [inlined] [12] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [13] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [14] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [15] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [16] include(fname::String) @ Base.MainInclude ./client.jl:444 [17] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [18] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [19] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [20] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [22] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [23] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [24] eval @ ./boot.jl:360 [inlined] [25] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [26] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [27] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed 
/buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [28] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [29] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:854 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) ptxas application ptx input, line 50; error : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4 ptxas fatal : Ptx assembly aborted due to errors Stacktrace: [1] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:79 [2] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [3] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [4] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:858 [inlined] [5] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#9786#kernel#977", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [6] cufunction(f::var"#9786#kernel#977", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [7] cufunction(f::var"#9786#kernel#977", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [8] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [9] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:865 [inlined] [10] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [11] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:855 [inlined] [12] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [13] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [14] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [15] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [16] include(fname::String) @ Base.MainInclude ./client.jl:444 [17] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [18] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [19] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [20] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [22] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [23] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [24] eval @ ./boot.jl:360 [inlined] [25] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main 
~/.julia/dev/CUDA/test/setup.jl:67 [26] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [27] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [28] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [29] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:875 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) ptxas application ptx input, line 50; error : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4 ptxas fatal : Ptx assembly aborted due to errors Stacktrace: [1] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:79 [2] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [3] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [4] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:879 [inlined] [5] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#9788#kernel#978", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [6] cufunction(f::var"#9788#kernel#978", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [7] cufunction(f::var"#9788#kernel#978", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [8] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [9] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:886 [inlined] [10] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [11] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:876 [inlined] [12] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [13] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [14] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [15] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [16] include(fname::String) @ Base.MainInclude ./client.jl:444 [17] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [18] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [19] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [20] macro expansion @ 
~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [22] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [23] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [24] eval @ ./boot.jl:360 [inlined] [25] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [26] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [27] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [28] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [29] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 Test Summary: | Pass Error Broken Total Overall | 14340 3 5 14348 initialization | 34 34 apiutils | 9 9 array | 205 205 broadcast | 29 29 codegen | 10 10 cublas | 2239 2239 cufft | 175 175 curand | 1 1 cusolver | 1580 1580 cusparse | 822 822 examples | 7 7 exceptions | 17 17 execution | 69 69 forwarddiff | 107 107 iterator | 30 30 nnlib | 21 21 nvml | 7 7 nvtx | No tests pointer | 35 35 pool | 10 10 random | 101 101 sorting | 123 123 statistics | 18 18 texture | 38 4 42 threading | No tests utils | 5 5 cudadrv/context | 12 12 cudadrv/devices | 6 6 cudadrv/errors | 6 6 cudadrv/events | 4 4 cudadrv/execution | 15 15 cudadrv/memory | 48 1 49 cudadrv/module | 16 16 cudadrv/occupancy | 1 1 cudadrv/profile | 2 2 cudadrv/stream | 7 7 cudadrv/version | 3 3 cudnn/activation | 43 43 cudnn/convolution | 110 110 cudnn/dropout | 7 7 cudnn/inplace | 12 12 cudnn/multiheadattn | 69 69 cudnn/nnlib | 233 233 cudnn/normalization | 32 32 cudnn/optensor | 43 43 cudnn/pooling | 48 48 cudnn/reduce | 51 51 cudnn/rnn | 84 84 cudnn/softmax | 16 16 cudnn/tensor | 10 10 cusolver/cusparse | 84 84 cusparse/interfaces | 84 84 cutensor/base | 8 8 cutensor/contractions | 3321 3321 cutensor/elementwise_binary | 260 260 cutensor/elementwise_trinary | 340 340 cutensor/permutations | 80 80 cutensor/reductions | 280 280 device/array | 18 18 device/intrinsics | 256 3 259 device/ldg | 22 22 device/wmma | 210 210 gpuarrays/math | 8 8 gpuarrays/indexing scalar | 249 249 gpuarrays/input output | 5 5 gpuarrays/value constructors | 36 36 gpuarrays/indexing multidimensional | 34 34 gpuarrays/interface | 7 7 gpuarrays/iterator constructors | 24 24 gpuarrays/uniformscaling | 56 56 gpuarrays/linear algebra | 389 389 gpuarrays/conversions | 72 72 gpuarrays/constructors | 335 335 gpuarrays/random | 46 46 gpuarrays/base | 42 42 gpuarrays/mapreduce essentials | 522 522 gpuarrays/broadcasting | 155 155 gpuarrays/mapreduce derivatives | 827 827 FAILURE Error in testset device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:836 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) ptxas application ptx input, line 49; error : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4 ptxas fatal : Ptx assembly aborted due to errors Stacktrace: [1] add_data!(link::CuLink, name::String, code::String) @ CUDA 
~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:79 [2] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [3] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [4] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:840 [inlined] [5] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#kernel#976", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [6] cufunction(f::var"#kernel#976", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [7] cufunction(f::var"#kernel#976", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [8] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [9] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:849 [inlined] [10] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [11] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:837 [inlined] [12] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [13] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [14] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [15] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [16] include(fname::String) @ Base.MainInclude ./client.jl:444 [17] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [18] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [19] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [20] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [22] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [23] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [24] eval @ ./boot.jl:360 [inlined] [25] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [26] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [27] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [28] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [29] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 Error in testset device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:854 Got exception outside of a @test CUDA 
error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) ptxas application ptx input, line 50; error : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4 ptxas fatal : Ptx assembly aborted due to errors Stacktrace: [1] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:79 [2] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [3] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [4] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:858 [inlined] [5] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#9786#kernel#977", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [6] cufunction(f::var"#9786#kernel#977", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [7] cufunction(f::var"#9786#kernel#977", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [8] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [9] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:865 [inlined] [10] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [11] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:855 [inlined] [12] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [13] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [14] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [15] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [16] include(fname::String) @ Base.MainInclude ./client.jl:444 [17] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [18] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [19] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [20] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [22] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [23] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [24] eval @ ./boot.jl:360 [inlined] [25] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [26] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [27] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [28] macro expansion @ 
/buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [29] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 Error in testset device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:875 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) ptxas application ptx input, line 50; error : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4 ptxas fatal : Ptx assembly aborted due to errors Stacktrace: [1] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:79 [2] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [3] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [4] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:879 [inlined] [5] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#9788#kernel#978", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [6] cufunction(f::var"#9788#kernel#978", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [7] cufunction(f::var"#9788#kernel#978", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [8] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [9] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:886 [inlined] [10] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [11] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:876 [inlined] [12] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [13] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [14] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [15] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [16] include(fname::String) @ Base.MainInclude ./client.jl:444 [17] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [18] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [19] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [20] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [22] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [23] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [24] eval @ ./boot.jl:360 [inlined] [25] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [26] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed 
/buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [27] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [28] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [29] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 ERROR: LoadError: Test run finished with errors in expression starting at /home/isensee/.julia/dev/CUDA/test/runtests.jl:487 ERROR: Package CUDA errored during testing (@v1.6) pkg> test CUDA Testing CUDA Status `/tmp/jl_O1CZbv/Project.toml` [79e6a3ab] Adapt v3.2.0 [ab4f0b2a] BFloat16s v0.1.0 [052768ef] CUDA v2.6.0 `~/.julia/dev/CUDA` [864edb3b] DataStructures v0.18.9 [7a1cc6ca] FFTW v1.3.2 [1a297f60] FillArrays v0.11.2 [f6369f11] ForwardDiff v0.10.16 [0c68f7d7] GPUArrays v6.2.0 [a98d9a8b] Interpolations v0.13.1 [872c559c] NNlib v0.7.14 [ade2ca70] Dates `@stdlib/Dates` [8ba89e20] Distributed `@stdlib/Distributed` [37e2e46d] LinearAlgebra `@stdlib/LinearAlgebra` [de0858da] Printf `@stdlib/Printf` [3fa0cd96] REPL `@stdlib/REPL` [9a3f8284] Random `@stdlib/Random` [2f01184e] SparseArrays `@stdlib/SparseArrays` [10745b16] Statistics `@stdlib/Statistics` [8dfed614] Test `@stdlib/Test` Status `/tmp/jl_O1CZbv/Manifest.toml` [621f4979] AbstractFFTs v1.0.0 [79e6a3ab] Adapt v3.2.0 [13072b0f] AxisAlgorithms v1.0.0 [ab4f0b2a] BFloat16s v0.1.0 [fa961155] CEnum v0.4.1 [052768ef] CUDA v2.6.0 `~/.julia/dev/CUDA` [d360d2e6] ChainRulesCore v0.9.28 [bbf7d656] CommonSubexpressions v0.3.0 [34da2185] Compat v3.25.0 [864edb3b] DataStructures v0.18.9 [163ba53b] DiffResults v1.0.3 [b552c78f] DiffRules v1.0.2 [e2ba6199] ExprTools v0.1.3 [7a1cc6ca] FFTW v1.3.2 [1a297f60] FillArrays v0.11.2 [f6369f11] ForwardDiff v0.10.16 [0c68f7d7] GPUArrays v6.2.0 [61eb1bfa] GPUCompiler v0.10.0 [a98d9a8b] Interpolations v0.13.1 [692b3bcd] JLLWrappers v1.2.0 [929cbde3] LLVM v3.6.0 [1914dd2f] MacroTools v0.5.6 [c03570c3] Memoize v0.4.4 [872c559c] NNlib v0.7.14 [77ba4419] NaNMath v0.3.5 [6fe1bfb0] OffsetArrays v1.5.3 [bac558e1] OrderedCollections v1.3.3 [c84ed2f1] Ratios v0.4.0 [189a3867] Reexport v1.0.0 [ae029012] Requires v1.1.2 [6c6a2e73] Scratch v1.0.3 [276daf66] SpecialFunctions v1.2.1 [90137ffa] StaticArrays v1.0.1 [a759f4b9] TimerOutputs v0.5.7 [efce3f68] WoodburyMatrices v0.5.3 [f5851436] FFTW_jll v3.3.9+7 [1d5cc7b8] IntelOpenMP_jll v2018.0.3+2 [856f044c] MKL_jll v2021.1.1+1 [efe28fd5] OpenSpecFun_jll v0.5.3+4 [0dad84c5] ArgTools `@stdlib/ArgTools` [56f22d72] Artifacts `@stdlib/Artifacts` [2a0f44e3] Base64 `@stdlib/Base64` [ade2ca70] Dates `@stdlib/Dates` [8bb1440f] DelimitedFiles `@stdlib/DelimitedFiles` [8ba89e20] Distributed `@stdlib/Distributed` [f43a241f] Downloads `@stdlib/Downloads` [b77e0a4c] InteractiveUtils `@stdlib/InteractiveUtils` [4af54fe1] LazyArtifacts `@stdlib/LazyArtifacts` [b27032c2] LibCURL `@stdlib/LibCURL` [76f85450] LibGit2 `@stdlib/LibGit2` [8f399da3] Libdl `@stdlib/Libdl` [37e2e46d] LinearAlgebra `@stdlib/LinearAlgebra` [56ddb016] Logging `@stdlib/Logging` [d6f4376e] Markdown `@stdlib/Markdown` [a63ad114] Mmap `@stdlib/Mmap` [ca575930] NetworkOptions `@stdlib/NetworkOptions` [44cfe95a] Pkg `@stdlib/Pkg` [de0858da] Printf `@stdlib/Printf` [3fa0cd96] REPL `@stdlib/REPL` [9a3f8284] Random 
`@stdlib/Random` [ea8e919c] SHA `@stdlib/SHA` [9e88b42a] Serialization `@stdlib/Serialization` [1a1011a3] SharedArrays `@stdlib/SharedArrays` [6462fe0b] Sockets `@stdlib/Sockets` [2f01184e] SparseArrays `@stdlib/SparseArrays` [10745b16] Statistics `@stdlib/Statistics` [fa267f1f] TOML `@stdlib/TOML` [a4e569a6] Tar `@stdlib/Tar` [8dfed614] Test `@stdlib/Test` [cf7118a7] UUIDs `@stdlib/UUIDs` [4ec0a83e] Unicode `@stdlib/Unicode` [e66e0078] CompilerSupportLibraries_jll `@stdlib/CompilerSupportLibraries_jll` [deac9b47] LibCURL_jll `@stdlib/LibCURL_jll` [29816b5a] LibSSH2_jll `@stdlib/LibSSH2_jll` [c8ffd9c3] MbedTLS_jll `@stdlib/MbedTLS_jll` [14a3606d] MozillaCACerts_jll `@stdlib/MozillaCACerts_jll` [83775a58] Zlib_jll `@stdlib/Zlib_jll` [8e850ede] nghttp2_jll `@stdlib/nghttp2_jll` Testing Running tests... ┌ Info: System information: │ CUDA toolkit 11.2.0, artifact installation │ CUDA driver 11.2.0 │ NVIDIA driver 460.32.3 │ │ Libraries: │ - CUBLAS: 11.3.1 │ - CURAND: 10.2.3 │ - CUFFT: 10.4.0 │ - CUSOLVER: 11.0.2 │ - CUSPARSE: 11.3.1 │ - CUPTI: 14.0.0 │ - NVML: 11.0.0+460.32.3 │ - CUDNN: 8.10.0 (for CUDA 11.2.0) │ - CUTENSOR: 1.2.2 (for CUDA 11.1.0) │ │ Toolchain: │ - Julia: 1.6.0-rc1 │ - LLVM: 11.0.1 │ - PTX ISA support: 3.2, 4.0, 4.1, 4.2, 4.3, 5.0, 6.0, 6.1, 6.3, 6.4, 6.5, 7.0 │ - Device support: sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80 │ │ 2 devices: │ 0: A100-PCIE-40GB (sm_80, 38.916 GiB / 39.586 GiB available) └ 1: A100-PCIE-40GB (sm_80, 39.583 GiB / 39.586 GiB available) [ Info: Testing using 1 device(s): 2. A100-PCIE-40GB (UUID 541fcd36-c901-0c21-1712-9456bc3e0548) | | ---------------- GPU ---------------- | ---------------- CPU ---------------- | Test (Worker) | Time (s) | GC (s) | GC % | Alloc (MB) | RSS (MB) | GC (s) | GC % | Alloc (MB) | RSS (MB) | initialization (2) | 2.70 | 0.00 | 0.0 | 0.00 | 413.00 | 0.01 | 0.5 | 153.64 | 1463.37 | apiutils (2) | 0.18 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 1.81 | 1463.37 | array (2) | 129.38 | 0.04 | 0.0 | 5.29 | 487.00 | 6.30 | 4.9 | 14064.29 | 1463.37 | broadcast (2) | 46.46 | 0.00 | 0.0 | 0.00 | 465.00 | 2.04 | 4.4 | 3797.22 | 1463.37 | codegen (2) | 11.88 | 0.18 | 1.5 | 0.00 | 659.00 | 0.51 | 4.3 | 1229.78 | 1463.37 | cublas (2) | 112.84 | 0.02 | 0.0 | 14.50 | 751.00 | 6.31 | 5.6 | 14164.54 | 2846.77 | cufft (2) | 26.43 | 0.01 | 0.0 | 151.26 | 629.00 | 1.20 | 4.5 | 2607.97 | 2882.43 | curand (2) | 0.10 | 0.00 | 0.0 | 0.00 | 421.00 | 0.00 | 0.0 | 6.49 | 2882.43 | cusolver (2) | 90.69 | 0.05 | 0.0 | 1294.29 | 1113.00 | 4.55 | 5.0 | 10129.40 | 3939.22 | cusparse (2) | 35.18 | 0.01 | 0.0 | 8.45 | 613.00 | 1.13 | 3.2 | 2636.79 | 4019.42 | examples (2) | 136.37 | 0.00 | 0.0 | 0.00 | 413.00 | 0.05 | 0.0 | 29.10 | 4019.42 | exceptions (2) | 83.53 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 38.95 | 4019.42 | From worker 2: WARNING: Method definition #5848#kernel(Any) in module Main at /home/isensee/.julia/dev/CUDA/test/execution.jl:316 overwritten at /home/isensee/.julia/dev/CUDA/test/execution.jl:324. 
execution (2) | 88.57 | 0.00 | 0.0 | 1.28 | 697.00 | 4.11 | 4.6 | 8966.23 | 4070.70 | forwarddiff (2) | 111.68 | 0.00 | 0.0 | 0.00 | 469.00 | 3.25 | 2.9 | 7353.32 | 4186.96 | iterator (2) | 2.64 | 0.06 | 2.1 | 1.16 | 415.00 | 0.18 | 6.9 | 376.05 | 4186.96 | nnlib (2) | 15.75 | 0.00 | 0.0 | 0.01 | 803.00 | 1.46 | 9.3 | 1895.57 | 4186.96 | nvml (2) | 0.56 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 27.82 | 4186.96 | nvtx (2) | 0.27 | 0.00 | 0.0 | 0.00 | 413.00 | 0.06 | 22.7 | 23.39 | 4186.96 | pointer (2) | 0.26 | 0.00 | 0.1 | 0.00 | 415.00 | 0.00 | 0.0 | 13.50 | 4186.96 | pool (2) | 3.47 | 0.00 | 0.0 | 0.00 | 413.00 | 1.05 | 30.4 | 561.95 | 4186.96 | random (2) | 13.25 | 0.00 | 0.0 | 0.02 | 465.00 | 0.60 | 4.6 | 1232.57 | 4186.96 | sorting (2) | 76.24 | 0.26 | 0.3 | 259.47 | 6353.00 | 3.87 | 5.1 | 9818.30 | 4230.59 | statistics (2) | 19.32 | 0.00 | 0.0 | 0.00 | 463.00 | 0.81 | 4.2 | 1786.35 | 4238.26 | texture (2) | 71.72 | 0.00 | 0.0 | 0.09 | 473.00 | 3.31 | 4.6 | 7006.52 | 4269.76 | threading (2) | 3.58 | 0.00 | 0.1 | 10.94 | 741.00 | 0.10 | 2.8 | 304.19 | 4620.46 | utils (2) | 0.91 | 0.00 | 0.0 | 0.00 | 475.00 | 0.00 | 0.0 | 89.00 | 4620.46 | cudadrv/context (2) | 0.76 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 6.26 | 4787.21 | cudadrv/devices (2) | 0.29 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 27.71 | 4787.21 | cudadrv/errors (2) | 0.12 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 10.61 | 4787.21 | cudadrv/events (2) | 0.11 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 4.63 | 4787.21 | cudadrv/execution (2) | 0.68 | 0.00 | 0.0 | 0.00 | 415.00 | 0.00 | 0.0 | 49.14 | 4787.21 | cudadrv/memory (2) | 1.67 | 0.00 | 0.0 | 0.00 | 415.00 | 0.09 | 5.6 | 93.55 | 4787.21 | cudadrv/module (2) | 0.42 | 0.00 | 0.0 | 0.00 | 415.00 | 0.00 | 0.0 | 32.54 | 4787.21 | cudadrv/occupancy (2) | 0.12 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 6.46 | 4787.21 | cudadrv/profile (2) | 0.28 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 53.03 | 4787.21 | cudadrv/stream (2) | 0.09 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 5.44 | 4787.21 | cudadrv/version (2) | 0.01 | 0.00 | 0.0 | 0.00 | 413.00 | 0.00 | 0.0 | 0.07 | 4787.21 | cudnn/activation (2) | 1.97 | 0.00 | 0.0 | 0.00 | 677.00 | 0.08 | 4.2 | 171.77 | 4854.97 | cudnn/convolution (2) | 10.95 | 0.00 | 0.0 | 8.07 | 1035.00 | 0.57 | 5.2 | 1090.44 | 6031.16 | ┌ Warning: CUDA.CUDNN.cudnnDropoutSeed[] >= 0: dropout operations will be deterministic but 40x more expensive └ @ CUDA.CUDNN ~/.julia/dev/CUDA/lib/cudnn/dropout.jl:40 cudnn/dropout (2) | 3.36 | 0.00 | 0.0 | 5.08 | 733.00 | 0.00 | 0.0 | 135.44 | 6031.55 | cudnn/inplace (2) | 1.02 | 0.00 | 0.0 | 0.01 | 681.00 | 0.00 | 0.0 | 54.01 | 6031.55 | cudnn/multiheadattn (2) | 14.14 | 0.00 | 0.0 | 0.14 | 843.00 | 0.47 | 3.3 | 1393.50 | 6244.75 | ┌ Warning: ∇softmax(dy,x) should be deprecated, please use ∇softmax(dy,x,y) └ @ CUDA.CUDNN ~/.julia/dev/CUDA/lib/cudnn/nnlib.jl:44 ┌ Warning: ∇logsoftmax(dy,x) should be deprecated, please use ∇logsoftmax(dy,x,y) └ @ CUDA.CUDNN ~/.julia/dev/CUDA/lib/cudnn/nnlib.jl:54 cudnn/nnlib (2) | 97.53 | 0.01 | 0.0 | 29.77 | 1131.00 | 4.47 | 4.6 | 10618.42 | 6794.52 | cudnn/normalization (2) | 17.69 | 0.00 | 0.0 | 0.11 | 801.00 | 0.83 | 4.7 | 1467.71 | 6797.00 | cudnn/optensor (2) | 1.69 | 0.00 | 0.0 | 0.00 | 677.00 | 0.00 | 0.0 | 133.05 | 6797.00 | cudnn/pooling (2) | 5.12 | 0.00 | 0.0 | 0.06 | 677.00 | 0.28 | 5.5 | 534.97 | 6798.84 | cudnn/reduce (2) | 2.98 | 0.00 | 0.0 | 0.02 | 677.00 | 0.10 | 3.2 | 302.84 | 6798.84 | cudnn/rnn (2) | 7.70 | 0.00 | 0.0 | 5.81 | 851.00 | 0.32 | 4.2 | 1022.93 | 
7117.21 | cudnn/softmax (2) | 1.36 | 0.00 | 0.0 | 0.01 | 677.00 | 0.00 | 0.0 | 78.74 | 7117.21 | cudnn/tensor (2) | 1.02 | 0.00 | 0.0 | 0.00 | 421.00 | 0.69 | 68.4 | 22.74 | 7117.21 | cusolver/cusparse (2) | 8.47 | 0.00 | 0.0 | 0.19 | 885.00 | 0.21 | 2.5 | 543.53 | 7117.21 | cusparse/interfaces (2) | 17.28 | 0.00 | 0.0 | 0.24 | 551.00 | 0.53 | 3.1 | 1567.40 | 7117.21 | cutensor/base (2) | 0.13 | 0.00 | 0.2 | 1.11 | 415.00 | 0.00 | 0.0 | 11.46 | 7117.21 | cutensor/contractions (2) | 50.99 | 0.02 | 0.0 | 32033.95 | 833.00 | 2.72 | 5.3 | 7037.06 | 7117.21 | cutensor/elementwise_binary (2) | 22.44 | 0.25 | 1.1 | 54.99 | 575.00 | 1.03 | 4.6 | 2669.21 | 7117.21 | cutensor/elementwise_trinary (2) | 29.81 | 0.00 | 0.0 | 24.44 | 533.00 | 1.37 | 4.6 | 3626.17 | 7117.21 | cutensor/permutations (2) | 3.93 | 0.00 | 0.1 | 12.22 | 513.00 | 0.16 | 4.1 | 494.74 | 7117.21 | cutensor/reductions (2) | 16.33 | 0.20 | 1.2 | 41.72 | 503.00 | 0.75 | 4.6 | 1572.13 | 7117.21 | device/array (2) | 4.91 | 0.00 | 0.0 | 0.00 | 463.00 | 0.22 | 4.4 | 378.11 | 7117.21 | device/intrinsics (2) | failed at 2021-02-12T16:33:01.583 device/ldg (3) | 26.58 | 0.21 | 0.8 | 0.00 | 463.00 | 0.94 | 3.5 | 2672.46 | 1491.97 | device/wmma (3) | 79.63 | 0.01 | 0.0 | 0.38 | 465.00 | 2.40 | 3.0 | 7926.10 | 1491.97 | gpuarrays/math (3) | 5.48 | 0.00 | 0.0 | 0.00 | 463.00 | 0.28 | 5.1 | 829.05 | 1491.97 | gpuarrays/indexing scalar (3) | 15.54 | 0.00 | 0.0 | 0.00 | 463.00 | 0.69 | 4.5 | 1716.66 | 1491.97 | gpuarrays/input output (3) | 1.79 | 0.00 | 0.0 | 0.00 | 415.00 | 0.10 | 5.5 | 193.43 | 1491.97 | gpuarrays/value constructors (3) | 18.84 | 0.00 | 0.0 | 0.00 | 465.00 | 1.09 | 5.8 | 1999.30 | 1491.97 | gpuarrays/indexing multidimensional (3) | 41.48 | 0.00 | 0.0 | 0.69 | 465.00 | 1.88 | 4.5 | 4605.83 | 1491.97 | gpuarrays/interface (3) | 8.05 | 0.00 | 0.0 | 0.00 | 463.00 | 0.34 | 4.2 | 748.56 | 1491.97 | gpuarrays/iterator constructors (3) | 4.32 | 0.00 | 0.0 | 0.02 | 463.00 | 0.12 | 2.8 | 258.49 | 1491.97 | gpuarrays/uniformscaling (3) | 15.82 | 0.00 | 0.0 | 0.01 | 463.00 | 0.92 | 5.8 | 1419.39 | 1491.97 | gpuarrays/linear algebra (3) | 164.67 | 0.01 | 0.0 | 1.24 | 759.00 | 6.63 | 4.0 | 18772.27 | 2341.61 | gpuarrays/conversions (3) | 3.04 | 0.00 | 0.0 | 0.01 | 415.00 | 0.14 | 4.6 | 397.63 | 2345.10 | gpuarrays/constructors (3) | 1.55 | 0.00 | 0.1 | 0.03 | 415.00 | 0.08 | 5.1 | 99.55 | 2380.36 | gpuarrays/random (3) | 34.43 | 0.00 | 0.0 | 0.03 | 467.00 | 1.31 | 3.8 | 3154.05 | 2380.36 | gpuarrays/base (3) | 33.56 | 0.05 | 0.1 | 17.44 | 497.00 | 2.28 | 6.8 | 4060.77 | 2380.36 | gpuarrays/mapreduce essentials (3) | 194.08 | 0.01 | 0.0 | 3.19 | 489.00 | 7.62 | 3.9 | 18715.84 | 2380.36 | gpuarrays/broadcasting (3) | 141.12 | 0.00 | 0.0 | 1.19 | 481.00 | 6.98 | 4.9 | 14563.45 | 2454.40 | gpuarrays/mapreduce derivatives (3) | 335.14 | 0.01 | 0.0 | 3.06 | 509.00 | 11.75 | 3.5 | 26923.24 | 3560.47 | Worker 2 failed running test device/intrinsics: Some tests did not pass: 256 passed, 0 failed, 3 errored, 0 broken. 
device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:836 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) ptxas application ptx input, line 49; error : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4 ptxas fatal : Ptx assembly aborted due to errors Stacktrace: [1] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:79 [2] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [3] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [4] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:840 [inlined] [5] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#kernel#976", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [6] cufunction(f::var"#kernel#976", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [7] cufunction(f::var"#kernel#976", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [8] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [9] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:849 [inlined] [10] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [11] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:837 [inlined] [12] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [13] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [14] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [15] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [16] include(fname::String) @ Base.MainInclude ./client.jl:444 [17] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [18] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [19] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [20] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [22] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [23] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [24] eval @ ./boot.jl:360 [inlined] [25] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [26] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [27] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed 
/buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [28] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [29] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:854 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) ptxas application ptx input, line 50; error : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4 ptxas fatal : Ptx assembly aborted due to errors Stacktrace: [1] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:79 [2] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [3] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [4] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:858 [inlined] [5] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#9786#kernel#977", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [6] cufunction(f::var"#9786#kernel#977", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [7] cufunction(f::var"#9786#kernel#977", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [8] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [9] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:865 [inlined] [10] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [11] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:855 [inlined] [12] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [13] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [14] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [15] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [16] include(fname::String) @ Base.MainInclude ./client.jl:444 [17] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [18] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [19] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [20] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [22] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [23] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [24] eval @ ./boot.jl:360 [inlined] [25] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main 
~/.julia/dev/CUDA/test/setup.jl:67 [26] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [27] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [28] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [29] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:875 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) ptxas application ptx input, line 50; error : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4 ptxas fatal : Ptx assembly aborted due to errors Stacktrace: [1] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:79 [2] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [3] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [4] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:879 [inlined] [5] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#9788#kernel#978", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [6] cufunction(f::var"#9788#kernel#978", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [7] cufunction(f::var"#9788#kernel#978", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [8] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [9] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:886 [inlined] [10] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [11] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:876 [inlined] [12] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [13] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [14] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [15] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [16] include(fname::String) @ Base.MainInclude ./client.jl:444 [17] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [18] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [19] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [20] macro expansion @ 
~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [22] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [23] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [24] eval @ ./boot.jl:360 [inlined] [25] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [26] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [27] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [28] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [29] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406

Test Summary: | Pass Error Broken Total
Overall | 14340 3 5 14348
  initialization | 34 34
  apiutils | 9 9
  array | 205 205
  broadcast | 29 29
  codegen | 10 10
  cublas | 2239 2239
  cufft | 175 175
  curand | 1 1
  cusolver | 1580 1580
  cusparse | 822 822
  examples | 7 7
  exceptions | 17 17
  execution | 69 69
  forwarddiff | 107 107
  iterator | 30 30
  nnlib | 21 21
  nvml | 7 7
  nvtx | No tests
  pointer | 35 35
  pool | 10 10
  random | 101 101
  sorting | 123 123
  statistics | 18 18
  texture | 38 4 42
  threading | No tests
  utils | 5 5
  cudadrv/context | 12 12
  cudadrv/devices | 6 6
  cudadrv/errors | 6 6
  cudadrv/events | 4 4
  cudadrv/execution | 15 15
  cudadrv/memory | 48 1 49
  cudadrv/module | 16 16
  cudadrv/occupancy | 1 1
  cudadrv/profile | 2 2
  cudadrv/stream | 7 7
  cudadrv/version | 3 3
  cudnn/activation | 43 43
  cudnn/convolution | 110 110
  cudnn/dropout | 7 7
  cudnn/inplace | 12 12
  cudnn/multiheadattn | 69 69
  cudnn/nnlib | 233 233
  cudnn/normalization | 32 32
  cudnn/optensor | 43 43
  cudnn/pooling | 48 48
  cudnn/reduce | 51 51
  cudnn/rnn | 84 84
  cudnn/softmax | 16 16
  cudnn/tensor | 10 10
  cusolver/cusparse | 84 84
  cusparse/interfaces | 84 84
  cutensor/base | 8 8
  cutensor/contractions | 3321 3321
  cutensor/elementwise_binary | 260 260
  cutensor/elementwise_trinary | 340 340
  cutensor/permutations | 80 80
  cutensor/reductions | 280 280
  device/array | 18 18
  device/intrinsics | 256 3 259
  device/ldg | 22 22
  device/wmma | 210 210
  gpuarrays/math | 8 8
  gpuarrays/indexing scalar | 249 249
  gpuarrays/input output | 5 5
  gpuarrays/value constructors | 36 36
  gpuarrays/indexing multidimensional | 34 34
  gpuarrays/interface | 7 7
  gpuarrays/iterator constructors | 24 24
  gpuarrays/uniformscaling | 56 56
  gpuarrays/linear algebra | 389 389
  gpuarrays/conversions | 72 72
  gpuarrays/constructors | 335 335
  gpuarrays/random | 46 46
  gpuarrays/base | 42 42
  gpuarrays/mapreduce essentials | 522 522
  gpuarrays/broadcasting | 155 155
  gpuarrays/mapreduce derivatives | 827 827
FAILURE
Error in testset device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:836 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) ptxas application ptx input, line 49; error : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4 ptxas fatal : Ptx assembly aborted due to errors Stacktrace: [1] add_data!(link::CuLink, name::String, code::String) @ CUDA
~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:79 [2] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [3] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [4] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:840 [inlined] [5] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#kernel#976", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [6] cufunction(f::var"#kernel#976", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [7] cufunction(f::var"#kernel#976", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [8] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [9] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:849 [inlined] [10] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [11] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:837 [inlined] [12] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [13] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [14] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [15] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [16] include(fname::String) @ Base.MainInclude ./client.jl:444 [17] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [18] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [19] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [20] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [22] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [23] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [24] eval @ ./boot.jl:360 [inlined] [25] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [26] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [27] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [28] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [29] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 Error in testset device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:854 Got exception outside of a @test CUDA 
error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) ptxas application ptx input, line 50; error : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4 ptxas fatal : Ptx assembly aborted due to errors Stacktrace: [1] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:79 [2] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [3] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [4] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:858 [inlined] [5] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#9786#kernel#977", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [6] cufunction(f::var"#9786#kernel#977", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [7] cufunction(f::var"#9786#kernel#977", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [8] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [9] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:865 [inlined] [10] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [11] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:855 [inlined] [12] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [13] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [14] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [15] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [16] include(fname::String) @ Base.MainInclude ./client.jl:444 [17] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [18] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [19] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [20] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [22] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [23] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [24] eval @ ./boot.jl:360 [inlined] [25] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [26] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [27] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [28] macro expansion @ 
/buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [29] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406 Error in testset device/intrinsics: Error During Test at /home/isensee/.julia/dev/CUDA/test/device/intrinsics.jl:875 Got exception outside of a @test CUDA error: a PTX JIT compilation failed (code 218, ERROR_INVALID_PTX) ptxas application ptx input, line 50; error : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4 ptxas fatal : Ptx assembly aborted due to errors Stacktrace: [1] add_data!(link::CuLink, name::String, code::String) @ CUDA ~/.julia/dev/CUDA/lib/cudadrv/module/linker.jl:79 [2] cufunction_link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:code, :entry, :needs_cudadevrt, :external_gvars), Tuple{String, String, Bool, Vector{String}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:345 [3] check_cache @ ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:53 [inlined] [4] cached_compilation @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:879 [inlined] [5] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{var"#9788#kernel#978", Tuple{CuDeviceVector{UInt32, 1}, Int64}}}, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link)) @ GPUCompiler ~/.julia/packages/GPUCompiler/XwWPj/src/cache.jl:0 [6] cufunction(f::var"#9788#kernel#978", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}; name::Nothing, kwargs::Base.Iterators.Pairs{Union{}, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:294 [7] cufunction(f::var"#9788#kernel#978", tt::Type{Tuple{CuDeviceVector{UInt32, 1}, Int64}}) @ CUDA ~/.julia/dev/CUDA/src/compiler/execution.jl:288 [8] macro expansion @ ~/.julia/dev/CUDA/src/compiler/execution.jl:102 [inlined] [9] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:886 [inlined] [10] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [11] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:876 [inlined] [12] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [13] macro expansion @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:836 [inlined] [14] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [15] top-level scope @ ~/.julia/dev/CUDA/test/device/intrinsics.jl:726 [16] include(fname::String) @ Base.MainInclude ./client.jl:444 [17] #9 @ ~/.julia/dev/CUDA/test/runtests.jl:79 [inlined] [18] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [19] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1151 [inlined] [20] macro expansion @ ~/.julia/dev/CUDA/test/setup.jl:56 [inlined] [21] macro expansion @ ~/.julia/dev/CUDA/src/utilities.jl:28 [inlined] [22] macro expansion @ ~/.julia/dev/CUDA/src/pool.jl:547 [inlined] [23] top-level scope @ ~/.julia/dev/CUDA/test/setup.jl:55 [24] eval @ ./boot.jl:360 [inlined] [25] runtests(f::Function, name::String, time_source::Symbol, snoop::Nothing) @ Main ~/.julia/dev/CUDA/test/setup.jl:67 [26] (::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}})() @ Distributed 
/buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [27] run_work_thunk(thunk::Distributed.var"#106#108"{Distributed.CallMsg{:call_fetch}}, print_error::Bool) @ Distributed /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:63 [28] macro expansion @ /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.6/Distributed/src/process_messages.jl:278 [inlined] [29] (::Distributed.var"#105#107"{Distributed.CallMsg{:call_fetch}, Distributed.MsgHeader, Sockets.TCPSocket})() @ Distributed ./task.jl:406
ERROR: LoadError: Test run finished with errors
in expression starting at /home/isensee/.julia/dev/CUDA/test/runtests.jl:487
ERROR: Package CUDA errored during testing
```
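
For reference, here is roughly the kind of kernel the failing voting tests exercise. This is only a sketch, not the exact code from `test/device/intrinsics.jl`; it assumes `vote_ballot` is the warp-vote device function exposed by CUDA.jl, which currently lowers to the legacy `vote.ballot.b32` PTX instruction and therefore hits the same ptxas error on sm_70 and newer devices:

```julia
using CUDA, Test

# Sketch of a warp-ballot kernel in the spirit of the failing tests.
# `vote_ballot` (assumed name) returns a UInt32 mask with one bit set
# per lane whose predicate was true.
function ballot_kernel(results, n)
    lane = threadIdx().x                  # 1-based lane index within the warp
    results[1] = vote_ballot(lane <= n)   # emits `vote.ballot.b32` without `.sync`
    return
end

results = CuArray(UInt32[0])
@cuda threads=32 ballot_kernel(results, 16)
# Lanes 1..16 vote true, so the expected mask is 0x0000ffff;
# on sm_70+ the launch instead aborts in ptxas as shown above.
@test Array(results)[1] == 0x0000ffff
```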

maleadt commented 3 years ago
  ptxas application ptx input, line 49; error   : Instruction 'vote' without '.sync' is not supported on .target sm_70 and higher from PTX ISA version 6.4
  ptxas fatal   : Ptx assembly aborted due to errors
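
In other words, on sm_70 and newer (with PTX ISA 6.4 and up) ptxas rejects the legacy warp-vote instructions, so the intrinsics have to be emitted in their `.sync` form, which carries an explicit member mask (the CUDA C analogue is `__ballot_sync(mask, pred)` rather than the deprecated `__ballot(pred)`, and the corresponding NVVM intrinsic is `llvm.nvvm.vote.ballot.sync`). A minimal sketch of what the user-facing side could look like, assuming a `vote_ballot_sync` function that takes a full-warp mask; this is illustrative only, not necessarily how CUDA.jl will implement the fix:

```julia
using CUDA

const FULL_MASK = 0xffffffff   # all 32 lanes of the warp participate

# Hypothetical .sync-style usage: the explicit member mask lets the compiler
# emit `vote.sync.ballot.b32`, which ptxas accepts on sm_70 and higher.
function ballot_sync_kernel(results, n)
    lane = threadIdx().x
    results[1] = vote_ballot_sync(FULL_MASK, lane <= n)
    return
end

results = CuArray(UInt32[0])
@cuda threads=32 ballot_sync_kernel(results, 16)
```

The other vote intrinsics (the all/any variants) need the same treatment, taking a member mask as well.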