Add CUDA.peakflops() - Githubissues

I'm currently attempting something like this in a package (we could add this feature to CUDA.jl afterwards). However, I wonder how you would implement this to reliably achieve peak performance? (In particular, given that you marked this as "good first issue".)

Currently I'm taking a very straightforward approach:

function peakflops_gpu(; device=CUDA.device(), dtype=Float32, size=20_000, nmatmuls=5, nbench=5, verbose=true)
    device!(device) do
        C = CUDA.zeros(dtype, size, size)
        A = CUDA.rand(dtype, size, size)
        B = CUDA.rand(dtype, size, size)

        t = Inf
        for i in 1:nbench
            Δt = CUDA.@elapsed for _ in 1:nmatmuls
                mul!(C, A, B)
            end
            t = min(t, Δt)
        end

        flops = (size^3 * nmatmuls) / t
        if verbose
            printstyled("Peakflops (TFLOPS):\n"; bold=true)
            print(" └ max: ")
            printstyled(round(flops * 1e-12; digits=2), "\n"; color=:green, bold=true)
        end
        return flops
    end
end

But this "only" gives

Peakflops (TFLOPS):
 └ max: 62.44

on a A100 SXM4 40GB, which (I think) has to be compared to 156 TFLOPS that NVIDIA seems to report as peak performance. (Note that this is with CUDA.math_mode!(CUDA.FAST_MATH).)

JuliaGPU / CUDA.jl

Add CUDA.peakflops() #203