Closed: 3togo closed this issue 2 years ago
You are benchmarking cupy incorrectly. You need to synchronize cupy like you synchronize ArrayFire.
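To see why the synchronization matters, here is a stdlib-only toy (no GPU involved) of the pitfall: when a library launches work asynchronously, stopping the clock right after the launch calls return measures only the launch overhead, not the compute. The "device" below is just a single-worker thread pool standing in for an async GPU queue.

```python
import time
from concurrent.futures import ThreadPoolExecutor

# A toy "device" that, like a GPU, accepts work asynchronously:
# submitting a job returns immediately; the work runs in the background.
pool = ThreadPoolExecutor(max_workers=1)

def launch_kernel():
    # Stand-in for an async kernel launch (~50 ms of "compute").
    return pool.submit(time.sleep, 0.05)

# Wrong: the clock stops as soon as the launches return.
start = time.perf_counter()
futures = [launch_kernel() for _ in range(4)]
t_unsynced = time.perf_counter() - start
for f in futures:
    f.result()  # drain the queue so the next measurement starts clean

# Right: wait for the work to finish (the analogue of a device
# synchronize) before stopping the clock.
start = time.perf_counter()
futures = [launch_kernel() for _ in range(4)]
for f in futures:
    f.result()
t_synced = time.perf_counter() - start

print(f"without sync: {t_unsynced * 1e3:6.1f} ms")
print(f"with sync:    {t_synced * 1e3:6.1f} ms")
pool.shutdown()
```

The unsynchronized timing comes out near zero even though roughly 200 ms of work was queued, which is exactly how an unsynchronized GPU benchmark can report impossible throughput.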
@umar456, as per your suggestion, I added code to synchronize cupy as well. Still, cupy runs much faster than arrayfire.
ArrayFire v3.9.0 (CUDA, 64-bit Linux, build 014dbdfa)
Platform: CUDA Runtime 11.6, Driver: 510.60.02
[0] NVIDIA GeForce GTX 1060, 6070 MB, CUDA Compute 6.1
Benchmark N x N matrix multiply on arrayfire_cpu
Time taken for 128 x 128: 6.6770 Gflops
Max = 4117.871582 with iter count = 100
Time taken for 256 x 256: 114.9112 Gflops
Max = 8153.567383 with iter count = 100
Time taken for 384 x 384: 203.1039 Gflops
Max = 11353.426758 with iter count = 100
Time taken for 512 x 512: 338.8448 Gflops
Max = 15126.784180 with iter count = 100
Benchmark N x N matrix multiply on arrayfire_cuda
Time taken for 128 x 128: 1.3202 Gflops
Max = 4117.871582 with iter count = 100
Time taken for 256 x 256: 473.5128 Gflops
Max = 8153.573730 with iter count = 100
Time taken for 384 x 384: 737.9158 Gflops
Max = 11353.426758 with iter count = 100
Time taken for 512 x 512: 849.4986 Gflops
Max = 15126.784180 with iter count = 100
Benchmark N x N matrix multiply on numpy
Time taken for 128 x 128: 51.7874 Gflops
Max = 4117.871582 with iter count = 100
Time taken for 256 x 256: 132.1988 Gflops
Max = 8153.567383 with iter count = 100
Time taken for 384 x 384: 253.2883 Gflops
Max = 11353.426758 with iter count = 100
Time taken for 512 x 512: 254.5874 Gflops
Max = 15126.784180 with iter count = 100
Benchmark N x N matrix multiply on cupy
Time taken for 128 x 128: 1.5381 Gflops
Max = 4117.871582 with iter count = 100
Time taken for 256 x 256: 1680.2470 Gflops
Max = 8153.573730 with iter count = 100
Time taken for 384 x 384: 5650.5951 Gflops
Max = 11353.426758 with iter count = 100
Time taken for 512 x 512: 12432.6403 Gflops
Max = 15126.784180 with iter count = 100
import sys
from time import time

import arrayfire as af

try:
    import numpy as np
except ImportError:
    np = None

try:
    import cupy as cp
except ImportError:
    cp = None


def calc_arrayfire(A0, n, backend):
    af.set_backend(backend)
    A = af.interop.from_ndarray(A0).as_type(af.Dtype.f32)

    def run(iters):
        B = af.constant(0, n, n, dtype=af.Dtype.f32)
        count = 0
        for t in range(iters):
            B += af.matmul(A, A)
            count += 1
        C = af.max(B)
        return C, count

    return run


def calc_arrayfire_cpu(A0, n):
    return calc_arrayfire(A0, n, 'cpu')


def calc_arrayfire_cuda(A0, n):
    return calc_arrayfire(A0, n, 'cuda')


def calc_numpy(A0, n):
    A = A0

    def run(iters):
        B = np.zeros((n, n), np.float32)
        count = 0
        for t in range(iters):
            B += np.dot(A, A)
            count += 1
        C = np.max(B)
        return C, count

    return run


def calc_cupy(A0, n):
    s0 = cp.cuda.Stream(non_blocking=False)
    A = cp.asarray(A0)
    s0.synchronize()

    def run(iters):
        B = cp.zeros((n, n), cp.float32)
        s1 = cp.cuda.Stream(non_blocking=False)
        count = 0
        for t in range(iters):
            B += cp.dot(A, A)
            count += 1
        s1.synchronize()
        s2 = cp.cuda.Stream(non_blocking=False)
        C = cp.max(B)
        s2.synchronize()
        return C.astype(float), count

    return run


def bench(calc, upto=2048, iters=100):
    _, name = calc.__name__.split("_", 1)
    print("\nBenchmark N x N matrix multiply on %s" % name)
    np.random.seed(1)
    for n in range(128, upto + 128, 128):
        A0 = np.random.rand(n, n).astype(np.float32)
        calc1 = calc(A0, n)
        start = time()
        mmax, count = calc1(iters)
        t = (time() - start) / iters
        gflops = 2.0 * (n ** 3) / (t * 1E9)
        print("Time taken for %4d x %4d: %0.4f Gflops" % (n, n, gflops))
        print("Max = %f with iter count = %d" % (mmax, count))


if __name__ == "__main__":
    if len(sys.argv) > 1:
        af.set_device(int(sys.argv[1]))
    af.info()
    upto = 512
    bench(calc_arrayfire_cpu, upto)
    bench(calc_arrayfire_cuda, upto)
    if np:
        bench(calc_numpy, upto)
    if cp:
        bench(calc_cupy, upto)
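A note on what the printed figure measures: `bench` divides the average per-iteration time into `2 * n**3 / (t * 1e9)`, where `2 * n**3` is the flop count of one n x n matmul (n³ multiplies plus n³ adds). The in-place accumulation `B += ...` does a further n² flops per iteration that the formula ignores; a quick check shows that is negligible at these sizes:

```python
# Flop accounting for one benchmark iteration at n = 512:
# one n x n matmul costs 2*n**3 flops; the B += accumulation adds n**2 more,
# which the gflops formula in bench() does not count.
n = 512
matmul_flops = 2 * n ** 3
add_flops = n ** 2
print(f"matmul: {matmul_flops:.3e} flops, extra add: {add_flops:.3e} flops")
print(f"uncounted fraction: {add_flops / matmul_flops:.2%}")
```

For n = 512 the uncounted add is about 0.1% of the matmul cost, so the formula itself is a fair approximation; the suspicious numbers must come from the timing, not the flop count.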
There are better ways of doing what you are trying to do. Can you add eval on the B array in the loop? It should help. If you want ideal performance, you want to use the af_gemm function.
@umar456, thank you for your advice on using af.eval(); it sped up arrayfire significantly. However, it is still not comparable to cupy.
ArrayFire v3.9.0 (CUDA, 64-bit Linux, build 82110a03)
Platform: CUDA Runtime 11.6, Driver: 510.60.02
[0] NVIDIA GeForce GTX 1060, 6070 MB, CUDA Compute 6.1
Benchmark N x N matrix multiply on arrayfire_cpu
Time taken for 128 x 128: 23.1992 Gflops
Max = 4117.871582 with iter count = 100
Time taken for 256 x 256: 50.7984 Gflops
Max = 8153.567383 with iter count = 100
Time taken for 384 x 384: 145.7491 Gflops
Max = 11353.426758 with iter count = 100
Time taken for 512 x 512: 173.7908 Gflops
Max = 15126.784180 with iter count = 100
Benchmark N x N matrix multiply on arrayfire_cuda
Time taken for 128 x 128: 1.1964 Gflops
Max = 4117.871582 with iter count = 100
Time taken for 256 x 256: 1270.0793 Gflops
Max = 8153.573730 with iter count = 100
Time taken for 384 x 384: 1864.1641 Gflops
Max = 11353.426758 with iter count = 100
Time taken for 512 x 512: 1972.7711 Gflops
Max = 15126.784180 with iter count = 100
Benchmark N x N matrix multiply on numpy
Time taken for 128 x 128: 45.6241 Gflops
Max = 4117.871582 with iter count = 100
Time taken for 256 x 256: 85.3053 Gflops
Max = 8153.567383 with iter count = 100
Time taken for 384 x 384: 238.5980 Gflops
Max = 11353.426758 with iter count = 100
Time taken for 512 x 512: 236.8964 Gflops
Max = 15126.784180 with iter count = 100
Benchmark N x N matrix multiply on cupy
Time taken for 128 x 128: 3.8401 Gflops
Max = 4117.871582 with iter count = 100
Time taken for 256 x 256: 1707.7720 Gflops
Max = 8153.573730 with iter count = 100
Time taken for 384 x 384: 5199.6609 Gflops
Max = 11353.426758 with iter count = 100
Time taken for 512 x 512: 11103.5494 Gflops
Max = 15126.784180 with iter count = 100
import sys
from time import time

import arrayfire as af

try:
    import numpy as np
except ImportError:
    np = None

try:
    import cupy as cp
except ImportError:
    cp = None


def calc_arrayfire(A0, n, backend):
    af.set_backend(backend)
    A = af.interop.from_ndarray(A0).as_type(af.Dtype.f32)

    def run(iters):
        B = af.constant(0, n, n, dtype=af.Dtype.f32)
        count = 0
        for t in range(iters):
            B += af.matmul(A, A)
            count += 1
            af.eval(B)
        C = af.max(B)
        return C, count

    return run


def calc_arrayfire_cpu(A0, n):
    return calc_arrayfire(A0, n, 'cpu')


def calc_arrayfire_cuda(A0, n):
    return calc_arrayfire(A0, n, 'cuda')


def calc_numpy(A0, n):
    A = A0

    def run(iters):
        B = np.zeros((n, n), np.float32)
        count = 0
        for t in range(iters):
            B += np.dot(A, A)
            count += 1
        C = np.max(B)
        return C, count

    return run


def calc_cupy(A0, n):
    s0 = cp.cuda.Stream(non_blocking=False)
    A = cp.asarray(A0)
    s0.synchronize()

    def run(iters):
        B = cp.zeros((n, n), cp.float32)
        s1 = cp.cuda.Stream(non_blocking=False)
        count = 0
        for t in range(iters):
            B += cp.dot(A, A)
            count += 1
        s1.synchronize()
        s2 = cp.cuda.Stream(non_blocking=False)
        C = cp.max(B)
        s2.synchronize()
        return C.astype(float), count

    return run


def bench(calc, upto=2048, iters=100):
    _, name = calc.__name__.split("_", 1)
    print("\nBenchmark N x N matrix multiply on %s" % name)
    np.random.seed(1)
    for n in range(128, upto + 128, 128):
        A0 = np.random.rand(n, n).astype(np.float32)
        calc1 = calc(A0, n)
        start = time()
        mmax, count = calc1(iters)
        t = (time() - start) / iters
        gflops = 2.0 * (n ** 3) / (t * 1E9)
        print("Time taken for %4d x %4d: %0.4f Gflops" % (n, n, gflops))
        print("Max = %f with iter count = %d" % (mmax, count))


if __name__ == "__main__":
    if len(sys.argv) > 1:
        af.set_device(int(sys.argv[1]))
    af.info()
    upto = 512
    bench(calc_arrayfire_cpu, upto)
    bench(calc_arrayfire_cuda, upto)
    if np:
        bench(calc_numpy, upto)
    if cp:
        bench(calc_cupy, upto)
I think you are still profiling the cupy version incorrectly. The maximum fp32 compute your card is capable of is about 4300 GFLOPs, but you are getting close to 11,000 GFLOPs, and you are doing additional work on top of the matmul. I suggest you look into what is actually being measured by your benchmark using a profiler.
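The peak figure can be sanity-checked from NVIDIA's published specifications for the GTX 1060 (treat the numbers below as approximate): 1280 CUDA cores, roughly 1.71 GHz boost clock, and 2 flops per core per cycle from a fused multiply-add.

```python
# Back-of-envelope theoretical fp32 peak for a GTX 1060
# (published specs, approximate): 1280 CUDA cores, ~1.71 GHz boost,
# 2 FLOPs per core per cycle (one FMA = multiply + add).
cores = 1280
boost_hz = 1.71e9
flops_per_cycle = 2

peak_gflops = cores * boost_hz * flops_per_cycle / 1e9
print(f"theoretical fp32 peak: {peak_gflops:.0f} GFLOPs")

# The cupy result reported above for 512 x 512 (~12,400 GFLOPs)
# sits well above this ceiling, so the measurement cannot be right.
reported = 12432.6
print(f"reported / peak = {reported / peak_gflops:.1f}x")
```

The reported cupy number is roughly 3x the theoretical ceiling, which is only possible if the timed region ends before the GPU work has actually finished.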
Tracing the profile dumps, I can't find any clue to solve my puzzle: af_cuda.txt cupy.txt af_cpu.txt np.txt
The streams you were synchronizing were not being used, so the synchronizations were not doing anything. You needed to use cp.cuda.Device(0).synchronize()
cat blah.py
import sys
from time import time

import arrayfire as af

try:
    import numpy as np
except ImportError:
    np = None

try:
    import cupy as cp
except ImportError:
    cp = None


def calc_arrayfire(A0, n, backend):
    af.set_backend(backend)
    A = af.interop.from_ndarray(A0).as_type(af.Dtype.f32)
    af.sync()

    def run(iters):
        B = af.constant(0, n, n, dtype=af.Dtype.f32)
        count = 0
        af.sync()
        for t in range(iters):
            B += af.matmul(A, A)
            count += 1
            af.eval(B)
        C = af.max(B)
        af.sync()
        return C, count

    return run


def calc_arrayfire_cpu(A0, n):
    return calc_arrayfire(A0, n, 'cpu')


def calc_arrayfire_cuda(A0, n):
    return calc_arrayfire(A0, n, 'cuda')


def calc_numpy(A0, n):
    A = A0

    def run(iters):
        B = np.zeros((n, n), np.float32)
        count = 0
        for t in range(iters):
            B += np.dot(A, A)
            count += 1
        C = np.max(B)
        return C, count

    return run


def calc_cupy(A0, n):
    A = cp.asarray(A0)
    cp.cuda.Device(0).synchronize()

    def run(iters):
        B = cp.zeros((n, n), cp.float32)
        count = 0
        cp.cuda.Device(0).synchronize()
        for t in range(iters):
            B += cp.dot(A, A)
            count += 1
        C = cp.max(B)
        cp.cuda.Device(0).synchronize()
        return C.astype(float), count

    return run


def bench(calc, upto=2048, iters=100):
    _, name = calc.__name__.split("_", 1)
    print("\nBenchmark N x N matrix multiply on %s" % name)
    np.random.seed(1)
    for n in range(128, upto + 128, 128):
        A0 = np.random.rand(n, n).astype(np.float32)
        calc1 = calc(A0, n)
        start = time()
        mmax, count = calc1(iters)
        t = (time() - start) / iters
        gflops = 2.0 * (n ** 3) / (t * 1E9)
        print("Time taken for %4d x %4d: %0.4f Gflops" % (n, n, gflops))
        print("Max = %f with iter count = %d" % (mmax, count))


if __name__ == "__main__":
    if len(sys.argv) > 1:
        af.set_device(int(sys.argv[1]))
    af.info()
    upto = 512
    bench(calc_arrayfire_cpu, upto)
    bench(calc_arrayfire_cuda, upto)
    if np:
        bench(calc_numpy, upto)
    if cp:
        bench(calc_cupy, upto)
python blah.py
ArrayFire v3.9.0 (CUDA, 64-bit Linux, build 1d03d07)
Platform: CUDA Runtime 11.4, Driver: 510.60.02
[0] Quadro T2000, 3915 MB, CUDA Compute 7.5
Benchmark N x N matrix multiply on arrayfire_cpu
Time taken for 128 x 128: 12.7640 Gflops
Max = 4117.871582 with iter count = 100
Time taken for 256 x 256: 122.4049 Gflops
Max = 8153.567383 with iter count = 100
Time taken for 384 x 384: 189.4583 Gflops
Max = 11353.426758 with iter count = 100
Time taken for 512 x 512: 237.9744 Gflops
Max = 15126.784180 with iter count = 100
Benchmark N x N matrix multiply on arrayfire_cuda
Time taken for 128 x 128: 88.2345 Gflops
Max = 4117.871582 with iter count = 100
Time taken for 256 x 256: 627.5080 Gflops
Max = 8153.573730 with iter count = 100
Time taken for 384 x 384: 1072.7912 Gflops
Max = 11353.426758 with iter count = 100
Time taken for 512 x 512: 1326.2108 Gflops
Max = 15126.784180 with iter count = 100
Benchmark N x N matrix multiply on numpy
Time taken for 128 x 128: 27.5433 Gflops
Max = 4117.871582 with iter count = 100
Time taken for 256 x 256: 205.1417 Gflops
Max = 8153.567383 with iter count = 100
Time taken for 384 x 384: 261.1695 Gflops
Max = 11353.426758 with iter count = 100
Time taken for 512 x 512: 322.3793 Gflops
Max = 15126.783203 with iter count = 100
Benchmark N x N matrix multiply on cupy
Time taken for 128 x 128: 0.9067 Gflops
Max = 4117.871582 with iter count = 100
Time taken for 256 x 256: 614.8427 Gflops
Max = 8153.573730 with iter count = 100
Time taken for 384 x 384: 1052.3507 Gflops
Max = 11353.426758 with iter count = 100
Time taken for 512 x 512: 1296.7014 Gflops
Max = 15126.784180 with iter count = 100
I modified the example code "bench_blas.py" and found that cupy is at least 5 times faster than arrayfire_cuda. But why?
Below is the console output: