m4rs-mt / ILGPU

ILGPU JIT Compiler for high-performance .Net GPU programs
http://www.ilgpu.net

NVLink not working in ILGPU #660

Closed. Ruberik closed this issue 2 years ago.

Ruberik commented 2 years ago

I'll preface this by saying that it's probably working sometimes, since otherwise you wouldn't have closed @yurygotham's #378. But I can say fairly confidently that it isn't working from ILGPU on an Azure ND40rs_v2 machine, while it does work from C code.

Here's a snippet of the output of my C# program using ILGPU:

Rate 0->4: 10,641,553,803 bytes / second
Rate 0->5: 10,639,075,942 bytes / second
Rate 0->6: 8,126,724,309 bytes / second

Here's a snippet of the output of my adapted version of NVIDIA's simpleP2P sample:

cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU4: 22.52GB/s
cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU5: 44.87GB/s
cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU6: 44.87GB/s
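
(For scale, 10.6 × 10⁹ bytes/s is roughly 9.9 GiB/s, versus the 22.5 and 44.9 GiB/s that the CUDA sample measures over the same GPU pairs.)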

Code follows. Output is attached as [cuda-output.txt](https://github.com/m4rs-mt/ILGPU/files/7419917/cuda-output.txt) and [csharp-output.txt](https://github.com/m4rs-mt/ILGPU/files/7419918/csharp-output.txt).

C# Program

        public static void TestMultiGpuCopy() {
            var startTime = DateTime.Now;
            void LogTime(string s) {
                var timeSpent = DateTime.Now.Subtract(startTime).TotalSeconds;
                Console.WriteLine($"{DateTime.Now}\tTime to {s}: {timeSpent:0.0000}s");
                startTime = DateTime.Now;
            }

            int simultaneousCopies = 2;  // <-- changing to 1 does not affect the output
            long bufferSize = 256 * 1024 * 1024; // 1GB. We'll use 1GB * simultaneousCopies * 8 per accelerator.
            void LogRate(string description) {
                var timeSpent = DateTime.Now.Subtract(startTime).TotalSeconds;
                Console.WriteLine("Rate " + description + ": " +
                    (bufferSize * simultaneousCopies * sizeof(int) / timeSpent).ToString("0,000" + " bytes / second"));
                startTime = DateTime.Now;
            }
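            // Note: with the values above, each measured transfer moves bufferSize * simultaneousCopies * sizeof(int) = 2 GiB.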

            using var context = Context.Create().Cuda().EnableAlgorithms().ToContext();
            var accs = Enumerable.Range(0, 8).Select(i => context.CreateCudaAccelerator(i)).ToArray();
            //var accs = new[] { 0, 5 }.Select(i => context.CreateCudaAccelerator(i)).ToArray();
            var streams = accs.Select(acc => Enumerable.Range(0, simultaneousCopies).Select(_ => acc.CreateStream()).ToArray()).ToArray();

            var buffers = new MemoryBuffer1D<int, Stride1D.Dense>[accs.Length][][];

            for (int accId = 0; accId < accs.Length; accId++) {
                buffers[accId] = new MemoryBuffer1D<int, Stride1D.Dense>[accs.Length][];
                for (int bufferId = 0; bufferId < accs.Length; bufferId++) {
                    buffers[accId][bufferId] = Enumerable.Range(0, simultaneousCopies).Select(_ => {
                        //Console.WriteLine($"Allocating {bufferSize} ints on accelerator {accId}");
                        return accs[accId].Allocate1D<int>(bufferSize);
                    }).ToArray();
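                    // Fill each accelerator's own buffers with its id and every other buffer with -1,
                    // so the "Buffers:" dumps below show where data actually ends up.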
                    for (int streamId = 0; streamId < simultaneousCopies; streamId++) {
                        if (bufferId == accId) accs[accId].Initialize(streams[accId][streamId], buffers[accId][bufferId][streamId].AsContiguous(), accId);
                        else accs[accId].Initialize(streams[accId][streamId], buffers[accId][bufferId][streamId].AsContiguous(), -1);
                    }
                }

                LogTime("allocate and initialize buffers on accelerator #" + accId);
            }

            PrintBuffers(buffers);

            // print peer access
            Console.WriteLine("Reporting Access: ");
            for (int acc1 = 0; acc1 < accs.Length; acc1++) {
                for (int acc2 = 0; acc2 < accs.Length; acc2++) {
                    //Console.WriteLine(acc1 + " " + acc2 + " " + accs[acc1].CanAccessPeer(accs[acc2]));
                    Console.Write(accs[acc1].CanAccessPeer(accs[acc2]) + "\t");
                }
                Console.WriteLine();
            }

            Console.WriteLine("Enabling Access: ");
            for (int acc1 = 0; acc1 < accs.Length; acc1++) {
                for (int acc2 = 0; acc2 < accs.Length; acc2++) {
                    var enabled = accs[acc1].EnablePeerAccess(accs[acc2]);
                    Console.WriteLine(acc1 + " " + acc2 + " " + enabled);
                }
            }
            Console.WriteLine();

            // print peer access
            Console.WriteLine("Reporting Access: ");
            for (int acc1 = 0; acc1 < accs.Length; acc1++) {
                for (int acc2 = 0; acc2 < accs.Length; acc2++) {
                    //Console.WriteLine(acc1 + " " + acc2 + " " + accs[acc1].CanAccessPeer(accs[acc2]));
                    Console.Write(accs[acc1].CanAccessPeer(accs[acc2]) + "\t");
                }
                Console.WriteLine();
            }

            // parallel copies, different streams
            Console.WriteLine("testing copy: ");
            for (int acc1 = 0; acc1 < accs.Length; acc1++) {
                for (int acc2 = 0; acc2 < accs.Length; acc2++)
                    if (acc1 != acc2) {
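                        // Copy acc1's own data into acc2's buffer slot reserved for acc1, one transfer per stream in parallel.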
                        Parallel.For(0, simultaneousCopies, streamId => {
                            buffers[acc1][acc1][streamId].CopyTo(streams[acc1][streamId], buffers[acc2][acc1][streamId]);
                        });
                        LogRate(acc1 + "->" + acc2);
                        //LogTime("copy " + acc1 + " to " + acc2);
                    }
                Console.WriteLine();
            }

            PrintBuffers(buffers);

            for (int accId = 0; accId < accs.Length; accId++) {
                accs[accId].Dispose();
            }
        }

        public static unsafe void PrintBuffers(MemoryBuffer1D<int, Stride1D.Dense>[][][] buffers) {
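            // Reads back the first element of every buffer; this only exists to consume the data
            // so the copies cannot be optimized away (see the note in the follow-up comment below).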
            Console.WriteLine("Buffers: ");
            for (int i = 0; i < buffers.Length; i++) {
                for (int j = 0; j < buffers[i].Length; j++) {
                    for (int k = 0; k < buffers[i][j].Length; k++) {
                        var value = new int[1];
                        fixed (int* v = value) {
                            buffers[i][j][k].AsContiguous().CopyToCPU(ref Unsafe.AsRef<int>(v), 1);
                            Console.Write(value[0] + "\t");
                        }
                    }
                }
                Console.WriteLine();
            }
        }

C program

Note that I took NVIDIA's sample code, modified simpleP2P.cu to contain the following, and built it.

// Notice required by NVIDIA: This software contains source code provided by NVIDIA Corporation.

// includes, system
#include <stdlib.h>
#include <stdio.h>

// CUDA includes
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h>  // helper for shared that are common to CUDA Samples

__global__ void SimpleKernel(float *src, float *dst)
{
    // Just a dummy kernel, doing enough for us to verify that everything
    // worked
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx] = src[idx] * 2.0f;
}

inline bool IsAppBuiltAs64()
{
    return sizeof(void*) == 8;
}

int main(int argc, char **argv)
{
    printf("[%s] - Starting...\n", argv[0]);

    if (!IsAppBuiltAs64())
    {
        printf("%s is only supported with on 64-bit OSs and the application must be built as a 64-bit target.  Test is being waived.\n", argv[0]);
        exit(EXIT_WAIVED);
    }

    // Number of GPUs
    printf("Checking for multiple GPUs...\n");
    int gpu_n;
    checkCudaErrors(cudaGetDeviceCount(&gpu_n));
    printf("CUDA-capable device count: %i\n", gpu_n);

    if (gpu_n < 2)
    {
        printf("Two or more GPUs with Peer-to-Peer access capability are required for %s.\n", argv[0]);
        printf("Waiving test.\n");
        exit(EXIT_WAIVED);
    }

    // Query device properties
    cudaDeviceProp prop[64];

    for (int i=0; i < gpu_n; i++)
    {
        checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
    }
    // Check possibility for peer access
    printf("\nChecking GPU(s) for support of peer to peer memory access...\n");

    int can_access_peer;
    bool is_match[64][64]; // Records which GPU pairs support P2P access

    // Show all the combinations of supported P2P GPUs
    for (int i = 0; i < gpu_n; i++)
    {
        for (int j = 0; j < gpu_n; j++)
        {
            if (i == j) {
                is_match[i][j] = false;
            } else {
                checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j));
                printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[i].name, i,
                    prop[j].name, j, can_access_peer ? "Yes" : "No");
                is_match[i][j] = can_access_peer;
            }
        }
    }
    int error_count = 0;

    for (int i = 0; i < gpu_n; i++) {
        for (int j = 0; j < gpu_n; j++) {
            if (!is_match[i][j]) continue;
            int gpuid[2];
            gpuid[0] = i;
            gpuid[1] = j;

            // Enable peer access
            //printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], gpuid[1]);
            checkCudaErrors(cudaSetDevice(gpuid[0]));
            checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
            checkCudaErrors(cudaSetDevice(gpuid[1]));
            checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0));

            // Allocate buffers
            const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
            //printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
            checkCudaErrors(cudaSetDevice(gpuid[0]));
            float* g0;
            checkCudaErrors(cudaMalloc(&g0, buf_size));
            checkCudaErrors(cudaSetDevice(gpuid[1]));
            float* g1;
            checkCudaErrors(cudaMalloc(&g1, buf_size));
            float* h0;
            checkCudaErrors(cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

            // Create CUDA event handles
            //printf("Creating event handles...\n");
            cudaEvent_t start_event, stop_event;
            float time_memcpy;
            int eventflags = cudaEventBlockingSync;
            checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
            checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));

            // P2P memcopy() benchmark
            checkCudaErrors(cudaEventRecord(start_event, 0));

            for (int i = 0; i < 100; i++)
            {
                // With UVA we don't need to specify source and target devices, the
                // runtime figures this out by itself from the pointers
                // Ping-pong copy between GPUs -- ACTUALLY don't, just copy in one direction. The speed doesn't change
                // if you copy back and forth, though.
                //if (i % 2 == 0)
                //{
                //    checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
                //}
                //else
                //{
                    checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
                //}
            }

            checkCudaErrors(cudaEventRecord(stop_event, 0));
            checkCudaErrors(cudaEventSynchronize(stop_event));
            checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
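            // 100 copies of buf_size bytes in time_memcpy milliseconds, reported as GiB/s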
            printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n", gpuid[0], gpuid[1],
                (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);

            // Prepare host buffer and copy to GPU 0
            //printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]);

            for (int i = 0; i < buf_size / sizeof(float); i++)
            {
                h0[i] = float(i % 4096);
            }

            checkCudaErrors(cudaSetDevice(gpuid[0]));
            checkCudaErrors(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));

            // Kernel launch configuration
            const dim3 threads(512, 1);
            const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

            // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
            // output to the GPU 1 buffer
            //printf("Run kernel on GPU%d, taking source data from GPU%d and writing to GPU%d...\n",
            //    gpuid[1], gpuid[0], gpuid[1]);
            checkCudaErrors(cudaSetDevice(gpuid[1]));
            SimpleKernel<<<blocks, threads>>>(g0, g1);

            checkCudaErrors(cudaDeviceSynchronize());

            // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
            // output to the GPU 0 buffer
            //printf("Run kernel on GPU%d, taking source data from GPU%d and writing to GPU%d...\n",
            //    gpuid[0], gpuid[1], gpuid[0]);
            checkCudaErrors(cudaSetDevice(gpuid[0]));
            SimpleKernel<<<blocks, threads>>>(g1, g0);

            checkCudaErrors(cudaDeviceSynchronize());

            // Copy data back to host and verify
            //printf("Copy data back to host from GPU%d and verify results...\n", gpuid[0]);
            checkCudaErrors(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault));

            for (int i = 0; i < buf_size / sizeof(float); i++)
            {
                // Re-generate input data and apply 2x '* 2.0f' computation of both
                // kernel runs
                if (h0[i] != float(i % 4096) * 2.0f * 2.0f)
                {
                    printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i], (float(i % 4096) * 2.0f * 2.0f));

                    if (error_count++ > 10)
                    {
                        break;
                    }
                }
            }

            // Disable peer access (also unregisters memory for non-UVA cases)
            //printf("Disabling peer access...\n");
            checkCudaErrors(cudaSetDevice(gpuid[0]));
            checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1]));
            checkCudaErrors(cudaSetDevice(gpuid[1]));
            checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0]));

            // Cleanup and shutdown
            //printf("Shutting down...\n");
            checkCudaErrors(cudaEventDestroy(start_event));
            checkCudaErrors(cudaEventDestroy(stop_event));
            checkCudaErrors(cudaSetDevice(gpuid[0]));
            checkCudaErrors(cudaFree(g0));
            checkCudaErrors(cudaSetDevice(gpuid[1]));
            checkCudaErrors(cudaFree(g1));
            checkCudaErrors(cudaFreeHost(h0));
        }
    }
    for (int i = 0; i < gpu_n; i++)
    {
        checkCudaErrors(cudaSetDevice(i));
    }
    if (error_count != 0)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }
    else
    {
        printf("Test passed\n");
        exit(EXIT_SUCCESS);
    }
}
Ruberik commented 2 years ago

Since I can't figure out how to see the files I tried to attach, I've pasted the output here.

C# output

Note that any section labeled "Buffers:" can be ignored; it's only there to consume the output data and make sure the various calls don't get optimized away.

10/26/2021 12:35:50 PM  Time to allocate and initialize buffers on accelerator #0: 2.1168s
10/26/2021 12:35:50 PM  Time to allocate and initialize buffers on accelerator #1: 0.0433s
10/26/2021 12:35:50 PM  Time to allocate and initialize buffers on accelerator #2: 0.0440s
10/26/2021 12:35:50 PM  Time to allocate and initialize buffers on accelerator #3: 0.0465s
10/26/2021 12:35:50 PM  Time to allocate and initialize buffers on accelerator #4: 0.0480s
10/26/2021 12:35:50 PM  Time to allocate and initialize buffers on accelerator #5: 0.0482s
10/26/2021 12:35:50 PM  Time to allocate and initialize buffers on accelerator #6: 0.0471s
10/26/2021 12:35:50 PM  Time to allocate and initialize buffers on accelerator #7: 0.0464s
Buffers: 
0   0   -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  
-1  -1  1   1   -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  
-1  -1  -1  -1  2   2   -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  
-1  -1  -1  -1  -1  -1  3   3   -1  -1  -1  -1  -1  -1  -1  -1  
-1  -1  -1  -1  -1  -1  -1  -1  4   4   -1  -1  -1  -1  -1  -1  
-1  -1  -1  -1  -1  -1  -1  -1  -1  -1  5   5   -1  -1  -1  -1  
-1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  6   6   -1  -1  
-1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  7   7   
Reporting Access: 
False   True    False   False   True    True    True    False   
True    False   True    True    False   False   False   True    
False   True    False   True    False   True    False   True    
False   True    True    False   True    False   False   True    
True    False   False   True    False   True    True    False   
True    False   True    False   True    False   True    False   
True    False   False   False   True    True    False   True    
False   True    True    True    False   False   True    False   
Enabling Access: 
0 0 True
0 1 True
0 2 True
0 3 True
0 4 True
0 5 True
0 6 True
0 7 True
1 0 True
1 1 True
1 2 True
1 3 True
1 4 True
1 5 True
1 6 True
1 7 True
2 0 True
2 1 True
2 2 True
2 3 True
2 4 True
2 5 True
2 6 True
2 7 True
3 0 True
3 1 True
3 2 True
3 3 True
3 4 True
3 5 True
3 6 True
3 7 True
4 0 True
4 1 True
4 2 True
4 3 True
4 4 True
4 5 True
4 6 True
4 7 True
5 0 True
5 1 True
5 2 True
5 3 True
5 4 True
5 5 True
5 6 True
5 7 True
6 0 True
6 1 True
6 2 True
6 3 True
6 4 True
6 5 True
6 6 True
6 7 True
7 0 True
7 1 True
7 2 True
7 3 True
7 4 True
7 5 True
7 6 True
7 7 True

Reporting Access: 
False   True    False   False   True    True    True    False   
True    False   True    True    False   False   False   True    
False   True    False   True    False   True    False   True    
False   True    True    False   True    False   False   True    
True    False   False   True    False   True    True    False   
True    False   True    False   True    False   True    False   
True    False   False   False   True    True    False   True    
False   True    True    True    False   False   True    False   
testing copy: 
Rate 0->1: 3,941,520,148 bytes / second
Rate 0->2: 9,902,384,789 bytes / second
Rate 0->3: 9,884,859,033 bytes / second
Rate 0->4: 10,641,553,803 bytes / second
Rate 0->5: 10,639,075,942 bytes / second
Rate 0->6: 8,126,724,309 bytes / second
Rate 0->7: 10,641,189,959 bytes / second

Rate 1->0: 12,568,195,919 bytes / second
Rate 1->2: 10,648,851,740 bytes / second
Rate 1->3: 10,645,267,482 bytes / second
Rate 1->4: 10,636,978,570 bytes / second
Rate 1->5: 10,623,786,223 bytes / second
Rate 1->6: 10,665,590,819 bytes / second
Rate 1->7: 8,131,401,603 bytes / second

Rate 2->0: 12,600,503,484 bytes / second
Rate 2->1: 10,639,523,980 bytes / second
Rate 2->3: 8,128,499,201 bytes / second
Rate 2->4: 10,612,010,802 bytes / second
Rate 2->5: 10,643,990,612 bytes / second
Rate 2->6: 10,642,782,617 bytes / second
Rate 2->7: 10,648,460,997 bytes / second

Rate 3->0: 10,652,248,190 bytes / second
Rate 3->1: 10,636,056,620 bytes / second
Rate 3->2: 8,106,094,765 bytes / second
Rate 3->4: 10,658,746,036 bytes / second
Rate 3->5: 10,638,385,509 bytes / second
Rate 3->6: 10,632,175,649 bytes / second
Rate 3->7: 10,649,691,407 bytes / second

Rate 4->0: 12,602,736,696 bytes / second
Rate 4->1: 10,633,491,808 bytes / second
Rate 4->2: 10,641,986,228 bytes / second
Rate 4->3: 10,646,871,918 bytes / second
Rate 4->5: 8,123,173,777 bytes / second
Rate 4->6: 9,891,456,379 bytes / second
Rate 4->7: 10,615,819,337 bytes / second

Rate 5->0: 10,665,908,655 bytes / second
Rate 5->1: 10,631,391,374 bytes / second
Rate 5->2: 10,647,114,736 bytes / second
Rate 5->3: 10,634,655,563 bytes / second
Rate 5->4: 8,127,311,752 bytes / second
Rate 5->6: 10,626,362,129 bytes / second
Rate 5->7: 10,641,042,320 bytes / second

Rate 6->0: 9,226,817,516 bytes / second
Rate 6->1: 10,625,100,305 bytes / second
Rate 6->2: 10,642,260,468 bytes / second
Rate 6->3: 10,647,262,544 bytes / second
Rate 6->4: 10,640,341,088 bytes / second
Rate 6->5: 10,644,169,989 bytes / second
Rate 6->7: 10,650,018,860 bytes / second

Rate 7->0: 11,536,673,190 bytes / second
Rate 7->1: 8,133,267,868 bytes / second
Rate 7->2: 10,628,144,968 bytes / second
Rate 7->3: 10,639,740,106 bytes / second
Rate 7->4: 10,634,892,558 bytes / second
Rate 7->5: 10,644,497,103 bytes / second
Rate 7->6: 10,649,147,457 bytes / second

Buffers: 
0   0   1   1   2   2   3   3   4   4   5   5   6   6   7   7   
0   0   1   1   2   2   3   3   4   4   5   5   6   6   7   7   
0   0   1   1   2   2   3   3   4   4   5   5   6   6   7   7   
0   0   1   1   2   2   3   3   4   4   5   5   6   6   7   7   
0   0   1   1   2   2   3   3   4   4   5   5   6   6   7   7   
0   0   1   1   2   2   3   3   4   4   5   5   6   6   7   7   
0   0   1   1   2   2   3   3   4   4   5   5   6   6   7   7   
0   0   1   1   2   2   3   3   4   4   5   5   6   6   7   7   

C output

[Y:\test\simpleP2P.exe] - Starting...
Checking for multiple GPUs...
CUDA-capable device count: 8

Checking GPU(s) for support of peer to peer memory access...
> Peer access from Tesla V100-SXM2-32GB (GPU0) -> Tesla V100-SXM2-32GB (GPU1) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU0) -> Tesla V100-SXM2-32GB (GPU2) : No
> Peer access from Tesla V100-SXM2-32GB (GPU0) -> Tesla V100-SXM2-32GB (GPU3) : No
> Peer access from Tesla V100-SXM2-32GB (GPU0) -> Tesla V100-SXM2-32GB (GPU4) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU0) -> Tesla V100-SXM2-32GB (GPU5) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU0) -> Tesla V100-SXM2-32GB (GPU6) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU0) -> Tesla V100-SXM2-32GB (GPU7) : No
> Peer access from Tesla V100-SXM2-32GB (GPU1) -> Tesla V100-SXM2-32GB (GPU0) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU1) -> Tesla V100-SXM2-32GB (GPU2) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU1) -> Tesla V100-SXM2-32GB (GPU3) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU1) -> Tesla V100-SXM2-32GB (GPU4) : No
> Peer access from Tesla V100-SXM2-32GB (GPU1) -> Tesla V100-SXM2-32GB (GPU5) : No
> Peer access from Tesla V100-SXM2-32GB (GPU1) -> Tesla V100-SXM2-32GB (GPU6) : No
> Peer access from Tesla V100-SXM2-32GB (GPU1) -> Tesla V100-SXM2-32GB (GPU7) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU2) -> Tesla V100-SXM2-32GB (GPU0) : No
> Peer access from Tesla V100-SXM2-32GB (GPU2) -> Tesla V100-SXM2-32GB (GPU1) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU2) -> Tesla V100-SXM2-32GB (GPU3) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU2) -> Tesla V100-SXM2-32GB (GPU4) : No
> Peer access from Tesla V100-SXM2-32GB (GPU2) -> Tesla V100-SXM2-32GB (GPU5) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU2) -> Tesla V100-SXM2-32GB (GPU6) : No
> Peer access from Tesla V100-SXM2-32GB (GPU2) -> Tesla V100-SXM2-32GB (GPU7) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU3) -> Tesla V100-SXM2-32GB (GPU0) : No
> Peer access from Tesla V100-SXM2-32GB (GPU3) -> Tesla V100-SXM2-32GB (GPU1) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU3) -> Tesla V100-SXM2-32GB (GPU2) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU3) -> Tesla V100-SXM2-32GB (GPU4) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU3) -> Tesla V100-SXM2-32GB (GPU5) : No
> Peer access from Tesla V100-SXM2-32GB (GPU3) -> Tesla V100-SXM2-32GB (GPU6) : No
> Peer access from Tesla V100-SXM2-32GB (GPU3) -> Tesla V100-SXM2-32GB (GPU7) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU4) -> Tesla V100-SXM2-32GB (GPU0) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU4) -> Tesla V100-SXM2-32GB (GPU1) : No
> Peer access from Tesla V100-SXM2-32GB (GPU4) -> Tesla V100-SXM2-32GB (GPU2) : No
> Peer access from Tesla V100-SXM2-32GB (GPU4) -> Tesla V100-SXM2-32GB (GPU3) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU4) -> Tesla V100-SXM2-32GB (GPU5) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU4) -> Tesla V100-SXM2-32GB (GPU6) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU4) -> Tesla V100-SXM2-32GB (GPU7) : No
> Peer access from Tesla V100-SXM2-32GB (GPU5) -> Tesla V100-SXM2-32GB (GPU0) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU5) -> Tesla V100-SXM2-32GB (GPU1) : No
> Peer access from Tesla V100-SXM2-32GB (GPU5) -> Tesla V100-SXM2-32GB (GPU2) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU5) -> Tesla V100-SXM2-32GB (GPU3) : No
> Peer access from Tesla V100-SXM2-32GB (GPU5) -> Tesla V100-SXM2-32GB (GPU4) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU5) -> Tesla V100-SXM2-32GB (GPU6) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU5) -> Tesla V100-SXM2-32GB (GPU7) : No
> Peer access from Tesla V100-SXM2-32GB (GPU6) -> Tesla V100-SXM2-32GB (GPU0) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU6) -> Tesla V100-SXM2-32GB (GPU1) : No
> Peer access from Tesla V100-SXM2-32GB (GPU6) -> Tesla V100-SXM2-32GB (GPU2) : No
> Peer access from Tesla V100-SXM2-32GB (GPU6) -> Tesla V100-SXM2-32GB (GPU3) : No
> Peer access from Tesla V100-SXM2-32GB (GPU6) -> Tesla V100-SXM2-32GB (GPU4) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU6) -> Tesla V100-SXM2-32GB (GPU5) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU6) -> Tesla V100-SXM2-32GB (GPU7) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU7) -> Tesla V100-SXM2-32GB (GPU0) : No
> Peer access from Tesla V100-SXM2-32GB (GPU7) -> Tesla V100-SXM2-32GB (GPU1) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU7) -> Tesla V100-SXM2-32GB (GPU2) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU7) -> Tesla V100-SXM2-32GB (GPU3) : Yes
> Peer access from Tesla V100-SXM2-32GB (GPU7) -> Tesla V100-SXM2-32GB (GPU4) : No
> Peer access from Tesla V100-SXM2-32GB (GPU7) -> Tesla V100-SXM2-32GB (GPU5) : No
> Peer access from Tesla V100-SXM2-32GB (GPU7) -> Tesla V100-SXM2-32GB (GPU6) : Yes
cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU1: 22.52GB/s
cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU4: 22.52GB/s
cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU5: 44.87GB/s
cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU6: 44.87GB/s
cudaMemcpyPeer / cudaMemcpy between GPU1 and GPU0: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU1 and GPU2: 44.88GB/s
cudaMemcpyPeer / cudaMemcpy between GPU1 and GPU3: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU1 and GPU7: 44.88GB/s
cudaMemcpyPeer / cudaMemcpy between GPU2 and GPU1: 44.90GB/s
cudaMemcpyPeer / cudaMemcpy between GPU2 and GPU3: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU2 and GPU5: 44.88GB/s
cudaMemcpyPeer / cudaMemcpy between GPU2 and GPU7: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU3 and GPU1: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU3 and GPU2: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU3 and GPU4: 44.90GB/s
cudaMemcpyPeer / cudaMemcpy between GPU3 and GPU7: 44.91GB/s
cudaMemcpyPeer / cudaMemcpy between GPU4 and GPU0: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU4 and GPU3: 44.90GB/s
cudaMemcpyPeer / cudaMemcpy between GPU4 and GPU5: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU4 and GPU6: 44.88GB/s
cudaMemcpyPeer / cudaMemcpy between GPU5 and GPU0: 44.90GB/s
cudaMemcpyPeer / cudaMemcpy between GPU5 and GPU2: 44.91GB/s
cudaMemcpyPeer / cudaMemcpy between GPU5 and GPU4: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU5 and GPU6: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU6 and GPU0: 44.88GB/s
cudaMemcpyPeer / cudaMemcpy between GPU6 and GPU4: 44.88GB/s
cudaMemcpyPeer / cudaMemcpy between GPU6 and GPU5: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU6 and GPU7: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU7 and GPU1: 44.90GB/s
cudaMemcpyPeer / cudaMemcpy between GPU7 and GPU2: 22.53GB/s
cudaMemcpyPeer / cudaMemcpy between GPU7 and GPU3: 44.90GB/s
cudaMemcpyPeer / cudaMemcpy between GPU7 and GPU6: 22.53GB/s
Test passed
m4rs-mt commented 2 years ago

@Ruberik Thank you very much for analyzing this issue in depth. I will try to reproduce it on an NVLink-capable machine next week and let you know the details. I currently believe the problem might be related to one of the memcpy functions we leverage from the Cuda API...

MoFtZ commented 2 years ago

hi @Ruberik, I've put up a PR that attempts to fix the copy performance. Are you able to try it out? I don't have NVLink devices to test the behaviour.

Ruberik commented 2 years ago

Thanks for working on this! It doesn't seem to have worked:

Rate 0->1: 12,172,984,374 bytes / second
Rate 0->2: 11,491,944,864 bytes / second
Rate 0->3: 12,466,995,724 bytes / second
Rate 0->4: 11,477,854,643 bytes / second
Rate 0->5: 11,473,243,221 bytes / second
Rate 0->6: 8,593,898,098 bytes / second
Rate 0->7: 11,452,348,259 bytes / second

Rate 1->0: 13,735,394,786 bytes / second
Rate 1->2: 11,488,551,206 bytes / second
Rate 1->3: 12,470,572,118 bytes / second
Rate 1->4: 11,476,903,847 bytes / second
Rate 1->5: 11,477,511,111 bytes / second
Rate 1->6: 12,468,675,071 bytes / second
Rate 1->7: 8,606,724,590 bytes / second

Rate 2->0: 15,213,968,818 bytes / second
Rate 2->1: 11,486,935,006 bytes / second
Rate 2->3: 8,603,762,559 bytes / second
Rate 2->4: 11,450,552,953 bytes / second
Rate 2->5: 12,471,716,417 bytes / second
Rate 2->6: 11,480,106,511 bytes / second
Rate 2->7: 12,475,215,801 bytes / second

Rate 3->0: 11,488,723,300 bytes / second
Rate 3->1: 12,474,940,416 bytes / second
Rate 3->2: 8,606,145,127 bytes / second
Rate 3->4: 11,451,462,748 bytes / second
Rate 3->5: 11,483,145,171 bytes / second
Rate 3->6: 12,472,339,352 bytes / second
Rate 3->7: 11,482,340,843 bytes / second

Rate 4->0: 13,763,547,491 bytes / second
Rate 4->1: 12,471,694,688 bytes / second
Rate 4->2: 11,482,181,219 bytes / second
Rate 4->3: 12,468,023,546 bytes / second
Rate 4->5: 8,607,863,049 bytes / second
Rate 4->6: 11,454,296,863 bytes / second
Rate 4->7: 11,485,718,546 bytes / second

Rate 5->0: 12,480,305,364 bytes / second
Rate 5->1: 11,480,720,252 bytes / second
Rate 5->2: 12,467,553,043 bytes / second
Rate 5->3: 11,476,357,977 bytes / second
Rate 5->4: 8,597,521,045 bytes / second
Rate 5->6: 11,456,258,355 bytes / second
Rate 5->7: 12,475,926,061 bytes / second

Rate 6->0: 9,830,523,002 bytes / second
Rate 6->1: 11,459,327,206 bytes / second
Rate 6->2: 11,479,363,972 bytes / second
Rate 6->3: 11,474,285,372 bytes / second
Rate 6->4: 12,468,660,591 bytes / second
Rate 6->5: 11,475,548,469 bytes / second
Rate 6->7: 12,473,295,606 bytes / second

Rate 7->0: 12,519,128,021 bytes / second
Rate 7->1: 8,606,083,046 bytes / second
Rate 7->2: 11,454,663,446 bytes / second
Rate 7->3: 11,470,859,245 bytes / second
Rate 7->4: 12,472,266,915 bytes / second
Rate 7->5: 11,476,867,045 bytes / second
Rate 7->6: 12,473,614,390 bytes / second
Ruberik commented 2 years ago

Clearer results, with less crap. From ILGPU, with #664 incorporated:

Table is in GB/s. Copying one way.

        GPU0    GPU1    GPU2    GPU3    GPU4    GPU5    GPU6    GPU7
GPU0            11.94   11.30   11.30   11.30   11.30   7.67    11.01
GPU1    11.94           11.31   11.30   11.30   11.30   11.31   7.81
GPU2    12.27   11.30           7.68    11.01   11.31   11.31   11.30
GPU3    11.61   11.31   7.81            11.01   11.31   11.31   11.30
GPU4    12.27   11.30   11.03   11.30           7.81    10.74   11.30
GPU5    11.61   11.30   11.30   11.31   7.67            10.74   11.30
GPU6    8.10    10.74   11.30   11.30   11.30   11.30           11.31
GPU7    11.62   7.81    11.01   11.30   11.30   11.31   11.31

Table is in GB/s. Copying both ways simultaneously. The speed is the one-way speed of the one that finishes last.

        GPU0    GPU1    GPU2    GPU3    GPU4    GPU5    GPU6    GPU7
GPU0            8.77    8.77    8.43    8.59    8.76    5.30    8.26
GPU1    8.43            8.95    8.59    8.43    8.94    8.77    5.37
GPU2    9.35    8.94            5.31    8.42    9.35    9.54    9.54
GPU3    8.76    8.43    5.37            8.10    8.76    8.77    8.77
GPU4    9.15    8.43    8.94    8.43            5.37    8.26    8.77
GPU5    8.95    8.77    9.53    8.77    5.37            8.77    9.54
GPU6    5.58    8.42    9.53    8.78    8.77    9.34            9.54
GPU7    8.77    5.37    8.94    8.77    8.95    9.34    9.54

From C, where I was improvising more than I was comfortable doing, so the both-way numbers might not be optimal:

Table is in GB/s. Copying one way.

        GPU0    GPU1    GPU2    GPU3    GPU4    GPU5    GPU6    GPU7
GPU0            22.42   10.47   10.49   22.52   44.86   44.85   10.46
GPU1    22.52           44.89   22.52   10.50   10.54   10.60   44.89
GPU2    10.55   44.89           22.52   10.63   44.89   10.55   22.52
GPU3    10.55   22.52   22.52           44.87   10.54   10.60   44.90
GPU4    22.51   10.47   10.47   44.88           22.52   44.88   10.44
GPU5    44.88   10.50   44.89   10.48   22.52           22.52   10.43
GPU6    44.88   10.49   10.49   10.44   44.88   22.52           22.53
GPU7    10.53   44.89   22.52   44.90   10.54   10.58   22.52
Table is in GB/s. Copying both ways simultaneously. The speed is the one-way speed of the one that finishes last.

        GPU0    GPU1    GPU2    GPU3    GPU4    GPU5    GPU6    GPU7
GPU0            22.50   7.94    7.68    22.49   44.77   44.76   7.97
GPU1    22.50           44.80   22.49   7.71    7.97    7.96    44.79
GPU2    8.02    44.80           22.50   8.06    44.79   8.60    22.50
GPU3    7.71    22.50   22.50           44.79   7.97    7.96    44.78
GPU4    22.49   7.69    7.95    44.79           22.49   44.77   7.95
GPU5    44.77   8.02    44.80   8.02    22.49           22.49   8.56
GPU6    44.79   8.04    8.55    8.05    44.77   22.49           22.49
GPU7    8.05    44.80   22.50   44.79   8.06    8.56    22.50

The most relevant code bits: ILGPU:

for (int k = 0; k < 100; k++) {
    Parallel.Invoke(
        () => { buffers[acc1][acc1].CopyTo(streams[acc1], buffers[acc2][acc1]); },
        () => { if (copyBothWaysSimultaneously) buffers[acc2][acc2].CopyTo(streams[acc2], buffers[acc1][acc2]); });
}

C:

for (int k = 0; k < 100; k++) {
    cudaMemcpyAsync(g0, g1, buf_size, cudaMemcpyDefault, stream0);
    if (copy_both_ways_simultaneously) {
        cudaMemcpyAsync(g3, g2, buf_size, cudaMemcpyDefault, stream1);
    }
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaStreamSynchronize(stream0));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaStreamSynchronize(stream1));
}
m4rs-mt commented 2 years ago

@Ruberik Thank you for your efforts and time investigating this issue 👍 I am currently reproducing your problem on a set of A100 cards. It turns out that at least one problem is that peer access between the different accelerators is not properly enabled. However, fixing this still results in "strange" numbers being reported by the application you provided... I will continue my investigation, so stay tuned 🚀.

Ruberik commented 2 years ago

Thanks, @m4rs-mt! I really appreciate the hard work you put into this project, and @MoFtZ as well. Please let me know if you want the full code that spits out a table like the one in my latest message.

m4rs-mt commented 2 years ago

@Ruberik I have analyzed the problem in detail and found a solution that fixes this performance issue 🤞. The problem was related to an invalid peer-access accelerator registration (see #675 for more information). First, I reproduced the problem with 2xA100 devices with NVLink capabilities. The Cuda example gives the following output:

// Checking GPU(s) for support of peer to peer memory access...
cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU1: 243.65GB/s
cudaMemcpyPeer / cudaMemcpy between GPU1 and GPU0: 243.73GB/s

My sample program (see below) written in ILGPU outputs the following information with and without peer-access:

// Without peer access :( 
-> GPU0 => GPU1 = 12.644GB/s
-> GPU1 => GPU0 = 12.898GB/s

// With peer access :)
-> GPU0 => GPU1 = 241.002GB/s
-> GPU1 => GPU0 = 241.226GB/s

The program used for benchmarking:

        static void Main(string[] args)
        {
            const long Length = 1024L * 1024L * 16L * sizeof(float);
            const int NumRuns = 100;
            using var context = Context.Create(builder => builder.Cuda(
                device => device.DeviceId < 2));

            var accls = new List<CudaAccelerator>(context.Devices.Length);
            foreach (var device in context.Devices)
                accls.Add(device.CreateAccelerator(context) as CudaAccelerator);

            // Enable peer access
            for (int i = 0; i < accls.Count; ++i)
            {
                for (int j = 0; j < accls.Count; ++j)
                {
                    // Skip invalid peer access on the same device
                    if (i == j)
                        continue;
                    bool canAccess = accls[i].CanAccessPeer(accls[j]);
                    if (!canAccess)
                        throw new NotSupportedException("Not supported peer config");

                    // Enable the actual access in both directions
                    if (!accls[i].EnableBidirectionalPeerAccess(accls[j]))
                        throw new NotSupportedException("Not supported peer access");
                }
            }

            // Allocate memory on all devices
            var buffers = new List<MemoryBuffer1D<byte, Stride1D.Dense>>(accls.Count);
            foreach (var accl in accls)
                buffers.Add(accl.Allocate1D<byte>(Length));

            // Perform the measurements
            var stream = accls[0].CreateStream();
            var watch = new Stopwatch();
            for (int i = 0; i < accls.Count; ++i)
            {
                for (int j = 0; j < accls.Count; ++j)
                {
                    if (i == j)
                        continue;

                    var source = buffers[i];
                    var target = buffers[j];

                    watch.Restart();
                    for (int r = 0; r < NumRuns; ++r)
                    {
                        source.CopyTo(stream, target);
                    }
                    stream.Synchronize();
                    watch.Stop();

                    double gbS = (1.0 / (watch.Elapsed.TotalMilliseconds / 1000.0)) *
                        ((100.0 * Length)) / (1024.0 * 1024.0 * 1024.0);
                    Console.WriteLine($"-> GPU{i} => GPU{j} = {Math.Round(gbS, 3)}GB/s");
                }
            }

            stream.Dispose();
            foreach (var buf in buffers)
                buf.Dispose();
            foreach (var accl in accls)
                accl.Dispose();
        }
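
For reference, a condensed two-GPU sketch of what a caller does once the fix is in. It is distilled from the benchmark above, so the API calls (CanAccessPeer, EnableBidirectionalPeerAccess, Allocate1D, CopyTo) all appear there; the device ids and buffer size are only illustrative.

// Minimal sketch, assuming two CUDA devices and the peer-access fix (#675).
// Enabling peer access before the copy is what lets CopyTo take the NVLink path;
// without it the copy still succeeds, but at the much lower non-peer rates (~12 GB/s above).
using var context = Context.Create(builder => builder.Cuda());
using var gpu0 = context.CreateCudaAccelerator(0);
using var gpu1 = context.CreateCudaAccelerator(1);

if (gpu0.CanAccessPeer(gpu1))
    gpu0.EnableBidirectionalPeerAccess(gpu1);

using var src = gpu0.Allocate1D<byte>(1L << 26); // 64 MiB; size is arbitrary
using var dst = gpu1.Allocate1D<byte>(1L << 26);
using var stream = gpu0.CreateStream();

src.CopyTo(stream, dst); // peer-to-peer copy when access is enabled
stream.Synchronize();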
Ruberik commented 2 years ago

Awesome! I'm ready to test as soon as I can get a machine with NVLINK on Azure... (Update 6 days later: Still trying several times a day, but we'll run your code exactly, and my own code, when we get one.)

Ruberik commented 2 years ago

Update 34 days later: I finally have access to a machine with NVSwitch, and this appears to have worked!