ROCm / rocprofiler

ROC profiler library. Profiling with perf-counters and derived metrics.
https://rocm.docs.amd.com/projects/rocprofiler/en/latest/
Other
116 stars 44 forks source link

rocm profiler creates trace for 1 gpu only when kernels launched onto two separate kernels. #60

Open gggh000 opened 2 years ago

gggh000 commented 2 years ago
#include <stdio.h>
#include "hip/hip_runtime.h"

// 1. if N is set to up to 1024, then sum is OK.
// 2. Set N past the 1024 which is past No. of threads per blocks, and then all iterations of sum results in
// even the ones within the block.

// 3. To circumvent the problem described in 2. above, since if N goes past No. of threads per block, we need multiple block launch.
// The trick is describe in p65 to use formula (N+127) / 128 for blocknumbers so that when block number starts from 1, it is
// (1+127) / 128.

#define N 2048
#define N 536870912
#define MAX_THREAD_PER_BLOCK 1024

__global__ void add( int * a, int * b, int * c ) {
    int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x ;
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

int main (void) {
    int *a, *b, *c;
    int *dev_a, *dev_b, *dev_c;
    int stepSize;

    int count = 0;

    hipGetDeviceCount(&count);

    printf("\nDevice count: %d.", count);

    if (count < 2) {
        printf("No. of devices must be at least 2.");
        return 1;
    }

    // allocate dev memory for N size for pointers declared earlier.
    // allocate dev memory for N size for pointers declared earlier.

    printf("\nAllocating memory...(size %u array size of INT).\n", N );

    hipMalloc( (void**)&dev_a, N * sizeof(int));
    hipMalloc( (void**)&dev_b, N * sizeof(int));
    hipMalloc( (void**)&dev_c, N * sizeof(int));

    const unsigned blocks = 512;
    const unsigned threadsPerBlock = 256;

    // invoke the kernel:
    // block count: (N+127)/128
    // thread count: 128

    hipSetDevice(0);
    hipLaunchKernelGGL(add, blocks, threadsPerBlock, 0, 0, dev_a, dev_b, dev_c);
    hipSetDevice(1);
    hipLaunchKernelGGL(add, blocks, threadsPerBlock, 0, 0, dev_a, dev_b, dev_c);
    hipDeviceSynchronize();

    hipFree(dev_a);
    hipFree(dev_b);
    hipFree(dev_c);
}

use following to compile and create trace:


FILE1=p61
for FILE in $FILE1 ; do
    hipcc $FILE.cpp -o $FILE.out
    rocprof --hip-trace  -d ./$FILE ./$FILE.out
done

there is a result.json created and when opened in chrome tracer, only gpu0 is seen.

00:07.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Vega 10 [Radeon Instinct MI25 MxGPU] (rev 06)
00:08.0 Display controller: Advanced Micro Devices, Inc. [AMD/ATI] Vega 10 [Radeon Instinct MI25 MxGPU] (rev 06)
root@sriov-guest:~/dev-learn/gpu/hip/hip-stream-aql/p61-2gpus# egrep -irn gpu results.json
4:,{"args":{"name":"GPU0"},"ph":"M","pid":6,"name":"process_name","sort_index":2}
268:    "Agent2.Uuid": "GPU-0215141e35aa2184",
269:    "Agent2.MarketingName": "Vega10[RadeonInstinctMI25MxGPU]",
279:    "Agent2.DeviceType": "GPU",
335:    "Agent3.Uuid": "GPU-0215141e35aa2904",
336:    "Agent3.MarketingName": "Vega10[RadeonInstinctMI25MxGPU]",
346:    "Agent3.DeviceType": "GPU",
ex-rzr commented 2 years ago

I suspect that the first kernel doesn't finish when the program exits.

    hipSetDevice(0);
    hipLaunchKernelGGL(add, blocks, threadsPerBlock, 0, 0, dev_a, dev_b, dev_c); // launch on gpu 0
    hipSetDevice(1);
    hipLaunchKernelGGL(add, blocks, threadsPerBlock, 0, 0, dev_a, dev_b, dev_c); // launch on gpu 1
    hipDeviceSynchronize();  // wait for gpu 1

    hipSetDevice(0); // new code
    hipDeviceSynchronize();  // new code: wait for gpu 0

I haven't checked it, it's just a thought.

(By the way, recent HIP versions support CUDA's syntax kernel<<<...>>>(...))

gggh000 commented 2 years ago

then should not it show that exactly?? It shlould not be the reason for ignoring 2nd gpu?