saqibkh / CPU_Workloads


DGEMM #1

Open saqibkh opened 1 year ago

saqibkh commented 1 year ago
  1. What is the purpose of this workload?
  2. How do you install it on Ubuntu (preferably 22.04)?
  3. How do you run this workload and benchmark the results? How long does it take to run? Which parameters cause the most stress and use all resources?
MaheenKhalid08 commented 1 year ago
  1. The general matrix multiplication (GEMM) workloads accept three N × N matrices A, B, and C and compute the result C = AB + C. The GEMM algorithm divides the matrices into blocks and operates on one block at a time. This improves cache coherency and leads to faster execution compared to an implementation of the “pencil and paper” matrix multiplication algorithm. In Geekbench, SGEMM performs the operation using single precision floating point and DGEMM performs it using double precision floating point. The block sizes are empirically determined values that give good cache performance across tested platforms. SGEMM uses a block size of 128×128 elements and DGEMM uses 64×64 elements. The input values are A_ij = B_ji = (i + jN) mod 10 and C_ij = 0. For both SGEMM and DGEMM, N = 896 on desktop machines and N = 512 for mobile devices.
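As a rough illustration of that blocked scheme (my own sketch, not Geekbench's code), the following C program builds the inputs A_ij = B_ji = (i + jN) mod 10 with C = 0 and computes C = AB + C one 64×64 block at a time for the desktop size N = 896:

    /*
     * Sketch of a blocked DGEMM with the inputs described above.
     * Build: gcc -O2 -o dgemm_blocked dgemm_blocked.c
     */
    #include <stdio.h>
    #include <stdlib.h>

    #define N  896   /* desktop problem size (N = 512 on mobile) */
    #define BS 64    /* DGEMM block size; N is a multiple of BS */

    int main(void)
    {
        double *A = malloc(sizeof(double) * N * N);
        double *B = malloc(sizeof(double) * N * N);
        double *C = calloc(N * N, sizeof(double));   /* C_ij = 0 */
        if (!A || !B || !C) return 1;

        /* A_ij = B_ji = (i + j*N) mod 10 */
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                A[i * N + j] = (i + j * N) % 10;
                B[j * N + i] = (i + j * N) % 10;
            }

        /* C = A*B + C, operating on one BS x BS block at a time
         * so the working set stays cache-resident */
        for (int ii = 0; ii < N; ii += BS)
            for (int kk = 0; kk < N; kk += BS)
                for (int jj = 0; jj < N; jj += BS)
                    for (int i = ii; i < ii + BS; i++)
                        for (int k = kk; k < kk + BS; k++) {
                            double a = A[i * N + k];
                            for (int j = jj; j < jj + BS; j++)
                                C[i * N + j] += a * B[k * N + j];
                        }

        printf("C[0][0] = %f\n", C[0]);
        free(A); free(B); free(C);
        return 0;
    }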
MaheenKhalid08 commented 1 year ago

2) To install the LAPACK and BLAS libraries on Ubuntu:

  1. Open a browser and download the release archive from https://netlib.org/lapack/lapack-3.11.0.html

  2. Move it into your home folder:

    mv lapack-3.11.tar.gz ~

  3. Extract it:

    cd ~ && tar xzf lapack-3.11.tar.gz

  4. Open the extracted folder (~/lapack-3.11/) in a terminal.

  5. Type this command: cp make.inc.example make.inc

  6. Type this command: make blaslib

  7. Type this command: make lapacklib

  8. Done. Now link the libraries (a quick smoke test follows this list):

    sudo ln -s $HOME/lapack-3.11/librefblas.a /usr/local/lib/libblas.a
    sudo ln -s $HOME/lapack-3.11/liblapack.a /usr/local/lib/liblapack.a
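To verify the build and the symlinks, a tiny C program can call the reference BLAS directly through its Fortran interface. This is only a sketch: the file name dgemm_test.c is made up, and the -lgfortran flag assumes the library was built with gfortran (the default compiler in make.inc.example).

    /*
     * Hypothetical smoke test for the freshly linked reference BLAS.
     * Build: gcc -o dgemm_test dgemm_test.c /usr/local/lib/libblas.a -lgfortran -lm
     */
    #include <stdio.h>

    /* Fortran-interface DGEMM from the reference BLAS: every argument is a pointer */
    extern void dgemm_(const char *transa, const char *transb,
                       const int *m, const int *n, const int *k,
                       const double *alpha, const double *a, const int *lda,
                       const double *b, const int *ldb,
                       const double *beta, double *c, const int *ldc);

    int main(void)
    {
        int n = 2;
        double alpha = 1.0, beta = 0.0;
        /* 2x2 matrices in column-major (Fortran) order */
        double A[4] = {1, 2, 3, 4};
        double B[4] = {5, 6, 7, 8};
        double C[4] = {0, 0, 0, 0};

        dgemm_("N", "N", &n, &n, &n, &alpha, A, &n, B, &n, &beta, C, &n);

        /* Expected result: C = A*B = [23 31; 34 46] */
        printf("%.0f %.0f\n%.0f %.0f\n", C[0], C[2], C[1], C[3]);
        return 0;
    }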

MaheenKhalid08 commented 1 year ago

// compilation: nvcc -o dgemmSweep11.out -arch sm_13 dgemmSweep.1.1.cu -lcublas

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <pthread.h>
#include <cublas.h>

#define SWEEP_SUCCESS ((void*)1)
#define SWEEP_FAILURE ((void*)0)
#define MAX_DEVICES 256 // 256 devices is enough for anyone...

unsigned int testIDs[MAX_DEVICES];
unsigned int testedDevices = 0;
int iterations = 1;
unsigned int speedSetting = 32;
int deviceCount = 0;
pthread_mutex_t lock;
pthread_cond_t condvar;
pthread_t devThreads[MAX_DEVICES];
float elapsedTimes[MAX_DEVICES];

volatile int terminatingDevice = -1;

// Device kernel: fill a double array with a constant value
__global__ void doubleMemset(double *ptr, unsigned int length, double value)
{
    unsigned int idx = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned int stride = blockDim.x * gridDim.x;
    for (; idx < length; idx += stride)
        ptr[idx] = value;
}

// Worker thread: runs a DGEMM sweep on one GPU and verifies every result element.
void* dgemmSweep(void* devID)
{
    int device = (intptr_t)devID;
    printf("device = %d\n", device);
    double *A = NULL, *B = NULL, *C = NULL;   // device matrices (NULL so an early abort does not free garbage)
    double alpha = 1.0, beta = 1.0;
    unsigned int i, j, k;
    if (cudaSetDevice(device) != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice(%d) failed\n", device);
        return SWEEP_FAILURE;
    }
    if (cublasInit() != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "Error: cublasInit failed from device %u\n", device);
        return SWEEP_FAILURE;
    }
    struct cudaDeviceProp properties;
    cudaGetDeviceProperties(&properties, device);
    // Largest matrix dimension such that three i*i double matrices (24 bytes per
    // element) fit in device memory with ~200 MB to spare, rounded down to a
    // multiple of speedSetting.
    unsigned int iterSize = ((unsigned int)(sqrt((properties.totalGlobalMem - (200 * (1 << 20))) / 24))) & ~(speedSetting - 1);
    printf("iterSize = %u\n", iterSize);
    // printf("Performing %d iterations with increment size %d on device %d...\n", iterations, speedSetting, device);
    for (int curIter = 0; curIter < iterations; curIter++)
    {
        for (i = 128; i < iterSize; i += speedSetting)
        {
            if (terminatingDevice != -1)
            {
                cublasFree(A);
                cublasFree(B);
                cublasFree(C);
                return SWEEP_SUCCESS;
            }
            printf("Device %d: i = %d\n", device, i);
            double *c_h = (double*)malloc(sizeof(double) * i * i);
            if (!c_h)
            {
                fprintf(stderr, "ERROR: malloc of c_h failed. Aborting.\n");
                terminatingDevice = device;
                return SWEEP_FAILURE;
            }
            if (cudaMalloc((void**)&A, i * i * sizeof(double)) != cudaSuccess)
            {
                fprintf(stderr, "Error: cublasAlloc(A) failed at i = %d\n", i);
                terminatingDevice = device;
                free(c_h);
                return SWEEP_FAILURE;
            }
            if (cudaMalloc((void**)&B, i * i * sizeof(double)) != cudaSuccess)
            {
                fprintf(stderr, "Error: cublasAlloc(B) failed at i = %d\n", i);
                terminatingDevice = device;
                free(c_h);
                return SWEEP_FAILURE;
            }
            if (cudaMalloc((void**)&C, i * i * sizeof(double)) != cudaSuccess)
            {
                fprintf(stderr, "Error: cublasAlloc(C) failed at i = %d\n", i);
                terminatingDevice = device;
                free(c_h);
                return SWEEP_FAILURE;
            }
            doubleMemset<<<i/128, 128>>>(A, i * i, 1.0);
            doubleMemset<<<i/128, 128>>>(B, i * i, 2.0);
            doubleMemset<<<i/128, 128>>>(C, i * i, 3.0);

            if (cudaThreadSynchronize() != cudaSuccess)
            {
                fprintf(stderr, "Error: cudaThreadSynchronize returned %s\n", cudaGetErrorString(cudaGetLastError()));
                terminatingDevice = device;
                free(c_h);
                return SWEEP_FAILURE;
            }
            // Every element of C should equal alpha*(A*B) + beta*C = i*(1.0*2.0) + 3.0
            double result = 2.0 * i + 3.0;

            cublasDgemm('n', 'n', i, i, i, alpha, A, i, B, i, beta, C, i);
            if (cublasGetError() != CUBLAS_STATUS_SUCCESS)
            {
                fprintf(stderr, "Error: cublasDgemm failed!\n");
                terminatingDevice = device;
                free(c_h);
                return SWEEP_FAILURE;
            }
            cudaMemcpy(c_h, C, sizeof(double) * i * i, cudaMemcpyDeviceToHost);
            for (j = 0; j < i; j++)
            {
                for (k = 0; k < i; k++)
                    if (c_h[j * i + k] != result)
                    {
                        fprintf(stderr, "Error: cublasDgemm returned an invalid result at location %d,%d in iteration %d on device %d\n", j, k, i, device);
                        printf("%f\n", c_h[j * i + k]);
                        terminatingDevice = device;
                        free(c_h);
                        return SWEEP_FAILURE;
                    }
            }
            free(c_h);
            cublasFree(A);
            cublasFree(B);
            cublasFree(C);
        }
        // printf("Finished iteration %d\n", curIter);
    }
    printf("Device %d completed successfully\n", device);
    return SWEEP_SUCCESS;
}

int main(int argc, char** argv)
{
int i;

if (argc < 2)
{
    fprintf(stderr, "usage: %s <speed setting> <iterations>\nSpeed settings:\n0 = iterate by 32 (default)\n1 = iterate by 64\n2 = iterate by 128 (fastest)\n", argv[0]);
    return 1;
}
switch (argc)
{
    case 3:
    sscanf(argv[2], "%d", &iterations);
    case 2:
    unsigned int speed = 0;
    sscanf(argv[1], "%u", &speed);
    if (speed == 2)
    speedSetting = 128;
    else if (speed == 1)
    speedSetting = 64;
}

cudaGetDeviceCount(&deviceCount);
printf("deviceCount = %d\n", deviceCount);
for (i = 0; i < deviceCount; i++)
{
    struct cudaDeviceProp properties;
    if (cudaGetDeviceProperties(&properties, i) != cudaSuccess)
    {
        printf("Could not retrieve properties of device %d\n", i);
        exit(1);
    }

    printf("Testing device %d: %s\n", i, properties.name);
    if ((properties.major != 9999 && properties.minor != 9999) 
        &&
        ((properties.major >= 1 && properties.minor >= 3) ||
         (properties.major >= 2))
        // && !properties.kernelExecTimeoutEnabled
       )
    {
        testIDs[testedDevices++] = i;
    }
}
if (testedDevices == 0)
{
    printf("No suitable NVIDIA GPUs found. Aborting...\n");
    exit(1);
}

for (i = 0; i < testedDevices; i++)
{
    pthread_create(&devThreads[i], NULL, 
                   (dgemmSweep),(void*)((intptr_t)testIDs[i]));
}

void* returnVal = 0;
for (int i = 0; i < testedDevices; i++)
{
    pthread_join(devThreads[i], &returnVal);
    if (returnVal != SWEEP_SUCCESS)
    {
        printf("ERROR: Failed with device %d. dgemmSweep FAILED.\n", terminatingDevice);
        exit(1);
    }
}
printf("dgemmSweep PASSED.\n");
}
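Going by the usage string in main, the speed setting picks the sweep increment (0 = step by 32, 1 = step by 64, 2 = step by 128, the fastest) and the optional second argument sets the iteration count, so a typical run of the binary produced by the nvcc line at the top of this comment would be:

    ./dgemmSweep11.out 2 1

A lower speed setting sweeps more matrix sizes and a higher iteration count repeats the sweep, so both make the run take longer and keep every detected GPU loaded, since one pthread is launched per device.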