saqibkh commented 1 year ago

What is the purpose of this workload?
How do I install it on Ubuntu (preferably 22.04)?
How do I run this workload and benchmark the results? How long does it take to run? Which parameters cause the most stress and use all resources?

MaheenKhalid08 commented 1 year ago

The general matrix multiplication (GEMM) workloads accept three $N \times N$ matrices A, B, and C and compute the result $C = AB + C$. The GEMM algorithm divides the matrices into blocks and operates on one block at a time. This improves cache coherency and leads to faster execution compared to an implementation of the “pencil and paper” matrix multiplication algorithm. In Geekbench, SGEMM performs the operation using single-precision floating point and DGEMM performs it using double-precision floating point. The block sizes are empirically determined values that give good cache performance across tested platforms. SGEMM uses a block size of 128×128 elements and DGEMM uses 64×64 elements. The input values are $A_{ij} = B_{ji} = (i + jN) \bmod 10$ and $C_{ij} = 0$. For both SGEMM and DGEMM, $N = 896$ on desktop machines and $N = 512$ on mobile devices.

MaheenKhalid08 commented 1 year ago

2) To install the LAPACK and BLAS libraries on Ubuntu:

Open a browser and download the LAPACK 3.11.0 tarball from https://netlib.org/lapack/lapack-3.11.0.html
Move it into your home folder:

mv lapack-3.11.0.tar.gz ~
Extract it: tar -xzf ~/lapack-3.11.0.tar.gz -C ~
Open a terminal in the ~/lapack-3.11.0/ folder
Type this command: cp make.inc.example make.inc
Type this command: make blaslib
Type this command: make lapacklib
The build is done. Now link the libraries into /usr/local/lib: sudo ln -s $HOME/lapack-3.11.0/librefblas.a /usr/local/lib/libblas.a

sudo ln -s $HOME/lapack-3.11.0/liblapack.a /usr/local/lib/liblapack.a

MaheenKhalid08 commented 1 year ago

// compilation: nvcc -o dgemmSweep11.out -arch sm_13 dgemmSweep.1.1.cu -lcublas

/*

Copyright 1993-2009 NVIDIA Corporation. All rights reserved.
NOTICE TO USER:
This source code is subject to NVIDIA ownership rights under U.S. and
international Copyright laws. Users and possessors of this source code
are hereby granted a nonexclusive, royalty-free license to use this code
in individual and commercial software.
NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
OR PERFORMANCE OF THIS SOURCE CODE.
U.S. Government End Users. This source code is a "commercial item" as
that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
"commercial computer software" and "commercial computer software
documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
and is provided to the U.S. Government only as a commercial end item.
Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
source code with only those rights set forth herein.
Any use of this source code in individual and commercial software must
include, in the user documentation and internal comments to the code,
the above Disclaimer and U.S. Government End Users Notice. */

#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <cublas.h>

/* Thread return values of dgemmSweep(), compared by main() after pthread_join(). */
#define SWEEP_SUCCESS ((void*)1)
#define SWEEP_FAILURE ((void*)0)

/* Capacity of the per-device tables below. */
#define MAX_DEVICES 256 //256 devices is enough for anyone...

/* Device ordinals selected for testing; main() fills the first testedDevices entries. */
unsigned int testIDs[MAX_DEVICES];
/* Number of valid entries in testIDs / devThreads. */
unsigned int testedDevices = 0;
/* Number of full sweeps each device thread performs (second command-line argument). */
int iterations = 1;
/* Step between consecutive matrix sizes: 32, 64 or 128, chosen from the first argument. */
unsigned int speedSetting = 32;
/* Device count as reported by cudaGetDeviceCount(). */
int deviceCount = 0;
/* NOTE(review): lock, condvar and elapsedTimes are declared but never referenced
 * in the visible code; kept in case another part of the project uses them. */
pthread_mutex_t lock;
pthread_cond_t condvar;
/* One worker thread per tested device, indexed like testIDs. */
pthread_t devThreads[MAX_DEVICES];
float elapsedTimes[MAX_DEVICES];

/* -1 while all devices are healthy; a failing worker stores its device ordinal
 * here and the other workers poll it to stop early. */
volatile int terminatingDevice = -1;

/*
 * Kernel: stores `value` into ptr[0 .. length-1].
 *
 * Every thread starts at its global 1-D index
 * (threadIdx.x + blockDim.x * blockIdx.x) and then advances by the total
 * number of launched threads (blockDim.x * gridDim.x), so the whole range is
 * covered for any 1-D grid/block configuration, including grids that are
 * smaller than `length`.
 *
 * ptr    - destination buffer; it is dereferenced inside the kernel, so it
 *          must be addressable from device code and hold at least `length`
 *          doubles.
 * length - number of elements to write.
 * value  - value stored into each element.
 */
__global__ void doubleMemset(double *ptr, unsigned int length, double value)
{
    /* size_t arithmetic keeps `idx += stride` from wrapping around when
     * `length` is close to the unsigned int maximum. */
    const size_t stride = (size_t)blockDim.x * gridDim.x;
    size_t idx = threadIdx.x + (size_t)blockDim.x * blockIdx.x;

    for (; idx < length; idx += stride)
        ptr[idx] = value;
}

/*
 * Runs one n x n DGEMM on the current device and verifies every element of
 * the result on the host.
 *
 * A, B and C are filled with 1.0, 2.0 and 3.0 by doubleMemset, so with
 * alpha = beta = 1 every element of C = alpha*A*B + beta*C must be exactly
 * 2*n + 3.
 *
 * device - device ordinal, used only in diagnostics.
 * n      - matrix dimension; the caller keeps n*n within unsigned int range
 *          because doubleMemset takes its length as unsigned int.
 *
 * Returns 1 when the result is correct and 0 on any failure (a diagnostic has
 * already been printed). The host buffer and all device buffers are released
 * on every path.
 */
static int dgemmSweepOneSize(int device, unsigned int n)
{
    /* All locals are declared up front so that `goto cleanup` never jumps
     * over an initialization. */
    const size_t elemCount = (size_t)n * n;
    const size_t byteCount = elemCount * sizeof(double);
    const double alpha = 1.0, beta = 1.0;
    const double result = 2.0 * n + 3.0;
    double *A = NULL, *B = NULL, *C = NULL;
    double *c_h = NULL;
    cudaError_t err;
    unsigned int j, k;
    int ok = 0;

    c_h = (double*)malloc(byteCount);
    if (!c_h)
    {
        fprintf(stderr, "ERROR: malloc of c_h failed. Aborting.\n");
        goto cleanup;
    }
    if (cudaMalloc((void**)&A, byteCount) != cudaSuccess)
    {
        fprintf(stderr, "Error: cublasAlloc(A) failed at i = %d\n", n);
        goto cleanup;
    }
    if (cudaMalloc((void**)&B, byteCount) != cudaSuccess)
    {
        fprintf(stderr, "Error: cublasAlloc(B) failed at i = %d\n", n);
        goto cleanup;
    }
    if (cudaMalloc((void**)&C, byteCount) != cudaSuccess)
    {
        fprintf(stderr, "Error: cublasAlloc(C) failed at i = %d\n", n);
        goto cleanup;
    }

    doubleMemset<<<n/128, 128>>>(A, (unsigned int)elemCount, 1.0);
    doubleMemset<<<n/128, 128>>>(B, (unsigned int)elemCount, 2.0);
    doubleMemset<<<n/128, 128>>>(C, (unsigned int)elemCount, 3.0);

    /* A rejected launch is reported by cudaGetLastError(), a fault during
     * execution by the synchronizing call; report whichever fails first. */
    err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaThreadSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error: cudaThreadSynchronize returned %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    cublasDgemm('n', 'n', n, n, n, alpha, A, n, B, n, beta, C, n);
    if (cublasGetError() != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "Error: cublasDgemm failed!\n");
        goto cleanup;
    }

    err = cudaMemcpy(c_h, C, byteCount, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Error: cudaMemcpy of the result failed: %s\n", cudaGetErrorString(err));
        goto cleanup;
    }

    /* Exact comparison is intentional: all operands are small integers, so
     * the sum is exactly representable in double precision. */
    for (j = 0; j < n; j++)
    {
        for (k = 0; k < n; k++)
        {
            const double got = c_h[(size_t)j * n + k];
            if (got != result)
            {
                fprintf(stderr, "Error: cublasDgemm returned an invalid result at location %d,%d in iteration %d on device %d\n", j, k, n, device);
                printf("%f\n", got);
                goto cleanup;
            }
        }
    }
    ok = 1;

cleanup:
    /* free(NULL) and cudaFree(NULL) are no-ops, so partially completed
     * allocations are handled uniformly. */
    free(c_h);
    cudaFree(A);
    cudaFree(B);
    cudaFree(C);
    return ok;
}

/*
 * pthread entry point: sweeps DGEMM over increasing matrix sizes on one GPU.
 *
 * devID carries the CUDA device ordinal, packed into the pointer value by the
 * caller. For each of `iterations` passes the matrix dimension runs from 128
 * up to (but excluding) iterSize in steps of `speedSetting`; iterSize is
 * derived from the device's global memory so that three double matrices
 * (24 bytes per element position) fit with 200 MiB left over.
 *
 * Returns SWEEP_SUCCESS when every size verified correctly, or when another
 * device has already reported a failure through terminatingDevice; returns
 * SWEEP_FAILURE otherwise, after storing this device's ordinal in
 * terminatingDevice.
 */
void* dgemmSweep(void* devID)
{
    const size_t reservedBytes = (size_t)200 * (1 << 20);
    struct cudaDeviceProp properties;
    unsigned int iterSize = 0;
    unsigned int i;
    int device = (int)(intptr_t)devID;

    printf("device = %d\n", device);

    if (cudaSetDevice(device) != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice(%d) failed\n", device);
        terminatingDevice = device;
        return SWEEP_FAILURE;
    }
    if (cublasInit() != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "Error: cublasInit failed from device %u\n",device);
        terminatingDevice = device;
        return SWEEP_FAILURE;
    }
    if (cudaGetDeviceProperties(&properties, device) != cudaSuccess)
    {
        fprintf(stderr, "Error: cudaGetDeviceProperties failed for device %d\n", device);
        terminatingDevice = device;
        return SWEEP_FAILURE;
    }

    /* Guard the subtraction: on a device with less than 200 MiB the unsigned
     * difference would wrap around to a huge value. */
    if (properties.totalGlobalMem > reservedBytes)
    {
        double maxDim = sqrt((double)((properties.totalGlobalMem - reservedBytes) / 24));
        /* doubleMemset takes the element count as unsigned int, so n*n must
         * stay below 2^32, i.e. n <= 65535. */
        if (maxDim > 65535.0)
            maxDim = 65535.0;
        iterSize = ((unsigned int)maxDim) & ~(speedSetting-1);
    }
    printf("iterSize = %u\n", iterSize);
    //  printf("Performing %d iterations with increment size %d on device %d...\n", iterations, speedSetting, device);

    for (int curIter = 0; curIter < iterations; curIter++)
    {
        for (i = 128; i < iterSize; i += speedSetting)
        {
            /* Another device failed: stop quietly. No buffers are live at
             * this point, so there is nothing to release. */
            if (terminatingDevice != -1)
                return SWEEP_SUCCESS;

            printf("Device %d: i = %d\n",device, i);
            if (!dgemmSweepOneSize(device, i))
            {
                terminatingDevice = device;
                return SWEEP_FAILURE;
            }
        }
        //      printf("Finished iteration %d\n", curIter);
    }
    printf("Device %d completed successfully\n", device);
    return SWEEP_SUCCESS;
}

/*
 * Entry point: parses the speed setting and iteration count, selects every
 * GPU with compute capability 1.3 or higher, runs dgemmSweep() on each one in
 * its own thread and reports the overall verdict.
 *
 * argv[1] - speed setting: 0 = step 32 (default), 1 = step 64, 2 = step 128.
 * argv[2] - optional number of sweeps per device (defaults to 1).
 *
 * Returns 0 when every device passes and 1 on a usage error; other failures
 * terminate the process through exit(1).
 */
int main (int argc, char** argv)
{
    unsigned int speed = 0;
    unsigned int t;
    void* returnVal = 0;
    int i;

    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <speed setting> <iterations>\nSpeed settings:\n0 = iterate by 32 (default)\n1 = iterate by 64\n2 = iterate by 128 (fastest)\n", argv[0]);
        return 1;
    }

    /* Extra trailing arguments no longer cause the first two to be ignored. */
    if (argc >= 3)
        sscanf(argv[2], "%d", &iterations);
    sscanf(argv[1], "%u", &speed);
    if (speed == 2)
        speedSetting = 128;
    else if (speed == 1)
        speedSetting = 64;

    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess)
    {
        fprintf(stderr, "Error: cudaGetDeviceCount failed\n");
        deviceCount = 0;
    }
    printf("deviceCount = %d\n", deviceCount);

    for (i = 0; i < deviceCount; i++)
    {
        struct cudaDeviceProp properties;
        if (cudaGetDeviceProperties(&properties, i) != cudaSuccess)
        {
            printf("Could not retrieve properties of device %d\n", i);
            exit(1);
        }

        printf("Testing device %d: %s\n", i, properties.name);

        /* Devices reporting 9999 for major or minor are skipped, as are
         * devices below compute capability 1.3. */
        const int placeholder = (properties.major == 9999 || properties.minor == 9999);
        const int capable = (properties.major >= 2) ||
                            (properties.major == 1 && properties.minor >= 3);
        // && !properties.kernelExecTimeoutEnabled
        if (placeholder || !capable)
            continue;

        /* testIDs and devThreads only have room for MAX_DEVICES entries. */
        if (testedDevices >= MAX_DEVICES)
        {
            fprintf(stderr, "Warning: more than %d suitable devices; ignoring device %d\n", MAX_DEVICES, i);
            continue;
        }
        testIDs[testedDevices++] = i;
    }
    if (testedDevices == 0)
    {
        printf("No suitable NVIDIA GPUs found. Aborting...\n");
        exit(1);
    }

    for (t = 0; t < testedDevices; t++)
    {
        /* A thread that failed to start must not be joined below, so treat
         * a creation failure as fatal. */
        const int rc = pthread_create(&devThreads[t], NULL,
                                      (dgemmSweep),(void*)((intptr_t)testIDs[t]));
        if (rc != 0)
        {
            fprintf(stderr, "ERROR: pthread_create failed for device %u (error %d)\n", testIDs[t], rc);
            exit(1);
        }
    }

    for (t = 0; t < testedDevices; t++)
    {
        pthread_join(devThreads[t], &returnVal);
        if (returnVal != SWEEP_SUCCESS)
        {
            printf("ERROR: Failed with device %d. dgemmSweep FAILED.\n", terminatingDevice);
            exit(1);
        }
    }
    printf("dgemmSweep PASSED.\n");
    return 0;
}

saqibkh / CPU_Workloads

DGEMM #1

include

include

include

include

include

include

/* Thread return values used to signal the outcome of a device sweep. */
#define SWEEP_SUCCESS ((void*)1)

#define SWEEP_FAILURE ((void*)0)

/* Upper bound on the number of devices tracked at once. */
#define MAX_DEVICES 256 //256 devices is enough for anyone...