Open saqibkh opened 1 year ago
2) To install the Lapack and BLAS library on Ubuntu:
Open a browser and download the LAPACK source tarball from https://netlib.org/lapack/lapack-3.11.0.html
Move it into home folder
mv lapack-3.11.tar.gz ~
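Extract it.
Open a terminal in the extracted folder (written as ~/lapack-3.9.0/ in these notes; use the directory name that matches the version you actually downloaded).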
Type this command : cp make.inc.example make.inc
Type this command : make blaslib
Type this command : make lapacklib
The build is done. Now create the symlinks: sudo ln -s $HOME/lapack-3.9.0/librefblas.a /usr/local/lib/libblas.a
sudo ln -s $HOME/lapack-3.9.0/liblapack.a /usr/local/lib/liblapack.a
// compilation: nvcc -o dgemmSweep11.out -arch sm_13 dgemmSweep.1.1.cu -lcublas
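/* NOTE: the rest of the original header comment and the #include / #define lines appear to have been lost in this paste; MAX_DEVICES, SWEEP_SUCCESS and SWEEP_FAILURE are used below but are not defined in this excerpt. */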
/* ---- Run configuration (filled in by main from the command line) ---- */
int iterations = 1;              /* number of full sweeps each device performs */
unsigned int speedSetting = 32;  /* step between successive matrix orders */

/* ---- Device selection ---- */
int deviceCount = 0;                /* value reported by cudaGetDeviceCount */
unsigned int testIDs[MAX_DEVICES];  /* device ordinals chosen for testing */
unsigned int testedDevices = 0;     /* number of valid entries in testIDs */

/* ---- Threading state ---- */
pthread_t devThreads[MAX_DEVICES];  /* one worker thread per tested device */
pthread_mutex_t lock;               /* not referenced in the visible code */
pthread_cond_t condvar;             /* not referenced in the visible code */
float elapsedTimes[MAX_DEVICES];    /* not referenced in the visible code */

/* Ordinal of the first device that reported a failure, or -1 while none has.
 * Worker threads poll it at every size step and stop once it is set. */
volatile int terminatingDevice = -1;
/*
 * Fills ptr[0 .. length-1] with `value`.
 *
 * Written as a grid-stride loop, so any 1-D launch configuration with at
 * least one block and one thread covers the whole range; the grid does not
 * have to match `length`.
 *
 * ptr    - device pointer to at least `length` doubles
 * length - number of elements to write
 * value  - value stored into every element
 */
__global__ void doubleMemset(double* ptr, unsigned int length, double value)
{
    /* Index in size_t so that idx += stride cannot wrap around when length
     * is close to the 32-bit limit. */
    size_t idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
    const size_t stride = (size_t)blockDim.x * gridDim.x;

    for (; idx < length; idx += stride)
        ptr[idx] = value;
}
/*
 * Failure epilogue for one size step of dgemmSweep.
 *
 * Records `device` in terminatingDevice (the other sweep threads poll it and
 * stop), releases the buffers belonging to the current size step and returns
 * the failure code for the thread. Null pointers are skipped.
 */
static void* abortSweep(int device, double* hostC, double* devA, double* devB, double* devC)
{
    terminatingDevice = device;
    free(hostC);
    if (devA) cudaFree(devA);
    if (devB) cudaFree(devB);
    if (devC) cudaFree(devC);
    return SWEEP_FAILURE;
}

/*
 * Thread entry point: runs the DGEMM sweep on one GPU.
 *
 * devID - the CUDA device ordinal packed into the pointer value
 *         (main passes (void*)(intptr_t)ordinal).
 *
 * For every matrix order i = 128, 128 + speedSetting, ... below a limit
 * derived from the device's global memory size, the function allocates three
 * i x i double matrices, fills them with 1.0 (A), 2.0 (B) and 3.0 (C),
 * computes C = alpha*A*B + beta*C with alpha = beta = 1, copies C back and
 * checks that every element equals 2*i + 3. The whole sweep is repeated
 * `iterations` times.
 *
 * Returns SWEEP_SUCCESS when all checks pass, or when another device has
 * already reported a failure (terminatingDevice != -1); returns SWEEP_FAILURE
 * on any error, after storing this device's ordinal in terminatingDevice.
 */
void* dgemmSweep(void* devID)
{
    int device = (int)(intptr_t)devID;
    printf("device = %d\n", device);

    double alpha = 1.0, beta = 1.0;
    unsigned int i, j, k;

    if (cudaSetDevice(device) != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice(%d) failed\n", device);
        /* Record the failing device so main's report names it. */
        terminatingDevice = device;
        return SWEEP_FAILURE;
    }
    if (cublasInit() != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "Error: cublasInit failed from device %u\n", device);
        terminatingDevice = device;
        return SWEEP_FAILURE;
    }

    struct cudaDeviceProp properties;
    if (cudaGetDeviceProperties(&properties, device) != cudaSuccess)
    {
        fprintf(stderr, "Error: cudaGetDeviceProperties failed for device %d\n", device);
        terminatingDevice = device;
        return SWEEP_FAILURE;
    }

    /* Largest order to try: sqrt((total memory - 200 MiB) / 24), rounded down
     * to a multiple of speedSetting (which main sets to 32, 64 or 128).
     * The divisor 24 presumably stands for 3 matrices x 8 bytes per double. */
    unsigned int iterSize =
        ((unsigned int)(sqrt((double)((properties.totalGlobalMem - (200 * (1 << 20))) / 24))))
        & ~(speedSetting - 1);

    /* The element count i*i is passed to doubleMemset as an unsigned int, so
     * keep i <= 65535 to guarantee that i*i fits in 32 bits. */
    if (iterSize > 65536u)
        iterSize = 65536u;
    printf("iterSize = %u\n", iterSize);

    // printf("Performing %d iterations with increment size %d on device %d...\n", iterations, speedSetting, device);
    for (int curIter = 0; curIter < iterations; curIter++)
    {
        for (i = 128; i < iterSize; i += speedSetting)
        {
            /* Another device failed: stop quietly. Nothing is allocated at
             * this point, so there is nothing to release. */
            if (terminatingDevice != -1)
                return SWEEP_SUCCESS;

            printf("Device %d: i = %d\n", device, i);

            const size_t elems = (size_t)i * i;
            const size_t bytes = elems * sizeof(double);
            double *A = NULL, *B = NULL, *C = NULL;

            double* c_h = (double*)malloc(bytes);
            if (!c_h)
            {
                fprintf(stderr, "ERROR: malloc of c_h failed. Aborting.\n");
                return abortSweep(device, NULL, NULL, NULL, NULL);
            }
            if (cudaMalloc((void**)&A, bytes) != cudaSuccess)
            {
                fprintf(stderr, "Error: cublasAlloc(A) failed at i = %d\n", i);
                return abortSweep(device, c_h, NULL, NULL, NULL);
            }
            if (cudaMalloc((void**)&B, bytes) != cudaSuccess)
            {
                fprintf(stderr, "Error: cublasAlloc(B) failed at i = %d\n", i);
                return abortSweep(device, c_h, A, NULL, NULL);
            }
            if (cudaMalloc((void**)&C, bytes) != cudaSuccess)
            {
                fprintf(stderr, "Error: cublasAlloc(C) failed at i = %d\n", i);
                return abortSweep(device, c_h, A, B, NULL);
            }

            /* i >= 128, so i/128 is at least one block; the kernel strides
             * over the remaining elements itself. */
            doubleMemset<<<i/128, 128>>>(A, (unsigned int)elems, 1.0);
            doubleMemset<<<i/128, 128>>>(B, (unsigned int)elems, 2.0);
            doubleMemset<<<i/128, 128>>>(C, (unsigned int)elems, 3.0);

            /* Launch-configuration errors are only visible through
             * cudaGetLastError; execution errors surface at the sync. */
            cudaError_t launchErr = cudaGetLastError();
            if (launchErr != cudaSuccess)
            {
                fprintf(stderr, "Error: doubleMemset launch failed: %s\n", cudaGetErrorString(launchErr));
                return abortSweep(device, c_h, A, B, C);
            }
            cudaError_t syncErr = cudaThreadSynchronize();
            if (syncErr != cudaSuccess)
            {
                fprintf(stderr, "Error: cudaThreadSynchronize returned %s\n", cudaGetErrorString(syncErr));
                return abortSweep(device, c_h, A, B, C);
            }

            /* Each element of A*B is a sum of i products 1.0*2.0, plus the
             * initial 3.0 in C. All values are small integers, so the
             * comparison below can be exact. */
            double result = 2.0 * i + 3.0;
            cublasDgemm('n', 'n', i, i, i, alpha, A, i, B, i, beta, C, i);
            if (cublasGetError() != CUBLAS_STATUS_SUCCESS)
            {
                fprintf(stderr, "Error: cublasDgemm failed!\n");
                return abortSweep(device, c_h, A, B, C);
            }

            if (cudaMemcpy(c_h, C, bytes, cudaMemcpyDeviceToHost) != cudaSuccess)
            {
                fprintf(stderr, "Error: cudaMemcpy of C to host failed at i = %d\n", i);
                return abortSweep(device, c_h, A, B, C);
            }

            for (j = 0; j < i; j++)
            {
                for (k = 0; k < i; k++)
                {
                    const double got = c_h[(size_t)j * i + k];
                    if (got != result)
                    {
                        fprintf(stderr, "Error: cublasDgemm returned an invalid result at location %d,%d in iteration %d on device %d\n", j, k, i, device);
                        printf("%f\n", got);
                        return abortSweep(device, c_h, A, B, C);
                    }
                }
            }

            free(c_h);
            /* The buffers came from cudaMalloc, so release them with cudaFree. */
            cudaFree(A);
            cudaFree(B);
            cudaFree(C);
        }
        // printf("Finished iteration %d\n", curIter);
    }

    printf("Device %d completed successfully\n", device);
    return SWEEP_SUCCESS;
}
/*
 * Program entry point.
 *
 * Usage: <program> <speed setting> [iterations]
 *   speed setting 0 -> step 32 (default), 1 -> step 64, 2 -> step 128
 *   iterations    number of sweeps per device (defaults to 1)
 *
 * Selects every CUDA device that is not the 9999.9999 placeholder and has
 * compute capability 1.3 or higher, starts one dgemmSweep thread per
 * selected device and waits for all of them.
 *
 * Returns 1 on a usage error; calls exit(1) when no suitable device exists,
 * a thread cannot be started, or any sweep reports a failure; returns 0
 * after printing "dgemmSweep PASSED." otherwise.
 */
int main (int argc, char** argv)
{
    int i;

    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <speed setting> <iterations>\nSpeed settings:\n0 = iterate by 32 (default)\n1 = iterate by 64\n2 = iterate by 128 (fastest)\n", argv[0]);
        return 1;
    }

    /* Parse by position instead of switching on argc, so that extra trailing
     * arguments no longer cause the first two to be ignored. */
    unsigned int speed = 0;
    sscanf(argv[1], "%u", &speed);
    if (speed == 2)
        speedSetting = 128;
    else if (speed == 1)
        speedSetting = 64;
    if (argc >= 3)
        sscanf(argv[2], "%d", &iterations);

    /* Treat a failed query as "no devices" so the check below reports it. */
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess)
        deviceCount = 0;
    printf("deviceCount = %d\n", deviceCount);

    for (i = 0; i < deviceCount; i++)
    {
        struct cudaDeviceProp properties;
        if (cudaGetDeviceProperties(&properties, i) != cudaSuccess)
        {
            printf("Could not retrieve properties of device %d\n", i);
            exit(1);
        }
        printf("Testing device %d: %s\n", i, properties.name);
        if ((properties.major != 9999 && properties.minor != 9999)
            &&
            ((properties.major >= 1 && properties.minor >= 3) ||
             (properties.major >= 2))
            // && !properties.kernelExecTimeoutEnabled
            )
        {
            /* testIDs and devThreads hold MAX_DEVICES entries; never write
             * past them. */
            if (testedDevices >= (unsigned int)MAX_DEVICES)
            {
                fprintf(stderr, "Warning: more than %u suitable devices found; device %d and above are not tested\n", (unsigned int)MAX_DEVICES, i);
                break;
            }
            testIDs[testedDevices++] = i;
        }
    }

    if (testedDevices == 0)
    {
        printf("No suitable NVIDIA GPUs found. Aborting...\n");
        exit(1);
    }

    for (i = 0; i < (int)testedDevices; i++)
    {
        /* The device ordinal travels inside the pointer argument. */
        int rc = pthread_create(&devThreads[i], NULL,
                                dgemmSweep, (void*)((intptr_t)testIDs[i]));
        if (rc != 0)
        {
            /* Joining a thread that was never created is undefined, so stop. */
            fprintf(stderr, "ERROR: pthread_create failed for device %u (error %d)\n", testIDs[i], rc);
            exit(1);
        }
    }

    void* returnVal = 0;
    for (i = 0; i < (int)testedDevices; i++)
    {
        pthread_join(devThreads[i], &returnVal);
        if (returnVal != SWEEP_SUCCESS)
        {
            printf("ERROR: Failed with device %d. dgemmSweep FAILED.\n", terminatingDevice);
            exit(1);
        }
    }

    printf("dgemmSweep PASSED.\n");
    return 0;
}