DmitryLyakh / TAL_SH

Tensor Algebra Library Routines for Shared Memory Systems
BSD 3-Clause "New" or "Revised" License
38 stars 15 forks source link

contraction errors when using C8 types on AMD gpus #22

Open ajaypanyala opened 2 years ago

ajaypanyala commented 2 years ago

I see error messages when contracting tensors that are of type complex double (C8) on AMD GPUs.

#MESSAGE: Printing TAL-SH task info:
 Device kind -1: Error 106
#END OF MESSAGE

I consistently see this error with rocm versions 4.5.0, 4.5.2 and 5.1.0.

Below is a slimmer version of test.cpp which only runs the test_talsh_c routine. Additionally, the R8 occurrences were changed to C8 to reproduce the error. It looks like the call to gpu_tensor_block_contract_dlf is where things go wrong. This call returns a task error code that is > 0 when the tensor type is C8.


#include "talshxx.hpp"
#include "talsh.h"
#include "device_algebra.hip.h"

#include <iostream>
#include <memory>
#include <string>
#include <complex> 

#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <cassert>

//Runs a complex-double (C8) tensor contraction D(a,b,i,j)+=L(c,b,d,a)*R(j,d,i,c)
//through TAL-SH, either on GPU (asynchronously) or on the CPU Host, and checks
//the 1-norm of the result against the analytic value.
//On return, *ierr == 0 on success; a positive code identifies the failing step.
void test_talsh_c(int * ierr)
{
 const int VDIM_SIZE=30; //virtual dimension extent
 const int ODIM_SIZE=20; //occupied dimension extent
 int errc;
 //size_t host_buffer_size=TALSH_NO_HOST_BUFFER;
 size_t host_buffer_size = 1024*1024*1024; //bytes
 int gpu_list[MAX_GPUS_PER_NODE];

 *ierr=0;

//Query the total number of NVIDIA GPU on node:
 int ngpu;
 errc=talshDeviceCount(DEV_NVIDIA_GPU,&ngpu); if(errc){*ierr=1; return;};
 //Clamp to the library limit: gpu_list[] has only MAX_GPUS_PER_NODE slots,
 //so filling it with an unclamped ngpu would overflow the stack array.
 if(ngpu > MAX_GPUS_PER_NODE) ngpu = MAX_GPUS_PER_NODE;
 printf(" Number of NVIDIA GPU found on node = %d\n",ngpu);

//Initialize TAL-SH (with a negligible Host buffer since we will use external memory):
 int host_arg_max;
 for(int i=0; i<ngpu; ++i) gpu_list[i]=i; //list of NVIDIA GPU devices to use in this process
 errc=talshInit(&host_buffer_size,&host_arg_max,ngpu,gpu_list,0,NULL,0,NULL);
 //%zu is the correct conversion for size_t (%lu is UB where size_t != unsigned long)
 printf(" TAL-SH has been initialized: Status %d: Host buffer size = %zu\n",errc,host_buffer_size); if(errc){*ierr=2; return;};

//Allocate three tensor blocks in Host memory outside of TAL-SH (external application):
 //Tensor block 0:
 int trank0 = 4; //tensor block rank
 const int dims0[] = {VDIM_SIZE,VDIM_SIZE,ODIM_SIZE,ODIM_SIZE}; //tensor block dimension extents
 int trank1 = 4; //tensor block rank
 const int dims1[] = {VDIM_SIZE,VDIM_SIZE,VDIM_SIZE,VDIM_SIZE}; //tensor block dimension extents

 int trank2 = 4; //tensor block rank
 const int dims2[] = {ODIM_SIZE,VDIM_SIZE,ODIM_SIZE,VDIM_SIZE}; //tensor block dimension extents

 talsh_tens_t tens0; //declare a TAL-SH tensor block
 errc = talshTensorClean(&tens0); if(errc){*ierr=3; return;}; //clean TAL-SH tensor block object (default ctor)
 errc = talshTensorConstruct(&tens0,C8,trank0,dims0,talshFlatDevId(DEV_HOST,0),NULL,-1,NULL,0.0); //construct tensor block in Host buffer, init to 0.0
 //errc = talshTensorConstruct(&tens0,C8,trank0,dims0,talshFlatDevId(DEV_HOST,0),(void*)tblock0); //register tensor block with external memory
 if(errc){*ierr=4; return;};
 size_t vol0 = talshTensorVolume(&tens0);
 //Tensor block 1:
 talsh_tens_t tens1; //declare a TAL-SH tensor block
 errc = talshTensorClean(&tens1); if(errc){*ierr=5; return;}; //clean TAL-SH tensor block object (default ctor)
 errc = talshTensorConstruct(&tens1,C8,trank1,dims1,talshFlatDevId(DEV_HOST,0),NULL,-1,NULL,0.001); //construct tensor block in Host buffer, init to 0.001
 //errc = talshTensorConstruct(&tens1,C8,trank1,dims1,talshFlatDevId(DEV_HOST,0),(void*)tblock1); //register tensor block with external memory
 if(errc){*ierr=6; return;};
 size_t vol1 = talshTensorVolume(&tens1);
 //Tensor block 2:
 talsh_tens_t tens2; //declare a TAL-SH tensor block
 errc = talshTensorClean(&tens2); if(errc){*ierr=7; return;}; //clean TAL-SH tensor block object (default ctor)
 errc = talshTensorConstruct(&tens2,C8,trank2,dims2,talshFlatDevId(DEV_HOST,0),NULL,-1,NULL,0.01); //construct tensor block in Host buffer, init to 0.01
 //errc=talshTensorConstruct(&tens2,C8,trank2,dims2,talshFlatDevId(DEV_HOST,0),(void*)tblock2); //register tensor block with external memory
 if(errc){*ierr=8; return;};
 size_t vol2 = talshTensorVolume(&tens2);
 double gflops = (sqrt(((double)(vol0))*((double)(vol1))*((double)(vol2)))*2.0)/1e9; //total number of floating point operations (GFlops)
 //Analytic 1-norm of the result: every element of D accumulates 2.0*0.001*0.01
 //over the contracted volume, which reduces to gflops*0.01*0.001*1e9.
 double theor_norm1 = gflops * 0.01 * 0.001 * 1e9;
 printf(" Three TAL-SH tensor blocks have been constructed: Volumes: %zu, %zu, %zu: GFlops = %f\n",vol0,vol1,vol2,gflops);

//Declare a TAL-SH task handle:
 talsh_task_t task0; //declare a TAL-SH task handle
 errc=talshTaskClean(&task0); //clean TAL-SH task handle object to an empty state
 if(errc){*ierr=9; return;};

//Execute a tensor contraction either on CPU (synchronously) or GPU (asynchronously):
#ifndef NO_GPU
 int dev_kind = DEV_NVIDIA_GPU; //NVIDIA GPU devices (also the HIP/AMD path in the TAL-SH port)
 int dev_num = 0; //specific device number (any from gpu_list[])
#else
 int dev_kind = DEV_HOST; //CPU Host (multicore)
 int dev_num = 0; //CPU Host is always a single device (but multicore)
#endif
 //Schedule (COPY_MTT: destination image is MOVED to the device, operands are temporary copies):
 clock_t tms = clock();
 errc=talshTensorContract("D(a,b,i,j)+=L(c,b,d,a)*R(j,d,i,c)",&tens0,&tens1,&tens2,2.0,0.0,dev_num,dev_kind,COPY_MTT,YEP,&task0);
 printf(" Tensor contraction has been scheduled for execution: Status %d\n",errc); if(errc){*ierr=10; return;};
 //Busy-wait for completion of the (possibly asynchronous) task:
 int sts,done=NOPE;
 while(done != YEP && errc == TALSH_SUCCESS){done=talshTaskComplete(&task0,&sts,&errc);}
 double tm = ((double)(clock() - tms))/CLOCKS_PER_SEC;
 if(errc == TALSH_SUCCESS){
  printf(" Tensor contraction has completed successfully: Status %d: Time %f sec\n",sts,tm);
 }else{
  printf(" Tensor contraction has failed: Status %d: Error %d\n",sts,errc);
  *ierr=11; return;
 }
 //Timing as measured by TAL-SH itself (device-side, excludes scheduling overhead):
 double total_time;
 errc=talshTaskTime(&task0,&total_time); if(errc){*ierr=12; return;};
 printf(" Tensor contraction total time = %f: GFlop/s = %f\n",total_time,gflops/total_time);
 //Destruct the task handle:
 errc=talshTaskDestruct(&task0); if(errc){*ierr=13; return;};
#ifndef NO_GPU
 //If executed on GPU, COPY_MTT parameter in the tensor contraction call above means that the
 //destination tensor image was moved to GPU device (letter M means MOVE).
 //So, let's move it back to Host (to a user-specified memory location):
 errc=talshTensorPlace(&tens0,0,DEV_HOST,NULL,COPY_M); //this will move the resulting tensor block back to Host (letter M means MOVE)
 if(errc){*ierr=14; return;};
#endif
 printf(" Tensor result was moved back to Host: Norm1 = %E: Correct = %E\n",talshTensorImageNorm1_cpu(&tens0),theor_norm1);

//Unregister tensor blocks with TAL-SH (reverse construction order):
 errc=talshTensorDestruct(&tens2); if(errc){*ierr=15; return;};
 errc=talshTensorDestruct(&tens1); if(errc){*ierr=16; return;};
 errc=talshTensorDestruct(&tens0); if(errc){*ierr=17; return;};
 printf(" Three external tensor blocks have been unregistered with TAL-SH\n");

//Shutdown TAL-SH:
 errc=talshShutdown();
 printf(" TAL-SH has been shut down: Status %d\n",errc); if(errc){*ierr=18; return;};

 return;
}

int main(int argc, char* argv[]) {
  //Run the C8 contraction test and propagate its status: the original version
  //always returned 0, so a failing test still reported success to the shell/CI.
  int ierr=0;
  test_talsh_c(&ierr);
  if(ierr != 0) fprintf(stderr, "test_talsh_c failed with error code %d\n", ierr);
  return ierr; //non-zero exit status signals failure to the caller
}
aspgomes commented 1 year ago

on frontier, using rocm 5.7 (5.6 should be the same), the sample code appears to work properly :

 Number of NVIDIA GPU found on node = 8
 TAL-SH has been initialized: Status 0: Host buffer size = 1072693248
 Three TAL-SH tensor blocks have been constructed: Volumes: 360000, 810000, 360000: GFlops = 0.648000
 Tensor contraction has been scheduled for execution: Status 0
 Tensor contraction has completed successfully: Status 2000005: Time 1.461498 sec
 Tensor contraction total time = 0.280260: GFlop/s = 2.312139
 Tensor result was moved back to Host: Norm1 = 6.480000E+03: Correct = 6.480000E+03
 Three external tensor blocks have been unregistered with TAL-SH
 TAL-SH has been shut down: Status 0

using rocm 5.1.0, the execution goes to the end but norm1 does not match the reference value:

 Number of NVIDIA GPU found on node = 8
 TAL-SH has been initialized: Status 0: Host buffer size = 1072693248
 Three TAL-SH tensor blocks have been constructed: Volumes: 360000, 810000, 360000: GFlops = 0.648000
 Tensor contraction has been scheduled for execution: Status 0
 Tensor contraction has completed successfully: Status 2000005: Time 9.924454 sec
 Tensor contraction total time = 9.105721: GFlop/s = 0.071164
 Tensor result was moved back to Host: Norm1 = 1.383536E-01: Correct = 6.480000E+03
 Three external tensor blocks have been unregistered with TAL-SH
 TAL-SH has been shut down: Status 0

which is also the case for rocm 5.4.3:

 Number of NVIDIA GPU found on node = 8
#WARNING(tensor_algebra_gpu_nvidia:init_gpus): Unable to set GPU SHMEM width 8: Error 2 
#WARNING(tensor_algebra_gpu_nvidia:init_gpus): Unable to set GPU SHMEM width 8: Error 2 
#WARNING(tensor_algebra_gpu_nvidia:init_gpus): Unable to set GPU SHMEM width 8: Error 2 
#WARNING(tensor_algebra_gpu_nvidia:init_gpus): Unable to set GPU SHMEM width 8: Error 2 
#WARNING(tensor_algebra_gpu_nvidia:init_gpus): Unable to set GPU SHMEM width 8: Error 2 
#WARNING(tensor_algebra_gpu_nvidia:init_gpus): Unable to set GPU SHMEM width 8: Error 2 
#WARNING(tensor_algebra_gpu_nvidia:init_gpus): Unable to set GPU SHMEM width 8: Error 2 
#WARNING(tensor_algebra_gpu_nvidia:init_gpus): Unable to set GPU SHMEM width 8: Error 2 
 TAL-SH has been initialized: Status 0: Host buffer size = 1072693248
 Three TAL-SH tensor blocks have been constructed: Volumes: 360000, 810000, 360000: GFlops = 0.648000
 Tensor contraction has been scheduled for execution: Status 0
 Tensor contraction has completed successfully: Status 2000005: Time 1.028282 sec
 Tensor contraction total time = 0.329818: GFlop/s = 1.964722
 Tensor result was moved back to Host: Norm1 = 4.352281E-03: Correct = 6.480000E+03
 Three external tensor blocks have been unregistered with TAL-SH
 TAL-SH has been shut down: Status 0

i've observed that for rocm below 5.6 there's an issue with the optimization levels beyond -O1 with hipcc that can result in runtime errors such as

:0:rocdevice.cpp            :2614: 3360862704560 us: 16818: [tid:0x7fff99b97700] Device::callbackQueue aborting with error : HSA_STATUS_ERROR_MEMORY_APERTURE_VIOLATION: The agent attempted to access memory beyond the largest legal address. code: 0x29

when running test_talsh.x

so coming back to rocm 5.1.0 but reducing the optimization to -O1 for hipcc results in a norm1 that matches the reference :

 Number of NVIDIA GPU found on node = 8
 TAL-SH has been initialized: Status 0: Host buffer size = 1072693248
 Three TAL-SH tensor blocks have been constructed: Volumes: 360000, 810000, 360000: GFlops = 0.648000
 Tensor contraction has been scheduled for execution: Status 0
 Tensor contraction has completed successfully: Status 2000005: Time 9.875879 sec
 Tensor contraction total time = 9.014154: GFlop/s = 0.071887
 Tensor result was moved back to Host: Norm1 = 6.480000E+03: Correct = 6.480000E+03
 Three external tensor blocks have been unregistered with TAL-SH
 TAL-SH has been shut down: Status 0

so my impression from all of this is that the issue looks to be due to rocm rather than to tal-sh, and that it could be worthwhile to list rocm 5.6 as a minimum requirement (and/or to indicate the decrease in optimization level needed for earlier rocm versions).

aspgomes commented 11 months ago

I have looked at this again, and as it turns out rocm 5.6.0 still shows the issue, yielding incorrect Norm1 values unless -O1 is used. Code compiled with BUILD_TYPE=PRF:

-O3

 Number of NVIDIA GPU found on node = 8
 TAL-SH has been initialized: Status 0: Host buffer size = 1072693248
 Three TAL-SH tensor blocks have been constructed: Volumes: 360000, 810000, 360000: GFlops = 0.648000
 Tensor contraction has been scheduled for execution: Status 0
 Tensor contraction has completed successfully: Status 2000005: Time 0.059791 sec
 Tensor contraction total time = 0.280899: GFlop/s = 2.306882
 Tensor result was moved back to Host: Norm1 = 9.368648E-02: Correct = 6.480000E+03
 Three external tensor blocks have been unregistered with TAL-SH
 TAL-SH has been shut down: Status 0

-O2

 Number of NVIDIA GPU found on node = 8
 TAL-SH has been initialized: Status 0: Host buffer size = 1072693248
 Three TAL-SH tensor blocks have been constructed: Volumes: 360000, 810000, 360000: GFlops = 0.648000
 Tensor contraction has been scheduled for execution: Status 0
 Tensor contraction has completed successfully: Status 2000005: Time 0.056695 sec
 Tensor contraction total time = 0.321033: GFlop/s = 2.018483
 Tensor result was moved back to Host: Norm1 = 4.342731E-01: Correct = 6.480000E+03
 Three external tensor blocks have been unregistered with TAL-SH
 TAL-SH has been shut down: Status 0

-O1

 Number of NVIDIA GPU found on node = 8
 TAL-SH has been initialized: Status 0: Host buffer size = 1072693248
 Three TAL-SH tensor blocks have been constructed: Volumes: 360000, 810000, 360000: GFlops = 0.648000
 Tensor contraction has been scheduled for execution: Status 0
 Tensor contraction has completed successfully: Status 2000005: Time 0.059132 sec
 Tensor contraction total time = 0.307055: GFlop/s = 2.110372
 Tensor result was moved back to Host: Norm1 = 6.480000E+03: Correct = 6.480000E+03
 Three external tensor blocks have been unregistered with TAL-SH
 TAL-SH has been shut down: Status 0                            

with that, i would amend the suggestion to indicate rocm 5.6 still requires the workaround, and rocm 5.7.0 as minimum requirement without it.