template global void forallkernel(int start, int N, Func f) {
int tid = start + threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) f(tid);
}
template
void forall(int start, int end, LoopBody &&body) {
int tpb = 32;
int blocks = (end - start) / tpb;
blocks = ((end - start) % tpb == 0) ? blocks : blocks + 1;
// printf("Launching the kernel blocks= %d tpb= %d \n",blocks,tpb);
// forallkernel<<<blocks, tpb>>>(start, end, body);
hipLaunchKernelGGL(forallkernel, blocks, tpb, 0, 0, start, end,
body);
hipStreamSynchronize(0);
}
// NOTE(review): this copy of main is truncated -- the brace opened on the
// next line is never closed before the include lines that follow, and the
// pointer/multiplication operators appear to have been stripped from the
// text. A complete definition of main appears later in this file.
int main(int argc, char *argv){
const int N=100000;
double a,b,c;
void ptr;
hipMalloc(&ptr,sizeof(double)N3);
a = (double)ptr;
b = a+N;
c = b+N;
CALI_MARK_BEGIN("TEST");
#include <cstdlib>
#include <iostream>

#include <hip/hip_runtime.h>
#include <caliper/cali.h>
// Build using:
//   hipcc -I<caliper-include-dir> caliper_repro.C -L<caliper-lib-dir> -Wl,-rpath=<caliper-lib-dir> -fopenmp -lcaliper
template
global void forallkernel(int start, int N, Func f) {
int tid = start + threadIdx.x + blockIdx.x * blockDim.x;
if (tid < N) f(tid);
}
template
void forall(int start, int end, LoopBody &&body) {
int tpb = 32;
int blocks = (end - start) / tpb;
blocks = ((end - start) % tpb == 0) ? blocks : blocks + 1;
// printf("Launching the kernel blocks= %d tpb= %d \n",blocks,tpb);
// forallkernel<<<blocks, tpb>>>(start, end, body);
hipLaunchKernelGGL(forallkernel, blocks, tpb, 0, 0, start, end,
body);
hipStreamSynchronize(0);
}
int main(int argc, char *argv){ const int N=100000; double a,b,c; void ptr; hipMalloc(&ptr,sizeof(double)N3); a = (double)ptr; b = a+N; c = b+N; CALI_MARK_BEGIN("TEST");
pragma omp parallel for
for( int i=0;i<N;i++){ a[i]=i; b[i]=ii; c[i]=2a[i]+b[i]; } forall(0,N,[=]host device(int i){ c[i]=c[i]-i*i; }); CALI_MARK_END("TEST"); std::cout<<c[256]<<"\n"; }
Run using:
export CALI_CONFIG=hatchet-region-profile,output.format=cali
export AMD_SERIALIZE_KERNEL=3
export MPICH_GPU_SUPPORT_ENABLED=1
flux mini run -n 8 -c 8 -g 1 -o gpu-affinity=per-task -o cpu-affinity=per-task ./a.out