#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#define MAX_N 64 * 1024 * 4
// Reset the reproducer's buffers to their starting state.
// For every index below MAX_N, a and a_h are cleared to 0 while b and c
// are both set to the index value itself.
void reset_input(double *a, double *a_h, double *b, double *c) {
  int idx = 0;
  while (idx < MAX_N) {
    // Clear the host reference first, then the device-result buffer.
    a_h[idx] = 0;
    a[idx] = 0;
    // Both inputs carry the element's own index.
    b[idx] = idx;
    c[idx] = idx;
    ++idx;
  }
}
// Reproducer driver: computes per-sample sums of b and c on the offload
// device, recomputes the same sums on the host, and compares the results.
// Prints "Succeeded" and returns 0 when every element of a matches a_h;
// prints the first mismatch and returns 1 otherwise. argc/argv are unused.
int main(int argc, char *argv[]) {
// Host buffers of MAX_N doubles each: a receives the device result, a_h holds
// the host reference, b and c are the inputs.
// NOTE(review): the malloc results are not checked and the buffers are never freed.
double * a = (double *) malloc(MAX_N * sizeof(double));
double * a_h = (double *) malloc(MAX_N * sizeof(double));
double * b = (double *) malloc(MAX_N * sizeof(double));
double * c = (double *) malloc(MAX_N * sizeof(double));
// Element count used for the update array sections and in the error message.
int n = MAX_N;
// a, b and c are mapped before reset_input has filled them; the subsequent
// "target update to" is what pushes the initialised contents. a_h is never mapped.
// NOTE(review): there is no matching "target exit data" for these mappings.
#pragma omp target enter data map(to:a[:MAX_N],b[:MAX_N],c[:MAX_N])
reset_input(a, a_h, b, c);
#pragma omp target update to(a[:n],b[:n],c[:n])
// Device computation: for each of the 64 * 1024 samples, add up the 4
// consecutive entries of b and c belonging to that sample.
// The construct has no map clauses of its own; presumably it relies on the
// mappings created by the enter data directive above.
#pragma omp target teams distribute parallel for
for (uint64_t sample=0; sample < 64 * 1024; sample++) {
double partial_sum = 0.0;
// Nested parallel region with a reduction inside the combined target construct.
// According to the report accompanying this code, this inner reduction is
// what crashes at runtime.
#pragma omp parallel for reduction(+:partial_sum)
for (int i = 0; i < 4; ++i) {
partial_sum += b[sample * 4 + i] + c[sample * 4 + i];
}
// Only the first slot of each group of 4 is written; the other slots of a
// are not touched by this loop.
a[sample * 4] = partial_sum;
}
// Host reference: the same per-group sums, computed serially into a_h.
for (int i = 0; i < 64 * 1024; ++i) {
double p_sum = 0.0;
for (int j = 0; j < 4; ++j) {
p_sum += b[4*i + j] + c[4*i + j];
}
a_h[i*4] = p_sum;
}
// Bring the device result back before comparing.
#pragma omp target update from(a[:n])
// Compare all MAX_N elements with exact floating-point equality and stop at
// the first difference.
for (int i = 0; i < MAX_N; ++i) {
if (a_h[i] != a[i]) {
printf("A Error at n = %d, i = %d: host = %f, device = %f\n", n, i, a_h[i], a[i]);
return 1;
}
}
printf("Succeeded\n");
return 0;
}
The inner reduction (the nested `parallel for reduction(+:partial_sum)` loop inside the target region) crashes at runtime.
Compile line:
clang++ -O3 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a test.cpp -o test -save-temps
The postlink LLVM IR looks OK. The postopt LLVM IR looks suspicious: some function bodies are set to `unreachable`, yet those functions are still called from the kernel. This could well be a red herring, and the error could be somewhere else, but it could be a starting point. At first glance it seems to be an issue with the OpenMPOpt pass.
In the following example:
```
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#define MAX_N 64 * 1024 * 4
// Reset the reproducer's buffers to their starting state.
// For every index below MAX_N, a and a_h are cleared to 0 while b and c
// are both set to the index value itself.
void reset_input(double *a, double *a_h, double *b, double *c) {
  int idx = 0;
  while (idx < MAX_N) {
    // Clear the host reference first, then the device-result buffer.
    a_h[idx] = 0;
    a[idx] = 0;
    // Both inputs carry the element's own index.
    b[idx] = idx;
    c[idx] = idx;
    ++idx;
  }
}
// Reproducer driver: computes per-sample sums of b and c on the offload
// device, recomputes the same sums on the host, and compares the results.
// Prints "Succeeded" and returns 0 when every element of a matches a_h;
// prints the first mismatch and returns 1 otherwise. argc/argv are unused.
int main(int argc, char *argv[]) {
// Host buffers of MAX_N doubles each: a receives the device result, a_h holds
// the host reference, b and c are the inputs.
// NOTE(review): the malloc results are not checked and the buffers are never freed.
double * a = (double *) malloc(MAX_N * sizeof(double));
double * a_h = (double *) malloc(MAX_N * sizeof(double));
double * b = (double *) malloc(MAX_N * sizeof(double));
double * c = (double *) malloc(MAX_N * sizeof(double));
// Element count used for the update array sections and in the error message.
int n = MAX_N;
// a, b and c are mapped before reset_input has filled them; the subsequent
// "target update to" is what pushes the initialised contents. a_h is never mapped.
// NOTE(review): there is no matching "target exit data" for these mappings.
#pragma omp target enter data map(to:a[:MAX_N],b[:MAX_N],c[:MAX_N])
reset_input(a, a_h, b, c);
#pragma omp target update to(a[:n],b[:n],c[:n])
// Device computation: for each of the 64 * 1024 samples, add up the 4
// consecutive entries of b and c belonging to that sample.
// The construct has no map clauses of its own; presumably it relies on the
// mappings created by the enter data directive above.
#pragma omp target teams distribute parallel for
for (uint64_t sample=0; sample < 64 * 1024; sample++) {
double partial_sum = 0.0;
// Nested parallel region with a reduction inside the combined target construct.
// According to the report accompanying this code, this inner reduction is
// what crashes at runtime.
#pragma omp parallel for reduction(+:partial_sum)
for (int i = 0; i < 4; ++i) {
partial_sum += b[sample * 4 + i] + c[sample * 4 + i];
}
// Only the first slot of each group of 4 is written; the other slots of a
// are not touched by this loop.
a[sample * 4] = partial_sum;
}
// Host reference: the same per-group sums, computed serially into a_h.
for (int i = 0; i < 64 * 1024; ++i) {
double p_sum = 0.0;
for (int j = 0; j < 4; ++j) {
p_sum += b[4*i + j] + c[4*i + j];
}
a_h[i*4] = p_sum;
}
// Bring the device result back before comparing.
#pragma omp target update from(a[:n])
// Compare all MAX_N elements with exact floating-point equality and stop at
// the first difference.
for (int i = 0; i < MAX_N; ++i) {
if (a_h[i] != a[i]) {
printf("A Error at n = %d, i = %d: host = %f, device = %f\n", n, i, a_h[i], a[i]);
return 1;
}
}
printf("Succeeded\n");
return 0;
}
```
the inner reduction (the nested `parallel for reduction(+:partial_sum)` loop inside the target region) crashes at runtime.
Compile line:
```
clang++ -O3 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a test.cpp -o test -save-temps
```
The postlink LLVM IR looks OK. The postopt LLVM IR looks suspicious: some function bodies are set to `unreachable`, yet those functions are still called from the kernel. This could well be a red herring, and the error could be somewhere else, but it could be a starting point. At first glance it seems to be an issue with the OpenMPOpt pass.
In the following example:
the inner reduction (the nested `parallel for reduction(+:partial_sum)` loop inside the target region) crashes at runtime.
Compile line:
The postlink LLVM IR looks OK. The postopt LLVM IR looks suspicious: some function bodies are set to `unreachable`, yet those functions are still called from the kernel. This could well be a red herring, and the error could be somewhere else, but it could be a starting point. At first glance it seems to be an issue with the OpenMPOpt pass.