llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.86k stars 11.91k forks source link

[OpenMP] Inner reduction crashes at runtime #66708

Open doru1004 opened 1 year ago

doru1004 commented 1 year ago

In the following example:

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_N 64 * 1024 * 4

/*
 * Re-initialise the test arrays over their first MAX_N elements:
 * a and a_h are cleared to zero, while b and c each receive the
 * element's own index as its value.
 */
void reset_input(double *a, double *a_h, double *b, double *c) {
  int idx = 0;
  while (idx < MAX_N) {
    /* Output buffers start from zero. */
    a_h[idx] = 0;
    a[idx] = 0;
    /* Both input buffers hold the index. */
    b[idx] = idx;
    c[idx] = idx;
    ++idx;
  }
}

// Reproducer driver: computes, for each group of 4 consecutive elements,
// the sum of b[] + c[] on the device (using a nested parallel reduction)
// and on the host, then compares the two results element by element.
// Returns 0 and prints "Succeeded" when all MAX_N elements match, or
// prints the first mismatch and returns 1. argc/argv are unused.
int main(int argc, char *argv[]) {
  // Four buffers of MAX_N doubles: a = device result, a_h = host reference,
  // b and c = inputs.
  // NOTE(review): the malloc results are not checked and the buffers are
  // never freed in this function.
  double * a = (double *) malloc(MAX_N * sizeof(double));
  double * a_h = (double *) malloc(MAX_N * sizeof(double));
  double * b = (double *) malloc(MAX_N * sizeof(double));
  double * c = (double *) malloc(MAX_N * sizeof(double));

  int n = MAX_N;
  // a, b and c are mapped before reset_input() has filled them; the
  // initialised contents are pushed by the "target update to" below.
#pragma omp target enter data map(to:a[:MAX_N],b[:MAX_N],c[:MAX_N])

  // Zero a and a_h, set b[i] = c[i] = i.
  reset_input(a, a_h, b, c);

  // Push the freshly initialised host contents of a, b and c.
#pragma omp target update to(a[:n],b[:n],c[:n])
  // Offloaded loop: one iteration per sample (64 * 1024 samples, 4 elements
  // each, which together cover MAX_N elements).
  // NOTE(review): uint64_t is used here, but neither <stdint.h> nor
  // <cstdint> is among the includes shown; presumably it is pulled in
  // transitively — confirm.
#pragma omp target teams distribute parallel for
  for (uint64_t sample=0; sample < 64 * 1024; sample++) {
    double partial_sum = 0.0;
    // Nested parallel region with a '+' reduction over the sample's 4
    // elements; this inner reduction is what the report says crashes at
    // runtime.
    #pragma omp parallel for reduction(+:partial_sum)
    for (int i = 0; i < 4; ++i) {
      partial_sum += b[sample * 4 + i] + c[sample * 4 + i];
    }
    // Only the first slot of each 4-element group is written; the other
    // three keep the zero set by reset_input().
    a[sample * 4] = partial_sum;
  }

  // Host reference: the same per-group sum, computed sequentially and
  // stored in a_h with the same layout as the device loop uses for a.
  for (int i = 0; i < 64 * 1024; ++i) {
    double p_sum = 0.0;
    for (int j = 0; j < 4; ++j) {
      p_sum += b[4*i + j] + c[4*i + j];
    }
    a_h[i*4] = p_sum;
  }
  // Bring the device result for a back to the host before comparing.
#pragma omp target update from(a[:n])

  // Exact floating-point comparison of every element; stop at the first
  // mismatch.
  for (int i = 0; i < MAX_N; ++i) {
    if (a_h[i] != a[i]) {
      printf("A Error at n = %d, i = %d: host = %f, device = %f\n", n, i, a_h[i], a[i]);
      return 1;
    }
  }
  printf("Succeeded\n");
  return 0;
}

the inner reduction crashes at runtime.

Compile line:

clang++  -O3 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a test.cpp  -o test -save-temps

The postlink LLVM IR looks OK. The postopt LLVM IR looks suspicious: function bodies are set to unreachable and are then called from the kernel. This could well be a red herring, and the error could be somewhere else, but it could be a starting point. At first glance it seems to be an issue with OpenMP Opt.

llvmbot commented 1 year ago

@llvm/issue-subscribers-openmp

In the following example: ``` #include <omp.h> #include <stdio.h> #include <stdlib.h> #define MAX_N 64 * 1024 * 4 void reset_input(double *a, double *a_h, double *b, double *c) { for(int i = 0 ; i < MAX_N ; i++) { a[i] = a_h[i] = 0; b[i] = i; c[i] = i; } } int main(int argc, char *argv[]) { double * a = (double *) malloc(MAX_N * sizeof(double)); double * a_h = (double *) malloc(MAX_N * sizeof(double)); double * b = (double *) malloc(MAX_N * sizeof(double)); double * c = (double *) malloc(MAX_N * sizeof(double)); int n = MAX_N; #pragma omp target enter data map(to:a[:MAX_N],b[:MAX_N],c[:MAX_N]) reset_input(a, a_h, b, c); #pragma omp target update to(a[:n],b[:n],c[:n]) #pragma omp target teams distribute parallel for for (uint64_t sample=0; sample < 64 * 1024; sample++) { double partial_sum = 0.0; #pragma omp parallel for reduction(+:partial_sum) for (int i = 0; i < 4; ++i) { partial_sum += b[sample * 4 + i] + c[sample * 4 + i]; } a[sample * 4] = partial_sum; } for (int i = 0; i < 64 * 1024; ++i) { double p_sum = 0.0; for (int j = 0; j < 4; ++j) { p_sum += b[4*i + j] + c[4*i + j]; } a_h[i*4] = p_sum; } #pragma omp target update from(a[:n]) for (int i = 0; i < MAX_N; ++i) { if (a_h[i] != a[i]) { printf("A Error at n = %d, i = %d: host = %f, device = %f\n", n, i, a_h[i], a[i]); return 1; } } printf("Succeeded\n"); return 0; } ``` the inner reduction crashes at runtime. Compile line: ``` clang++ -O3 -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx90a test.cpp -o test -save-temps ``` The postlink LLVM IR looks ok. The postopt LLVM IR looks suspicious with function bodies being set to unreachable and then called from the kernel. This could well be a red herring and/or the error could be somewhere else but it could be starting point. At first glance it seems to be an issue with OpenMP Opt.
josemonsalve2 commented 1 year ago

@rodrigo-ceccato This may be of interest to you.