clang-omp / clang

clang with OpenMP 3.1 and some elements of OpenMP 4.0 support
clang-omp.github.com
Other
91 stars 15 forks source link

clang crash on dependencies #26

Closed pbrunet closed 10 years ago

pbrunet commented 10 years ago

Hi,

Since task dependencies (the `depend` clause) are now available in the clang-omp version, I tried them, and clang crashes on a Cholesky test case.

The code is:

#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#include <atlas/cblas.h> 
#include <atlas/clapack.h> /* assume MKL/ATLAS clapack version */

/* Explicit prototypes for the four BLAS/LAPACK routines called by Cholesky().
   NOTE(review): the atlas headers included above presumably declare these
   already -- confirm whether these redeclarations are still required. */

/* Called by Cholesky() on each diagonal tile (k, k). */
int clapack_dpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo,
                   const int N, double *A, const int lda);

/* Called by Cholesky() on each tile (m, k) below the diagonal tile (k, k);
   A is passed the diagonal tile and B the tile being updated. */
void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side,
                 const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
                 const enum CBLAS_DIAG Diag, const int M, const int N,
                 const double alpha, const double *A, const int lda,
                 double *B, const int ldb);

/* Called by Cholesky() to update the diagonal tile (m, m) from tile (m, k). */
void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
                 const enum CBLAS_TRANSPOSE Trans, const int N, const int K,
                 const double alpha, const double *A, const int lda,
                 const double beta, double *C, const int ldc);

/* Called by Cholesky() to update the off-diagonal tile (m, n) from tiles
   (m, k) and (n, k). */
void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
                 const int K, const double alpha, const double *A,                  const int lda, const double *B, const int ldb,
                 const double beta, double *C, const int ldc);

/* Fill the m x m row-major matrix A with a deterministic test pattern:
   entry (i, j) is 1 / (1 + i + j) and every diagonal entry is m.
   The pattern is symmetric and strictly diagonally dominant with a positive
   diagonal, hence symmetric positive definite. Nothing is written when m == 0.
*/
static void generate_matrix(double* A, size_t m)
{
  size_t row = 0;
  while (row < m)
  {
    double* line = A + row * m;
    size_t col = 0;
    while (col < m)
    {
      line[col] = 1.0 / (1.0 + row + col);
      ++col;
    }
    /* Written after the inner loop so it overrides the value stored at (row, row). */
    line[row] = m * 1.0;
    ++row;
  }
}

/* Block (tiled) Cholesky factorization A <- L * L^t, performed in place.
   The lower triangle of AA, including the diagonal, receives the factor.

   AA       : N x N matrix stored row-major with leading dimension N.
   N        : matrix order; must be a positive multiple of blocsize.
   blocsize : edge length of the square tiles; must be non-zero.

   One thread creates one OpenMP task per tile operation and the depend
   clauses order the tasks. Every depend clause names exactly the tile(s) the
   BLAS call reads or writes. Invalid arguments are reported on stderr and the
   matrix is left untouched.
*/
void Cholesky( double* AA, int N, size_t blocsize )
{
  /* A zero block size would never advance k, and a ragged last tile would
     make the BLAS calls run past the end of the matrix. */
  if (AA == NULL || N <= 0 || blocsize == 0 || (size_t)N % blocsize != 0)
  {
    fprintf(stderr,
            "Cholesky: invalid arguments (need a non-NULL matrix and N (%d) "
            "a positive multiple of blocsize (%zu))\n",
            N, blocsize);
    return;
  }

  /* Unsigned copy of N so the size_t loop counters are not compared with a
     signed value. */
  const size_t dim = (size_t)N;

  /* Pointer to rows of N doubles: A[i][j] is element (i, j), and the array
     section A[i:b][j:b] is the b x b tile whose top-left corner is (i, j). */
  double (*A)[N] = (double (*)[N])AA;

#pragma omp parallel
#pragma omp single
  for (size_t k = 0; k < dim; k += blocsize)
  {
    /* Factor the diagonal tile (k, k).
       NOTE(review): the return code of clapack_dpotrf is ignored. */
#pragma omp task shared(A, blocsize, N) \
        depend(inout: A[k:blocsize][k:blocsize])
    clapack_dpotrf(
      CblasRowMajor, CblasLower, blocsize, &A[k][k], N
    );

    /* Update every tile (m, k) below the diagonal tile.
       NOTE(review): the textbook panel update is B <- B * L_kk^-T
       (CblasRight, CblasTrans, CblasNonUnit); the Left/NoTrans/Unit arguments
       are kept exactly as originally posted -- confirm they are intended. */
    for (size_t m = k + blocsize; m < dim; m += blocsize)
    {
#pragma omp task shared(A, blocsize, N) \
        depend(in: A[k:blocsize][k:blocsize]) \
        depend(inout: A[m:blocsize][k:blocsize])
      cblas_dtrsm
      (
        CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
        blocsize, blocsize, 1., &A[k][k], N, &A[m][k], N
      );
    }

    /* Trailing update of the tiles to the right of column block k. */
    for (size_t m = k + blocsize; m < dim; m += blocsize)
    {
#pragma omp task shared(A, blocsize, N) \
        depend(in: A[m:blocsize][k:blocsize]) \
        depend(inout: A[m:blocsize][m:blocsize])
      cblas_dsyrk
      (
        CblasRowMajor, CblasLower, CblasNoTrans,
        blocsize, blocsize, -1.0, &A[m][k], N, 1.0, &A[m][m], N
      );

      for (size_t n = k + blocsize; n < m; n += blocsize)
      {
        /* dgemm accumulates into tile (m, n), so that tile is the inout
           dependence; the later dtrsm on (m, n) is then ordered after it. */
#pragma omp task shared(A, blocsize, N) \
        depend(in: A[m:blocsize][k:blocsize], A[n:blocsize][k:blocsize]) \
        depend(inout: A[m:blocsize][n:blocsize])
        cblas_dgemm
        (
          CblasRowMajor, CblasNoTrans, CblasTrans,
          blocsize, blocsize, blocsize, -1.0, &A[m][k], N, &A[n][k], N, 1.0, &A[m][n], N
        );
      }
    }
  }
}

/* Do one run for cholesky: allocate an N x N matrix, fill it with
   generate_matrix(), factor it with Cholesky() and release it.

   N           : matrix order; must be positive.
   block_count : number of tiles per dimension; must be positive and divide N
                 exactly, so that every tile has the same size N / block_count.

   Invalid sizes are reported on stderr and the run is skipped.
*/
void doone_exp( int N, int block_count )
{
  /* block_count == 0 would divide by zero, and a count that does not divide N
     would leave a ragged last tile that the factorization cannot handle. */
  if (N <= 0 || block_count <= 0 || N % block_count != 0)
  {
    fprintf(stderr,
            "Invalid sizes: N (%i) must be a positive multiple of the block count (%i)\n",
            N, block_count);
    return;
  }

  const size_t dim = (size_t)N;
  const size_t blocsize = dim / (size_t)block_count;

  printf("N         : %i\n", N);
  printf("size block: %zu\n", blocsize);
  printf("#blocks   : %i\n", block_count);

  /* posix_memalign reports failure through its return value, not errno.
     The byte count is computed in size_t and checked for overflow first. */
  int rc = ENOMEM;
  void* mem = NULL;
  if (dim <= SIZE_MAX / sizeof(double) / dim)
    rc = posix_memalign(&mem, 4096, dim * dim * sizeof(double));
  if (rc != 0)
  {
    printf("Fatal Error. Cannot allocate matrice A, errno: %i\n", rc);
    return;
  }
  double* A = mem;

  generate_matrix(A, dim);

  Cholesky(A, N, blocsize);

  free(A);
}

/* Parse text as a strictly positive decimal int.
   Returns 0 and stores the value in *out on success; returns -1 and leaves
   *out untouched when text is empty, has trailing characters, or is outside
   the range 1..INT_MAX.
*/
static int parse_positive_int(const char* text, int* out)
{
  char* end = NULL;
  errno = 0;
  long value = strtol(text, &end, 10);
  if (end == text || *end != '\0' || errno == ERANGE
      || value <= 0 || value > INT_MAX)
    return -1;
  *out = (int)value;
  return 0;
}

/* main entry point
   usage: prog [matrix_dimension [block_count]]
   Defaults are a 32 x 32 matrix split into 2 blocks per dimension.
   Returns EXIT_FAILURE when an argument is not a positive integer.
*/
int main(int argc, char** argv)
{
  // matrix dimension
  int n = 32;
  if (argc > 1 && parse_positive_int(argv[1], &n) != 0)
  {
    fprintf(stderr, "Invalid matrix dimension: '%s'\n", argv[1]);
    return EXIT_FAILURE;
  }

  // block count
  int block_count = 2;
  if (argc > 2 && parse_positive_int(argv[2], &block_count) != 0)
  {
    fprintf(stderr, "Invalid block count: '%s'\n", argv[2]);
    return EXIT_FAILURE;
  }

  doone_exp( n, block_count );

  return 0;
}

I compile it with atlas cblas/lapack version:

$> clang cholesky_inplace.c -lcblas -llapack_atlas -fopenmp

It works fine without -fopenmp but crashes with it.

Regards, Pierrick

alexey-bataev commented 10 years ago

Pierrick, Thanks for the report and reproducer. I'll check it ASAP.

Best regards,

Alexey Bataev

Software Engineer Intel Compiler Team Intel Corp.

18 Март 2014 г. 16:33:32, pbrunet писал:

Hi,

As dependencies appears in the clang-omp version, I try them and clang crash on a cholesky case.

code is :

include

include

include

include <sys/types.h>

include

include

include <atlas/cblas.h>

include <atlas/clapack.h>/* assume MKL/ATLAS clapack version */

int clapack_dpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, const int N, double *A, const int lda);

void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double A, const int lda, double B, const int ldb);

void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double A, const int lda, const double beta, double C, const int ldc);

void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double A, const int lda, const double B, const int ldb, const double beta, double *C, const int ldc);

/* Generate a random matrix symetric definite positive matrix of size m x m

  • it will be also interesting to generate symetric diagonally dominant matrices which are known to be definite postive. _/ static void generatematrix(double A, size_t m) { // for (size_t i = 0; i< m; ++i) { for (size_t j = 0; j< m; ++j) A[i_m+j] = 1.0 / (1.0+i+j); A[i_m+i] = m*1.0; } }

/* Block Cholesky factorization A <- L * L^t Lower triangular matrix, with the diagonal, stores the Cholesky factor. / void Cholesky( double AA, int N, size_t blocsize ) { double (A)[N][N] = (double ()[N][N])&AA[0];

pragma omp parallel

pragma omp single

for (size_t k=0; k < N; k += blocsize) {

pragma omp task shared(A), private(blocsize, N) depend(inout: A[k:blocsize][k:blocsize])

 clapack_dpotrf(
   CblasRowMajor,  CblasLower,  blocsize,  &(*A)[k][k],  N
 );
 for  (size_t  m=k+blocsize;  m  <  N;  m  +=  blocsize)
 {
    #pragma omp task shared(A), private(blocsize, N) \
          depend(inout: A[m:blocsize][k:blocsize]) depend(in: A[k:blocsize][k:blocsize])
   cblas_dtrsm
   (
     CblasRowMajor,  CblasLeft,  CblasLower,  CblasNoTrans,  CblasUnit,
     blocsize,  blocsize,  1.,  &(*A)[k][k],  N,  &(*A)[m][k],  N
   );
 }

 for  (size_t  m=k+blocsize;  m  <  N;  m  +=  blocsize)
 {
    #pragma omp task shared(A), private(blocsize, N) \
         depend(inout: A[m:blocsize][m:blocsize]) depend(in: A[m:blocsize][k:blocsize])
   cblas_dsyrk
   (
     CblasRowMajor,  CblasLower,  CblasNoTrans,
     blocsize,  blocsize,  -1.0,  &(*A)[m][k],  N,  1.0,  &(*A)[m][m],  N
   );
   for  (size_t  n=k+blocsize;  n  <  m;  n  +=  blocsize)
   {

pragma omp task shared(A), private(blocsize, N) \

             depend(inout: A[m:blocsize][m:blocsize]) depend(in: A[m:blocsize][k:blocsize], A[n:blocsize][k:blocsize])
     cblas_dgemm
     (
       CblasRowMajor,  CblasNoTrans,  CblasTrans,
       blocsize,  blocsize,  blocsize,  -1.0,  &(_A)[m][k],  N,  &(_A)[n][k],  N,  1.0,  &(*A)[m][n],  N
     );
   }
 }

} }

/* Do one run for cholesky */ void doone_exp( int N, int block_count ) { size_t blocsize = N / block_count;

printf("N : %i\n", N); printf("size block: %i\n", blocsize); printf("#blocks : %i\n", block_count); double* A = 0; if (0 != posixmemalign((void*)&A, 4096, N_N*sizeof(double))) { printf("Fatal Error. Cannot allocate matrice A, errno: %i\n", errno); return; }

generate_matrix(A, N);

Cholesky(A, N, blocsize);

free(A); }

/* main entry point / int main(int argc, char* argv) { // matrix dimension int n = 32; if (argc > 1) n = atoi(argv[1]);

// block count int block_count = 2; if (argc > 2) block_count = atoi(argv[2]);

doone_exp( n, block_count );

return 0; }

I compile it with atlas cblas/lapack version:

$> clang cholesky_inplace.c -lcblas -llapack_atlas -fopenmp

It works fine without -fopenmp but crash with it.

Regards, Pierrick

— Reply to this email directly or view it on GitHub https://github.com/clang-omp/clang/issues/26.

alexey-bataev commented 10 years ago

Pierrick, Try again. I've fixed a bug.

Best regards,

Alexey Bataev

Software Engineer Intel Compiler Team Intel Corp.

19 Март 2014 г. 7:17:06, Alexey Bataev писал:

Pierrick, Thanks for the report and reproducer. I'll check it ASAP.

Best regards,

Alexey Bataev

Software Engineer Intel Compiler Team Intel Corp.

18 Март 2014 г. 16:33:32, pbrunet писал:

Hi,

As dependencies appears in the clang-omp version, I try them and clang crash on a cholesky case.

code is :

include

include

include

include <sys/types.h>

include

include

include <atlas/cblas.h>

include <atlas/clapack.h>/* assume MKL/ATLAS clapack version */

int clapack_dpotrf(const enum ATLAS_ORDER Order, const enum ATLAS_UPLO Uplo, const int N, double *A, const int lda);

void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const int M, const int N, const double alpha, const double A, const int lda, double B, const int ldb);

void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const int N, const int K, const double alpha, const double A, const int lda, const double beta, double C, const int ldc);

void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const double alpha, const double A, const int lda, const double B, const int ldb, const double beta, double *C, const int ldc);

/* Generate a random matrix symetric definite positive matrix of size m x m

  • it will be also interesting to generate symetric diagonally dominant matrices which are known to be definite postive. _/ static void generatematrix(double A, size_t m) { // for (size_t i = 0; i< m; ++i) { for (size_t j = 0; j< m; ++j) A[i_m+j] = 1.0 / (1.0+i+j); A[i_m+i] = m*1.0; } }

/* Block Cholesky factorization A <- L * L^t Lower triangular matrix, with the diagonal, stores the Cholesky factor. / void Cholesky( double AA, int N, size_t blocsize ) { double (A)[N][N] = (double ()[N][N])&AA[0];

pragma omp parallel

pragma omp single

for (size_t k=0; k < N; k += blocsize) {

pragma omp task shared(A), private(blocsize, N) depend(inout:

A[k:blocsize][k:blocsize]) clapack_dpotrf( CblasRowMajor, CblasLower, blocsize, &(*A)[k][k], N );

 for  (size_t  m=k+blocsize;  m  <  N;  m  +=  blocsize)
 {
    #pragma omp task shared(A), private(blocsize, N) \
          depend(inout: A[m:blocsize][k:blocsize]) depend(in:

A[k:blocsize][k:blocsize]) cblas_dtrsm ( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, blocsize, blocsize, 1., &(_A)[k][k], N, &(_A)[m][k], N ); }

 for  (size_t  m=k+blocsize;  m  <  N;  m  +=  blocsize)
 {
    #pragma omp task shared(A), private(blocsize, N) \
         depend(inout: A[m:blocsize][m:blocsize]) depend(in:

A[m:blocsize][k:blocsize]) cblas_dsyrk ( CblasRowMajor, CblasLower, CblasNoTrans, blocsize, blocsize, -1.0, &(_A)[m][k], N, 1.0, &(_A)[m][m], N ); for (size_t n=k+blocsize; n < m; n += blocsize) {

pragma omp task shared(A), private(blocsize, N) \

             depend(inout: A[m:blocsize][m:blocsize]) depend(in:

A[m:blocsize][k:blocsize], A[n:blocsize][k:blocsize]) cblas_dgemm ( CblasRowMajor, CblasNoTrans, CblasTrans, blocsize, blocsize, blocsize, -1.0, &(_A)[m][k], N, &(_A)[n][k], N, 1.0, &(*A)[m][n], N ); } } } }

/* Do one run for cholesky */ void doone_exp( int N, int block_count ) { size_t blocsize = N / block_count;

printf("N : %i\n", N); printf("size block: %i\n", blocsize); printf("#blocks : %i\n", block_count); double* A = 0; if (0 != posixmemalign((void*)&A, 4096, N_N*sizeof(double))) { printf("Fatal Error. Cannot allocate matrice A, errno: %i\n", errno); return; }

generate_matrix(A, N);

Cholesky(A, N, blocsize);

free(A); }

/* main entry point / int main(int argc, char* argv) { // matrix dimension int n = 32; if (argc > 1) n = atoi(argv[1]);

// block count int block_count = 2; if (argc > 2) block_count = atoi(argv[2]);

doone_exp( n, block_count );

return 0; }

I compile it with atlas cblas/lapack version:

$> clang cholesky_inplace.c -lcblas -llapack_atlas -fopenmp

It works fine without -fopenmp but crash with it.

Regards, Pierrick

— Reply to this email directly or view it on GitHub https://github.com/clang-omp/clang/issues/26.

pbrunet commented 10 years ago

Alexey, thanks for the modification. It seems to work now.

Best regards, Pierrick

pbrunet commented 10 years ago

Alexey,

Sorry, but I checked the results and they differ between clang with -fopenmp and clang without -fopenmp. I first checked that my pragmas are correct; I didn't see any errors, and gcc 4.9 handles the code correctly. Can you have a look?

Best, Pierrick

PS: the code in my first comment has been updated to fix a data-sharing issue.

alexey-bataev commented 10 years ago

Hi Pierrick, I'll take a look ASAP. Unfortunately I had problems with building the code, but I'll try one more time. I'm just a little bit busy trying to fix code according to your previous requests (I mean AST representation for combined directives).

Best regards,

Alexey Bataev

Software Engineer Intel Compiler Team Intel Corp.

24 Март 2014 г. 13:49:47, pbrunet писал:

Alexey,

Sorry but I checked the result and it differ for clang with -fopenmp and clang without -fopenmp. I first check that my pragma are OK but I didn't see errors and gcc 4.9 succeed with it. Can you have a look?

Best, Pierrick

— Reply to this email directly or view it on GitHub https://github.com/clang-omp/clang/issues/26#issuecomment-38426323.

alexey-bataev commented 10 years ago

Committed 467c95dbd1ef080ff4672d10f164367a52b90339

pbrunet commented 10 years ago

Thanks but I have a segfault now :-s

I run the previous program as:

./a.out 10000 10

and it is always reproducible.

alexey-bataev commented 10 years ago

Hi, could you send me LLVM IR generated from your code for investigation?

Best regards,

Alexey Bataev

Software Engineer Intel Compiler Team Intel Corp.

26 Март 2014 г. 19:39:21, pbrunet писал:

Thanks but I have a segfault now :-s

I use the previous programm as:

./a.out 10000 10

and it is always reproductible.

— Reply to this email directly or view it on GitHub https://github.com/clang-omp/clang/issues/26#issuecomment-38699238.

pbrunet commented 10 years ago

Hi, I think you will not need it, as your last commit fixes the issue. It is great!!

I don't know whether you are working on performance improvements now or whether that will come later, but the current overhead is really significant.

My measurements are not very accurate, but: gcc = 8.5 sec with 4 cores; clang -fopenmp = 33 sec with 4 cores; clang = 27 sec with 1 core.

Whatever, it is a really nice improvement to support these pragma !!

Best, Pierrick

alexey-bataev commented 10 years ago

Hi Pierrick, I know, that currently depend is very-very slow. I'm trying to improve codegen for task with depend clause right now. I hope it will improve performance.

Best regards,

Alexey Bataev

Software Engineer Intel Compiler Team Intel Corp.

28 Март 2014 г. 13:56:37, pbrunet писал:

Hi, I think you will not need it as your last commit fix the issue. It is great !!

I don't know if you are working on performance improvement for now or if it will be for later but current overhead is really important.

I am not really accurate in my measures but: gcc = 8.5 sec with 4 cores clang -fopenmp = 33 sec with 4 cores clang = 27 sec with 1 core

Whatever, it is a really nice improvement to support these pragma !!

Best, Pierrick

— Reply to this email directly or view it on GitHub https://github.com/clang-omp/clang/issues/26#issuecomment-38903058.

alexey-bataev commented 10 years ago

Hi Pierrick, I've just committed 72161c73d85329dfc766330e6cac76bb9875263c which should improve performance of OpenMP code. Try this version.

Best regards,

Alexey Bataev

Software Engineer Intel Compiler Team Intel Corp.

28.03.2014 13:56, pbrunet пишет:

Hi, I think you will not need it as your last commit fix the issue. It is great !!

I don't know if you are working on performance improvement for now or if it will be for later but current overhead is really important.

I am not really accurate in my measures but: gcc = 8.5 sec with 4 cores clang -fopenmp = 33 sec with 4 cores clang = 27 sec with 1 core

Whatever, it is a really nice improvement to support these pragma !!

Best, Pierrick

— Reply to this email directly or view it on GitHub https://github.com/clang-omp/clang/issues/26#issuecomment-38903058.

pbrunet commented 10 years ago

Hi Alexey,

Nice improvement. I only checked on my computer, but it is now comparable with gcc.

Best, Pierrick

alexey-bataev commented 10 years ago

Hi Pierrick, It's good to hear it from you! Thanks for your help!!!

Best regards,

Alexey Bataev

Software Engineer Intel Compiler Team Intel Corp.

7 Апрель 2014 г. 13:32:26, pbrunet писал:

Hi Alexey,

Nice improvement, I only check on my computer but it is comparable with gcc.

Best, Pierrick

— Reply to this email directly or view it on GitHub https://github.com/clang-omp/clang/issues/26#issuecomment-39711147.