To improve baseline HPCG performance, consider the following change:
Replace this code:
// Now allocate the arrays pointed to
for (local_int_t i=0; i< localNumberOfRows; ++i) {
mtxIndL[i] = new local_int_t[numberOfNonzerosPerRow];
matrixValues[i] = new double[numberOfNonzerosPerRow];
mtxIndG[i] = new global_int_t[numberOfNonzerosPerRow];
}
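With this code (each array becomes a single contiguous allocation, and the per-row pointers simply point into it; because only element 0 of each pointer array now owns memory, make sure the matching cleanup code calls delete [] on element 0 only, not on every row):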
// Now allocate the arrays pointed to
mtxIndL[0] = new local_int_t[localNumberOfRows * numberOfNonzerosPerRow];
matrixValues[0] = new double[localNumberOfRows * numberOfNonzerosPerRow];
mtxIndG[0] = new global_int_t[localNumberOfRows * numberOfNonzerosPerRow];
for (local_int_t i=1; i< localNumberOfRows; ++i) {
mtxIndL[i] = mtxIndL[0] + i * numberOfNonzerosPerRow;
matrixValues[i] = matrixValues[0] + i * numberOfNonzerosPerRow;
mtxIndG[i] = mtxIndG[0] + i * numberOfNonzerosPerRow;
}
To improve baseline HPCG performance, consider the following change:
Replace this code:
// Now allocate the arrays pointed to
for (local_int_t i=0; i< localNumberOfRows; ++i) {
mtxIndL[i] = new local_int_t[numberOfNonzerosPerRow];
matrixValues[i] = new double[numberOfNonzerosPerRow];
mtxIndG[i] = new global_int_t[numberOfNonzerosPerRow];
}
With this code:
// Now allocate the arrays pointed to
mtxIndL[0] = new local_int_t[localNumberOfRows * numberOfNonzerosPerRow];
matrixValues[0] = new double[localNumberOfRows * numberOfNonzerosPerRow];
mtxIndG[0] = new global_int_t[localNumberOfRows * numberOfNonzerosPerRow];
for (local_int_t i=1; i< localNumberOfRows; ++i) {
mtxIndL[i] = mtxIndL[0] + i * numberOfNonzerosPerRow;
matrixValues[i] = matrixValues[0] + i * numberOfNonzerosPerRow;
mtxIndG[i] = mtxIndG[0] + i * numberOfNonzerosPerRow;
}