At this moment NCC has support for vectorization/inlining/parallelism reporting, but due to the output format used it's not really helpful in practice.
Please consider this code:
// Rank-4 broadcast kernel: applies OpType::op element-wise over the output
// described by shapeZ, reading x and y through explicit per-axis strides so
// either operand can be broadcast (stride 0 on an axis repeats the value).
// Returns the wall-clock duration of the loop nest in microseconds.
//
// NOTE(review): yStrd1 is hard-coded to 0 — stridesY[1] is never read, so y is
// always broadcast along axis 1. Presumably intentional for this benchmark
// shape, but confirm it is not a typo for stridesY[1].
template <typename T, typename OpType>
int64_t omp(T *x, uint32_t *shapeX, uint32_t *stridesX,
            T *y, uint32_t *shapeY, uint32_t *stridesY,
            T *z, uint32_t *shapeZ, uint32_t *stridesZ) {
    const auto t0 = std::chrono::system_clock::now();

    // Hoist strides and extents into locals so the compiler treats them as
    // loop-invariant scalars.
    const auto xs0 = stridesX[0], xs1 = stridesX[1], xs2 = stridesX[2], xs3 = stridesX[3];
    const auto ys0 = stridesY[0];
    const auto ys1 = 0;  // axis-1 stride of y deliberately ignored (broadcast) — see NOTE above
    const auto ys2 = stridesY[2], ys3 = stridesY[3];
    const auto zs0 = stridesZ[0], zs1 = stridesZ[1], zs2 = stridesZ[2], zs3 = stridesZ[3];
    const auto n0 = shapeZ[0], n1 = shapeZ[1], n2 = shapeZ[2], n3 = shapeZ[3];

#pragma omp parallel for collapse(3)
    for (uint32_t j0 = 0; j0 < n0; ++j0) {
        for (uint32_t j1 = 0; j1 < n1; ++j1) {
            for (uint32_t j2 = 0; j2 < n2; ++j2) {
                // Row base pointers for the innermost axis.
                auto xRow = x + j0 * xs0 + j1 * xs1 + j2 * xs2;
                auto yRow = y + j0 * ys0 + j1 * ys1 + j2 * ys2;
                auto zRow = z + j0 * zs0 + j1 * zs1 + j2 * zs2;
                // Dispatch on the innermost strides so the common unit-stride
                // cases compile to plain, vectorizable loops.
                if (zs3 == 1 && xs3 == 1 && ys3 == 1) {
                    for (uint32_t j3 = 0; j3 < n3; ++j3)
                        zRow[j3] = OpType::op(xRow[j3], yRow[j3]);
                } else if (zs3 == 1 && xs3 == 1 && ys3 == 0) {
                    for (uint32_t j3 = 0; j3 < n3; ++j3)
                        zRow[j3] = OpType::op(xRow[j3], *yRow);
                } else if (zs3 == 1 && xs3 == 0 && ys3 == 1) {
                    for (uint32_t j3 = 0; j3 < n3; ++j3)
                        zRow[j3] = OpType::op(*xRow, yRow[j3]);
                } else {
                    // Fully strided fallback.
                    for (uint32_t j3 = 0; j3 < n3; ++j3)
                        zRow[j3 * zs3] = OpType::op(xRow[j3 * xs3], yRow[j3 * ys3]);
                }
            }
        }
    }

    const auto t1 = std::chrono::system_clock::now();
    return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}
Compiler output will generally look like this:
nc++: vec( 102): /home/raver119/develop/deeplearning4j/libnd4j/include/loops/cpu/compilation_units/../broadcasting.hpp, line 638: Partially vectorized loop.
nc++: vec( 113): /home/raver119/develop/deeplearning4j/libnd4j/include/loops/cpu/compilation_units/../broadcasting.hpp, line 638: Overhead of loop division is too large.
nc++: vec( 118): /home/raver119/develop/deeplearning4j/libnd4j/include/loops/cpu/compilation_units/../broadcasting.hpp, line 638: Unvectorizable data type.
nc++: vec( 103): /home/raver119/develop/deeplearning4j/libnd4j/include/loops/cpu/compilation_units/../broadcasting.hpp, line 675: Vectorized loop.
But since the code relies on templates, the diagnostics cannot tell you which template instantiation each message refers to — i.e. which instantiations were vectorized and which were not — thus making it REALLY hard to improve performance.
So, it would be really great to see function template arguments included in the vectorizer/inliner/parallelizer output.
At this moment NCC has support for vectorization/inlining/parallelism reporting. But due to the format used it's not really helpful.
Please consider this code:
Compiler output will generally look like this:
But since the code relies on templates, it's impossible to say what was vectorized, what was not vectorized, thus making it REALLY hard to improve performance.
So, it would be really great to see function template arguments included in vectorizer/inliner/parallelizer output