Intel compilers behave differently when optimizing the same code (obvious), but the speed difference can be significantly large to notice.
gcc speed vs icc speed, below, after parallelizing the outer loop (main) and applying pragma optimizations to the inner loop (build_hist). Look at NUMA_tune4 (gcc) and NUMA_tune4_icc (icc). The efficiency chart is interesting; I ran the benchmark 5x5 times to make sure it was not a sampling mistake — it is not.
Average speed:
Average speed, zoom:
Average efficiency:
Average inverted efficiency:
Code for outer loop:
// Outer loop: each instance set is independent, so sets are distributed
// across threads; schedule(dynamic) balances uneven per-set work.
// NOTE(review): ivdep placed before the omp pragma may not reach the loop
// on all compilers — confirm it is honored here.
#pragma ivdep
#pragma omp parallel for num_threads(nthread) schedule(dynamic)
for (int i = 0; i < num_instance_set; ++i) {
// Each iteration writes only its own histogram[i]; BuildHist internally
// partitions its scratch buffer by thread id.
hist_builder.BuildHist(gpair, instance_set[i], gmat, &histogram[i]);
}
Code for inner loop:
#include <dmlc/omp.h>
#include <perflab/build_hist.h>
#include <omp.h>
#include <vector>
namespace perflab {

/*!
 * \brief Accumulate the gradient histogram for one subset of training rows.
 *
 * Designed to be called concurrently from an enclosing OpenMP parallel
 * region: each calling thread accumulates into its private slice of the
 * shared scratch buffer data_, i.e. [tid * nbin_, (tid + 1) * nbin_),
 * and finally reduces that slice into *hist. The row-gathering loops are
 * manually unrolled by kUnroll to expose independent loads to the
 * vectorizer (ivdep hints).
 *
 * \param gpair        gradient statistics, one entry per training row
 * \param instance_set row ids whose feature bins are accumulated
 * \param gmat         CSR-like index matrix: row_ptr[r]..row_ptr[r+1]
 *                     delimits row r's span in gmat.index
 * \param hist         output histogram (nbin_ entries); this thread's
 *                     partial sums are Add()-ed into it
 */
void GHistBuilder::BuildHist(const std::vector<GradientPair>& gpair,
                             const std::vector<size_t>& instance_set,
                             const GHistIndexMatrix& gmat,
                             std::vector<GHistEntry>* hist) {
  constexpr int kUnroll = 8;  // manual loop-unrolling factor

  // Per-thread partition of data_: thread tid owns [off, off + nbin_).
  const dmlc::omp_uint tid = omp_get_thread_num();
  const size_t off = tid * nbin_;

  // BUG FIX: reset only THIS thread's slice. The previous code zeroed all
  // of data_ from every thread concurrently — a data race that wiped
  // partitions other threads were still accumulating into.
  // (A plain loop is kept on purpose: '#pragma omp simd' was measured to
  // slow this initialization down.)
  for (size_t i = off; i < off + nbin_; i++) {
    data_[i] = GHistEntry();
  }

  const size_t nrows = instance_set.size();
  const size_t rest = nrows % kUnroll;
  const size_t unrolled = nrows - rest;

  size_t rid[kUnroll];         // row ids gathered for one unroll group
  size_t ibegin[kUnroll];      // per-row begin offsets into gmat.index
  size_t iend[kUnroll];        // per-row end offsets into gmat.index
  GradientPair stat[kUnroll];  // per-row gradient statistics

  // size_t index avoids the previous signed/unsigned comparison between
  // dmlc::omp_uint and size_t.
  for (size_t i = 0; i < unrolled; i += kUnroll) {
#if defined(__INTEL_COMPILER)
#  pragma ivdep
#elif defined(__GNUG__)
#  pragma GCC ivdep
#endif
    for (int k = 0; k < kUnroll; ++k) {
      rid[k] = instance_set[i + k];
    }
#if defined(__INTEL_COMPILER)
#  pragma ivdep
#elif defined(__GNUG__)
#  pragma GCC ivdep
#endif
    for (int k = 0; k < kUnroll; ++k) {
      ibegin[k] = gmat.row_ptr[rid[k]];
      iend[k] = gmat.row_ptr[rid[k] + 1];
    }
#if defined(__INTEL_COMPILER)
#  pragma ivdep
#elif defined(__GNUG__)
#  pragma GCC ivdep
#endif
    for (int k = 0; k < kUnroll; ++k) {
      stat[k] = gpair[rid[k]];
    }
#if defined(__INTEL_COMPILER)
#  pragma ivdep
#  pragma unroll
#elif defined(__GNUG__)
#  pragma GCC ivdep
// FIX: gcc does not recognize plain '#pragma unroll'; its spelling is
// '#pragma GCC unroll N' (gcc >= 8).
#  pragma GCC unroll 8
#endif
    for (int k = 0; k < kUnroll; ++k) {
      // Loop-carried dependency: two j iterations may hit the same bin,
      // so this scalar accumulation cannot be blindly vectorized.
      for (size_t j = ibegin[k]; j < iend[k]; ++j) {
        const uint32_t bin = gmat.index[j];
        data_[off + bin].Add(stat[k]);
      }
    }
  }

  // Remainder rows that do not fill a complete unroll group.
  for (size_t i = unrolled; i < nrows; ++i) {
    const size_t rid = instance_set[i];
    const size_t ibegin = gmat.row_ptr[rid];
    const size_t iend = gmat.row_ptr[rid + 1];
    const GradientPair stat = gpair[rid];
    for (size_t j = ibegin; j < iend; ++j) {
      const uint32_t bin = gmat.index[j];
      // BUG FIX: the unrolled path writes data_[off + bin]; this tail
      // previously wrote data_[bin], clobbering thread 0's partition and
      // losing the tail rows from this thread's own slice.
      data_[off + bin].Add(stat);
    }
  }

  /* reduction: fold this thread's private slice into the caller's hist */
  const uint32_t nbin = nbin_;
#pragma omp simd  // stronger than ivdep, so no ivdep necessary here
  for (dmlc::omp_uint bin_id = 0; bin_id < dmlc::omp_uint(nbin); ++bin_id) {
    (*hist)[bin_id].Add(data_[off + bin_id]);
  }
}

}  // namespace perflab
Intel compilers behave differently when optimizing the same code (obvious), but the speed difference can be significantly large to notice.
gcc speed vs icc speed, below, after parallelizing the outer loop (main) and applying pragma optimizations to the inner loop (build_hist). Look at NUMA_tune4 (gcc) and NUMA_tune4_icc (icc). The efficiency chart is interesting; I ran the benchmark 5x5 times to make sure it was not a sampling mistake — it is not.
Average speed:
Average speed, zoom:
Average efficiency:
Average inverted efficiency:
Code for outer loop:
Code for inner loop:
Environment variables: