martinus / robin-hood-hashing

Fast & memory efficient hashtable based on robin hood hashing for C++11/14/17/20
https://gitter.im/martinus/robin-hood-hashing
MIT License
1.5k stars 142 forks source link

func unaligned_load can be optimized #159

Closed wangqiim closed 1 year ago

wangqiim commented 1 year ago

https://github.com/martinus/robin-hood-hashing/blob/fb1483621fda28d4afb31c0097c1a4a457fdd35b/src/include/robin_hood.h#L354-L361

I find template parameter T is only used as uint16_t, uint32_t and uint64_t, so I think unaligned_load<int64_t>((int64_t *)s + i); can be optimezer through *((int64_t *)s + i).

test.cpp

#include <xmmintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <math.h>
#include <stdint.h>
#include <cstring>

template <typename T>
inline T unaligned_load(void const* ptr) noexcept {
  // using memcpy so we don't get into unaligned load problems.
  // compiler should optimize this very well anyways.
  T t;
  std::memcpy(&t, ptr, sizeof(T));
  return t;
}

size_t hash_bytes(void const* ptr, size_t len) noexcept {
  static constexpr uint64_t m = UINT64_C(0xc6a4a7935bd1e995);
  static constexpr uint64_t seed = UINT64_C(0xe17a1465);
  static constexpr unsigned int r = 47;

  auto const* const data64 = static_cast<uint64_t const*>(ptr);
  uint64_t h = seed ^ (len * m);

  size_t const n_blocks = len / 8;
  for (size_t i = 0; i < n_blocks; ++i) {
    auto k = unaligned_load<uint64_t>(data64 + i);

    k *= m;
    k ^= k >> r;
    k *= m;

    h ^= k;
    h *= m;
  }

  auto const* const data8 = reinterpret_cast<uint8_t const*>(data64 + n_blocks);
  switch (len & 7U) {
    case 7:
      h ^= static_cast<uint64_t>(data8[6]) << 48U;
    case 6:
      h ^= static_cast<uint64_t>(data8[5]) << 40U;
    case 5:
      h ^= static_cast<uint64_t>(data8[4]) << 32U;
    case 4:
      h ^= static_cast<uint64_t>(data8[3]) << 24U;
    case 3:
      h ^= static_cast<uint64_t>(data8[2]) << 16U;
    case 2:
      h ^= static_cast<uint64_t>(data8[1]) << 8U;
    case 1:
      h ^= static_cast<uint64_t>(data8[0]);
      h *= m;
    default:
      break;
  }

  h ^= h >> r;

  // not doing the final step here, because this will be done by keyToIdx anyways
  // h *= m;
  // h ^= h >> r;
  return static_cast<size_t>(h);
}

char s[8000000];

int main() {
    int n = 20;
    for (int i = 0; i < 4000000; i++) {
        s[i] = i;
    }
    struct timeval tv1, tv2, tv3;
    gettimeofday(&tv1, NULL);
    int sum = 0;
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < 1000000; i++) {
            sum += unaligned_load<int64_t>((int64_t *)s + i);
        }
    }

    gettimeofday(&tv2, NULL);
    tv2.tv_sec -= tv1.tv_sec;
    tv2.tv_usec -= tv1.tv_usec;
    printf("sum = %d, time cost: %d.%06d, \n", sum, tv2.tv_sec, tv2.tv_usec);

    gettimeofday(&tv1, NULL);
    sum = 0;
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < 1000000; i++) {
            sum += *((int64_t *)s + i);
        }
    }
    gettimeofday(&tv3, NULL);
    tv3.tv_sec -= tv1.tv_sec;
    tv3.tv_usec -= tv1.tv_usec;
    printf("sum = %d, time cost: %d.%06d, \n", sum, tv3.tv_sec, tv3.tv_usec);
}

output

sum = 1583703552, time cost: 0.067667, 
sum = 1583703552, time cost: 0.043813,
martinus commented 1 year ago

I think your benchmark numbers are bogus. It optimizes to the same code: https://godbolt.org/z/bevvonoKf

And the more important issue is that on some architectures it is not allowed to do an unaligned load with something like *(int64_t *)s. That will cause a SEGFAULT.

wangqiim commented 1 year ago

I think your benchmark numbers are bogus. It optimizes to the same code: https://godbolt.org/z/bevvonoKf

And the more important issue is that on some architectures it is not allowed to do an unaligned load with something like *(int64_t *)s. That will cause a SEGFAULT.

yes, you are right. I make a mistake. I ignored the impact of the CPU cache.:joy: In for (int i = 0; i < 4000000; i++) the numbers should be 8000000 instead of 4000000. (warm up cpu cache). Thanks for your code example in the tool (Compiler Explorer) , This tool is awesome.