vmware-archive / go-pmem-transaction

Golang library for using persistent memory
Other
29 stars 5 forks source link

Implementation of Log2(fast logging) + movnt #38

Closed mohit10verma closed 4 years ago

mohit10verma commented 4 years ago

I'm opening this pull request to showcase the movnt change I made, in case you can point out any issues with it. I don't see much speedup after switching to movnt, although I expected to.

This pull request has 2 changes:

  1. Implementation of Log2() method, which is the faster Logging method for undoTx.
  2. Change Log2() to use movnt.

This pull request does not include any of the other changes we have been discussing recently. Even with movnt alone, we should see speedups.

The assembly was obtained by running objdump on the following C wrappers around Intel's movnt intrinsics:

/*
 * foo - copy 16 bytes from src to dest using a non-temporal store.
 *
 * Loads 128 bits from src with an unaligned load (_mm_loadu_si128) and writes
 * them to dest with _mm_stream_si128. In the disassembly of this function the
 * load appears as movdqu and the store as movntdq.
 *
 * No store fence is issued here.
 * NOTE(review): movntdq is expected to need a 16-byte-aligned dest, and
 * non-temporal stores presumably need an sfence before they can be relied on
 * as ordered/durable -- confirm both against the caller.
 */
void foo(char *dest, char *src) {
  __m128i xmm0 = _mm_loadu_si128((__m128i *)src);
  _mm_stream_si128((__m128i *)dest, xmm0);
}

/*
 * foo1 - copy one long long from src to dest using a non-temporal store.
 *
 * Reads src through a long long pointer and writes the value to dest with
 * _mm_stream_si64. In the disassembly of this function the store appears as
 * movnti with a 64-bit register operand.
 *
 * No store fence is issued here.
 * NOTE(review): both pointers are char * cast to long long *; this assumes the
 * buffers are suitably aligned and that the type-punned read is acceptable to
 * the compiler in use -- confirm.
 */
void foo1(char *dest, char* src) {
  _mm_stream_si64((long long *)dest, *(long long *)src);
}

/*
 * foo2 - copy one int from src to dest using a non-temporal store.
 *
 * Reads src through an int pointer and writes the value to dest with
 * _mm_stream_si32. In the disassembly of this function the store appears as
 * movnti with a 32-bit register operand.
 *
 * No store fence is issued here.
 * NOTE(review): both pointers are char * cast to int *; this assumes the
 * buffers are suitably aligned and that the type-punned read is acceptable to
 * the compiler in use -- confirm.
 */
void foo2(char *dest, char* src) {
  _mm_stream_si32((int*)dest, *(int*)src);
}

The objdump output for these functions is:

0000000000000000 <foo>:
   0:   55                      push   %rbp
   1:   48 89 e5                mov    %rsp,%rbp
   4:   48 89 7d c8             mov    %rdi,-0x38(%rbp)
   8:   48 89 75 c0             mov    %rsi,-0x40(%rbp)
   c:   48 8b 45 c0             mov    -0x40(%rbp),%rax
  10:   48 89 45 d8             mov    %rax,-0x28(%rbp)
  14:   48 8b 45 d8             mov    -0x28(%rbp),%rax
  18:   f3 0f 6f 00             movdqu (%rax),%xmm0
  1c:   0f 29 45 e0             movaps %xmm0,-0x20(%rbp)
  20:   48 8b 45 c8             mov    -0x38(%rbp),%rax
  24:   48 89 45 d0             mov    %rax,-0x30(%rbp)
  28:   66 0f 6f 45 e0          movdqa -0x20(%rbp),%xmm0
  2d:   0f 29 45 f0             movaps %xmm0,-0x10(%rbp)
  31:   48 8b 45 d0             mov    -0x30(%rbp),%rax
  35:   66 0f 6f 45 f0          movdqa -0x10(%rbp),%xmm0
  3a:   66 0f e7 00             movntdq %xmm0,(%rax)
  3e:   90                      nop
  3f:   5d                      pop    %rbp
  40:   c3                      retq

0000000000000041 <foo1>:
  41:   55                      push   %rbp
  42:   48 89 e5                mov    %rsp,%rbp
  45:   48 89 7d e8             mov    %rdi,-0x18(%rbp)
  49:   48 89 75 e0             mov    %rsi,-0x20(%rbp)
  4d:   48 8b 45 e0             mov    -0x20(%rbp),%rax
  51:   48 8b 00                mov    (%rax),%rax
  54:   48 8b 55 e8             mov    -0x18(%rbp),%rdx
  58:   48 89 55 f0             mov    %rdx,-0x10(%rbp)
  5c:   48 89 45 f8             mov    %rax,-0x8(%rbp)
  60:   48 8b 45 f0             mov    -0x10(%rbp),%rax
  64:   48 8b 55 f8             mov    -0x8(%rbp),%rdx
  68:   48 0f c3 10             movnti %rdx,(%rax)
  6c:   90                      nop
  6d:   5d                      pop    %rbp
  6e:   c3                      retq

000000000000006f <foo2>:
  6f:   55                      push   %rbp
  70:   48 89 e5                mov    %rsp,%rbp
  73:   48 89 7d e8             mov    %rdi,-0x18(%rbp)
  77:   48 89 75 e0             mov    %rsi,-0x20(%rbp)
  7b:   48 8b 45 e0             mov    -0x20(%rbp),%rax
  7f:   8b 00                   mov    (%rax),%eax
  81:   48 8b 55 e8             mov    -0x18(%rbp),%rdx
  85:   48 89 55 f8             mov    %rdx,-0x8(%rbp)
  89:   89 45 f4                mov    %eax,-0xc(%rbp)
  8c:   48 8b 45 f8             mov    -0x8(%rbp),%rax
  90:   8b 55 f4                mov    -0xc(%rbp),%edx
  93:   0f c3 10                movnti %edx,(%rax)
  96:   90                      nop
  97:   5d                      pop    %rbp
  98:   c3                      retq