
cmd/compile: tight code optimization opportunities #47120

Open · rsc opened this issue 3 years ago

rsc commented 3 years ago

The generated x86 code can be improved in some fairly simple ways - hoisting computed constants out of loop bodies, and avoiding unnecessary register moves - that have a significant performance impact on tight loops. In the following example those improvements produce a 35% speedup.

Here is an alternate, DFA-based implementation of utf8.Valid that I have been playing with:

func Valid(x []byte) bool {
    state := uint64(1 * 6) // start state; every state is a multiple of 6
    for _, b := range x {
        state = dfa[b] >> (state & 63) // next state is the 6-bit field at offset 'state'
    }
    return (state & 63) == 1*6 // accept only if we end back in the start state
}

const (
    s00 = 0 | (1*6)<<(1*6)
    sC0 = 0
    sC2 = 0 | (2*6)<<(1*6)
    sE0 = 0 | (3*6)<<(1*6)
    sE1 = 0 | (4*6)<<(1*6)
    sED = 0 | (5*6)<<(1*6)
    sEE = sE1
    sF0 = 0 | (6*6)<<(1*6)
    sF1 = 0 | (7*6)<<(1*6)
    sF4 = 0 | (8*6)<<(1*6)
    sF5 = 0

    s80 = 0 | (1*6)<<(2*6) | (2*6)<<(4*6) | (4*6)<<(5*6) | (4*6)<<(7*6) | (4*6)<<(8*6)
    s90 = 0 | (1*6)<<(2*6) | (2*6)<<(4*6) | (4*6)<<(5*6) | (4*6)<<(6*6) | (4*6)<<(7*6)
    sA0 = 0 | (1*6)<<(2*6) | (2*6)<<(3*6) | (2*6)<<(4*6) | (4*6)<<(6*6) | (4*6)<<(7*6)
)

var dfa = [256]uint64{
    s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
    s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
    s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
    s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
    s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
    s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
    s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
    s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
    s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80,
    s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90,
    sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0,
    sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0,
    sC0, sC0, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2,
    sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2,
    sE0, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sED, sEE, sEE,
    sF0, sF1, sF1, sF1, sF4, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5,
}
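
The encoding: every DFA state is a multiple of 6 used directly as a shift count, and each entry dfa[b] packs, for every current state, the 6-bit next state at the bit offset equal to that state. State 1*6 is both the start and the accept state; 0 is the dead (reject) state. As a quick sanity check, a test along these lines could sit next to the benchmarks below (a sketch only, assuming it lives in the package's _test.go file; TestValidDFA is just an illustrative name):

func TestValidDFA(t *testing.T) {
    cases := []struct {
        in   string
        want bool
    }{
        {"", true},
        {"abc", true},
        {"日本語", true},
        {"\xff", false},         // invalid lead byte
        {"\xe2\x28\xa1", false}, // broken 3-byte sequence
        {"\xc2", false},         // truncated 2-byte sequence
    }
    for _, c := range cases {
        if got := Valid([]byte(c.in)); got != c.want {
            t.Errorf("Valid(%q) = %v, want %v", c.in, got, c.want)
        }
    }
}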

There are no big benchmarks of Valid in the package, but here are some that could be added:

var valid1k = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
var valid1M = bytes.Repeat(valid1k, 1024)

func BenchmarkValid1K(b *testing.B) {
    b.SetBytes(int64(len(valid1k)))
    for i := 0; i < b.N; i++ {
        Valid(valid1k)
    }
}

func BenchmarkValid1M(b *testing.B) {
    b.SetBytes(int64(len(valid1M)))
    for i := 0; i < b.N; i++ {
        Valid(valid1M)
    }
}

The old Valid implementation runs at around 1450 MB/s. The implementation above runs at around 1600 MB/s. Better but not what I had hoped. It compiles as follows:

% go test -c && go tool objdump -s 'utf8.Valid$' utf8.test
TEXT unicode/utf8.Valid(SB) /Users/rsc/go/src/unicode/utf8/utf8.go
  utf8.go:647       0x10789c0       4889442408      MOVQ AX, 0x8(SP)
  utf8.go:649       0x10789c5       31c9            XORL CX, CX
  utf8.go:649       0x10789c7       ba06000000      MOVL $0x6, DX
  utf8.go:649       0x10789cc       eb1f            JMP 0x10789ed
  utf8.go:649       0x10789ce       0fb63408        MOVZX 0(AX)(CX*1), SI
  utf8.go:649       0x10789d2       488d7901        LEAQ 0x1(CX), DI
  utf8.go:650       0x10789d6       4c8d0543ef1600      LEAQ unicode/utf8.dfa(SB), R8
  utf8.go:650       0x10789dd       498b34f0        MOVQ 0(R8)(SI*8), SI
  utf8.go:650       0x10789e1       4889d1          MOVQ DX, CX
  utf8.go:650       0x10789e4       48d3ee          SHRQ CL, SI
  utf8.go:649       0x10789e7       4889f9          MOVQ DI, CX
  utf8.go:652       0x10789ea       4889f2          MOVQ SI, DX
  utf8.go:649       0x10789ed       4839cb          CMPQ CX, BX
  utf8.go:649       0x10789f0       7fdc            JG 0x10789ce
  utf8.go:652       0x10789f2       4883e23f        ANDQ $0x3f, DX
  utf8.go:652       0x10789f6       4883fa06        CMPQ $0x6, DX
  utf8.go:652       0x10789fa       0f94c0          SETE AL
  utf8.go:652       0x10789fd       c3          RET
  :-1           0x10789fe       cc          INT $0x3
  :-1           0x10789ff       cc          INT $0x3
%

Translating this to proper non-regabi assembly, I get:

#include "textflag.h"

TEXT ·Valid(SB),NOSPLIT,$0
    MOVQ    x_base+0(FP),AX // p = &x[0]
    MOVQ    x_len+8(FP),BX // n = len(x)
    XORL    CX, CX // i = 0
    MOVL    $6, DX
    JMP loop

body:
    MOVBLZX (AX)(CX*1), SI // b = p[i]
    LEAQ    1(CX), DI // j = i+1
    LEAQ    ·dfa(SB), R8
    MOVQ    (R8)(SI*8), SI // t = dfa[b]
    MOVQ    DX, CX // CX = state
    SHRQ    CX, SI // t >>= state&63
    MOVQ    DI, CX // i = j
    MOVQ    SI, DX // state = t

loop:
    CMPQ    CX, BX
    JLT body

    ANDL    $0x3f, DX
    CMPL    DX, $6
    SETEQ   AL
    MOVB    AX, ret+24(FP)
    RET

This also runs at about 1600 MB/s.

First optimization: the LEAQ ·dfa(SB), R8 should be hoisted out of the loop body. (I tried to do this in the Go version with dfa := &dfa, but it got constant-propagated away!)

That change brings it up to 1750 MB/s.
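
For reference, the source-level attempt mentioned in the parenthetical looks roughly like this (a sketch only; validHoisted is a hypothetical name, and as noted the compiler currently constant-propagates the pointer back into the loop, so it does not produce the hoist):

func validHoisted(x []byte) bool {
    state := uint64(1 * 6)
    dfa := &dfa // hope: materialize the table address once, before the loop
    for _, b := range x {
        state = dfa[b] >> (state & 63)
    }
    return (state & 63) == 1*6
}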

Second optimization: use DI for i instead of CX, to avoid the pressure on CX. This lets the LEAQ 1(CX), DI and the later MOVQ DI, CX collapse to just LEAQ 1(DI), DI.

That change brings it up to 1900 MB/s.

The body is now:

body:
    MOVBLZX (AX)(DI*1), SI // b = p[i]
    LEAQ    1(DI), DI // i++
    MOVQ    (R8)(SI*8), SI // t = dfa[b]
    MOVQ    DX, CX // CX = state
    SHRQ    CX, SI // t >>= state&63
    MOVQ    SI, DX // state = t

Third optimization: since DX is copied into CX anyway, do that move one instruction earlier; the table load can then write into DX directly, eliminating the final MOVQ:

body:
    MOVBLZX (AX)(DI*1), SI // b = p[i]
    LEAQ    1(DI), DI // i++
    MOVQ    DX, CX // CX = state
    MOVQ    (R8)(SI*8), DX // state = dfa[b]
    SHRQ    CX, DX // state >>= CX&63

I think this ends up being just "compute the shift amount before the shifted value". That change brings it up to 2150 MB/s.

This is still a direct translation of the Go code: there are no tricks the compiler couldn't do. For this particular loop, the optimizations make the code run 35% faster.

Final assembly:

#include "textflag.h"

TEXT ·Valid(SB),NOSPLIT,$0
    MOVQ    x_base+0(FP),AX // p = &x[0]
    MOVQ    x_len+8(FP),BX // n = len(x)
    XORL    DI, DI // i = 0
    MOVL    $6, DX
    LEAQ    ·dfa(SB), R8
    JMP loop

body:
    MOVBLZX (AX)(DI*1), SI // b = p[i]
    LEAQ    1(DI), DI // i++
    MOVQ    DX, CX // CX = state
    MOVQ    (R8)(SI*8), DX // t = dfa[b]
    SHRQ    CX, DX // t >>= state&63

loop:
    CMPQ    DI, BX
    JLT body

    ANDL    $0x3f, DX
    CMPL    DX, $6
    SETEQ   AL
    MOVB    AX, ret+24(FP)
    RET
rsc commented 3 years ago

Here is another, separate opportunity for GOAMD64=v3 compilation. The SHRXQ instruction takes an explicit shift-count register, has separate source and destination operands, and can read its source from memory. That allows reducing the loop body to:

body:
    MOVBLZX (AX)(DI*1), SI // b = p[i]
    LEAQ    1(DI), DI // i++
    SHRXQ   DX, (R8)(SI*8), DX // state = dfa[b] >> (state&63)

That change runs at 3400 MB/s (!).

(The DFA tables were carefully constructed exactly to enable this implementation.)
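
For anyone reproducing the numbers: the microarchitecture level is chosen at build time, so with a toolchain that supports the GOAMD64 environment variable (and assuming the benchmarks above have been added to the package), this can be rerun as, for example:

% GOAMD64=v3 go test -bench=Valid unicode/utf8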

zchee commented 3 years ago

@rsc Sorry for hijacking the thread, but what does GOAMD64=v3 mean?

ALTree commented 3 years ago

@zchee see https://github.com/golang/go/issues/45453 and https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels.

laboger commented 2 years ago

I see this hasn't had attention for a while, but it is a problem I've noticed in ppc64 code too: invariant values are not moved out of loops. I thought at one time there was work under way to do this, but it must have been abandoned.

Here is one example:

   ff0a0:       00 00 e3 8b     lbz     r31,0(r3)  <--- nil check is not needed on each iteration
   ff0a4:       24 1f c7 78     rldicr  r7,r6,3,60  \
   ff0a8:       14 3a 03 7d     add     r8,r3,r7   /  These two could be strength-reduced?
   ff0ac:       00 00 28 e9     ld      r9,0(r8)
   ff0b0:       2a 20 47 7d     ldx     r10,r7,r4
   ff0b4:       2a 28 67 7d     ldx     r11,r7,r5
   ff0b8:       78 52 6a 7d     xor     r10,r11,r10
   ff0bc:       b0 00 61 39     addi    r11,r1,176  <---- this is invariant in the loop
   ff0c0:       2a 58 e7 7c     ldx     r7,r7,r11
   ff0c4:       78 52 e7 7c     xor     r7,r7,r10
   ff0c8:       78 3a 27 7d     xor     r7,r9,r7
   ff0cc:       00 00 e8 f8     std     r7,0(r8)
   ff0d0:       01 00 c6 38     addi    r6,r6,1
   ff0d4:       80 00 26 2c     cmpdi   r6,128
   ff0d8:       c8 ff 80 41     blt     ff0a0 <golang.org/x/crypto/argon2.processBlockGeneric+0x3a0>
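
The Go loop shape behind this listing is roughly the following (a simplified sketch of the pattern, not the actual argon2 source; processBlock is an illustrative name). The buffer t lives on the stack, so its address is loop-invariant (presumably what the addi r11,r1,176 above computes), and the pointer nil check (the repeated lbz) is likewise invariant, yet both are re-executed on every iteration:

type block [128]uint64

func processBlock(out, in1, in2 *block) {
    var t block
    for i := range t {
        t[i] = in1[i] ^ in2[i]
    }
    for i := range t {
        // The nil check on the block pointer and the computation of &t
        // are loop-invariant but currently repeat on every iteration.
        out[i] ^= in1[i] ^ in2[i] ^ t[i]
    }
}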
gopherbot commented 2 years ago

Change https://go.dev/cl/385174 mentions this issue: cmd/compile: use shlx&shrx instruction for GOAMD64>=v3