llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
27.96k stars 11.53k forks source link

Missed Optimization: saturating byte addition #99391

Open bmaurer opened 1 month ago

bmaurer commented 1 month ago

The folly f14 hashtable has an 8-bit counter that "saturates" at 255.

https://github.com/facebook/folly/blob/c8b8d4cac3b7cf049b007fa08e12061c5b239a5e/folly/container/detail/F14Table.h#L572-L582

  void incrOutboundOverflowCount() {
    // Saturating increment: once the 8-bit counter reaches 255 it sticks
    // there instead of wrapping back to 0.
    if (outboundOverflowCount_ != 255) {
      ++outboundOverflowCount_;
    }
  }

  void decrOutboundOverflowCount() {
    // A saturated (255) counter is deliberately never decremented —
    // presumably because once it saturates the true count is no longer
    // known, so the saturation is sticky.
    if (outboundOverflowCount_ != 255) {
      --outboundOverflowCount_;
    }
  }

The optimal code for the incr case is:

        movzx   eax, BYTE PTR [rdi]
        add     al, 1
        sbb     al, 0
        mov     BYTE PTR [rdi], al
// Saturating byte increment written as a select: 255 stays 255, every
// other value is bumped by one.
void add1(uint8_t* x) {
    *x = *x == 255 ? 255 : *x + 1;
}

// Branch-free saturating byte increment via the overflow builtin.
void add2(uint8_t* x) {
    uint8_t res;
    // carry is 1 exactly when *x was 255 (the +1 wrapped res to 0), so
    // res - carry maps 255 -> 0 - 1 -> 255 and every other value to *x + 1.
    uint8_t carry = __builtin_add_overflow(*x, 1, &res) ? 1 : 0;
    *x = res - carry;
}

# clang output 

add1(unsigned char*):                              # @add1(unsigned char*)
        movzx   eax, byte ptr [rdi]
        inc     al
        movzx   eax, al
        mov     ecx, 255
        cmovne  ecx, eax
        mov     byte ptr [rdi], cl
        ret
add2(unsigned char*):                              # @add2(unsigned char*)
        movzx   eax, byte ptr [rdi]
        inc     al
        sete    cl
        sub     al, cl
        mov     byte ptr [rdi], al
        ret

Neither clang nor gcc does this for add1. GCC can do it for add2 but clang cannot. And I'm not sure what the optimal output is for the subtraction case, but all of the compilers end up using multiple registers.


// Decrement that leaves a saturated (255) value untouched.
// Note: *x == 0 wraps to 255 by unsigned arithmetic.
void sub1(uint8_t* x) {
    *x = *x == 255 ? 255 : *x - 1;
}

// Branch-free form of sub1 built on the overflow builtin.
void sub2(uint8_t* x) {
    // xp1 wraps to 0 only when *x was 255.
    uint8_t xp1 = *x + 1;
    uint8_t res;
    // carry is 1 exactly when xp1 was 0, i.e. when *x was 255.
    uint8_t carry = __builtin_sub_overflow(xp1, 1, &res) ? 1 : 0;
    // *x == 255: res = 255, carry = 1 -> 255 - 1 + 1 = 255 (stays saturated).
    // otherwise: res = *x,  carry = 0 -> *x - 1.
    *x = res - 1 + carry;
}

#gcc 
sub1(unsigned char*):
        movzx   eax, BYTE PTR [rdi]
        cmp     al, -1
        setne   dl
        sub     eax, edx
        mov     BYTE PTR [rdi], al
        ret
sub2(unsigned char*):
        movzx   eax, BYTE PTR [rdi]
        inc     al
        sete    dl
        lea     eax, [rax-2+rdx]
        mov     BYTE PTR [rdi], al
        ret
# clang

sub1(unsigned char*):                              # @sub1(unsigned char*)
        movzx   eax, byte ptr [rdi]
        lea     ecx, [rax - 1]
        cmp     al, -1
        movzx   eax, cl
        mov     ecx, 255
        cmovne  ecx, eax
        mov     byte ptr [rdi], cl
        ret
sub2(unsigned char*):                              # @sub2(unsigned char*)
        movzx   eax, byte ptr [rdi]
        cmp     al, -1
        sete    cl
        add     cl, al
        dec     cl
        mov     byte ptr [rdi], cl
        ret

Full repro: https://godbolt.org/z/PqP7vcjsz

#include <cstdint>
#include <cassert>
#include <iostream>

// Saturating byte increment as a select: 255 stays 255, else *x + 1.
void add1(uint8_t* x) {
    *x = *x == 255 ? 255 : *x + 1;
}

// Branch-free saturating byte increment: carry is 1 only when the +1
// wrapped (i.e. *x was 255), so res - carry maps 0 back to 255.
void add2(uint8_t* x) {
    uint8_t res;
    uint8_t carry = __builtin_add_overflow(*x, 1, &res) ? 1 : 0;
    *x = res - carry;
}

// Decrement that leaves a saturated (255) value untouched; 0 wraps to 255.
void sub1(uint8_t* x) {
    *x = *x == 255 ? 255 : *x - 1;
}

// Branch-free equivalent of sub1: carry is 1 exactly when *x was 255
// (xp1 wrapped to 0), so res - 1 + carry keeps 255 saturated and
// decrements everything else.
void sub2(uint8_t* x) {
    uint8_t xp1 = *x + 1;
    uint8_t res;
    uint8_t carry = __builtin_sub_overflow(xp1, 1, &res) ? 1 : 0;
    *x = res - 1 + carry;
}

int main() {
    for (int i = 0; i < 256; i ++) {
        uint8_t a = i, b = i, c = i, d = i;
        add1(&a);
        add2(&b);
        sub1(&c);
        sub2(&d);

        assert(a == b);
        assert(c == d);
        std::cout << (int)a << " " << (int)c << std::endl;
    }
    return 0;
}
bmaurer commented 1 month ago

I believe the optimal output for the subtraction version is:

void sub2(unsigned char *n)
{
    // cmpb computes *n - 255 and sets the carry (borrow) flag iff
    // *n < 255, i.e. iff the byte is not saturated; sbbb then subtracts
    // that borrow, so non-saturated values are decremented and 255 is
    // left alone — two instructions, one register, no branch.
    asm(
        "cmpb $255, %0\n"
        "sbbb $0, %0\n"
        : "+r"(*n)
        :
        : "cc"
    );
}

I couldn't get either compiler close to this.

ParkHanbum commented 1 month ago

I did some tests.

No transform to a saturating operation is done in the middle-end. Ref: https://alive2.llvm.org/ce/z/ApY5rr

Even if we change the binary operation to a saturating one, there is no proper logic in the backend to handle it.

Optimized type-legalized selection DAG: %bb.0 'src:entry'
SelectionDAG has 12 nodes:
  t0: ch,glue = EntryToken 
  t2: i64,ch = CopyFromReg t0, Register:i64 %0
  t5: i8,ch = load<(load (s8) from %ir.x)> t0, t2, undef:i64
  t13: i8,i8 = uaddo t5, Constant:i8<1>
      t9: i8 = select t13:1, Constant:i8<-1>, t13
    t10: ch = store<(store (s8) into %ir.x)> t5:1, t9, t2, undef:i64
  t12: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>
...
Optimized legalized selection DAG: %bb.0 'src:entry'
SelectionDAG has 15 nodes:
  t0: ch,glue = EntryToken
  t2: i64,ch = CopyFromReg t0, Register:i64 %0
  t5: i8,ch = load<(load (s8) from %ir.x)> t0, t2, undef:i64
  t16: i8,i32 = X86ISD::ADD t5, Constant:i8<1>
          t19: i32 = any_extend t16
        t20: i32 = X86ISD::CMOV t19, Constant:i32<255>, TargetConstant:i8<4>, t16:1
      t21: i8 = truncate t20
    t10: ch = store<(store (s8) into %ir.x)> t5:1, t21, t2, undef:i64
  t12: ch = X86ISD::RET_GLUE t10, TargetConstant:i32<0>
Optimized lowered selection DAG: %bb.0 'tgt:entry'

SelectionDAG has 10 nodes:
  t0: ch,glue = EntryToken
  t2: i64,ch = CopyFromReg t0, Register:i64 %0
  t5: i8,ch = load<(load (s8) from %ir.x)> t0, t2, undef:i64
      t7: i8 = uaddsat t5, Constant:i8<1>
    t8: ch = store<(store (s8) into %ir.x)> t5:1, t7, t2, undef:i64
  t10: ch = X86ISD::RET_GLUE t8, TargetConstant:i32<0>

...
Optimized legalized selection DAG: %bb.0 'tgt:entry'
SelectionDAG has 15 nodes:
  t0: ch,glue = EntryToken
  t2: i64,ch = CopyFromReg t0, Register:i64 %0
  t5: i8,ch = load<(load (s8) from %ir.x)> t0, t2, undef:i64 
  t15: i8,i32 = X86ISD::ADD t5, Constant:i8<1>
          t18: i32 = any_extend t15
        t19: i32 = X86ISD::CMOV t18, Constant:i32<255>, TargetConstant:i8<4>, t15:1
      t20: i8 = truncate t19
    t8: ch = store<(store (s8) into %ir.x)> t5:1, t20, t2, undef:i64
  t10: ch = X86ISD::RET_GLUE t8, TargetConstant:i32<0>

My guess is that the saturating form is lost at the truncate conversion, and that is where it needs to be handled.

bmaurer commented 1 month ago

A related missed optimization:

#include <cstdint>

// Saturating increment written with an explicit guard on the input.
void foo(uint8_t& x) {
    if (x != 255) {
        x++;
    }
}

// Equivalent form that tests the wrapped result instead of the input:
// tmp is 0 only when x was 255, so the store is skipped exactly when
// the counter is already saturated.
void foox(uint8_t& x) {
    uint8_t tmp = x + 1;

    if (tmp != 0) {
        x = tmp;
    }
}

Clang misses that it can save an instruction by transforming the first version into the second.

foo(unsigned char&):                               # @foo(unsigned char&)
        movzx   eax, byte ptr [rdi]
        cmp     al, -1
        je      .LBB0_2
        inc     al
        mov     byte ptr [rdi], al
.LBB0_2:
        ret
foox(unsigned char&):                              # @foox(unsigned char&)
        movzx   eax, byte ptr [rdi]
        inc     al
        je      .LBB1_2
        mov     byte ptr [rdi], al
.LBB1_2:
        ret
pinskia commented 1 month ago

Here is another way to write a saturating unsigned add, which is not optimized either:

#include <stdint.h>
// Saturating byte increment written as an explicit select on the
// overflow flag from the builtin.
void add3(uint8_t* x) {
    uint8_t res;
    // carry is 1 exactly when *x was 255 and the +1 wrapped res to 0.
    uint8_t carry = __builtin_add_overflow(*x, 1, &res) ? 1 : 0;
    *x = carry ? 255 : res;
}