[x86-64] Avoid usage of multi-uop CMOVBE/CMOVNBE

daniel-zabawa commented 3 weeks ago

The CMOVBE/CMOVNBE instructions generate 2 uops and have a throughput of 1 for P-cores. Other CMOVs are a single uop with a throughput of 2.

The following case shows the backend generating the more expensive CMOVBE/CMOVA instructions:

//  file f.c
int f(int x) {
    if (x < 2)
      return x;
    long long int l = 1;
    long long int u = x;
    do {
      long long int m = (l + u) >> 1;
      if (m*m > x) u=m; else l=m;
    } while (l+1 < u);
    return (int)l;
}

Compiling the above with trunk as clang -O2 -march=core-avx2 -S f.c generates:

f(int):
        mov     eax, edi
        cmp     edi, 2
        jl      .LBB0_3
        mov     ecx, eax
        mov     eax, 1
        mov     rdx, rcx
.LBB0_2:
        lea     rsi, [rdx + rax]
        sar     rsi
        mov     rdi, rsi
        imul    rdi, rsi
        cmp     rdi, rcx
        cmovbe  rax, rsi
        cmova   rdx, rsi
        lea     rsi, [rax + 1]
        cmp     rsi, rdx
        jl      .LBB0_2
.LBB0_3:
        ret

The cmovge and cmovl instructions should be preferred to these where possible.

llvmbot commented 3 weeks ago

@llvm/issue-subscribers-backend-x86

Author: Daniel Zabawa (daniel-zabawa)

The CMOVBE/CMOVNBE instructions generate 2 uops and have a throughput of 1 for P-cores. Other CMOVs are a single uop with a throughput of 2. The following case shows the backend generating the more expensive CMOVBE/CMOVA instructions: ``` // file f.c int f(int x) { if (x < 2) return x; long long int l = 1; long long int u = x; do { long long int m = (l + u) >> 1; if (m*m > x) u=m; else l=m; } while (l+1 < u); return (int)l; } ``` Compiling the above with trunk as `clang -O2 -march=core-avx2 -S f.c` generates: ``` f(int): mov eax, edi cmp edi, 2 jl .LBB0_3 mov ecx, eax mov eax, 1 mov rdx, rcx .LBB0_2: lea rsi, [rdx + rax] sar rsi mov rdi, rsi imul rdi, rsi cmp rdi, rcx cmovbe rax, rsi cmova rdx, rsi lea rsi, [rax + 1] cmp rsi, rdx jl .LBB0_2 .LBB0_3: ret ``` The `cmovge` and `cmovl` instructions should be preferred to these where possible.

RKSimon commented 3 weeks ago

Thanks for the test case - I thought we'd already handled most of these

define i32 @PR113965(i32 noundef %x) {
entry:
  %cmp = icmp slt i32 %x, 2
  br i1 %cmp, label %return, label %if.end

if.end:
  %conv = zext nneg i32 %x to i64
  br label %do.body

do.body:
  %l.0 = phi i64 [ 1, %if.end ], [ %l.0.shr, %do.body ]
  %u.0 = phi i64 [ %conv, %if.end ], [ %shr.u.0, %do.body ]
  %add = add nsw i64 %u.0, %l.0
  %shr = ashr i64 %add, 1
  %mul = mul nsw i64 %shr, %shr
  %cmp2 = icmp samesign ugt i64 %mul, %conv
  %l.0.shr = select i1 %cmp2, i64 %l.0, i64 %shr
  %shr.u.0 = select i1 %cmp2, i64 %shr, i64 %u.0
  %add5 = add nsw i64 %l.0.shr, 1
  %cmp6 = icmp slt i64 %add5, %shr.u.0
  br i1 %cmp6, label %do.body, label %do.end

do.end:
  %conv7 = trunc i64 %l.0.shr to i32
  br label %return

return:
  %retval.0 = phi i32 [ %conv7, %do.end ], [ %x, %entry ]
  ret i32 %retval.0
}

llvm / llvm-project

[x86-64] Avoid usage of multi-uop CMOVBE/CMOVNBE #113965