llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org

Simplification of arguments of `fshl` leads to unoptimized assembly #66185

Open · quic-eikansh opened this issue 1 year ago

quic-eikansh commented 1 year ago

In the code in the Godbolt link (https://godbolt.org/z/7Y7h8e9c1), clang generates four `ldrb` instructions to load `W[0]`, whereas GCC 12.1 generates a single `ldr` for the same test case.

The unoptimized code is caused by the simplification of the arguments of `fshl`. The LLVM IR of the test case is in this Godbolt link (https://godbolt.org/z/P3E8Trqc7). The IR of interest is:

%or15 = tail call i32 @llvm.fshl.i32(i32 %conv9, i32 %or10, i32 25)

If we instead change the first argument of `fshl` to `%or10`, the optimal assembly is generated.
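
For reference, a minimal sketch (not from the original report; the helper name and constants below are illustrative only) of why the two calls are equivalent here: `llvm.fshl.i32(a, b, 25)` computes `(a << 25) | (b >> 7)`, so only the low 7 bits of the first operand are demanded. Because `%or10 = %or7 | %conv9` and the low 8 bits of `%or7` are zero, the low bits of `%or10` and `%conv9` coincide, which is what allows instcombine to narrow the first operand in the first place, at the cost of losing the rotate shape.

#include <assert.h>
#include <stdint.h>

/* Reference model of llvm.fshl.i32: shift the concatenation a:b left by sh
   and keep the most significant 32 bits. */
static uint32_t fshl32(uint32_t a, uint32_t b, unsigned sh) {
    sh &= 31;
    return sh ? (a << sh) | (b >> (32 - sh)) : a;
}

int main(void) {
    uint32_t conv9 = 0xAB;            /* low byte, zero-extended            */
    uint32_t or7   = 0x11223300;      /* the three higher bytes, low 8 == 0 */
    uint32_t or10  = or7 | conv9;     /* the fully assembled word           */
    /* The simplified call and the rotate-shaped call produce the same value,
       but only the second one is recognizable as ROTR(or10, 7). */
    assert(fshl32(conv9, or10, 25) == fshl32(or10, or10, 25));
    return 0;
}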

Test case

#include <stdint.h>

#define GET_DATA(n,b,i)                     \
{                                           \
    (n) = ( (uint32_t) (b)[(i)] << 24 )     \
        | ( (uint32_t) (b)[(i) + 1] << 16 ) \
        | ( (uint32_t) (b)[(i) + 2] << 8 )  \
        | ( (uint32_t) (b)[(i) + 3] );      \
}

int test( unsigned char data[64] )
{
    uint32_t W[64];

    GET_DATA( W[ 0], data, 0 );

#define SHR(x,n) ((x & 0xFFFFFFFF) >> n)
#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))

    return ROTR(W[0], 7);
}

LLVM IR for the test case

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"

define i32 @test(ptr %data) {
entry:
  %0 = load i8, ptr %data
  %conv = zext i8 %0 to i32
  %shl = shl nuw i32 %conv, 24
  %arrayidx1 = getelementptr inbounds i8, ptr %data, i64 1
  %1 = load i8, ptr %arrayidx1
  %conv2 = zext i8 %1 to i32
  %shl3 = shl nuw nsw i32 %conv2, 16
  %or = or i32 %shl3, %shl
  %arrayidx4 = getelementptr inbounds i8, ptr %data, i64 2
  %2 = load i8, ptr %arrayidx4
  %conv5 = zext i8 %2 to i32
  %shl6 = shl nuw nsw i32 %conv5, 8
  %or7 = or i32 %or, %shl6
  %arrayidx8 = getelementptr inbounds i8, ptr %data, i64 3
  %3 = load i8, ptr %arrayidx8
  %conv9 = zext i8 %3 to i32
  %or10 = or i32 %or7, %conv9
  %or15 = tail call i32 @llvm.fshl.i32(i32 %conv9, i32 %or10, i32 25)
  ; Uncomment the line below and comment out the line above to see the difference in the generated assembly
  ;%or15 = tail call i32 @llvm.fshl.i32(i32 %or10, i32 %or10, i32 25)
  ret i32 %or15
}

declare i32 @llvm.fshl.i32(i32, i32, i32)

Assembly generated by armv8-a clang for the test case

test(unsigned char*):                              // @test(unsigned char*)
        ldrb    w8, [x0]
        ldrb    w9, [x0, #1]
        lsl     w8, w8, #24
        orr     w8, w8, w9, lsl #16
        ldrb    w9, [x0, #2]
        orr     w8, w8, w9, lsl #8
        ldrb    w9, [x0, #3]
        orr     w8, w8, w9
        extr    w0, w9, w8, #7
        ret
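
The `extr` at the end is the backend's lowering of the funnel shift with two different operands. Below is a hedged sketch (not part of the report; the helper name and values are made up) of what it computes: `EXTR Wd, Wn, Wm, #lsb` returns 32 bits taken from the concatenation `Wn:Wm` starting at bit `lsb`, i.e. the same value as `llvm.fshl.i32(Wn, Wm, 32 - lsb)`. Because the two operands differ, `%conv9` (here `w9`) must stay live as a separate value, which blocks both the `ror` form and the combining of the four `ldrb` into a single `ldr`.

#include <assert.h>
#include <stdint.h>

/* Model of AArch64 EXTR Wd, Wn, Wm, #lsb: extract 32 bits from hi:lo at bit lsb. */
static uint32_t extr32(uint32_t hi, uint32_t lo, unsigned lsb) {
    return (uint32_t)((((uint64_t)hi << 32) | lo) >> lsb);
}

int main(void) {
    uint32_t w8 = 0x112233AB, w9 = 0xAB;   /* stand-ins for %or10 and %conv9 */
    /* extr w0, w9, w8, #7 yields the same value as llvm.fshl.i32(w9, w8, 25). */
    assert(extr32(w9, w8, 7) == ((w9 << 25) | (w8 >> 7)));
    return 0;
}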

Assembly generated after making both arguments of fshl the same

test:                                   // @test
        ldr     w8, [x0]
        rev     w8, w8
        ror     w0, w8, #7
        ret
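
For context, here is a hedged sketch (not part of the issue; it assumes a little-endian AArch64 target and the GCC/Clang `__builtin_bswap32` builtin, and the function name is hypothetical) of the source-level form this assembly corresponds to: one word load, a byte reverse for the big-endian assembly done by `GET_DATA`, then a rotate.

#include <stdint.h>
#include <string.h>

static uint32_t rotr32(uint32_t x, unsigned n) {
    return (x >> n) | (x << (32 - n));
}

/* Hypothetical load-combined version of test() on little-endian AArch64. */
int test_combined(const unsigned char data[64]) {
    uint32_t w;
    memcpy(&w, data, sizeof w);   /* ldr  w8, [x0]   */
    w = __builtin_bswap32(w);     /* rev  w8, w8     */
    return rotr32(w, 7);          /* ror  w0, w8, #7 */
}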
quic-eikansh commented 1 year ago

I have submitted the pull request https://github.com/llvm/llvm-project/pull/66115 for this issue.

llvmbot commented 1 year ago

@llvm/issue-subscribers-backend-aarch64
