Poor basic block scheduling/branch optimization

The following C source compiles to sub-optimal machine code:

extern void f(void);

/*
 * Reproducer for the branch-layout comparison in this report.
 *
 * Calls f() when test is 1, 2, 3 or 4 and does nothing for any other
 * value (there is no default label). All four cases have identical
 * bodies; they are written out as separate case labels here, and that
 * exact shape is what was fed to the compilers.
 */
void f2(int test) {
    switch (test) {
    case 1:
        f();
        break;
    case 2:
        f();
        break;
    case 3:
        f();
        break;
    case 4:
        f();
        break;
    }
}

Compiling it with

clang -mriscv=RV64IAMFD -target riscv64 -S -O3 -o invalid-branch.c.64Full.S invalid-branch.c

produces

f2:
  addi  x2, x2, -16
  sd  x1, 8(x2)
  sd  x8, 0(x2)
  add x8, x2, x0
  li  x5, 2
  blt x5, x10, LBB0_3
  li  x6, 1
  beq x10, x6, LBB0_5
  beq x10, x5, LBB0_5
  j LBB0_6
LBB0_3:
  li  x5, 3
  beq x10, x5, LBB0_5
  li  x5, 4
  bne x10, x5, LBB0_6
LBB0_5:
  lui x5, %hi(f)
  addi  x5, x5, %lo(f)
  jalr  x1, x5, 0
LBB0_6:
  add x2, x8, x0
  ld  x8, 0(x2)
  ld  x1, 8(x2)
  addi  x2, x2, 16
  ret

which is not as good as the riscv-gcc output below, most likely due to the implementation of the AnalyzeBranch hook and the associated target-specific code in LLVM.

riscv64-unknown-elf-gcc -O3 -S -o invalid-branch.c.64Full.gcc.S invalid-branch.c

produces

f2:
  li  a5,2
  beq a0,a5,.L7
  ble a0,a5,.L9
  li  a5,3
  beq a0,a5,.L7
  li  a5,4
  bne a0,a5,.L10
.L7:
  tail  f
.L9:
  li  a5,1
  beq a0,a5,.L7
  ret
.L10:
  ret

ucb-bar / esp-llvm

Poor basic block scheduling/branch optimization #28