Vectorizer generates inefficient code for variable-step loop at -Os

The loop vectorizer can generate very inefficient IR for this code at -Os:

int printf(const char *format, ...);

#define ARR_SIZE 2
#define STEP 1

char arr[ARR_SIZE];

__attribute((noinline))
void test(int step) {
  for (int i = 0; i < ARR_SIZE; i += step) {
    arr[i] = 1;
  }
}

int main() {
  test(STEP);
  for (int i = 0; i < ARR_SIZE; ++i)
    printf("%d, ", arr[i]);
  printf("\n");
}

$ /work/llvm/build/bin/clang --target=aarch64-none-elf -S test.c -emit-llvm -o - -S -emit-llvm -Os
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64-unknown-none-elf"

@arr = dso_local local_unnamed_addr global [2 x i8] zeroinitializer, align 4
@.str = private unnamed_addr constant [5 x i8] c"%d, \00", align 1

; Function Attrs: nofree noinline norecurse nosync nounwind optsize memory(write, argmem: none, inaccessiblemem: none)
define dso_local void @test(i32 noundef %step) local_unnamed_addr #0 {
entry:
  %0 = sext i32 %step to i64
  %smax = tail call i64 @llvm.smax.i64(i64 %0, i64 2)
  %1 = add nsw i64 %smax, -1
  %2 = udiv i64 %1, %0
  %3 = and i64 %2, -16
  %broadcast.splatinsert5 = insertelement <16 x i64> poison, i64 %2, i64 0
  %broadcast.splat6 = shufflevector <16 x i64> %broadcast.splatinsert5, <16 x i64> poison, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %pred.store.continue36, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue36 ]
  %offset.idx = mul i64 %index, %0
  %broadcast.splatinsert = insertelement <16 x i64> poison, i64 %index, i64 0
  %broadcast.splat = shufflevector <16 x i64> %broadcast.splatinsert, <16 x i64> poison, <16 x i32> zeroinitializer
  %vec.iv = or disjoint <16 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %4 = icmp ule <16 x i64> %vec.iv, %broadcast.splat6
  %5 = extractelement <16 x i1> %4, i64 0
  br i1 %5, label %pred.store.if, label %pred.store.continue

pred.store.if:                                    ; preds = %vector.body
  %6 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %offset.idx
  store i8 1, ptr %6, align 1, !tbaa !3
  br label %pred.store.continue

pred.store.continue:                              ; preds = %pred.store.if, %vector.body
  %7 = extractelement <16 x i1> %4, i64 1
  br i1 %7, label %pred.store.if7, label %pred.store.continue8

pred.store.if7:                                   ; preds = %pred.store.continue
  %8 = add i64 %offset.idx, %0
  %9 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %8
  store i8 1, ptr %9, align 1, !tbaa !3
  br label %pred.store.continue8

pred.store.continue8:                             ; preds = %pred.store.if7, %pred.store.continue
  %10 = extractelement <16 x i1> %4, i64 2
  br i1 %10, label %pred.store.if9, label %pred.store.continue10

pred.store.if9:                                   ; preds = %pred.store.continue8
  %offset.idx37 = or disjoint i64 %index, 2
  %11 = mul i64 %offset.idx37, %0
  %12 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %11
  store i8 1, ptr %12, align 1, !tbaa !3
  br label %pred.store.continue10

pred.store.continue10:                            ; preds = %pred.store.if9, %pred.store.continue8
  %13 = extractelement <16 x i1> %4, i64 3
  br i1 %13, label %pred.store.if11, label %pred.store.continue12

pred.store.if11:                                  ; preds = %pred.store.continue10
  %offset.idx38 = or disjoint i64 %index, 3
  %14 = mul i64 %offset.idx38, %0
  %15 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %14
  store i8 1, ptr %15, align 1, !tbaa !3
  br label %pred.store.continue12

pred.store.continue12:                            ; preds = %pred.store.if11, %pred.store.continue10
  %16 = extractelement <16 x i1> %4, i64 4
  br i1 %16, label %pred.store.if13, label %pred.store.continue14

pred.store.if13:                                  ; preds = %pred.store.continue12
  %offset.idx39 = or disjoint i64 %index, 4
  %17 = mul i64 %offset.idx39, %0
  %18 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %17
  store i8 1, ptr %18, align 1, !tbaa !3
  br label %pred.store.continue14

pred.store.continue14:                            ; preds = %pred.store.if13, %pred.store.continue12
  %19 = extractelement <16 x i1> %4, i64 5
  br i1 %19, label %pred.store.if15, label %pred.store.continue16

pred.store.if15:                                  ; preds = %pred.store.continue14
  %offset.idx40 = or disjoint i64 %index, 5
  %20 = mul i64 %offset.idx40, %0
  %21 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %20
  store i8 1, ptr %21, align 1, !tbaa !3
  br label %pred.store.continue16

pred.store.continue16:                            ; preds = %pred.store.if15, %pred.store.continue14
  %22 = extractelement <16 x i1> %4, i64 6
  br i1 %22, label %pred.store.if17, label %pred.store.continue18

pred.store.if17:                                  ; preds = %pred.store.continue16
  %offset.idx41 = or disjoint i64 %index, 6
  %23 = mul i64 %offset.idx41, %0
  %24 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %23
  store i8 1, ptr %24, align 1, !tbaa !3
  br label %pred.store.continue18

pred.store.continue18:                            ; preds = %pred.store.if17, %pred.store.continue16
  %25 = extractelement <16 x i1> %4, i64 7
  br i1 %25, label %pred.store.if19, label %pred.store.continue20

pred.store.if19:                                  ; preds = %pred.store.continue18
  %offset.idx42 = or disjoint i64 %index, 7
  %26 = mul i64 %offset.idx42, %0
  %27 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %26
  store i8 1, ptr %27, align 1, !tbaa !3
  br label %pred.store.continue20

pred.store.continue20:                            ; preds = %pred.store.if19, %pred.store.continue18
  %28 = extractelement <16 x i1> %4, i64 8
  br i1 %28, label %pred.store.if21, label %pred.store.continue22

pred.store.if21:                                  ; preds = %pred.store.continue20
  %offset.idx43 = or disjoint i64 %index, 8
  %29 = mul i64 %offset.idx43, %0
  %30 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %29
  store i8 1, ptr %30, align 1, !tbaa !3
  br label %pred.store.continue22

pred.store.continue22:                            ; preds = %pred.store.if21, %pred.store.continue20
  %31 = extractelement <16 x i1> %4, i64 9
  br i1 %31, label %pred.store.if23, label %pred.store.continue24

pred.store.if23:                                  ; preds = %pred.store.continue22
  %offset.idx44 = or disjoint i64 %index, 9
  %32 = mul i64 %offset.idx44, %0
  %33 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %32
  store i8 1, ptr %33, align 1, !tbaa !3
  br label %pred.store.continue24

pred.store.continue24:                            ; preds = %pred.store.if23, %pred.store.continue22
  %34 = extractelement <16 x i1> %4, i64 10
  br i1 %34, label %pred.store.if25, label %pred.store.continue26

pred.store.if25:                                  ; preds = %pred.store.continue24
  %offset.idx45 = or disjoint i64 %index, 10
  %35 = mul i64 %offset.idx45, %0
  %36 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %35
  store i8 1, ptr %36, align 1, !tbaa !3
  br label %pred.store.continue26

pred.store.continue26:                            ; preds = %pred.store.if25, %pred.store.continue24
  %37 = extractelement <16 x i1> %4, i64 11
  br i1 %37, label %pred.store.if27, label %pred.store.continue28

pred.store.if27:                                  ; preds = %pred.store.continue26
  %offset.idx46 = or disjoint i64 %index, 11
  %38 = mul i64 %offset.idx46, %0
  %39 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %38
  store i8 1, ptr %39, align 1, !tbaa !3
  br label %pred.store.continue28

pred.store.continue28:                            ; preds = %pred.store.if27, %pred.store.continue26
  %40 = extractelement <16 x i1> %4, i64 12
  br i1 %40, label %pred.store.if29, label %pred.store.continue30

pred.store.if29:                                  ; preds = %pred.store.continue28
  %offset.idx47 = or disjoint i64 %index, 12
  %41 = mul i64 %offset.idx47, %0
  %42 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %41
  store i8 1, ptr %42, align 1, !tbaa !3
  br label %pred.store.continue30

pred.store.continue30:                            ; preds = %pred.store.if29, %pred.store.continue28
  %43 = extractelement <16 x i1> %4, i64 13
  br i1 %43, label %pred.store.if31, label %pred.store.continue32

pred.store.if31:                                  ; preds = %pred.store.continue30
  %offset.idx48 = or disjoint i64 %index, 13
  %44 = mul i64 %offset.idx48, %0
  %45 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %44
  store i8 1, ptr %45, align 1, !tbaa !3
  br label %pred.store.continue32

pred.store.continue32:                            ; preds = %pred.store.if31, %pred.store.continue30
  %46 = extractelement <16 x i1> %4, i64 14
  br i1 %46, label %pred.store.if33, label %pred.store.continue34

pred.store.if33:                                  ; preds = %pred.store.continue32
  %offset.idx49 = or disjoint i64 %index, 14
  %47 = mul i64 %offset.idx49, %0
  %48 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %47
  store i8 1, ptr %48, align 1, !tbaa !3
  br label %pred.store.continue34

pred.store.continue34:                            ; preds = %pred.store.if33, %pred.store.continue32
  %49 = extractelement <16 x i1> %4, i64 15
  br i1 %49, label %pred.store.if35, label %pred.store.continue36

pred.store.if35:                                  ; preds = %pred.store.continue34
  %offset.idx50 = or disjoint i64 %index, 15
  %50 = mul i64 %offset.idx50, %0
  %51 = getelementptr inbounds [2 x i8], ptr @arr, i64 0, i64 %50
  store i8 1, ptr %51, align 1, !tbaa !3
  br label %pred.store.continue36

pred.store.continue36:                            ; preds = %pred.store.if35, %pred.store.continue34
  %index.next = add i64 %index, 16
  %52 = icmp eq i64 %index, %3
  br i1 %52, label %for.cond.cleanup, label %vector.body, !llvm.loop !6

for.cond.cleanup:                                 ; preds = %pred.store.continue36
  ret void
}

; Function Attrs: nofree nounwind optsize
define dso_local noundef i32 @main() local_unnamed_addr #1 {
entry:
  tail call void @test(i32 noundef 1) #4
  %0 = load i8, ptr @arr, align 4, !tbaa !3
  %conv = zext i8 %0 to i32
  %call = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str, i32 noundef %conv) #4
  %1 = load i8, ptr getelementptr inbounds (i8, ptr @arr, i64 1), align 1, !tbaa !3
  %conv.c = zext i8 %1 to i32
  %call.c = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str, i32 noundef %conv.c) #4
  %putchar = tail call i32 @putchar(i32 10)
  ret i32 0
}

; Function Attrs: nofree nounwind optsize
declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...) local_unnamed_addr #1

; Function Attrs: nofree nounwind
declare noundef i32 @putchar(i32 noundef) local_unnamed_addr #2

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.smax.i64(i64, i64) #3

attributes #0 = { nofree noinline norecurse nosync nounwind optsize memory(write, argmem: none, inaccessiblemem: none) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a" }
attributes #1 = { nofree nounwind optsize "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+fp-armv8,+neon,+v8a" }
attributes #2 = { nofree nounwind }
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #4 = { optsize }

!llvm.module.flags = !{!0, !1}
!llvm.ident = !{!2}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"frame-pointer", i32 1}
!2 = !{!"clang version 20.0.0git (git@github.com:llvm/llvm-project.git 700a90545d68d7f4e40ccee5de67b2f7c7c85d9e)"}
!3 = !{!4, !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = distinct !{!6, !7, !8, !9}
!7 = !{!"llvm.loop.mustprogress"}
!8 = !{!"llvm.loop.isvectorized", i32 1}
!9 = !{!"llvm.loop.unroll.runtime.disable"}

The induction variable has been vectorized with a VF of 16, but the body of the loop is done with lots of repeated scalar code. The code emitted at -O2 and -O3 is much smaller.

This affects at least --target=aarch64-none-elf, --target=armv8a-none-eabihf and --target=x86_64-unknown-linux-gnu.

This was found by a fuzzer (because a variation on it triggered an unrelated miscompilation in the ARM backend), not real-world code, but I think the code looks "normal" enough to consider this a missed optimisation bug.

llvm / llvm-project

Vectorizer generates inefficient code for variable-step loop at -Os #118100