[RISCV][clang] Failed to vectorize a[i] + b[CONSTANT] when a and b both in global struct

Fros1er commented 3 weeks ago

Found this in tsvc benchmark. It's quite diifferent from original code after I removed all unrelated parts.

#define LEN 32000

struct GlobalData {
    float a[LEN];
    float b[LEN];
} global_data;

int s122(int n1) {
    for (int i = n1; i < LEN; i++) {
        global_data.a[i] += global_data.b[0];
    }
}

clang -Ofast -march=rv64gcv:

s122:
        lui     a2, 8
        addiw   a1, a2, -769
        blt     a1, a0, .LBB0_3
        slli    a1, a0, 2
.Lpcrel_hi0:
        auipc   a3, %pcrel_hi(global_data)
        addi    a4, a3, %pcrel_lo(.Lpcrel_hi0)
        add     a1, a1, a4
        addiw   a3, a0, 1
        lui     a0, 31
        add     a0, a0, a4
        addiw   a2, a2, -768
.LBB0_2:
        flw     fa5, 1024(a0)
        flw     fa4, 0(a1)
        mv      a4, a3
        fadd.s  fa5, fa4, fa5
        fsw     fa5, 0(a1)
        addi    a1, a1, 4
        addiw   a3, a3, 1
        bne     a4, a2, .LBB0_2
.LBB0_3:
        li      a0, 0
        ret

RVV code will generate if we change:

global_data.a[i] += global_data.b[0]; to global_data.a[i] += global_data.b[i];
take a and b out of struct, make them global
int i = n1 to int i = 0

llvmbot commented 3 weeks ago

@llvm/issue-subscribers-backend-risc-v

Author: Froster (Fros1er)

``` c #define LEN 32000 struct GlobalData { float a[LEN]; float b[LEN]; } global_data; int s122(int n1) { for (int i = n1; i < LEN; i++) { global_data.a[i] += global_data.b[0]; } } ``` `clang -Ofast -march=rv64gcv`: ``` asm s122: lui a2, 8 addiw a1, a2, -769 blt a1, a0, .LBB0_3 slli a1, a0, 2 .Lpcrel_hi0: auipc a3, %pcrel_hi(global_data) addi a4, a3, %pcrel_lo(.Lpcrel_hi0) add a1, a1, a4 addiw a3, a0, 1 lui a0, 31 add a0, a0, a4 addiw a2, a2, -768 .LBB0_2: flw fa5, 1024(a0) flw fa4, 0(a1) mv a4, a3 fadd.s fa5, fa4, fa5 fsw fa5, 0(a1) addi a1, a1, 4 addiw a3, a3, 1 bne a4, a2, .LBB0_2 .LBB0_3: li a0, 0 ret ``` RVV code will generate if we change: - `global_data.a[i] += global_data.b[0];` to `global_data.a[i] += global_data.b[i];` - take a and b out of struct, make them global - `int i = n1` to `int i = 0`

Fros1er commented 3 weeks ago

Code above with clang -Ofast -march=rv64gcv -S -emit-llvm:

llvm ir

``` llvm %struct.GlobalData = type { [32000 x float], [32000 x float] } @global_data = dso_local local_unnamed_addr global %struct.GlobalData zeroinitializer, align 4, !dbg !0 @a = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 4, !dbg !5 @b = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 4, !dbg !12 define dso_local signext i32 @s122(i32 noundef signext %0) local_unnamed_addr #0 !dbg !30 { #dbg_value(i32 %0, !35, !DIExpression(), !38) #dbg_value(i32 %0, !36, !DIExpression(), !39) %2 = icmp slt i32 %0, 32000, !dbg !40 br i1 %2, label %3, label %5, !dbg !42 3: %4 = sext i32 %0 to i64, !dbg !42 br label %6, !dbg !42 5: ret i32 undef, !dbg !43 6: %7 = phi i64 [ %4, %3 ], [ %12, %6 ] #dbg_value(i64 %7, !36, !DIExpression(), !39) %8 = load float, ptr getelementptr inbounds nuw (i8, ptr @global_data, i64 128000), align 4, !dbg !44 %9 = getelementptr inbounds [32000 x float], ptr @global_data, i64 0, i64 %7, !dbg !50 %10 = load float, ptr %9, align 4, !dbg !51 %11 = fadd fast float %10, %8, !dbg !51 store float %11, ptr %9, align 4, !dbg !51 %12 = add nsw i64 %7, 1, !dbg !52 #dbg_value(i64 %12, !36, !DIExpression(), !39) %13 = and i64 %12, 4294967295, !dbg !40 %14 = icmp eq i64 %13, 32000, !dbg !40 br i1 %14, label %5, label %6, !dbg !42 } ```

Changing global_data.a[i] += global_data.b[0]; to global_data.a[i] += global_data.b[i];:

llvm ir

``` llvm %struct.GlobalData = type { [32000 x float], [32000 x float] } @global_data = dso_local local_unnamed_addr global %struct.GlobalData zeroinitializer, align 4, !dbg !0 @a = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 4, !dbg !5 @b = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 4, !dbg !12 define dso_local signext i32 @s122(i32 noundef signext %0) local_unnamed_addr #0 !dbg !30 { #dbg_value(i32 %0, !35, !DIExpression(), !38) #dbg_value(i32 %0, !36, !DIExpression(), !39) %2 = icmp slt i32 %0, 32000, !dbg !40 br i1 %2, label %3, label %42, !dbg !42 3: %4 = sext i32 %0 to i64, !dbg !42 %5 = add i32 %0, 1, !dbg !42 %6 = zext i32 %5 to i64, !dbg !42 %7 = sub nsw i64 32001, %6, !dbg !42 %8 = tail call i64 @llvm.vscale.i64(), !dbg !42 %9 = shl nuw nsw i64 %8, 2, !dbg !42 %10 = tail call i64 @llvm.umax.i64(i64 %9, i64 16), !dbg !42 %11 = icmp ult i64 %7, %10, !dbg !42 br i1 %11, label %12, label %14, !dbg !42 12: %13 = phi i64 [ %4, %14 ], [ %4, %3 ], [ %27, %40 ] br label %43, !dbg !42 14: %15 = add i32 %0, 1, !dbg !42 %16 = zext i32 %15 to i64, !dbg !42 %17 = sub nsw i64 32000, %16, !dbg !42 %18 = trunc i64 %17 to i32, !dbg !42 %19 = sub i32 -2, %0, !dbg !42 %20 = icmp ult i32 %19, %18, !dbg !42 %21 = icmp ugt i64 %17, 4294967295, !dbg !42 %22 = or i1 %20, %21, !dbg !42 br i1 %22, label %12, label %23 23: %24 = tail call i64 @llvm.vscale.i64(), !dbg !42 %25 = mul nsw i64 %24, -4, !dbg !42 %26 = and i64 %7, %25, !dbg !42 %27 = add nsw i64 %26, %4, !dbg !42 %28 = tail call i64 @llvm.vscale.i64(), !dbg !42 %29 = shl nuw nsw i64 %28, 2, !dbg !42 br label %30, !dbg !42 30: %31 = phi i64 [ 0, %23 ], [ %38, %30 ] %32 = add i64 %31, %4, !dbg !42 %33 = getelementptr inbounds [32000 x float], ptr getelementptr inbounds nuw (i8, ptr @global_data, i64 128000), i64 0, i64 %32, !dbg !43 %34 = load , ptr %33, align 4, !dbg !43 %35 = getelementptr inbounds [32000 x float], ptr @global_data, i64 0, i64 %32, !dbg !49 %36 = load , ptr %35, align 4, !dbg !50 %37 = fadd fast %36, %34, !dbg !50 store %37, ptr %35, align 4, !dbg !50 %38 = add nuw i64 %31, %29 %39 = icmp eq i64 %38, %26 br i1 %39, label %40, label %30 40: %41 = icmp eq i64 %7, %26, !dbg !42 br i1 %41, label %42, label %12, !dbg !42 42: ret i32 undef, !dbg !56 43: %44 = phi i64 [ %50, %43 ], [ %13, %12 ] #dbg_value(i64 %44, !36, !DIExpression(), !39) %45 = getelementptr inbounds [32000 x float], ptr getelementptr inbounds nuw (i8, ptr @global_data, i64 128000), i64 0, i64 %44, !dbg !43 %46 = load float, ptr %45, align 4, !dbg !43 %47 = getelementptr inbounds [32000 x float], ptr @global_data, i64 0, i64 %44, !dbg !49 %48 = load float, ptr %47, align 4, !dbg !50 %49 = fadd fast float %48, %46, !dbg !50 store float %49, ptr %47, align 4, !dbg !50 %50 = add nsw i64 %44, 1, !dbg !57 #dbg_value(i64 %50, !36, !DIExpression(), !39) %51 = and i64 %50, 4294967295, !dbg !40 %52 = icmp eq i64 %51, 32000, !dbg !40 br i1 %52, label %42, label %43, !dbg !42 } declare i64 @llvm.vscale.i64() #1 declare i64 @llvm.umax.i64(i64, i64) #2 ```

topperc commented 3 weeks ago

This doesn't look specific to RISC-V.

I think its an issue with alias analysis.

hiraditya commented 3 weeks ago

yes as Craig mentioned clang doesn't even vectorize for x86 targets but gcc does. https://godbolt.org/z/q74eoxGfj

topperc commented 3 weeks ago

If alias analysis was working, I'd expect LICM to pull the load out of the loop.

hiraditya commented 3 weeks ago

Changing global_data.a[i] += global_data.b[0]; to global_data.a[i] += global_data.b[i];

Yeah, this is strange when clang vectorizes with global_data.b[i] but not with global_data.b[0]

topperc commented 3 weeks ago

Changing global_data.a[i] += global_data.b[0]; to global_data.a[i] += global_data.b[i];

Yeah, this is strange when clang vectorizes with global_data.b[i] but not with global_data.b[0]

The vectorizer emitted a runtime check to verify the aliasing. I guess for some reason it doesn't do that for uniform loads?

Fros1er commented 3 weeks ago

May I self assign this? Although I'm not familiar with clang, it might take a while for me to understand and solve the issue.

fhahn commented 3 weeks ago

Is there any info that allows us to prove n1 >=s 0? If that is the case, this gets handled by applying loop guards to the SCEV expressions when trying to determine all source accesses start/end before the sink accesses.

With https://github.com/fhahn/llvm-project/commit/62235dca45533e6e5155470e79a5c18be1a21762 a variant of the reproducer here with an assumption that n1 >=s 0 (https://llvm.godbolt.org/z/bT57M8nnY) can vectorized without runtime checks

hiraditya commented 3 weeks ago

Even when the struct is passed as a parameter, the vectorization doesn't happen. https://godbolt.org/z/q5r6dboK3

Fros1er commented 3 weeks ago

Seems b[0] is also not the point.

#define LEN 32000

struct GlobalData {
    float a[LEN];
    float b;
};

int s122(int n1, struct GlobalData &global_data) {
    for (int i = n1; i < LEN; i++) {
        global_data.a[i] += global_data.b;
    }
    return n1;
}

https://godbolt.org/z/n8zT4xohn

fhahn commented 2 weeks ago

Seems b[0] is also not the point.

Yes that doesn't matter, in both cases we should prove that the loads from global_data.b | global_data.b[0] don't overlap with the stores to a[I].

Even when the struct is passed as a parameter, the vectorization doesn't happen. https://godbolt.org/z/q5r6dboK3 Don't think passing it as parameter changes things, if anything it would make reasoning a bit harder, if there are other accesses.

Hmm, looking at https://github.com/llvm/llvm-test-suite/blob/main/MultiSource/Benchmarks/TSVC/tsc.inc#L1260 which I think should be s122, it seems like there's no info the determine that n1 >=s 0

Fros1er commented 2 weeks ago

This doesn't look specific to RISC-V.

I think its an issue with alias analysis.

Index.Val.V here is phi n1, i. As we don't know about n1 here, CR is (INT_MIN, INT_MAX). Maybe we can check GEP1->getSourceElementType()->isArrayTy() here and use array size to narrow CR?

https://github.com/llvm/llvm-project/blob/c8cac33ad23acc671a0a7390a5254b9f6e848138/llvm/lib/Analysis/BasicAliasAnalysis.cpp#L1268-L1324

hiraditya commented 2 weeks ago

cc: @nikic and @efriedma-quic for help with this.

efriedma-quic commented 2 weeks ago

Trying to use the array bounds from the C source code isn't possible given the way the LLVM IR is currently emitted. Some people have looked at encoding more information about arrays into IR, but it's extremely complicated.

It should be possible for LoopAccessAnalysis to prove the lack of aliasing, just based on the indexing. If i is nonnegative, i < LEN implies &global_data.a[i] < &global_data.a[LEN]. If i is negative, &global_data.a[i] doesn't point to global_data.

Fros1er commented 1 week ago

Trying to use the array bounds from the C source code isn't possible given the way the LLVM IR is currently emitted. Some people have looked at encoding more information about arrays into IR, but it's extremely complicated.

It should be possible for LoopAccessAnalysis to prove the lack of aliasing, just based on the indexing. If i is nonnegative, i < LEN implies &global_data.a[i] < &global_data.a[LEN]. If i is negative, &global_data.a[i] doesn't point to global_data.

What about -scev-aa? I don't know how to enable it though. LoopAccessAnalysis doesn't have interface for this problem and it's hard to write one as it is coupled heavily with vectorization.

efriedma-quic commented 1 week ago

scev-aa is a thin wrapper around SCEV; you should be able to just use SCEV directly.

llvm / llvm-project

[RISCV][clang] Failed to vectorize a[i] + b[CONSTANT] when a and b both in global struct #104596