Open nathanchance opened 2 years ago
i386 has the same issue:
ERROR: modpost: "__udivdi3" [drivers/mtd/parsers/scpart.ko] undefined!
The cvise
'd reproducer above does not reproduce that issue, so I ran the reduction again and ended up with:
struct {
int part_id;
int part_offs;
int part_bytes;
} scpart_scan_partmap_tmpdesc;
int scpart_scan_partmap_master_0;
long long scpart_scan_partmap_offs;
void scpart_scan_partmap() {
while (scpart_scan_partmap_offs < scpart_scan_partmap_master_0)
scpart_scan_partmap_offs += sizeof scpart_scan_partmap_tmpdesc;
}
which reproduces the issue on both ARM and i386.
ARM GCC:
$ arm-linux-gnu-gcc --version | head -1
arm-linux-gnu-gcc (GCC) 12.1.1 20220507 (Red Hat Cross 12.1.1-1)
$ arm-linux-gnu-gcc -O2 -c -o scpart.{o,i}
$ llvm-nm -S scpart.o | grep __aeabi_uldivmod
$ llvm-objdump -dr scpart.o
scpart.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <scpart_scan_partmap>:
0: 04 e0 2d e5 str lr, [sp, #-4]!
4: 00 e0 00 e3 movw lr, #0
00000004: R_ARM_MOVW_ABS_NC .LANCHOR0
8: 00 e0 40 e3 movt lr, #0
00000008: R_ARM_MOVT_ABS .LANCHOR0
c: 00 10 9e e5 ldr r1, [lr]
10: 08 30 9e e5 ldr r3, [lr, #8]
14: 0c 20 9e e5 ldr r2, [lr, #12]
18: c1 0f a0 e1 asr r0, r1, #31
1c: 01 00 53 e1 cmp r3, r1
20: 00 c0 d2 e0 sbcs r12, r2, r0
24: 04 f0 9d a4 ldrge pc, [sp], #4
28: 0c 30 93 e2 adds r3, r3, #12
2c: 00 20 a2 e2 adc r2, r2, #0
30: 01 00 53 e1 cmp r3, r1
34: 00 c0 d2 e0 sbcs r12, r2, r0
38: fa ff ff ba blt 0x28 <scpart_scan_partmap+0x28> @ imm = #-24
3c: 08 30 8e e5 str r3, [lr, #8]
40: 0c 20 8e e5 str r2, [lr, #12]
44: 04 f0 9d e4 ldr pc, [sp], #4
ARM Clang:
$ clang --version | head -1
ClangBuiltLinux clang version 15.0.0 (https://github.com/llvm/llvm-project e91a73de24d60954700d7ac0293c050ab2cbe90b)
$ clang --target=arm-linux-gnueabi -O2 -c -o scpart.{o,i}
$ llvm-nm -S scpart.o | grep __aeabi_uldivmod
U __aeabi_uldivmod
$ llvm-objdump -dr scpart.o [6/3327]
scpart.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <scpart_scan_partmap>:
0: f0 41 2d e9 push {r4, r5, r6, r7, r8, lr}
4: b4 40 9f e5 ldr r4, [pc, #180] @ 0xc0 <$d.1>
8: 04 40 8f e0 add r4, pc, r4
c: 04 50 94 e5 ldr r5, [r4, #4]
10: ac 60 9f e5 ldr r6, [pc, #172] @ 0xc4 <$d.1+0x4>
14: 06 60 9f e7 ldr r6, [pc, r6]
18: a8 00 9f e5 ldr r0, [pc, #168] @ 0xc8 <$d.1+0x8>
1c: 00 00 9f e7 ldr r0, [pc, r0]
20: 00 10 56 e0 subs r1, r6, r0
24: c0 1f d5 e0 sbcs r1, r5, r0, asr #31
28: 22 00 00 aa bge 0xb8 <scpart_scan_partmap+0xb8> @ imm = #136
2c: 0c 10 96 e2 adds r1, r6, #12
30: 00 30 a0 e1 mov r3, r0
34: 0c 80 a0 e3 mov r8, #12
38: 00 20 a5 e2 adc r2, r5, #0
3c: c0 0f 52 e1 cmp r2, r0, asr #31
40: 01 30 a0 c1 movgt r3, r1
44: 00 00 51 e1 cmp r1, r0
48: 00 10 a0 91 movls r1, r0
4c: c0 0f 52 e1 cmp r2, r0, asr #31
50: 03 10 a0 11 movne r1, r3
54: c0 2f a0 d1 asrle r2, r0, #31
58: 0c 00 51 e2 subs r0, r1, #12
5c: 00 20 c2 e2 sbc r2, r2, #0
60: 06 10 20 e0 eor r1, r0, r6
64: 05 30 22 e0 eor r3, r2, r5
68: 03 70 91 e1 orrs r7, r1, r3
6c: 01 70 a0 13 movne r7, #1
70: 07 10 96 e0 adds r1, r6, r7
74: 00 30 a5 e2 adc r3, r5, #0
78: 01 00 50 e0 subs r0, r0, r1
7c: 03 10 c2 e0 sbc r1, r2, r3
80: 0c 20 a0 e3 mov r2, #12
84: 00 30 a0 e3 mov r3, #0
88: fe ff ff eb bl 0x88 <scpart_scan_partmap+0x88> @ imm = #-8
00000088: R_ARM_CALL __aeabi_uldivmod
8c: 07 00 90 e0 adds r0, r0, r7
90: 00 10 a1 e2 adc r1, r1, #0
94: 90 28 83 e0 <unknown>
98: 81 10 81 e0 add r1, r1, r1, lsl #1
9c: 01 01 83 e0 add r0, r3, r1, lsl #2
a0: 02 10 96 e0 adds r1, r6, r2
a4: 00 00 a5 e0 adc r0, r5, r0
a8: 0c 10 91 e2 adds r1, r1, #12
ac: 00 00 a0 e2 adc r0, r0, #0
b0: 00 10 84 e5 str r1, [r4]
b4: 04 00 84 e5 str r0, [r4, #4]
b8: f0 41 bd e8 pop {r4, r5, r6, r7, r8, lr}
bc: 1e ff 2f e1 bx lr
000000c0 <$d.1>:
c0: b0 00 00 00 .word 0x000000b0
000000c0: R_ARM_REL32 scpart_scan_partmap_offs
c4: a8 00 00 00 .word 0x000000a8
000000c4: R_ARM_REL32 scpart_scan_partmap_offs
c8: a4 00 00 00 .word 0x000000a4
000000c8: R_ARM_REL32 scpart_scan_partmap_master_0
i386 GCC:
$ gcc --version | head -1
gcc (GCC) 12.1.0
$ llvm-nm -S scpart.o | grep __udivdi3
$ llvm-objdump -dr scpart.o
scpart.o: file format elf32-i386
Disassembly of section .text:
00000000 <scpart_scan_partmap>:
0: 57 pushl %edi
1: 56 pushl %esi
2: e8 fc ff ff ff calll 0x3 <scpart_scan_partmap+0x3>
00000003: R_386_PC32 __x86.get_pc_thunk.si
7: 81 c6 02 00 00 00 addl $2, %esi
00000009: R_386_GOTPC _GLOBAL_OFFSET_TABLE_
d: 53 pushl %ebx
e: f3 0f 7e 8e 00 00 00 00 movq (%esi), %xmm1 # xmm1 = mem[0],zero
00000012: R_386_GOTOFF scpart_scan_partmap_offs
16: 8b be 00 00 00 00 movl (%esi), %edi
00000018: R_386_GOTOFF scpart_scan_partmap_master_0
1c: 66 0f 6f c1 movdqa %xmm1, %xmm0
20: 89 fb movl %edi, %ebx
22: 66 0f 7e ca movd %xmm1, %edx
26: 66 0f 73 d0 20 psrlq $32, %xmm0
2b: c1 fb 1f sarl $31, %ebx
2e: 39 fa cmpl %edi, %edx
30: 66 0f 7e c0 movd %xmm0, %eax
34: 19 d8 sbbl %ebx, %eax
36: 7d 33 jge 0x6b <scpart_scan_partmap+0x6b>
38: 89 f9 movl %edi, %ecx
3a: 8d b6 00 00 00 00 leal (%esi), %esi
40: 66 0f 6f 86 00 00 00 00 movdqa (%esi), %xmm0
00000044: R_386_GOTOFF .LC0
48: 66 0f d4 c8 paddq %xmm0, %xmm1
4c: 66 0f 6f c1 movdqa %xmm1, %xmm0
50: 66 0f 7e ca movd %xmm1, %edx
54: 66 0f 73 d0 20 psrlq $32, %xmm0
59: 39 ca cmpl %ecx, %edx
5b: 66 0f 7e c0 movd %xmm0, %eax
5f: 19 d8 sbbl %ebx, %eax
61: 7c dd jl 0x40 <scpart_scan_partmap+0x40>
63: 66 0f d6 8e 00 00 00 00 movq %xmm1, (%esi)
00000067: R_386_GOTOFF scpart_scan_partmap_offs
6b: 5b popl %ebx
6c: 5e popl %esi
6d: 5f popl %edi
6e: c3 retl
Disassembly of section .text.__x86.get_pc_thunk.si:
00000000 <__x86.get_pc_thunk.si>:
0: 8b 34 24 movl (%esp), %esi
3: c3 retl
i386 Clang:
$ clang --version | head -1
ClangBuiltLinux clang version 15.0.0 (https://github.com/llvm/llvm-project e91a73de24d60954700d7ac0293c050ab2cbe90b)
$ clang -O2 -m32 -c -o scpart.{o,i}
$ llvm-nm -S scpart.o | grep __udivdi3
U __udivdi3
$ llvm-objdump -dr scpart.o
scpart.o: file format elf32-i386
Disassembly of section .text:
00000000 <scpart_scan_partmap>:
0: 55 pushl %ebp
1: 53 pushl %ebx
2: 57 pushl %edi
3: 56 pushl %esi
4: 83 ec 0c subl $12, %esp
7: e8 00 00 00 00 calll 0xc <scpart_scan_partmap+0xc>
c: 5b popl %ebx
d: 81 c3 03 00 00 00 addl $3, %ebx
0000000f: R_386_GOTPC _GLOBAL_OFFSET_TABLE_
13: 8b ab 00 00 00 00 movl (%ebx), %ebp
00000015: R_386_GOTOFF scpart_scan_partmap_master_0
19: 89 ea movl %ebp, %edx
1b: c1 fa 1f sarl $31, %edx
1e: 8b 8b 04 00 00 00 movl 4(%ebx), %ecx
00000020: R_386_GOTOFF scpart_scan_partmap_offs
24: 8b bb 00 00 00 00 movl (%ebx), %edi
00000026: R_386_GOTOFF scpart_scan_partmap_offs
2a: 39 ef cmpl %ebp, %edi
2c: 89 c8 movl %ecx, %eax
2e: 19 d0 sbbl %edx, %eax
30: 0f 8d 87 00 00 00 jge 0xbd <scpart_scan_partmap+0xbd>
36: 89 f8 movl %edi, %eax
38: 83 c0 0c addl $12, %eax
3b: 89 4c 24 08 movl %ecx, 8(%esp)
3f: 83 d1 00 adcl $0, %ecx
42: 39 e8 cmpl %ebp, %eax
44: 89 ee movl %ebp, %esi
46: 0f 47 f0 cmoval %eax, %esi
49: 39 d1 cmpl %edx, %ecx
4b: 0f 4e c5 cmovlel %ebp, %eax
4e: 0f 44 c6 cmovel %esi, %eax
51: 0f 4e ca cmovlel %edx, %ecx
54: 83 c0 f4 addl $-12, %eax
57: 83 d1 ff adcl $-1, %ecx
5a: 89 c2 movl %eax, %edx
5c: 31 fa xorl %edi, %edx
5e: 89 ce movl %ecx, %esi
60: 8b 6c 24 08 movl 8(%esp), %ebp
64: 31 ee xorl %ebp, %esi
66: 09 d6 orl %edx, %esi
68: ba 00 00 00 00 movl $0, %edx
6d: 0f 95 c2 setne %dl
70: 89 54 24 04 movl %edx, 4(%esp)
74: 89 fa movl %edi, %edx
76: 03 54 24 04 addl 4(%esp), %edx
7a: 89 ee movl %ebp, %esi
7c: 83 d6 00 adcl $0, %esi
7f: 29 d0 subl %edx, %eax
81: 19 f1 sbbl %esi, %ecx
83: 6a 00 pushl $0
85: 6a 0c pushl $12
87: 51 pushl %ecx
88: 50 pushl %eax
89: e8 fc ff ff ff calll 0x8a <scpart_scan_partmap+0x8a>
0000008a: R_386_PLT32 __udivdi3
8e: 83 c4 10 addl $16, %esp
91: 89 d1 movl %edx, %ecx
93: 03 44 24 04 addl 4(%esp), %eax
97: 83 d1 00 adcl $0, %ecx
9a: ba 0c 00 00 00 movl $12, %edx
9f: f7 e2 mull %edx
a1: 8d 0c 49 leal (%ecx,%ecx,2), %ecx
a4: 8d 0c 8a leal (%edx,%ecx,4), %ecx
a7: 01 f8 addl %edi, %eax
a9: 11 e9 adcl %ebp, %ecx
ab: 83 c0 0c addl $12, %eax
ae: 83 d1 00 adcl $0, %ecx
b1: 89 83 00 00 00 00 movl %eax, (%ebx)
000000b3: R_386_GOTOFF scpart_scan_partmap_offs
b7: 89 8b 04 00 00 00 movl %ecx, 4(%ebx)
000000b9: R_386_GOTOFF scpart_scan_partmap_offs
bd: 83 c4 0c addl $12, %esp
c0: 5e popl %esi
c1: 5f popl %edi
c2: 5b popl %ebx
c3: 5d popl %ebp
c4: c3 retl
Is this a regression on the clang side?
I don't think so, it is visible with clang-11
:
https://github.com/ClangBuiltLinux/continuous-integration2/runs/6425305495?check_suite_focus=true
$ clang --version | head -1
Ubuntu clang version 11.0.1-2ubuntu5
$ clang --target=arm-linux-gnueabi -O2 -c -o scpart.{o,i}
$ llvm-nm -S scpart.o | grep __aeabi_uldivmod
U __aeabi_uldivmod
the reproducers look like they rely on signed integer overflow, which is UB
It looks the reproducer corresponds to:
static int scpart_scan_partmap(struct mtd_info *master, loff_t partmap_offs,
struct sc_part_desc **ppdesc)
{
...
uint8_t *buf;
loff_t offs;
size_t retlen;
struct sc_part_desc *pdesc = NULL;
struct sc_part_desc *tmpdesc;
int cnt = 0;
int res2;
int res = 0;
...
offs = MAP_OFFS_IN_BLK;
while (offs < (master->erasesize - sizeof(*tmpdesc))) {
tmpdesc = (struct sc_part_desc *)&(buf[offs]);
if (!scpart_desc_is_valid(tmpdesc))
break;
cnt++;
offs += sizeof(*tmpdesc);
}
...
I am not sure that the original code is relying on signed integer overflow? Regardless, the kernel builds with -fno-strict-overflow
, which implies -fwrapv
, so does that really matter here?
v6, which is now in -next, works around this by only building the driver when the platform it runs on (which is MIPS-based) is enabled, so we shouldn't see this again, but I think the root cause should still be looked into.
The RALINK dependency is not great, we usually want all code to be enabled for CONFIG_COMPILE_TEST if possible.
For the kernel code, I would simply make 'offs' a 32-bit type, since we know that it is bounded by master->erasesize, which in turn is a 32-bit type and in practice is limited to a few MB in size.
On the llvm side, the reduced test case is clearly a misoptimization that should be addressed in a future version by not turning the cheap loop into an expensive division.
indvars
is the pass introducing the expensive 64b divide.
target triple = "armv4t-unknown-linux-gnueabi"
@scpart_scan_partmap_offs = external global i64
define void @scpart_scan_partmap() {
entry:
br label %while.body.preheader
while.body.preheader: ; preds = %entry
br label %while.body
while.body: ; preds = %while.body, %while.body.preheader
%add13 = phi i64 [ %add, %while.body ], [ undef, %while.body.preheader ]
%add = add nsw i64 %add13, 12
%cmp = icmp slt i64 %add13, 0
br i1 %cmp, label %while.body, label %while.cond.while.end_crit_edge
while.cond.while.end_crit_edge: ; preds = %while.body
%add.lcssa = phi i64 [ %add13, %while.body ]
store i64 %add.lcssa, ptr null, align 8
ret void
}
$ opt -passes=indvars reduced.ll -S -o - | grep udiv
rewriteLoopExitValues
in llvm/lib/Transforms/Utils/LoopUtils.cpp
is doing the rewriting.
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 965b35a50978..1d3080b34720 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1360,7 +1360,7 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
// Only do the rewrite when the ExitValue can be expanded cheaply.
// If LoopCanBeDel is true, rewrite exit value aggressively.
- if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost)
+ if (ReplaceExitValue == OnlyCheapRepl && Phi.HighCost)
continue;
Value *ExitVal = Rewriter.expandCodeFor(
the context for this code was moved in 93175a5caa08360ca60b417cc04c094e1ed05c76.
rewriteLoopExitValues
tries to move code from the loop body to a "preheader" (seems like a form of LICM) if it determines that the loop body can be deleted, regardless of cost. Maybe the SCEV candidates can be constructed to not use 64b div on 32b hosts?
Filed upstream: https://github.com/llvm/llvm-project/issues/56153
This is fixed by:
The 0day bot reported an instance of
__aeabi_uldivmod
indrivers/mtd/parsers/scpart.c
, which is now visible in -next: https://lore.kernel.org/202205130802.ScXfu9JP-lkp@intel.com/Looking at the file, I don't see any modulo or division operators?
mtd_block_isbad()
ultimately leads to one but it is handled withdo_div()
already.https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/tree/drivers/mtd/parsers/scpart.c
This does not show up with GCC 12:
cvise
spits out: