Closed windenntw closed 5 years ago
I'd guess your variable fw0 has not the optimal type.
I've managed to create a standalone file that reproduces the problem with a fresh compiler from 26-nov-2018, at -O2 when handling either float or double. The problem disappears when using -O1.
( the full code is below )
There are 2 strange things there:
move.l _value,a0
.loop
move.l a0,-(sp)
fsmove.s (sp)+,fp0
fsadd.s _inc,fp0
fsmove.s fp0,-(sp)
move.l (sp)+,d0
dbf d7.loop
move.l a0,_value
fmove.s #0x3f800000,fp0
move.l a0,d0
fsdiv.s d0,fp0
to implemente the C code:
float t = 1.0f / v;
Full code below:
/*
* m68k-amigaos-gcc -fomit-frame-pointer -march=68060 -mhard-float -O2 -S repro-bebbo-gcc-44.c
*/
/*
* Problem happens for both float and double.
*/
#define FLOAT_TYPE float
extern FLOAT_TYPE fa0,fb0,fc0,fd0;
extern FLOAT_TYPE fa1,fb1,fc1,fd1;
extern FLOAT_TYPE fa2,fb2,fc2,fd2;
void interpol(int y, int* v) {
while(--y) {
FLOAT_TYPE aa,a,b,c;
int bb,cc,dd,bb1,cc1;
a = fa0; aa = 1.0f / a;
b = fb0; bb = b * aa;
c = fc0; cc = c * aa;
for (int s = 0 ; s < 8 ; ++s) {
a += fa1; aa = 1.0f * a;
b += fb1; bb1 = b * aa; bb1 >>= 4;
c += fc1; cc1 = c * aa; cc1 >>= 4;
for (int x = 0 ; x < 16 ; ++x) {
*v++ = bb + cc + dd;
bb += bb1;
cc += cc1;
}
}
v = v - (8*16) + 1024;
fa0 += fa2;
fb0 += fb2;
fc0 += fc2;
}
}
Tweaked the compiler options to dump debug info and line 1159 of m68k.md shows up a lot:
m68k-amigaos-gcc -march=68060 -mhard-float -O2 -fomit-frame-pointer -dp -S repro-bebbo-gcc-44.c && less ./repro-bebbo-gcc-44.s
#NO_APP
.text
.align 2
.globl _interpol
_interpol:
lea (-12,sp),sp | 120 *addsi3_internal/2
fmovem #252,-(sp) | 121 *m68k_store_multiple_automod
movem.l a4/a3/a2/d7/d6/d5/d4/d3/d2,-(sp) | 122 *m68k_store_multiple_
automod
move.l 128(sp),d7 | 5 *movsi_m68k2/1
move.l 124(sp),a3 | 106 *movsi_m68k/1
subq.l #1,a3 | 13 *addsi3_internal/2
move.l a3,d1 | 134 *movsi_m68k/1
jeq .L1 | 15 beq
move.l _fa0,a2 | 21 *m68k.md:1159
fsmove.s _fb0,fp7 | 22 *m68k.md:1159
move.l _fc0,a4 | 23 *m68k.md:1159
fsmove.s _fa1,fp5 | 24 *m68k.md:1159
fsmove.s _fb1,fp4 | 25 *m68k.md:1159
fsmove.s _fc1,fp3 | 26 *m68k.md:1159
move.l _fa2,108(sp) | 27 *m68k.md:1159
move.l _fb2,112(sp) | 28 *m68k.md:1159
move.l _fc2,116(sp) | 29 *m68k.md:1159
.L5:
fmove.s #0x3f800000,fp0 | 107 *m68k.md:1159
move.l a2,d0 | 108 *m68k.md:1159
fsdiv.s d0,fp0 | 31 divsf3_68881
fsmove.x fp0,fp1 | 109 *m68k.md:1159
fsmul.x fp7,fp1 | 32 mulsf_68881
fintrz.x fp1,fp1 | 33 ftruncsf2_68881
fmove.l fp1,d5 | 34 fixsfsi2_68881
move.l a4,d0 | 110 *m68k.md:1159
fsmul.s d0,fp0 | 35 mulsf_68881
fintrz.x fp0,fp0 | 36 ftruncsf2_68881
fmove.l fp0,d4 | 37 fixsfsi2_68881
move.l d7,d6 | 111 *movsi_m68k/1
add.l #512,d6 | 38 *addsi3_internal/4
move.l a4,-(sp) | 7 *m68k.md:1159
fsmove.s (sp)+,fp2
fsmove.x fp7,fp1 | 8 *m68k.md:1159
move.l a2,-(sp) | 9 *m68k.md:1159
fsmove.s (sp)+,fp0
move.l d7,a1 | 10 *movsi_m68k2/1
.L4:
well, you can't use AREGs with FREGs.
invalid:
fsdiv.s a0,fp0
valid:
fsdiv.s d0,fp0
Yes I understand that but, what could we do to ask the compiler to not use AREGs nor DREGs as a "swap space" for FREGs?
then the compiler won't chose registers as temp.
right now the costs are defined as:
fsadd.s _fc2,fp0 | 20
fsadd.s d0,fp0 | 4
move.l a4,-(sp) fsmove.s (sp)+,fp2 | 4
(no comment)
I've managed to get the use of AREGs or DREGs as temps to go away by using -march=68040 -mtune=68040 -- got the code at the bottom.
Still, the numbers I get when using -dp seem to be a sequence of integers (see 21, 22, 23, 24, 25,... at the start of the function) instead of costs you quoted above which seem more stable -- I wonder which is the option you used to dump these.
#NO_APP
.text
.align 2
.globl _interpol
_interpol:
add.w #-16,sp | 121 *addsi3_internal/2
fmovem #252,-(sp) | 122 *m68k_store_multiple_automod
movem.l a5/a4/a2/d7/d6/d5/d4/d3/d2,-(sp) | 123 *m68k_store_multiple_automod
move.l 132(sp),a4 | 5 *movsi_m68k2/1
move.l 128(sp),a5 | 112 *movsi_m68k/1
subq.l #1,a5 | 13 *addsi3_internal/2
move.l a5,d2 | 135 *movsi_m68k/1
jeq .L1 | 15 beq
fsmove.s _fa0,fp6 | 21 *m68k.md:1159
fsmove.s _fb0,fp5 | 22 *m68k.md:1159
fsmove.s _fc0,fp4 | 23 *m68k.md:1159
move.l _fa1,d6 | 24 *m68k.md:1159
move.l _fb1,108(sp) | 25 *m68k.md:1159
move.l _fc1,112(sp) | 26 *m68k.md:1159
fsmove.s _fa2,fp7 | 27 *m68k.md:1159
move.l _fb2,116(sp) | 28 *m68k.md:1159
move.l _fc2,120(sp) | 29 *m68k.md:1159
.L5:
fmove.s #0x3f800000,fp0 | 113 *m68k.md:1159
fsdiv.x fp6,fp0 | 31 divsf3_68881
fsmove.x fp0,fp2 | 114 *m68k.md:1159
fsmul.x fp5,fp2 | 32 mulsf_68881
fmovem.l fpcr,d0 | 34 fix_truncdfsi2
moveq #16,d1
or.l d0,d1
and.w #-33,d1
fmovem.l d1,fpcr
fmove.l fp2,d2
fmovem.l d0,fpcr
fsmul.x fp4,fp0 | 35 mulsf_68881
fmovem.l fpcr,d3 | 37 fix_truncdfsi2
moveq #16,d4
or.l d3,d4
and.w #-33,d4
fmovem.l d4,fpcr
fmove.l fp0,d1
fmovem.l d3,fpcr
move.l a4,d3 | 115 *movsi_m68k/1
add.l #512,d3 | 38 *addsi3_internal/4
fsmove.x fp4,fp3 | 7 *m68k.md:1159
fsmove.x fp5,fp2 | 8 *m68k.md:1159
fsmove.x fp6,fp1 | 9 *m68k.md:1159
move.l a4,a1 | 10 *movsi_m68k2/1
.L4:
fsadd.s d6,fp1 | 40 addsf3_68881
fsadd.s 108(sp),fp2 | 41 addsf3_68881
fsmove.x fp1,fp0 | 116 *m68k.md:1159
fsmul.x fp2,fp0 | 42 mulsf_68881
fmovem.l fpcr,d5 | 44 fix_truncdfsi2
moveq #16,d0
or.l d5,d0
and.w #-33,d0
fmovem.l d0,fpcr
fmove.l fp0,d7
fmovem.l d5,fpcr
asr.l #4,d7 | 45 ashrsi3
fsadd.s 112(sp),fp3 | 46 addsf3_68881
fsmove.x fp1,fp0 | 117 *m68k.md:1159
fsmul.x fp3,fp0 | 47 mulsf_68881
fmovem.l fpcr,d5 | 49 fix_truncdfsi2
moveq #16,d0
or.l d5,d0
and.w #-33,d0
fmovem.l d0,fpcr
fmove.l fp0,d4
fmovem.l d5,fpcr
asr.l #4,d4 | 50 ashrsi3
move.l d7,a2 | 118 *movsi_m68k/1
add.l d4,a2 | 51 *addsi3_internal/2
move.l d2,d0 | 119 *movsi_m68k/1
add.l d1,d0 | 53 *addsi3_internal/4
move.l a1,a0 | 54 *movsi_m68k2/1
add.w #64,a1 | 55 *addsi3_internal/2
.L3:
move.l d0,(a0)+ | 58 *movsi_m68k2/1
add.l a2,d0 | 60 *addsi3_internal/4
cmp.l a1,a0 | 62 *m68k.md:499/1
jne .L3 | 63 bne
lsl.l #4,d7 | 67 ashlsi3
add.l d7,d2 | 70 *addsi3_internal/4
move.l d4,d0 | 120 *movsi_m68k/1
lsl.l #4,d0 | 73 ashlsi3
add.l d0,d1 | 76 *addsi3_internal/4
cmp.l a1,d3 | 78 *m68k.md:499/1
jne .L4 | 79 bne
add.w #4096,a4 | 81 *addsi3_internal/2
fsadd.x fp7,fp6 | 82 addsf3_68881
fsadd.s 116(sp),fp5 | 83 addsf3_68881
fsadd.s 120(sp),fp4 | 84 addsf3_68881
subq.l #1,a5 | 85 *addsi3_internal/2
move.l a5,d2 | 137 *movsi_m68k/1
jne .L5 | 88 bne
fmove.s fp6,_fa0 | 90 *m68k.md:1159
fmove.s fp5,_fb0 | 91 *m68k.md:1159
fmove.s fp4,_fc0 | 92 *m68k.md:1159
.L1:
movem.l (sp)+,d2/d3/d4/d5/d6/d7/a2/a4/a5 | 126 *m68k_load_multiple_automod
fmovem (sp)+,#63 | 127 *m68k_load_multiple_automod
add.w #16,sp | 128 *addsi3_internal/2
rts | 129 *return
final update today: same bug happens with -mtune=68080
use -fbbb=+V
to see the internal costs. And I know that the current costs are off - far off.
please test
works for me
Confirmed on my end too :)
Don't have a reduced test case yet, but here is an indication:
I got a big loop that uses more than 8 floats and more than 16 integers / pointers, so there is register pressure to move things around.
One of the thing the loop does is update a float in memory in each iteration of the loop -- in this case it decided to do this sort of sequence:
which is really nuts, as doing this on each iteration:
fsmove.s _fw0,fp0 fsadd.x fp4,fp0 fsmove.s fp0,_fw0
would use less memory access in total.
I'll try and post a short reproducible loop later, wondering if there are any tips for that.