Open llvmbot opened 15 years ago
The problem is still present in LLVM 3.3-almost-svn. Now the run-time is the same 0.86 for GCC but now compiling a D version of the code with ldc2 it runs in 1.12 (instead of 1.24 as in past).
This is the D version:
import core.stdc.stdio;
// return true if (x,y) point is inside the given polygon bool pnpoly(size_t N) (in size_t npol, in ref double[N] xp, in ref double[N] yp, in double x, in double y) pure nothrow { bool c = false;
for (size_t i = 0, j = npol - 1; i < npol; j = i++)
if ((((yp[i] <= y) && (y < yp[j])) ||
((yp[j] <= y) && (y < yp[i]))) &&
(x < (xp[j] - xp[i]) * (y - yp[i]) / (yp[j] - yp[i]) + xp[i]))
c = !c;
return c;
}
void main() { int npol = 20; immutable double[20] xp = [0.0,1.0,1.0,0.0,0.0,1.0,-0.5,-1.0,-1.0,-2.0, -2.5,-2.0,-1.5,-.5,1.0,1.0,0.0,-0.5,-1.0,-.5]; immutable double[20] yp = [0.0,0.0,1.0,1.0,2.0,3.0,2.0,3.0,0.0,-.5, -1.0,-1.5,-2.0,-2.0,-1.5,-1.0,-.5,-1.0,-1.0,-.5];
int count = 0;
for (size_t i = 0; i < 1_000_000; i++) {
if (pnpoly(npol, xp, yp, 0.5, 0.5)) count++;
if (pnpoly(npol, xp, yp, 0.5, 1.5)) count++;
if (pnpoly(npol, xp, yp, -0.5, 1.5)) count++;
if (pnpoly(npol, xp, yp, 0.75, 2.25)) count++;
if (pnpoly(npol, xp, yp, 0.0, 2.01)) count++;
if (pnpoly(npol, xp, yp, -0.5, 2.5)) count++;
if (pnpoly(npol, xp, yp, -1.0, -0.5)) count++;
if (pnpoly(npol, xp, yp, -1.5, 0.5)) count++;
if (pnpoly(npol, xp, yp, -2.25, -1.0)) count++;
if (pnpoly(npol, xp, yp, 0.5, -0.25)) count++;
if (pnpoly(npol, xp, yp, 0.5, -1.25)) count++;
if (pnpoly(npol, xp, yp, -0.5, -2.5)) count++;
}
printf("Count %d \n", count);
}
And the asm produced by ldc2 (loop only):
LBB1_2: movl %esi, %ebx movl %edx, %esi movl 36(%esp), %edx movsd (%edx,%ebx,8), %xmm2 movsd (%ebp), %xmm3 ucomisd %xmm3, %xmm1 jb LBB1_4 ucomisd %xmm1, %xmm2 ja LBB1_6 .align 16, 0x90 LBB1_4: ucomisd %xmm2, %xmm1 jb LBB1_8 movsd (%ebp), %xmm3 ucomisd %xmm1, %xmm3 jbe LBB1_8 LBB1_6: movapd %xmm1, %xmm6 subsd %xmm3, %xmm6 movl 40(%esp), %edx movsd (%edx,%ebx,8), %xmm4 movsd (%edi), %xmm5 subsd %xmm5, %xmm4 mulsd %xmm6, %xmm4 subsd %xmm3, %xmm2 divsd %xmm2, %xmm4 addsd %xmm5, %xmm4 ucomisd %xmm0, %xmm4 jbe LBB1_8 xorb $1, %al .align 16, 0x90 LBB1_8: addl $8, %edi addl $8, %ebp decl %ecx leal 1(%esi), %edx jne LBB1_2
And this is the asm produced by gcc 4.8.0 (gcc -std=c99 -Ofast -msse3 -flto -mfpmath=sse -march=native) (loop only):
L15: sall $3, %ebx comisd %xmm3, %xmm2 jae L9 .p2align 4,,7 L5: leal 1(%edx), %ecx cmpl %esi, %ecx je L14 L19: movl %edx, %ebx movl %ecx, %edx movapd %xmm1, %xmm3 movsd (%edi,%edx,8), %xmm1 comisd %xmm1, %xmm2 jb L15 L18: comisd %xmm2, %xmm3 jbe L5 sall $3, %ebx L9: movl %eax, %ecx movsd 0(%ebp,%edx,8), %xmm4 movsd 0(%ebp,%ebx), %xmm0 xorl $1, %ecx subsd %xmm4, %xmm0 movapd %xmm2, %xmm6 subsd %xmm1, %xmm3 subsd %xmm1, %xmm6 mulsd %xmm6, %xmm0 divsd %xmm3, %xmm0 addsd %xmm4, %xmm0 comisd %xmm5, %xmm0 cmova %ecx, %eax leal 1(%edx), %ecx cmpl %esi, %ecx jne L19
Shouldn't the sdisel sign- / zero- hack help here?
j is a PHI, so sdisel won't handle it.
Shouldn't the sdisel sign- / zero- hack help here?
The thing that's tripping up indvars here is j, which doesn't iterate as a polynomial function of the canonical induction variable. This may require a dedicated sign-extension elimination pass.
On Mac OS on x86-32, llvm-gcc produces faster code than gcc 4.2:
$ time ./llvm.exe count 6000000 0.973u 0.001s 0:00.97 100.0% 0+0k 0+0io 0pf+0w $ time ./llvm.exe count 6000000 0.975u 0.001s 0:00.97 100.0% 0+0k 0+0io 0pf+0w
$ time ./gcc.exe count 6000000 1.059u 0.001s 0:01.06 99.0% 0+0k 0+0io 0pf+0w $ time ./gcc.exe count 6000000 1.057u 0.001s 0:01.05 100.0% 0+0k 0+0io 0pf+0w
However, on X86-64 we are slower.
Looking at the x86-64 code for pnpoly, I see some extraneous (and repetitive) sign extends in the llvm code, which are likely to be the problem.
movsd (%rdx,%r8,8), %xmm2
ucomisd %xmm2, %xmm1
jb LBB1_4
movslq %ecx, %r9
movsd (%rdx,%r9,8), %xmm3
ucomisd %xmm1, %xmm3
ja LBB1_6
LBB1_4: ## %bb2 movslq %ecx, %r9 ucomisd (%rdx,%r9,8), %xmm1 jb LBB1_8
ucomisd %xmm1, %xmm2
jbe LBB1_8
LBB1_6: ## %bb4 movsd (%rsi,%r8,8), %xmm3 movslq %ecx, %rcx movsd (%rsi,%rcx,8), %xmm4 ...
Extended Description
This is a small benchmark written in C. nicholas on #llvm has said "feel free to file it in bugzilla for further analysis".
I think this is the origin of this benchmark: http://openmap.bbn.com/~kanderso/performance/postscript/in-poly.ps
I have cleaned up a little the C code:
// C code, npoly.c
include "stdio.h"
// return true if (x,y) point is inside the given polygon int pnpoly(int npol, const double xp, const double yp, double x, double y) { int i, j, c = 0; for (i = 0, j = npol-1; i < npol; j = i++) if ((((yp[i] <= y) && (y < yp[j])) || ((yp[j] <= y) && (y < yp[i]))) && (x < (xp[j] - xp[i]) * (y - yp[i]) / (yp[j] - yp[i]) + xp[i])) c = !c;
}
int main() { int npol = 20; double xp[20] = {0.0,1.0,1.0,0.0,0.0,1.0,-0.5,-1.0,-1.0,-2.0, -2.5,-2.0,-1.5,-.5,1.0,1.0,0.0,-0.5,-1.0,-.5}; double yp[20] = {0.0,0.0,1.0,1.0,2.0,3.0,2.0,3.0,0.0,-.5, -1.0,-1.5,-2.0,-2.0,-1.5,-1.0,-.5,-1.0,-1.0,-.5}; int i; int count = 0; for (i = 0; i < 1000000; i++) { if (pnpoly(npol, xp, yp, 0.5, 0.5)) count++; if (pnpoly(npol, xp, yp, 0.5, 1.5)) count++; if (pnpoly(npol, xp, yp, -0.5, 1.5)) count++; if (pnpoly(npol, xp, yp, 0.75, 2.25)) count++; if (pnpoly(npol, xp, yp, 0.0, 2.01)) count++; if (pnpoly(npol, xp, yp, -0.5, 2.5)) count++; if (pnpoly(npol, xp, yp, -1.0, -0.5)) count++; if (pnpoly(npol, xp, yp, -1.5, 0.5)) count++; if (pnpoly(npol, xp, yp, -2.25, -1.0)) count++; if (pnpoly(npol, xp, yp, 0.5, -0.25)) count++; if (pnpoly(npol, xp, yp, 0.5, -1.25)) count++; if (pnpoly(npol, xp, yp, -0.5, -2.5)) count++; } printf("count %d \n", count);
}
Code compiled with (with gcc and llvm-gcc): llvm-gcc -Wall -O3 -s -fomit-frame-pointer -msse3 -march=native -ffast-math
Using -ftree-vectorizer-verbose=5 Says GCC: note: vectorized 0 loops in function.
Timings, best of 3, seconds: GCC: 0.87 LLVM-GCC: 1.24
(Feel free to increase the loop count if you have a faster PC.)
Compilers: LLVM 2.5, V.4.2.1 (Based on Apple Inc. build 5636) (LLVM build) gcc version 4.3.3-dw2-tdm-1 (GCC)
Windows Vista 32 bit on Celeron 2.13 MHz.
I have compiled the code with the daily build of LDC too (a D version) that uses the latest LLVM 2.6, with similar results.