llvmbot commented 15 years ago


Bugzilla Link	5165
Version	2.5
OS	Windows XP
Reporter	LLVM Bugzilla Contributor
CC	@atrick,@lattner,@sunfishcode

Extended Description

This is a small benchmark written in C. nicholas on #llvm has said "feel free to file it in bugzilla for further analysis".

I think this is the origin of this benchmark: http://openmap.bbn.com/~kanderso/performance/postscript/in-poly.ps

I have cleaned up a little the C code:

// C code, npoly.c

include "stdio.h"

// return true if (x,y) point is inside the given polygon int pnpoly(int npol, const double xp, const double yp, double x, double y) { int i, j, c = 0; for (i = 0, j = npol-1; i < npol; j = i++) if ((((yp[i] <= y) && (y < yp[j])) || ((yp[j] <= y) && (y < yp[i]))) && (x < (xp[j] - xp[i]) * (y - yp[i]) / (yp[j] - yp[i]) + xp[i])) c = !c;

return c;

}

int main() { int npol = 20; double xp[20] = {0.0,1.0,1.0,0.0,0.0,1.0,-0.5,-1.0,-1.0,-2.0, -2.5,-2.0,-1.5,-.5,1.0,1.0,0.0,-0.5,-1.0,-.5}; double yp[20] = {0.0,0.0,1.0,1.0,2.0,3.0,2.0,3.0,0.0,-.5, -1.0,-1.5,-2.0,-2.0,-1.5,-1.0,-.5,-1.0,-1.0,-.5}; int i; int count = 0; for (i = 0; i < 1000000; i++) { if (pnpoly(npol, xp, yp, 0.5, 0.5)) count++; if (pnpoly(npol, xp, yp, 0.5, 1.5)) count++; if (pnpoly(npol, xp, yp, -0.5, 1.5)) count++; if (pnpoly(npol, xp, yp, 0.75, 2.25)) count++; if (pnpoly(npol, xp, yp, 0.0, 2.01)) count++; if (pnpoly(npol, xp, yp, -0.5, 2.5)) count++; if (pnpoly(npol, xp, yp, -1.0, -0.5)) count++; if (pnpoly(npol, xp, yp, -1.5, 0.5)) count++; if (pnpoly(npol, xp, yp, -2.25, -1.0)) count++; if (pnpoly(npol, xp, yp, 0.5, -0.25)) count++; if (pnpoly(npol, xp, yp, 0.5, -1.25)) count++; if (pnpoly(npol, xp, yp, -0.5, -2.5)) count++; } printf("count %d \n", count);

return 0;

}

Code compiled with (with gcc and llvm-gcc): llvm-gcc -Wall -O3 -s -fomit-frame-pointer -msse3 -march=native -ffast-math

Using -ftree-vectorizer-verbose=5 Says GCC: note: vectorized 0 loops in function.

Timings, best of 3, seconds: GCC: 0.87 LLVM-GCC: 1.24

(Feel free to increase the loop count if you have a faster PC.)

Compilers: LLVM 2.5, V.4.2.1 (Based on Apple Inc. build 5636) (LLVM build) gcc version 4.3.3-dw2-tdm-1 (GCC)

Windows Vista 32 bit on Celeron 2.13 MHz.

I have compiled the code with the daily build of LDC too (a D version) that uses the latest LLVM 2.6, with similar results.

llvmbot commented 11 years ago

The problem is still present in LLVM 3.3-almost-svn. Now the run-time is the same 0.86 for GCC but now compiling a D version of the code with ldc2 it runs in 1.12 (instead of 1.24 as in past).

This is the D version:

import core.stdc.stdio;

// return true if (x,y) point is inside the given polygon bool pnpoly(size_t N) (in size_t npol, in ref double[N] xp, in ref double[N] yp, in double x, in double y) pure nothrow { bool c = false;

for (size_t i = 0, j = npol - 1; i < npol; j = i++)
    if ((((yp[i] <= y) && (y < yp[j])) ||
        ((yp[j] <= y) && (y < yp[i]))) &&
        (x < (xp[j] - xp[i]) * (y - yp[i]) / (yp[j] - yp[i]) + xp[i]))
        c = !c;

return c;

}

void main() { int npol = 20; immutable double[20] xp = [0.0,1.0,1.0,0.0,0.0,1.0,-0.5,-1.0,-1.0,-2.0, -2.5,-2.0,-1.5,-.5,1.0,1.0,0.0,-0.5,-1.0,-.5]; immutable double[20] yp = [0.0,0.0,1.0,1.0,2.0,3.0,2.0,3.0,0.0,-.5, -1.0,-1.5,-2.0,-2.0,-1.5,-1.0,-.5,-1.0,-1.0,-.5];

int count = 0;
for (size_t i = 0; i < 1_000_000; i++) {
    if (pnpoly(npol, xp, yp, 0.5, 0.5))    count++;
    if (pnpoly(npol, xp, yp, 0.5, 1.5))    count++;
    if (pnpoly(npol, xp, yp, -0.5, 1.5))   count++;
    if (pnpoly(npol, xp, yp, 0.75, 2.25))  count++;
    if (pnpoly(npol, xp, yp, 0.0, 2.01))   count++;
    if (pnpoly(npol, xp, yp, -0.5, 2.5))   count++;
    if (pnpoly(npol, xp, yp, -1.0, -0.5))  count++;
    if (pnpoly(npol, xp, yp, -1.5, 0.5))   count++;
    if (pnpoly(npol, xp, yp, -2.25, -1.0)) count++;
    if (pnpoly(npol, xp, yp, 0.5, -0.25))  count++;
    if (pnpoly(npol, xp, yp, 0.5, -1.25))  count++;
    if (pnpoly(npol, xp, yp, -0.5, -2.5))  count++;
}

printf("Count %d \n", count);

}

And the asm produced by ldc2 (loop only):

LBB1_2: movl %esi, %ebx movl %edx, %esi movl 36(%esp), %edx movsd (%edx,%ebx,8), %xmm2 movsd (%ebp), %xmm3 ucomisd %xmm3, %xmm1 jb LBB1_4 ucomisd %xmm1, %xmm2 ja LBB1_6 .align 16, 0x90 LBB1_4: ucomisd %xmm2, %xmm1 jb LBB1_8 movsd (%ebp), %xmm3 ucomisd %xmm1, %xmm3 jbe LBB1_8 LBB1_6: movapd %xmm1, %xmm6 subsd %xmm3, %xmm6 movl 40(%esp), %edx movsd (%edx,%ebx,8), %xmm4 movsd (%edi), %xmm5 subsd %xmm5, %xmm4 mulsd %xmm6, %xmm4 subsd %xmm3, %xmm2 divsd %xmm2, %xmm4 addsd %xmm5, %xmm4 ucomisd %xmm0, %xmm4 jbe LBB1_8 xorb $1, %al .align 16, 0x90 LBB1_8: addl $8, %edi addl $8, %ebp decl %ecx leal 1(%esi), %edx jne LBB1_2

And this is the asm produced by gcc 4.8.0 (gcc -std=c99 -Ofast -msse3 -flto -mfpmath=sse -march=native) (loop only):

L15: sall $3, %ebx comisd %xmm3, %xmm2 jae L9 .p2align 4,,7 L5: leal 1(%edx), %ecx cmpl %esi, %ecx je L14 L19: movl %edx, %ebx movl %ecx, %edx movapd %xmm1, %xmm3 movsd (%edi,%edx,8), %xmm1 comisd %xmm1, %xmm2 jb L15 L18: comisd %xmm2, %xmm3 jbe L5 sall $3, %ebx L9: movl %eax, %ecx movsd 0(%ebp,%edx,8), %xmm4 movsd 0(%ebp,%ebx), %xmm0 xorl $1, %ecx subsd %xmm4, %xmm0 movapd %xmm2, %xmm6 subsd %xmm1, %xmm3 subsd %xmm1, %xmm6 mulsd %xmm6, %xmm0 divsd %xmm3, %xmm0 addsd %xmm4, %xmm0 comisd %xmm5, %xmm0 cmova %ecx, %eax leal 1(%edx), %ecx cmpl %esi, %ecx jne L19

sunfishcode commented 14 years ago

Shouldn't the sdisel sign- / zero- hack help here?

j is a PHI, so sdisel won't handle it.

llvmbot commented 15 years ago

Shouldn't the sdisel sign- / zero- hack help here?

sunfishcode commented 15 years ago

The thing that's tripping up indvars here is j, which doesn't iterate as a polynomial function of the canonical induction variable. This may require a dedicated sign-extension elimination pass.

lattner commented 15 years ago

On Mac OS on x86-32, llvm-gcc produces faster code than gcc 4.2:

$ time ./llvm.exe count 6000000 0.973u 0.001s 0:00.97 100.0% 0+0k 0+0io 0pf+0w $ time ./llvm.exe count 6000000 0.975u 0.001s 0:00.97 100.0% 0+0k 0+0io 0pf+0w

$ time ./gcc.exe count 6000000 1.059u 0.001s 0:01.06 99.0% 0+0k 0+0io 0pf+0w $ time ./gcc.exe count 6000000 1.057u 0.001s 0:01.05 100.0% 0+0k 0+0io 0pf+0w

However, on X86-64 we are slower.

Looking at the x86-64 code for pnpoly, I see some extraneous (and repetitive) sign extends in the llvm code, which are likely to be the problem.

movsd   (%rdx,%r8,8), %xmm2
ucomisd %xmm2, %xmm1
jb  LBB1_4

BB#3: ## %bb1

movslq  %ecx, %r9
movsd   (%rdx,%r9,8), %xmm3
ucomisd %xmm1, %xmm3
ja  LBB1_6

LBB1_4: ## %bb2 movslq %ecx, %r9 ucomisd (%rdx,%r9,8), %xmm1 jb LBB1_8

ucomisd %xmm1, %xmm2
jbe LBB1_8

LBB1_6: ## %bb4 movsd (%rsi,%r8,8), %xmm3 movslq %ecx, %rcx movsd (%rsi,%rcx,8), %xmm4 ...

llvm / llvm-project

npoly benchmark #5537

Extended Description

include "stdio.h"

BB#3: ## %bb1