AngusJohnson / Clipper2

Polygon Clipping and Offsetting - C++, C# and Delphi
Boost Software License 1.0
1.47k stars 270 forks source link

FP exception in GetIntersectPoint() #317

Closed bahvalo closed 1 year ago

bahvalo commented 1 year ago

Executing the following code results in a floating point exception in function Point64 GetIntersectPoint(const Active& e1, const Active& e2). Namely, when a double value is cast to int64_t, it appears to be outside the range of int64_t.

The coordinates of the polygon vertices are within the range -4.6e18 ... 4.6e18. According to the documentation, that is a valid input.

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <fenv.h>
#include <glob.h>
#include "clipper2/clipper.h"
using namespace Clipper2Lib;

#define ADD(A,X,Y) A.push_back(Point64(int64_t(X), int64_t(Y)))

int main(int, char**) {
    feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );

    Path64 a, b;
    ADD(a,          -7009388980LL,     1671079308836362LL);
    ADD(a,  -576460711222920704LL,     1671063793410802LL);
    ADD(a,  -864690986154904320LL,  -286977111392363424LL);
    ADD(a, -1152921411648951168LL,  -575625207815834176LL);
    ADD(a,  -864691057648438912LL,  -864273328105026304LL);
    ADD(a,  -576460762799345152LL, -1152921434302619648LL);
    ADD(a,         -12880478468LL, -1152921451122536832LL);
    ADD(a,   576460787720869760LL, -1152921504606846976LL);
    ADD(a,   864691268131862400LL,  -864273433561362176LL);
    ADD(a,  1152921504606846848LL,  -575625211080225088LL);
    ADD(a,   864691062448491520LL,  -286977010832613792LL);
    ADD(a,   576460654512679296LL,     1671130833237401LL);

    ADD(b,   -54234486065476976LL,  1151250415789406208LL);
    ADD(b,  -612617068314194048LL,  1152921504606846976LL);
    ADD(b,  -864691197085273984LL,   867615548203339008LL);
    ADD(b, -1152921504606846848LL,   578967376389736064LL);
    ADD(b,  -864691140504451968LL,   290319223463759488LL);
    ADD(b,  -576460711222912256LL,     1671063793410802LL);
    ADD(b,          -7009388980LL,     1671079308836362LL);
    ADD(b,   576460654512679296LL,     1671130833237401LL);
    ADD(b,   864690908098943872LL,   290319324023499392LL);
    ADD(b,  1100313577101746304LL,   576428327042791872LL);
    ADD(b,   785779376874207360LL,   863806873623168128LL);
    ADD(b,   487696647658790656LL,  1150382388220066304LL);

    const int m = 10;
    for(size_t i=0; i<a.size(); i++) { a[i].x /= m; a[i].y /= m; b[i].x /= m, b[i].y /= m; }

    Paths64 AA; AA.push_back(a);
    Paths64 BB; BB.push_back(b);
    Paths64 solution = Intersect(Paths64(AA), Paths64(BB), FillRule::NonZero);
}

Division by ten is not essential.

alexisnaveros commented 1 year ago

Sorry for posting four messages in a row, I had another quick look at my log and I think I see the problem.

The math in GetIntersectPoint() is inaccurate, it computes an intersection point that's actually outside the "bounding box" of the two edges being intersected. I know you detect that case in AddNewIntersectNode() and correct by snapping to an existing vertex.

But the math reusing these precomputed dx can get a little inaccurate by design. So I switched to a more typical line-line intersection:

static inline int mathVertexLineLineIntersection( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int64_t ln0a, ln0b, ln1a, ln1b;
  double ln0c, ln1c, det;
  ln0a = l0p1y - l0p0y;
  ln0b = l0p0x - l0p1x;
  ln1a = l1p1y - l1p0y;
  ln1b = l1p0x - l1p1x;
  ln0c = -( (double)ln0a * (double)l0p0x ) - ( (double)ln0b * (double)l0p0y );
  ln1c = -( (double)ln1a * (double)l1p0x ) - ( (double)ln1b * (double)l1p0y );
  det = ( (double)ln1a * (double)ln0b ) - ( (double)ln0a * (double)ln1b );
  if( det != 0.0 )
  {
    hitpt[0] = (int64_t)nearbyint( ( ( (double)ln1b * ln0c ) - ( (double)ln0b * ln1c ) ) / det );
    hitpt[1] = (int64_t)nearbyint( ( ( (double)ln0a * ln1c ) - ( (double)ln1a * ln0c ) ) / det );
    return 1;
  }
  return 0;
}

And the result seems correct in all 21 cases, I get S = 3.94191491330444989287e+36.

AngusJohnson commented 1 year ago

Thank you Alexis. That's very helpful and might solve this problem.

alexisnaveros commented 1 year ago

Cool, you are welcome!

Speaking of numerical precision, I think we can still do a little better by "moving" the origin to the average of the 4 points, to reduce the magnitude of the floating point math, before adding the origin back. This should be a little better for intersection points far from the origin:

static inline int mathVertexLineLineIntersection( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int64_t ln0a, ln0b, ln1a, ln1b;
  int64_t originx, originy;
  double ln0c, ln1c, det;
  ln0a = l0p1y - l0p0y;
  ln0b = l0p0x - l0p1x;
  ln1a = l1p1y - l1p0y;
  ln1b = l1p0x - l1p1x;
  originx = (int64_t)( 0.25 * ( (double)l0p0x + (double)l0p1x + (double)l1p0x + (double)l1p1x ) );
  originy = (int64_t)( 0.25 * ( (double)l0p0y + (double)l0p1y + (double)l1p0y + (double)l1p1y ) );
  ln0c = -( (double)ln0a * (double)( l0p0x - originx ) ) - ( (double)ln0b * (double)( l0p0y - originy ) );
  ln1c = -( (double)ln1a * (double)( l1p0x - originx ) ) - ( (double)ln1b * (double)( l1p0y - originy ) );
  det = ( (double)ln1a * (double)ln0b ) - ( (double)ln0a * (double)ln1b );
  if( det != 0.0 )
  {
    hitpt[0] = originx + (int64_t)nearbyint( ( ( (double)ln1b * ln0c ) - ( (double)ln0b * ln1c ) ) / det );
    hitpt[1] = originy + (int64_t)nearbyint( ( ( (double)ln0a * ln1c ) - ( (double)ln1a * ln0c ) ) / det );
    return 1;
  }
  return 0;
}
AngusJohnson commented 1 year ago

@alexisnaveros With my testing, if I remove the nearby function call, it seems that your intersection function performs no better than mine WRT either performance or accuracy.

And I had intentionally removed integer rounding from my own function as it significantly degraded performance. Nevertheless I'm prepared to reconsider that.

If you want an accurate Area(), you are going to need a different approach. The same algorithm could be kept by switching to Shewchuk summation, for example.

Are you able to share the code for your Area function?

sergey-239 commented 1 year ago

Yes, but given that these non-overlapping regions are extremely long and thin, that's not unexpected.

Did I understand correctly that Clipper is not intended to obtain an accurate value of the intersection area (say, with the accuracy 1e24)?

For my purpose, it will be better to get more accurate results compromising the speed (say, using 128-bit arithmetic model). But I understand that there may be different applications with different criteria.

In all cases, thank you for your library. Even if you consider the accuracy of 2e33 to be tolerable, your library is helpful for me.

Dears,

I almost done with converting linux signals to c++ exceptions (the FPU faults are reported using signals in unix world). It appeared to be an interesting journey. However I am still not completely sure that it would work reliably enough to be used in production version, but definitely it could be used in CI tests using c++ exceptions or setjmp/longjmp technique in a worst case. There are some problems with IA-64 ABI itself, as it implemented in GCC, for stack unwinding. If I manage to clear a few still open questions on my list, then I will be able to define the safe to use conditions at least. Unfortunately, today I hit a bug in gdb that is already fixed in its upstream but not in Fedora, so I got a bit delayed (I spent half a day to figure out why gdb started to coredump suddenly). I did not experiment with msvc yet, but M$ claims that SEH and C++ exceptions are interoperable when using /EHa compiler switch.

If I eventually succeed with problems I still have for a linux build then there will be a possible solution of the problem of "limited" accuracy of the present implementation with a try-catch block that first executes the GetIntersectPoint calculations using 64-bit math (both, int and fp) in the try with relaxed (or completely removed) checks for border cases. If the overflow occurs then the catch block calculates using already implemented techniques with extra checks and, possibly, 128-bit math. Thus there will be a tradeoff some overhead for exception handling for the out-of-border cases that should be rare and better performance for the normal situations.

sergey-239 commented 1 year ago

@AngusJohnson BTW, why you don't use +-Inf for dx of vertical lines?

AngusJohnson commented 1 year ago

@AngusJohnson BTW, why you don't use +-Inf for dx of vertical lines?

I'm pretty sure there was a good reason but I'm not sure what it was now. (It was over 10yrs ago when I started Clipper.) However I suspect it was to do with an efficient way to detect and differentiate left and right horizontal (because there's no negative 0 if I was using the conventional dy/dx rather then dx/dy).

WRT FPU overflow detection, I think the CheckCastInt64 function (see above) works well enough. Do you see special advantages of using your own as yet unpublished fault detection code?

sergey-239 commented 1 year ago

@AngusJohnson BTW, why you don't use +-Inf for dx of vertical lines?

(because there's not negative 0 if I was using the conventional dy/dx rather then dx/dy).

Incorrect. there are -0 and +0.

WRT FPU overflow detection, I think the CheckCastInt64 function (see above) works well enough. Do you see special advantages of using your own as yet unpublished fault detection code? The possible optimisation: in the most cases all these extra checks could be omitted and necessary only when straight-forward math does overflow.

This is only one of possible uses an I do not insist on it, just mentioned an option. The initial idea was to extend CI tests with FPU faults detection, so all shape samples from this topic could be included into CI for regression detection.

AngusJohnson commented 1 year ago

Incorrect. there are -0 and +0.

There may be in C++, but not in Delphi and I suspect not in C#.

Edit: correction, -0.0 is permitted in Delphi too, I just wasn't aware that this was so.

sergey-239 commented 1 year ago

Incorrect. there are -0 and +0.

There may be in C++, but not in Delphi and I suspect not in C#.

It's a property of FP types representation, not of a language. OK, I got an answer for why you use +-numeric_limits<double>.max(). This is enough for now.

AngusJohnson commented 1 year ago

It's a property of FP types representation, not of a language.

Yep, see my edit above.😁

alexisnaveros commented 1 year ago

@AngusJohnson

With my testing, if I remove the nearby function call, it seems that your intersection function performs no better than mine WRT either performance or accuracy.

I made the tweaks in a clipper.engine.cpp I had from a couple days ago, the header says Date : 28 October 2022. The numerical precision and results improved as described above. Here's that actual tweaked file: http://www.rayforce.net/clipper2stuff/clipper.engine.cpp My changes are identifiable under the comment /* FOO */ The fix seems to work both without optimization and with -O3 -ffast-math -mtune=native -march=native

Can you test that file as such?...

Are you able to share the code for your Area function?

Sure. But first, an important note about Shewchuk summation, here's my file: http://www.rayforce.net/clipper2stuff/mathshewchuk.h

For that math to work, the compiler must NOT be allowed to reorder any math operation. It must not be allowed to transform (a+b)+c into a+(b+c). In that header file, you can see I disable associative math and such for GCC (also works for clang), using a function attribute. I have no idea how it works with other compilers (disabling all optimization everywhere is not a good option).

Then, in your Area() function, you can: Replace a = 0.0; with mathShewchukSum sum; mathShewchukInit( &sum ); Replace a += foo; with mathShewchukAdd( &sum, foo ); Replace return a; with return mathShewchukTotal( &sum );

And there you go, kind-of-infinite precision summation.

alexisnaveros commented 1 year ago

@sergey-239

@AngusJohnson BTW, why you don't use +-Inf for dx of vertical lines?

Infinities, like other abnormal numbers (including denormals), have abysmal performance on almost all CPUs. It varies between a couple times slower and several hundred times slower.

That's also why there's a x86/amd64 control register flag for "denormals are zero", flushing any denormal (really tiny number, when the exponent part of IEEE-754 is already at minimum) to zero.

sergey-239 commented 1 year ago

@alexisnaveros

Infinities, like other abnormal numbers (including denormals), have abysmal performance on almost all CPUs. It varies between a couple times slower and several hundred times slower.

It is true for 387 ISA and not for SSE2. I believe you hardly would pick Pentium2 nowadays for these tasks.

#include <chrono>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>
#include <algorithm>

int main()
{
    std::cout.imbue(std::locale(""));
    std::cout << std::fixed << std::setprecision(1);
    auto eval = [](auto fun) {
        const auto t1 = std::chrono::high_resolution_clock::now();
        const auto name = fun();
        const auto t2 = std::chrono::high_resolution_clock::now();
        const std::chrono::duration<double, std::milli> ms = t2 - t1;
        std::cout << std::setw(28) << std::left << name << "\t time: " << ms.count() << " ms\n";
    };
    {
        std::vector<double> v(100'000'007, 0.1);

        eval([&v]{
            double m = std::numeric_limits<double>::max();
        std::for_each(v.begin(), v.end(), [m](double &d){ d *= m; });
            return "*= std::numeric_limits<double>.max()";
        });
    }
    {
        std::vector<double> v(100'000'007, 0.1);

        eval([&v]{
            double m = std::numeric_limits<double>::infinity();
        std::for_each(v.begin(), v.end(), [m](double &d){ d *= m; });
            return "*= std::numeric_limits<double>.infinity()";
        });
    }
    {
        std::vector<double> v(100'000'007, 0.1);

        eval([&v]{
            double m = std::numeric_limits<double>::max();
        std::for_each(v.begin(), v.end(), [m](double &d){ d += m; });
            return "+= std::numeric_limits<double>.max()";
        });
    }
    {
        std::vector<double> v(100'000'007, 0.1);

        eval([&v]{
            double m = std::numeric_limits<double>::infinity();
        std::for_each(v.begin(), v.end(), [m](double &d){ d += m; });
            return "+= std::numeric_limits<double>.infinity()";
        });
    }
    {
        std::vector<double> v(100'000'007, 0.1);

        eval([&v]{
            double m = std::numeric_limits<double>::max();
        std::for_each(v.begin(), v.end(), [m](double &d){ d /= m; });
            return "/= std::numeric_limits<double>.max()";
        });
    }
    {
        std::vector<double> v(100'000'007, 0.1);

        eval([&v]{
            double m = std::numeric_limits<double>::infinity();
        std::for_each(v.begin(), v.end(), [m](double &d){ d /= m; });
            return "/= std::numeric_limits<double>.infinity()";
        });
    }
}

Compiled with g++ -mfpmath=sse -std=c++17 -O1 Output:

*= std::numeric_limits<double>.max()     time: 87.0 ms
*= std::numeric_limits<double>.infinity()    time: 111.7 ms
+= std::numeric_limits<double>.max()     time: 86.0 ms
+= std::numeric_limits<double>.infinity()    time: 79.1 ms
/= std::numeric_limits<double>.max()     time: 5,227.6 ms
/= std::numeric_limits<double>.infinity()    time: 143.5 ms
alexisnaveros commented 1 year ago

It is true for 387 ISA and not for SSE2. I believe you hardly would pick Pentium2 nowadays for these tasks.

@sergey-239 I stand corrected! I also just did a quick test around. SSE Infinities are a bit slower on my old Opteron 6380 (Piledriver era), and they are the same speed on more modern chips. Denormals still suffer on modern chips, but not as much as on the Opteron (160x slower, yuck!), and that wasn't the topic. Thanks!

AngusJohnson commented 1 year ago

I made the tweaks in a clipper.engine.cpp I had from a couple days ago, the header says Date : 28 October 2022. The numerical precision and results improved as described above. Here's that actual tweaked file:

Thanks. I will have a good look again, I just need a day or two to enjoy the weekend and recharge.

alexisnaveros commented 1 year ago

Thanks. I will have a good look again, I just need a day or two to enjoy the weekend and recharge.

I'm getting seriously off-topic, but thanks for everything you do.

I actually admire that kind of dedication; I also would have a ton of code to share, but I guess I'm just scared of getting stuck doing tech support (or I don't want that extra source of stress on top?). Either way, thanks, it's well appreciated. I applaud you.

AngusJohnson commented 1 year ago

I will have a good look again, I just need a day or two to enjoy the weekend and recharge.

Alexis, I've just had a much needed break and had another look at you code, and I'm now seeing the improvements you were promising 😁. I'm not sure what I was doing wrong before, but evidently I needed the break to clear my head.

Here's what I'm hoping 🤞 is the final rendition of GetIntersectPoint():

    bool GetIntersectPoint(const Active& e1, const Active& e2, Point64& ip)
    {
        // precondition: neither edge is horizontal
        //assert(!IsHorizontal(e1) && !IsHorizontal(e2));
        if (e1.top.x == e1.bot.x)
        {
            if (e2.top.x == e2.bot.x) return false;
            double b2 = e2.bot.y * e2.dx - e2.bot.x;
            ip.x = e1.bot.x;
            ip.y = CheckCastInt64((e1.bot.x + b2) / e2.dx);
        }
        else if (e2.top.x == e2.bot.x)
        {
            double b1 = e1.bot.y * e1.dx - e1.bot.x;
            ip.x = e2.bot.x;
            ip.y = CheckCastInt64((e2.bot.x + b1) / e1.dx);
        }   
        else
        {
            double ln0a = static_cast<double>(e1.top.y - e1.bot.y);
            double ln0b = static_cast<double>(e1.bot.x - e1.top.x);
            double ln1a = static_cast<double>(e2.top.y - e2.bot.y);
            double ln1b = static_cast<double>(e2.bot.x - e2.top.x);
            double ln0c = ln0a * e1.bot.x + ln0b * e1.bot.y;
            double ln1c = ln1a * e2.bot.x + ln1b * e2.bot.y;
            double det = ln1a * ln0b - ln0a * ln1b;
            if (det == 0.0) return false;
            ip.x = CheckCastInt64((ln0b * ln1c - ln1b * ln0c) / det);
            ip.y = CheckCastInt64((ln1a * ln0c - ln0a * ln1c) / det);
        }
        return true;
    }
alexisnaveros commented 1 year ago

Great!

Although... I think you should also consider that origin shift I suggested. :)

Let's try it out:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <float.h>
#include <math.h>

/* Without origin shift */
static inline int mathVertexLineLineIntersection_A( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int64_t ln0a, ln0b, ln1a, ln1b;
  double ln0c, ln1c, det;
  ln0a = l0p1y - l0p0y;
  ln0b = l0p0x - l0p1x;
  ln1a = l1p1y - l1p0y;
  ln1b = l1p0x - l1p1x;
  ln0c = -( (double)ln0a * (double)l0p0x ) - ( (double)ln0b * (double)l0p0y );
  ln1c = -( (double)ln1a * (double)l1p0x ) - ( (double)ln1b * (double)l1p0y );
  det = ( (double)ln1a * (double)ln0b ) - ( (double)ln0a * (double)ln1b );
  if( det != 0.0 )
  {
    hitpt[0] = (int64_t)nearbyint( ( ( (double)ln1b * ln0c ) - ( (double)ln0b * ln1c ) ) / det );
    hitpt[1] = (int64_t)nearbyint( ( ( (double)ln0a * ln1c ) - ( (double)ln1a * ln0c ) ) / det );
    return 1;
  }
  return 0;
}

/* With origin shift */
static inline int mathVertexLineLineIntersection_B( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int64_t ln0a, ln0b, ln1a, ln1b;
  int64_t originx, originy;
  double ln0c, ln1c, det;
  ln0a = l0p1y - l0p0y;
  ln0b = l0p0x - l0p1x;
  ln1a = l1p1y - l1p0y;
  ln1b = l1p0x - l1p1x;
  originx = (int64_t)( 0.25 * ( (double)l0p0x + (double)l0p1x + (double)l1p0x + (double)l1p1x ) );
  originy = (int64_t)( 0.25 * ( (double)l0p0y + (double)l0p1y + (double)l1p0y + (double)l1p1y ) );
  ln0c = -( (double)ln0a * (double)( l0p0x - originx ) ) - ( (double)ln0b * (double)( l0p0y - originy ) );
  ln1c = -( (double)ln1a * (double)( l1p0x - originx ) ) - ( (double)ln1b * (double)( l1p0y - originy ) );
  det = ( (double)ln1a * (double)ln0b ) - ( (double)ln0a * (double)ln1b );
  if( det != 0.0 )
  {
    hitpt[0] = originx + (int64_t)nearbyint( ( ( (double)ln1b * ln0c ) - ( (double)ln0b * ln1c ) ) / det );
    hitpt[1] = originy + (int64_t)nearbyint( ( ( (double)ln0a * ln1c ) - ( (double)ln1a * ln0c ) ) / det );
    return 1;
  }
  return 0;
}

#define OFFSET_X (3000000000000000001)
#define OFFSET_Y (4000000000000000003)

int main()
{
  int64_t hitpt_A[2];
  int64_t hitpt_B[2];
  int64_t l0p0x, l0p0y, l0p1x, l0p1y, l1p0x, l1p0y, l1p1x, l1p1y;

  /* The two line segments form a X shape with the intersection right at OFFSET_X,OFFSET_Y */
  l0p0x = OFFSET_X - 100;
  l0p0y = OFFSET_Y - 10;
  l0p1x = OFFSET_X + 100;
  l0p1y = OFFSET_Y + 10;
  l1p0x = OFFSET_X - 40;
  l1p0y = OFFSET_Y - 90;
  l1p1x = OFFSET_X + 40;
  l1p1y = OFFSET_Y + 90;

  mathVertexLineLineIntersection_A( hitpt_A, l0p0x, l0p0y, l0p1x, l0p1y, l1p0x, l1p0y, l1p1x, l1p1y );
  mathVertexLineLineIntersection_B( hitpt_B, l0p0x, l0p0y, l0p1x, l0p1y, l1p0x, l1p0y, l1p1x, l1p1y );

  printf( "We should get           : %lld %lld\n", (long long)OFFSET_X, (long long)OFFSET_Y );
  printf( "Hit without origin shift: %lld %lld\n", (long long)hitpt_A[0], (long long)hitpt_A[1] );
  printf( "Hit with origin shift   : %lld %lld\n", (long long)hitpt_B[0], (long long)hitpt_B[1] );

  return 1;
}

Output:

We should get           : 3000000000000000001 4000000000000000003
Hit without origin shift: 3000000000000000000 4000000000000000000
Hit with origin shift   : 3000000000000000001 4000000000000000003

EDIT: Actually, we can still do better than that, scrap that. Better version coming up in next post.

alexisnaveros commented 1 year ago

Another version with new and improved flavor!

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <float.h>
#include <math.h>

/* Without origin shift */
static inline int mathVertexLineLineIntersection_A( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int64_t ln0a, ln0b, ln1a, ln1b;
  double ln0c, ln1c, det;
  ln0a = l0p1y - l0p0y;
  ln0b = l0p0x - l0p1x;
  ln1a = l1p1y - l1p0y;
  ln1b = l1p0x - l1p1x;
  ln0c = -( (double)ln0a * (double)l0p0x ) - ( (double)ln0b * (double)l0p0y );
  ln1c = -( (double)ln1a * (double)l1p0x ) - ( (double)ln1b * (double)l1p0y );
  det = ( (double)ln1a * (double)ln0b ) - ( (double)ln0a * (double)ln1b );
  if( det != 0.0 )
  {
    hitpt[0] = (int64_t)nearbyint( ( ( (double)ln1b * ln0c ) - ( (double)ln0b * ln1c ) ) / det );
    hitpt[1] = (int64_t)nearbyint( ( ( (double)ln0a * ln1c ) - ( (double)ln1a * ln0c ) ) / det );
    return 1;
  }
  return 0;
}

/* With regular origin shift */
static inline int mathVertexLineLineIntersection_B( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int64_t ln0a, ln0b, ln1a, ln1b;
  int64_t originx, originy;
  double ln0c, ln1c, det;
  ln0a = l0p1y - l0p0y;
  ln0b = l0p0x - l0p1x;
  ln1a = l1p1y - l1p0y;
  ln1b = l1p0x - l1p1x;
  originx = (int64_t)( 0.25 * ( (double)l0p0x + (double)l0p1x + (double)l1p0x + (double)l1p1x ) );
  originy = (int64_t)( 0.25 * ( (double)l0p0y + (double)l0p1y + (double)l1p0y + (double)l1p1y ) );
  ln0c = -( (double)ln0a * (double)( l0p0x - originx ) ) - ( (double)ln0b * (double)( l0p0y - originy ) );
  ln1c = -( (double)ln1a * (double)( l1p0x - originx ) ) - ( (double)ln1b * (double)( l1p0y - originy ) );
  det = ( (double)ln1a * (double)ln0b ) - ( (double)ln0a * (double)ln1b );
  if( det != 0.0 )
  {
    hitpt[0] = originx + (int64_t)nearbyint( ( ( (double)ln1b * ln0c ) - ( (double)ln0b * ln1c ) ) / det );
    hitpt[1] = originy + (int64_t)nearbyint( ( ( (double)ln0a * ln1c ) - ( (double)ln1a * ln0c ) ) / det );
    return 1;
  }
  return 0;
}

/* With fancy origin shift */
static inline int mathVertexLineLineIntersection_C( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int64_t ln0a, ln0b, ln1a, ln1b;
  int64_t originx, originy;
  double ln0c, ln1c, det;
  ln0a = l0p1y - l0p0y;
  ln0b = l0p0x - l0p1x;
  ln1a = l1p1y - l1p0y;
  ln1b = l1p0x - l1p1x;
  det = ( (double)ln1a * (double)ln0b ) - ( (double)ln0a * (double)ln1b );
  if( det == 0.0 )
    return 0;
  ln0c = -( (double)ln0a * (double)l0p0x ) - ( (double)ln0b * (double)l0p0y );
  ln1c = -( (double)ln1a * (double)l1p0x ) - ( (double)ln1b * (double)l1p0y );
#if 0
  /* Unsafe against overflows! */
  originx = (int64_t)nearbyint( ( ( (double)ln1b * ln0c ) - ( (double)ln0b * ln1c ) ) / det );
  originy = (int64_t)nearbyint( ( ( (double)ln0a * ln1c ) - ( (double)ln1a * ln0c ) ) / det );
#else
  /* Safer, but still unsafe against overflows */
  originx = (int64_t)nearbyint( fmin( fmax( ( ( (double)ln1b * ln0c ) - ( (double)ln0b * ln1c ) ) / det, (double)INT64_MIN ), (double)INT64_MAX ) );
  originy = (int64_t)nearbyint( fmin( fmax( ( ( (double)ln0a * ln1c ) - ( (double)ln1a * ln0c ) ) / det, (double)INT64_MIN ), (double)INT64_MAX ) );
#endif
  ln0c = -( (double)ln0a * (double)( l0p0x - originx ) ) - ( (double)ln0b * (double)( l0p0y - originy ) );
  ln1c = -( (double)ln1a * (double)( l1p0x - originx ) ) - ( (double)ln1b * (double)( l1p0y - originy ) );
  hitpt[0] = originx + (int64_t)nearbyint( ( ( (double)ln1b * ln0c ) - ( (double)ln0b * ln1c ) ) / det );
  hitpt[1] = originy + (int64_t)nearbyint( ( ( (double)ln0a * ln1c ) - ( (double)ln1a * ln0c ) ) / det );
  return 1;
}

#define OFFSET_X (3000000000000000001LL)
#define OFFSET_Y (4000000000000000003LL)

int main()
{
  int64_t hitpt_A[2];
  int64_t hitpt_B[2];
  int64_t hitpt_C[2];
  int64_t l0p0x, l0p0y, l0p1x, l0p1y, l1p0x, l1p0y, l1p1x, l1p1y;

#if 0
  /* The two line segments form a X shape with the intersection right at OFFSET_X,OFFSET_Y */
  l0p0x = OFFSET_X - 100;
  l0p0y = OFFSET_Y - 10;
  l0p1x = OFFSET_X + 100;
  l0p1y = OFFSET_Y + 10;
  l1p0x = OFFSET_X - 40;
  l1p0y = OFFSET_Y - 90;
  l1p1x = OFFSET_X + 40;
  l1p1y = OFFSET_Y + 90;
#else
  /* The two line segments form a / crossed with a long horizontal --- with the intersection right at OFFSET_X,OFFSET_Y */
  l0p0x = OFFSET_X - 100;
  l0p0y = OFFSET_Y - 10;
  l0p1x = OFFSET_X + 100;
  l0p1y = OFFSET_Y + 10;
  l1p0x = OFFSET_X - 6000000000000000000LL;
  l1p0y = OFFSET_Y;
  l1p1x = OFFSET_X + 40;
  l1p1y = OFFSET_Y;
#endif

  mathVertexLineLineIntersection_A( hitpt_A, l0p0x, l0p0y, l0p1x, l0p1y, l1p0x, l1p0y, l1p1x, l1p1y );
  mathVertexLineLineIntersection_B( hitpt_B, l0p0x, l0p0y, l0p1x, l0p1y, l1p0x, l1p0y, l1p1x, l1p1y );
  mathVertexLineLineIntersection_C( hitpt_C, l0p0x, l0p0y, l0p1x, l0p1y, l1p0x, l1p0y, l1p1x, l1p1y );

  printf( "We should get                 : %lld %lld\n", (long long)OFFSET_X, (long long)OFFSET_Y );
  printf( "Hit without origin shift      : %lld %lld\n", (long long)hitpt_A[0], (long long)hitpt_A[1] );
  printf( "Hit with regular origin shift : %lld %lld\n", (long long)hitpt_B[0], (long long)hitpt_B[1] );
  printf( "Hit with fancy origin shift   : %lld %lld\n", (long long)hitpt_C[0], (long long)hitpt_C[1] );

  return 1;
}

Output:

We should get                 : 3000000000000000001 4000000000000000003
Hit without origin shift      : 3000000000000000512 3999999999999999488
Hit with regular origin shift : 2999999999999999744 4000000000000000003
Hit with fancy origin shift   : 3000000000000000001 4000000000000000003

EDIT: Made mathVertexLineLineIntersection_C() safer against overflows.

EDIT EDIT: Could still be safer against overflows, I guess I'll write yet another version.

alexisnaveros commented 1 year ago

Okay, this is done; robust, accurate and safe against overflows.

static inline int mathVertexLineLineIntersection_D( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int64_t ln0a, ln0b, ln1a, ln1b;
  int64_t originx, originy;
  double ln0c, ln1c, det, hitx, hity, originfx, originfy;
  double limitminx, limitminy, limitmaxx, limitmaxy;
  ln0a = l0p1y - l0p0y;
  ln0b = l0p0x - l0p1x;
  ln1a = l1p1y - l1p0y;
  ln1b = l1p0x - l1p1x;
  det = ( (double)ln1a * (double)ln0b ) - ( (double)ln0a * (double)ln1b );
  if( det == 0.0 )
    return 0;
  ln0c = ( (double)ln0a * (double)l0p0x ) + ( (double)ln0b * (double)l0p0y );
  ln1c = ( (double)ln1a * (double)l1p0x ) + ( (double)ln1b * (double)l1p0y );
  hitx = ( ( (double)ln0b * ln1c ) - ( (double)ln1b * ln0c ) ) / det;
  hity = ( ( (double)ln1a * ln0c ) - ( (double)ln0a * ln1c ) ) / det;
  limitminx = fmin( fmin( (double)l0p0x, (double)l0p1x ), fmin( (double)l1p0x, (double)l1p1x ) );
  limitminy = fmin( fmin( (double)l0p0y, (double)l0p1y ), fmin( (double)l1p0y, (double)l1p1y ) );
  limitmaxx = fmax( fmax( (double)l0p0x, (double)l0p1x ), fmax( (double)l1p0x, (double)l1p1x ) );
  limitmaxy = fmax( fmax( (double)l0p0y, (double)l0p1y ), fmax( (double)l1p0y, (double)l1p1y ) );
  originfx = fmin( fmax( hitx, limitminx ), limitmaxx );
  originfy = fmin( fmax( hity, limitminy ), limitmaxy );
  originx = (int64_t)nearbyint( originfx );
  originy = (int64_t)nearbyint( originfy );
  ln0c = ( (double)ln0a * (double)( l0p0x - originx ) ) + ( (double)ln0b * (double)( l0p0y - originy ) );
  ln1c = ( (double)ln1a * (double)( l1p0x - originx ) ) + ( (double)ln1b * (double)( l1p0y - originy ) );
  hitx = ( ( (double)ln0b * ln1c ) - ( (double)ln1b * ln0c ) ) / det;
  hity = ( ( (double)ln1a * ln0c ) - ( (double)ln0a * ln1c ) ) / det;
  if( fmax( fabs( originfx + hitx ), fabs( originfy + hity ) ) >= (double)(INT64_MAX-2) )
    return 0;
  hitpt[0] = originx + (int64_t)nearbyint( hitx );
  hitpt[1] = originy + (int64_t)nearbyint( hity );
  return 1;
}

The only remaining change would be a SSE version for a 2.3x speed boost (this is kind of trivial if you want that).

EDIT: Fixed something.

AngusJohnson commented 1 year ago

Hi again Alexis. My reservation with your proposed code changes is that it's (relatively) time costly. My goal with this function is to return the intersection coordinate that's within one unit/pixel (in both x & y directions) from the exact intersect coordinate, assuming** coordinate values are roughly +/-1e8. Originally I was aiming for the closest coodinate (ie rounded) but ISTM that that additional precision wasn't justified given the relative time cost (and could be achieved more efficiently with ?additional coordinate scaling).

**However, with extremely large coordinates as we've been considering above, IMHO this degree of precision (+/- 1unit) would almost never by required (or achievable without resorting to floating point types > 8 bytes). And ISTM that we're still getting very close to integer precision with coordinates up to +/-1.0e12.

alexisnaveros commented 1 year ago

Hey Angus. Well, perfect accuracy is achievable without special floating point types, it just needs code that's a bit more complicated... and slower, yes.

Perhaps offer a compilation-time switch then? Some people need speed, other people need accuracy and robustness above all (bahvalo above seems to require accuracy ~ so do I; my computations take hours/days, the result is useless if incorrect, and I sure want to switch to Clipper2 eventually).

Well, I think I'll cook up a quick SSE optimized version, and without requiring AVX-512 (int64_t<->double is awkward without _mm_cvtepi64_pd() and _mm_cvtpd_epi64(), but I'll figure out something). Perhaps a 2-3x speed boost will make you change your mind? ;)

AngusJohnson commented 1 year ago

Perhaps offer a compilation-time switch then? Some people need speed, other people need accuracy and robustness above all.

Robustness is a given regardless of the degree of precision required. And that's why I was chasing so hard the overflow errors above. And precision is important too for everyone, up to a point it unacceptably compromises performance. And yes I am now considering a compiler switch as evidently some (?very few) like you do require extremely high (> 1.0e12) degrees of precision.

AngusJohnson commented 1 year ago

One problem I do have is how can I test / verify what is the correct (most accurate) coordinate for a given intersection? Even online calculators don't return the degrees of precision that you seem to require.

alexisnaveros commented 1 year ago

One problem I do have is how can I test / verify what is the correct (most accurate) coordinate for a given intersection? Even online calculators don't return the degrees of precision that you seem to require.

Ah yes, I guess I would check the way I did in the tests above: pick an exact intersection point, extend two arbitrary vectors both ways from that point by an arbitrary magnitude... and see if you recover the original intersection point?

Maybe that version would a better compromise between accuracy and performance?

static inline int mathVertexLineLineIntersection_E( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int64_t ln0a, ln0b, ln1a, ln1b;
  int64_t originx, originy;
  double bb0minx, bb0miny, bb0maxx, bb0maxy, bb1minx, bb1miny, bb1maxx, bb1maxy;
  double ln0c, ln1c, det, hitx, hity, originfx, originfy;
  ln0a = l0p1y - l0p0y;
  ln0b = l0p0x - l0p1x;
  ln1a = l1p1y - l1p0y;
  ln1b = l1p0x - l1p1x;
  det = ( (double)ln1a * (double)ln0b ) - ( (double)ln0a * (double)ln1b );
  if( det == 0.0 )
    return 0;
  bb0minx = fmin( (double)l0p0x, (double)l0p1x );
  bb0miny = fmin( (double)l0p0y, (double)l0p1y );
  bb0maxx = fmax( (double)l0p0x, (double)l0p1x );
  bb0maxy = fmax( (double)l0p0y, (double)l0p1y );
  bb1minx = fmin( (double)l1p0x, (double)l1p1x );
  bb1miny = fmin( (double)l1p0y, (double)l1p1y );
  bb1maxx = fmax( (double)l1p0x, (double)l1p1x );
  bb1maxy = fmax( (double)l1p0y, (double)l1p1y );
  originfx = 0.5 * ( fmin( bb0maxx, bb1maxx ) + fmax( bb0minx, bb1minx ) );
  originfy = 0.5 * ( fmin( bb0maxy, bb1maxy ) + fmax( bb0miny, bb1miny ) );
  originx = (int64_t)nearbyint( originfx );
  originy = (int64_t)nearbyint( originfy );
  ln0c = ( (double)ln0a * (double)( l0p0x - originx ) ) + ( (double)ln0b * (double)( l0p0y - originy ) );
  ln1c = ( (double)ln1a * (double)( l1p0x - originx ) ) + ( (double)ln1b * (double)( l1p0y - originy ) );
  hitx = ( ( (double)ln0b * ln1c ) - ( (double)ln1b * ln0c ) ) / det;
  hity = ( ( (double)ln1a * ln0c ) - ( (double)ln0a * ln1c ) ) / det;
  if( fmax( fabs( originfx + hitx ), fabs( originfy + hity ) ) >= (double)(INT64_MAX-2) )
    return 0;
  hitpt[0] = originx + (int64_t)nearbyint( hitx );
  hitpt[1] = originy + (int64_t)nearbyint( hity );
  return 1;
}

It seems to pass my (very few) tests with the same accuracy as mathVertexLineLineIntersection_D() posted earlier.

sergey-239 commented 1 year ago

Ah yes, I guess I would check the way I did in the tests above: pick an exact intersection point, extend two arbitrary vectors both ways from that point by an arbitrary magnitude... and see if you recover the original intersection point?

Maybe that version would a better compromise between accuracy and performance?


static inline int mathVertexLineLineIntersection_E( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )

Replacing double with int64_t for bb0minx, bb0miny, bb0maxx, bb0maxy, bb1minx, bb1miny, bb1maxx, bb1maxy and fmin/fmax with min/max makes it a bit faster (the timings accounts memory reads of points coordinates from unique locations for each mathVertexLineLineIntersection_E_xx invocation) :

mathVertexLineLineIntersection_E_double_bb   count: 10,000,000   time: 191.4 ms
mathVertexLineLineIntersection_E_int64_t_bb  count: 10,000,000   time: 176.7 ms
alexisnaveros commented 1 year ago

@sergey-239 Confirmed, I get a similar difference on an Epyc 7532.

I can see the scalar int64_t->double (cvtsi2sd) instruction has very crappy latency and pipelining, and I added 8 of them. Fortunately, GCC is smart enough to use conditional moves for the int64_t min/max. I was worried it was going to put branches, hence the fmin/fmax (which are branchless instructions).

I hadn't benchmarked anything, but I just started doing so to compare my SSE version.

sergey-239 commented 1 year ago

@alexisnaveros replacing

double originfx = 0.5 * ( min( bb0maxx, bb1maxx ) + max( bb0minx, bb1minx ) );

with

int64_t originfx = ( min( bb0maxx, bb1maxx ) + max( bb0minx, bb1minx ) + 1) >> 1;

makes no performance change. However, if using sse instructions to compute both origins in parallel then it could be faster than floating multiplication and rounding.

alexisnaveros commented 1 year ago

@sergey-239 Be careful about overflow though, I was assuming the function would work with the full range of int64_t.

SSE/AVX is rather awkward, because there are no instructions to convert multiple int64_t <-> double before AVX-512. So it takes a bunch of funky bitwise operations to go forth and back (playing in the binary representation of doubles), and that ruins performance a bit. My first draft of SSE version is only 83% faster. Meh.

sergey-239 commented 1 year ago

@alexisnaveros,

Be careful about overflow though, I was assuming the function would work with the full range of int64_t.

yes I assumed the library defined safe range that is +-INT64_MAX/2

SSE/AVX is rather awkward, because there are no instructions to convert multiple int64_t <-> double before AVX-512. So it takes a bunch of funky bitwise operations to go forth and back (playing in the binary representation of doubles), and that ruins performance a bit. My first draft of SSE version is only 83% faster. Meh.

It's good that Intel redesigned fp math at all by introducing SSE ISA.

bahvalo commented 1 year ago

My version of the segment intersection subroutine. Pros: faster than 'B', 'C', 'D', 'E' version above; safe against overflows (provided that the input data is correct) Cons: slower than 'A' version; uses a the compiler-dependent int128 type

static inline int mathVertexLineLineIntersection_PB( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  __int128 alpha_num = __int128(l1p0x-l0p0x)*__int128(l1p1y-l1p0y) - __int128(l1p0y-l0p0y)*__int128(l1p1x-l1p0x);
  __int128  beta_num = __int128(l1p0x-l0p0x)*__int128(l0p1y-l0p0y) - __int128(l1p0y-l0p0y)*__int128(l0p1x-l0p0x);
  __int128     denom = __int128(l0p1x-l0p0x)*__int128(l1p1y-l1p0y) - __int128(l0p1y-l0p0y)*__int128(l1p1x-l1p0x);
  if(denom==0) {
    if(alpha_num || beta_num) return 0; // segments lay on parallel lines
    // special case with segments on the same line -- may be considered if necessary
    return 0;
  }
  else {
    if(denom<0) { alpha_num=-alpha_num; beta_num=-beta_num; denom=-denom; }
    if(alpha_num<0 || beta_num<0 || alpha_num>denom || beta_num>denom) return 0;
    double alpha = double(alpha_num)/double(denom);
    hitpt[0] = (int64_t)nearbyint(l0p0x+alpha*(l0p1x-l0p0x));
    hitpt[1] = (int64_t)nearbyint(l0p0y+alpha*(l0p1y-l0p0y));
    return 1;
  }
}
bahvalo commented 1 year ago

If one prefers speed than accuracy, there may be a version with the 32-bit input and the 64-bit integers inside.

bahvalo commented 1 year ago

@AngusJohnson I checked your last version of GetIntersectPoint. It throws an exception in the following example.

#include <stdio.h>
#include "clipper2/clipper.h"
using namespace Clipper2Lib;

#define ADD(A,X,Y) A.push_back(Point64(int64_t(X), int64_t(Y)))

int main(int, char**) {
    Path64 a,b;
    ADD(a,-6991627179LL,1666844791673082LL);
    ADD(a,-574999959023595328LL,1666829315563631LL);
    ADD(a,-862499858060358144LL,-286249911015197856LL);
    ADD(a,-1149999907277659648LL,-574166572783239680LL);
    ADD(a,-862499929372727936LL,-862083258356526976LL);
    ADD(a,-575000010469324992LL,-1149999929873923840LL);
    ADD(a,-12847839319LL,-1149999946651219328LL);
    ADD(a,575000035327698432LL,-1150000000000000128LL);
    ADD(a,862500139322786176LL,-862083363545636480LL);
    ADD(a,1150000000000000000LL,-574166576039358592LL);
    ADD(a,862499934160617216LL,-286249810710266752LL);
    ADD(a,574999902457057728LL,1666896185511221LL);
    ADD(b,-54097055806558968LL,1148333145723817472LL);
    ADD(b,-611064695858513920LL,1150000000000000128LL);
    ADD(b,-862500068456230016LL,865417009264721152LL);
    ADD(b,-1150000000000000000LL,577500272297585920LL);
    ADD(b,-862500012018783744LL,289583554170215680LL);
    ADD(b,-574999959023586880LL,1666829315563631LL);
    ADD(b,-6991627179LL,1666844791673082LL);
    ADD(b,574999902457057728LL,1666896185511221LL);
    ADD(b,862499780202191616LL,289583654475137088LL);
    ADD(b,1097525381052289280LL,574967656905021504LL);
    ADD(b,783788210901215872LL,861617985870938496LL);
    ADD(b,486460823713113792LL,1147467317737478272LL);

    Paths64 AA; AA.push_back(a);
    Paths64 BB; BB.push_back(b);
    Paths64 solution = Difference(Paths64(AA), Paths64(BB), FillRule::NonZero);
}
alexisnaveros commented 1 year ago

@bahvalo Cool version, though it doesn't pass my accuracy tests... It's accurate only if the intersection point lands near l0p0x,l0p0y (which is likely in most cases, granted).

(I also assume adding l0p0x,l0p0y should be outside of the nearbyint(), probably a typo)

EDIT: And I also wanted to avoid the foo=1.0/div; result.x *= foo; result.y *= foo; optimization for a little bit of extra accuracy, though it doesn't seem to make much difference. And the compiler does that anyway with -ffast-math.

bahvalo commented 1 year ago

I guess the precision of my subroutine is limited by the mantissa length, which is smaller than 64 bit. I'm curious, will the following code pass your accuracy test? (Can be compiled with gcc only).

static inline int mathVertexLineLineIntersection_PB3( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  __int128 alpha_num = __int128(l1p0x-l0p0x)*__int128(l1p1y-l1p0y) - __int128(l1p0y-l0p0y)*__int128(l1p1x-l1p0x);
  __int128  beta_num = __int128(l1p0x-l0p0x)*__int128(l0p1y-l0p0y) - __int128(l1p0y-l0p0y)*__int128(l0p1x-l0p0x);
  __int128     denom = __int128(l0p1x-l0p0x)*__int128(l1p1y-l1p0y) - __int128(l0p1y-l0p0y)*__int128(l1p1x-l1p0x);
  if(denom==0) {
    if(alpha_num || beta_num) return 0; // segments lay on parallel lines
    // special case with segments on the same line -- may be considered if necessary
    return 0;
  }
  else {
    if(denom<0) { alpha_num=-alpha_num; beta_num=-beta_num; denom=-denom; }
    if(alpha_num<0 || beta_num<0 || alpha_num>denom || beta_num>denom) return 0;
    int64_t* high = (int64_t*)(&denom)+1;
    if(*high) {
      int shift = 65-__builtin_clzll(*high);
      alpha_num>>=shift; denom>>=shift;
    }
    hitpt[0] = (denom*__int128(l0p0x)+alpha_num*__int128(l0p1x-l0p0x))/denom;
    hitpt[1] = (denom*__int128(l0p0y)+alpha_num*__int128(l0p1y-l0p0y))/denom;
    return 1;
  }
}

Upd. Code fixed, 'high' should be a pointer to the second dword of denom, not alpha_num.

alexisnaveros commented 1 year ago

Hey, that's an elegant pure integer math solution. It's x86-friendly, with the hardware actually having 128 bits multiplication results, and rdx:rax 128 bits division.

Yes, it passes my current (very limited) accuracy tests. You could replace storing/reading *high with a >> 64, as it should use the x86 multi-register shrd instruction.

It's kind of slow though, especially if you allow -ffast-math for the other solutions.

alexisnaveros commented 1 year ago

Well, I guess that's the fastest version we got, with perfect accuracy:

#define CC_MIN(x,y) ((x)>(y)?(y):(x))
#define CC_MAX(x,y) ((x)<(y)?(y):(x))
#define DETECT_HIT_OVERFLOW (0)

static inline int mathVertexLineLineIntersection_F( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int64_t originx, originy;
  int64_t bb0minx, bb0miny, bb0maxx, bb0maxy, bb1minx, bb1miny, bb1maxx, bb1maxy;
  double ln0a, ln0b, ln1a, ln1b;
  double ln0c, ln1c, det, hitx, hity;
  ln0a = (double)( l0p1y - l0p0y );
  ln0b = (double)( l0p0x - l0p1x );
  ln1a = (double)( l1p1y - l1p0y );
  ln1b = (double)( l1p0x - l1p1x );
  det = ( ln1a * ln0b ) - ( ln0a * ln1b );
  if( det == 0.0 )
    return 0;
  bb0minx = CC_MIN( l0p0x, l0p1x );
  bb0miny = CC_MIN( l0p0y, l0p1y );
  bb0maxx = CC_MAX( l0p0x, l0p1x );
  bb0maxy = CC_MAX( l0p0y, l0p1y );
  bb1minx = CC_MIN( l1p0x, l1p1x );
  bb1miny = CC_MIN( l1p0y, l1p1y );
  bb1maxx = CC_MAX( l1p0x, l1p1x );
  bb1maxy = CC_MAX( l1p0y, l1p1y );
  originx = ( CC_MIN( bb0maxx, bb1maxx ) + CC_MAX( bb0minx, bb1minx ) ) >> 1;
  originy = ( CC_MIN( bb0maxy, bb1maxy ) + CC_MAX( bb0miny, bb1miny ) ) >> 1;
  ln0c = ( ln0a * (double)( l0p0x - originx ) ) + ( ln0b * (double)( l0p0y - originy ) );
  ln1c = ( ln1a * (double)( l1p0x - originx ) ) + ( ln1b * (double)( l1p0y - originy ) );
  hitx = ( ( ln0b * ln1c ) - ( ln1b * ln0c ) ) / det;
  hity = ( ( ln1a * ln0c ) - ( ln0a * ln1c ) ) / det;
#if DETECT_HIT_OVERFLOW
  if( fmax( fabs( (double)originx + hitx ), fabs( (double)originy + hity ) ) >= (double)(INT64_MAX-1) )
    return 0;
#endif
  hitpt[0] = originx + (int64_t)nearbyint( hitx );
  hitpt[1] = originy + (int64_t)nearbyint( hity );
  return 1;
}

It requires input to be within INT64_MAX/2 range, and assumes that detecting output hit overflow is pointless as it can't happen. My only concern is that some compilers might output a ton of branches instead of conditional moves.

I do have a SSE 4.2 version, but it's only 38% faster (_mm_cmpgt_epi64/_mm_blendv_epi8 is not really better than cmp/cmov/cmov), assuming you compile with -ffast-math (to output one divsd instead of two).

EDIT: And the second place is only 5% slower, without any risk of a compiler emitting branches:

static inline int mathVertexLineLineIntersection_G( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int64_t originx, originy;
  double ln0a, ln0b, ln1a, ln1b;
  double ln0c, ln1c, det, detinv;
  ln0a = (double)( l0p1y - l0p0y );
  ln0b = (double)( l0p0x - l0p1x );
  ln1a = (double)( l1p1y - l1p0y );
  ln1b = (double)( l1p0x - l1p1x );
  det = ( ln1a * ln0b ) - ( ln0a * ln1b );
  if( det == 0.0 )
    return 0;
  detinv = 1.0 / det;
  ln0c = ( ln0a * (double)l0p0x ) + ( ln0b * (double)l0p0y );
  ln1c = ( ln1a * (double)l1p0x ) + ( ln1b * (double)l1p0y );
  originx = (int64_t)nearbyint( ( ( ln0b * ln1c ) - ( ln1b * ln0c ) ) * detinv );
  originy = (int64_t)nearbyint( ( ( ln1a * ln0c ) - ( ln0a * ln1c ) ) * detinv );
  ln0c = ( ln0a * (double)( l0p0x - originx ) ) + ( ln0b * (double)( l0p0y - originy ) );
  ln1c = ( ln1a * (double)( l1p0x - originx ) ) + ( ln1b * (double)( l1p0y - originy ) );
  hitpt[0] = originx + (int64_t)nearbyint( ( ( ln0b * ln1c ) - ( ln1b * ln0c ) ) * detinv );
  hitpt[1] = originy + (int64_t)nearbyint( ( ( ln1a * ln0c ) - ( ln0a * ln1c ) ) * detinv );
  return 1;
}
sergey-239 commented 1 year ago

@alexisnaveros

originx = ( CC_MIN( bb0maxx, bb1maxx ) + CC_MAX( bb0minx, bb1minx ) ) >> 1;

does the truncation, not rounding. If this is important, then the

originx = ( CC_MIN( bb0maxx, bb1maxx ) + CC_MAX( bb0minx, bb1minx ) + 1) >> 1;

rounds according to the math rules.

alexisnaveros commented 1 year ago

Hey, it doesn't matter that much in this case, it's just picking an estimate of where the intersection point should be (and moving the "origin" there), so that the second proper calculation can be accurate.

sergey-239 commented 1 year ago

Yes, you are right! Sorry, I did not think enough )

AngusJohnson commented 1 year ago

@AngusJohnson I checked your last version of GetIntersectPoint. It throws an exception in the following example.

    static const int64_t MAX_COORD = INT64_MAX /2;
    static const int64_t MIN_COORD = INT64_MIN /2;
    static const int64_t INVALID = INT64_MAX;

    inline int64_t CheckCastInt64(double val)
    {
        if (val > MAX_COORD || val < MIN_COORD) return INVALID;
        else return static_cast<int64_t>(val);
        //return static_cast<int64_t>(std::nearbyint(val));
    }

    bool GetIntersectPoint(const Active& e1, const Active& e2, Point64& ip)
    {
        // precondition: neither edge is horizontal
        //assert(!IsHorizontal(e1) && !IsHorizontal(e2));
        if (e1.top.x == e1.bot.x)
        {
            if (e2.top.x == e2.bot.x) return false;
            double b2 = e2.bot.y * e2.dx - e2.bot.x;
            ip.x = e1.bot.x;
            ip.y = CheckCastInt64((e1.bot.x + b2) / e2.dx);
        }
        else if (e2.top.x == e2.bot.x)
        {
            double b1 = e1.bot.y * e1.dx - e1.bot.x;
            ip.x = e2.bot.x;
            ip.y = CheckCastInt64((e2.bot.x + b1) / e1.dx);
        }   
        else
        {
            double dy1 = static_cast<double>(e1.top.y - e1.bot.y);
            double dx1 = static_cast<double>(e1.top.x - e1.bot.x);
            double dy2 = static_cast<double>(e2.top.y - e2.bot.y);
            double dx2 = static_cast<double>(e2.top.x - e2.bot.x);
            double q1 = dy1 * e1.bot.x - dx1 * e1.bot.y;
            double q2 = dy2 * e2.bot.x - dx2 * e2.bot.y;
            double cross_prod = dy1 * dx2 - dy2 * dx1;
            if (cross_prod == 0.0) return false;
            ip.x = CheckCastInt64((dx2 * q1 - dx1 * q2) / cross_prod);
            ip.y = CheckCastInt64((dy2 * q1 - dy1 * q2) / cross_prod);
        }
        return (ip.x != INVALID && ip.y != INVALID);
    }
AngusJohnson commented 1 year ago
inline int64_t RandInt64() 
{
  int64_t result = 0;
  for (int i = 0; i < 61; ++i) result = result << 1 | rand() % 2;
  return result;
}

inline void MakeRandomSeg(const Point64& ip, Point64& seg1, Point64& seg2)
{
  seg1.x = RandInt64();
  seg1.y = RandInt64();
  long double m = static_cast<double>(ip.y - seg1.y) / (ip.x - seg1.x);
  long double b = static_cast<double>(ip.y) - m * ip.x;
  // make the intersection the midpoint of each segment
  seg2.x = ip.x * 2 - seg1.x; 
  seg2.y = ip.y * 2 - seg1.y; 
  //seg2.y = static_cast<int64_t>(std::nearbyint(m * seg2.x + b));
}
  std::cout.imbue(std::locale(""));
  srand((unsigned)time(0));

  int64_t pt[2];
  Point64 ip, p1a, p2a, p3a, p4a;
  int a = 0, b = 0, c = 0, d = 0, g = 0;
  static const int count = 1000;
  for (int i = 0; i < count; ++i)
  {
    ip = Point64(RandInt64(), RandInt64());
    //std::cout << ip << std::endl;
    // make 2 random segments that have 'ip' as their midpoints
    MakeRandomSeg(ip, p1a, p2a);
    MakeRandomSeg(ip, p3a, p4a);

    // use various line intersection algorithms 
    mathVertexLineLineIntersection_A(pt, p1a.x, p1a.y, p2a.x, p2a.y, p3a.x, p3a.y, p4a.x, p4a.y);
    Point64 ipa = Point64(pt[0], pt[1]);
    mathVertexLineLineIntersection_B(pt, p1a.x, p1a.y, p2a.x, p2a.y, p3a.x, p3a.y, p4a.x, p4a.y);
    Point64 ipb = Point64(pt[0], pt[1]);
    mathVertexLineLineIntersection_C(pt, p1a.x, p1a.y, p2a.x, p2a.y, p3a.x, p3a.y, p4a.x, p4a.y);
    Point64 ipc = Point64(pt[0], pt[1]);
    mathVertexLineLineIntersection_D(pt, p1a.x, p1a.y, p2a.x, p2a.y, p3a.x, p3a.y, p4a.x, p4a.y);
    Point64 ipd = Point64(pt[0], pt[1]);
    mathVertexLineLineIntersection_G(pt, p1a.x, p1a.y, p2a.x, p2a.y, p3a.x, p3a.y, p4a.x, p4a.y);
    Point64 ipg = Point64(pt[0], pt[1]);

    //measure differences between calculated intersect points and 'ip'
    a += std::abs(ip.x - ipa.x) + std::abs(ip.y - ipa.y);
    b += std::abs(ip.x - ipb.x) + std::abs(ip.y - ipb.y);
    c += std::abs(ip.x - ipc.x) + std::abs(ip.y - ipc.y);
    d += std::abs(ip.x - ipd.x) + std::abs(ip.y - ipd.y);
    g += std::abs(ip.x - ipd.x) + std::abs(ip.y - ipd.y);
  }
  std::cout << "\nTypical intersection point: " << std::endl;
  std::cout << ip << std::endl << std::endl;

  std::cout << "\nAvg coordinate difference: " << std::endl;
  std::cout << "A: " << a / (count*2) << std::endl;
  std::cout << "B: " << b / (count * 2) << std::endl;
  std::cout << "C: " << c / (count * 2) << std::endl;
  std::cout << "D: " << d / (count * 2) << std::endl;
  std::cout << "G: " << d / (count * 2) << std::endl;

sample result:

Typical intersection point:
201,134,696,440,158,068, 1,116,342,604,634,274,019

Avg coordinate difference:
A: 647
B: 356
C: 320
D: 320
G: 320
Complex Polygons Benchmark (A):
Edge Count: 1,000 = 120 millisecs
Edge Count: 2,000 = 366 millisecs
Edge Count: 3,000 = 865 millisecs
Edge Count: 4,000 = 1.94 secs
Edge Count: 5,000 = 4.84 secs

Complex Polygons Benchmark (G):
Edge Count: 1,000 = 191 millisecs
Edge Count: 2,000 = 531 millisecs
Edge Count: 3,000 = 1.34 secs
Edge Count: 4,000 = 3.88 secs
Edge Count: 5,000 = 9.53 secs

Anyhow, the point of this is that although algorithm 'A' is the least accurate (ie accuracy of 3.2e-15 vs 1.6e-15), it's still very accurate, and it's substantially faster than the others.

sergey-239 commented 1 year ago

@alexisnaveros Same performance as F, but jump-free regardless of compiler:

static inline int mathVertexLineLineIntersection_F_jumfree( int64_t *hitpt, int64_t l0p0x, int64_t l0p0y, int64_t l0p1x, int64_t l0p1y, int64_t l1p0x, int64_t l1p0y, int64_t l1p1x, int64_t l1p1y )
{
  int64_t originx, originy;
  int64_t bb0cx, bb0cy, bb1cx, bb1cy;
  double ln0a, ln0b, ln1a, ln1b;
  double ln0c, ln1c, det, hitx, hity;
  ln0a = (double)( l0p1y - l0p0y );
  ln0b = (double)( l0p0x - l0p1x );
  ln1a = (double)( l1p1y - l1p0y );
  ln1b = (double)( l1p0x - l1p1x );
  det = ( ln1a * ln0b ) - ( ln0a * ln1b );
  if( det == 0.0 )
    return 0;
  bb0cx = ( l0p0x + l0p1x ) >> 1;
  bb0cy = ( l0p0y + l0p1y ) >> 1;
  bb1cx = ( l1p0x + l1p1x ) >> 1;
  bb1cy = ( l1p0y + l1p1y ) >> 1;
  originx = ( bb0cx + bb1cx ) >> 1;
  originy = ( bb0cy + bb1cy ) >> 1;

  ln0c = ( ln0a * (double)( l0p0x - originx ) ) + ( ln0b * (double)( l0p0y - originy ) );
  ln1c = ( ln1a * (double)( l1p0x - originx ) ) + ( ln1b * (double)( l1p0y - originy ) );
  hitx = ( ( ln0b * ln1c ) - ( ln1b * ln0c ) ) / det;
  hity = ( ( ln1a * ln0c ) - ( ln0a * ln1c ) ) / det;
#if DETECT_HIT_OVERFLOW
  if( fmax( fabs( (double)originx + hitx ), fabs( (double)originy + hity ) ) >= (double)(INT64_MAX-1) )
    return 0;
#endif
  hitpt[0] = originx + (int64_t)nearbyint( hitx );
  hitpt[1] = originy + (int64_t)nearbyint( hity );
  return 1;
}

Edit: and probably much more suited for SIMD vectorization

AngusJohnson commented 1 year ago

Same performance as F, but jump-free regardless of compiler:

Complex Polygons Benchmark:
Edge Count: 1,000 = 154 millisecs
Edge Count: 2,000 = 480 millisecs
Edge Count: 3,000 = 1.27 secs
Edge Count: 4,000 = 3.22 secs
Edge Count: 5,000 = 8.99 secs

(for comparison with the performances in my post 2 above)

alexisnaveros commented 1 year ago

@sergey-239 That fails some accuracy tests though. Put 3 points (of the 2 lines) in one spot of the "space", the remaining point far away, and the computed origin is too far away for the floating point math to give a correct result.

bahvalo commented 1 year ago

@AngusJohnson Now the code does not throw an exception but fails on the following test. The result is 3.21e32 for the m=0,,,5 and 4.28e32 for m>=6. The latter is correct.

#include <stdio.h>
#include "clipper2/clipper.h"
using namespace Clipper2Lib;

#define ADD(A,X,Y) A.push_back(Point64(int64_t(X), int64_t(Y)))

int main(int, char**) {
    Path64 a,b;
    ADD(a,-984684108819478528LL,19135785819045088LL);
    ADD(a,-1150000000000000000LL,-370576142787303296LL);
    ADD(a,-984684108819478528LL,-760288071393598976LL);
    ADD(a,-819368217638956928LL,-1150000000000000000LL);
    ADD(a,-488736435277913728LL,-1149999999999973632LL);
    ADD(a,-158104652916871424LL,-1150000000000000000LL);
    ADD(a,7211238263649718LL,-760288071393598976LL);
    ADD(a,172527129444176736LL,-370576142787303296LL);
    ADD(a,7211238263658415LL,19135785819045088LL);
    ADD(a,-158104652916865728LL,408847714425367104LL);
    ADD(a,-488736435277913920LL,408847714425393472LL);
    ADD(a,-819368217638957056LL,408847714425367104LL);
    ADD(b,6657179020798405LL,798563016150734848LL);
    ADD(b,-158657280480045088LL,408849962547857920LL);
    ADD(b,6660042376026709LL,19139158947841160LL);
    ADD(b,171977365232092768LL,-370571644652175552LL);
    ADD(b,502609147589005184LL,-370569394649315392LL);
    ADD(b,833240929945947008LL,-370567144646507904LL);
    ADD(b,998555389446808064LL,19145908956395368LL);
    ADD(b,1150000000000000000LL,377905349443409536LL);
    ADD(b,977747752670084352LL,752139346485547520LL);
    ADD(b,827171221707625728LL,1119050886889804928LL);
    ADD(b,514312222008247040LL,1130864215126028032LL);
    ADD(b,179777505941435104LL,1150000000000000000LL);

    for(int m=0; m<=20; m++) {
        Paths64 AA; AA.push_back(a);
        Paths64 BB; BB.push_back(b);
        Paths64 solution = Intersect(Paths64(AA), Paths64(BB), FillRule::NonZero);

        const double M = pow(2.,m);
        printf("S = %e\n", Area(solution)*M);

        for(size_t i=0; i<a.size(); i++) a[i].y /= 2;
        for(size_t i=0; i<b.size(); i++) b[i].y /= 2;
    }
}
bahvalo commented 1 year ago

@alexisnaveros Please consider the following test:

  l0p0x = 0;
  l0p0y = 0;
  l0p1x = INT64_MAX/4;
  l0p1y = INT64_MAX/4;
  l1p0x = some_small_value;
  l1p0y = 0;
  l1p1x = INT64_MAX/4-some_small_value;
  l1p1y = INT64_MAX/4;   

Here small_value may be 1000, 100, 10, 1.

AngusJohnson commented 1 year ago

@AngusJohnson Now the code does not throw an exception but fails on the following test. The result is 3.21e32 for the m=0,,,5 and 4.28e32 for m>=6. The latter is correct.

Yep, thanks again Bakhvalov. This time it's due to a bug in AddNewIntersectNode in clipper.engine.cpp.

And here's the fix (though I do hope to upload at least an interim fix for all these issues soonish):

    inline Point64 GetClosestPointOnSegment(const Point64& offPt, 
        const Point64& seg1, const Point64& seg2)
    {
        if (seg1.x == seg2.x && seg1.y == seg2.y) return seg1;
        double q = 
            (static_cast<double>(offPt.x - seg1.x) * (seg2.x - seg1.x) +
             static_cast<double>(offPt.y - seg1.y) * (seg2.y - seg1.y)) /
            (Sqr(static_cast<double>(seg2.x - seg1.x)) + 
             Sqr(static_cast<double>(seg2.y - seg1.y)));
        if (q < 0) q = 0; else if (q > 1) q = 1;
        return Point64(
            static_cast<int64_t>(nearbyint((1 - q) * seg1.x + q * seg2.x)),
            static_cast<int64_t>(nearbyint((1 - q) * seg1.y + q * seg2.y)));
    }

    void ClipperBase::AddNewIntersectNode(Active& e1, Active& e2, int64_t top_y)
    {
        Point64 pt; 
        if (!GetIntersectPoint(e1, e2, pt))
            pt = Point64(e1.curr_x, top_y); //parallel edges

        //rounding errors can occasionally place the calculated intersection
        //point either below or above the scanbeam, so check and correct ...
        if (pt.y > bot_y_)
        {
            //e.curr.y is still the bottom of scanbeam
            pt.y = bot_y_;
            //use the more vertical of the 2 edges to derive pt.x ...
            if (abs(e1.dx) < abs(e2.dx))
                pt.x = TopX(e1, bot_y_);
            else
                pt.x = TopX(e2, bot_y_);
        }
        else if (pt.y < top_y)
        {
            //top_y is at the top of the scanbeam
            if (e1.top.y == top_y)
                pt = GetClosestPointOnSegment(pt, e1.bot, e1.top);
            else if (e2.top.y == top_y)
                pt = GetClosestPointOnSegment(pt, e2.bot, e2.top);
            else if (abs(e1.dx) < abs(e2.dx))
                pt = GetClosestPointOnSegment(pt, e2.bot, e2.top);
            else
                pt = GetClosestPointOnSegment(pt, e1.bot, e1.top);
        }
        intersect_nodes_.push_back(IntersectNode(&e1, &e2, pt));
    }
bahvalo commented 1 year ago

Great! One group of tests successfully passed. Now one more case with a wrong result.

#include "clipper2/clipper.h"
using namespace Clipper2Lib;

#define ADD(A,X,Y) A.push_back(Point64(int64_t(X), int64_t(Y)))

int main(int, char**) {
    Path64 a,b;
    ADD(a,814722870202722176LL,-1149999999999998848LL);
    ADD(a,819545881835559040LL,-1059365200152486912LL);
    ADD(a,1150000000000000128LL,-227013697625513632LL);
    ADD(a,816835048201766784LL,280016125006099552LL);
    ADD(a,487416310657610752LL,1099756648736188544LL);
    ADD(a,-174329652178024992LL,930790319390064128LL);
    ADD(a,-814284595299387904LL,686735517908352256LL);
    ADD(a,-1135716310733731328LL,-339515571235652800LL);
    ADD(a,-1150000000000000000LL,-426667326553760960LL);
    ADD(b,814722870202723456LL,-1149999999999998848LL);
    ADD(b,819545881835560320LL,-1059365200152486912LL);
    ADD(b,812755077311476992LL,29570836596776912LL);
    ADD(b,310967664168980992LL,664892926339535360LL);
    ADD(b,-76177665946567712LL,1150000000000000896LL);
    ADD(b,-513853233051506688LL,621278544952343808LL);
    ADD(b,-1040633649215042560LL,480394334949876224LL);
    ADD(b,-1135716310733731328LL,-339515571235652800LL);
    ADD(b,-1150000000000000000LL,-426667326553760960LL);

    for(int m=0; m<=20; m++) {
        Paths64 AA; AA.push_back(a);
        Paths64 BB; BB.push_back(b);
        Paths64 solution = Intersect(Paths64(AA), Paths64(BB), FillRule::NonZero);

        const double M = pow(2.,m);
        printf("S = %e\n", Area(solution)*M);

        //for(size_t i=0; i<a.size(); i++) a[i].x /= 2;
        //for(size_t i=0; i<b.size(); i++) b[i].x /= 2;
        for(size_t i=0; i<a.size(); i++) a[i].y /= 2;
        for(size_t i=0; i<b.size(); i++) b[i].y /= 2;
    }
}

For different m, I get 3.22e+36, 2.65e+36, 2.80e+36, 2.38e+36. I guess that 2.652054e+36 is correct.