Bug in libdivide_128_div_64_to_64()

Bug report by Stefan Kanthak:

The bug needs a divisor 'v' with the most significant bit set, i.e. a value from
0x8000000000000000ULL to 0xFFFFFFFFFFFFFFFFULL.
It should be reproducible with a dividend pair 'u1' and 'u0' of 0ULL and
~0ULL: this combination lets 'un64' be ~0ULL instead of 0ULL

Here is a link to the full blog post: https://skanthak.homepage.t-online.de/division.html

I have written a standalone test program for libdivide's libdivide_128_div_64_to_64() function that can be compiled using GCC or Clang on a 64-bit CPU architecture. The test program calculates many combinations of x1/x2 where x1 is of type __uint128_t and x2 is of type uint64_t. The test program always calculates x1/x2 twice, first using the builtin integer division and second using libdivide's libdivide_128_div_64_to_64() function and then compares both results for equality.

The test program is able to reproduce the reported bug and I can also confirm that the proposed fix below fixes the bug:

// Error
    } else {
        // Avoid undefined behavior
        un64 = u1 | u0;
        un10 = u0;
    }

// Fix
    } else {
        // Avoid undefined behavior
        un64 = u1;
        un10 = u0;
    }

And here is my test program:

// This is a test program for the function libdivide_128_div_64_to_64
// which devides a 128-bit number by a 64-bit number where
// the result must fit into a 64-bit word.
// The test program tries to cover a wide variety of test cases
// e.g. small dividend / small divisor,
//      large dividend / small divisor,
//      small dividend / large divisor,
//      large dividend / large divisor,
//      ...
// 

#include <iostream>
#include <random>
#include <limits>
#include <ostream>
#include <string>

using namespace std;

// Code taken from Hacker's Delight:
// http://www.hackersdelight.org/HDcode/divlu.c.
// License permits inclusion here per:
// http://www.hackersdelight.org/permissions.htm

static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) {
    const uint64_t b = (1ULL << 32); // Number base (16 bits)
    uint64_t un1, un0; // Norm. dividend LSD's
    uint64_t vn1, vn0; // Norm. divisor digits
    uint64_t q1, q0; // Quotient digits
    uint64_t un64, un21, un10; // Dividend digit pairs
    uint64_t rhat; // A remainder
    int32_t s; // Shift amount for norm

    // If overflow, set rem. to an impossible value,
    // and return the largest possible quotient
    if (u1 >= v) {
        if (r != NULL)
            *r = (uint64_t) -1;
        return (uint64_t) -1;
    }

    // count leading zeros
    s = __builtin_clzll(v);
    if (s > 0) {
        // Normalize divisor
        v = v << s;
        un64 = (u1 << s) | ((u0 >> (64 - s)) & (-s >> 31));
        un10 = u0 << s; // Shift dividend left
    } else {
        // Avoid undefined behavior
        un64 = u1 | u0;
        un10 = u0;
    }

    // Break divisor up into two 32-bit digits
    vn1 = v >> 32;
    vn0 = v & 0xFFFFFFFF;

    // Break right half of dividend into two digits
    un1 = un10 >> 32;
    un0 = un10 & 0xFFFFFFFF;

    // Compute the first quotient digit, q1
    q1 = un64 / vn1;
    rhat = un64 - q1 * vn1;

    while (q1 >= b || q1 * vn0 > b * rhat + un1) {
        q1 = q1 - 1;
        rhat = rhat + vn1;
        if (rhat >= b)
            break;
    }

     // Multiply and subtract
    un21 = un64 * b + un1 - q1 * v;

    // Compute the second quotient digit
    q0 = un21 / vn1;
    rhat = un21 - q0 * vn1;

    while (q0 >= b || q0 * vn0 > b * rhat + un0) {
        q0 = q0 - 1;
        rhat = rhat + vn1;
        if (rhat >= b)
            break;
    }

    // If remainder is wanted, return it
    if (r != NULL)
        *r = (un21 * b + un0 - q0 * v) >> s;

    return q1 * b + q0;
}

inline std::ostream& operator<<(std::ostream& stream, __uint128_t n)
{
  std::string str;

  while (n > 0)
  {
    str += '0' + n % 10;
    n /= 10;
  }

  if (str.empty())
    str = "0";

  stream << std::string(str.rbegin(), str.rend());
  return stream;
}

int main(int argc, char** argv)
{
    random_device rd16;
    mt19937 gen16(rd16());
    uniform_int_distribution<uint16_t> dist16(1, std::numeric_limits<uint16_t>::max());

    int iters = 100000;

    if (argc > 1)
        iters = atoi(argv[1]);

    // Test: random dividends < 2^64 / small divisors    
    for (int i = 0; i < iters; i++)
    {
        uint64_t x1 = 1;

        for (int i = 0; i < 4; i++)
        {
            uint16_t a = dist16(gen16);
            uint64_t x2 = 1;
            x1 *= a;
            x1 += 1;

            for (int j = 1; j < 17; j++)
            {
                x2 = j;

                auto res1 = x1 / x2;
                auto res2 = libdivide_128_div_64_to_64(0, x1, x2, nullptr);

                if ((uint64_t) res1 != (uint64_t) res2)
                {
                    std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
                    std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
                    return 1;
                }
            }
        }
    }

    // Test: 2^64-1 / random divisors < 2^64    
    for (int i = 0; i < iters; i++)
    {
        uint64_t x1 = ~0ull;
        uint64_t x2 = 1;

        for (int i = 0; i < 4; i++)
        {
            uint16_t a = dist16(gen16);
            x2 *= a;
            x2 += 1;

            auto res1 = x1 / x2;
            auto res2 = libdivide_128_div_64_to_64(0, x1, x2, nullptr);

            if ((uint64_t) res1 != (uint64_t) res2)
            {
                std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
                std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
                return 1;
            }
        }
    }

    // Test: 2^90 / random divisors > 2^46 && < 2^64    
    for (int i = 0; i < iters; i++)
    {
        __uint128_t x1 = ((__uint128_t) 1) << 90;
        uint64_t x2 = 1;

        while (x2 <= (1ull << 46))
        {
            uint16_t a = dist16(gen16);
            x2 *= a;
            x2 += 1;
        }

        uint64_t res1 = x1 / x2;
        uint64_t res2 = libdivide_128_div_64_to_64((uint64_t)(x1 >> 64), (uint64_t)(x1 & ~0ull), x2, nullptr);

        if ((uint64_t) res1 != (uint64_t) res2)
        {
            std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
            std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
            return 1;
        }
    }

    // Test: random dividends > 2^100 / 2^64-1    
    for (int i = 0; i < iters; i++)
    {
        __uint128_t x1 = 1;
        uint64_t x2 = ~0ull;

        while (x1 <= (((__uint128_t) 1) << 100))
        {
            uint16_t a = dist16(gen16);
            x1 *= a;
            x1 += 1;
        }

        uint64_t res1 = x1 / x2;
        uint64_t res2 = libdivide_128_div_64_to_64((uint64_t)(x1 >> 64), (uint64_t)(x1 & ~0ull), x2, nullptr);

        if ((uint64_t) res1 != (uint64_t) res2)
        {
            std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
            std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
            return 1;
        }
    }

    // Test: 2^(0..127) / 2^64-1  
    for (int i = 0; i < 127; i++)
    {
        __uint128_t x1 = ((__uint128_t) 1) << i;
        uint64_t x2 = ~0ull;

        uint64_t res1 = x1 / x2;
        uint64_t res2 = libdivide_128_div_64_to_64((uint64_t)(x1 >> 64), (uint64_t)(x1 & ~0ull), x2, nullptr);

        if ((uint64_t) res1 != (uint64_t) res2)
        {
            std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
            std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
            return 1;
        }
    }

    // Test: 2^(0..127) / 2^63 
    for (int i = 0; i < 127; i++)
    {
        __uint128_t x1 = ((__uint128_t) 1) << i;
        uint64_t x2 = 1ull << 63;

        uint64_t res1 = x1 / x2;
        uint64_t res2 = libdivide_128_div_64_to_64((uint64_t)(x1 >> 64), (uint64_t)(x1 & ~0ull), x2, nullptr);

        if ((uint64_t) res1 != (uint64_t) res2)
        {
            std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
            std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
            return 1;
        }
    }

    {
        // Test: 2^64-1 / 2^64-1
        uint64_t x1 = ~0ull;
        uint64_t x2 = ~0ull;

        uint64_t res1 = x1 / x2;
        uint64_t res2 = libdivide_128_div_64_to_64(0, x1, x2, nullptr);

        if ((uint64_t) res1 != (uint64_t) res2)
        {
            std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
            std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
            return 1;
        }    
    }

    {
        // Test: 2^64-1 / 2^63
        uint64_t x1 = ~0ull;
        uint64_t x2 = 0x8000000000000000ULL;

        uint64_t res1 = x1 / x2;
        uint64_t res2 = libdivide_128_div_64_to_64(0, x1, x2, nullptr);

        if ((uint64_t) res1 != (uint64_t) res2)
        {
            std::cerr << x1 << " / " << x2 << " = " << res1 << " correct" << std::endl;
            std::cerr << x1 << " / " << x2 << " = " << res2 << " error" << std::endl;
            return 1;
        }
    }

    return 0;
}

The above test program can be compiled and run using:

g++ -O2 test.cpp -o test
./test
18446744073709551615 / 10012018566958790731 = 1 correct
18446744073709551615 / 10012018566958790731 = 15540644647674054952 error

ridiculousfish / libdivide

Bug in libdivide_128_div_64_to_64() #45