C conversion/adaptation of jeaiii_to_text.h compared to C conversion/adaptation of itoa_jeaiii.cpp not as much faster as hoped

MasterDuke17 commented 6 months ago

See https://github.com/MoarVM/MoarVM/pull/1800. Obviously there's lots of overhead, but the readme mentioned a 25% speedup for the new version, and I'm seeing nowhere near that (with gcc 13.2.1 in linux, compiling with -O3 -march=native, on a Zen 2). Do you see anything obviously problematic in my conversion?

MasterDuke17 commented 6 months ago

Huh, this is interesting. A standalone benchmark of my two C versions shows the newer one as slightly slower and almost double the instructions.

old (~0.137s and 2,282,158,658 instructions):

#include <stdio.h>
#include <stdint.h>

typedef struct pair { char t, o; } pair;
#define P(T) T, '0',  T, '1', T, '2', T, '3', T, '4', T, '5', T, '6', T, '7', T, '8', T, '9'
static const pair s_pairs[] = { P('0'), P('1'), P('2'), P('3'), P('4'), P('5'), P('6'), P('7'), P('8'), P('9') };

#define W(N, I) *(pair*)&b[N] = s_pairs[I]
#define A(N) t = ((uint64_t)(1) << (32 + N / 5 * N * 53 / 16)) / (uint32_t)(1e##N) + 1 + N/6 - N/8, t *= u, t >>= N / 5 * N * 53 / 16, t += N / 6 * 4, W(0, t >> 32)
#define S(N) b[N] = (char)((uint64_t)(10) * (uint32_t)(t) >> 32) + '0'
#define D(N) t = (uint64_t)(100) * (uint32_t)(t), W(N, t >> 32)

#define L0 b[0] = (char)(u) + '0'
#define L1 W(0, u)
#define L2 A(1), S(2)
#define L3 A(2), D(2)
#define L4 A(3), D(2), S(4)
#define L5 A(4), D(2), D(4)
#define L6 A(5), D(2), D(4), S(6)
#define L7 A(6), D(2), D(4), D(6)
#define L8 A(7), D(2), D(4), D(6), S(8)
#define L9 A(8), D(2), D(4), D(6), D(8)

#define LN(N) (L##N, b += N + 1)
#define LZ LN

#define LG(F) (u<100 ? u<10 ? F(0) : F(1) : u<1000000 ? u<10000 ? u<1000 ? F(2) : F(3) : u<100000 ? F(4) : F(5) : u<100000000 ? u<10000000 ? F(6) : F(7) : u<1000000000 ? F(8) : F(9))

static char * u64toa_jeaiii(uint64_t n, char* b) {
    uint32_t u;
    uint64_t t;

    if ((uint32_t)(n >> 32) == 0)
        return u = (uint32_t)(n), LG(LZ);

    uint64_t a = n / 100000000;

    if ((uint32_t)(a >> 32) == 0) {
        u = (uint32_t)(a);
        LG(LN);
    }
    else {
        u = (uint32_t)(a / 100000000);
        LG(LN);
        u = a % 100000000;
        LN(7);
    }

    u = n % 100000000;
    return LZ(7);
}

void main(void) {
    char work[20];
    long sum = 0;
    for (unsigned int i = 0; i < 100000000; i++) {
        u64toa_jeaiii(i, work);
        sum += work[0];
    }
    printf("sum = %ld\n", sum);
}

new (~0.188s and 4,094,107,679 instructions):

#include <stdio.h>
#include <stdint.h>

struct pair
{
    char dd[2];
};

static const struct pair dd[100] =
{
    {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'}, {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
    {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
    {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
    {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
    {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
    {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
    {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
    {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
    {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
    {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'},
};
static const struct pair fd[100] =
{
    {'0','\0'}, {'1','\0'}, {'2','\0'}, {'3','\0'}, {'4','\0'}, {'5','\0'}, {'6','\0'}, {'7','\0'}, {'8','\0'}, {'9','\0'},
    {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
    {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
    {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
    {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
    {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
    {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
    {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
    {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
    {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'},
};

static const uint64_t mask24 = ((uint64_t)(1) << 24) - 1;
static const uint64_t mask32 = ((uint64_t)(1) << 32) - 1;
static const uint64_t mask57 = ((uint64_t)(1) << 57) - 1;

static char * u64toa_jeaiii(uint64_t n, char* b) {
    if (n < (uint32_t)(1e2))
    {
        *(struct pair *)(b) = fd[n];
        return n < 10 ? b + 1 : b + 2;
    }
    if (n < (uint32_t)(1e6))
    {
        if (n < (uint32_t)(1e4))
        {
            uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * n;
            *(struct pair *)(b) = fd[f0 >> 24];
            b -= n < (uint32_t)(1e3);
            uint32_t f2 = (f0 & mask24) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 24];
            return b + 4;
        }
        uint64_t f0 = (uint64_t)(10 * (1ull << 32ull)/ 1e5 + 1) * n;
        *(struct pair *)(b) = fd[f0 >> 32];
        b -= n < (uint32_t)(1e5);
        uint64_t f2 = (f0 & mask32) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 32];
        uint64_t f4 = (f2 & mask32) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 32];
        return b + 6;
    }
    if (n < (uint64_t)(1ull << 32ull))
    {
        if (n < (uint32_t)(1e8))
        {
            uint64_t f0 = (uint64_t)(10 * (1ull << 48ull) / 1e7 + 1) * n >> 16;
            *(struct pair *)(b) = fd[f0 >> 32];
            b -= n < (uint32_t)(1e7);
            uint64_t f2 = (f0 & mask32) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 32];
            uint64_t f4 = (f2 & mask32) * 100;
            *(struct pair *)(b + 4) = dd[f4 >> 32];
            uint64_t f6 = (f4 & mask32) * 100;
            *(struct pair *)(b + 6) = dd[f6 >> 32];
            return b + 8;
        }
        uint64_t f0 = (uint64_t)(10 * (1ull << 57ull) / 1e9 + 1) * n;
        *(struct pair *)(b) = fd[f0 >> 57];
        b -= n < (uint32_t)(1e9);
        uint64_t f2 = (f0 & mask57) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 57];
        uint64_t f4 = (f2 & mask57) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 57];
        uint64_t f6 = (f4 & mask57) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 57];
        uint64_t f8 = (f6 & mask57) * 100;
        *(struct pair *)(b + 8) = dd[f8 >> 57];
        return b + 10;
    }

    // if we get here U must be (uint64_t) but some compilers don't know that, so reassign n to a (uint64_t) to avoid warnings
    uint32_t z = n % (uint32_t)(1e8);
    uint64_t u = n / (uint32_t)(1e8);

    if (u < (uint32_t)(1e2))
    {
        // u can't be 1 digit (if u < 10 it would have been handled above as a 9 digit 32bit number)
        *(struct pair *)(b) = dd[u];
        b += 2;
    }
    else if (u < (uint32_t)(1e6))
    {
        if (u < (uint32_t)(1e4))
        {
            uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * u;
            *(struct pair *)(b) = fd[f0 >> 24];
            b -= u < (uint32_t)(1e3);
            uint32_t f2 = (f0 & mask24) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 24];
            b += 4;
        }
        else
        {
            uint64_t f0 = (uint64_t)(10 * (1ull << 32ull) / 1e5 + 1) * u;
            *(struct pair *)(b) = fd[f0 >> 32];
            b -= u < (uint32_t)(1e5);
            uint64_t f2 = (f0 & mask32) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 32];
            uint64_t f4 = (f2 & mask32) * 100;
            *(struct pair *)(b + 4) = dd[f4 >> 32];
            b += 6;
        }
    }
    else if (u < (uint32_t)(1e8))
    {
        uint64_t f0 = (uint64_t)(10 * (1ull << 48ull) / 1e7 + 1) * u >> 16;
        *(struct pair *)(b) = fd[f0 >> 32];
        b -= u < (uint32_t)(1e7);
        uint64_t f2 = (f0 & mask32) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 32];
        uint64_t f4 = (f2 & mask32) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 32];
        uint64_t f6 = (f4 & mask32) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 32];
        b += 8;
    }
    else if (u < (uint64_t)(1ull << 32ull))
    {
        uint64_t f0 = (uint64_t)(10 * (1ull << 57ull) / 1e9 + 1) * u;
        *(struct pair *)(b) = fd[f0 >> 57];
        b -= u < (uint32_t)(1e9);
        uint64_t f2 = (f0 & mask57) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 57];
        uint64_t f4 = (f2 & mask57) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 57];
        uint64_t f6 = (f4 & mask57) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 57];
        uint64_t f8 = (f6 & mask57) * 100;
        *(struct pair *)(b + 8) = dd[f8 >> 57];
        b += 10;
    }
    else
    {
        uint32_t y = u % (uint32_t)(1e8);
        u /= (uint32_t)(1e8);

        // u is 2, 3, or 4 digits (if u < 10 it would have been handled above)
        if (u < (uint32_t)(1e2))
        {
            *(struct pair *)(b) = dd[u];
            b += 2;
        }
        else
        {
            uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * u;
            *(struct pair *)(b) = fd[f0 >> 24];
            b -= u < (uint32_t)(1e3);
            uint32_t f2 = (f0 & mask24) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 24];
            b += 4;
        }
        // do 8 digits
        uint64_t f0 = ((uint64_t)((1ull << 48ull) / 1e6 + 1) * y >> 16) + 1;
        *(struct pair *)(b) = dd[f0 >> 32];
        uint64_t f2 = (f0 & mask32) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 32];
        uint64_t f4 = (f2 & mask32) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 32];
        uint64_t f6 = (f4 & mask32) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 32];
        b += 8;
    }
    // do 8 digits
    uint64_t f0 = ((uint64_t)((1ull << 48ull) / 1e6 + 1) * z >> 16) + 1;
    *(struct pair *)(b) = dd[f0 >> 32];
    uint64_t f2 = (f0 & mask32) * 100;
    *(struct pair *)(b + 2) = dd[f2 >> 32];
    uint64_t f4 = (f2 & mask32) * 100;
    *(struct pair *)(b + 4) = dd[f4 >> 32];
    uint64_t f6 = (f4 & mask32) * 100;
    *(struct pair *)(b + 6) = dd[f6 >> 32];
    return b + 8;
}

void main(void) {
    char work[20];
    long sum = 0;
    for (unsigned int i = 0; i < 100000000; i++) {
        u64toa_jeaiii(i, work);
        sum += work[0];
    }
    printf("sum = %ld\n", sum);
}

jeaiii commented 4 months ago

"benchmarking is hard"

My suspicion is the optimizer is interfering with your simple benchmark to different degrees as seen in this godbolt:

https://godbolt.org/z/EEsYPGzrs

Since the first code is simpler the optimizer can "see" into it more and produce the sum (which in theory the compiler could compute fully at compile time) with less work than compared to the second code. Both however aren't doing the full amount of work you would expect if the number being converted was completely unknown at compile time

Sorry for the super later reply

jeaiii commented 4 months ago

with a few changes to make it constexpr friendly you can see clang calculates the result fully (if you lower the limit, otherwise godbolt times out)

https://godbolt.org/z/o1rocnvva

jeaiii commented 4 months ago

the 25% speed up refers to my benchmark's uniform random digit length category where I take equal quantities of numbers with digits length from 1 to 10 and shuffle them together so that the CPU can't predict how many digits each conversion will have. This is the worst case for algorithms that try to do less work based on how "large" the number is. For example converting only 10 digit numbers is twice as fast as converting random digit length numbers.

https://jeaiii.github.io/itoa/

MasterDuke17 commented 4 months ago

In a new wrinkle, when using gcc with -O3, it gives a different final value in my example (I looked at the objdump -S output and both were actually running the code). Gcc 14.1.1 prints sum = 4777499475, but clang 17.0.6 prints sum = 5249999475. Both print sum = 5249999475 with -O2. My example (compiled with -g -O(2|3) -Wall -Wextra -Wpedantic -Wno-missing-braces):

#include <stdio.h>
#include <stdint.h>

struct pair
{
    char dd[2];
};

static const struct pair dd[100] =
{
    {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'}, {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
    {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
    {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
    {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
    {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
    {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
    {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
    {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
    {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
    {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'},
};
static const struct pair fd[100] =
{
    {'0','\0'}, {'1','\0'}, {'2','\0'}, {'3','\0'}, {'4','\0'}, {'5','\0'}, {'6','\0'}, {'7','\0'}, {'8','\0'}, {'9','\0'},
    {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
    {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
    {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
    {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
    {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
    {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
    {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
    {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
    {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'},
};

static const uint64_t mask24 = ((uint64_t)(1) << 24) - 1;
static const uint64_t mask32 = ((uint64_t)(1) << 32) - 1;
static const uint64_t mask57 = ((uint64_t)(1) << 57) - 1;

static char * u64toa_jeaiii(uint64_t n, char* b) {
    if (n < (uint32_t)(1e2))
    {
        *(struct pair *)(b) = fd[n];
        return n < 10 ? b + 1 : b + 2;
    }
    if (n < (uint32_t)(1e6))
    {
        if (n < (uint32_t)(1e4))
        {
            uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * n;
            *(struct pair *)(b) = fd[f0 >> 24];
            b -= n < (uint32_t)(1e3);
            uint32_t f2 = (f0 & mask24) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 24];
            return b + 4;
        }
        uint64_t f0 = (uint64_t)(10 * (1ull << 32ull)/ 1e5 + 1) * n;
        *(struct pair *)(b) = fd[f0 >> 32];
        b -= n < (uint32_t)(1e5);
        uint64_t f2 = (f0 & mask32) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 32];
        uint64_t f4 = (f2 & mask32) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 32];
        return b + 6;
    }
    if (n < (uint64_t)(1ull << 32ull))
    {
        if (n < (uint32_t)(1e8))
        {
            uint64_t f0 = (uint64_t)(10 * (1ull << 48ull) / 1e7 + 1) * n >> 16;
            *(struct pair *)(b) = fd[f0 >> 32];
            b -= n < (uint32_t)(1e7);
            uint64_t f2 = (f0 & mask32) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 32];
            uint64_t f4 = (f2 & mask32) * 100;
            *(struct pair *)(b + 4) = dd[f4 >> 32];
            uint64_t f6 = (f4 & mask32) * 100;
            *(struct pair *)(b + 6) = dd[f6 >> 32];
            return b + 8;
        }
        uint64_t f0 = (uint64_t)(10 * (1ull << 57ull) / 1e9 + 1) * n;
        *(struct pair *)(b) = fd[f0 >> 57];
        b -= n < (uint32_t)(1e9);
        uint64_t f2 = (f0 & mask57) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 57];
        uint64_t f4 = (f2 & mask57) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 57];
        uint64_t f6 = (f4 & mask57) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 57];
        uint64_t f8 = (f6 & mask57) * 100;
        *(struct pair *)(b + 8) = dd[f8 >> 57];
        return b + 10;
    }

    // if we get here U must be (uint64_t) but some compilers don't know that, so reassign n to a (uint64_t) to avoid warnings
    uint32_t z = n % (uint32_t)(1e8);
    uint64_t u = n / (uint32_t)(1e8);

    if (u < (uint32_t)(1e2))
    {
        // u can't be 1 digit (if u < 10 it would have been handled above as a 9 digit 32bit number)
        *(struct pair *)(b) = dd[u];
        b += 2;
    }
    else if (u < (uint32_t)(1e6))
    {
        if (u < (uint32_t)(1e4))
        {
            uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * u;
            *(struct pair *)(b) = fd[f0 >> 24];
            b -= u < (uint32_t)(1e3);
            uint32_t f2 = (f0 & mask24) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 24];
            b += 4;
        }
        else
        {
            uint64_t f0 = (uint64_t)(10 * (1ull << 32ull) / 1e5 + 1) * u;
            *(struct pair *)(b) = fd[f0 >> 32];
            b -= u < (uint32_t)(1e5);
            uint64_t f2 = (f0 & mask32) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 32];
            uint64_t f4 = (f2 & mask32) * 100;
            *(struct pair *)(b + 4) = dd[f4 >> 32];
            b += 6;
        }
    }
    else if (u < (uint32_t)(1e8))
    {
        uint64_t f0 = (uint64_t)(10 * (1ull << 48ull) / 1e7 + 1) * u >> 16;
        *(struct pair *)(b) = fd[f0 >> 32];
        b -= u < (uint32_t)(1e7);
        uint64_t f2 = (f0 & mask32) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 32];
        uint64_t f4 = (f2 & mask32) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 32];
        uint64_t f6 = (f4 & mask32) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 32];
        b += 8;
    }
    else if (u < (uint64_t)(1ull << 32ull))
    {
        uint64_t f0 = (uint64_t)(10 * (1ull << 57ull) / 1e9 + 1) * u;
        *(struct pair *)(b) = fd[f0 >> 57];
        b -= u < (uint32_t)(1e9);
        uint64_t f2 = (f0 & mask57) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 57];
        uint64_t f4 = (f2 & mask57) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 57];
        uint64_t f6 = (f4 & mask57) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 57];
        uint64_t f8 = (f6 & mask57) * 100;
        *(struct pair *)(b + 8) = dd[f8 >> 57];
        b += 10;
    }
    else
    {
        uint32_t y = u % (uint32_t)(1e8);
        u /= (uint32_t)(1e8);

        // u is 2, 3, or 4 digits (if u < 10 it would have been handled above)
        if (u < (uint32_t)(1e2))
        {
            *(struct pair *)(b) = dd[u];
            b += 2;
        }
        else
        {
            uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * u;
            *(struct pair *)(b) = fd[f0 >> 24];
            b -= u < (uint32_t)(1e3);
            uint32_t f2 = (f0 & mask24) * 100;
            *(struct pair *)(b + 2) = dd[f2 >> 24];
            b += 4;
        }
        // do 8 digits
        uint64_t f0 = ((uint64_t)((1ull << 48ull) / 1e6 + 1) * y >> 16) + 1;
        *(struct pair *)(b) = dd[f0 >> 32];
        uint64_t f2 = (f0 & mask32) * 100;
        *(struct pair *)(b + 2) = dd[f2 >> 32];
        uint64_t f4 = (f2 & mask32) * 100;
        *(struct pair *)(b + 4) = dd[f4 >> 32];
        uint64_t f6 = (f4 & mask32) * 100;
        *(struct pair *)(b + 6) = dd[f6 >> 32];
        b += 8;
    }
    // do 8 digits
    uint64_t f0 = ((uint64_t)((1ull << 48ull) / 1e6 + 1) * z >> 16) + 1;
    *(struct pair *)(b) = dd[f0 >> 32];
    uint64_t f2 = (f0 & mask32) * 100;
    *(struct pair *)(b + 2) = dd[f2 >> 32];
    uint64_t f4 = (f2 & mask32) * 100;
    *(struct pair *)(b + 4) = dd[f4 >> 32];
    uint64_t f6 = (f4 & mask32) * 100;
    *(struct pair *)(b + 6) = dd[f6 >> 32];
    return b + 8;
}

int main(void) {
    char work[20] = {0};
    long sum = 0;
    for (unsigned int i = 0; i < 100000000; i++) {
        u64toa_jeaiii(i, work);
        sum += work[1];
    }
    return printf("sum = %ld\n", sum);
}

jeaiii commented 4 months ago

well that's super sus. -O1 and no args also does 4777499475. Only gcc -O2 matches clang, and clang is the same at any optimization level...

MasterDuke17 commented 4 months ago

Yeah, and a simple C++ version gives 4777499475 for g++ with -O3, but 5249999475 for every other combination of -O and g++/clang++.

#include <cstdio>
#include <cstdint>
#include "jeaiii_to_text.h"

int main(void) {
    char work[20] = {0};
    long sum = 0;
    for (unsigned int i = 0; i < 100000000; i++) {
        u64toa_jeaiii(i, work);
        sum += work[1];
    }
    return printf("sum = %ld\n", sum);
}

MasterDuke17 commented 4 months ago

Huh. Adding -fsanitize=address to g++ with -O3 results in 5249999475. Same with -fsanitize=undefined, and neither reports any problems.

jeaiii commented 4 months ago

is there a bug bounty for gcc? :-)

https://godbolt.org/z/q3azMGdr8

MasterDuke17 commented 4 months ago

is there a bug bounty for gcc? :-)

godbolt.org/z/q3azMGdr8

It seems like 9.1 (going just by major versions) is the first where it doesn't print 0 and 0 at -O3. I've never reported a bug to gcc (and don't really understand the code and resulting assembly well enough to thoroughly explain what's going on), do you mind doing so?

jeaiii commented 4 months ago

I've only reported bugs for clang and msvc in the past, but will make an account so I can report this to gcc. I have found a few work arounds, but not sure yet which one is best. For C, I would suggest making 4 versions for int , unsigned int, long long, and unsigned long long

jeaiii / itoa

C conversion/adaptation of jeaiii_to_text.h compared to C conversion/adaptation of itoa_jeaiii.cpp not as much faster as hoped #17