Open MasterDuke17 opened 6 months ago
Huh, this is interesting. A standalone benchmark of my two C versions shows the newer one as slightly slower and almost double the instructions.
old (~0.137s and 2,282,158,658 instructions):
#include <stdio.h>
#include <stdint.h>
typedef struct pair { char t, o; } pair;
#define P(T) T, '0', T, '1', T, '2', T, '3', T, '4', T, '5', T, '6', T, '7', T, '8', T, '9'
static const pair s_pairs[] = { P('0'), P('1'), P('2'), P('3'), P('4'), P('5'), P('6'), P('7'), P('8'), P('9') };
#define W(N, I) *(pair*)&b[N] = s_pairs[I]
#define A(N) t = ((uint64_t)(1) << (32 + N / 5 * N * 53 / 16)) / (uint32_t)(1e##N) + 1 + N/6 - N/8, t *= u, t >>= N / 5 * N * 53 / 16, t += N / 6 * 4, W(0, t >> 32)
#define S(N) b[N] = (char)((uint64_t)(10) * (uint32_t)(t) >> 32) + '0'
#define D(N) t = (uint64_t)(100) * (uint32_t)(t), W(N, t >> 32)
#define L0 b[0] = (char)(u) + '0'
#define L1 W(0, u)
#define L2 A(1), S(2)
#define L3 A(2), D(2)
#define L4 A(3), D(2), S(4)
#define L5 A(4), D(2), D(4)
#define L6 A(5), D(2), D(4), S(6)
#define L7 A(6), D(2), D(4), D(6)
#define L8 A(7), D(2), D(4), D(6), S(8)
#define L9 A(8), D(2), D(4), D(6), D(8)
#define LN(N) (L##N, b += N + 1)
#define LZ LN
#define LG(F) (u<100 ? u<10 ? F(0) : F(1) : u<1000000 ? u<10000 ? u<1000 ? F(2) : F(3) : u<100000 ? F(4) : F(5) : u<100000000 ? u<10000000 ? F(6) : F(7) : u<1000000000 ? F(8) : F(9))
static char * u64toa_jeaiii(uint64_t n, char* b) {
uint32_t u;
uint64_t t;
if ((uint32_t)(n >> 32) == 0)
return u = (uint32_t)(n), LG(LZ);
uint64_t a = n / 100000000;
if ((uint32_t)(a >> 32) == 0) {
u = (uint32_t)(a);
LG(LN);
}
else {
u = (uint32_t)(a / 100000000);
LG(LN);
u = a % 100000000;
LN(7);
}
u = n % 100000000;
return LZ(7);
}
void main(void) {
char work[20];
long sum = 0;
for (unsigned int i = 0; i < 100000000; i++) {
u64toa_jeaiii(i, work);
sum += work[0];
}
printf("sum = %ld\n", sum);
}
new (~0.188s and 4,094,107,679 instructions):
#include <stdio.h>
#include <stdint.h>
struct pair
{
char dd[2];
};
static const struct pair dd[100] =
{
{'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'}, {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
{'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
{'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
{'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
{'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
{'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
{'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
{'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
{'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
{'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'},
};
static const struct pair fd[100] =
{
{'0','\0'}, {'1','\0'}, {'2','\0'}, {'3','\0'}, {'4','\0'}, {'5','\0'}, {'6','\0'}, {'7','\0'}, {'8','\0'}, {'9','\0'},
{'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
{'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
{'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
{'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
{'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
{'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
{'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
{'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
{'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'},
};
static const uint64_t mask24 = ((uint64_t)(1) << 24) - 1;
static const uint64_t mask32 = ((uint64_t)(1) << 32) - 1;
static const uint64_t mask57 = ((uint64_t)(1) << 57) - 1;
static char * u64toa_jeaiii(uint64_t n, char* b) {
if (n < (uint32_t)(1e2))
{
*(struct pair *)(b) = fd[n];
return n < 10 ? b + 1 : b + 2;
}
if (n < (uint32_t)(1e6))
{
if (n < (uint32_t)(1e4))
{
uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * n;
*(struct pair *)(b) = fd[f0 >> 24];
b -= n < (uint32_t)(1e3);
uint32_t f2 = (f0 & mask24) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 24];
return b + 4;
}
uint64_t f0 = (uint64_t)(10 * (1ull << 32ull)/ 1e5 + 1) * n;
*(struct pair *)(b) = fd[f0 >> 32];
b -= n < (uint32_t)(1e5);
uint64_t f2 = (f0 & mask32) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 32];
uint64_t f4 = (f2 & mask32) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 32];
return b + 6;
}
if (n < (uint64_t)(1ull << 32ull))
{
if (n < (uint32_t)(1e8))
{
uint64_t f0 = (uint64_t)(10 * (1ull << 48ull) / 1e7 + 1) * n >> 16;
*(struct pair *)(b) = fd[f0 >> 32];
b -= n < (uint32_t)(1e7);
uint64_t f2 = (f0 & mask32) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 32];
uint64_t f4 = (f2 & mask32) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 32];
uint64_t f6 = (f4 & mask32) * 100;
*(struct pair *)(b + 6) = dd[f6 >> 32];
return b + 8;
}
uint64_t f0 = (uint64_t)(10 * (1ull << 57ull) / 1e9 + 1) * n;
*(struct pair *)(b) = fd[f0 >> 57];
b -= n < (uint32_t)(1e9);
uint64_t f2 = (f0 & mask57) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 57];
uint64_t f4 = (f2 & mask57) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 57];
uint64_t f6 = (f4 & mask57) * 100;
*(struct pair *)(b + 6) = dd[f6 >> 57];
uint64_t f8 = (f6 & mask57) * 100;
*(struct pair *)(b + 8) = dd[f8 >> 57];
return b + 10;
}
// if we get here U must be (uint64_t) but some compilers don't know that, so reassign n to a (uint64_t) to avoid warnings
uint32_t z = n % (uint32_t)(1e8);
uint64_t u = n / (uint32_t)(1e8);
if (u < (uint32_t)(1e2))
{
// u can't be 1 digit (if u < 10 it would have been handled above as a 9 digit 32bit number)
*(struct pair *)(b) = dd[u];
b += 2;
}
else if (u < (uint32_t)(1e6))
{
if (u < (uint32_t)(1e4))
{
uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * u;
*(struct pair *)(b) = fd[f0 >> 24];
b -= u < (uint32_t)(1e3);
uint32_t f2 = (f0 & mask24) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 24];
b += 4;
}
else
{
uint64_t f0 = (uint64_t)(10 * (1ull << 32ull) / 1e5 + 1) * u;
*(struct pair *)(b) = fd[f0 >> 32];
b -= u < (uint32_t)(1e5);
uint64_t f2 = (f0 & mask32) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 32];
uint64_t f4 = (f2 & mask32) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 32];
b += 6;
}
}
else if (u < (uint32_t)(1e8))
{
uint64_t f0 = (uint64_t)(10 * (1ull << 48ull) / 1e7 + 1) * u >> 16;
*(struct pair *)(b) = fd[f0 >> 32];
b -= u < (uint32_t)(1e7);
uint64_t f2 = (f0 & mask32) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 32];
uint64_t f4 = (f2 & mask32) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 32];
uint64_t f6 = (f4 & mask32) * 100;
*(struct pair *)(b + 6) = dd[f6 >> 32];
b += 8;
}
else if (u < (uint64_t)(1ull << 32ull))
{
uint64_t f0 = (uint64_t)(10 * (1ull << 57ull) / 1e9 + 1) * u;
*(struct pair *)(b) = fd[f0 >> 57];
b -= u < (uint32_t)(1e9);
uint64_t f2 = (f0 & mask57) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 57];
uint64_t f4 = (f2 & mask57) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 57];
uint64_t f6 = (f4 & mask57) * 100;
*(struct pair *)(b + 6) = dd[f6 >> 57];
uint64_t f8 = (f6 & mask57) * 100;
*(struct pair *)(b + 8) = dd[f8 >> 57];
b += 10;
}
else
{
uint32_t y = u % (uint32_t)(1e8);
u /= (uint32_t)(1e8);
// u is 2, 3, or 4 digits (if u < 10 it would have been handled above)
if (u < (uint32_t)(1e2))
{
*(struct pair *)(b) = dd[u];
b += 2;
}
else
{
uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * u;
*(struct pair *)(b) = fd[f0 >> 24];
b -= u < (uint32_t)(1e3);
uint32_t f2 = (f0 & mask24) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 24];
b += 4;
}
// do 8 digits
uint64_t f0 = ((uint64_t)((1ull << 48ull) / 1e6 + 1) * y >> 16) + 1;
*(struct pair *)(b) = dd[f0 >> 32];
uint64_t f2 = (f0 & mask32) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 32];
uint64_t f4 = (f2 & mask32) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 32];
uint64_t f6 = (f4 & mask32) * 100;
*(struct pair *)(b + 6) = dd[f6 >> 32];
b += 8;
}
// do 8 digits
uint64_t f0 = ((uint64_t)((1ull << 48ull) / 1e6 + 1) * z >> 16) + 1;
*(struct pair *)(b) = dd[f0 >> 32];
uint64_t f2 = (f0 & mask32) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 32];
uint64_t f4 = (f2 & mask32) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 32];
uint64_t f6 = (f4 & mask32) * 100;
*(struct pair *)(b + 6) = dd[f6 >> 32];
return b + 8;
}
void main(void) {
char work[20];
long sum = 0;
for (unsigned int i = 0; i < 100000000; i++) {
u64toa_jeaiii(i, work);
sum += work[0];
}
printf("sum = %ld\n", sum);
}
"benchmarking is hard"
My suspicion is the optimizer is interfering with your simple benchmark to different degrees as seen in this godbolt:
https://godbolt.org/z/EEsYPGzrs
Since the first code is simpler the optimizer can "see" into it more and produce the sum (which in theory the compiler could compute fully at compile time) with less work than compared to the second code. Both however aren't doing the full amount of work you would expect if the number being converted was completely unknown at compile time
Sorry for the super later reply
with a few changes to make it constexpr
friendly you can see clang calculates the result fully (if you lower the limit, otherwise godbolt times out)
the 25% speed up refers to my benchmark's uniform random digit length category where I take equal quantities of numbers with digits length from 1 to 10 and shuffle them together so that the CPU can't predict how many digits each conversion will have. This is the worst case for algorithms that try to do less work based on how "large" the number is. For example converting only 10 digit numbers is twice as fast as converting random digit length numbers.
In a new wrinkle, when using gcc with -O3
, it gives a different final value in my example (I looked at the objdump -S output and both were actually running the code). Gcc 14.1.1 prints sum = 4777499475
, but clang 17.0.6 prints sum = 5249999475
. Both print sum = 5249999475
with -O2
. My example (compiled with -g -O(2|3) -Wall -Wextra -Wpedantic -Wno-missing-braces
):
#include <stdio.h>
#include <stdint.h>
struct pair
{
char dd[2];
};
static const struct pair dd[100] =
{
{'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'}, {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
{'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
{'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
{'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
{'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
{'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
{'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
{'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
{'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
{'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'},
};
static const struct pair fd[100] =
{
{'0','\0'}, {'1','\0'}, {'2','\0'}, {'3','\0'}, {'4','\0'}, {'5','\0'}, {'6','\0'}, {'7','\0'}, {'8','\0'}, {'9','\0'},
{'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'}, {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
{'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'}, {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
{'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'}, {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
{'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'}, {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
{'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'}, {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
{'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'}, {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
{'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'}, {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
{'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'}, {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
{'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'}, {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'},
};
static const uint64_t mask24 = ((uint64_t)(1) << 24) - 1;
static const uint64_t mask32 = ((uint64_t)(1) << 32) - 1;
static const uint64_t mask57 = ((uint64_t)(1) << 57) - 1;
static char * u64toa_jeaiii(uint64_t n, char* b) {
if (n < (uint32_t)(1e2))
{
*(struct pair *)(b) = fd[n];
return n < 10 ? b + 1 : b + 2;
}
if (n < (uint32_t)(1e6))
{
if (n < (uint32_t)(1e4))
{
uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * n;
*(struct pair *)(b) = fd[f0 >> 24];
b -= n < (uint32_t)(1e3);
uint32_t f2 = (f0 & mask24) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 24];
return b + 4;
}
uint64_t f0 = (uint64_t)(10 * (1ull << 32ull)/ 1e5 + 1) * n;
*(struct pair *)(b) = fd[f0 >> 32];
b -= n < (uint32_t)(1e5);
uint64_t f2 = (f0 & mask32) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 32];
uint64_t f4 = (f2 & mask32) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 32];
return b + 6;
}
if (n < (uint64_t)(1ull << 32ull))
{
if (n < (uint32_t)(1e8))
{
uint64_t f0 = (uint64_t)(10 * (1ull << 48ull) / 1e7 + 1) * n >> 16;
*(struct pair *)(b) = fd[f0 >> 32];
b -= n < (uint32_t)(1e7);
uint64_t f2 = (f0 & mask32) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 32];
uint64_t f4 = (f2 & mask32) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 32];
uint64_t f6 = (f4 & mask32) * 100;
*(struct pair *)(b + 6) = dd[f6 >> 32];
return b + 8;
}
uint64_t f0 = (uint64_t)(10 * (1ull << 57ull) / 1e9 + 1) * n;
*(struct pair *)(b) = fd[f0 >> 57];
b -= n < (uint32_t)(1e9);
uint64_t f2 = (f0 & mask57) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 57];
uint64_t f4 = (f2 & mask57) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 57];
uint64_t f6 = (f4 & mask57) * 100;
*(struct pair *)(b + 6) = dd[f6 >> 57];
uint64_t f8 = (f6 & mask57) * 100;
*(struct pair *)(b + 8) = dd[f8 >> 57];
return b + 10;
}
// if we get here U must be (uint64_t) but some compilers don't know that, so reassign n to a (uint64_t) to avoid warnings
uint32_t z = n % (uint32_t)(1e8);
uint64_t u = n / (uint32_t)(1e8);
if (u < (uint32_t)(1e2))
{
// u can't be 1 digit (if u < 10 it would have been handled above as a 9 digit 32bit number)
*(struct pair *)(b) = dd[u];
b += 2;
}
else if (u < (uint32_t)(1e6))
{
if (u < (uint32_t)(1e4))
{
uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * u;
*(struct pair *)(b) = fd[f0 >> 24];
b -= u < (uint32_t)(1e3);
uint32_t f2 = (f0 & mask24) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 24];
b += 4;
}
else
{
uint64_t f0 = (uint64_t)(10 * (1ull << 32ull) / 1e5 + 1) * u;
*(struct pair *)(b) = fd[f0 >> 32];
b -= u < (uint32_t)(1e5);
uint64_t f2 = (f0 & mask32) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 32];
uint64_t f4 = (f2 & mask32) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 32];
b += 6;
}
}
else if (u < (uint32_t)(1e8))
{
uint64_t f0 = (uint64_t)(10 * (1ull << 48ull) / 1e7 + 1) * u >> 16;
*(struct pair *)(b) = fd[f0 >> 32];
b -= u < (uint32_t)(1e7);
uint64_t f2 = (f0 & mask32) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 32];
uint64_t f4 = (f2 & mask32) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 32];
uint64_t f6 = (f4 & mask32) * 100;
*(struct pair *)(b + 6) = dd[f6 >> 32];
b += 8;
}
else if (u < (uint64_t)(1ull << 32ull))
{
uint64_t f0 = (uint64_t)(10 * (1ull << 57ull) / 1e9 + 1) * u;
*(struct pair *)(b) = fd[f0 >> 57];
b -= u < (uint32_t)(1e9);
uint64_t f2 = (f0 & mask57) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 57];
uint64_t f4 = (f2 & mask57) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 57];
uint64_t f6 = (f4 & mask57) * 100;
*(struct pair *)(b + 6) = dd[f6 >> 57];
uint64_t f8 = (f6 & mask57) * 100;
*(struct pair *)(b + 8) = dd[f8 >> 57];
b += 10;
}
else
{
uint32_t y = u % (uint32_t)(1e8);
u /= (uint32_t)(1e8);
// u is 2, 3, or 4 digits (if u < 10 it would have been handled above)
if (u < (uint32_t)(1e2))
{
*(struct pair *)(b) = dd[u];
b += 2;
}
else
{
uint32_t f0 = (uint32_t)(10 * (1 << 24) / 1e3 + 1) * u;
*(struct pair *)(b) = fd[f0 >> 24];
b -= u < (uint32_t)(1e3);
uint32_t f2 = (f0 & mask24) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 24];
b += 4;
}
// do 8 digits
uint64_t f0 = ((uint64_t)((1ull << 48ull) / 1e6 + 1) * y >> 16) + 1;
*(struct pair *)(b) = dd[f0 >> 32];
uint64_t f2 = (f0 & mask32) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 32];
uint64_t f4 = (f2 & mask32) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 32];
uint64_t f6 = (f4 & mask32) * 100;
*(struct pair *)(b + 6) = dd[f6 >> 32];
b += 8;
}
// do 8 digits
uint64_t f0 = ((uint64_t)((1ull << 48ull) / 1e6 + 1) * z >> 16) + 1;
*(struct pair *)(b) = dd[f0 >> 32];
uint64_t f2 = (f0 & mask32) * 100;
*(struct pair *)(b + 2) = dd[f2 >> 32];
uint64_t f4 = (f2 & mask32) * 100;
*(struct pair *)(b + 4) = dd[f4 >> 32];
uint64_t f6 = (f4 & mask32) * 100;
*(struct pair *)(b + 6) = dd[f6 >> 32];
return b + 8;
}
int main(void) {
char work[20] = {0};
long sum = 0;
for (unsigned int i = 0; i < 100000000; i++) {
u64toa_jeaiii(i, work);
sum += work[1];
}
return printf("sum = %ld\n", sum);
}
well that's super sus. -O1 and no args also does 4777499475. Only gcc -O2 matches clang, and clang is the same at any optimization level...
Yeah, and a simple C++ version gives 4777499475 for g++ with -O3, but 5249999475 for every other combination of -O and g++/clang++.
#include <cstdio>
#include <cstdint>
#include "jeaiii_to_text.h"
int main(void) {
char work[20] = {0};
long sum = 0;
for (unsigned int i = 0; i < 100000000; i++) {
u64toa_jeaiii(i, work);
sum += work[1];
}
return printf("sum = %ld\n", sum);
}
Huh. Adding -fsanitize=address
to g++ with -O3 results in 5249999475. Same with -fsanitize=undefined
, and neither reports any problems.
is there a bug bounty for gcc? :-)
is there a bug bounty for gcc? :-)
It seems like 9.1 (going just by major versions) is the first where it doesn't print 0 and 0 at -O3. I've never reported a bug to gcc (and don't really understand the code and resulting assembly well enough to thoroughly explain what's going on), do you mind doing so?
I've only reported bugs for clang and msvc in the past, but will make an account so I can report this to gcc. I have found a few work arounds, but not sure yet which one is best. For C, I would suggest making 4 versions for int
, unsigned int
, long long
, and unsigned long long
See https://github.com/MoarVM/MoarVM/pull/1800. Obviously there's lots of overhead, but the readme mentioned a 25% speedup for the new version, and I'm seeing nowhere near that (with gcc 13.2.1 in linux, compiling with
-O3 -march=native
, on a Zen 2). Do you see anything obviously problematic in my conversion?