Open jedisct1 opened 4 months ago
Hi, jedisct1!
Thank you for your feedback! We plan to address the issue you mentioned in the next release.
Hi @yumi-sakemi
There's also an issue with all the benchmarks.
When using a good optimizing compiler (I used `zig cc -Ofast -march=native`
with Zig 0.13), the loops are optimized out, so the printed number of cycles per byte is very small but obviously incorrect.
A workaround is to add a second call with the arguments swapped inside the loop (so that the compiler doesn't treat `out`
as a loop invariant) and to print the state after the loop (so that the entire computation can't be optimized out).
diff --git a/benchmark/areion-benchmark.c b/benchmark/areion-benchmark.c
index c63cefd..169da3b 100644
--- a/benchmark/areion-benchmark.c
+++ b/benchmark/areion-benchmark.c
@@ -25,12 +25,17 @@ static void benchmark_primitives()
ticks t0 = getticks();
for (int i = 0; i < NUMBER_OF_LOOPS; i++) {
permute_areion_256u8(out, in);
+ permute_areion_256u8(in, out);
}
ticks t1 = getticks();
double total_cycle = elapsed(t1, t0);
- total_cycle /= NUMBER_OF_LOOPS;
+ total_cycle /= NUMBER_OF_LOOPS * 2;
total_cycle /= 32;
printf("permute_areion_256u8: %g\n", total_cycle);
+ for (int i = 0; i < 32; i++) {
+ printf("%02x", in[i]);
+ }
+ puts("\n");
}
{
uint8_t in[32];
@@ -40,12 +45,17 @@ static void benchmark_primitives()
ticks t0 = getticks();
for (int i = 0; i < NUMBER_OF_LOOPS; i++) {
inverse_areion_256u8(out, in);
+ inverse_areion_256u8(in, out);
}
ticks t1 = getticks();
double total_cycle = elapsed(t1, t0);
- total_cycle /= NUMBER_OF_LOOPS;
+ total_cycle /= NUMBER_OF_LOOPS * 2;
total_cycle /= 32;
printf("inverse_areion_256u8: %g\n", total_cycle);
+ for (int i = 0; i < 32; i++) {
+ printf("%02x", in[i]);
+ }
+ puts("\n");
}
{
uint8_t in[64];
@@ -54,13 +64,18 @@ static void benchmark_primitives()
ticks t0 = getticks();
for (int i = 0; i < NUMBER_OF_LOOPS; i++) {
- permute_areion_512u8(out, in);
+ permute_areion_512u8(out, in);
+ permute_areion_512u8(in, out);
}
ticks t1 = getticks();
double total_cycle = elapsed(t1, t0);
- total_cycle /= NUMBER_OF_LOOPS;
- total_cycle /= 32;
+ total_cycle /= NUMBER_OF_LOOPS * 2;
+ total_cycle /= 64;
printf("permute_areion_512u8: %g\n", total_cycle);
+ for (int i = 0; i < 64; i++) {
+ printf("%02x", in[i]);
+ }
+ puts("\n");
}
{
uint8_t in[64];
@@ -70,12 +85,17 @@ static void benchmark_primitives()
ticks t0 = getticks();
for (int i = 0; i < NUMBER_OF_LOOPS; i++) {
inverse_areion_512u8(out, in);
+ inverse_areion_512u8(in, out);
}
ticks t1 = getticks();
double total_cycle = elapsed(t1, t0);
- total_cycle /= NUMBER_OF_LOOPS;
- total_cycle /= 32;
+ total_cycle /= NUMBER_OF_LOOPS * 2;
+ total_cycle /= 64;
printf("inverse_areion_512u8: %g\n", total_cycle);
+ for (int i = 0; i < 64; i++) {
+ printf("%02x", in[i]);
+ }
+ puts("\n");
}
}
The areion512 permutation benchmark divides the total number of cycles by 32 (`total_cycle /= 32;`).
That looks like a copy-and-paste error from areion256; since the areion512 block is 64 bytes, the divisor should probably be 64 instead.