gmo-ierae / low-latency-crypto-areion

This software provides a reference code of the low-latency cryptographic permutation algorithm Areion.
MIT License
17 stars 3 forks source link

Wrong output of the areion512 benchmark #2

Open jedisct1 opened 4 months ago

jedisct1 commented 4 months ago

The areion512 permutation benchmark divides the total number of cycles by 32:

        total_cycle /= NUMBER_OF_LOOPS;
        total_cycle /= 32;

That looks like a copy&paste error from areion256, and should probably be 64 instead.

yumi-sakemi commented 4 months ago

Hi, jedisct1!

Thank you for your feedback! We plan to address the issue you mentioned in the next release.

jedisct1 commented 4 months ago

Hi @yumi-sakemi

There's also issue with all the benchmarks.

When using a good optimizing compiler (I used zig cc -Ofast -march=native with Zig 0.13), the loops are optimized out. So, the printed number of cycles per bytes is very small but obviously incorrect.

A workaround is to swap the arguments in the loop (so that the compiler doesn't consider out an invariant) and print the state after the loop (so that the entire computation can't be optimized out).

diff --git a/benchmark/areion-benchmark.c b/benchmark/areion-benchmark.c
index c63cefd..169da3b 100644
--- a/benchmark/areion-benchmark.c
+++ b/benchmark/areion-benchmark.c
@@ -25,12 +25,17 @@ static void benchmark_primitives()
         ticks t0 = getticks();
         for (int i = 0; i < NUMBER_OF_LOOPS; i++) {
             permute_areion_256u8(out, in);
+            permute_areion_256u8(in, out);
         }
         ticks t1 = getticks();
         double total_cycle = elapsed(t1, t0);
-        total_cycle /= NUMBER_OF_LOOPS;
+        total_cycle /= NUMBER_OF_LOOPS * 2;
         total_cycle /= 32;
         printf("permute_areion_256u8: %g\n", total_cycle);
+        for (int i = 0; i < 32; i++) {
+           printf("%02x", in[i]);
+        }
+        puts("\n");
     }
     {
         uint8_t in[32];
@@ -40,12 +45,17 @@ static void benchmark_primitives()
         ticks t0 = getticks();
         for (int i = 0; i < NUMBER_OF_LOOPS; i++) {
             inverse_areion_256u8(out, in);
+            inverse_areion_256u8(in, out);
         }
         ticks t1 = getticks();
         double total_cycle = elapsed(t1, t0);
-        total_cycle /= NUMBER_OF_LOOPS;
+        total_cycle /= NUMBER_OF_LOOPS * 2;
         total_cycle /= 32;
         printf("inverse_areion_256u8: %g\n", total_cycle);
+        for (int i = 0; i < 32; i++) {
+           printf("%02x", in[i]);
+        }
+        puts("\n");
     }
     {
         uint8_t in[64];
@@ -54,13 +64,18 @@ static void benchmark_primitives()

         ticks t0 = getticks();
         for (int i = 0; i < NUMBER_OF_LOOPS; i++) {
-            permute_areion_512u8(out, in);
+            permute_areion_512u8(out, in);
+            permute_areion_512u8(in, out);
         }
         ticks t1 = getticks();
         double total_cycle = elapsed(t1, t0);
-        total_cycle /= NUMBER_OF_LOOPS;
-        total_cycle /= 32;
+        total_cycle /= NUMBER_OF_LOOPS * 2;
+        total_cycle /= 64;
         printf("permute_areion_512u8: %g\n", total_cycle);
+        for (int i = 0; i < 64; i++) {
+           printf("%02x", in[i]);
+        }
+        puts("\n");
     }
     {
         uint8_t in[64];
@@ -70,12 +85,17 @@ static void benchmark_primitives()
         ticks t0 = getticks();
         for (int i = 0; i < NUMBER_OF_LOOPS; i++) {
             inverse_areion_512u8(out, in);
+            inverse_areion_512u8(in, out);
         }
         ticks t1 = getticks();
         double total_cycle = elapsed(t1, t0);
-        total_cycle /= NUMBER_OF_LOOPS;
-        total_cycle /= 32;
+        total_cycle /= NUMBER_OF_LOOPS * 2;
+        total_cycle /= 64;
         printf("inverse_areion_512u8: %g\n", total_cycle);
+        for (int i = 0; i < 64; i++) {
+           printf("%02x", in[i]);
+        }
+        puts("\n");
     }
 }