randombit / botan

Cryptography Toolkit
https://botan.randombit.net
BSD 2-Clause "Simplified" License
2.54k stars 562 forks source link

Add ARM SHA extension for SHA-224 and SHA-256 #841

Closed noloader closed 7 years ago

noloader commented 7 years ago

Attached, and included below, is a patch for SHA-224 and SHA-256 using the ARM SHA extensions. It's another partial patch, and others will have to complete it.

The code came from the ARM Architecture Reference Manual (ARM ARM) and the mbedtls experimental implementation by Johannes Schneiders, Barry O'Rourke and Skip Hovsmith.

The dev-boards used for testing were a Pine64 and LeMaker HiKey. Both have CRC and Crypto extensions. Botan was configured with ./configure.py --cc={gcc|clang} --cc-abi="-march=armv8-a+crc+crypto -mtune=cortex-a53".

Here are the relative numbers:

$ ./botan speed --msec=3000 SHA-1 SHA-224 SHA-256
SHA-160 [base] hash 84.630 MiB/sec (253.895 MiB in 3000.041 ms)
SHA-224 [base] hash 41.528 MiB/sec (124.586 MiB in 3000.026 ms)
SHA-256 [base] hash 41.549 MiB/sec (124.648 MiB in 3000.014 ms)
$ ./botan speed --msec=3000 SHA-1 SHA-224 SHA-256
SHA-160 [base] hash 87.312 MiB/sec (261.938 MiB in 3000.008 ms)
SHA-224 [base] hash 43.174 MiB/sec (129.523 MiB in 3000.025 ms)
SHA-256 [base] hash 43.151 MiB/sec (129.453 MiB in 3000.019 ms)
$ ./botan speed --msec=3000 SHA-1 SHA-224 SHA-256
SHA-160 [base] hash 281.095 MiB/sec (843.285 MiB in 3000.002 ms)
SHA-224 [base] hash 383.009 MiB/sec (1149.027 MiB in 3000.003 ms)
SHA-256 [base] hash 385.632 MiB/sec (1156.898 MiB in 3000.004 ms)
$ ./botan speed --msec=3000 SHA-1 SHA-224 SHA-256
SHA-160 [base] hash 421.620 MiB/sec (1264.859 MiB in 3000.001 ms)
SHA-224 [base] hash 398.778 MiB/sec (1196.336 MiB in 3000.002 ms)
SHA-256 [base] hash 398.780 MiB/sec (1196.344 MiB in 3000.006 ms)
$ ./botan speed --msec=3000 SHA-1 SHA-224 SHA-256
SHA-160 [base] hash 323.623 MiB/sec (970.871 MiB in 3000.007 ms)
SHA-224 [base] hash 521.434 MiB/sec (1564.305 MiB in 3000.003 ms)
SHA-256 [base] hash 521.383 MiB/sec (1564.148 MiB in 3000.001 ms)
$ ./botan speed --msec=3000 SHA-1 SHA-224 SHA-256
SHA-160 [base] hash 516.304 MiB/sec (1548.914 MiB in 3000.006 ms)
SHA-224 [base] hash 542.872 MiB/sec (1628.617 MiB in 3000.002 ms)
SHA-256 [base] hash 542.947 MiB/sec (1628.844 MiB in 3000.007 ms)

Be careful about using hwcaps to determine SHA extension availability, because hwcaps is only available on Linux. There is no equivalent on iOS and Windows Phone. The best way I found was a Unix signal handler that catches SIGILL, plus a __try/__except block on Windows, because together they work everywhere. It also avoids the need for EL1 to check an MSR.


$ git diff > sha256.diff
$ cat sha256.diff
diff --git a/src/lib/hash/sha2_32/sha2_32.cpp b/src/lib/hash/sha2_32/sha2_32.cpp
index 2a748a6aa..792e3dae2 100644
--- a/src/lib/hash/sha2_32/sha2_32.cpp
+++ b/src/lib/hash/sha2_32/sha2_32.cpp
@@ -8,42 +8,48 @@

 #include <botan/sha2_32.h>

+// Ugly... ARM32/ARM64 Headers
+// As of Visual Studio 2015, Microsoft does not support ARM ACLE extensions
+#if defined(__arm__) || defined(__aarch32__) || defined(__arm64__) || defined(__aarch64__)
+# if defined(__GNUC__)
+#  include <stdint.h>
+# endif
+# if defined(__ARM_NEON)
+#  include <arm_neon.h>
+# endif
+// GCC and LLVM Clang, but not Apple Clang
+# if defined(__GNUC__) && !defined(__apple_build_version__)
+#  if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO)
+#      include <arm_acle.h>
+#  endif
+# endif
+#endif  // ARM32 and ARM64 Headers
+
 namespace Botan {

 namespace {

 namespace SHA2_32 {

-/*
-* SHA-256 Rho Function
-*/
-inline uint32_t rho(uint32_t X, uint32_t rot1, uint32_t rot2, uint32_t rot3)
-   {
-   return (rotate_right(X, rot1) ^ rotate_right(X, rot2) ^
-           rotate_right(X, rot3));
-   }
-
-/*
-* SHA-256 Sigma Function
-*/
-inline uint32_t sigma(uint32_t X, uint32_t rot1, uint32_t rot2, uint32_t shift)
-   {
-   return (rotate_right(X, rot1) ^ rotate_right(X, rot2) ^ (X >> shift));
-   }
-
-/*
-* SHA-256 F1 Function
-*
-* Use a macro as many compilers won't inline a function this big,
-* even though it is much faster if inlined.
-*/
-#define SHA2_32_F(A, B, C, D, E, F, G, H, M1, M2, M3, M4, magic)   \
-   do {                                                            \
-      H += magic + rho(E, 6, 11, 25) + ((E & F) ^ (~E & G)) + M1;  \
-      D += H;                                                      \
-      H += rho(A, 2, 13, 22) + ((A & B) | ((A | B) & C));          \
-      M1 += sigma(M2, 17, 19, 10) + M3 + sigma(M4, 7, 18, 3);      \
-   } while(0);
+static const uint32_t K[] =
+{
+   0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
+   0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+   0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
+   0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+   0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
+   0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+   0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
+   0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+   0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
+   0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+   0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
+   0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+   0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
+   0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+   0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
+   0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
+};

 /*
 * SHA-224 / SHA-256 compression function
@@ -51,106 +57,164 @@ inline uint32_t sigma(uint32_t X, uint32_t rot1, uint32_t rot2, uint32_t shift)
 void compress(secure_vector<uint32_t>& digest,
               const uint8_t input[], size_t blocks)
    {
-   uint32_t A = digest[0], B = digest[1], C = digest[2],
-          D = digest[3], E = digest[4], F = digest[5],
-          G = digest[6], H = digest[7];
+   uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
+   uint32x4_t MSG0, MSG1, MSG2, MSG3;
+   uint32x4_t TMP0, TMP1, TMP2;
+
+   // Load initial values
+   STATE0 = vld1q_u32(&digest[0]);
+   STATE1 = vld1q_u32(&digest[4]);

-   for(size_t i = 0; i != blocks; ++i)
+   while (blocks)
       {
-      uint32_t W00 = load_be<uint32_t>(input,  0);
-      uint32_t W01 = load_be<uint32_t>(input,  1);
-      uint32_t W02 = load_be<uint32_t>(input,  2);
-      uint32_t W03 = load_be<uint32_t>(input,  3);
-      uint32_t W04 = load_be<uint32_t>(input,  4);
-      uint32_t W05 = load_be<uint32_t>(input,  5);
-      uint32_t W06 = load_be<uint32_t>(input,  6);
-      uint32_t W07 = load_be<uint32_t>(input,  7);
-      uint32_t W08 = load_be<uint32_t>(input,  8);
-      uint32_t W09 = load_be<uint32_t>(input,  9);
-      uint32_t W10 = load_be<uint32_t>(input, 10);
-      uint32_t W11 = load_be<uint32_t>(input, 11);
-      uint32_t W12 = load_be<uint32_t>(input, 12);
-      uint32_t W13 = load_be<uint32_t>(input, 13);
-      uint32_t W14 = load_be<uint32_t>(input, 14);
-      uint32_t W15 = load_be<uint32_t>(input, 15);
-
-      SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x428A2F98);
-      SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x71374491);
-      SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0xB5C0FBCF);
-      SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0xE9B5DBA5);
-      SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x3956C25B);
-      SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x59F111F1);
-      SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x923F82A4);
-      SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0xAB1C5ED5);
-      SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0xD807AA98);
-      SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0x12835B01);
-      SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0x243185BE);
-      SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0x550C7DC3);
-      SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0x72BE5D74);
-      SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0x80DEB1FE);
-      SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0x9BDC06A7);
-      SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0xC19BF174);
-      SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0xE49B69C1);
-      SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0xEFBE4786);
-      SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x0FC19DC6);
-      SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0x240CA1CC);
-      SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x2DE92C6F);
-      SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x4A7484AA);
-      SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x5CB0A9DC);
-      SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0x76F988DA);
-      SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0x983E5152);
-      SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0xA831C66D);
-      SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0xB00327C8);
-      SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0xBF597FC7);
-      SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0xC6E00BF3);
-      SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xD5A79147);
-      SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0x06CA6351);
-      SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0x14292967);
-      SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x27B70A85);
-      SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x2E1B2138);
-      SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x4D2C6DFC);
-      SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0x53380D13);
-      SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x650A7354);
-      SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x766A0ABB);
-      SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x81C2C92E);
-      SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0x92722C85);
-      SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0xA2BFE8A1);
-      SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0xA81A664B);
-      SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0xC24B8B70);
-      SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0xC76C51A3);
-      SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0xD192E819);
-      SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xD6990624);
-      SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0xF40E3585);
-      SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0x106AA070);
-      SHA2_32_F(A, B, C, D, E, F, G, H, W00, W14, W09, W01, 0x19A4C116);
-      SHA2_32_F(H, A, B, C, D, E, F, G, W01, W15, W10, W02, 0x1E376C08);
-      SHA2_32_F(G, H, A, B, C, D, E, F, W02, W00, W11, W03, 0x2748774C);
-      SHA2_32_F(F, G, H, A, B, C, D, E, W03, W01, W12, W04, 0x34B0BCB5);
-      SHA2_32_F(E, F, G, H, A, B, C, D, W04, W02, W13, W05, 0x391C0CB3);
-      SHA2_32_F(D, E, F, G, H, A, B, C, W05, W03, W14, W06, 0x4ED8AA4A);
-      SHA2_32_F(C, D, E, F, G, H, A, B, W06, W04, W15, W07, 0x5B9CCA4F);
-      SHA2_32_F(B, C, D, E, F, G, H, A, W07, W05, W00, W08, 0x682E6FF3);
-      SHA2_32_F(A, B, C, D, E, F, G, H, W08, W06, W01, W09, 0x748F82EE);
-      SHA2_32_F(H, A, B, C, D, E, F, G, W09, W07, W02, W10, 0x78A5636F);
-      SHA2_32_F(G, H, A, B, C, D, E, F, W10, W08, W03, W11, 0x84C87814);
-      SHA2_32_F(F, G, H, A, B, C, D, E, W11, W09, W04, W12, 0x8CC70208);
-      SHA2_32_F(E, F, G, H, A, B, C, D, W12, W10, W05, W13, 0x90BEFFFA);
-      SHA2_32_F(D, E, F, G, H, A, B, C, W13, W11, W06, W14, 0xA4506CEB);
-      SHA2_32_F(C, D, E, F, G, H, A, B, W14, W12, W07, W15, 0xBEF9A3F7);
-      SHA2_32_F(B, C, D, E, F, G, H, A, W15, W13, W08, W00, 0xC67178F2);
-
-      A = (digest[0] += A);
-      B = (digest[1] += B);
-      C = (digest[2] += C);
-      D = (digest[3] += D);
-      E = (digest[4] += E);
-      F = (digest[5] += F);
-      G = (digest[6] += G);
-      H = (digest[7] += H);
+      // Save current state
+      ABEF_SAVE = STATE0;
+      CDGH_SAVE = STATE1;
+
+         // Intermediate void* cast due to http://llvm.org/bugs/show_bug.cgi?id=20670
+      MSG0 = vld1q_u32((const uint32_t*)(void*)(input +  0));
+      MSG1 = vld1q_u32((const uint32_t*)(void*)(input + 16));
+      MSG2 = vld1q_u32((const uint32_t*)(void*)(input + 32));
+      MSG3 = vld1q_u32((const uint32_t*)(void*)(input + 48));
+
+      MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
+      MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
+      MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
+      MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
+
+      TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x00]));
+
+      // Rounds 0-3
+      MSG0 = vsha256su0q_u32(MSG0, MSG1);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x04]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
+
+      // Rounds 4-7
+      MSG1 = vsha256su0q_u32(MSG1, MSG2);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x08]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
+
+      // Rounds 8-11
+      MSG2 = vsha256su0q_u32(MSG2, MSG3);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x0c]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
+
+      // Rounds 12-15
+      MSG3 = vsha256su0q_u32(MSG3, MSG0);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x10]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
+
+      // Rounds 16-19
+      MSG0 = vsha256su0q_u32(MSG0, MSG1);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x14]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
+
+      // Rounds 20-23
+      MSG1 = vsha256su0q_u32(MSG1, MSG2);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x18]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
+
+      // Rounds 24-27
+      MSG2 = vsha256su0q_u32(MSG2, MSG3);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x1c]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
+
+      // Rounds 28-31
+      MSG3 = vsha256su0q_u32(MSG3, MSG0);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x20]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
+
+      // Rounds 32-35
+      MSG0 = vsha256su0q_u32(MSG0, MSG1);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x24]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
+
+      // Rounds 36-39
+      MSG1 = vsha256su0q_u32(MSG1, MSG2);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x28]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
+
+      // Rounds 40-43
+      MSG2 = vsha256su0q_u32(MSG2, MSG3);
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x2c]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+      MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
+
+      // Rounds 44-47
+      MSG3 = vsha256su0q_u32(MSG3, MSG0);
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG0, vld1q_u32(&K[0x30]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+      MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
+
+      // Rounds 48-51
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG1, vld1q_u32(&K[0x34]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+
+      // Rounds 52-55
+      TMP2 = STATE0;
+      TMP0 = vaddq_u32(MSG2, vld1q_u32(&K[0x38]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+
+      // Rounds 56-59
+      TMP2 = STATE0;
+      TMP1 = vaddq_u32(MSG3, vld1q_u32(&K[0x3c]));
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
+
+      // Rounds 60-63
+      TMP2 = STATE0;
+      STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
+      STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
+
+      // Add back to state
+      STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
+      STATE1 = vaddq_u32(STATE1, CDGH_SAVE);

       input += 64;
+      blocks--;
       }
-   }
+
+    // Save state
+    vst1q_u32(&digest[0], STATE0);
+    vst1q_u32(&digest[4], STATE1);
+    }

 }

Here are the updated sha2_32.cpp and the diff, packaged as a ZIP file.

sha2_32_updated.zip

noloader commented 7 years ago

Testing your integrated code on the compile farm's gcc117 shows some impressive numbers:

# Without -march=...
noloader@gcc117:~/botan> LD_LIBRARY_PATH=/opt/cfarm/gcc-latest/lib64 ./botan speed --msec=3000 SHA-1 SHA-256
SHA-160 [base] hash 207.170 MiB/sec (621.512 MiB in 3000.006 ms)
SHA-256 [base] hash 154.733 MiB/sec (464.199 MiB in 3000.000 ms)

# With -march=armv8-a+crc+crypto -mtune=cortex-a57
noloader@gcc117:~/botan> LD_LIBRARY_PATH=/opt/cfarm/gcc-latest/lib64 ./botan speed --msec=3000 SHA-1 SHA-256
SHA-160 [base] hash 812.616 MiB/sec (2437.848 MiB in 3000.000 ms)
SHA-256 [base] hash 738.528 MiB/sec (2215.586 MiB in 3000.005 ms)
randombit commented 7 years ago

Merged so closing. Thanks again!