Attached and below is a patch for SHA-1 using the ARM SHA extensions. It's another partial patch, and others will have to complete it (mainly the runtime dispatch; see the sketch after the diff).
The code came from the ARM ARM (the Architecture Reference Manual) and the experimental mbedtls implementation by Johannes Schneiders, Barry O'Rourke and Skip Hovsmith.
The dev boards used for testing were a Pine64 and a LeMaker HiKey; both have the CRC and Crypto extensions. Botan was configured with ./configure.py --cc={gcc|clang} --cc-abi="-march=armv8-a+crc+crypto -mtune=cortex-a53".
Be careful of using hwcaps to determine SHA extension availability, because it's only available on Linux; there is no equivalent on iOS or Windows Phone. The best way I found was a Unix signal handler to catch SIGILL, and a __try/__except block on Windows, because they work everywhere. It also avoids needing EL1 access to read an MSR.
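For reference, here is a minimal sketch of the Unix side of that probe (not part of the patch; try_arm_sha1 is a hypothetical helper, and this translation unit has to be built with +crypto so the intrinsic is emitted at all). The Windows version is the same idea with __try/__except around the intrinsic, catching EXCEPTION_ILLEGAL_INSTRUCTION.

#include <arm_neon.h>
#include <csetjmp>
#include <csignal>
#include <cstring>

namespace {

sigjmp_buf g_sigill_jmp;

void sigill_handler(int)
   {
   // Jump back into try_arm_sha1() when the probe instruction traps
   siglongjmp(g_sigill_jmp, 1);
   }

bool try_arm_sha1()
   {
   struct sigaction new_act, old_act;
   std::memset(&new_act, 0, sizeof(new_act));
   new_act.sa_handler = sigill_handler;
   sigaction(SIGILL, &new_act, &old_act);

   bool ok = false;
   if(sigsetjmp(g_sigill_jmp, 1) == 0)
      {
      // Execute one SHA-1 instruction; raises SIGILL without the extension
      uint32x4_t x = vdupq_n_u32(1);
      x = vsha1su1q_u32(x, x);
      // Consume the result so the intrinsic cannot be optimized away
      volatile uint32_t sink = vgetq_lane_u32(x, 0);
      (void)sink;
      ok = true;
      }

   sigaction(SIGILL, &old_act, nullptr); // restore the previous handler
   return ok;
   }

}

The result should be computed once and cached, since installing and removing a signal handler per call is not thread-safe or cheap.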
$ git diff > sha1.diff
$ cat sha1.diff
diff --git a/src/lib/hash/sha1/sha160.cpp b/src/lib/hash/sha1/sha160.cpp
index 735789cab..5d0d6e8f0 100644
--- a/src/lib/hash/sha1/sha160.cpp
+++ b/src/lib/hash/sha1/sha160.cpp
@@ -8,140 +8,208 @@
#include <botan/sha160.h>
#include <botan/cpuid.h>
-namespace Botan {
-
-namespace SHA1_F {
-
-namespace {
-
-/*
-* SHA-160 F1 Function
-*/
-inline void F1(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg)
- {
- E += (D ^ (B & (C ^ D))) + msg + 0x5A827999 + rotate_left(A, 5);
- B = rotate_left(B, 30);
- }
+// Ugly... ARM32/ARM64 Headers
+// As of Visual Studio 2015, Microsoft does not support ARM ACLE extensions
+#if defined(__arm__) || defined(__aarch32__) || defined(__arm64__) || defined(__aarch64__)
+# if defined(__GNUC__)
+# include <stdint.h>
+# endif
+# if defined(__ARM_NEON)
+# include <arm_neon.h>
+# endif
+// GCC and LLVM Clang, but not Apple Clang
+# if defined(__GNUC__) && !defined(__apple_build_version__)
+# if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO)
+# include <arm_acle.h>
+# endif
+# endif
+#endif // ARM32 and ARM64 Headers
+
-/*
-* SHA-160 F2 Function
-*/
-inline void F2(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg)
- {
- E += (B ^ C ^ D) + msg + 0x6ED9EBA1 + rotate_left(A, 5);
- B = rotate_left(B, 30);
- }
-
-/*
-* SHA-160 F3 Function
-*/
-inline void F3(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg)
- {
- E += ((B & C) | ((B | C) & D)) + msg + 0x8F1BBCDC + rotate_left(A, 5);
- B = rotate_left(B, 30);
- }
-
-/*
-* SHA-160 F4 Function
-*/
-inline void F4(uint32_t A, uint32_t& B, uint32_t C, uint32_t D, uint32_t& E, uint32_t msg)
- {
- E += (B ^ C ^ D) + msg + 0xCA62C1D6 + rotate_left(A, 5);
- B = rotate_left(B, 30);
- }
-
-}
-
-}
+namespace Botan {
/*
* SHA-160 Compression Function
*/
void SHA_160::compress_n(const uint8_t input[], size_t blocks)
{
- using namespace SHA1_F;
-
-#if defined(BOTAN_HAS_SHA1_SSE2)
- if(CPUID::has_sse2())
- {
- return sse2_compress_n(m_digest, input, blocks);
- }
-
-#endif
-
- uint32_t A = m_digest[0], B = m_digest[1], C = m_digest[2],
- D = m_digest[3], E = m_digest[4];
-
- m_W.resize(80);
-
- for(size_t i = 0; i != blocks; ++i)
- {
- load_be(m_W.data(), input, 16);
-
- for(size_t j = 16; j != 80; j += 8)
- {
- m_W[j ] = rotate_left((m_W[j-3] ^ m_W[j-8] ^ m_W[j-14] ^ m_W[j-16]), 1);
- m_W[j+1] = rotate_left((m_W[j-2] ^ m_W[j-7] ^ m_W[j-13] ^ m_W[j-15]), 1);
- m_W[j+2] = rotate_left((m_W[j-1] ^ m_W[j-6] ^ m_W[j-12] ^ m_W[j-14]), 1);
- m_W[j+3] = rotate_left((m_W[j ] ^ m_W[j-5] ^ m_W[j-11] ^ m_W[j-13]), 1);
- m_W[j+4] = rotate_left((m_W[j+1] ^ m_W[j-4] ^ m_W[j-10] ^ m_W[j-12]), 1);
- m_W[j+5] = rotate_left((m_W[j+2] ^ m_W[j-3] ^ m_W[j- 9] ^ m_W[j-11]), 1);
- m_W[j+6] = rotate_left((m_W[j+3] ^ m_W[j-2] ^ m_W[j- 8] ^ m_W[j-10]), 1);
- m_W[j+7] = rotate_left((m_W[j+4] ^ m_W[j-1] ^ m_W[j- 7] ^ m_W[j- 9]), 1);
- }
-
- F1(A, B, C, D, E, m_W[ 0]); F1(E, A, B, C, D, m_W[ 1]);
- F1(D, E, A, B, C, m_W[ 2]); F1(C, D, E, A, B, m_W[ 3]);
- F1(B, C, D, E, A, m_W[ 4]); F1(A, B, C, D, E, m_W[ 5]);
- F1(E, A, B, C, D, m_W[ 6]); F1(D, E, A, B, C, m_W[ 7]);
- F1(C, D, E, A, B, m_W[ 8]); F1(B, C, D, E, A, m_W[ 9]);
- F1(A, B, C, D, E, m_W[10]); F1(E, A, B, C, D, m_W[11]);
- F1(D, E, A, B, C, m_W[12]); F1(C, D, E, A, B, m_W[13]);
- F1(B, C, D, E, A, m_W[14]); F1(A, B, C, D, E, m_W[15]);
- F1(E, A, B, C, D, m_W[16]); F1(D, E, A, B, C, m_W[17]);
- F1(C, D, E, A, B, m_W[18]); F1(B, C, D, E, A, m_W[19]);
-
- F2(A, B, C, D, E, m_W[20]); F2(E, A, B, C, D, m_W[21]);
- F2(D, E, A, B, C, m_W[22]); F2(C, D, E, A, B, m_W[23]);
- F2(B, C, D, E, A, m_W[24]); F2(A, B, C, D, E, m_W[25]);
- F2(E, A, B, C, D, m_W[26]); F2(D, E, A, B, C, m_W[27]);
- F2(C, D, E, A, B, m_W[28]); F2(B, C, D, E, A, m_W[29]);
- F2(A, B, C, D, E, m_W[30]); F2(E, A, B, C, D, m_W[31]);
- F2(D, E, A, B, C, m_W[32]); F2(C, D, E, A, B, m_W[33]);
- F2(B, C, D, E, A, m_W[34]); F2(A, B, C, D, E, m_W[35]);
- F2(E, A, B, C, D, m_W[36]); F2(D, E, A, B, C, m_W[37]);
- F2(C, D, E, A, B, m_W[38]); F2(B, C, D, E, A, m_W[39]);
-
- F3(A, B, C, D, E, m_W[40]); F3(E, A, B, C, D, m_W[41]);
- F3(D, E, A, B, C, m_W[42]); F3(C, D, E, A, B, m_W[43]);
- F3(B, C, D, E, A, m_W[44]); F3(A, B, C, D, E, m_W[45]);
- F3(E, A, B, C, D, m_W[46]); F3(D, E, A, B, C, m_W[47]);
- F3(C, D, E, A, B, m_W[48]); F3(B, C, D, E, A, m_W[49]);
- F3(A, B, C, D, E, m_W[50]); F3(E, A, B, C, D, m_W[51]);
- F3(D, E, A, B, C, m_W[52]); F3(C, D, E, A, B, m_W[53]);
- F3(B, C, D, E, A, m_W[54]); F3(A, B, C, D, E, m_W[55]);
- F3(E, A, B, C, D, m_W[56]); F3(D, E, A, B, C, m_W[57]);
- F3(C, D, E, A, B, m_W[58]); F3(B, C, D, E, A, m_W[59]);
-
- F4(A, B, C, D, E, m_W[60]); F4(E, A, B, C, D, m_W[61]);
- F4(D, E, A, B, C, m_W[62]); F4(C, D, E, A, B, m_W[63]);
- F4(B, C, D, E, A, m_W[64]); F4(A, B, C, D, E, m_W[65]);
- F4(E, A, B, C, D, m_W[66]); F4(D, E, A, B, C, m_W[67]);
- F4(C, D, E, A, B, m_W[68]); F4(B, C, D, E, A, m_W[69]);
- F4(A, B, C, D, E, m_W[70]); F4(E, A, B, C, D, m_W[71]);
- F4(D, E, A, B, C, m_W[72]); F4(C, D, E, A, B, m_W[73]);
- F4(B, C, D, E, A, m_W[74]); F4(A, B, C, D, E, m_W[75]);
- F4(E, A, B, C, D, m_W[76]); F4(D, E, A, B, C, m_W[77]);
- F4(C, D, E, A, B, m_W[78]); F4(B, C, D, E, A, m_W[79]);
-
- A = (m_digest[0] += A);
- B = (m_digest[1] += B);
- C = (m_digest[2] += C);
- D = (m_digest[3] += D);
- E = (m_digest[4] += E);
-
- input += hash_block_size();
- }
+ uint32x4_t C0, C1, C2, C3;
+ uint32x4_t ABCD, ABCD_SAVED;
+ uint32_t E0, E0_SAVED, E1;
+
+ // Load the round constants and the current hash state
+ C0 = vdupq_n_u32(0x5A827999);
+ C1 = vdupq_n_u32(0x6ED9EBA1);
+ C2 = vdupq_n_u32(0x8F1BBCDC);
+ C3 = vdupq_n_u32(0xCA62C1D6);
+
+ ABCD = vld1q_u32(&m_digest[0]);
+ E0 = m_digest[4];
+
+ while (blocks)
+ {
+ uint32x4_t MSG0, MSG1, MSG2, MSG3;
+ uint32x4_t TMP0, TMP1;
+
+ // Save current hash
+ ABCD_SAVED = ABCD;
+ E0_SAVED = E0;
+
+ // Intermediate void* cast due to http://llvm.org/bugs/show_bug.cgi?id=20670
+ MSG0 = vld1q_u32((const uint32_t*)(void*)(input + 0));
+ MSG1 = vld1q_u32((const uint32_t*)(void*)(input + 16));
+ MSG2 = vld1q_u32((const uint32_t*)(void*)(input + 32));
+ MSG3 = vld1q_u32((const uint32_t*)(void*)(input + 48));
+
+ MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
+ MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
+ MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
+ MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
+
+ TMP0 = vaddq_u32(MSG0, C0);
+ TMP1 = vaddq_u32(MSG1, C0);
+
+ // Rounds 0-3
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C0);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 4-7
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C0);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+ MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+ // Rounds 8-11
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG0, C0);
+ MSG1 = vsha1su1q_u32(MSG1, MSG0);
+ MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+ // Rounds 12-15
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG1, C1);
+ MSG2 = vsha1su1q_u32(MSG2, MSG1);
+ MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+ // Rounds 16-19
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C1);
+ MSG3 = vsha1su1q_u32(MSG3, MSG2);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 20-23
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C1);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+ MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+ // Rounds 24-27
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG0, C1);
+ MSG1 = vsha1su1q_u32(MSG1, MSG0);
+ MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+ // Rounds 28-31
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG1, C1);
+ MSG2 = vsha1su1q_u32(MSG2, MSG1);
+ MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+ // Rounds 32-35
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C2);
+ MSG3 = vsha1su1q_u32(MSG3, MSG2);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 36-39
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C2);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+ MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+ // Rounds 40-43
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG0, C2);
+ MSG1 = vsha1su1q_u32(MSG1, MSG0);
+ MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+ // Rounds 44-47
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG1, C2);
+ MSG2 = vsha1su1q_u32(MSG2, MSG1);
+ MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+ // Rounds 48-51
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C2);
+ MSG3 = vsha1su1q_u32(MSG3, MSG2);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 52-55
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C3);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+ MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+ // Rounds 56-59
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG0, C3);
+ MSG1 = vsha1su1q_u32(MSG1, MSG0);
+ MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+ // Rounds 60-63
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG1, C3);
+ MSG2 = vsha1su1q_u32(MSG2, MSG1);
+ MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+ // Rounds 64-67
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+ TMP0 = vaddq_u32(MSG2, C3);
+ MSG3 = vsha1su1q_u32(MSG3, MSG2);
+ MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+ // Rounds 68-71
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+ TMP1 = vaddq_u32(MSG3, C3);
+ MSG0 = vsha1su1q_u32(MSG0, MSG3);
+
+ // Rounds 72-75
+ E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+
+ // Rounds 76-79
+ E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+ ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+
+ // Add state back
+ E0 += E0_SAVED;
+ ABCD = vaddq_u32(ABCD_SAVED, ABCD);
+
+ input += 64;
+ blocks--;
+ }
+
+ // Save digest
+ vst1q_u32(&m_digest[0], ABCD);
+ m_digest[4] = E0;
}
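What remains for whoever completes this is the runtime dispatch, so the portable F1..F4 path survives on chips without the Crypto extensions. A minimal sketch, mirroring the removed SSE2 check (BOTAN_HAS_SHA1_ARMV8, CPUID::has_arm_sha1() and armv8_compress_n() are hypothetical names; has_arm_sha1() would cache the result of the SIGILL/__try probe above):

void SHA_160::compress_n(const uint8_t input[], size_t blocks)
   {
#if defined(BOTAN_HAS_SHA1_ARMV8)
   // Hypothetical flag, fed by the SIGILL / __try probe
   if(CPUID::has_arm_sha1())
      {
      // The NEON body from the patch, moved into its own function
      return armv8_compress_n(m_digest, input, blocks);
      }
#endif

   // ... portable F1..F4 implementation from the old code ...
   }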
Here is the updated sha160.cpp and the diff packaged as a ZIP file: sha1_updated.zip