Attached and below is a patch for AES using the ARMv8 crypto extensions. It's another partial patch, and it hijacks the C++ implementation. Others will have to complete it.
The code came from a few sources, including Crypto++ and the experimental mbedTLS implementation by Johannes Schneiders, Barry O'Rourke and Skip Hovsmith. An early paper by Cynthia Crutchfield was also used as a reference. mbedTLS was especially helpful because it was a working implementation, which let us examine state under a debugger.
Here are the numbers from GCC117 (compile farm), which is a 2.0 GHz AMD Opteron (an ARMv8 part). If my calculations are correct, Botan is pushing data at about 1.4 cpb for AES-128 on the Opteron; cycles per byte is clock rate divided by throughput, so 2.048/1.414 ≈ 1.45.
Botan was configured with ./configure.py --cc=gcc --cc-bin=/opt/cfarm/gcc-latest/bin/c++ --cc-abi="-march=armv8-a+crypto". These options defined __ARM_FEATURE_CRYPTO and made the intrinsics available.
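For anyone completing the patch, here is a minimal sketch (mine, not part of the patch) of the two guards the finished version will need: the compile-time gate that -march=armv8-a+crypto enables, and a runtime probe so the table-based code can still serve CPUs without the extension. The getauxval/HWCAP_AES probe is Linux/AArch64 specific and is my assumption; in Botan proper this belongs in the CPUID class.

#if defined(__ARM_FEATURE_CRYPTO)
# include <arm_neon.h>
#endif

#if defined(__linux__) && defined(__aarch64__)
# include <sys/auxv.h>   // getauxval
# include <asm/hwcap.h>  // HWCAP_AES
#endif

// Compiling with -march=armv8-a+crypto only proves the compiler knows the
// instructions; the CPU must still be asked whether it implements them.
bool cpu_has_armv8_aes()
   {
#if defined(__linux__) && defined(__aarch64__)
   return (getauxval(AT_HWCAP) & HWCAP_AES) != 0;
#else
   return false;  // conservative default; other platforms need their own probe
#endif
   }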
$ git diff > aes.diff
$ cat aes.diff
diff --git a/src/lib/block/aes/aes.cpp b/src/lib/block/aes/aes.cpp
index 21228e0c1..30da4ce00 100644
--- a/src/lib/block/aes/aes.cpp
+++ b/src/lib/block/aes/aes.cpp
@@ -12,6 +12,11 @@
#include <botan/cpuid.h>
#include <botan/internal/bit_ops.h>
+#if defined(__ARM_FEATURE_CRYPTO)
+# include <arm_neon.h>
+# include <arm_acle.h>
+#endif
+
/*
* This implementation is based on table lookups which are known to be
* vulnerable to timing and cache based side channel attacks. Some
@@ -156,95 +161,51 @@ void aes_encrypt_n(const uint8_t in[], uint8_t out[],
{
BOTAN_ASSERT(EK.size() && ME.size() == 16, "Key was set");
- const size_t cache_line_size = CPUID::cache_line_size();
-
- const std::vector<uint32_t>& TE = AES_TE();
+ // For an N round cipher there are N+1 subkeys.
+ // Subkeys 1 to N are in EK; subkey N+1 is in ME (1-based numbering).
+ const uint8_t *skey = reinterpret_cast<const uint8_t*>(EK.data());
+ const uint8_t *mkey = reinterpret_cast<const uint8_t*>(ME.data());
+ const size_t rounds = EK.size()/4;
- // Hit every cache line of TE
- uint32_t Z = 0;
- for(size_t i = 0; i < TE.size(); i += cache_line_size / sizeof(uint32_t))
+ for (size_t k=0; k<blocks; ++k)
{
- Z |= TE[i];
- }
- Z &= TE[82]; // this is zero, which hopefully the compiler cannot deduce
-
- BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i)
- {
- uint32_t T0, T1, T2, T3;
- load_be(in + 16*i, T0, T1, T2, T3);
-
- T0 ^= EK[0];
- T1 ^= EK[1];
- T2 ^= EK[2];
- T3 ^= EK[3];
-
- T0 ^= Z;
-
- /* Use only the first 256 entries of the TE table and do the
- * rotations directly in the code. This reduces the number of
- * cache lines potentially used in the first round from 64 to 16
- * (assuming a typical 64 byte cache line), which makes timing
- * attacks a little harder; the first round is particularly
- * vulnerable.
- */
-
- uint32_t B0 = TE[get_byte(0, T0)] ^
- rotate_right(TE[get_byte(1, T1)], 8) ^
- rotate_right(TE[get_byte(2, T2)], 16) ^
- rotate_right(TE[get_byte(3, T3)], 24) ^ EK[4];
-
- uint32_t B1 = TE[get_byte(0, T1)] ^
- rotate_right(TE[get_byte(1, T2)], 8) ^
- rotate_right(TE[get_byte(2, T3)], 16) ^
- rotate_right(TE[get_byte(3, T0)], 24) ^ EK[5];
-
- uint32_t B2 = TE[get_byte(0, T2)] ^
- rotate_right(TE[get_byte(1, T3)], 8) ^
- rotate_right(TE[get_byte(2, T0)], 16) ^
- rotate_right(TE[get_byte(3, T1)], 24) ^ EK[6];
-
- uint32_t B3 = TE[get_byte(0, T3)] ^
- rotate_right(TE[get_byte(1, T0)], 8) ^
- rotate_right(TE[get_byte(2, T1)], 16) ^
- rotate_right(TE[get_byte(3, T2)], 24) ^ EK[7];
-
- for(size_t r = 2*4; r < EK.size(); r += 2*4)
+ uint8x16_t data = vld1q_u8(in+16*k);
+
+ // Unrolling the loop gains 0.3 to 0.5 cpb.
+ data = vaeseq_u8(data, vld1q_u8(skey+0));
+ data = vaesmcq_u8(data);
+ data = vaeseq_u8(data, vld1q_u8(skey+16));
+ data = vaesmcq_u8(data);
+ data = vaeseq_u8(data, vld1q_u8(skey+32));
+ data = vaesmcq_u8(data);
+ data = vaeseq_u8(data, vld1q_u8(skey+48));
+ data = vaesmcq_u8(data);
+ data = vaeseq_u8(data, vld1q_u8(skey+64));
+ data = vaesmcq_u8(data);
+ data = vaeseq_u8(data, vld1q_u8(skey+80));
+ data = vaesmcq_u8(data);
+ data = vaeseq_u8(data, vld1q_u8(skey+96));
+ data = vaesmcq_u8(data);
+ data = vaeseq_u8(data, vld1q_u8(skey+112));
+ data = vaesmcq_u8(data);
+ data = vaeseq_u8(data, vld1q_u8(skey+128));
+ data = vaesmcq_u8(data);
+
+ unsigned int i=9;
+ for ( ; i<rounds-1; ++i) // remaining rounds for AES-192/AES-256
{
- T0 = EK[r ] ^ TE[get_byte(0, B0) ] ^ TE[get_byte(1, B1) + 256] ^
- TE[get_byte(2, B2) + 512] ^ TE[get_byte(3, B3) + 768];
- T1 = EK[r+1] ^ TE[get_byte(0, B1) ] ^ TE[get_byte(1, B2) + 256] ^
- TE[get_byte(2, B3) + 512] ^ TE[get_byte(3, B0) + 768];
- T2 = EK[r+2] ^ TE[get_byte(0, B2) ] ^ TE[get_byte(1, B3) + 256] ^
- TE[get_byte(2, B0) + 512] ^ TE[get_byte(3, B1) + 768];
- T3 = EK[r+3] ^ TE[get_byte(0, B3) ] ^ TE[get_byte(1, B0) + 256] ^
- TE[get_byte(2, B1) + 512] ^ TE[get_byte(3, B2) + 768];
-
- B0 = EK[r+4] ^ TE[get_byte(0, T0) ] ^ TE[get_byte(1, T1) + 256] ^
- TE[get_byte(2, T2) + 512] ^ TE[get_byte(3, T3) + 768];
- B1 = EK[r+5] ^ TE[get_byte(0, T1) ] ^ TE[get_byte(1, T2) + 256] ^
- TE[get_byte(2, T3) + 512] ^ TE[get_byte(3, T0) + 768];
- B2 = EK[r+6] ^ TE[get_byte(0, T2) ] ^ TE[get_byte(1, T3) + 256] ^
- TE[get_byte(2, T0) + 512] ^ TE[get_byte(3, T1) + 768];
- B3 = EK[r+7] ^ TE[get_byte(0, T3) ] ^ TE[get_byte(1, T0) + 256] ^
- TE[get_byte(2, T1) + 512] ^ TE[get_byte(3, T2) + 768];
+ // AES single round encryption
+ data = vaeseq_u8(data, vld1q_u8(skey+i*16));
+ // AES mix columns
+ data = vaesmcq_u8(data);
}
- out[16*i+ 0] = SE[get_byte(0, B0)] ^ ME[0];
- out[16*i+ 1] = SE[get_byte(1, B1)] ^ ME[1];
- out[16*i+ 2] = SE[get_byte(2, B2)] ^ ME[2];
- out[16*i+ 3] = SE[get_byte(3, B3)] ^ ME[3];
- out[16*i+ 4] = SE[get_byte(0, B1)] ^ ME[4];
- out[16*i+ 5] = SE[get_byte(1, B2)] ^ ME[5];
- out[16*i+ 6] = SE[get_byte(2, B3)] ^ ME[6];
- out[16*i+ 7] = SE[get_byte(3, B0)] ^ ME[7];
- out[16*i+ 8] = SE[get_byte(0, B2)] ^ ME[8];
- out[16*i+ 9] = SE[get_byte(1, B3)] ^ ME[9];
- out[16*i+10] = SE[get_byte(2, B0)] ^ ME[10];
- out[16*i+11] = SE[get_byte(3, B1)] ^ ME[11];
- out[16*i+12] = SE[get_byte(0, B3)] ^ ME[12];
- out[16*i+13] = SE[get_byte(1, B0)] ^ ME[13];
- out[16*i+14] = SE[get_byte(2, B1)] ^ ME[14];
- out[16*i+15] = SE[get_byte(3, B2)] ^ ME[15];
+ // Final round: AESE only, no MixColumns
+ data = vaeseq_u8(data, vld1q_u8(skey+i*16));
+ // Final AddRoundKey (bitwise XOR with the last subkey)
+ data = veorq_u8(data, vld1q_u8(mkey+0));
+
+ vst1q_u8(out+16*k, data);
}
}
@@ -257,85 +218,51 @@ void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks,
{
BOTAN_ASSERT(DK.size() && MD.size() == 16, "Key was set");
- const size_t cache_line_size = CPUID::cache_line_size();
- const std::vector<uint32_t>& TD = AES_TD();
+ // For an N round cipher there are N+1 subkeys.
+ // Subkeys 1 to N are in DK; subkey N+1 is in MD (1-based numbering).
+ const uint8_t *skey = reinterpret_cast<const uint8_t*>(DK.data());
+ const uint8_t *mkey = reinterpret_cast<const uint8_t*>(MD.data());
+ const size_t rounds = DK.size()/4;
- uint32_t Z = 0;
- for(size_t i = 0; i < TD.size(); i += cache_line_size / sizeof(uint32_t))
+ for (size_t k=0; k<blocks; ++k)
+ {
+ uint8x16_t data = vld1q_u8(in+16*k);
+
+ // Unrolling the loop gains 0.3 to 0.5 cpb.
+ data = vaesdq_u8(data, vld1q_u8(skey+0));
+ data = vaesimcq_u8(data);
+ data = vaesdq_u8(data, vld1q_u8(skey+16));
+ data = vaesimcq_u8(data);
+ data = vaesdq_u8(data, vld1q_u8(skey+32));
+ data = vaesimcq_u8(data);
+ data = vaesdq_u8(data, vld1q_u8(skey+48));
+ data = vaesimcq_u8(data);
+ data = vaesdq_u8(data, vld1q_u8(skey+64));
+ data = vaesimcq_u8(data);
+ data = vaesdq_u8(data, vld1q_u8(skey+80));
+ data = vaesimcq_u8(data);
+ data = vaesdq_u8(data, vld1q_u8(skey+96));
+ data = vaesimcq_u8(data);
+ data = vaesdq_u8(data, vld1q_u8(skey+112));
+ data = vaesimcq_u8(data);
+ data = vaesdq_u8(data, vld1q_u8(skey+128));
+ data = vaesimcq_u8(data);
+
+ unsigned int i=9;
+ for ( ; i<rounds-1; ++i) // remaining rounds for AES-192/AES-256
{
- Z |= TD[i];
+ // AES single round decryption
+ data = vaesdq_u8(data, vld1q_u8(skey+i*16));
+ // AES inverse mix columns
+ data = vaesimcq_u8(data);
}
- Z &= TD[99]; // this is zero, which hopefully the compiler cannot deduce
- for(size_t i = 0; i != blocks; ++i)
- {
- uint32_t T0 = load_be<uint32_t>(in, 0) ^ DK[0];
- uint32_t T1 = load_be<uint32_t>(in, 1) ^ DK[1];
- uint32_t T2 = load_be<uint32_t>(in, 2) ^ DK[2];
- uint32_t T3 = load_be<uint32_t>(in, 3) ^ DK[3];
-
- T0 ^= Z;
-
- uint32_t B0 = TD[get_byte(0, T0)] ^
- rotate_right(TD[get_byte(1, T3)], 8) ^
- rotate_right(TD[get_byte(2, T2)], 16) ^
- rotate_right(TD[get_byte(3, T1)], 24) ^ DK[4];
-
- uint32_t B1 = TD[get_byte(0, T1)] ^
- rotate_right(TD[get_byte(1, T0)], 8) ^
- rotate_right(TD[get_byte(2, T3)], 16) ^
- rotate_right(TD[get_byte(3, T2)], 24) ^ DK[5];
-
- uint32_t B2 = TD[get_byte(0, T2)] ^
- rotate_right(TD[get_byte(1, T1)], 8) ^
- rotate_right(TD[get_byte(2, T0)], 16) ^
- rotate_right(TD[get_byte(3, T3)], 24) ^ DK[6];
-
- uint32_t B3 = TD[get_byte(0, T3)] ^
- rotate_right(TD[get_byte(1, T2)], 8) ^
- rotate_right(TD[get_byte(2, T1)], 16) ^
- rotate_right(TD[get_byte(3, T0)], 24) ^ DK[7];
-
- for(size_t r = 2*4; r < DK.size(); r += 2*4)
- {
- T0 = DK[r ] ^ TD[get_byte(0, B0) ] ^ TD[get_byte(1, B3) + 256] ^
- TD[get_byte(2, B2) + 512] ^ TD[get_byte(3, B1) + 768];
- T1 = DK[r+1] ^ TD[get_byte(0, B1) ] ^ TD[get_byte(1, B0) + 256] ^
- TD[get_byte(2, B3) + 512] ^ TD[get_byte(3, B2) + 768];
- T2 = DK[r+2] ^ TD[get_byte(0, B2) ] ^ TD[get_byte(1, B1) + 256] ^
- TD[get_byte(2, B0) + 512] ^ TD[get_byte(3, B3) + 768];
- T3 = DK[r+3] ^ TD[get_byte(0, B3) ] ^ TD[get_byte(1, B2) + 256] ^
- TD[get_byte(2, B1) + 512] ^ TD[get_byte(3, B0) + 768];
-
- B0 = DK[r+4] ^ TD[get_byte(0, T0) ] ^ TD[get_byte(1, T3) + 256] ^
- TD[get_byte(2, T2) + 512] ^ TD[get_byte(3, T1) + 768];
- B1 = DK[r+5] ^ TD[get_byte(0, T1) ] ^ TD[get_byte(1, T0) + 256] ^
- TD[get_byte(2, T3) + 512] ^ TD[get_byte(3, T2) + 768];
- B2 = DK[r+6] ^ TD[get_byte(0, T2) ] ^ TD[get_byte(1, T1) + 256] ^
- TD[get_byte(2, T0) + 512] ^ TD[get_byte(3, T3) + 768];
- B3 = DK[r+7] ^ TD[get_byte(0, T3) ] ^ TD[get_byte(1, T2) + 256] ^
- TD[get_byte(2, T1) + 512] ^ TD[get_byte(3, T0) + 768];
- }
+ // AES single round decryption
+ data = vaesdq_u8(data, vld1q_u8(skey+i*16));
+ // Final AddRoundKey (bitwise XOR with the last subkey)
+ data = veorq_u8(data, vld1q_u8(mkey+0));
- out[ 0] = SD[get_byte(0, B0)] ^ MD[0];
- out[ 1] = SD[get_byte(1, B3)] ^ MD[1];
- out[ 2] = SD[get_byte(2, B2)] ^ MD[2];
- out[ 3] = SD[get_byte(3, B1)] ^ MD[3];
- out[ 4] = SD[get_byte(0, B1)] ^ MD[4];
- out[ 5] = SD[get_byte(1, B0)] ^ MD[5];
- out[ 6] = SD[get_byte(2, B3)] ^ MD[6];
- out[ 7] = SD[get_byte(3, B2)] ^ MD[7];
- out[ 8] = SD[get_byte(0, B2)] ^ MD[8];
- out[ 9] = SD[get_byte(1, B1)] ^ MD[9];
- out[10] = SD[get_byte(2, B0)] ^ MD[10];
- out[11] = SD[get_byte(3, B3)] ^ MD[11];
- out[12] = SD[get_byte(0, B3)] ^ MD[12];
- out[13] = SD[get_byte(1, B2)] ^ MD[13];
- out[14] = SD[get_byte(2, B1)] ^ MD[14];
- out[15] = SD[get_byte(3, B0)] ^ MD[15];
-
- in += 16;
- out += 16;
+ vst1q_u8(out+16*k, data);
}
}
@@ -412,6 +339,17 @@ void aes_key_schedule(const uint8_t key[], size_t length,
DK.resize(length + 24);
copy_mem(EK.data(), XEK.data(), EK.size());
copy_mem(DK.data(), XDK.data(), DK.size());
+
+ // Swap each subkey word into AES byte order so vld1q_u8 reads it correctly (assumes little endian)
+ for (size_t i=0; i<EK.size(); ++i)
+ {
+ EK[i] = reverse_bytes(EK[i]);
+ }
+ // Swap each subkey word into AES byte order so vld1q_u8 reads it correctly (assumes little endian)
+ for (size_t i=0; i<DK.size(); ++i)
+ {
+ DK[i] = reverse_bytes(DK[i]);
+ }
}
const char* aes_provider()
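To make the control flow easier to see than it is in diff form, here is the encryption path as a self-contained sketch (my own illustration, not the patch itself). It assumes an AES-128 key already expanded into 11 consecutive 16-byte subkeys stored in AES-specification byte order, which is what the reverse_bytes() loops above arrange on a little-endian machine. Note that vaeseq_u8 folds AddRoundKey into SubBytes/ShiftRows, which is why the last subkey is applied with a plain XOR.

#include <arm_neon.h>
#include <stdint.h>

// Sketch: encrypt one 16-byte block with AES-128 via the ARMv8 crypto
// instructions. rk points at 11 consecutive 16-byte round keys.
void aes128_encrypt_block(const uint8_t rk[11*16],
                          const uint8_t in[16], uint8_t out[16])
   {
   uint8x16_t data = vld1q_u8(in);

   // Rounds 1..9: AESE = AddRoundKey + SubBytes + ShiftRows, then MixColumns
   for(size_t i = 0; i != 9; ++i)
      {
      data = vaeseq_u8(data, vld1q_u8(rk + 16*i));
      data = vaesmcq_u8(data);
      }

   // Round 10 omits MixColumns...
   data = vaeseq_u8(data, vld1q_u8(rk + 16*9));
   // ...and the final AddRoundKey is a plain XOR with subkey 11
   data = veorq_u8(data, vld1q_u8(rk + 16*10));

   vst1q_u8(out, data);
   }

Decryption mirrors this with vaesdq_u8/vaesimcq_u8, with the usual caveat that the middle decryption subkeys must have InvMixColumns folded in (the equivalent inverse cipher), as table-based schedules like Botan's already provide.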
Here's a zip of the diff: aes.zip.
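If you want to try it yourself, the steps should look roughly like this (assuming the zip unpacks to aes.diff at the top of the tree; the speed invocation is from memory):

$ unzip aes.zip
$ git apply aes.diff
$ ./configure.py --cc=gcc --cc-bin=/opt/cfarm/gcc-latest/bin/c++ --cc-abi="-march=armv8-a+crypto"
$ make
$ ./botan speed AES-128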