Awesome, thank you. I am still working on the CPUID detection bits for ARM but after that should be able to merge all 3 of these patches quite easily.
@randombit,

> I am still working on the CPUID detection bits for ARM but after that

Ack, let me know if/when you want comprehensive testing. I'm happy to lend a hand.
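For the probing side, here is a minimal sketch of how a run-time PMULL check could look on AArch64 Linux via getauxval(). It is only an illustration of the idea, not Botan's actual CPUID code, and the HWCAP_PMULL fallback value is an assumption for older headers:

// Sketch only: run-time PMULL probe on AArch64 Linux via the ELF auxiliary vector.
// Not Botan's CPUID implementation; other platforms need different probes.
#include <sys/auxv.h>      // getauxval, AT_HWCAP

#if !defined(HWCAP_PMULL)
#  define HWCAP_PMULL (1 << 4)   // assumed value, normally provided by <asm/hwcap.h>
#endif

static bool cpu_has_pmull()
{
#if defined(__aarch64__)
    return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
#else
    return false;
#endif
}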
I was looking at Implementing GCM on ARMv8 again. According to Table 1 on page 4, PMULL and PMULL2 are only available on Aarch64; they are not available on Aarch32. I don't have any Aarch32 machines to test on, so I don't know what will happen. The best case is they will compile but fail the CPU probing tests. The worst case is a compile failure.
The include logic is OK for <arm_neon.h> and <arm_acle.h>, but it kind of implies PMULL and PMULL2 are available. I think the include logic shown above is wrong since it uses __aarch32__ as a signal for PMULL and PMULL2 availability. I updated the initial report to reflect it.
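To make that concrete, the guard I have in mind keys off __aarch64__ plus the crypto feature macro rather than __aarch32__. This is only a sketch, and the HAVE_ARM_PMULL name is just a placeholder for illustration:

// Sketch: PMULL/PMULL2 are AArch64-only, so gate on __aarch64__ and the
// ACLE crypto feature macro, never on __aarch32__ or plain __arm__.
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
#  include <arm_neon.h>     // poly64x2_t, vmull_p64, vmull_high_p64 (where the compiler provides them)
#  define HAVE_ARM_PMULL 1  // placeholder macro, not part of the patch
#endif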
And for completeness, Apple's Clang claims an Aarch64 environment:
$ clang++ --version
Apple LLVM version 6.0 (clang-600.0.57) (based on LLVM 3.5svn)
Target: x86_64-apple-darwin13.4.0
Thread model: posix
$ clang++ -arch arm64 -dM -E - < /dev/null | egrep -i '(arm|aarch)' | grep 64
#define __AARCH64EL__ 1
#define __AARCH64_SIMD__ 1
#define __ARM64_ARCH_8__ 1
#define __ARM_64BIT_STATE 1
#define __ARM_ARCH_ISA_A64 1
#define __ARM_PCS_AAPCS64 1
#define __aarch64__ 1
#define __arm64 1
#define __arm64__ 1
@randombit,

There's an experimental Carryless Multiply for ARM at mbedtls_armv8a_ce_gcm_mult. It has about 8 fewer instructions, and it runs 5 MiB/s or so faster based on GCC 4.9.2 (Pine64):
pine64:botan$ ./botan speed --msec=3000 'GMAC(AES-128)'
GMAC(AES-128) [base] mac 61.621 MiB/sec (184.863 MiB in 3000.016 ms)
pine64:botan$ ./botan speed --msec=3000 'GMAC(AES-128)'
GMAC(AES-128) [base] mac 63.061 MiB/sec (189.184 MiB in 3000.030 ms)
pine64:botan$ ./botan speed --msec=3000 'GMAC(AES-128)'
GMAC(AES-128) [base] mac 60.954 MiB/sec (182.863 MiB in 3000.008 ms)
mbedTLS calculates c = a * b in GF(2^128). All you need to do is change the last line to the following for Botan's calculation of x = x * H in GF(2^128):
+ /* reverse bits in each byte to convert from little-little endian to gcm format */
+ // vst1q_u8( c, vrbitq_u8( c_p ) );
+ vst1q_u8( x, vrbitq_u8( c_p ) );
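For reference, GCM's GF(2^128) is defined by the reduction polynomial x^128 + x^7 + x^2 + x + 1. Because the routine first applies vrbitq_u8 to put the bytes into conventional polynomial bit order, the low-order terms x^7 + x^2 + x + 1 appear directly as the constant 0x0000000000000087 used in the reduction step of the patch below.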
The patch needs some polishing due to the vmull_low_p64 macro, but I think it's another viable option for you.
pine64:botan$ git diff > mbed-pmull.diff
pine64:botan$ cat mbed-pmull.diff
diff --git a/src/lib/modes/aead/gcm/gcm.cpp b/src/lib/modes/aead/gcm/gcm.cpp
index 0d0cbff..a94f4a4 100644
--- a/src/lib/modes/aead/gcm/gcm.cpp
+++ b/src/lib/modes/aead/gcm/gcm.cpp
@@ -16,10 +16,96 @@
#include <botan/cpuid.h>
#endif
+// Ugly... ARM32/ARM64 Headers
+// As of Visual Studio 2015, Microsoft does not support ARM ACLE extensions
+// Also, PMULL and PMULL2 are only available on Aarch64 (not A-32, and not Aarch32)
+// And LLVM Clang only defines __aarch64__ (and not __arm__ or __arm64__)
+#if defined(__arm64__) || defined(__aarch64__)
+# if defined(__GNUC__)
+# include <stdint.h>
+# endif
+# if defined(__ARM_NEON)
+# include <arm_neon.h>
+# endif
+// GCC and LLVM Clang, but not Apple Clang
+# if defined(__GNUC__) && !defined(__apple_build_version__)
+# if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO)
+# include <arm_acle.h>
+# endif
+# endif
+#endif // ARM32 and ARM64 Headers
+
namespace Botan {
static const size_t GCM_BS = 16;
+void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16]);
+
+/* because the vmull_p64 intrinsic uses the wrong argument types: */
+#define vmull_low_p64(A, B) ({ \
+ poly128_t res__; \
+ asm("pmull %0.1q, %1.1d, %2.1d \n\t" \
+ : "=w" (res__) : "w" (A), "w" (B) ); \
+ res__; \
+ })
+
+void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16])
+{
+ /*
+ * Implementing GCM on ARMv8, http://conradoplg.cryptoland.net/files/2010/12/gcm14.pdf
+ */
+
+ // GCC 4.x is missing vreinterpretq_p128_u64 and many of the other NEON casts
+ // Apple Clang is missing most of the poly types, vmull_p64 and vmull_high_p64
+ // poly128_t vmull_p64 (poly64_t, poly64_t);
+ // poly128_t vmull_high_p64 (poly64x2_t, poly64x2_t);
+
+ // $ ./botan speed --msec=3000 'GMAC(AES-128)'
+ // GMAC(AES-128) [base] mac 43.392 MiB/sec (130.176 MiB in 3000.026 ms)
+ // uint64x2_t a64 = vld1q_u64(reinterpret_cast<const uint64_t*>(x));
+ // uint64x2_t b64 = vld1q_u64(reinterpret_cast<const uint64_t*>(H));
+ // a64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8((uint8x8_t)vgetq_lane_u64(a64, 1)), vrev64_u8((uint8x8_t)vgetq_lane_u64(a64, 0))));
+ // b64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8((uint8x8_t)vgetq_lane_u64(b64, 1)), vrev64_u8((uint8x8_t)vgetq_lane_u64(b64, 0))));
+
+ /* vector variables */
+ uint8x16_t a_p, b_p; /* inputs */
+ uint8x16_t z, p; /* constants */
+ uint8x16_t r0, r1; /* full width multiply result (before reduction) */
+ uint8x16_t t0, t1; /* temps */
+ uint8x16_t c_p; /* output */
+
+ /* reverse bits in each byte to convert from gcm format to little-little endian */
+ a_p = vrbitq_u8( vld1q_u8( x ) );
+ b_p = vrbitq_u8( vld1q_u8( H ) );
+
+ /* polynomial multiply (128*128->256bit). See [GCM-WP] algorithms 3. */
+ z = vdupq_n_u8( 0 );
+ r0 = (uint8x16_t)vmull_low_p64( (poly64x2_t)a_p, (poly64x2_t)b_p );
+ r1 = (uint8x16_t)vmull_high_p64( (poly64x2_t)a_p, (poly64x2_t)b_p );
+ t0 = vextq_u8( b_p, b_p, 8 );
+ t1 = (uint8x16_t)vmull_low_p64( (poly64x2_t)a_p, (poly64x2_t)t0 );
+ t0 = (uint8x16_t)vmull_high_p64( (poly64x2_t)a_p, (poly64x2_t)t0 );
+ t0 = veorq_u8( t0, t1 );
+ t1 = vextq_u8( z, t0, 8 );
+ r0 = veorq_u8( r0, t1 );
+ t1 = vextq_u8( t0, z, 8 );
+ r1 = veorq_u8( r1, t1 );
+
+ /* polynomial reduction (256->128bit). See [GCM-WP] algorithms 5. */
+ p = (uint8x16_t)vdupq_n_u64( 0x0000000000000087 );
+ t0 = (uint8x16_t)vmull_high_p64( (poly64x2_t)r1, (poly64x2_t)p );
+ t1 = vextq_u8( t0, z, 8 );
+ r1 = veorq_u8( r1, t1 );
+ t1 = vextq_u8( z, t0, 8 );
+ r0 = veorq_u8( r0, t1 );
+ t0 = (uint8x16_t)vmull_low_p64( (poly64x2_t)r1, (poly64x2_t)p );
+ c_p = veorq_u8( r0, t0 );
+
+ /* reverse bits in each byte to convert from little-little endian to gcm format */
+ // vst1q_u8( c, vrbitq_u8( c_p ) );
+ vst1q_u8( x, vrbitq_u8( c_p ) );
+}
+
void GHASH::gcm_multiply(secure_vector<uint8_t>& x) const
{
#if defined(BOTAN_HAS_GCM_CLMUL)
@@ -27,6 +113,9 @@ void GHASH::gcm_multiply(secure_vector<uint8_t>& x) const
return gcm_multiply_clmul(x.data(), m_H.data());
#endif
+ return gcm_multiply_pmull(x.data(), m_H.data());
+
+#if 0
static const uint64_t R = 0xE100000000000000;
uint64_t H[2] = {
@@ -64,6 +153,7 @@ void GHASH::gcm_multiply(secure_vector<uint8_t>& x) const
store_be<uint64_t>(x.data(), Z[0], Z[1]);
CT::unpoison(x.data(), x.size());
+#endif
}
void GHASH::ghash_update(secure_vector<uint8_t>& ghash,
The code generation looks very good:
0000000000001020 <Botan::gcm_multiply_pmull(unsigned char*, unsigned char const*)>:
1020: 3dc00021 ldr q1, [x1]
1024: 4f000406 movi v6.4s, #0x0
1028: 3dc00004 ldr q4, [x0]
102c: 9c000327 ldr q7, 1090 <Botan::gcm_multiply_pmull(unsigned char*, unsigned char const*)+0x70>
1030: 6e605821 rbit v1.16b, v1.16b
1034: 6e605884 rbit v4.16b, v4.16b
1038: 0ee1e090 pmull v16.1q, v4.1d, v1.1d
103c: 6e014025 .inst 0x6e014025 ; undefined
1040: 0ee5e080 pmull v0.1q, v4.1d, v5.1d
1044: 4ee5e082 pmull2 v2.1q, v4.2d, v5.2d
1048: 4ee1e084 pmull2 v4.1q, v4.2d, v1.2d
104c: 6e201c43 eor v3.16b, v2.16b, v0.16b
1050: 6e064062 .inst 0x6e064062 ; undefined
1054: 6e0340c3 .inst 0x6e0340c3 ; undefined
1058: 6e241c41 eor v1.16b, v2.16b, v4.16b
105c: 4ee7e022 pmull2 v2.1q, v1.2d, v7.2d
1060: 6e064040 .inst 0x6e064040 ; undefined
1064: 6e0240c6 .inst 0x6e0240c6 ; undefined
1068: 6e201c21 eor v1.16b, v1.16b, v0.16b
106c: 0ee7e020 pmull v0.1q, v1.1d, v7.1d
1070: 6e301c01 eor v1.16b, v0.16b, v16.16b
1074: 6e261c20 eor v0.16b, v1.16b, v6.16b
1078: 6e231c00 eor v0.16b, v0.16b, v3.16b
107c: 6e605800 rbit v0.16b, v0.16b
1080: 3d800000 str q0, [x0]
1084: d65f03c0 ret
1088: d503201f nop
108c: d503201f nop
1090: 00000087 .word 0x00000087
1094: 00000000 .word 0x00000000
1098: 00000087 .word 0x00000087
109c: 00000000 .word 0x00000000
@randombit,

I think I got the macro cleaned up in clmul-arm.c:
/********************************/
/* GCC and compatible compilers */
/********************************/
#if defined(__GNUC__)
#if defined(__GNUC_STDC_INLINE__) || defined(__INLINE__)
# define MAYBE_INLINE inline
#else
# define MAYBE_INLINE
#endif
/* Schneiders, Hovsmith and O'Rourke discovered this trick. */
/* It results in much better code generation in production code */
/* by avoiding D-register spills when using vgetq_lane_u64. The */
/* problem does not surface under minimal test cases. */
MAYBE_INLINE uint8x16_t PMULL_LOW(const uint8x16_t a, const uint8x16_t b)
{
uint8x16_t r;
__asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
:"=w" (r) : "w" (a), "w" (b) );
return r;
}
MAYBE_INLINE uint8x16_t PMULL_HIGH(const uint8x16_t a, const uint8x16_t b)
{
uint8x16_t r;
__asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t"
:"=w" (r) : "w" (a), "w" (b) );
return r;
}
#endif /* GCC and compatibles */
/**************************************/
/* Microsoft and compatible compilers */
/**************************************/
#if defined(_MSC_VER)
inline uint8x16_t PMULL_LOW(const uint8x16_t a, const uint8x16_t b)
{
return (uint8x16_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
}
inline uint8x16_t PMULL_HIGH(const uint8x16_t a, const uint8x16_t b)
{
return (uint8x16_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
}
#endif /* Microsoft and compatibles */
/*********************************************************/
/* Perform the multiplication and reduction in GF(2^128) */
/*********************************************************/
void clmul_arm(uint8_t r[16], const uint8_t a[16], const uint8_t b[16])
{
uint8x16_t a8, b8, c8;
uint8x16_t z, p;
uint8x16_t r0, r1;
uint8x16_t t0, t1;
a8 = vrbitq_u8(vld1q_u8(a));
b8 = vrbitq_u8(vld1q_u8(b));
/* polynomial multiply */
z = vdupq_n_u8(0);
r0 = PMULL_LOW(a8, b8);
r1 = PMULL_HIGH(a8, b8);
t0 = vextq_u8(b8, b8, 8);
t1 = PMULL_LOW(a8, t0);
t0 = PMULL_HIGH(a8, t0);
t0 = veorq_u8(t0, t1);
t1 = vextq_u8(z, t0, 8);
r0 = veorq_u8(r0, t1);
t1 = vextq_u8(t0, z, 8);
r1 = veorq_u8(r1, t1);
/* polynomial reduction */
p = vreinterpretq_u8_u64(vdupq_n_u64(0x0000000000000087));
t0 = PMULL_HIGH(r1, p);
t1 = vextq_u8(t0, z, 8);
r1 = veorq_u8(r1, t1);
t1 = vextq_u8(z, t0, 8);
r0 = veorq_u8(r0, t1);
t0 = PMULL_LOW(r1, p);
c8 = veorq_u8(r0, t0);
vst1q_u8(r, vrbitq_u8(c8));
}
It's not part of this patch, however.
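If it helps, here is a rough sketch of how clmul_arm (or gcm_multiply_pmull) could eventually sit behind a run-time probe in GHASH::gcm_multiply, rather than the unconditional call in the diff above. Both BOTAN_HAS_GCM_PMULL and CPUID::has_arm_pmull() are hypothetical names standing in for whatever the ARM CPUID work ends up providing:

// Sketch only: run-time dispatch for the GHASH multiply.
// BOTAN_HAS_GCM_PMULL and CPUID::has_arm_pmull() are hypothetical, mirroring
// the existing BOTAN_HAS_GCM_CLMUL path for x86.
void GHASH::gcm_multiply(secure_vector<uint8_t>& x) const
   {
#if defined(BOTAN_HAS_GCM_PMULL)
   if(CPUID::has_arm_pmull())
      return gcm_multiply_pmull(x.data(), m_H.data());
#endif

   // ... otherwise fall through to the portable 64-bit implementation ...
   }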
Merged!
Attached and below is a patch for Carryless Multiplies using ARM Crypto extensions and PMULL. It's another partial patch, and others will have to complete it.

The dev-boards used for testing were a Pine64 and a LeMaker HiKey. Both have CRC and Crypto extensions. Botan was configured with ./configure.py --cc={gcc|clang} --cc-abi="-march=armv8-a+crc+crypto -mtune=cortex-a53".

Here are the relative numbers:

Botan without ARM PMULL, GCC 4.9.2 (Pine64):
Botan without ARM PMULL, GCC 4.9.2 (HiKey):
Botan without ARM PMULL, Clang 3.5.0 (Pine64):
Botan without ARM PMULL, Clang 3.5.0 (HiKey):
Botan with ARM PMULL, GCC 4.9.2 (Pine64):
Botan with ARM PMULL, GCC 4.9.2 (HiKey):
Botan with ARM PMULL, Clang 3.5.0 (Pine64):
Botan with ARM PMULL, Clang 3.5.0 (HiKey):

The tricky thing here is, LLVM Clang provides the types and functions, like poly64x2_t, vmull_p64 and vmull_high_p64, in arm_neon.h or arm_acle.h. However, Apple Clang as of 6.0 does not provide them even though it advertises the __ARM_FEATURE_CRYPTO preprocessor macro. It does not even provide the builtin equivalents, like __builtin_arm64_vmull. I think it's an omission on Apple's part at the moment, since I have not found any reference to disgorging the carryless multiplies from the crypto extensions. I expect Apple will provide them eventually; Apple may have already fixed it in later versions of Xcode and their Clang.
Here is the updated gcm.cpp and the diff packaged as a ZIP file: gcm_updated.zip