Awesome, thank you. I am still working on the CPUID detection bits for ARM but after that should be able to merge all 3 of these patches quite easily.
@randombit,

> I am still working on the CPUID detection bits for ARM but after that

Ack, let me know if/when you want comprehensive testing. I'm happy to lend a hand.
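For the probing side, here is a minimal sketch of how a run-time PMULL check could look on AArch64 Linux via getauxval(). It is only an illustration of the idea, not Botan's actual CPUID code, and the HWCAP_PMULL fallback value is an assumption for older headers:

// Sketch only: run-time PMULL probe on AArch64 Linux via the ELF auxiliary vector.
// Not Botan's CPUID implementation; other platforms need different probes.
#include <sys/auxv.h>      // getauxval, AT_HWCAP

#if !defined(HWCAP_PMULL)
#  define HWCAP_PMULL (1 << 4)   // assumed value, normally provided by <asm/hwcap.h>
#endif

static bool cpu_has_pmull()
{
#if defined(__aarch64__)
    return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
#else
    return false;
#endif
}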
I was looking at Implementing GCM on ARMv8 again. According to Table 1 on page 4, PMULL and PMULL2 are only available on Aarch64; they are not available on Aarch32. I don't have any Aarch32 machines to test on, so I don't know what will happen. The best case is they will compile but fail the CPU probing tests. The worst case is a compile failure.
The include logic is OK for <arm_neon.h> and <arm_acle.h>, but it kind of implies PMULL and PMULL2 are available. I think the include logic shown above is wrong since it uses __aarch32__ as a signal for PMULL and PMULL2 availability. I updated the initial report to reflect it.
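To make that concrete, the guard I have in mind keys off __aarch64__ plus the crypto feature macro rather than __aarch32__. This is only a sketch, and the HAVE_ARM_PMULL name is just a placeholder for illustration:

// Sketch: PMULL/PMULL2 are AArch64-only, so gate on __aarch64__ and the
// ACLE crypto feature macro, never on __aarch32__ or plain __arm__.
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRYPTO)
#  include <arm_neon.h>     // poly64x2_t, vmull_p64, vmull_high_p64 (where the compiler provides them)
#  define HAVE_ARM_PMULL 1  // placeholder macro, not part of the patch
#endif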
And for completeness, Apple's Clang claims an Aarch64 environment:
$ clang++ --version
Apple LLVM version 6.0 (clang-600.0.57) (based on LLVM 3.5svn)
Target: x86_64-apple-darwin13.4.0
Thread model: posix
$ clang++ -arch arm64 -dM -E - < /dev/null | egrep -i '(arm|aarch)' | grep 64
#define __AARCH64EL__ 1
#define __AARCH64_SIMD__ 1
#define __ARM64_ARCH_8__ 1
#define __ARM_64BIT_STATE 1
#define __ARM_ARCH_ISA_A64 1
#define __ARM_PCS_AAPCS64 1
#define __aarch64__ 1
#define __arm64 1
#define __arm64__ 1
@randombit,

There's an experimental Carryless Multiply for ARM at mbedtls_armv8a_ce_gcm_mult. It has about 8 fewer instructions, and it runs 5 MiB/s or so faster based on GCC 4.9.2 (Pine64):
pine64:botan$ ./botan speed --msec=3000 'GMAC(AES-128)'
GMAC(AES-128) [base] mac 61.621 MiB/sec (184.863 MiB in 3000.016 ms)
pine64:botan$ ./botan speed --msec=3000 'GMAC(AES-128)'
GMAC(AES-128) [base] mac 63.061 MiB/sec (189.184 MiB in 3000.030 ms)
pine64:botan$ ./botan speed --msec=3000 'GMAC(AES-128)'
GMAC(AES-128) [base] mac 60.954 MiB/sec (182.863 MiB in 3000.008 ms)
mbedTLS calculates c = a * b in GF(2^128). All you need to do is change the last line to the following for Botan's calculation of x = x * H in GF(2^128):
+ /* reverse bits in each byte to convert from little-little endian to gcm format */
+ // vst1q_u8( c, vrbitq_u8( c_p ) );
+ vst1q_u8( x, vrbitq_u8( c_p ) );
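For reference, GCM's GF(2^128) is defined by the reduction polynomial x^128 + x^7 + x^2 + x + 1. Because the routine first applies vrbitq_u8 to put the bytes into conventional polynomial bit order, the low-order terms x^7 + x^2 + x + 1 appear directly as the constant 0x0000000000000087 used in the reduction step of the patch below.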
The patch needs some polishing due to the vmull_low_p64 macro, but I think it's another viable option for you.
pine64:botan$ git diff > mbed-pmull.diff
pine64:botan$ cat mbed-pmull.diff
diff --git a/src/lib/modes/aead/gcm/gcm.cpp b/src/lib/modes/aead/gcm/gcm.cpp
index 0d0cbff..a94f4a4 100644
--- a/src/lib/modes/aead/gcm/gcm.cpp
+++ b/src/lib/modes/aead/gcm/gcm.cpp
@@ -16,10 +16,96 @@
#include <botan/cpuid.h>
#endif
+// Ugly... ARM32/ARM64 Headers
+// As of Visual Studio 2015, Microsoft does not support ARM ACLE extensions
+// Also, PMULL and PMULL2 are only available on Aarch64 (not A-32, and not Aarch32)
+// And LLVM Clang only defines __aarch64__ (and not __arm__ or __arm64__)
+#if defined(__arm64__) || defined(__aarch64__)
+# if defined(__GNUC__)
+# include <stdint.h>
+# endif
+# if defined(__ARM_NEON)
+# include <arm_neon.h>
+# endif
+// GCC and LLVM Clang, but not Apple Clang
+# if defined(__GNUC__) && !defined(__apple_build_version__)
+# if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO)
+# include <arm_acle.h>
+# endif
+# endif
+#endif // ARM32 and ARM64 Headers
+
namespace Botan {
static const size_t GCM_BS = 16;
+void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16]);
+
+/* because the vmull_p64 intrinsic uses the wrong argument types: */
+#define vmull_low_p64(A, B) ({ \
+ poly128_t res__; \
+ asm("pmull %0.1q, %1.1d, %2.1d \n\t" \
+ : "=w" (res__) : "w" (A), "w" (B) ); \
+ res__; \
+ })
+
+void gcm_multiply_pmull(uint8_t x[16], const uint8_t H[16])
+{
+ /*
+ * Implementing GCM on ARMv8, http://conradoplg.cryptoland.net/files/2010/12/gcm14.pdf
+ */
+
+ // GCC 4.x is missing vreinterpretq_p128_u64 and many of the other NEON casts
+ // Apple Clang is missing most of the poly types, vmull_p64 and vmull_high_p64
+ // poly128_t vmull_p64 (poly64_t, poly64_t);
+ // poly128_t vmull_high_p64 (poly64x2_t, poly64x2_t);
+
+ // $ ./botan speed --msec=3000 'GMAC(AES-128)'
+ // GMAC(AES-128) [base] mac 43.392 MiB/sec (130.176 MiB in 3000.026 ms)
+ // uint64x2_t a64 = vld1q_u64(reinterpret_cast<const uint64_t*>(x));
+ // uint64x2_t b64 = vld1q_u64(reinterpret_cast<const uint64_t*>(H));
+ // a64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8((uint8x8_t)vgetq_lane_u64(a64, 1)), vrev64_u8((uint8x8_t)vgetq_lane_u64(a64, 0))));
+ // b64 = vreinterpretq_u64_u8(vcombine_u8(vrev64_u8((uint8x8_t)vgetq_lane_u64(b64, 1)), vrev64_u8((uint8x8_t)vgetq_lane_u64(b64, 0))));
+
+ /* vector variables */
+ uint8x16_t a_p, b_p; /* inputs */
+ uint8x16_t z, p; /* constants */
+ uint8x16_t r0, r1; /* full width multiply result (before reduction) */
+ uint8x16_t t0, t1; /* temps */
+ uint8x16_t c_p; /* output */
+
+ /* reverse bits in each byte to convert from gcm format to little-little endian */
+ a_p = vrbitq_u8( vld1q_u8( x ) );
+ b_p = vrbitq_u8( vld1q_u8( H ) );
+
+ /* polynomial multiply (128*128->256bit). See [GCM-WP] algorithms 3. */
+ z = vdupq_n_u8( 0 );
+ r0 = (uint8x16_t)vmull_low_p64( (poly64x2_t)a_p, (poly64x2_t)b_p );
+ r1 = (uint8x16_t)vmull_high_p64( (poly64x2_t)a_p, (poly64x2_t)b_p );
+ t0 = vextq_u8( b_p, b_p, 8 );
+ t1 = (uint8x16_t)vmull_low_p64( (poly64x2_t)a_p, (poly64x2_t)t0 );
+ t0 = (uint8x16_t)vmull_high_p64( (poly64x2_t)a_p, (poly64x2_t)t0 );
+ t0 = veorq_u8( t0, t1 );
+ t1 = vextq_u8( z, t0, 8 );
+ r0 = veorq_u8( r0, t1 );
+ t1 = vextq_u8( t0, z, 8 );
+ r1 = veorq_u8( r1, t1 );
+
+ /* polynomial reduction (256->128bit). See [GCM-WP] algorithms 5. */
+ p = (uint8x16_t)vdupq_n_u64( 0x0000000000000087 );
+ t0 = (uint8x16_t)vmull_high_p64( (poly64x2_t)r1, (poly64x2_t)p );
+ t1 = vextq_u8( t0, z, 8 );
+ r1 = veorq_u8( r1, t1 );
+ t1 = vextq_u8( z, t0, 8 );
+ r0 = veorq_u8( r0, t1 );
+ t0 = (uint8x16_t)vmull_low_p64( (poly64x2_t)r1, (poly64x2_t)p );
+ c_p = veorq_u8( r0, t0 );
+
+ /* reverse bits in each byte to convert from little-little endian to gcm format */
+ // vst1q_u8( c, vrbitq_u8( c_p ) );
+ vst1q_u8( x, vrbitq_u8( c_p ) );
+}
+
void GHASH::gcm_multiply(secure_vector<uint8_t>& x) const
{
#if defined(BOTAN_HAS_GCM_CLMUL)
@@ -27,6 +113,9 @@ void GHASH::gcm_multiply(secure_vector<uint8_t>& x) const
return gcm_multiply_clmul(x.data(), m_H.data());
#endif
+ return gcm_multiply_pmull(x.data(), m_H.data());
+
+#if 0
static const uint64_t R = 0xE100000000000000;
uint64_t H[2] = {
@@ -64,6 +153,7 @@ void GHASH::gcm_multiply(secure_vector<uint8_t>& x) const
store_be<uint64_t>(x.data(), Z[0], Z[1]);
CT::unpoison(x.data(), x.size());
+#endif
}
void GHASH::ghash_update(secure_vector<uint8_t>& ghash,
The code generation looks very good:
0000000000001020 <Botan::gcm_multiply_pmull(unsigned char*, unsigned char const*)>:
1020: 3dc00021 ldr q1, [x1]
1024: 4f000406 movi v6.4s, #0x0
1028: 3dc00004 ldr q4, [x0]
102c: 9c000327 ldr q7, 1090 <Botan::gcm_multiply_pmull(unsigned char*, unsigned char const*)+0x70>
1030: 6e605821 rbit v1.16b, v1.16b
1034: 6e605884 rbit v4.16b, v4.16b
1038: 0ee1e090 pmull v16.1q, v4.1d, v1.1d
103c: 6e014025 .inst 0x6e014025 ; undefined
1040: 0ee5e080 pmull v0.1q, v4.1d, v5.1d
1044: 4ee5e082 pmull2 v2.1q, v4.2d, v5.2d
1048: 4ee1e084 pmull2 v4.1q, v4.2d, v1.2d
104c: 6e201c43 eor v3.16b, v2.16b, v0.16b
1050: 6e064062 .inst 0x6e064062 ; undefined
1054: 6e0340c3 .inst 0x6e0340c3 ; undefined
1058: 6e241c41 eor v1.16b, v2.16b, v4.16b
105c: 4ee7e022 pmull2 v2.1q, v1.2d, v7.2d
1060: 6e064040 .inst 0x6e064040 ; undefined
1064: 6e0240c6 .inst 0x6e0240c6 ; undefined
1068: 6e201c21 eor v1.16b, v1.16b, v0.16b
106c: 0ee7e020 pmull v0.1q, v1.1d, v7.1d
1070: 6e301c01 eor v1.16b, v0.16b, v16.16b
1074: 6e261c20 eor v0.16b, v1.16b, v6.16b
1078: 6e231c00 eor v0.16b, v0.16b, v3.16b
107c: 6e605800 rbit v0.16b, v0.16b
1080: 3d800000 str q0, [x0]
1084: d65f03c0 ret
1088: d503201f nop
108c: d503201f nop
1090: 00000087 .word 0x00000087
1094: 00000000 .word 0x00000000
1098: 00000087 .word 0x00000087
109c: 00000000 .word 0x00000000
@randombit,

I think I got the macro cleaned up in clmul-arm.c:
/********************************/
/* GCC and compatible compilers */
/********************************/
#if defined(__GNUC__)
#if defined(__GNUC_STDC_INLINE__) || defined(__INLINE__)
# define MAYBE_INLINE inline
#else
# define MAYBE_INLINE
#endif
/* Schneiders, Hovsmith and O'Rourke discovered this trick. */
/* It results in much better code generation in production code */
/* by avoiding D-register spills when using vgetq_lane_u64. The */
/* problem does not surface under minimal test cases. */
MAYBE_INLINE uint8x16_t PMULL_LOW(const uint8x16_t a, const uint8x16_t b)
{
uint8x16_t r;
__asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
:"=w" (r) : "w" (a), "w" (b) );
return r;
}
MAYBE_INLINE uint8x16_t PMULL_HIGH(const uint8x16_t a, const uint8x16_t b)
{
uint8x16_t r;
__asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t"
:"=w" (r) : "w" (a), "w" (b) );
return r;
}
#endif /* GCC and compatibles */
/**************************************/
/* Microsoft and compatible compilers */
/**************************************/
#if defined(_MSC_VER)
inline uint8x16_t PMULL_LOW(const uint8x16_t a, const uint8x16_t b)
{
return (uint8x16_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
}
inline uint8x16_t PMULL_HIGH(const uint8x16_t a, const uint8x16_t b)
{
return (uint8x16_t)(vmull_p64(vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
}
#endif /* Microsoft and compatibles */
/*********************************************************/
/* Perform the multiplication and reduction in GF(2^128) */
/*********************************************************/
void clmul_arm(uint8_t r[16], const uint8_t a[16], const uint8_t b[16])
{
uint8x16_t a8, b8, c8;
uint8x16_t z, p;
uint8x16_t r0, r1;
uint8x16_t t0, t1;
a8 = vrbitq_u8(vld1q_u8(a));
b8 = vrbitq_u8(vld1q_u8(b));
/* polynomial multiply */
z = vdupq_n_u8(0);
r0 = PMULL_LOW(a8, b8);
r1 = PMULL_HIGH(a8, b8);
t0 = vextq_u8(b8, b8, 8);
t1 = PMULL_LOW(a8, t0);
t0 = PMULL_HIGH(a8, t0);
t0 = veorq_u8(t0, t1);
t1 = vextq_u8(z, t0, 8);
r0 = veorq_u8(r0, t1);
t1 = vextq_u8(t0, z, 8);
r1 = veorq_u8(r1, t1);
/* polynomial reduction */
p = vreinterpretq_u8_u64(vdupq_n_u64(0x0000000000000087));
t0 = PMULL_HIGH(r1, p);
t1 = vextq_u8(t0, z, 8);
r1 = veorq_u8(r1, t1);
t1 = vextq_u8(z, t0, 8);
r0 = veorq_u8(r0, t1);
t0 = PMULL_LOW(r1, p);
c8 = veorq_u8(r0, t0);
vst1q_u8(r, vrbitq_u8(c8));
}
It's not part of this patch, however.
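If it helps, here is a rough sketch of how clmul_arm (or gcm_multiply_pmull) could eventually sit behind a run-time probe in GHASH::gcm_multiply, rather than the unconditional call in the diff above. Both BOTAN_HAS_GCM_PMULL and CPUID::has_arm_pmull() are hypothetical names standing in for whatever the ARM CPUID work ends up providing:

// Sketch only: run-time dispatch for the GHASH multiply.
// BOTAN_HAS_GCM_PMULL and CPUID::has_arm_pmull() are hypothetical, mirroring
// the existing BOTAN_HAS_GCM_CLMUL path for x86.
void GHASH::gcm_multiply(secure_vector<uint8_t>& x) const
   {
#if defined(BOTAN_HAS_GCM_PMULL)
   if(CPUID::has_arm_pmull())
      return gcm_multiply_pmull(x.data(), m_H.data());
#endif

   // ... otherwise fall through to the portable 64-bit implementation ...
   }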
Merged!
Attached and below is a patch for Carryless Multiplies using ARM Crypto extensions and PMULL. It's another partial patch, and others will have to complete it.

The dev-boards used for testing were a Pine64 and a LeMaker HiKey. Both have CRC and Crypto extensions. Botan was configured with ./configure.py --cc={gcc|clang} --cc-abi="-march=armv8-a+crc+crypto -mtune=cortex-a53".

Here are the relative numbers:

Botan without ARM PMULL, GCC 4.9.2 (Pine64):
Botan without ARM PMULL, GCC 4.9.2 (HiKey):
Botan without ARM PMULL, Clang 3.5.0 (Pine64):
Botan without ARM PMULL, Clang 3.5.0 (HiKey):
Botan with ARM PMULL, GCC 4.9.2 (Pine64):
Botan with ARM PMULL, GCC 4.9.2 (HiKey):
Botan with ARM PMULL, Clang 3.5.0 (Pine64):
Botan with ARM PMULL, Clang 3.5.0 (HiKey):

The tricky thing here is, LLVM Clang provides the types and functions, like poly64x2_t, vmull_p64 and vmull_high_p64, in arm_neon.h or arm_acle.h. However, Apple Clang as of 6.0 does not provide them even though it advertises the __ARM_FEATURE_CRYPTO preprocessor macro. It does not even provide the builtin equivalents, like __builtin_arm64_vmull. I think it's an omission on Apple's part at the moment, since I have not found any reference to disgorging the carryless multiplies from the crypto extensions. I expect Apple will provide them eventually; Apple may have already fixed it in later versions of Xcode and their Clang.
Here is the updated gcm.cpp and the diff packaged as a ZIP file: gcm_updated.zip