Reduce memory timing leak risk

gmaxwell commented 9 years ago

Yuval Yarom pointed us to this presentation https://cryptojedi.org/peter/data/chesrump-20130822.pdf which suggests that on at least some hardware uniform cacheline access is not enough, presumably because the loads have multi-cycle delays and earlier bytes are being made available earlier (?).

The obvious defense is to read all the bytes, which is what we expected cacheline colocation to do so doing it explicitly shouldn't require more memory bandwidth, and then extract the data we want.

A kludgy implementation given our current memory layout (and typedefs, pardon the typepunning) is:

diff --git a/src/ecmult_gen_impl.h b/src/ecmult_gen_impl.h
index 07859ab..157ab05 100644
--- a/src/ecmult_gen_impl.h
+++ b/src/ecmult_gen_impl.h
@@ -104,14 +104,32 @@ static void secp256k1_ecmult_gen(secp256k1_gej_t *r, const secp256k1_scalar_t *g
     const secp256k1_ecmult_gen_consts_t *c = secp256k1_ecmult_gen_consts;
     secp256k1_gej_set_infinity(r);
     secp256k1_ge_t add;
-    int bits;
+    uint32_t bits;
+    uint32_t bpos;
     for (int j=0; j<64; j++) {
         bits = secp256k1_scalar_get_bits(gn, j * 4, 4);
+        bpos = (1<<((bits&3)<<3))*255;
+        int b2 = (bits>>2)&1;
+        int b3 = (bits>>3)&1;
+        uint32_t maska = bpos*!(b2|b3);
+        uint32_t maskb = bpos*(b2&!b3);
+        uint32_t maskc = bpos*(b3&!b2);
+        uint32_t maskd = bpos*(b2&b3);
+        bits = (bits&3)<<3;
         for (size_t k=0; k<sizeof(secp256k1_ge_t); k++)
-            ((unsigned char*)(&add))[k] = c->prec[j][k][bits];
+        {
+            uint32_t *p;
+            p = (uint32_t *)c->prec[j][k];
+            uint32_t ra = maska & p[0];
+            uint32_t rb = maskb & p[1];
+            uint32_t rc = maskc & p[2];
+            uint32_t rd = maskd & p[3];
+            ((unsigned char*)(&add))[k] = (ra|rb|rc|rd)>>bits;
+        }
         secp256k1_gej_add_ge(r, r, &add);
     }
     bits = 0;
+    bpos = 0;
     secp256k1_ge_clear(&add);
 }

Is something like this something we'd like to do?

sipa commented 9 years ago

Or we could just slice per bit, and store everything as uint64_t, with a single shift to fetch the right bit.

sipa commented 9 years ago

Outch. Bitslicing makes signing time go from 194k cycles to 377k cycles...

sipa commented 9 years ago

Your code needs 361k cycles.

gmaxwell commented 9 years ago

ouch.

gmaxwell commented 9 years ago

Is this any faster?

diff --git a/src/ecmult_gen_impl.h b/src/ecmult_gen_impl.h
index 07859ab..4417e96 100644
--- a/src/ecmult_gen_impl.h
+++ b/src/ecmult_gen_impl.h
@@ -104,11 +104,26 @@ static void secp256k1_ecmult_gen(secp256k1_gej_t *r, const secp256k1_scalar_t *g
     const secp256k1_ecmult_gen_consts_t *c = secp256k1_ecmult_gen_consts;
     secp256k1_gej_set_infinity(r);
     secp256k1_ge_t add;
-    int bits;
+    uint32_t bits;
     for (int j=0; j<64; j++) {
         bits = secp256k1_scalar_get_bits(gn, j * 4, 4);
+        uint32_t b2 = -((bits>>2)&1);
+        uint32_t b3 = -((bits>>3)&1);
+        uint32_t maska = ~(b2|b3);
+        uint32_t maskb = b2 & ~b3;
+        uint32_t maskc = ~b2 & b3;
+        uint32_t maskd = b2&b3;
+        bits = (bits&3)<<3;
         for (size_t k=0; k<sizeof(secp256k1_ge_t); k++)
-            ((unsigned char*)(&add))[k] = c->prec[j][k][bits];
+        {
+            uint32_t *p;
+            p = (uint32_t *)c->prec[j][k];
+            uint32_t ra = maska & p[0];
+            uint32_t rb = maskb & p[1];
+            uint32_t rc = maskc & p[2];
+            uint32_t rd = maskd & p[3];
+            ((unsigned char*)(&add))[k] = ((ra|rb|rc|rd)>>bits)&0xff;
+        }
         secp256k1_gej_add_ge(r, r, &add);
     }
     bits = 0;

gmaxwell commented 9 years ago

Fixed by #132

bitcoin-core / secp256k1

Reduce memory timing leak risk #113