JasonGross opened 5 years ago
Maybe also the way that we're encoding balance for sub is wrong here? (We use `scmul 2 (encode (s - c))`, which somehow gives `0x2; 0x1fffe0000000000; 0x1fffffffffffffe; 0x1fffffffffffffe` on p224.)
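For concreteness, the numbers above can be reproduced with a quick Python sketch (illustrative only, not fiat-crypto code; it assumes the 4x56-bit limb layout for p224 that appears later in this thread):

```python
# p224 = s - c with s = 2^224, c = 2^96 - 1; split into 4 limbs of weight
# 2^56, then double each limb, mimicking scmul 2 (encode (s - c)).
s, c = 2**224, 2**96 - 1
limbs = [((s - c) >> (56 * i)) % 2**56 for i in range(4)]
doubled = [2 * l for l in limbs]
print([hex(l) for l in doubled])
# ['0x2', '0x1fffe0000000000', '0x1fffffffffffffe', '0x1fffffffffffffe']
```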
cc @jadephilipoom
Okay, `opp` doesn't boundscheck because we encode `s - 1` (and then multiply each limb by 1.1) for the tight upper bounds, but the balance uses `s - c`. For p224, the first (lowest, I think) limb of the encoding of `s - 1` (with no reduction) is `2^56-1`, but the first limb of `s - c` is 1. I have no idea what we should be doing, though.
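To see the discrepancy numerically, here is a hedged Python check (again assuming the 4x56-bit layout for p224):

```python
# Lowest limb of the (unreduced) encoding of s - 1 vs. s - c for p224.
s, c = 2**224, 2**96 - 1
low_of_s_minus_1 = (s - 1) % 2**56   # 2^56 - 1: as large as a limb can be
low_of_s_minus_c = (s - c) % 2**56   # 1: far below the tight bound
```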
@andres-erbsen said on Signal:

> I'm not actually sure this should succeed; it might need the trick of adding copies of p224 based on word ranges during carrying. Emilia Käsper wrote an implementation of p224 unsaturated Solinas, and agl cited it when he did p256 with this kind of balancing.
Recording an update based on conversation with Andres and Jason.
This issue seems to stem from the fact that these primes, when encoded, have low limbs that are much smaller than bounds. So when we try to add balance, we get subtractions that underflow because the balance is smaller than the bounds (e.g. for p224 opp, we subtract from 2p, but 2p has a limb that's only 0x2. Then we're subtracting something that can be on the order of 2^56 from 0x2 and it underflows.)
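The underflow itself is one line of arithmetic (illustrative Python, using the p224 numbers from this thread):

```python
# The low limb of the 2*p balance for p224 is only 0x2, but the limb being
# subtracted from it can be on the order of 2^56.
p224 = 2**224 - 2**96 + 1
balance_low = 2 * (p224 % 2**56)      # = 2
sub_low = balance_low - (2**56 - 1)   # limb-wise subtraction underflows
```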
Relevant code using a carefully weighted 4*p for p224: https://github.com/openssl/openssl/blob/master/crypto/ec/ecp_nistp224.c#L395-L414
An improved balance computation fixes the negative bounds on `opp` (turns out `2*p` is sufficient as long as you're careful about the distribution), but the issues with `carry` underflows and `to_bytes` remain.
The cause of the carry underflow is that if you're doing Solinas reduction on something with positive coefficients (specifically, positive coefficients without a larger negative coefficient in the same limb), that means you're subtracting from limbs that might be 0, and you need to add balance to make sure it's okay. Looks like we might need some extra parameter for the balance added in `reduce`, which would be 0 in most cases but some carefully distributed multiple of p (see https://github.com/openssl/openssl/blob/master/crypto/ec/ecp_nistp224.c#L539) for the primes that have potential for underflow in `reduce`. Working on figuring out what that looks like. (EDIT: the multiple of p in the openssl implementation I linked is actually `p << 15`, which is interestingly large.)
As for `to_bytes`, my current guess is this: freeze subtracts p from the input to start, and then conditionally adds it back. When each limb of p is about the same size as the tight bounds, this is always either negative or below the limb size, and we know we have a borrow (i.e. the quotient is either negative or 0). However, the whole problem with this shape of prime is that some limbs of p are actually very small. So now we could have either a positive or negative quotient -- it could be positive because the tight upper bounds are greater than the limb size and the prime-limb we're subtracting is tiny, and it could be negative because if the input is 0 we will still underflow. Possibly just distributing p better, using a similar algorithm to the one I just applied to subtraction balance, will solve that.
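As a sanity check on that guess, here is a hedged Python sketch of a freeze-style borrow chain (not the fiat-crypto code; the p224 limb values are the ones quoted later in this thread), showing the final borrow can end either way:

```python
W = 2**56  # assumed limb weight for p224
p_limbs = [1, 2**56 - 2**40, 2**56 - 1, 2**56 - 1]  # encode(p224)

def sub_with_borrow(a):
    """Limb-wise a - p with a propagated borrow; returns the final borrow."""
    borrow = 0
    for ai, pi in zip(a, p_limbs):
        d = ai - pi + borrow
        borrow = d // W  # floor division: -1 exactly when this limb underflowed
    return borrow

neg_quotient = sub_with_borrow([0, 0, 0, 0])  # input 0: underflows, borrow -1
pos_quotient = sub_with_borrow([2**56] * 4)   # input at the tight bounds:
                                              # value exceeds p, no net borrow
```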
In order to figure out the balance for reduce, I've been looking for more examples of unsaturated Solinas for primes that need balance for reduce (primes of the shape `2^k - c` for which `c` has negative limbs) and kind of coming up empty. Any additional examples appreciated.
It looks like a balance of `p*1` would actually suffice for p224 reduce (if distributed properly), but p256 would need a balance of `p*2^25` (at least, I think -- I haven't found an unsaturated Solinas implementation of p256 in the wild). It's not really clear to me why the difference in coefficient is so huge, and it's difficult to generalize without some more examples. It's pretty easy to plug in a coefficient, and it's obvious if the coefficient is insufficient, but I don't really want to search the space between 1 and 2^25 for every prime. I think if I don't find more examples, I'll parameterize over a `reduce_balance_coef` that defaults to 0, and you'll have to pass in a value of 1 to get p224 working. And I'll leave a TODO saying we should look for a way to at least narrow the search space for that coefficient. Sound good?
I guess a second option would be to default to `reduce_balance_coef = 1` and then use 0 anyway if reduce balance is unneeded (which is easy to tell at compile time). But that might be confusing.
> I guess a second option would be to default to `reduce_balance_coef = 1` and then use 0 anyway if reduce balance is unneeded (which is easy to tell at compile time). But that might be confusing.
How about making it an option, and if it's `None`, use 0 if that works, otherwise use 1? But, wait, don't you mean 1 and 2, not 0 and 1? It seems like sub won't work at all if you don't have at least one copy of the prime you're adding as balance.
But I'm a bit confused. It seems like it should be possible to compute the balance needed in just one or two iterations. Something like the following: as I understand it, the current algorithm is to encode p, and then, starting from the lowest limb, rebalance as follows: if the current limb is less than `2^log2_up(tight bound for this limb)`, borrow `2^log2_up(tight bound - current value)` to ensure that we are larger than the tight bound. Repeat this process until we get to the highest limb.
Now, for the highest limb, we need to borrow some amount from a non-existent limb. So instead we ask "how many copies of `p` do we need to add to allow this much borrowing?", and we compute the balance as something like `1 + 2^(log2_up(tight_bound - current_value) - log2_up(p / (weight(n-1))))`. (Now, it's possible that if this number is very large, it might bump some limbs of `c` over to the next highest limb, which might have a chance of changing the value for the highest limb. Though I think you managed to convince me last time that this shouldn't happen, and that in fact the current value is never (?) going to have the same high bit as the tight bound, and so we can just ignore the current value when computing balance.)
Does this algorithm not work for p256? P256 is 2^256 - 2^224 + 2^192 + 2^96 - 1. If you're doing x64, 5 limbs of 51.2 bits each (so 51,51,51,51,52), tight bounds are 2^53 in the highest limb, so it seems like this algorithm indicates that you should be able to add a balance of 2 (or just 1 extra copy of the prime). In fact, it seems like this should always be true regardless of prime. Why does p256 need more than 2*p? Can you walk me through the rebalancing/borrowing and where it fails?
This is for reduce balance, not subtraction balance. Second paragraph in https://github.com/mit-plv/fiat-crypto/issues/554#issuecomment-662988844
Ah, I see. Hmmm, would you be willing to write out the steps of where a balance of one copy of p fails? The other thought that occurs to me is wondering if you can represent c without any +, and use that for the solinas implementation?
Representing c without any + works for the bitcoin prime (for 64-bit) only because the c without + fits entirely in one limb. Otherwise, we won't know how to split c across limbs if we just represent it without +. So this will be hard for p256, with its positive 2^192 coefficient.
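A quick numeric illustration of this point (hedged; the 52-bit limb size is just an example):

```python
# secp256k1: p = 2^256 - 2^32 - 977, so the '+'-free c fits in one small limb.
c_btc = 2**32 + 977
# p256: c = 2^224 - 2^192 - 2^96 + 1 is a 224-bit number, so a '+'-free
# representation gives no guidance on how to split it across limbs.
c_p256 = 2**224 - 2**192 - 2**96 + 1
```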
I've been working on an algorithm for this and making some progress. I can at least have lower bounds now and understand why p256 needs such a big number. It'll show up in the example.
One copy of p fails for p256 by the following logic: with the default `tight_bounds` and n=6, our values for the high limbs times c are `[2^88+2^86, 2^88-2^86, -(2^98+2^96-2^87), -(2^98+2^97-2^87), -(2^109+2^107+2^98-2^85), 2^266-2^234+2^225-2^193+2^183-2^151+2^140+2^138-2^108-2^107+2^98-2^96]`. The lower bounds the balance therefore has to cover, limb by limb, are `[0, 0, 2^98+2^96-2^87, 2^98+2^97-2^87, 2^109+2^107+2^98-2^85, 0]`. Scaled by the limb weights and summed, these come to `2^280+2^278+2^269-2^256+2^226+2^225-2^215+2^184+2^182-2^173`, which is much greater than p256; we need to multiply p256 by at least 2^24 and some change to get a number that can possibly be distributed to meet these lower bounds. It turns out, though, that even this isn't quite enough, and 1 doesn't quite work for p224 once I did a more rigorous check, because (I think) you actually need to do a different carry chain to make this work... still hammering that out. So revise my statement above to say "p224 needs at least 1 and p256 needs at least <ceiling-quotient of the 280-bit number above and p256>".
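The magnitude claim checks out numerically (illustrative Python; the ceiling-quotient here is the coefficient referred to above):

```python
# The weighted sum of the lower bounds vs. p256.
p256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
S = (2**280 + 2**278 + 2**269 - 2**256 + 2**226 + 2**225
     - 2**215 + 2**184 + 2**182 - 2**173)
q = -(-S // p256)  # ceiling quotient: "2^24 and some change"
```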
> Otherwise, we won't know how to split c across limbs if we just represent it without +. So this will be hard for p256, with its positive 2^192 coefficient.
What about
```coq
Require Import Coq.ZArith.ZArith.
Open Scope Z_scope.
Definition Z_pow' := Z.pow.
Definition Z_sub' := Z.sub.
Delimit Scope Z'_scope with Z'.
Infix "^" := Z_pow' : Z'_scope.
Infix "-" := Z_sub' : Z'_scope.
Arguments Z_pow' (_ _)%Z'.
Arguments Z_sub' (_ _)%Z'.

Section taps.
  Context (p : Z).
  Fixpoint exps (fuel : nat) (remaining : Z) : option (list Z)
    := if (remaining <=? 0)
       then Some nil
       else match fuel with
            | O => None
            | S fuel
              => let e := Z.log2 remaining in
                 option_map (cons e) (exps fuel (remaining - 2^e))
            end.
  Let fuel := Z.to_nat (5 + Z.log2_up p).
  Let s_exp := Z.log2_up p.
  Let rest_exps := exps fuel (2^s_exp - p).
  Definition p'
    := match rest_exps as v return match v with Some _ => Z | None => unit end with
       | Some exps
         => List.fold_left
              (fun z e => Z_sub' z (Z_pow' 2 e))
              exps
              (Z_pow' 2 s_exp)
       | None => tt
       end.
End taps.

Definition p256 := 2^256 - 2^224 + 2^192 + 2^96 - 1.
Set Printing Depth 100000.
Definition p256' := Eval lazy -[Z_pow' Z_sub'] in p' p256.
Compute p256 - p256'. (* 0 *)
Eval lazy [p256' Z_pow' Z_sub'] in p256'.
(* = 2 ^ 256 - 2 ^ 223 - 2 ^ 222 - 2 ^ 221 - 2 ^ 220 - 2 ^ 219 - 2 ^ 218 -
     2 ^ 217 - 2 ^ 216 - 2 ^ 215 - 2 ^ 214 - 2 ^ 213 -
     2 ^ 212 - 2 ^ 211 - 2 ^ 210 - 2 ^ 209 - 2 ^ 208 -
     2 ^ 207 - 2 ^ 206 - 2 ^ 205 - 2 ^ 204 - 2 ^ 203 -
     2 ^ 202 - 2 ^ 201 - 2 ^ 200 - 2 ^ 199 - 2 ^ 198 -
     2 ^ 197 - 2 ^ 196 - 2 ^ 195 - 2 ^ 194 - 2 ^ 193 -
     2 ^ 191 - 2 ^ 190 - 2 ^ 189 - 2 ^ 188 - 2 ^ 187 -
     2 ^ 186 - 2 ^ 185 - 2 ^ 184 - 2 ^ 183 - 2 ^ 182 -
     2 ^ 181 - 2 ^ 180 - 2 ^ 179 - 2 ^ 178 - 2 ^ 177 -
     2 ^ 176 - 2 ^ 175 - 2 ^ 174 - 2 ^ 173 - 2 ^ 172 -
     2 ^ 171 - 2 ^ 170 - 2 ^ 169 - 2 ^ 168 - 2 ^ 167 -
     2 ^ 166 - 2 ^ 165 - 2 ^ 164 - 2 ^ 163 - 2 ^ 162 -
     2 ^ 161 - 2 ^ 160 - 2 ^ 159 - 2 ^ 158 - 2 ^ 157 -
     2 ^ 156 - 2 ^ 155 - 2 ^ 154 - 2 ^ 153 - 2 ^ 152 -
     2 ^ 151 - 2 ^ 150 - 2 ^ 149 - 2 ^ 148 - 2 ^ 147 -
     2 ^ 146 - 2 ^ 145 - 2 ^ 144 - 2 ^ 143 - 2 ^ 142 -
     2 ^ 141 - 2 ^ 140 - 2 ^ 139 - 2 ^ 138 - 2 ^ 137 -
     2 ^ 136 - 2 ^ 135 - 2 ^ 134 - 2 ^ 133 - 2 ^ 132 -
     2 ^ 131 - 2 ^ 130 - 2 ^ 129 - 2 ^ 128 - 2 ^ 127 -
     2 ^ 126 - 2 ^ 125 - 2 ^ 124 - 2 ^ 123 - 2 ^ 122 -
     2 ^ 121 - 2 ^ 120 - 2 ^ 119 - 2 ^ 118 - 2 ^ 117 -
     2 ^ 116 - 2 ^ 115 - 2 ^ 114 - 2 ^ 113 - 2 ^ 112 -
     2 ^ 111 - 2 ^ 110 - 2 ^ 109 - 2 ^ 108 - 2 ^ 107 -
     2 ^ 106 - 2 ^ 105 - 2 ^ 104 - 2 ^ 103 - 2 ^ 102 -
     2 ^ 101 - 2 ^ 100 - 2 ^ 99 - 2 ^ 98 - 2 ^ 97 -
     2 ^ 96 - 2 ^ 0
*)

Definition p224 := 2^224 - 2^96 + 1.
Set Printing Depth 100000.
Definition p224' := Eval lazy -[Z_pow' Z_sub'] in p' p224.
Compute p224 - p224'. (* 0 *)
Eval lazy [p224' Z_pow' Z_sub'] in p224'.
(* = 2 ^ 224 - 2 ^ 95 - 2 ^ 94 - 2 ^ 93 - 2 ^ 92 - 2 ^ 91 - 2 ^ 90 - 2 ^ 89 -
     2 ^ 88 - 2 ^ 87 - 2 ^ 86 - 2 ^ 85 - 2 ^ 84 - 2 ^ 83 -
     2 ^ 82 - 2 ^ 81 - 2 ^ 80 - 2 ^ 79 - 2 ^ 78 - 2 ^ 77 -
     2 ^ 76 - 2 ^ 75 - 2 ^ 74 - 2 ^ 73 - 2 ^ 72 - 2 ^ 71 -
     2 ^ 70 - 2 ^ 69 - 2 ^ 68 - 2 ^ 67 - 2 ^ 66 - 2 ^ 65 -
     2 ^ 64 - 2 ^ 63 - 2 ^ 62 - 2 ^ 61 - 2 ^ 60 - 2 ^ 59 -
     2 ^ 58 - 2 ^ 57 - 2 ^ 56 - 2 ^ 55 - 2 ^ 54 - 2 ^ 53 -
     2 ^ 52 - 2 ^ 51 - 2 ^ 50 - 2 ^ 49 - 2 ^ 48 - 2 ^ 47 -
     2 ^ 46 - 2 ^ 45 - 2 ^ 44 - 2 ^ 43 - 2 ^ 42 - 2 ^ 41 -
     2 ^ 40 - 2 ^ 39 - 2 ^ 38 - 2 ^ 37 - 2 ^ 36 - 2 ^ 35 -
     2 ^ 34 - 2 ^ 33 - 2 ^ 32 - 2 ^ 31 - 2 ^ 30 - 2 ^ 29 -
     2 ^ 28 - 2 ^ 27 - 2 ^ 26 - 2 ^ 25 - 2 ^ 24 - 2 ^ 23 -
     2 ^ 22 - 2 ^ 21 - 2 ^ 20 - 2 ^ 19 - 2 ^ 18 - 2 ^ 17 -
     2 ^ 16 - 2 ^ 15 - 2 ^ 14 - 2 ^ 13 - 2 ^ 12 - 2 ^ 11 -
     2 ^ 10 - 2 ^ 9 - 2 ^ 8 - 2 ^ 7 - 2 ^ 6 - 2 ^ 5 -
     2 ^ 4 - 2 ^ 3 - 2 ^ 2 - 2 ^ 1 - 2 ^ 0
   : Z
*)
```
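For readers without Coq handy, here is a hedged Python mirror of the `exps`/`p'` computation above (same greedy peeling of powers of two; `minus_only_taps` is a name made up for this sketch):

```python
def minus_only_taps(p):
    """Rewrite p as 2^s_exp minus a list of powers of two ('-' taps only)."""
    s_exp = (p - 1).bit_length()        # Z.log2_up p, for p > 1
    remaining = 2**s_exp - p
    exps = []
    while remaining > 0:
        e = remaining.bit_length() - 1  # Z.log2 remaining
        exps.append(e)
        remaining -= 2**e
    return s_exp, exps

p256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
p224 = 2**224 - 2**96 + 1
for p in (p256, p224):
    s_exp, exps = minus_only_taps(p)
    assert 2**s_exp - sum(2**e for e in exps) == p  # round-trips exactly
```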
But in any case, what you're working on seems like great progress, and thanks for figuring it out and doing it!
Have looked into this some more and ultimately my conclusion is that p224 and p256 were not meant for unsaturated solinas and implementing the extra tricks needed to accommodate them is probably more time and effort than it's worth. You need, at least:
- balance for `reduce`, which is at least `carry`, `carry_mul`, and `carry_square`
- `carry` won't need nearly as much as `carry_mul`
- wider-than-128-bit accumulation in `reduce` (as in https://github.com/openssl/openssl/blob/master/crypto/ec/ecp_nistp224.c#L547 -- p224 `c` in positional form is `[-1; 2^40]`, so `reduce` involves adding partial products left-shifted by 40, which is large enough to overflow even 128-bit limbs)

This is all possible but probably weeks of work, and I'm not really up to putting weeks into it.
Mmm, that sounds unfortunate, and thanks for investigating.
I think it's probably still worth merging the change in computing the balance for sub/opp, even if we don't also get carry/mul working. Btw, does the following formula work for computing the adjusted balance: `carry (encode p - tight_bounds) + tight_bounds`? It seems like it should compute the right thing, and also should be pretty easy to prove that it evaluates to a multiple of p.
> I think it's probably still worth merging the change in computing the balance for sub/opp, even if we don't also get carry/mul working.
Sounds good. I'll get the new sub/opp balance in a state to merge and make a PR for it.
> Btw, does the following formula work for computing the adjusted balance: `carry (encode p - tight_bounds) + tight_bounds`? It seems like it should compute the right thing, and also should be pretty easy to prove that it evaluates to a multiple of p.
No, that formula doesn't work -- but maybe I'm misunderstanding what you mean, because I don't see why it should. (I'm assuming by "balance" you mean sub/opp balance.) For p224, where the lowest limb of `encode p` is 1, subtracting `tight_bounds` gives you `-(2^56-1)`. Then when you carry from that limb, you take that mod `2^56` and get again a 1 in that position, while carrying a `-1`. But the limb still stays low, while what you need is to distribute it so that it's higher than the tight bounds.
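The carry step can be checked directly in Python (illustrative; floor division and modulo play the role of the carry split):

```python
# Low limb of encode(p224) - tight_bounds is 1 - 2^56 = -(2^56 - 1).
x = -(2**56 - 1)
low = x % 2**56     # the limb left behind: back to 1
carry = x // 2**56  # floor division: -1 carried into the next limb
```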
To be specific:
```coq
(* the balance we want to compute *)
Compute show false (@sub_balance 1 n s c).
(* = "[2^56+2, 2^57-2^41-1, 2^57-2, 2^57-2]" *)

(* some relevant values *)
Compute show false (@encode weight n s c (s - Associational.eval c)).
(* = "[1, 2^56-2^40, 2^56-1, 2^56-1]" *)
Compute show false (@tight_upperbounds 1 n s c).
(* = "[2^56, 2^56, 2^56, 2^56]" *)
Compute show false (carry_chains n s c).
(* = "[0, 3, 1, 0, 2, 3, 1, 0]" *)

(* encode p - tight_upperbounds *)
Compute show false (sub weight n (zeros n) (encode weight n s c (s - Associational.eval c)) (@tight_upperbounds 1 n s c)).
(* = "[-(2^56-1), -2^40, -1, -1]" *)

(* carrying from limb 0 *)
Compute show false (chained_carries weight n s c (sub weight n (zeros n) (encode weight n s c (s - Associational.eval c)) (@tight_upperbounds 1 n s c)) [0]%nat).
(* = "[1, -(2^40+1), -1, -1]" *)

(* doing full carry chain *)
Compute show false (chained_carries weight n s c (sub weight n (zeros n) (encode weight n s c (s - Associational.eval c)) (@tight_upperbounds 1 n s c)) (carry_chains n s c)).
(* = "[2, 2^56-2^41-1, 2^56-2, 2^56-2]" *)
```
> But the limb still stays low, while what you need is to distribute it so that it's higher than the tight bounds.
>
> To be specific:
You're missing the final step, which is adding back `tight_bounds` without carrying. This is the step that ensures each limb is high enough. (And it does indeed seem to give the same value as `sub_balance`.)
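Numerically, using the p224 values printed above (illustrative check only):

```python
# carry(encode p - tight_upperbounds), per the chained_carries output above,
# plus tight_upperbounds, limb by limb.
carried = [2, 2**56 - 2**41 - 1, 2**56 - 2, 2**56 - 2]
tight = [2**56] * 4
balance = [a + b for a, b in zip(carried, tight)]
# balance matches sub_balance: [2^56+2, 2^57-2^41-1, 2^57-2, 2^57-2]
```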
Ah, I indeed missed that part. In that case, I'm happy for you to change it if you prefer that version. But I'll make a PR with the original formula to start with, since I already prepared a branch for it and it passed tests. The proofs weren't difficult, but your version might be more readable.
Do we have a lemma that says that all limbs are non-negative after carrying? (Would it be reasonable to have one, or is it overkill to specify this property?)
Synthesis fails on
With a couple of issues:
- `to_bytes` immediately calls sbb in a way that gives a borrow bit which is maybe-a-borrow-but-maybe-a-carry: `Z.sub_with_get_borrow(2^56, 0, x1[0], 1)`
Is this an issue with alternating TAPs? An issue with us not supporting karatsuba or goldilocks? An issue with computations of carry chains (we're using `[0, 3, 1, 0, 2, 3, 1, 0]`)? Something else?

cc @davidben @andres-erbsen