about summary refs log tree commit diff stats
path: root/test/monniaux/BearSSL/src/ec
diff options
context:
space:
mode:
Diffstat (limited to 'test/monniaux/BearSSL/src/ec')
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_all_m15.c 121
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_all_m31.c 171
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_c25519_i15.c 398
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_c25519_i31.c 390
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_c25519_m15.c 1478
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_c25519_m31.c 800
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_c25519_m62.c 605
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_c25519_m64.c 835
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_curve25519.c 46
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_default.c 36
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_keygen.c 86
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_p256_m15.c 2130
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_p256_m31.c 1475
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_p256_m62.c 1765
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_p256_m64.c 1730
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_prime_i15.c 820
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_prime_i31.c 819
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_pubkey.c 85
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_secp256r1.c 51
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_secp384r1.c 57
-rw-r--r-- test/monniaux/BearSSL/src/ec/ec_secp521r1.c 64
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_atr.c 134
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_default_sign_asn1.c 36
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_default_sign_raw.c 36
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_asn1.c 36
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_raw.c 36
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_i15_bits.c 47
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_asn1.c 45
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_raw.c 174
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_asn1.c 48
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_raw.c 166
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_i31_bits.c 47
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_asn1.c 45
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_raw.c 173
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_asn1.c 48
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_raw.c 165
-rw-r--r-- test/monniaux/BearSSL/src/ec/ecdsa_rta.c 121
37 files changed, 15319 insertions, 0 deletions
diff --git a/test/monniaux/BearSSL/src/ec/ec_all_m15.c b/test/monniaux/BearSSL/src/ec/ec_all_m15.c
new file mode 100644
index 00000000..bb550e18
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_all_m15.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* Generator lookup: P-256 and Curve25519 use their m15 code, others i15. */
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	if (curve == BR_EC_secp256r1) {
+		return br_ec_p256_m15.generator(curve, len);
+	}
+	if (curve == BR_EC_curve25519) {
+		return br_ec_c25519_m15.generator(curve, len);
+	}
+	return br_ec_prime_i15.generator(curve, len);
+}
+
+/* Order lookup: P-256 and Curve25519 use their m15 code, others i15. */
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	if (curve == BR_EC_secp256r1) {
+		return br_ec_p256_m15.order(curve, len);
+	}
+	if (curve == BR_EC_curve25519) {
+		return br_ec_c25519_m15.order(curve, len);
+	}
+	return br_ec_prime_i15.order(curve, len);
+}
+
+/* xoff lookup: P-256 and Curve25519 use their m15 code, others i15. */
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	if (curve == BR_EC_secp256r1) {
+		return br_ec_p256_m15.xoff(curve, len);
+	}
+	if (curve == BR_EC_curve25519) {
+		return br_ec_c25519_m15.xoff(curve, len);
+	}
+	return br_ec_prime_i15.xoff(curve, len);
+}
+
+/* Point multiplication, routed to the backend that handles `curve`. */
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	if (curve == BR_EC_secp256r1) {
+		return br_ec_p256_m15.mul(G, Glen, kb, kblen, curve);
+	}
+	if (curve == BR_EC_curve25519) {
+		return br_ec_c25519_m15.mul(G, Glen, kb, kblen, curve);
+	}
+	return br_ec_prime_i15.mul(G, Glen, kb, kblen, curve);
+}
+
+/* Generator multiplication, routed to the backend that handles `curve`. */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	if (curve == BR_EC_secp256r1) {
+		return br_ec_p256_m15.mulgen(R, x, xlen, curve);
+	}
+	if (curve == BR_EC_curve25519) {
+		return br_ec_c25519_m15.mulgen(R, x, xlen, curve);
+	}
+	return br_ec_prime_i15.mulgen(R, x, xlen, curve);
+}
+
+/* muladd, routed to the backend that handles `curve`. */
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	if (curve == BR_EC_secp256r1) {
+		return br_ec_p256_m15.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+	}
+	if (curve == BR_EC_curve25519) {
+		return br_ec_c25519_m15.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+	}
+	return br_ec_prime_i15.muladd(A, B, len,
+		x, xlen, y, ylen, curve);
+}
+
+/* Dispatching implementation over the 15-bit backends; see bearssl_ec.h */
+const br_ec_impl br_ec_all_m15 = {
+	(uint32_t)0x23800000,	/* bits 23, 24, 25 and 29 are set */
+	api_generator,
+	api_order,
+	api_xoff,
+	api_mul,
+	api_mulgen,
+	api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_all_m31.c b/test/monniaux/BearSSL/src/ec/ec_all_m31.c
new file mode 100644
index 00000000..8fd8c3c0
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_all_m31.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* Generator lookup; m64 code is used when BR_INT128 or BR_UMUL128 is set. */
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	if (curve == BR_EC_secp256r1) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.generator(curve, len);
+#else
+		return br_ec_p256_m31.generator(curve, len);
+#endif
+	}
+	if (curve == BR_EC_curve25519) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.generator(curve, len);
+#else
+		return br_ec_c25519_m31.generator(curve, len);
+#endif
+	}
+	return br_ec_prime_i31.generator(curve, len);
+}
+
+/* Order lookup; m64 code is used when BR_INT128 or BR_UMUL128 is set. */
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	if (curve == BR_EC_secp256r1) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.order(curve, len);
+#else
+		return br_ec_p256_m31.order(curve, len);
+#endif
+	}
+	if (curve == BR_EC_curve25519) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.order(curve, len);
+#else
+		return br_ec_c25519_m31.order(curve, len);
+#endif
+	}
+	return br_ec_prime_i31.order(curve, len);
+}
+
+/* xoff lookup; m64 code is used when BR_INT128 or BR_UMUL128 is set. */
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	if (curve == BR_EC_secp256r1) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.xoff(curve, len);
+#else
+		return br_ec_p256_m31.xoff(curve, len);
+#endif
+	}
+	if (curve == BR_EC_curve25519) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.xoff(curve, len);
+#else
+		return br_ec_c25519_m31.xoff(curve, len);
+#endif
+	}
+	return br_ec_prime_i31.xoff(curve, len);
+}
+
+/* Point multiplication; m64 code is used when BR_INT128 or BR_UMUL128 is set. */
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	if (curve == BR_EC_secp256r1) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.mul(G, Glen, kb, kblen, curve);
+#else
+		return br_ec_p256_m31.mul(G, Glen, kb, kblen, curve);
+#endif
+	}
+	if (curve == BR_EC_curve25519) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.mul(G, Glen, kb, kblen, curve);
+#else
+		return br_ec_c25519_m31.mul(G, Glen, kb, kblen, curve);
+#endif
+	}
+	return br_ec_prime_i31.mul(G, Glen, kb, kblen, curve);
+}
+
+/* Generator multiplication; m64 code is used when BR_INT128 or BR_UMUL128 is set. */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	if (curve == BR_EC_secp256r1) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.mulgen(R, x, xlen, curve);
+#else
+		return br_ec_p256_m31.mulgen(R, x, xlen, curve);
+#endif
+	}
+	if (curve == BR_EC_curve25519) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.mulgen(R, x, xlen, curve);
+#else
+		return br_ec_c25519_m31.mulgen(R, x, xlen, curve);
+#endif
+	}
+	return br_ec_prime_i31.mulgen(R, x, xlen, curve);
+}
+
+/* muladd; m64 code is used when BR_INT128 or BR_UMUL128 is set. */
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	if (curve == BR_EC_secp256r1) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+#else
+		return br_ec_p256_m31.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+#endif
+	}
+	if (curve == BR_EC_curve25519) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+#else
+		return br_ec_c25519_m31.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+#endif
+	}
+	return br_ec_prime_i31.muladd(A, B, len,
+		x, xlen, y, ylen, curve);
+}
+
+/* Dispatching implementation over the 31/64-bit backends; see bearssl_ec.h */
+const br_ec_impl br_ec_all_m31 = {
+	(uint32_t)0x23800000,	/* bits 23, 24, 25 and 29 are set */
+	api_generator,
+	api_order,
+	api_xoff,
+	api_mul,
+	api_mulgen,
+	api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_i15.c b/test/monniaux/BearSSL/src/ec/ec_c25519_i15.c
new file mode 100644
index 00000000..8fadcf48
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_c25519_i15.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Parameters for the field:
+ * - field modulus p = 2^255-19
+ * - R^2 mod p (R = 2^(15k) for the smallest k such that R >= p)
+ */
+
+static const uint16_t C255_P[] = { /* p = 2^255-19: 17 words of 15 bits, low word first */
+ 0x0110, /* header word; api_mul() passes C255_P[0] to br_i15_zero() as the size */
+ 0x7FED, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+ 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+ 0x7FFF
+};
+/* P0I * C255_P[1] = -1 mod 2^15; handed to every br_i15_montymul() call below */
+#define P0I 0x4A1B
+/* 0x0169 = 361 = 19^2, where 19 = 2^255 mod p (so R = 2^255 for this file) */
+static const uint16_t C255_R2[] = {
+ 0x0110,
+ 0x0169, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000
+};
+
+/* obsolete
+#include <stdio.h>
+#include <stdlib.h>
+static void
+print_int_mont(const char *name, const uint16_t *x)
+{
+ uint16_t y[18];
+ unsigned char tmp[32];
+ size_t u;
+
+ printf("%s = ", name);
+ memcpy(y, x, sizeof y);
+ br_i15_from_monty(y, C255_P, P0I);
+ br_i15_encode(tmp, sizeof tmp, y);
+ for (u = 0; u < sizeof tmp; u ++) {
+ printf("%02X", tmp[u]);
+ }
+ printf("\n");
+}
+*/
+
+static const uint16_t C255_A24[] = { /* 0x2345D3 = 121665 * 19, i.e. 121665 times R mod p */
+ 0x0110, /* same header word as C255_P */
+ 0x45D3, 0x0046, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000
+};
+
+static const unsigned char GEN[] = { /* 32-byte point encoding, first byte 9 (api_mul() treats it as little-endian) */
+ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+/* NOTE(review): 0x7F then 31 x 0xFF; looks like a 255-bit bound, not the subgroup order -- confirm */
+static const unsigned char ORDER[] = {
+ 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	*len = sizeof GEN;	/* 32 bytes */
+	(void)curve;		/* same table whatever the curve ID */
+	return GEN;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	*len = sizeof ORDER;	/* 32 bytes */
+	(void)curve;		/* same table whatever the curve ID */
+	return ORDER;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	*len = 32;	/* length reported to the caller */
+	(void)curve;	/* same answer whatever the curve ID */
+	return 0;	/* reported offset */
+}
+
+static void /* exchange a[] and b[] (18 words each) when ctl is 1; no-op when ctl is 0 */
+cswap(uint16_t *a, uint16_t *b, uint32_t ctl)
+{
+ int i;
+
+ ctl = -ctl; /* 0 stays 0, 1 becomes 0xFFFFFFFF: used as an AND mask, no branch */
+ for (i = 0; i < 18; i ++) {
+ uint32_t aw, bw, tw;
+
+ aw = a[i];
+ bw = b[i];
+ tw = ctl & (aw ^ bw); /* a^b when swapping, 0 otherwise */
+ a[i] = aw ^ tw;
+ b[i] = bw ^ tw;
+ }
+}
+
+static void /* d = a + b, followed by one conditional subtraction of p */
+c255_add(uint16_t *d, const uint16_t *a, const uint16_t *b)
+{
+ uint32_t ctl;
+ uint16_t t[18];
+ /* Work in t[] so that d may alias a or b (api_mul() relies on this). */
+ memcpy(t, a, sizeof t);
+ ctl = br_i15_add(t, b, 1); /* presumably returns the carry -- see inner.h */
+ ctl |= NOT(br_i15_sub(t, C255_P, 0)); /* ctl=0 call: presumably a borrow probe only, t unchanged */
+ br_i15_sub(t, C255_P, ctl); /* subtract p only when one of the two flags is set */
+ memcpy(d, t, sizeof t);
+}
+
+static void /* d = a - b, with p conditionally added back */
+c255_sub(uint16_t *d, const uint16_t *a, const uint16_t *b)
+{
+ uint16_t t[18];
+ /* Work in t[] so that d may alias a or b. */
+ memcpy(t, a, sizeof t);
+ br_i15_add(t, C255_P, br_i15_sub(t, b, 1)); /* add p iff the subtraction result (presumably the borrow) is 1 */
+ memcpy(d, t, sizeof t);
+}
+
+static void /* d = Montgomery product of a and b modulo C255_P */
+c255_mul(uint16_t *d, const uint16_t *a, const uint16_t *b)
+{
+ uint16_t t[18];
+ /* Result goes through t[]: callers pass d == a and/or d == b. */
+ br_i15_montymul(t, a, b, C255_P, P0I);
+ memcpy(d, t, sizeof t);
+}
+
+/* Reverse the 32 bytes of G in place. */
+static void
+byteswap(unsigned char *G)
+{
+	unsigned char *lo = G;
+	unsigned char *hi = G + 31;
+
+	while (lo < hi) {
+		unsigned char tmp = *lo;
+		*lo ++ = *hi;
+		*hi -- = tmp;
+	}
+}
+
+static uint32_t /* G <- k*G in place; returns 1, or 0 (G untouched) if Glen != 32 or kblen > 32 */
+api_mul(unsigned char *G, size_t Glen,
+ const unsigned char *kb, size_t kblen, int curve)
+{
+#define ILEN (18 * sizeof(uint16_t))
+
+ /*
+ * The a[] and b[] arrays have an extra word to allow for
+ * decoding without using br_i15_decode_reduce().
+ */
+ uint16_t x1[18], x2[18], x3[18], z2[18], z3[18];
+ uint16_t a[19], aa[18], b[19], bb[18];
+ uint16_t c[18], d[18], e[18], da[18], cb[18];
+ unsigned char k[32];
+ uint32_t swap;
+ int i;
+
+ (void)curve;
+
+ /*
+ * Points are encoded over exactly 32 bytes. Multipliers must fit
+ * in 32 bytes as well.
+ * RFC 7748 mandates that the high bit of the last point byte must
+ * be ignored/cleared.
+ */
+ if (Glen != 32 || kblen > 32) {
+ return 0;
+ }
+ G[31] &= 0x7F;
+
+ /*
+ * Byteswap the point encoding, because it uses little-endian, and
+ * the generic decoding routine uses big-endian.
+ */
+ byteswap(G);
+
+ /*
+ * Decode the point ('u' coordinate). This should be reduced
+ * modulo p, but we prefer to avoid the dependency on
+ * br_i15_decode_reduce(). Instead, we use br_i15_decode_mod()
+ * with a synthetic modulus of value 2^255 (this must work
+ * since G was truncated to 255 bits), then use a conditional
+ * subtraction. We use br_i15_decode_mod() and not
+ * br_i15_decode(), because the ec_prime_i15 implementation uses
+ * the former but not the latter.
+ * br_i15_decode_reduce(a, G, 32, C255_P);
+ */
+ br_i15_zero(b, 0x111);
+ b[18] = 1; /* b = 2^255: bit 0 of the 18th value word (17 * 15 = 255) */
+ br_i15_decode_mod(a, G, 32, b);
+ a[0] = 0x110; /* give a[] the same header word as C255_P */
+ br_i15_sub(a, C255_P, NOT(br_i15_sub(a, C255_P, 0)));
+
+ /*
+ * Initialise variables x1, x2, z2, x3 and z3. We set all of them
+ * into Montgomery representation.
+ */
+ br_i15_montymul(x1, a, C255_R2, C255_P, P0I);
+ memcpy(x3, x1, ILEN);
+ br_i15_zero(z2, C255_P[0]);
+ memcpy(x2, z2, ILEN);
+ x2[1] = 19; /* 19 = 2^255 mod p, i.e. the value 1 in Montgomery representation */
+ memcpy(z3, x2, ILEN);
+ /* kb[] is big-endian and may be shorter than k[]: left-pad with zeros. */
+ memset(k, 0, (sizeof k) - kblen);
+ memcpy(k + (sizeof k) - kblen, kb, kblen);
+ k[31] &= 0xF8; /* clear bits 0..2 of the multiplier */
+ k[0] &= 0x7F; /* clear bit 255 */
+ k[0] |= 0x40; /* set bit 254 */
+
+ /* obsolete
+ print_int_mont("x1", x1);
+ */
+
+ swap = 0;
+ for (i = 254; i >= 0; i --) {
+ uint32_t kt;
+
+ kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; /* bit i of the multiplier */
+ swap ^= kt;
+ cswap(x2, x3, swap);
+ cswap(z2, z3, swap);
+ swap = kt;
+
+ /* obsolete
+ print_int_mont("x2", x2);
+ print_int_mont("z2", z2);
+ print_int_mont("x3", x3);
+ print_int_mont("z3", z3);
+ */
+
+ c255_add(a, x2, z2);
+ c255_mul(aa, a, a);
+ c255_sub(b, x2, z2);
+ c255_mul(bb, b, b);
+ c255_sub(e, aa, bb);
+ c255_add(c, x3, z3);
+ c255_sub(d, x3, z3);
+ c255_mul(da, d, a);
+ c255_mul(cb, c, b);
+
+ /* obsolete
+ print_int_mont("a ", a);
+ print_int_mont("aa", aa);
+ print_int_mont("b ", b);
+ print_int_mont("bb", bb);
+ print_int_mont("e ", e);
+ print_int_mont("c ", c);
+ print_int_mont("d ", d);
+ print_int_mont("da", da);
+ print_int_mont("cb", cb);
+ */
+
+ c255_add(x3, da, cb);
+ c255_mul(x3, x3, x3);
+ c255_sub(z3, da, cb);
+ c255_mul(z3, z3, z3);
+ c255_mul(z3, z3, x1);
+ c255_mul(x2, aa, bb);
+ c255_mul(z2, C255_A24, e);
+ c255_add(z2, z2, aa);
+ c255_mul(z2, e, z2);
+
+ /* obsolete
+ print_int_mont("x2", x2);
+ print_int_mont("z2", z2);
+ print_int_mont("x3", x3);
+ print_int_mont("z3", z3);
+ */
+ }
+ cswap(x2, x3, swap);
+ cswap(z2, z3, swap);
+
+ /*
+ * Invert z2 with a modular exponentiation (exponent p-2). This is
+ * a simple square-and-multiply algorithm; multiplications are
+ * batched since the exponent bits are almost all ones.
+ */
+ memcpy(a, z2, ILEN);
+ for (i = 0; i < 15; i ++) {
+ c255_mul(a, a, a);
+ c255_mul(a, a, z2);
+ } /* a = z2^(2^16-1) */
+ memcpy(b, a, ILEN);
+ for (i = 0; i < 14; i ++) {
+ int j;
+
+ for (j = 0; j < 16; j ++) {
+ c255_mul(b, b, b);
+ }
+ c255_mul(b, b, a);
+ } /* b = z2^(2^240-1) */
+ for (i = 14; i >= 0; i --) { /* low 15 exponent bits: total is 2^255-21 = p-2 */
+ c255_mul(b, b, b);
+ if ((0xFFEB >> i) & 1) {
+ c255_mul(b, z2, b);
+ }
+ }
+ c255_mul(b, x2, b); /* b = x2 / z2 */
+
+ /*
+ * To avoid a dependency on br_i15_from_monty(), we use a
+ * Montgomery multiplication with 1.
+ * memcpy(x2, b, ILEN);
+ * br_i15_from_monty(x2, C255_P, P0I);
+ */
+ br_i15_zero(a, C255_P[0]);
+ a[1] = 1;
+ br_i15_montymul(x2, a, b, C255_P, P0I);
+
+ br_i15_encode(G, 32, x2);
+ byteswap(G); /* back to the little-endian point encoding */
+ return 1;
+
+#undef ILEN
+}
+
+/* R <- x*G: seed R with the generator encoding, then multiply in place. */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	size_t point_len;
+	const unsigned char *base = api_generator(curve, &point_len);
+
+	memcpy(R, base, point_len);
+	(void)api_mul(R, point_len, x, xlen, curve);	/* status is not propagated */
+	return point_len;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * Deliberately unimplemented: this entry point is only needed
+	 * for ECDSA, and there is no ECDSA over Curve25519 (EdDSA is
+	 * used instead). Every call reports failure (0).
+	 */
+	(void)curve;
+	(void)ylen;
+	(void)y;
+	(void)xlen;
+	(void)x;
+	(void)len;
+	(void)B;
+	(void)A;
+	return 0;
+}
+
+/* Curve25519 implementation on top of the br_i15_* routines; see bearssl_ec.h */
+const br_ec_impl br_ec_c25519_i15 = {
+	(uint32_t)0x20000000,	/* only bit 29 is set */
+	api_generator,
+	api_order,
+	api_xoff,
+	api_mul,
+	api_mulgen,
+	api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_i31.c b/test/monniaux/BearSSL/src/ec/ec_c25519_i31.c
new file mode 100644
index 00000000..f8ffc2c2
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_c25519_i31.c
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Parameters for the field:
+ * - field modulus p = 2^255-19
+ * - R^2 mod p (R = 2^(31k) for the smallest k such that R >= p)
+ */
+
+static const uint32_t C255_P[] = { /* p = 2^255-19: 8 words of 31 bits plus 7 top bits, low word first */
+ 0x00000107, /* header word; api_mul() passes C255_P[0] to br_i31_zero() as the size */
+ 0x7FFFFFED, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0000007F
+};
+/* P0I * C255_P[1] = -1 mod 2^31; handed to every br_i31_montymul() call below */
+#define P0I 0x286BCA1B
+/* 361 * 2^48 = (19 * 2^24)^2, where 19 * 2^24 = 2^279 mod p (so R = 2^279 for this file) */
+static const uint32_t C255_R2[] = {
+ 0x00000107,
+ 0x00000000, 0x02D20000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000
+};
+/* 0x2345D3 * 2^24 = 121665 * (19 * 2^24), i.e. 121665 times R mod p */
+static const uint32_t C255_A24[] = {
+ 0x00000107,
+ 0x53000000, 0x0000468B, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000
+};
+
+/* obsolete
+#include <stdio.h>
+#include <stdlib.h>
+static void
+print_int_mont(const char *name, const uint32_t *x)
+{
+ uint32_t y[10];
+ unsigned char tmp[32];
+ size_t u;
+
+ printf("%s = ", name);
+ memcpy(y, x, sizeof y);
+ br_i31_from_monty(y, C255_P, P0I);
+ br_i31_encode(tmp, sizeof tmp, y);
+ for (u = 0; u < sizeof tmp; u ++) {
+ printf("%02X", tmp[u]);
+ }
+ printf("\n");
+}
+*/
+
+static const unsigned char GEN[] = { /* 32-byte point encoding, first byte 9 (api_mul() treats it as little-endian) */
+ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+/* NOTE(review): 0x7F then 31 x 0xFF; looks like a 255-bit bound, not the subgroup order -- confirm */
+static const unsigned char ORDER[] = {
+ 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	*len = sizeof GEN;	/* 32 bytes */
+	(void)curve;		/* same table whatever the curve ID */
+	return GEN;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	*len = sizeof ORDER;	/* 32 bytes */
+	(void)curve;		/* same table whatever the curve ID */
+	return ORDER;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	*len = 32;	/* length reported to the caller */
+	(void)curve;	/* same answer whatever the curve ID */
+	return 0;	/* reported offset */
+}
+
+static void /* exchange a[] and b[] (10 words each) when ctl is 1; no-op when ctl is 0 */
+cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
+{
+ int i;
+
+ ctl = -ctl; /* 0 stays 0, 1 becomes 0xFFFFFFFF: used as an AND mask, no branch */
+ for (i = 0; i < 10; i ++) {
+ uint32_t aw, bw, tw;
+
+ aw = a[i];
+ bw = b[i];
+ tw = ctl & (aw ^ bw); /* a^b when swapping, 0 otherwise */
+ a[i] = aw ^ tw;
+ b[i] = bw ^ tw;
+ }
+}
+
+static void /* d = a + b, followed by one conditional subtraction of p */
+c255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ uint32_t ctl;
+ uint32_t t[10];
+ /* Work in t[] so that d may alias a or b (api_mul() relies on this). */
+ memcpy(t, a, sizeof t);
+ ctl = br_i31_add(t, b, 1); /* presumably returns the carry -- see inner.h */
+ ctl |= NOT(br_i31_sub(t, C255_P, 0)); /* ctl=0 call: presumably a borrow probe only, t unchanged */
+ br_i31_sub(t, C255_P, ctl); /* subtract p only when one of the two flags is set */
+ memcpy(d, t, sizeof t);
+}
+
+static void /* d = a - b, with p conditionally added back */
+c255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ uint32_t t[10];
+ /* Work in t[] so that d may alias a or b. */
+ memcpy(t, a, sizeof t);
+ br_i31_add(t, C255_P, br_i31_sub(t, b, 1)); /* add p iff the subtraction result (presumably the borrow) is 1 */
+ memcpy(d, t, sizeof t);
+}
+
+static void /* d = Montgomery product of a and b modulo C255_P */
+c255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ uint32_t t[10];
+ /* Result goes through t[]: callers pass d == a and/or d == b. */
+ br_i31_montymul(t, a, b, C255_P, P0I);
+ memcpy(d, t, sizeof t);
+}
+
+/* Reverse the 32 bytes of G in place. */
+static void
+byteswap(unsigned char *G)
+{
+	unsigned char *lo = G;
+	unsigned char *hi = G + 31;
+
+	while (lo < hi) {
+		unsigned char tmp = *lo;
+		*lo ++ = *hi;
+		*hi -- = tmp;
+	}
+}
+
+static uint32_t /* G <- k*G in place; returns 1, or 0 (G untouched) if Glen != 32 or kblen > 32 */
+api_mul(unsigned char *G, size_t Glen,
+ const unsigned char *kb, size_t kblen, int curve)
+{
+ uint32_t x1[10], x2[10], x3[10], z2[10], z3[10];
+ uint32_t a[10], aa[10], b[10], bb[10];
+ uint32_t c[10], d[10], e[10], da[10], cb[10];
+ unsigned char k[32];
+ uint32_t swap;
+ int i;
+
+ (void)curve;
+
+ /*
+ * Points are encoded over exactly 32 bytes. Multipliers must fit
+ * in 32 bytes as well.
+ * RFC 7748 mandates that the high bit of the last point byte must
+ * be ignored/cleared.
+ */
+ if (Glen != 32 || kblen > 32) {
+ return 0;
+ }
+ G[31] &= 0x7F;
+
+ /*
+ * Byteswap the point encoding, because it uses little-endian, and
+ * the generic decoding routine uses big-endian.
+ */
+ byteswap(G);
+
+ /*
+ * Decode the point ('u' coordinate). This should be reduced
+ * modulo p, but we prefer to avoid the dependency on
+ * br_i31_decode_reduce(). Instead, we use br_i31_decode_mod()
+ * with a synthetic modulus of value 2^255 (this must work
+ * since G was truncated to 255 bits), then use a conditional
+ * subtraction. We use br_i31_decode_mod() and not
+ * br_i31_decode(), because the ec_prime_i31 implementation uses
+ * the former but not the latter.
+ * br_i31_decode_reduce(a, G, 32, C255_P);
+ */
+ br_i31_zero(b, 0x108);
+ b[9] = 0x0080; /* b = 2^255: bit 7 of the 9th value word (8 * 31 + 7 = 255) */
+ br_i31_decode_mod(a, G, 32, b);
+ a[0] = 0x107; /* give a[] the same header word as C255_P */
+ br_i31_sub(a, C255_P, NOT(br_i31_sub(a, C255_P, 0)));
+
+ /*
+ * Initialise variables x1, x2, z2, x3 and z3. We set all of them
+ * into Montgomery representation.
+ */
+ br_i31_montymul(x1, a, C255_R2, C255_P, P0I);
+ memcpy(x3, x1, sizeof x1);
+ br_i31_zero(z2, C255_P[0]);
+ memcpy(x2, z2, sizeof z2);
+ x2[1] = 0x13000000; /* 19 * 2^24 = 2^279 mod p, i.e. the value 1 in Montgomery representation */
+ memcpy(z3, x2, sizeof x2);
+
+ /*
+ * kb[] is in big-endian notation, but possibly shorter than k[].
+ */
+ memset(k, 0, (sizeof k) - kblen);
+ memcpy(k + (sizeof k) - kblen, kb, kblen);
+ k[31] &= 0xF8; /* clear bits 0..2 of the multiplier */
+ k[0] &= 0x7F; /* clear bit 255 */
+ k[0] |= 0x40; /* set bit 254 */
+
+ /* obsolete
+ print_int_mont("x1", x1);
+ */
+
+ swap = 0;
+ for (i = 254; i >= 0; i --) {
+ uint32_t kt;
+
+ kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; /* bit i of the multiplier */
+ swap ^= kt;
+ cswap(x2, x3, swap);
+ cswap(z2, z3, swap);
+ swap = kt;
+
+ /* obsolete
+ print_int_mont("x2", x2);
+ print_int_mont("z2", z2);
+ print_int_mont("x3", x3);
+ print_int_mont("z3", z3);
+ */
+
+ c255_add(a, x2, z2);
+ c255_mul(aa, a, a);
+ c255_sub(b, x2, z2);
+ c255_mul(bb, b, b);
+ c255_sub(e, aa, bb);
+ c255_add(c, x3, z3);
+ c255_sub(d, x3, z3);
+ c255_mul(da, d, a);
+ c255_mul(cb, c, b);
+
+ /* obsolete
+ print_int_mont("a ", a);
+ print_int_mont("aa", aa);
+ print_int_mont("b ", b);
+ print_int_mont("bb", bb);
+ print_int_mont("e ", e);
+ print_int_mont("c ", c);
+ print_int_mont("d ", d);
+ print_int_mont("da", da);
+ print_int_mont("cb", cb);
+ */
+
+ c255_add(x3, da, cb);
+ c255_mul(x3, x3, x3);
+ c255_sub(z3, da, cb);
+ c255_mul(z3, z3, z3);
+ c255_mul(z3, z3, x1);
+ c255_mul(x2, aa, bb);
+ c255_mul(z2, C255_A24, e);
+ c255_add(z2, z2, aa);
+ c255_mul(z2, e, z2);
+
+ /* obsolete
+ print_int_mont("x2", x2);
+ print_int_mont("z2", z2);
+ print_int_mont("x3", x3);
+ print_int_mont("z3", z3);
+ */
+ }
+ cswap(x2, x3, swap);
+ cswap(z2, z3, swap);
+
+ /*
+ * Invert z2 with a modular exponentiation (exponent p-2). This is
+ * a simple square-and-multiply algorithm; multiplications are
+ * batched since the exponent bits are almost all ones.
+ */
+ memcpy(a, z2, sizeof z2);
+ for (i = 0; i < 15; i ++) {
+ c255_mul(a, a, a);
+ c255_mul(a, a, z2);
+ } /* a = z2^(2^16-1) */
+ memcpy(b, a, sizeof a);
+ for (i = 0; i < 14; i ++) {
+ int j;
+
+ for (j = 0; j < 16; j ++) {
+ c255_mul(b, b, b);
+ }
+ c255_mul(b, b, a);
+ } /* b = z2^(2^240-1) */
+ for (i = 14; i >= 0; i --) { /* low 15 exponent bits: total is 2^255-21 = p-2 */
+ c255_mul(b, b, b);
+ if ((0xFFEB >> i) & 1) {
+ c255_mul(b, z2, b);
+ }
+ }
+ c255_mul(b, x2, b); /* b = x2 / z2 */
+
+ /*
+ * To avoid a dependency on br_i31_from_monty(), we use
+ * a Montgomery multiplication with 1.
+ * memcpy(x2, b, sizeof b);
+ * br_i31_from_monty(x2, C255_P, P0I);
+ */
+ br_i31_zero(a, C255_P[0]);
+ a[1] = 1;
+ br_i31_montymul(x2, a, b, C255_P, P0I);
+
+ br_i31_encode(G, 32, x2);
+ byteswap(G); /* back to the little-endian point encoding */
+ return 1;
+}
+
+/* R <- x*G: seed R with the generator encoding, then multiply in place. */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	size_t point_len;
+	const unsigned char *base = api_generator(curve, &point_len);
+
+	memcpy(R, base, point_len);
+	(void)api_mul(R, point_len, x, xlen, curve);	/* status is not propagated */
+	return point_len;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * Deliberately unimplemented: this entry point is only needed
+	 * for ECDSA, and there is no ECDSA over Curve25519 (EdDSA is
+	 * used instead). Every call reports failure (0).
+	 */
+	(void)curve;
+	(void)ylen;
+	(void)y;
+	(void)xlen;
+	(void)x;
+	(void)len;
+	(void)B;
+	(void)A;
+	return 0;
+}
+
+/* Curve25519 implementation on top of the br_i31_* routines; see bearssl_ec.h */
+const br_ec_impl br_ec_c25519_i31 = {
+	(uint32_t)0x20000000,	/* only bit 29 is set */
+	api_generator,
+	api_order,
+	api_xoff,
+	api_mul,
+	api_mulgen,
+	api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_m15.c b/test/monniaux/BearSSL/src/ec/ec_c25519_m15.c
new file mode 100644
index 00000000..deff55b3
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_c25519_m15.c
@@ -0,0 +1,1478 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* obsolete
+#include <stdio.h>
+#include <stdlib.h>
+static void
+print_int(const char *name, const uint32_t *x)
+{
+ size_t u;
+ unsigned char tmp[36];
+
+ printf("%s = ", name);
+ for (u = 0; u < 20; u ++) {
+ if (x[u] > 0x1FFF) {
+ printf("INVALID:");
+ for (u = 0; u < 20; u ++) {
+ printf(" %04X", x[u]);
+ }
+ printf("\n");
+ return;
+ }
+ }
+ memset(tmp, 0, sizeof tmp);
+ for (u = 0; u < 20; u ++) {
+ uint32_t w;
+ int j, k;
+
+ w = x[u];
+ j = 13 * (int)u;
+ k = j & 7;
+ if (k != 0) {
+ w <<= k;
+ j -= k;
+ }
+ k = j >> 3;
+ tmp[35 - k] |= (unsigned char)w;
+ tmp[34 - k] |= (unsigned char)(w >> 8);
+ tmp[33 - k] |= (unsigned char)(w >> 16);
+ tmp[32 - k] |= (unsigned char)(w >> 24);
+ }
+ for (u = 4; u < 36; u ++) {
+ printf("%02X", tmp[u]);
+ }
+ printf("\n");
+}
+*/
+
+/*
+ * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
+ * that right-shifting a signed negative integer copies the sign bit
+ * (arithmetic right-shift). This is "implementation-defined behaviour",
+ * i.e. it is not undefined, but it may differ between compilers. Each
+ * compiler is supposed to document its behaviour in that respect. GCC
+ * explicitly defines that an arithmetic right shift is used. We expect
+ * all other compilers to do the same, because the underlying CPUs offer
+ * an arithmetic right shift opcode that could not be used otherwise.
+ *
+ * ARSH(x, n) is the arithmetic right shift of the 32-bit value x by n
+ * bits. Constraints that follow from the two definitions below:
+ *
+ * - Portable variant (BR_NO_ARITH_SHIFT non-zero): the logical shift is
+ * ORed with the sign bit replicated into the top n bits. The second
+ * term shifts by 32 - n, so n must be in the 1..31 range, and x is
+ * expanded twice (it must have no side effect).
+ *
+ * - Default variant: the address of x is taken and the object is read
+ * through an int32_t pointer, so x must be an lvalue designating a
+ * 32-bit integer object (not an arbitrary expression).
+ */
+#if BR_NO_ARITH_SHIFT
+#define ARSH(x, n) (((uint32_t)(x) >> (n)) \
+ | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
+#else
+#define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
+#endif
+
+/*
+ * Unpack an unsigned little-endian byte string into 13-bit words, least
+ * significant word first. A word is written to dst[] each time at least
+ * 13 bits are pending; the bits still pending once all len source bytes
+ * have been consumed (fewer than 13 of them) are not stored: they are
+ * the returned value, and the caller decides where to put them.
+ */
+static uint32_t
+le8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
+{
+	uint32_t pending;
+	int pending_bits;
+	size_t in_pos, out_pos;
+
+	pending = 0;
+	pending_bits = 0;
+	out_pos = 0;
+	for (in_pos = 0; in_pos < len; in_pos ++) {
+		/* Append the next byte above the bits already pending. */
+		pending |= (uint32_t)src[in_pos] << pending_bits;
+		pending_bits += 8;
+		if (pending_bits < 13) {
+			continue;
+		}
+
+		/* A complete word is available: emit it. */
+		dst[out_pos ++] = pending & 0x1FFF;
+		pending >>= 13;
+		pending_bits -= 13;
+	}
+	return pending;
+}
+
+/*
+ * Pack a sequence of 13-bit words (least significant word first) into
+ * an unsigned little-endian byte string. Exactly len bytes are written
+ * to dst[]; a new source word is fetched whenever fewer than 8 bits are
+ * pending, so src[] must provide enough words to fill all len bytes.
+ */
+static void
+le13_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
+{
+	uint32_t pending;
+	int pending_bits;
+	size_t in_pos, out_pos;
+
+	pending = 0;
+	pending_bits = 0;
+	in_pos = 0;
+	for (out_pos = 0; out_pos < len; out_pos ++) {
+		if (pending_bits < 8) {
+			/* Not enough bits for a byte: pull in one more word. */
+			pending |= src[in_pos ++] << pending_bits;
+			pending_bits += 13;
+		}
+		dst[out_pos] = (unsigned char)pending;
+		pending >>= 8;
+		pending_bits -= 8;
+	}
+}
+
+/*
+ * Carry propagation: bring every word of w[0..len-1] down to exactly
+ * 13 bits, writing the result into d[0..len-1]. Each intermediate sum
+ * is handled as a signed 32-bit value and the carry is obtained with an
+ * arithmetic shift (ARSH). The carry out of the last word is returned.
+ * d and w may be the same array, but must not overlap partially.
+ */
+static inline uint32_t
+norm13(uint32_t *d, const uint32_t *w, size_t len)
+{
+	uint32_t carry;
+	size_t k;
+
+	carry = 0;
+	k = 0;
+	while (k < len) {
+		int32_t acc;
+
+		/* acc must remain an lvalue: ARSH may take its address. */
+		acc = w[k] + carry;
+		carry = ARSH(acc, 13);
+		d[k] = acc & 0x1FFF;
+		k ++;
+	}
+	return carry;
+}
+
+/*
+ * mul20() multiplies two 260-bit integers together. Each word must fit
+ * on 13 bits; source operands use 20 words, destination operand
+ * receives 40 words. All overlaps allowed.
+ *
+ * square20() computes the square of a 260-bit integer. Each word must
+ * fit on 13 bits; source operand uses 20 words, destination operand
+ * receives 40 words. All overlaps allowed.
+ */
+
+#if BR_SLOW_MUL15
+
+static void
+mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ /*
+ * Two-level Karatsuba: turns a 20x20 multiplication into
+ * nine 5x5 multiplications. We use 13-bit words but do not
+ * propagate carries immediately, so words may expand:
+ *
+ * - First Karatsuba decomposition turns the 20x20 mul on
+ * 13-bit words into three 10x10 muls, two on 13-bit words
+ * and one on 14-bit words.
+ *
+ * - Second Karatsuba decomposition further splits these into:
+ *
+ * * four 5x5 muls on 13-bit words
+ * * four 5x5 muls on 14-bit words
+ * * one 5x5 mul on 15-bit words
+ *
+ * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
+ * or 15-bit words, respectively.
+ *
+ * Buffer layout: u[] (from a) and v[] (from b) each hold nine
+ * slices of 5 words. Slices 0..3 are a copy of the operand;
+ * slice 4 = slice 0 + slice 1, slice 5 = slice 2 + slice 3,
+ * slice 6 = slice 0 + slice 2, slice 7 = slice 1 + slice 3 and
+ * slice 8 = slice 6 + slice 7. w[] receives the nine 10-word
+ * products, the product of slices k being stored at w[10 * k].
+ * The operands are only read while u[] and v[] are filled, and
+ * d[] is only written by the final normalisation.
+ *
+ * In the ZADD, ZADDT and ZSUB2F helper macros, offsets count
+ * 5-word slices; in CPR the offset is a plain word index.
+ */
+ uint32_t u[45], v[45], w[90];
+ uint32_t cc;
+ int i;
+ /* ZADD: slice d_off of dw = slice s1_off of s1w + slice s2_off of s2w (word by word, no carry). */
+#define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
+ (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
+ + (s2w)[5 * (s2_off) + 0]; \
+ (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
+ + (s2w)[5 * (s2_off) + 1]; \
+ (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
+ + (s2w)[5 * (s2_off) + 2]; \
+ (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
+ + (s2w)[5 * (s2_off) + 3]; \
+ (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
+ + (s2w)[5 * (s2_off) + 4]; \
+ } while (0)
+ /* ZADDT: add slice s_off of sw into slice d_off of dw (word by word, no carry). */
+#define ZADDT(dw, d_off, sw, s_off) do { \
+ (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
+ (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
+ (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
+ (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
+ (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
+ } while (0)
+ /* ZSUB2F: subtract the sum of two source slices from slice d_off of dw (word by word, no borrow). */
+#define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
+ (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
+ + (s2w)[5 * (s2_off) + 0]; \
+ (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
+ + (s2w)[5 * (s2_off) + 1]; \
+ (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
+ + (s2w)[5 * (s2_off) + 2]; \
+ (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
+ + (s2w)[5 * (s2_off) + 3]; \
+ (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
+ + (s2w)[5 * (s2_off) + 4]; \
+ } while (0)
+ /* CPR1: add carry cprcc into word w, keep the low 13 bits in w, leave the upper bits in cprcc. */
+#define CPR1(w, cprcc) do { \
+ uint32_t cprz = (w) + cprcc; \
+ (w) = cprz & 0x1FFF; \
+ cprcc = cprz >> 13; \
+ } while (0)
+ /* CPR: carry-propagate the nine words starting at index d_off; the final carry overwrites word d_off + 9. */
+#define CPR(dw, d_off) do { \
+ uint32_t cprcc; \
+ cprcc = 0; \
+ CPR1((dw)[(d_off) + 0], cprcc); \
+ CPR1((dw)[(d_off) + 1], cprcc); \
+ CPR1((dw)[(d_off) + 2], cprcc); \
+ CPR1((dw)[(d_off) + 3], cprcc); \
+ CPR1((dw)[(d_off) + 4], cprcc); \
+ CPR1((dw)[(d_off) + 5], cprcc); \
+ CPR1((dw)[(d_off) + 6], cprcc); \
+ CPR1((dw)[(d_off) + 7], cprcc); \
+ CPR1((dw)[(d_off) + 8], cprcc); \
+ (dw)[(d_off) + 9] = cprcc; \
+ } while (0)
+
+ memcpy(u, a, 20 * sizeof *a);
+ ZADD(u, 4, a, 0, a, 1);
+ ZADD(u, 5, a, 2, a, 3);
+ ZADD(u, 6, a, 0, a, 2);
+ ZADD(u, 7, a, 1, a, 3);
+ ZADD(u, 8, u, 6, u, 7);
+
+ memcpy(v, b, 20 * sizeof *b);
+ ZADD(v, 4, b, 0, b, 1);
+ ZADD(v, 5, b, 2, b, 3);
+ ZADD(v, 6, b, 0, b, 2);
+ ZADD(v, 7, b, 1, b, 3);
+ ZADD(v, 8, v, 6, v, 7);
+
+ /*
+ * Do the first eight 5x5 muls (slices 0 to 7). Source words are
+ * at most 16382 each, so we can add product results together
+ * "as is" in 32-bit words.
+ */
+ for (i = 0; i < 40; i += 5) {
+ w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
+ w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
+ + MUL15(u[i + 1], v[i + 0]);
+ w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
+ + MUL15(u[i + 1], v[i + 1])
+ + MUL15(u[i + 2], v[i + 0]);
+ w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
+ + MUL15(u[i + 1], v[i + 2])
+ + MUL15(u[i + 2], v[i + 1])
+ + MUL15(u[i + 3], v[i + 0]);
+ w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
+ + MUL15(u[i + 1], v[i + 3])
+ + MUL15(u[i + 2], v[i + 2])
+ + MUL15(u[i + 3], v[i + 1])
+ + MUL15(u[i + 4], v[i + 0]);
+ w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
+ + MUL15(u[i + 2], v[i + 3])
+ + MUL15(u[i + 3], v[i + 2])
+ + MUL15(u[i + 4], v[i + 1]);
+ w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
+ + MUL15(u[i + 3], v[i + 3])
+ + MUL15(u[i + 4], v[i + 2]);
+ w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
+ + MUL15(u[i + 4], v[i + 3]);
+ w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
+ w[(i << 1) + 9] = 0;
+ }
+
+ /*
+ * For the 9th multiplication, source words are up to 32764,
+ * so we must do some carry propagation. If we add up to
+ * 4 products and the carry is no more than 524224, then the
+ * result fits in 32 bits, and the next carry will be no more
+ * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
+ *
+ * We thus just skip one of the products in the middle word,
+ * then do a carry propagation (this reduces words to 13 bits
+ * each, except possibly the last, which may use up to 17 bits
+ * or so), then add the missing product.
+ */
+ w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
+ w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
+ + MUL15(u[40 + 1], v[40 + 0]);
+ w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
+ + MUL15(u[40 + 1], v[40 + 1])
+ + MUL15(u[40 + 2], v[40 + 0]);
+ w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
+ + MUL15(u[40 + 1], v[40 + 2])
+ + MUL15(u[40 + 2], v[40 + 1])
+ + MUL15(u[40 + 3], v[40 + 0]);
+ w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
+ + MUL15(u[40 + 1], v[40 + 3])
+ + MUL15(u[40 + 2], v[40 + 2])
+ + MUL15(u[40 + 3], v[40 + 1]);
+ /* + MUL15(u[40 + 4], v[40 + 0]) */
+ w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
+ + MUL15(u[40 + 2], v[40 + 3])
+ + MUL15(u[40 + 3], v[40 + 2])
+ + MUL15(u[40 + 4], v[40 + 1]);
+ w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
+ + MUL15(u[40 + 3], v[40 + 3])
+ + MUL15(u[40 + 4], v[40 + 2]);
+ w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
+ + MUL15(u[40 + 4], v[40 + 3]);
+ w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
+
+ CPR(w, 80);
+
+ w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
+
+ /*
+ * The products on 14-bit words in slots 6 and 7 yield values
+ * up to 5*(16382^2) each, and we need to subtract two such
+ * values from the higher word. We need the subtraction to fit
+ * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
+ * However, 10*(16382^2) does not fit. So we must perform a
+ * bit of reduction here.
+ */
+ CPR(w, 60);
+ CPR(w, 70);
+
+ /*
+ * Recompose results.
+ */
+
+ /* 0..1*0..1 into 0..3 */
+ ZSUB2F(w, 8, w, 0, w, 2);
+ ZSUB2F(w, 9, w, 1, w, 3);
+ ZADDT(w, 1, w, 8);
+ ZADDT(w, 2, w, 9);
+
+ /* 2..3*2..3 into 4..7 */
+ ZSUB2F(w, 10, w, 4, w, 6);
+ ZSUB2F(w, 11, w, 5, w, 7);
+ ZADDT(w, 5, w, 10);
+ ZADDT(w, 6, w, 11);
+
+ /* (0..1+2..3)*(0..1+2..3) into 12..15 */
+ ZSUB2F(w, 16, w, 12, w, 14);
+ ZSUB2F(w, 17, w, 13, w, 15);
+ ZADDT(w, 13, w, 16);
+ ZADDT(w, 14, w, 17);
+
+ /* first-level recomposition */
+ ZSUB2F(w, 12, w, 0, w, 4);
+ ZSUB2F(w, 13, w, 1, w, 5);
+ ZSUB2F(w, 14, w, 2, w, 6);
+ ZSUB2F(w, 15, w, 3, w, 7);
+ ZADDT(w, 2, w, 12);
+ ZADDT(w, 3, w, 13);
+ ZADDT(w, 4, w, 14);
+ ZADDT(w, 5, w, 15);
+
+ /*
+ * Perform carry propagation to bring all words down to 13 bits.
+ */
+ cc = norm13(d, w, 40);
+ d[39] += (cc << 13); /* the carry out of word 39 is folded back into that word */
+
+#undef ZADD
+#undef ZADDT
+#undef ZSUB2F
+#undef CPR1
+#undef CPR
+}
+
+static inline void
+square20(uint32_t *d, const uint32_t *a)
+{
+ mul20(d, a, a); /* BR_SLOW_MUL15 build: no dedicated squaring code, the general product is reused */
+}
+
+#else
+
+static void
+mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ uint32_t t[39];
+ /*
+ * Fully unrolled schoolbook product: t[k] receives the sum of all
+ * MUL15(a[i], b[j]) such that i + j == k, with no carry propagation
+ * between words at this stage. The operands are only read while
+ * t[] is filled and d[] is written afterwards, so d may overlap
+ * with a and/or b.
+ */
+ t[ 0] = MUL15(a[ 0], b[ 0]);
+ t[ 1] = MUL15(a[ 0], b[ 1])
+ + MUL15(a[ 1], b[ 0]);
+ t[ 2] = MUL15(a[ 0], b[ 2])
+ + MUL15(a[ 1], b[ 1])
+ + MUL15(a[ 2], b[ 0]);
+ t[ 3] = MUL15(a[ 0], b[ 3])
+ + MUL15(a[ 1], b[ 2])
+ + MUL15(a[ 2], b[ 1])
+ + MUL15(a[ 3], b[ 0]);
+ t[ 4] = MUL15(a[ 0], b[ 4])
+ + MUL15(a[ 1], b[ 3])
+ + MUL15(a[ 2], b[ 2])
+ + MUL15(a[ 3], b[ 1])
+ + MUL15(a[ 4], b[ 0]);
+ t[ 5] = MUL15(a[ 0], b[ 5])
+ + MUL15(a[ 1], b[ 4])
+ + MUL15(a[ 2], b[ 3])
+ + MUL15(a[ 3], b[ 2])
+ + MUL15(a[ 4], b[ 1])
+ + MUL15(a[ 5], b[ 0]);
+ t[ 6] = MUL15(a[ 0], b[ 6])
+ + MUL15(a[ 1], b[ 5])
+ + MUL15(a[ 2], b[ 4])
+ + MUL15(a[ 3], b[ 3])
+ + MUL15(a[ 4], b[ 2])
+ + MUL15(a[ 5], b[ 1])
+ + MUL15(a[ 6], b[ 0]);
+ t[ 7] = MUL15(a[ 0], b[ 7])
+ + MUL15(a[ 1], b[ 6])
+ + MUL15(a[ 2], b[ 5])
+ + MUL15(a[ 3], b[ 4])
+ + MUL15(a[ 4], b[ 3])
+ + MUL15(a[ 5], b[ 2])
+ + MUL15(a[ 6], b[ 1])
+ + MUL15(a[ 7], b[ 0]);
+ t[ 8] = MUL15(a[ 0], b[ 8])
+ + MUL15(a[ 1], b[ 7])
+ + MUL15(a[ 2], b[ 6])
+ + MUL15(a[ 3], b[ 5])
+ + MUL15(a[ 4], b[ 4])
+ + MUL15(a[ 5], b[ 3])
+ + MUL15(a[ 6], b[ 2])
+ + MUL15(a[ 7], b[ 1])
+ + MUL15(a[ 8], b[ 0]);
+ t[ 9] = MUL15(a[ 0], b[ 9])
+ + MUL15(a[ 1], b[ 8])
+ + MUL15(a[ 2], b[ 7])
+ + MUL15(a[ 3], b[ 6])
+ + MUL15(a[ 4], b[ 5])
+ + MUL15(a[ 5], b[ 4])
+ + MUL15(a[ 6], b[ 3])
+ + MUL15(a[ 7], b[ 2])
+ + MUL15(a[ 8], b[ 1])
+ + MUL15(a[ 9], b[ 0]);
+ t[10] = MUL15(a[ 0], b[10])
+ + MUL15(a[ 1], b[ 9])
+ + MUL15(a[ 2], b[ 8])
+ + MUL15(a[ 3], b[ 7])
+ + MUL15(a[ 4], b[ 6])
+ + MUL15(a[ 5], b[ 5])
+ + MUL15(a[ 6], b[ 4])
+ + MUL15(a[ 7], b[ 3])
+ + MUL15(a[ 8], b[ 2])
+ + MUL15(a[ 9], b[ 1])
+ + MUL15(a[10], b[ 0]);
+ t[11] = MUL15(a[ 0], b[11])
+ + MUL15(a[ 1], b[10])
+ + MUL15(a[ 2], b[ 9])
+ + MUL15(a[ 3], b[ 8])
+ + MUL15(a[ 4], b[ 7])
+ + MUL15(a[ 5], b[ 6])
+ + MUL15(a[ 6], b[ 5])
+ + MUL15(a[ 7], b[ 4])
+ + MUL15(a[ 8], b[ 3])
+ + MUL15(a[ 9], b[ 2])
+ + MUL15(a[10], b[ 1])
+ + MUL15(a[11], b[ 0]);
+ t[12] = MUL15(a[ 0], b[12])
+ + MUL15(a[ 1], b[11])
+ + MUL15(a[ 2], b[10])
+ + MUL15(a[ 3], b[ 9])
+ + MUL15(a[ 4], b[ 8])
+ + MUL15(a[ 5], b[ 7])
+ + MUL15(a[ 6], b[ 6])
+ + MUL15(a[ 7], b[ 5])
+ + MUL15(a[ 8], b[ 4])
+ + MUL15(a[ 9], b[ 3])
+ + MUL15(a[10], b[ 2])
+ + MUL15(a[11], b[ 1])
+ + MUL15(a[12], b[ 0]);
+ t[13] = MUL15(a[ 0], b[13])
+ + MUL15(a[ 1], b[12])
+ + MUL15(a[ 2], b[11])
+ + MUL15(a[ 3], b[10])
+ + MUL15(a[ 4], b[ 9])
+ + MUL15(a[ 5], b[ 8])
+ + MUL15(a[ 6], b[ 7])
+ + MUL15(a[ 7], b[ 6])
+ + MUL15(a[ 8], b[ 5])
+ + MUL15(a[ 9], b[ 4])
+ + MUL15(a[10], b[ 3])
+ + MUL15(a[11], b[ 2])
+ + MUL15(a[12], b[ 1])
+ + MUL15(a[13], b[ 0]);
+ t[14] = MUL15(a[ 0], b[14])
+ + MUL15(a[ 1], b[13])
+ + MUL15(a[ 2], b[12])
+ + MUL15(a[ 3], b[11])
+ + MUL15(a[ 4], b[10])
+ + MUL15(a[ 5], b[ 9])
+ + MUL15(a[ 6], b[ 8])
+ + MUL15(a[ 7], b[ 7])
+ + MUL15(a[ 8], b[ 6])
+ + MUL15(a[ 9], b[ 5])
+ + MUL15(a[10], b[ 4])
+ + MUL15(a[11], b[ 3])
+ + MUL15(a[12], b[ 2])
+ + MUL15(a[13], b[ 1])
+ + MUL15(a[14], b[ 0]);
+ t[15] = MUL15(a[ 0], b[15])
+ + MUL15(a[ 1], b[14])
+ + MUL15(a[ 2], b[13])
+ + MUL15(a[ 3], b[12])
+ + MUL15(a[ 4], b[11])
+ + MUL15(a[ 5], b[10])
+ + MUL15(a[ 6], b[ 9])
+ + MUL15(a[ 7], b[ 8])
+ + MUL15(a[ 8], b[ 7])
+ + MUL15(a[ 9], b[ 6])
+ + MUL15(a[10], b[ 5])
+ + MUL15(a[11], b[ 4])
+ + MUL15(a[12], b[ 3])
+ + MUL15(a[13], b[ 2])
+ + MUL15(a[14], b[ 1])
+ + MUL15(a[15], b[ 0]);
+ t[16] = MUL15(a[ 0], b[16])
+ + MUL15(a[ 1], b[15])
+ + MUL15(a[ 2], b[14])
+ + MUL15(a[ 3], b[13])
+ + MUL15(a[ 4], b[12])
+ + MUL15(a[ 5], b[11])
+ + MUL15(a[ 6], b[10])
+ + MUL15(a[ 7], b[ 9])
+ + MUL15(a[ 8], b[ 8])
+ + MUL15(a[ 9], b[ 7])
+ + MUL15(a[10], b[ 6])
+ + MUL15(a[11], b[ 5])
+ + MUL15(a[12], b[ 4])
+ + MUL15(a[13], b[ 3])
+ + MUL15(a[14], b[ 2])
+ + MUL15(a[15], b[ 1])
+ + MUL15(a[16], b[ 0]);
+ t[17] = MUL15(a[ 0], b[17])
+ + MUL15(a[ 1], b[16])
+ + MUL15(a[ 2], b[15])
+ + MUL15(a[ 3], b[14])
+ + MUL15(a[ 4], b[13])
+ + MUL15(a[ 5], b[12])
+ + MUL15(a[ 6], b[11])
+ + MUL15(a[ 7], b[10])
+ + MUL15(a[ 8], b[ 9])
+ + MUL15(a[ 9], b[ 8])
+ + MUL15(a[10], b[ 7])
+ + MUL15(a[11], b[ 6])
+ + MUL15(a[12], b[ 5])
+ + MUL15(a[13], b[ 4])
+ + MUL15(a[14], b[ 3])
+ + MUL15(a[15], b[ 2])
+ + MUL15(a[16], b[ 1])
+ + MUL15(a[17], b[ 0]);
+ t[18] = MUL15(a[ 0], b[18])
+ + MUL15(a[ 1], b[17])
+ + MUL15(a[ 2], b[16])
+ + MUL15(a[ 3], b[15])
+ + MUL15(a[ 4], b[14])
+ + MUL15(a[ 5], b[13])
+ + MUL15(a[ 6], b[12])
+ + MUL15(a[ 7], b[11])
+ + MUL15(a[ 8], b[10])
+ + MUL15(a[ 9], b[ 9])
+ + MUL15(a[10], b[ 8])
+ + MUL15(a[11], b[ 7])
+ + MUL15(a[12], b[ 6])
+ + MUL15(a[13], b[ 5])
+ + MUL15(a[14], b[ 4])
+ + MUL15(a[15], b[ 3])
+ + MUL15(a[16], b[ 2])
+ + MUL15(a[17], b[ 1])
+ + MUL15(a[18], b[ 0]);
+ t[19] = MUL15(a[ 0], b[19])
+ + MUL15(a[ 1], b[18])
+ + MUL15(a[ 2], b[17])
+ + MUL15(a[ 3], b[16])
+ + MUL15(a[ 4], b[15])
+ + MUL15(a[ 5], b[14])
+ + MUL15(a[ 6], b[13])
+ + MUL15(a[ 7], b[12])
+ + MUL15(a[ 8], b[11])
+ + MUL15(a[ 9], b[10])
+ + MUL15(a[10], b[ 9])
+ + MUL15(a[11], b[ 8])
+ + MUL15(a[12], b[ 7])
+ + MUL15(a[13], b[ 6])
+ + MUL15(a[14], b[ 5])
+ + MUL15(a[15], b[ 4])
+ + MUL15(a[16], b[ 3])
+ + MUL15(a[17], b[ 2])
+ + MUL15(a[18], b[ 1])
+ + MUL15(a[19], b[ 0]);
+ t[20] = MUL15(a[ 1], b[19])
+ + MUL15(a[ 2], b[18])
+ + MUL15(a[ 3], b[17])
+ + MUL15(a[ 4], b[16])
+ + MUL15(a[ 5], b[15])
+ + MUL15(a[ 6], b[14])
+ + MUL15(a[ 7], b[13])
+ + MUL15(a[ 8], b[12])
+ + MUL15(a[ 9], b[11])
+ + MUL15(a[10], b[10])
+ + MUL15(a[11], b[ 9])
+ + MUL15(a[12], b[ 8])
+ + MUL15(a[13], b[ 7])
+ + MUL15(a[14], b[ 6])
+ + MUL15(a[15], b[ 5])
+ + MUL15(a[16], b[ 4])
+ + MUL15(a[17], b[ 3])
+ + MUL15(a[18], b[ 2])
+ + MUL15(a[19], b[ 1]);
+ t[21] = MUL15(a[ 2], b[19])
+ + MUL15(a[ 3], b[18])
+ + MUL15(a[ 4], b[17])
+ + MUL15(a[ 5], b[16])
+ + MUL15(a[ 6], b[15])
+ + MUL15(a[ 7], b[14])
+ + MUL15(a[ 8], b[13])
+ + MUL15(a[ 9], b[12])
+ + MUL15(a[10], b[11])
+ + MUL15(a[11], b[10])
+ + MUL15(a[12], b[ 9])
+ + MUL15(a[13], b[ 8])
+ + MUL15(a[14], b[ 7])
+ + MUL15(a[15], b[ 6])
+ + MUL15(a[16], b[ 5])
+ + MUL15(a[17], b[ 4])
+ + MUL15(a[18], b[ 3])
+ + MUL15(a[19], b[ 2]);
+ t[22] = MUL15(a[ 3], b[19])
+ + MUL15(a[ 4], b[18])
+ + MUL15(a[ 5], b[17])
+ + MUL15(a[ 6], b[16])
+ + MUL15(a[ 7], b[15])
+ + MUL15(a[ 8], b[14])
+ + MUL15(a[ 9], b[13])
+ + MUL15(a[10], b[12])
+ + MUL15(a[11], b[11])
+ + MUL15(a[12], b[10])
+ + MUL15(a[13], b[ 9])
+ + MUL15(a[14], b[ 8])
+ + MUL15(a[15], b[ 7])
+ + MUL15(a[16], b[ 6])
+ + MUL15(a[17], b[ 5])
+ + MUL15(a[18], b[ 4])
+ + MUL15(a[19], b[ 3]);
+ t[23] = MUL15(a[ 4], b[19])
+ + MUL15(a[ 5], b[18])
+ + MUL15(a[ 6], b[17])
+ + MUL15(a[ 7], b[16])
+ + MUL15(a[ 8], b[15])
+ + MUL15(a[ 9], b[14])
+ + MUL15(a[10], b[13])
+ + MUL15(a[11], b[12])
+ + MUL15(a[12], b[11])
+ + MUL15(a[13], b[10])
+ + MUL15(a[14], b[ 9])
+ + MUL15(a[15], b[ 8])
+ + MUL15(a[16], b[ 7])
+ + MUL15(a[17], b[ 6])
+ + MUL15(a[18], b[ 5])
+ + MUL15(a[19], b[ 4]);
+ t[24] = MUL15(a[ 5], b[19])
+ + MUL15(a[ 6], b[18])
+ + MUL15(a[ 7], b[17])
+ + MUL15(a[ 8], b[16])
+ + MUL15(a[ 9], b[15])
+ + MUL15(a[10], b[14])
+ + MUL15(a[11], b[13])
+ + MUL15(a[12], b[12])
+ + MUL15(a[13], b[11])
+ + MUL15(a[14], b[10])
+ + MUL15(a[15], b[ 9])
+ + MUL15(a[16], b[ 8])
+ + MUL15(a[17], b[ 7])
+ + MUL15(a[18], b[ 6])
+ + MUL15(a[19], b[ 5]);
+ t[25] = MUL15(a[ 6], b[19])
+ + MUL15(a[ 7], b[18])
+ + MUL15(a[ 8], b[17])
+ + MUL15(a[ 9], b[16])
+ + MUL15(a[10], b[15])
+ + MUL15(a[11], b[14])
+ + MUL15(a[12], b[13])
+ + MUL15(a[13], b[12])
+ + MUL15(a[14], b[11])
+ + MUL15(a[15], b[10])
+ + MUL15(a[16], b[ 9])
+ + MUL15(a[17], b[ 8])
+ + MUL15(a[18], b[ 7])
+ + MUL15(a[19], b[ 6]);
+ t[26] = MUL15(a[ 7], b[19])
+ + MUL15(a[ 8], b[18])
+ + MUL15(a[ 9], b[17])
+ + MUL15(a[10], b[16])
+ + MUL15(a[11], b[15])
+ + MUL15(a[12], b[14])
+ + MUL15(a[13], b[13])
+ + MUL15(a[14], b[12])
+ + MUL15(a[15], b[11])
+ + MUL15(a[16], b[10])
+ + MUL15(a[17], b[ 9])
+ + MUL15(a[18], b[ 8])
+ + MUL15(a[19], b[ 7]);
+ t[27] = MUL15(a[ 8], b[19])
+ + MUL15(a[ 9], b[18])
+ + MUL15(a[10], b[17])
+ + MUL15(a[11], b[16])
+ + MUL15(a[12], b[15])
+ + MUL15(a[13], b[14])
+ + MUL15(a[14], b[13])
+ + MUL15(a[15], b[12])
+ + MUL15(a[16], b[11])
+ + MUL15(a[17], b[10])
+ + MUL15(a[18], b[ 9])
+ + MUL15(a[19], b[ 8]);
+ t[28] = MUL15(a[ 9], b[19])
+ + MUL15(a[10], b[18])
+ + MUL15(a[11], b[17])
+ + MUL15(a[12], b[16])
+ + MUL15(a[13], b[15])
+ + MUL15(a[14], b[14])
+ + MUL15(a[15], b[13])
+ + MUL15(a[16], b[12])
+ + MUL15(a[17], b[11])
+ + MUL15(a[18], b[10])
+ + MUL15(a[19], b[ 9]);
+ t[29] = MUL15(a[10], b[19])
+ + MUL15(a[11], b[18])
+ + MUL15(a[12], b[17])
+ + MUL15(a[13], b[16])
+ + MUL15(a[14], b[15])
+ + MUL15(a[15], b[14])
+ + MUL15(a[16], b[13])
+ + MUL15(a[17], b[12])
+ + MUL15(a[18], b[11])
+ + MUL15(a[19], b[10]);
+ t[30] = MUL15(a[11], b[19])
+ + MUL15(a[12], b[18])
+ + MUL15(a[13], b[17])
+ + MUL15(a[14], b[16])
+ + MUL15(a[15], b[15])
+ + MUL15(a[16], b[14])
+ + MUL15(a[17], b[13])
+ + MUL15(a[18], b[12])
+ + MUL15(a[19], b[11]);
+ t[31] = MUL15(a[12], b[19])
+ + MUL15(a[13], b[18])
+ + MUL15(a[14], b[17])
+ + MUL15(a[15], b[16])
+ + MUL15(a[16], b[15])
+ + MUL15(a[17], b[14])
+ + MUL15(a[18], b[13])
+ + MUL15(a[19], b[12]);
+ t[32] = MUL15(a[13], b[19])
+ + MUL15(a[14], b[18])
+ + MUL15(a[15], b[17])
+ + MUL15(a[16], b[16])
+ + MUL15(a[17], b[15])
+ + MUL15(a[18], b[14])
+ + MUL15(a[19], b[13]);
+ t[33] = MUL15(a[14], b[19])
+ + MUL15(a[15], b[18])
+ + MUL15(a[16], b[17])
+ + MUL15(a[17], b[16])
+ + MUL15(a[18], b[15])
+ + MUL15(a[19], b[14]);
+ t[34] = MUL15(a[15], b[19])
+ + MUL15(a[16], b[18])
+ + MUL15(a[17], b[17])
+ + MUL15(a[18], b[16])
+ + MUL15(a[19], b[15]);
+ t[35] = MUL15(a[16], b[19])
+ + MUL15(a[17], b[18])
+ + MUL15(a[18], b[17])
+ + MUL15(a[19], b[16]);
+ t[36] = MUL15(a[17], b[19])
+ + MUL15(a[18], b[18])
+ + MUL15(a[19], b[17]);
+ t[37] = MUL15(a[18], b[19])
+ + MUL15(a[19], b[18]);
+ t[38] = MUL15(a[19], b[19]);
+ /* Propagate carries: d[0..38] get 13 bits each, the final carry becomes d[39]. */
+ d[39] = norm13(d, t, 39);
+}
+
+static void
+square20(uint32_t *d, const uint32_t *a)
+{
+ uint32_t t[39];
+ /*
+ * Fully unrolled squaring: t[k] receives the sum of all a[i]*a[j]
+ * with i + j == k. Each product of two distinct words is computed
+ * once and the partial sum is doubled with a left shift; the
+ * square of word k/2 (even k only) is added undoubled. No carry is
+ * propagated at this stage. a[] is only read while t[] is filled
+ * and d[] is written afterwards, so d may overlap with a.
+ */
+ t[ 0] = MUL15(a[ 0], a[ 0]);
+ t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
+ t[ 2] = MUL15(a[ 1], a[ 1])
+ + ((MUL15(a[ 0], a[ 2])) << 1);
+ t[ 3] = ((MUL15(a[ 0], a[ 3])
+ + MUL15(a[ 1], a[ 2])) << 1);
+ t[ 4] = MUL15(a[ 2], a[ 2])
+ + ((MUL15(a[ 0], a[ 4])
+ + MUL15(a[ 1], a[ 3])) << 1);
+ t[ 5] = ((MUL15(a[ 0], a[ 5])
+ + MUL15(a[ 1], a[ 4])
+ + MUL15(a[ 2], a[ 3])) << 1);
+ t[ 6] = MUL15(a[ 3], a[ 3])
+ + ((MUL15(a[ 0], a[ 6])
+ + MUL15(a[ 1], a[ 5])
+ + MUL15(a[ 2], a[ 4])) << 1);
+ t[ 7] = ((MUL15(a[ 0], a[ 7])
+ + MUL15(a[ 1], a[ 6])
+ + MUL15(a[ 2], a[ 5])
+ + MUL15(a[ 3], a[ 4])) << 1);
+ t[ 8] = MUL15(a[ 4], a[ 4])
+ + ((MUL15(a[ 0], a[ 8])
+ + MUL15(a[ 1], a[ 7])
+ + MUL15(a[ 2], a[ 6])
+ + MUL15(a[ 3], a[ 5])) << 1);
+ t[ 9] = ((MUL15(a[ 0], a[ 9])
+ + MUL15(a[ 1], a[ 8])
+ + MUL15(a[ 2], a[ 7])
+ + MUL15(a[ 3], a[ 6])
+ + MUL15(a[ 4], a[ 5])) << 1);
+ t[10] = MUL15(a[ 5], a[ 5])
+ + ((MUL15(a[ 0], a[10])
+ + MUL15(a[ 1], a[ 9])
+ + MUL15(a[ 2], a[ 8])
+ + MUL15(a[ 3], a[ 7])
+ + MUL15(a[ 4], a[ 6])) << 1);
+ t[11] = ((MUL15(a[ 0], a[11])
+ + MUL15(a[ 1], a[10])
+ + MUL15(a[ 2], a[ 9])
+ + MUL15(a[ 3], a[ 8])
+ + MUL15(a[ 4], a[ 7])
+ + MUL15(a[ 5], a[ 6])) << 1);
+ t[12] = MUL15(a[ 6], a[ 6])
+ + ((MUL15(a[ 0], a[12])
+ + MUL15(a[ 1], a[11])
+ + MUL15(a[ 2], a[10])
+ + MUL15(a[ 3], a[ 9])
+ + MUL15(a[ 4], a[ 8])
+ + MUL15(a[ 5], a[ 7])) << 1);
+ t[13] = ((MUL15(a[ 0], a[13])
+ + MUL15(a[ 1], a[12])
+ + MUL15(a[ 2], a[11])
+ + MUL15(a[ 3], a[10])
+ + MUL15(a[ 4], a[ 9])
+ + MUL15(a[ 5], a[ 8])
+ + MUL15(a[ 6], a[ 7])) << 1);
+ t[14] = MUL15(a[ 7], a[ 7])
+ + ((MUL15(a[ 0], a[14])
+ + MUL15(a[ 1], a[13])
+ + MUL15(a[ 2], a[12])
+ + MUL15(a[ 3], a[11])
+ + MUL15(a[ 4], a[10])
+ + MUL15(a[ 5], a[ 9])
+ + MUL15(a[ 6], a[ 8])) << 1);
+ t[15] = ((MUL15(a[ 0], a[15])
+ + MUL15(a[ 1], a[14])
+ + MUL15(a[ 2], a[13])
+ + MUL15(a[ 3], a[12])
+ + MUL15(a[ 4], a[11])
+ + MUL15(a[ 5], a[10])
+ + MUL15(a[ 6], a[ 9])
+ + MUL15(a[ 7], a[ 8])) << 1);
+ t[16] = MUL15(a[ 8], a[ 8])
+ + ((MUL15(a[ 0], a[16])
+ + MUL15(a[ 1], a[15])
+ + MUL15(a[ 2], a[14])
+ + MUL15(a[ 3], a[13])
+ + MUL15(a[ 4], a[12])
+ + MUL15(a[ 5], a[11])
+ + MUL15(a[ 6], a[10])
+ + MUL15(a[ 7], a[ 9])) << 1);
+ t[17] = ((MUL15(a[ 0], a[17])
+ + MUL15(a[ 1], a[16])
+ + MUL15(a[ 2], a[15])
+ + MUL15(a[ 3], a[14])
+ + MUL15(a[ 4], a[13])
+ + MUL15(a[ 5], a[12])
+ + MUL15(a[ 6], a[11])
+ + MUL15(a[ 7], a[10])
+ + MUL15(a[ 8], a[ 9])) << 1);
+ t[18] = MUL15(a[ 9], a[ 9])
+ + ((MUL15(a[ 0], a[18])
+ + MUL15(a[ 1], a[17])
+ + MUL15(a[ 2], a[16])
+ + MUL15(a[ 3], a[15])
+ + MUL15(a[ 4], a[14])
+ + MUL15(a[ 5], a[13])
+ + MUL15(a[ 6], a[12])
+ + MUL15(a[ 7], a[11])
+ + MUL15(a[ 8], a[10])) << 1);
+ t[19] = ((MUL15(a[ 0], a[19])
+ + MUL15(a[ 1], a[18])
+ + MUL15(a[ 2], a[17])
+ + MUL15(a[ 3], a[16])
+ + MUL15(a[ 4], a[15])
+ + MUL15(a[ 5], a[14])
+ + MUL15(a[ 6], a[13])
+ + MUL15(a[ 7], a[12])
+ + MUL15(a[ 8], a[11])
+ + MUL15(a[ 9], a[10])) << 1);
+ t[20] = MUL15(a[10], a[10])
+ + ((MUL15(a[ 1], a[19])
+ + MUL15(a[ 2], a[18])
+ + MUL15(a[ 3], a[17])
+ + MUL15(a[ 4], a[16])
+ + MUL15(a[ 5], a[15])
+ + MUL15(a[ 6], a[14])
+ + MUL15(a[ 7], a[13])
+ + MUL15(a[ 8], a[12])
+ + MUL15(a[ 9], a[11])) << 1);
+ t[21] = ((MUL15(a[ 2], a[19])
+ + MUL15(a[ 3], a[18])
+ + MUL15(a[ 4], a[17])
+ + MUL15(a[ 5], a[16])
+ + MUL15(a[ 6], a[15])
+ + MUL15(a[ 7], a[14])
+ + MUL15(a[ 8], a[13])
+ + MUL15(a[ 9], a[12])
+ + MUL15(a[10], a[11])) << 1);
+ t[22] = MUL15(a[11], a[11])
+ + ((MUL15(a[ 3], a[19])
+ + MUL15(a[ 4], a[18])
+ + MUL15(a[ 5], a[17])
+ + MUL15(a[ 6], a[16])
+ + MUL15(a[ 7], a[15])
+ + MUL15(a[ 8], a[14])
+ + MUL15(a[ 9], a[13])
+ + MUL15(a[10], a[12])) << 1);
+ t[23] = ((MUL15(a[ 4], a[19])
+ + MUL15(a[ 5], a[18])
+ + MUL15(a[ 6], a[17])
+ + MUL15(a[ 7], a[16])
+ + MUL15(a[ 8], a[15])
+ + MUL15(a[ 9], a[14])
+ + MUL15(a[10], a[13])
+ + MUL15(a[11], a[12])) << 1);
+ t[24] = MUL15(a[12], a[12])
+ + ((MUL15(a[ 5], a[19])
+ + MUL15(a[ 6], a[18])
+ + MUL15(a[ 7], a[17])
+ + MUL15(a[ 8], a[16])
+ + MUL15(a[ 9], a[15])
+ + MUL15(a[10], a[14])
+ + MUL15(a[11], a[13])) << 1);
+ t[25] = ((MUL15(a[ 6], a[19])
+ + MUL15(a[ 7], a[18])
+ + MUL15(a[ 8], a[17])
+ + MUL15(a[ 9], a[16])
+ + MUL15(a[10], a[15])
+ + MUL15(a[11], a[14])
+ + MUL15(a[12], a[13])) << 1);
+ t[26] = MUL15(a[13], a[13])
+ + ((MUL15(a[ 7], a[19])
+ + MUL15(a[ 8], a[18])
+ + MUL15(a[ 9], a[17])
+ + MUL15(a[10], a[16])
+ + MUL15(a[11], a[15])
+ + MUL15(a[12], a[14])) << 1);
+ t[27] = ((MUL15(a[ 8], a[19])
+ + MUL15(a[ 9], a[18])
+ + MUL15(a[10], a[17])
+ + MUL15(a[11], a[16])
+ + MUL15(a[12], a[15])
+ + MUL15(a[13], a[14])) << 1);
+ t[28] = MUL15(a[14], a[14])
+ + ((MUL15(a[ 9], a[19])
+ + MUL15(a[10], a[18])
+ + MUL15(a[11], a[17])
+ + MUL15(a[12], a[16])
+ + MUL15(a[13], a[15])) << 1);
+ t[29] = ((MUL15(a[10], a[19])
+ + MUL15(a[11], a[18])
+ + MUL15(a[12], a[17])
+ + MUL15(a[13], a[16])
+ + MUL15(a[14], a[15])) << 1);
+ t[30] = MUL15(a[15], a[15])
+ + ((MUL15(a[11], a[19])
+ + MUL15(a[12], a[18])
+ + MUL15(a[13], a[17])
+ + MUL15(a[14], a[16])) << 1);
+ t[31] = ((MUL15(a[12], a[19])
+ + MUL15(a[13], a[18])
+ + MUL15(a[14], a[17])
+ + MUL15(a[15], a[16])) << 1);
+ t[32] = MUL15(a[16], a[16])
+ + ((MUL15(a[13], a[19])
+ + MUL15(a[14], a[18])
+ + MUL15(a[15], a[17])) << 1);
+ t[33] = ((MUL15(a[14], a[19])
+ + MUL15(a[15], a[18])
+ + MUL15(a[16], a[17])) << 1);
+ t[34] = MUL15(a[17], a[17])
+ + ((MUL15(a[15], a[19])
+ + MUL15(a[16], a[18])) << 1);
+ t[35] = ((MUL15(a[16], a[19])
+ + MUL15(a[17], a[18])) << 1);
+ t[36] = MUL15(a[18], a[18])
+ + ((MUL15(a[17], a[19])) << 1);
+ t[37] = ((MUL15(a[18], a[19])) << 1);
+ t[38] = MUL15(a[19], a[19]);
+ /* Propagate carries: d[0..38] get 13 bits each, the final carry becomes d[39]. */
+ d[39] = norm13(d, t, 39);
+}
+
+#endif
+
+/*
+ * Final reduction modulo p = 2^255-19. The source value must be less
+ * than 2*p. The candidate d + 19 is computed; bit 255 of that candidate
+ * is set exactly when d >= p, in which case the candidate with bit 255
+ * cleared (i.e. d - p) replaces d and 1 is returned. Otherwise d is left
+ * untouched and 0 is returned. The replacement goes through CCOPY, with
+ * the comparison bit as control value, rather than through a branch.
+ */
+static uint32_t
+reduce_final_f255(uint32_t *d)
+{
+	uint32_t trial[20];
+	uint32_t carry, ge;
+	int k;
+
+	/* trial = d + 19, normalised to 13 bits per word. */
+	carry = 19;
+	for (k = 0; k < 20; k ++) {
+		uint32_t limb;
+
+		limb = d[k] + carry;
+		trial[k] = limb & 0x1FFF;
+		carry = limb >> 13;
+	}
+
+	/* Word 19 starts at bit 247: its bits 8 and above are bit 255+. */
+	ge = trial[19] >> 8;
+	trial[19] &= 0xFF;
+	CCOPY(ge, d, trial, sizeof trial);
+	return ge;
+}
+
+static void
+f255_mulgen(uint32_t *d, const uint32_t *a, const uint32_t *b, int square)
+{
+ uint32_t t[40], cc, w;
+
+ /*
+ * Common code behind the f255_mul() and f255_square() macros:
+ * d receives a*b modulo 2^255-19, or a*a when 'square' is
+ * non-zero (b is then not read at all). Operands and result are
+ * arrays of 20 words of 13 bits. The operands are only read by
+ * the raw multiplication below, which writes into the local
+ * array t[], so d may overlap with a and/or b.
+ *
+ * Compute raw multiplication. All result words fit in 13 bits
+ * each; upper word (t[39]) must fit on 5 bits, since the product
+ * of two 256-bit integers must fit on 512 bits.
+ */
+ if (square) {
+ square20(t, a);
+ } else {
+ mul20(t, a, b);
+ }
+
+ /*
+ * Modular reduction: each high word is added where necessary.
+ * Since the modulus is 2^255-19 and word 20 corresponds to
+ * offset 20*13 = 260, word 20+k must be added to word k with
+ * a factor of 19*2^5 = 608. The extra bits in word 19 are also
+ * added that way.
+ *
+ * Word 19 starts at bit 247, so its bits 8 and above have weight
+ * 2^255 or more: they are removed from the word and enter the
+ * chain as the initial carry, multiplied by 19. MM1(x) then
+ * processes word x: it adds the carry and 608 times word x+20,
+ * keeps 13 bits and forwards the rest as the next carry.
+ */
+ cc = MUL15(t[19] >> 8, 19);
+ t[19] &= 0xFF;
+
+#define MM1(x) do { \
+ w = t[x] + cc + MUL15(t[(x) + 20], 608); \
+ t[x] = w & 0x1FFF; \
+ cc = w >> 13; \
+ } while (0)
+
+ MM1( 0);
+ MM1( 1);
+ MM1( 2);
+ MM1( 3);
+ MM1( 4);
+ MM1( 5);
+ MM1( 6);
+ MM1( 7);
+ MM1( 8);
+ MM1( 9);
+ MM1(10);
+ MM1(11);
+ MM1(12);
+ MM1(13);
+ MM1(14);
+ MM1(15);
+ MM1(16);
+ MM1(17);
+ MM1(18);
+ MM1(19);
+
+#undef MM1
+ /*
+ * Second pass. w still holds the unmasked sum computed by MM1(19)
+ * for word 19: its bits 8 and above are folded back once more
+ * (times 19) as the initial carry, word 19 is truncated to 8 bits,
+ * and MM2 runs a plain carry propagation that writes the 20 result
+ * words into d.
+ */
+ cc = MUL15(w >> 8, 19);
+ t[19] &= 0xFF;
+
+#define MM2(x) do { \
+ w = t[x] + cc; \
+ d[x] = w & 0x1FFF; \
+ cc = w >> 13; \
+ } while (0)
+
+ MM2( 0);
+ MM2( 1);
+ MM2( 2);
+ MM2( 3);
+ MM2( 4);
+ MM2( 5);
+ MM2( 6);
+ MM2( 7);
+ MM2( 8);
+ MM2( 9);
+ MM2(10);
+ MM2(11);
+ MM2(12);
+ MM2(13);
+ MM2(14);
+ MM2(15);
+ MM2(16);
+ MM2(17);
+ MM2(18);
+ MM2(19);
+
+#undef MM2
+}
+
+/*
+ * Perform a multiplication of two integers modulo 2^255-19.
+ * Operands are arrays of 20 words, each containing 13 bits of data, in
+ * little-endian order. Input value may be up to 2^256-1; on output, value
+ * fits on 256 bits and is lower than twice the modulus.
+ *
+ * f255_mul() is the general multiplication, f255_square() is specialised
+ * for squarings.
+ *
+ * Both are thin macros over f255_mulgen(), differing only in the final
+ * 'square' flag (0 or 1). f255_square() expands its 'a' argument twice,
+ * so that argument must be an expression without side effects.
+ */
+#define f255_mul(d, a, b) f255_mulgen(d, a, b, 0)
+#define f255_square(d, a) f255_mulgen(d, a, a, 1)
+
+/*
+ * Field addition: d = a + b in F255, over 20 words of 13 bits. Only a
+ * partial reduction is performed (result lower than twice the modulus).
+ * Each word of a and b is read before the matching word of d is written.
+ */
+static void
+f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t carry, limb;
+	int k;
+
+	/* Word-wise sum with carry propagation. */
+	carry = 0;
+	k = 0;
+	while (k < 20) {
+		limb = a[k] + b[k] + carry;
+		carry = limb >> 13;
+		d[k] = limb & 0x1FFF;
+		k ++;
+	}
+
+	/*
+	 * 'limb' is still the unmasked top word, which starts at bit 247:
+	 * its bits 8 and above weigh 2^255 or more. They are dropped from
+	 * d[19] and re-injected at the low end, multiplied by 19.
+	 */
+	d[19] &= 0xFF;
+	carry = MUL15(limb >> 8, 19);
+	for (k = 0; k < 20; k ++) {
+		limb = d[k] + carry;
+		carry = limb >> 13;
+		d[k] = limb & 0x1FFF;
+	}
+}
+
+/*
+ * Field subtraction: d = a - b in F255, over 20 words of 13 bits. Only
+ * a partial reduction is performed (result lower than twice the
+ * modulus).
+ *
+ * What is actually computed is a - b + 2*p, so that the value cannot be
+ * negative: 2*p = 2^256 - 38, where the -38 is the initial carry and the
+ * 2^256 is the 0x200 added to the top word (which starts at bit 247)
+ * before the bits at position 255 and above are folded back.
+ */
+static void
+f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t carry, limb;
+	int k;
+
+	/* Word-wise difference; the carry is signed, hence ARSH. */
+	carry = (uint32_t)-38;
+	k = 0;
+	while (k < 20) {
+		limb = a[k] - b[k] + carry;
+		d[k] = limb & 0x1FFF;
+		carry = ARSH(limb, 13);
+		k ++;
+	}
+
+	/* Add 2^256, then fold bits 255 and above back in (times 19). */
+	d[19] &= 0xFF;
+	carry = MUL15((limb + 0x200) >> 8, 19);
+	for (k = 0; k < 20; k ++) {
+		limb = d[k] + carry;
+		carry = limb >> 13;
+		d[k] = limb & 0x1FFF;
+	}
+}
+
+/*
+ * Multiplication by the 'A24' constant (121665): d = 121665 * a in
+ * F255, over 20 words of 13 bits. Only a partial reduction is performed
+ * (result lower than twice the modulus).
+ */
+static void
+f255_mul_a24(uint32_t *d, const uint32_t *a)
+{
+	uint32_t carry, limb;
+	int k;
+
+	/* Scale every word by the constant, propagating carries. */
+	carry = 0;
+	k = 0;
+	while (k < 20) {
+		limb = MUL15(a[k], 121665) + carry;
+		carry = limb >> 13;
+		d[k] = limb & 0x1FFF;
+		k ++;
+	}
+
+	/*
+	 * 'limb' is still the unmasked top word (it starts at bit 247):
+	 * its bits 8 and above are removed from d[19] and folded back
+	 * into the low end, multiplied by 19.
+	 */
+	d[19] &= 0xFF;
+	carry = MUL15(limb >> 8, 19);
+	for (k = 0; k < 20; k ++) {
+		limb = d[k] + carry;
+		carry = limb >> 13;
+		d[k] = limb & 0x1FFF;
+	}
+}
+
+static const unsigned char GEN[] = { /* 32 bytes handed out by api_generator(): 0x09, then 31 zero bytes */
+ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* value 9 under the little-endian point decoding used by api_mul() */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const unsigned char ORDER[] = { /* 32 bytes handed out by api_order(): 0x7F, then 31 bytes of 0xFF */
+ 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* NOTE(review): read big-endian this is 2^255-1; presumably a bound on scalars rather than an exact group order -- confirm against bearssl_ec.h */
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len) /* returns the GEN table and stores its length in *len */
+{
+ (void)curve; /* the curve identifier is not consulted */
+ *len = 32; /* GEN holds 32 bytes */
+ return GEN;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len) /* returns the ORDER array and stores its length (32) in *len */
+{
+ (void)curve; /* the curve identifier is not used by this implementation */
+ *len = 32;
+ return ORDER;
+}
+
+static size_t
+api_xoff(int curve, size_t *len) /* stores 32 in *len and returns 0. NOTE(review): presumably the length and offset of the X coordinate inside an encoded point -- confirm against bearssl_ec.h */
+{
+ (void)curve; /* the curve identifier is not used by this implementation */
+ *len = 32;
+ return 0;
+}
+
+static void
+cswap(uint32_t *a, uint32_t *b, uint32_t ctl) /* exchange the 20-word arrays a and b when ctl is 1, leave them unchanged when ctl is 0; the code does not branch on ctl */
+{
+ int i;
+
+ ctl = -ctl; /* 1 becomes an all-ones mask, 0 stays a zero mask */
+ for (i = 0; i < 20; i ++) {
+ uint32_t aw, bw, tw;
+
+ aw = a[i];
+ bw = b[i];
+ tw = ctl & (aw ^ bw); /* aw ^ bw when swapping, 0 otherwise */
+ a[i] = aw ^ tw;
+ b[i] = bw ^ tw;
+ }
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+ const unsigned char *kb, size_t kblen, int curve)
+{
+ uint32_t x1[20], x2[20], x3[20], z2[20], z3[20];
+ uint32_t a[20], aa[20], b[20], bb[20];
+ uint32_t c[20], d[20], e[20], da[20], cb[20];
+ unsigned char k[32];
+ uint32_t swap;
+ int i;
+
+ (void)curve;
+
+ /*
+ * Multiply the point encoded in G[] (Glen bytes) by the scalar
+ * kb[] (kblen bytes, most significant byte first). The encoded
+ * result overwrites G[]. Returned value is 1 on success, 0 if
+ * the lengths are rejected (G[] is then left untouched).
+ *
+ * Points are encoded over exactly 32 bytes. Multipliers must fit
+ * in 32 bytes as well.
+ * RFC 7748 mandates that the high bit of the last point byte must
+ * be ignored/cleared.
+ */
+ if (Glen != 32 || kblen > 32) {
+ return 0;
+ }
+ G[31] &= 0x7F;
+
+ /*
+ * Initialise variables x1, x2, z2, x3 and z3. We set all of them
+ * into Montgomery representation.
+ * NOTE(review): the values stored here are plain limb arrays (x2
+ * and z3 are set to the integer 1); "Montgomery" presumably refers
+ * to the Montgomery ladder state -- confirm.
+ */
+ x1[19] = le8_to_le13(x1, G, 32); /* le8_to_le13() is defined earlier in this file; its return value is used as the top word */
+ memcpy(x3, x1, sizeof x1);
+ memset(z2, 0, sizeof z2);
+ memset(x2, 0, sizeof x2);
+ x2[0] = 1;
+ memset(z3, 0, sizeof z3);
+ z3[0] = 1;
+
+ memset(k, 0, (sizeof k) - kblen); /* left-pad the scalar with zeros up to 32 bytes */
+ memcpy(k + (sizeof k) - kblen, kb, kblen);
+ k[31] &= 0xF8; /* clear the three least significant scalar bits */
+ k[0] &= 0x7F; /* clear scalar bit 255 */
+ k[0] |= 0x40; /* set scalar bit 254 */
+
+ /* obsolete
+ print_int("x1", x1);
+ */
+
+ swap = 0;
+ for (i = 254; i >= 0; i --) { /* one ladder step per scalar bit, from bit 254 down to bit 0 */
+ uint32_t kt;
+
+ kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; /* scalar bit i (k[31] holds the low byte) */
+ swap ^= kt; /* swap only when this bit differs from the previous one */
+ cswap(x2, x3, swap);
+ cswap(z2, z3, swap);
+ swap = kt;
+
+ /* obsolete
+ print_int("x2", x2);
+ print_int("z2", z2);
+ print_int("x3", x3);
+ print_int("z3", z3);
+ */
+
+ f255_add(a, x2, z2);
+ f255_square(aa, a);
+ f255_sub(b, x2, z2);
+ f255_square(bb, b);
+ f255_sub(e, aa, bb);
+ f255_add(c, x3, z3);
+ f255_sub(d, x3, z3);
+ f255_mul(da, d, a);
+ f255_mul(cb, c, b);
+
+ /* obsolete
+ print_int("a ", a);
+ print_int("aa", aa);
+ print_int("b ", b);
+ print_int("bb", bb);
+ print_int("e ", e);
+ print_int("c ", c);
+ print_int("d ", d);
+ print_int("da", da);
+ print_int("cb", cb);
+ */
+
+ f255_add(x3, da, cb);
+ f255_square(x3, x3); /* x3 = (da + cb)^2 */
+ f255_sub(z3, da, cb);
+ f255_square(z3, z3);
+ f255_mul(z3, z3, x1); /* z3 = x1 * (da - cb)^2 */
+ f255_mul(x2, aa, bb); /* x2 = aa * bb */
+ f255_mul_a24(z2, e);
+ f255_add(z2, z2, aa);
+ f255_mul(z2, e, z2); /* z2 = e * (aa + 121665 * e) */
+
+ /* obsolete
+ print_int("x2", x2);
+ print_int("z2", z2);
+ print_int("x3", x3);
+ print_int("z3", z3);
+ */
+ }
+ cswap(x2, x3, swap); /* undo the swap left pending by the last iteration */
+ cswap(z2, z3, swap);
+
+ /*
+ * Invert z2 with a modular exponentiation. This is a simple
+ * square-and-multiply algorithm; we mutualise most non-squarings
+ * since the exponent contains almost only ones.
+ *
+ * The exponent is p - 2 = 2^255 - 21: 240 one bits followed by
+ * the 15 low bits of 0xFFEB.
+ */
+ memcpy(a, z2, sizeof z2);
+ for (i = 0; i < 15; i ++) {
+ f255_square(a, a);
+ f255_mul(a, a, z2);
+ } /* a = z2^(2^16 - 1) */
+ memcpy(b, a, sizeof a);
+ for (i = 0; i < 14; i ++) {
+ int j;
+
+ for (j = 0; j < 16; j ++) {
+ f255_square(b, b);
+ }
+ f255_mul(b, b, a);
+ } /* b = z2^(2^240 - 1) */
+ for (i = 14; i >= 0; i --) {
+ f255_square(b, b);
+ if ((0xFFEB >> i) & 1) { /* branches on a public constant only */
+ f255_mul(b, z2, b);
+ }
+ }
+ f255_mul(x2, x2, b); /* x2 * z2^(p - 2) */
+ reduce_final_f255(x2);
+ le13_to_le8(G, 32, x2); /* encode the result over the input point buffer */
+ return 1;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+ const unsigned char *x, size_t xlen, int curve) /* multiplies the generator by x[] (xlen bytes) into R[], which must have room for 32 bytes; returns 32 */
+{
+ const unsigned char *G;
+ size_t Glen;
+
+ G = api_generator(curve, &Glen);
+ memcpy(R, G, Glen); /* api_mul() works in place, so R is first seeded with the generator */
+ api_mul(R, Glen, x, xlen, curve); /* NOTE(review): the return value is ignored; if xlen > 32, api_mul() returns 0 and R still holds the generator bytes */
+ return Glen;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+ const unsigned char *x, size_t xlen,
+ const unsigned char *y, size_t ylen, int curve)
+{
+ /*
+ * We don't implement this method, since it is used for ECDSA
+ * only, and there is no ECDSA over Curve25519 (which instead
+ * uses EdDSA).
+ *
+ * All parameters are ignored, nothing is written, and the
+ * returned value is always 0.
+ */
+ (void)A;
+ (void)B;
+ (void)len;
+ (void)x;
+ (void)xlen;
+ (void)y;
+ (void)ylen;
+ (void)curve;
+ return 0;
+}
+
+/*
+ * see bearssl_ec.h
+ *
+ * Implementation descriptor: one 32-bit constant followed by pointers
+ * to the six api_*() functions defined in this file.
+ */
+const br_ec_impl br_ec_c25519_m15 = {
+ (uint32_t)0x20000000, /* NOTE(review): presumably the bit mask of supported curves (only bit 29 set) -- confirm against bearssl_ec.h */
+ &api_generator,
+ &api_order,
+ &api_xoff,
+ &api_mul,
+ &api_mulgen,
+ &api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_m31.c b/test/monniaux/BearSSL/src/ec/ec_c25519_m31.c
new file mode 100644
index 00000000..1dd6d514
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_c25519_m31.c
@@ -0,0 +1,800 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* obsolete
+#include <stdio.h>
+#include <stdlib.h>
+static void
+print_int(const char *name, const uint32_t *x)
+{
+ size_t u;
+ unsigned char tmp[40];
+
+ printf("%s = ", name);
+ for (u = 0; u < 9; u ++) {
+ if (x[u] > 0x3FFFFFFF) {
+ printf("INVALID:");
+ for (u = 0; u < 9; u ++) {
+ printf(" %08X", x[u]);
+ }
+ printf("\n");
+ return;
+ }
+ }
+ memset(tmp, 0, sizeof tmp);
+ for (u = 0; u < 9; u ++) {
+ uint64_t w;
+ int j, k;
+
+ w = x[u];
+ j = 30 * (int)u;
+ k = j & 7;
+ if (k != 0) {
+ w <<= k;
+ j -= k;
+ }
+ k = j >> 3;
+ for (j = 0; j < 8; j ++) {
+ tmp[39 - k - j] |= (unsigned char)w;
+ w >>= 8;
+ }
+ }
+ for (u = 8; u < 40; u ++) {
+ printf("%02X", tmp[u]);
+ }
+ printf("\n");
+}
+*/
+
+/*
+ * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
+ * that right-shifting a signed negative integer copies the sign bit
+ * (arithmetic right-shift). This is "implementation-defined behaviour",
+ * i.e. it is not undefined, but it may differ between compilers. Each
+ * compiler is supposed to document its behaviour in that respect. GCC
+ * explicitly defines that an arithmetic right shift is used. We expect
+ * all other compilers to do the same, because the underlying CPUs offer
+ * an arithmetic right shift opcode that could not be used otherwise.
+ *
+ * ARSH(x, n) yields x shifted right by n bits, with sign extension.
+ * In the default variant, x must be a 32-bit lvalue (its address is
+ * taken and it is re-read as an int32_t). In the BR_NO_ARITH_SHIFT
+ * variant, n must be in the 1..31 range (n = 0 would shift by 32 bits).
+ */
+#if BR_NO_ARITH_SHIFT
+#define ARSH(x, n) (((uint32_t)(x) >> (n)) \
+ | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
+#else
+#define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
+#endif
+
+/*
+ * Convert an integer from unsigned little-endian encoding to a sequence of
+ * 30-bit words in little-endian order. The final "partial" word is
+ * returned.
+ *
+ * Only complete 30-bit words are written to dst[]; the leftover high
+ * bits are returned and it is up to the caller to store them (api_mul()
+ * puts them in word 8). For len = 32, eight words are written and the
+ * returned value holds the top 16 bits.
+ */
+static uint32_t
+le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
+{
+ uint32_t acc; /* bits read but not yet emitted */
+ int acc_len; /* number of valid bits in acc */
+
+ acc = 0;
+ acc_len = 0;
+ while (len -- > 0) {
+ uint32_t b;
+
+ b = *src ++;
+ if (acc_len < 22) { /* 8 more bits still fit within one 30-bit word */
+ acc |= b << acc_len;
+ acc_len += 8;
+ } else { /* this byte completes a word: emit it, keep the spill-over bits */
+ *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
+ acc = b >> (30 - acc_len);
+ acc_len -= 22;
+ }
+ }
+ return acc;
+}
+
+/*
+ * Convert an integer (30-bit words, little-endian) to unsigned
+ * little-endian encoding. The total encoding length is provided; all
+ * the destination bytes will be filled.
+ *
+ * A new word is read from src[] whenever fewer than 8 bits are pending,
+ * so src[] must hold enough words for len bytes (nine words are
+ * consumed for len = 32).
+ */
+static void
+le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
+{
+ uint32_t acc; /* pending bits not yet written out */
+ int acc_len; /* number of valid bits in acc */
+
+ acc = 0;
+ acc_len = 0;
+ while (len -- > 0) {
+ if (acc_len < 8) { /* not enough pending bits for a byte: pull in the next word */
+ uint32_t w;
+
+ w = *src ++;
+ *dst ++ = (unsigned char)(acc | (w << acc_len));
+ acc = w >> (8 - acc_len);
+ acc_len += 22;
+ } else {
+ *dst ++ = (unsigned char)acc;
+ acc >>= 8;
+ acc_len -= 8;
+ }
+ }
+}
+
+/*
+ * Multiply two integers. Source integers are represented as arrays of
+ * nine 30-bit words, for values up to 2^270-1. Result is encoded over
+ * 18 words of 30 bits each.
+ *
+ * d[] must have room for 18 words. MUL31 is presumably the 32x32->64
+ * multiplication macro from inner.h -- confirm.
+ */
+static void
+mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ /*
+ * Maximum intermediate result is no more than
+ * 10376293531797946367, which fits in 64 bits. Reason:
+ *
+ * 10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
+ * 10376293531797946367 < 9663676407 * 2^30
+ *
+ * Thus, adding together 9 products of 30-bit integers, with
+ * a carry of at most 9663676406, yields an integer that fits
+ * on 64 bits and generates a carry of at most 9663676406.
+ */
+ uint64_t t[17]; /* t[k] = sum of a[i]*b[j] over i + j = k, before carry propagation */
+ uint64_t cc;
+ int i;
+
+ t[ 0] = MUL31(a[0], b[0]);
+ t[ 1] = MUL31(a[0], b[1])
+ + MUL31(a[1], b[0]);
+ t[ 2] = MUL31(a[0], b[2])
+ + MUL31(a[1], b[1])
+ + MUL31(a[2], b[0]);
+ t[ 3] = MUL31(a[0], b[3])
+ + MUL31(a[1], b[2])
+ + MUL31(a[2], b[1])
+ + MUL31(a[3], b[0]);
+ t[ 4] = MUL31(a[0], b[4])
+ + MUL31(a[1], b[3])
+ + MUL31(a[2], b[2])
+ + MUL31(a[3], b[1])
+ + MUL31(a[4], b[0]);
+ t[ 5] = MUL31(a[0], b[5])
+ + MUL31(a[1], b[4])
+ + MUL31(a[2], b[3])
+ + MUL31(a[3], b[2])
+ + MUL31(a[4], b[1])
+ + MUL31(a[5], b[0]);
+ t[ 6] = MUL31(a[0], b[6])
+ + MUL31(a[1], b[5])
+ + MUL31(a[2], b[4])
+ + MUL31(a[3], b[3])
+ + MUL31(a[4], b[2])
+ + MUL31(a[5], b[1])
+ + MUL31(a[6], b[0]);
+ t[ 7] = MUL31(a[0], b[7])
+ + MUL31(a[1], b[6])
+ + MUL31(a[2], b[5])
+ + MUL31(a[3], b[4])
+ + MUL31(a[4], b[3])
+ + MUL31(a[5], b[2])
+ + MUL31(a[6], b[1])
+ + MUL31(a[7], b[0]);
+ t[ 8] = MUL31(a[0], b[8])
+ + MUL31(a[1], b[7])
+ + MUL31(a[2], b[6])
+ + MUL31(a[3], b[5])
+ + MUL31(a[4], b[4])
+ + MUL31(a[5], b[3])
+ + MUL31(a[6], b[2])
+ + MUL31(a[7], b[1])
+ + MUL31(a[8], b[0]);
+ t[ 9] = MUL31(a[1], b[8])
+ + MUL31(a[2], b[7])
+ + MUL31(a[3], b[6])
+ + MUL31(a[4], b[5])
+ + MUL31(a[5], b[4])
+ + MUL31(a[6], b[3])
+ + MUL31(a[7], b[2])
+ + MUL31(a[8], b[1]);
+ t[10] = MUL31(a[2], b[8])
+ + MUL31(a[3], b[7])
+ + MUL31(a[4], b[6])
+ + MUL31(a[5], b[5])
+ + MUL31(a[6], b[4])
+ + MUL31(a[7], b[3])
+ + MUL31(a[8], b[2]);
+ t[11] = MUL31(a[3], b[8])
+ + MUL31(a[4], b[7])
+ + MUL31(a[5], b[6])
+ + MUL31(a[6], b[5])
+ + MUL31(a[7], b[4])
+ + MUL31(a[8], b[3]);
+ t[12] = MUL31(a[4], b[8])
+ + MUL31(a[5], b[7])
+ + MUL31(a[6], b[6])
+ + MUL31(a[7], b[5])
+ + MUL31(a[8], b[4]);
+ t[13] = MUL31(a[5], b[8])
+ + MUL31(a[6], b[7])
+ + MUL31(a[7], b[6])
+ + MUL31(a[8], b[5]);
+ t[14] = MUL31(a[6], b[8])
+ + MUL31(a[7], b[7])
+ + MUL31(a[8], b[6]);
+ t[15] = MUL31(a[7], b[8])
+ + MUL31(a[8], b[7]);
+ t[16] = MUL31(a[8], b[8]);
+
+ /*
+ * Propagate carries.
+ */
+ cc = 0;
+ for (i = 0; i < 17; i ++) {
+ uint64_t w;
+
+ w = t[i] + cc;
+ d[i] = (uint32_t)w & 0x3FFFFFFF;
+ cc = w >> 30;
+ }
+ d[17] = (uint32_t)cc; /* the last carry becomes the 18th output word */
+}
+
+/*
+ * Square a 270-bit integer, represented as an array of nine 30-bit words.
+ * Result uses 18 words of 30 bits each.
+ *
+ * Same output layout as mul9(); each cross product a[i]*a[j] (i < j)
+ * is computed once and doubled with a left shift.
+ */
+static void
+square9(uint32_t *d, const uint32_t *a)
+{
+ uint64_t t[17]; /* t[k] = sum of a[i]*a[j] over i + j = k, before carry propagation */
+ uint64_t cc;
+ int i;
+
+ t[ 0] = MUL31(a[0], a[0]);
+ t[ 1] = ((MUL31(a[0], a[1])) << 1);
+ t[ 2] = MUL31(a[1], a[1])
+ + ((MUL31(a[0], a[2])) << 1);
+ t[ 3] = ((MUL31(a[0], a[3])
+ + MUL31(a[1], a[2])) << 1);
+ t[ 4] = MUL31(a[2], a[2])
+ + ((MUL31(a[0], a[4])
+ + MUL31(a[1], a[3])) << 1);
+ t[ 5] = ((MUL31(a[0], a[5])
+ + MUL31(a[1], a[4])
+ + MUL31(a[2], a[3])) << 1);
+ t[ 6] = MUL31(a[3], a[3])
+ + ((MUL31(a[0], a[6])
+ + MUL31(a[1], a[5])
+ + MUL31(a[2], a[4])) << 1);
+ t[ 7] = ((MUL31(a[0], a[7])
+ + MUL31(a[1], a[6])
+ + MUL31(a[2], a[5])
+ + MUL31(a[3], a[4])) << 1);
+ t[ 8] = MUL31(a[4], a[4])
+ + ((MUL31(a[0], a[8])
+ + MUL31(a[1], a[7])
+ + MUL31(a[2], a[6])
+ + MUL31(a[3], a[5])) << 1);
+ t[ 9] = ((MUL31(a[1], a[8])
+ + MUL31(a[2], a[7])
+ + MUL31(a[3], a[6])
+ + MUL31(a[4], a[5])) << 1);
+ t[10] = MUL31(a[5], a[5])
+ + ((MUL31(a[2], a[8])
+ + MUL31(a[3], a[7])
+ + MUL31(a[4], a[6])) << 1);
+ t[11] = ((MUL31(a[3], a[8])
+ + MUL31(a[4], a[7])
+ + MUL31(a[5], a[6])) << 1);
+ t[12] = MUL31(a[6], a[6])
+ + ((MUL31(a[4], a[8])
+ + MUL31(a[5], a[7])) << 1);
+ t[13] = ((MUL31(a[5], a[8])
+ + MUL31(a[6], a[7])) << 1);
+ t[14] = MUL31(a[7], a[7])
+ + ((MUL31(a[6], a[8])) << 1);
+ t[15] = ((MUL31(a[7], a[8])) << 1);
+ t[16] = MUL31(a[8], a[8]);
+
+ /*
+ * Propagate carries.
+ */
+ cc = 0;
+ for (i = 0; i < 17; i ++) {
+ uint64_t w;
+
+ w = t[i] + cc;
+ d[i] = (uint32_t)w & 0x3FFFFFFF;
+ cc = w >> 30;
+ }
+ d[17] = (uint32_t)cc; /* the last carry becomes the 18th output word */
+}
+
+/*
+ * Perform a "final reduction" in field F255 (field for Curve25519)
+ * The source value must be less than twice the modulus. If the value
+ * is not lower than the modulus, then the modulus is subtracted and
+ * this function returns 1; otherwise, it leaves it untouched and it
+ * returns 0.
+ *
+ * d[] is an array of nine 30-bit words (low word first), modified in
+ * place.
+ */
+static uint32_t
+reduce_final_f255(uint32_t *d)
+{
+ uint32_t t[9];
+ uint32_t cc;
+ int i;
+
+ memcpy(t, d, sizeof t);
+ cc = 19; /* t = d + 19: bit 255 of t is set exactly when d >= 2^255 - 19 */
+ for (i = 0; i < 9; i ++) {
+ uint32_t w;
+
+ w = t[i] + cc;
+ cc = w >> 30;
+ t[i] = w & 0x3FFFFFFF;
+ }
+ cc = t[8] >> 15; /* bit 255 (8*30 + 15 = 255) */
+ t[8] &= 0x7FFF; /* dropping bit 255 turns d + 19 into d - (2^255 - 19) */
+ CCOPY(cc, d, t, sizeof t); /* CCOPY comes from inner.h; presumably a conditional copy of t into d when cc is 1 -- confirm */
+ return cc;
+}
+
+/*
+ * Perform a multiplication of two integers modulo 2^255-19.
+ * Operands are arrays of 9 words, each containing 30 bits of data, in
+ * little-endian order. Input value may be up to 2^256-1; on output, value
+ * fits on 256 bits and is lower than twice the modulus.
+ *
+ * d[] is written only in the last pass, after a[] and b[] have been
+ * fully consumed by mul9().
+ */
+static void
+f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ uint32_t t[18], cc;
+ int i;
+
+ /*
+ * Compute raw multiplication. All result words fit in 30 bits
+ * each; upper word (t[17]) must fit on 2 bits, since the product
+ * of two 256-bit integers must fit on 512 bits.
+ */
+ mul9(t, a, b);
+
+ /*
+ * Modular reduction: each high word is added where necessary.
+ * Since the modulus is 2^255-19 and word 9 corresponds to
+ * offset 9*30 = 270, word 9+k must be added to word k with
+ * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
+ * added that way.
+ *
+ * Keeping the carry on 32 bits helps with 32-bit architectures,
+ * and does not noticeably impact performance on 64-bit systems.
+ */
+ cc = MUL15(t[8] >> 15, 19); /* at most 19*(2^15-1) = 622573 */
+ t[8] &= 0x7FFF;
+ for (i = 0; i < 9; i ++) {
+ uint64_t w;
+
+ w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
+ t[i] = (uint32_t)w & 0x3FFFFFFF;
+ cc = (uint32_t)(w >> 30); /* at most 622592 */
+ }
+
+ /*
+ * Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
+ * This was split into two parts (upper of 257 bits, lower of 255
+ * bits), and the upper was added to the lower with a factor 19,
+ * which means that the intermediate value is less than 77*2^255
+ * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
+ * less than 77, and the initial carry cc is at most 76*19 = 1444.
+ */
+ cc = MUL15(t[8] >> 15, 19); /* the carry left by the loop above is discarded here */
+ t[8] &= 0x7FFF;
+ for (i = 0; i < 9; i ++) {
+ uint32_t z;
+
+ z = t[i] + cc;
+ d[i] = z & 0x3FFFFFFF;
+ cc = z >> 30;
+ }
+
+ /*
+ * Final result is at most 2^255 + 1443. In particular, the last
+ * carry is necessarily 0, since t[8] was truncated to 15 bits.
+ */
+}
+
+/*
+ * Perform a squaring of an integer modulo 2^255-19.
+ * Operands are arrays of 9 words, each containing 30 bits of data, in
+ * little-endian order. Input value may be up to 2^256-1; on output, value
+ * fits on 256 bits and is lower than twice the modulus.
+ *
+ * The reduction is the same two-pass folding as in f255_mul(); only
+ * the raw product differs (square9() instead of mul9()).
+ */
+static void
+f255_square(uint32_t *d, const uint32_t *a)
+{
+ uint32_t t[18], cc;
+ int i;
+
+ /*
+ * Compute raw squaring. All result words fit in 30 bits
+ * each; upper word (t[17]) must fit on 2 bits, since the square
+ * of a 256-bit integer must fit on 512 bits.
+ */
+ square9(t, a);
+
+ /*
+ * Modular reduction: each high word is added where necessary.
+ * See f255_mul() for details on the reduction and carry limits.
+ */
+ cc = MUL15(t[8] >> 15, 19); /* bits 255..269 held in word 8, times 19 */
+ t[8] &= 0x7FFF;
+ for (i = 0; i < 9; i ++) {
+ uint64_t w;
+
+ w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592); /* 622592 = 19*2^15 */
+ t[i] = (uint32_t)w & 0x3FFFFFFF;
+ cc = (uint32_t)(w >> 30);
+ }
+ cc = MUL15(t[8] >> 15, 19); /* second fold of the bits above bit 254 */
+ t[8] &= 0x7FFF;
+ for (i = 0; i < 9; i ++) {
+ uint32_t z;
+
+ z = t[i] + cc;
+ d[i] = z & 0x3FFFFFFF;
+ cc = z >> 30;
+ }
+}
+
+/*
+ * Add two values in F255. Partial reduction is performed (down to less
+ * than twice the modulus).
+ *
+ * d receives a + b; d, a and b are arrays of nine 30-bit words, low
+ * word first.
+ */
+static void
+f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ /*
+ * Since operand words fit on 30 bits, we can use 32-bit
+ * variables throughout.
+ */
+ int i;
+ uint32_t cc, w;
+
+ cc = 0; /* pass 1: word-by-word addition with carry propagation */
+ for (i = 0; i < 9; i ++) {
+ w = a[i] + b[i] + cc;
+ d[i] = w & 0x3FFFFFFF;
+ cc = w >> 30;
+ }
+ cc = MUL15(w >> 15, 19); /* w is still the unmasked top word: its bits 15 and up are bits 255 and up of the sum (8*30 + 15 = 255), folded back with a factor 19 */
+ d[8] &= 0x7FFF; /* keep only bits 240..254 in the top word */
+ for (i = 0; i < 9; i ++) { /* pass 2: add the folded excess from word 0 */
+ w = d[i] + cc;
+ d[i] = w & 0x3FFFFFFF;
+ cc = w >> 30;
+ }
+}
+
+/*
+ * Subtract one value from another in F255. Partial reduction is
+ * performed (down to less than twice the modulus).
+ *
+ * d receives a - b; d, a and b are arrays of nine 30-bit words, low
+ * word first.
+ */
+static void
+f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ /*
+ * We actually compute a - b + 2*p, so that the final value is
+ * necessarily positive.
+ */
+ int i;
+ uint32_t cc, w;
+
+ cc = (uint32_t)-38; /* low part of 2*p = 2^256 - 38 */
+ for (i = 0; i < 9; i ++) {
+ w = a[i] - b[i] + cc;
+ d[i] = w & 0x3FFFFFFF;
+ cc = ARSH(w, 30); /* signed carry: negative when this word borrowed */
+ }
+ cc = MUL15((w + 0x10000) >> 15, 19); /* 0x10000 in word 8 is the 2^256 part of 2*p (240 + 16 = 256); bits 255 and up are then folded back with a factor 19 */
+ d[8] &= 0x7FFF; /* keep only bits 240..254 in the top word */
+ for (i = 0; i < 9; i ++) { /* second pass: add the folded excess from word 0 */
+ w = d[i] + cc;
+ d[i] = w & 0x3FFFFFFF;
+ cc = w >> 30;
+ }
+}
+
+/*
+ * Multiply an integer by the 'A24' constant (121665). Partial reduction
+ * is performed (down to less than twice the modulus).
+ *
+ * d receives 121665*a; d and a are arrays of nine 30-bit words, low
+ * word first.
+ */
+static void
+f255_mul_a24(uint32_t *d, const uint32_t *a)
+{
+ int i;
+ uint64_t w;
+ uint32_t cc;
+
+ /*
+ * a[] is over 256 bits, thus a[8] has length at most 16 bits.
+ * We single out the processing of the last word: intermediate
+ * value w is up to 121665*2^16, yielding a carry for the next
+ * loop of at most 19*(121665*2^16/2^15) = 4623289.
+ */
+ cc = 0;
+ for (i = 0; i < 8; i ++) { /* words 0..7: multiply and propagate carries */
+ w = MUL31(a[i], 121665) + (uint64_t)cc;
+ d[i] = (uint32_t)w & 0x3FFFFFFF;
+ cc = (uint32_t)(w >> 30);
+ }
+ w = MUL31(a[8], 121665) + (uint64_t)cc;
+ d[8] = (uint32_t)w & 0x7FFF; /* top word keeps bits 240..254 only */
+ cc = MUL15((uint32_t)(w >> 15), 19); /* bits 255 and up, folded back with a factor 19 */
+
+ for (i = 0; i < 9; i ++) { /* add the folded excess from word 0 */
+ uint32_t z;
+
+ z = d[i] + cc;
+ d[i] = z & 0x3FFFFFFF;
+ cc = z >> 30;
+ }
+}
+
+static const unsigned char GEN[] = { /* 32 bytes handed out by api_generator(): 0x09 followed by zeros, i.e. the value 9 in the little-endian point encoding that api_mul() decodes */
+ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const unsigned char ORDER[] = { /* 32 bytes handed out by api_order(): 0x7F followed by 0xFF bytes. NOTE(review): presumably an upper bound for scalars (2^255 - 1, most significant byte first) rather than the exact subgroup order -- confirm against bearssl_ec.h */
+ 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len) /* returns the GEN array and stores its length (32) in *len */
+{
+ (void)curve; /* the curve identifier is not used by this implementation */
+ *len = 32;
+ return GEN;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len) /* returns the ORDER array and stores its length (32) in *len */
+{
+ (void)curve; /* the curve identifier is not used by this implementation */
+ *len = 32;
+ return ORDER;
+}
+
+static size_t
+api_xoff(int curve, size_t *len) /* stores 32 in *len and returns 0. NOTE(review): presumably the length and offset of the X coordinate inside an encoded point -- confirm against bearssl_ec.h */
+{
+ (void)curve; /* the curve identifier is not used by this implementation */
+ *len = 32;
+ return 0;
+}
+
+static void
+cswap(uint32_t *a, uint32_t *b, uint32_t ctl) /* exchange the 9-word arrays a and b when ctl is 1, leave them unchanged when ctl is 0; the code does not branch on ctl */
+{
+ int i;
+
+ ctl = -ctl; /* 1 becomes an all-ones mask, 0 stays a zero mask */
+ for (i = 0; i < 9; i ++) {
+ uint32_t aw, bw, tw;
+
+ aw = a[i];
+ bw = b[i];
+ tw = ctl & (aw ^ bw); /* aw ^ bw when swapping, 0 otherwise */
+ a[i] = aw ^ tw;
+ b[i] = bw ^ tw;
+ }
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+ const unsigned char *kb, size_t kblen, int curve)
+{
+ uint32_t x1[9], x2[9], x3[9], z2[9], z3[9];
+ uint32_t a[9], aa[9], b[9], bb[9];
+ uint32_t c[9], d[9], e[9], da[9], cb[9];
+ unsigned char k[32];
+ uint32_t swap;
+ int i;
+
+ (void)curve;
+
+ /*
+ * Multiply the point encoded in G[] (Glen bytes) by the scalar
+ * kb[] (kblen bytes, most significant byte first). The encoded
+ * result overwrites G[]. Returned value is 1 on success, 0 if
+ * the lengths are rejected (G[] is then left untouched).
+ *
+ * Points are encoded over exactly 32 bytes. Multipliers must fit
+ * in 32 bytes as well.
+ * RFC 7748 mandates that the high bit of the last point byte must
+ * be ignored/cleared.
+ */
+ if (Glen != 32 || kblen > 32) {
+ return 0;
+ }
+ G[31] &= 0x7F;
+
+ /*
+ * Initialise variables x1, x2, z2, x3 and z3. We set all of them
+ * into Montgomery representation.
+ * NOTE(review): the values stored here are plain limb arrays (x2
+ * and z3 are set to the integer 1); "Montgomery" presumably refers
+ * to the Montgomery ladder state -- confirm.
+ */
+ x1[8] = le8_to_le30(x1, G, 32); /* fills x1[0..7]; the returned partial word (top 16 bits) becomes word 8 */
+ memcpy(x3, x1, sizeof x1);
+ memset(z2, 0, sizeof z2);
+ memset(x2, 0, sizeof x2);
+ x2[0] = 1;
+ memset(z3, 0, sizeof z3);
+ z3[0] = 1;
+
+ memset(k, 0, (sizeof k) - kblen); /* left-pad the scalar with zeros up to 32 bytes */
+ memcpy(k + (sizeof k) - kblen, kb, kblen);
+ k[31] &= 0xF8; /* clear the three least significant scalar bits */
+ k[0] &= 0x7F; /* clear scalar bit 255 */
+ k[0] |= 0x40; /* set scalar bit 254 */
+
+ /* obsolete
+ print_int("x1", x1);
+ */
+
+ swap = 0;
+ for (i = 254; i >= 0; i --) { /* one ladder step per scalar bit, from bit 254 down to bit 0 */
+ uint32_t kt;
+
+ kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; /* scalar bit i (k[31] holds the low byte) */
+ swap ^= kt; /* swap only when this bit differs from the previous one */
+ cswap(x2, x3, swap);
+ cswap(z2, z3, swap);
+ swap = kt;
+
+ /* obsolete
+ print_int("x2", x2);
+ print_int("z2", z2);
+ print_int("x3", x3);
+ print_int("z3", z3);
+ */
+
+ f255_add(a, x2, z2);
+ f255_square(aa, a);
+ f255_sub(b, x2, z2);
+ f255_square(bb, b);
+ f255_sub(e, aa, bb);
+ f255_add(c, x3, z3);
+ f255_sub(d, x3, z3);
+ f255_mul(da, d, a);
+ f255_mul(cb, c, b);
+
+ /* obsolete
+ print_int("a ", a);
+ print_int("aa", aa);
+ print_int("b ", b);
+ print_int("bb", bb);
+ print_int("e ", e);
+ print_int("c ", c);
+ print_int("d ", d);
+ print_int("da", da);
+ print_int("cb", cb);
+ */
+
+ f255_add(x3, da, cb);
+ f255_square(x3, x3); /* x3 = (da + cb)^2 */
+ f255_sub(z3, da, cb);
+ f255_square(z3, z3);
+ f255_mul(z3, z3, x1); /* z3 = x1 * (da - cb)^2 */
+ f255_mul(x2, aa, bb); /* x2 = aa * bb */
+ f255_mul_a24(z2, e);
+ f255_add(z2, z2, aa);
+ f255_mul(z2, e, z2); /* z2 = e * (aa + 121665 * e) */
+
+ /* obsolete
+ print_int("x2", x2);
+ print_int("z2", z2);
+ print_int("x3", x3);
+ print_int("z3", z3);
+ */
+ }
+ cswap(x2, x3, swap); /* undo the swap left pending by the last iteration */
+ cswap(z2, z3, swap);
+
+ /*
+ * Invert z2 with a modular exponentiation. This is a simple
+ * square-and-multiply algorithm; we mutualise most non-squarings
+ * since the exponent contains almost only ones.
+ *
+ * The exponent is p - 2 = 2^255 - 21: 240 one bits followed by
+ * the 15 low bits of 0xFFEB.
+ */
+ memcpy(a, z2, sizeof z2);
+ for (i = 0; i < 15; i ++) {
+ f255_square(a, a);
+ f255_mul(a, a, z2);
+ } /* a = z2^(2^16 - 1) */
+ memcpy(b, a, sizeof a);
+ for (i = 0; i < 14; i ++) {
+ int j;
+
+ for (j = 0; j < 16; j ++) {
+ f255_square(b, b);
+ }
+ f255_mul(b, b, a);
+ } /* b = z2^(2^240 - 1) */
+ for (i = 14; i >= 0; i --) {
+ f255_square(b, b);
+ if ((0xFFEB >> i) & 1) { /* branches on a public constant only */
+ f255_mul(b, z2, b);
+ }
+ }
+ f255_mul(x2, x2, b); /* x2 * z2^(p - 2) */
+ reduce_final_f255(x2); /* bring the value below the modulus before encoding */
+ le30_to_le8(G, 32, x2); /* encode the result over the input point buffer */
+ return 1;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+ const unsigned char *x, size_t xlen, int curve) /* multiplies the generator by x[] (xlen bytes) into R[], which must have room for 32 bytes; returns 32 */
+{
+ const unsigned char *G;
+ size_t Glen;
+
+ G = api_generator(curve, &Glen);
+ memcpy(R, G, Glen); /* api_mul() works in place, so R is first seeded with the generator */
+ api_mul(R, Glen, x, xlen, curve); /* NOTE(review): the return value is ignored; if xlen > 32, api_mul() returns 0 and R still holds the generator bytes */
+ return Glen;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+ const unsigned char *x, size_t xlen,
+ const unsigned char *y, size_t ylen, int curve)
+{
+ /*
+ * We don't implement this method, since it is used for ECDSA
+ * only, and there is no ECDSA over Curve25519 (which instead
+ * uses EdDSA).
+ *
+ * All parameters are ignored, nothing is written, and the
+ * returned value is always 0.
+ */
+ (void)A;
+ (void)B;
+ (void)len;
+ (void)x;
+ (void)xlen;
+ (void)y;
+ (void)ylen;
+ (void)curve;
+ return 0;
+}
+
+/*
+ * see bearssl_ec.h
+ *
+ * Implementation descriptor: one 32-bit constant followed by pointers
+ * to the six api_*() functions defined in this file.
+ */
+const br_ec_impl br_ec_c25519_m31 = {
+ (uint32_t)0x20000000, /* NOTE(review): presumably the bit mask of supported curves (only bit 29 set) -- confirm against bearssl_ec.h */
+ &api_generator,
+ &api_order,
+ &api_xoff,
+ &api_mul,
+ &api_mulgen,
+ &api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_m62.c b/test/monniaux/BearSSL/src/ec/ec_c25519_m62.c
new file mode 100644
index 00000000..6b058eb1
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_c25519_m62.c
@@ -0,0 +1,605 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+static const unsigned char GEN[] = { /* 32 bytes handed out by api_generator(): 0x09 followed by zeros, i.e. the value 9 when read as a little-endian integer */
+ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const unsigned char ORDER[] = { /* 32 bytes handed out by api_order(): 0x7F followed by 0xFF bytes. NOTE(review): presumably an upper bound for scalars (2^255 - 1, most significant byte first) rather than the exact subgroup order -- confirm against bearssl_ec.h */
+ 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len) /* returns the GEN array and stores its length (32) in *len */
+{
+ (void)curve; /* the curve identifier is not used by this implementation */
+ *len = 32;
+ return GEN;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len) /* returns the ORDER array and stores its length (32) in *len */
+{
+ (void)curve; /* the curve identifier is not used by this implementation */
+ *len = 32;
+ return ORDER;
+}
+
+static size_t
+api_xoff(int curve, size_t *len) /* stores 32 in *len and returns 0. NOTE(review): presumably the length and offset of the X coordinate inside an encoded point -- confirm against bearssl_ec.h */
+{
+ (void)curve; /* the curve identifier is not used by this implementation */
+ *len = 32;
+ return 0;
+}
+
+/*
+ * A field element is encoded as five 64-bit integers, in basis 2^51.
+ * Limbs may be occasionally larger than 2^51, to save on carry
+ * propagation costs.
+ *
+ * MASK51 is 2^51 - 1, i.e. the mask that extracts the low 51 bits of
+ * a limb.
+ */
+
+#define MASK51 (((uint64_t)1 << 51) - (uint64_t)1)
+
+/*
+ * Swap two field elements, conditionally on a flag.
+ *
+ * ctl must be 0 (no swap) or 1 (swap); it is expanded into a 64-bit
+ * mask and applied to each of the five limbs, without any branch on
+ * its value.
+ */
+static inline void
+f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
+{
+ uint64_t m, w;
+
+ m = -(uint64_t)ctl; /* 1 becomes an all-ones mask, 0 stays a zero mask */
+ w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
+ w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
+ w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
+ w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
+ w = m & (a[4] ^ b[4]); a[4] ^= w; b[4] ^= w;
+}
+
+/*
+ * Addition with no carry propagation. Limbs double in size.
+ *
+ * d receives the limb-wise sum of a and b (five 64-bit limbs each);
+ * no masking or reduction is applied, so the caller must ensure the
+ * limb sums do not overflow 64 bits.
+ */
+static inline void
+f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+ d[0] = a[0] + b[0];
+ d[1] = a[1] + b[1];
+ d[2] = a[2] + b[2];
+ d[3] = a[3] + b[3];
+ d[4] = a[4] + b[4];
+}
+
+/*
+ * Subtraction.
+ * On input, limbs must fit on 60 bits each. On output, result is
+ * partially reduced, with max value 2^255+19456; moreover, all
+ * limbs will fit on 51 bits, except the low limb, which may have
+ * value up to 2^51+19455.
+ *
+ * d receives a - b (five limbs in basis 2^51).
+ */
+static inline void
+f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+ uint64_t cc, w;
+
+ /*
+ * We compute d = (2^255-19)*1024 + a - b. Since the limbs
+ * fit on 60 bits, the maximum value of operands are slightly
+ * more than 2^264, but much less than 2^265-19456. This
+ * ensures that the result is positive.
+ */
+
+ /*
+ * Initial carry is 19456, since we add 2^265-19456. Each
+ * individual subtraction may yield a carry up to 513.
+ */
+ w = a[0] - b[0] - 19456;
+ d[0] = w & MASK51;
+ cc = -(w >> 51) & 0x3FF; /* borrow for the next limb: negated upper bits of w, truncated to 10 bits */
+ w = a[1] - b[1] - cc;
+ d[1] = w & MASK51;
+ cc = -(w >> 51) & 0x3FF;
+ w = a[2] - b[2] - cc;
+ d[2] = w & MASK51;
+ cc = -(w >> 51) & 0x3FF;
+ w = a[3] - b[3] - cc;
+ d[3] = w & MASK51;
+ cc = -(w >> 51) & 0x3FF;
+ d[4] = ((uint64_t)1 << 61) + a[4] - b[4] - cc; /* 2^61 in limb 4 is the 2^265 term (4*51 + 61 = 265) */
+
+ /*
+ * Partial reduction. The intermediate result may be up to
+ * slightly above 2^265, but less than 2^265+2^255. When we
+ * truncate to 255 bits, the upper bits will be at most 1024.
+ */
+ d[0] += 19 * (d[4] >> 51); /* bits 255 and up are folded back with a factor 19 */
+ d[4] &= MASK51;
+}
+
+/*
+ * UMUL51(hi, lo, x, y) computes:
+ *
+ * hi = floor((x * y) / (2^51))
+ * lo = x * y mod 2^51
+ *
+ * Note that lo < 2^51, but "hi" may be larger, if the input operands are
+ * larger.
+ *
+ * hi and lo are assigned to, so they must be lvalues of a 64-bit type.
+ * The BR_INT128 variant uses the compiler's unsigned __int128 type; the
+ * BR_UMUL128 variant uses the _umul128() intrinsic and rebuilds "hi"
+ * from the two 64-bit halves, keeping only the low 51 bits of the upper
+ * half (umul_hi << 13).
+ */
+#if BR_INT128
+
+#define UMUL51(hi, lo, x, y) do { \
+ unsigned __int128 umul_tmp; \
+ umul_tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \
+ (hi) = (uint64_t)(umul_tmp >> 51); \
+ (lo) = (uint64_t)umul_tmp & MASK51; \
+ } while (0)
+
+#elif BR_UMUL128
+
+#define UMUL51(hi, lo, x, y) do { \
+ uint64_t umul_hi, umul_lo; \
+ umul_lo = _umul128((x), (y), &umul_hi); \
+ (hi) = (umul_hi << 13) | (umul_lo >> 51); \
+ (lo) = umul_lo & MASK51; \
+ } while (0)
+
+#endif
+
+/*
+ * Multiplication.
+ * On input, limbs must fit on 54 bits each.
+ * On output, limb 0 is at most 2^51 + 155647, and other limbs fit
+ * on 51 bits each.
+ *
+ * d receives a * b (five limbs in basis 2^51). a[] and b[] are only
+ * read, although they are not const-qualified; d[] is written only
+ * after all products have been accumulated in t[].
+ */
+static inline void
+f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
+{
+ uint64_t t[10], hi, lo, w, cc;
+
+ /*
+ * Perform cross products, accumulating values without carry
+ * propagation.
+ *
+ * Since input limbs fit on 54 bits each, each individual
+ * UMUL51 will produce a "hi" of less than 2^57. The maximum
+ * sum will be at most 5*(2^57-1) + 4*(2^51-1) (for t[5]),
+ * i.e. less than 324*2^51.
+ */
+
+ UMUL51(t[1], t[0], a[0], b[0]); /* each product a[i]*b[j] adds its low part to t[i+j] and its high part to t[i+j+1] */
+
+ UMUL51(t[2], lo, a[1], b[0]); t[1] += lo;
+ UMUL51(hi, lo, a[0], b[1]); t[1] += lo; t[2] += hi;
+
+ UMUL51(t[3], lo, a[2], b[0]); t[2] += lo;
+ UMUL51(hi, lo, a[1], b[1]); t[2] += lo; t[3] += hi;
+ UMUL51(hi, lo, a[0], b[2]); t[2] += lo; t[3] += hi;
+
+ UMUL51(t[4], lo, a[3], b[0]); t[3] += lo;
+ UMUL51(hi, lo, a[2], b[1]); t[3] += lo; t[4] += hi;
+ UMUL51(hi, lo, a[1], b[2]); t[3] += lo; t[4] += hi;
+ UMUL51(hi, lo, a[0], b[3]); t[3] += lo; t[4] += hi;
+
+ UMUL51(t[5], lo, a[4], b[0]); t[4] += lo;
+ UMUL51(hi, lo, a[3], b[1]); t[4] += lo; t[5] += hi;
+ UMUL51(hi, lo, a[2], b[2]); t[4] += lo; t[5] += hi;
+ UMUL51(hi, lo, a[1], b[3]); t[4] += lo; t[5] += hi;
+ UMUL51(hi, lo, a[0], b[4]); t[4] += lo; t[5] += hi;
+
+ UMUL51(t[6], lo, a[4], b[1]); t[5] += lo;
+ UMUL51(hi, lo, a[3], b[2]); t[5] += lo; t[6] += hi;
+ UMUL51(hi, lo, a[2], b[3]); t[5] += lo; t[6] += hi;
+ UMUL51(hi, lo, a[1], b[4]); t[5] += lo; t[6] += hi;
+
+ UMUL51(t[7], lo, a[4], b[2]); t[6] += lo;
+ UMUL51(hi, lo, a[3], b[3]); t[6] += lo; t[7] += hi;
+ UMUL51(hi, lo, a[2], b[4]); t[6] += lo; t[7] += hi;
+
+ UMUL51(t[8], lo, a[4], b[3]); t[7] += lo;
+ UMUL51(hi, lo, a[3], b[4]); t[7] += lo; t[8] += hi;
+
+ UMUL51(t[9], lo, a[4], b[4]); t[8] += lo;
+
+ /*
+ * The upper words t[5]..t[9] are folded back into the lower
+ * words, using the rule that 2^255 = 19 in the field.
+ *
+ * Since each t[i] is less than 324*2^51, the additions below
+ * will yield less than 6480*2^51 in each limb; this fits in
+ * 64 bits (6480*2^51 < 8192*2^51 = 2^64), hence there is
+ * no overflow.
+ */
+ t[0] += 19 * t[5];
+ t[1] += 19 * t[6];
+ t[2] += 19 * t[7];
+ t[3] += 19 * t[8];
+ t[4] += 19 * t[9];
+
+ /*
+ * Propagate carries.
+ */
+ w = t[0];
+ d[0] = w & MASK51;
+ cc = w >> 51;
+ w = t[1] + cc;
+ d[1] = w & MASK51;
+ cc = w >> 51;
+ w = t[2] + cc;
+ d[2] = w & MASK51;
+ cc = w >> 51;
+ w = t[3] + cc;
+ d[3] = w & MASK51;
+ cc = w >> 51;
+ w = t[4] + cc;
+ d[4] = w & MASK51;
+ cc = w >> 51;
+
+ /*
+ * Since the limbs were 64-bit values, the top carry is at
+ * most 8192 (in practice, that cannot be reached). We simply
+ * performed a partial reduction.
+ */
+ d[0] += 19 * cc; /* the carry out of limb 4 is worth 2^255, i.e. 19 in the field */
+}
+
+/*
+ * Multiplication by A24 = 121665.
+ * Input must have limbs of 60 bits at most.
+ *
+ * d receives 121665*a (five limbs in basis 2^51). On output, limbs 1
+ * to 4 are masked to 51 bits; limb 0 may be slightly larger because
+ * the final carry is folded into it without further propagation.
+ */
+static inline void
+f255_mul_a24(uint64_t *d, const uint64_t *a)
+{
+ uint64_t t[5], cc, w;
+
+ /*
+ * 121665 = 15 * 8111. We first multiply by 15, with carry
+ * propagation and partial reduction.
+ */
+ w = a[0] * 15;
+ t[0] = w & MASK51;
+ cc = w >> 51;
+ w = a[1] * 15 + cc;
+ t[1] = w & MASK51;
+ cc = w >> 51;
+ w = a[2] * 15 + cc;
+ t[2] = w & MASK51;
+ cc = w >> 51;
+ w = a[3] * 15 + cc;
+ t[3] = w & MASK51;
+ cc = w >> 51;
+ w = a[4] * 15 + cc;
+ t[4] = w & MASK51;
+ t[0] += 19 * (w >> 51); /* bits 255 and up are folded back with a factor 19 */
+
+ /*
+ * Then multiplication by 8111. At that point, we know that
+ * t[0] is less than 2^51 + 19*8192, and other limbs are less
+ * than 2^51; thus, there will be no overflow.
+ */
+ w = t[0] * 8111;
+ d[0] = w & MASK51;
+ cc = w >> 51;
+ w = t[1] * 8111 + cc;
+ d[1] = w & MASK51;
+ cc = w >> 51;
+ w = t[2] * 8111 + cc;
+ d[2] = w & MASK51;
+ cc = w >> 51;
+ w = t[3] * 8111 + cc;
+ d[3] = w & MASK51;
+ cc = w >> 51;
+ w = t[4] * 8111 + cc;
+ d[4] = w & MASK51;
+ d[0] += 19 * (w >> 51); /* same fold as after the multiplication by 15 */
+}
+
+/*
+ * Finalize reduction: replace a[] with its unique representative in
+ * the 0..p-1 range (p = 2^255-19), with every limb lower than 2^51.
+ * On input, limbs must fit on 51 bits, except possibly the low limb,
+ * which may be slightly above 2^51.
+ *
+ * The output is always carry-propagated, including when the input was
+ * already below p: callers that pack the limbs with shifts and ORs
+ * need every limb to fit on 51 bits.
+ */
+static inline void
+f255_final_reduce(uint64_t *a)
+{
+	uint64_t t[5], u[5], cc, bb, w;
+
+	/*
+	 * We add 19, with carry propagation. If the result (in t[]) is
+	 * below 2^255, then a[] is already less than 2^255-19, thus
+	 * already reduced. Otherwise, we subtract 2^255 from t[], in
+	 * which case we have t = a - (2^255-19), and that's our result.
+	 */
+	w = a[0] + 19;
+	t[0] = w & MASK51;
+	cc = w >> 51;
+	w = a[1] + cc;
+	t[1] = w & MASK51;
+	cc = w >> 51;
+	w = a[2] + cc;
+	t[2] = w & MASK51;
+	cc = w >> 51;
+	w = a[3] + cc;
+	t[3] = w & MASK51;
+	cc = w >> 51;
+	w = a[4] + cc;
+	t[4] = w & MASK51;
+	cc = w >> 51;
+
+	/*
+	 * Compute u = t - 19 with borrow propagation. When bit 255 of
+	 * t is 0, u is the integer a itself, but with all limbs below
+	 * 2^51 (a[0] may have been above 2^51 on input, so a[] cannot
+	 * be returned as is). A borrow shows up as bit 63 of the
+	 * wrapped 64-bit difference. When bit 255 of t is 1, u may be
+	 * meaningless, but it is then discarded below.
+	 */
+	w = t[0] - 19;
+	u[0] = w & MASK51;
+	bb = w >> 63;
+	w = t[1] - bb;
+	u[1] = w & MASK51;
+	bb = w >> 63;
+	w = t[2] - bb;
+	u[2] = w & MASK51;
+	bb = w >> 63;
+	w = t[3] - bb;
+	u[3] = w & MASK51;
+	bb = w >> 63;
+	w = t[4] - bb;
+	u[4] = w & MASK51;
+
+	/*
+	 * The bit 255 of t is in cc. If that bit is 0, then the result
+	 * is u[] (the normalized input); otherwise, it is t[]. The
+	 * selection is done with a mask, without any branch.
+	 */
+	cc = -cc;
+	a[0] = u[0] ^ (cc & (u[0] ^ t[0]));
+	a[1] = u[1] ^ (cc & (u[1] ^ t[1]));
+	a[2] = u[2] ^ (cc & (u[2] ^ t[2]));
+	a[3] = u[3] ^ (cc & (u[3] ^ t[3]));
+	a[4] = u[4] ^ (cc & (u[4] ^ t[4]));
+}
+
+/*
+ * Point multiplication (Montgomery ladder on the x coordinate).
+ *
+ * G[] holds the encoded point (exactly 32 bytes, decoded below as a
+ * little-endian integer) and receives the encoded result. kb[] holds
+ * the multiplier in big-endian notation, over at most 32 bytes. The
+ * curve parameter is ignored.
+ *
+ * Returned value is 0 if the lengths are invalid (G[] is then left
+ * untouched), 1 otherwise.
+ */
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+ const unsigned char *kb, size_t kblen, int curve)
+{
+ unsigned char k[32];
+ uint64_t x1[5], x2[5], z2[5], x3[5], z3[5];
+ uint32_t swap;
+ int i;
+
+ (void)curve;
+
+ /*
+ * Points are encoded over exactly 32 bytes. Multipliers must fit
+ * in 32 bytes as well.
+ */
+ if (Glen != 32 || kblen > 32) {
+ return 0;
+ }
+
+ /*
+ * RFC 7748 mandates that the high bit of the last point byte must
+ * be ignored/cleared; the "& MASK51" in the initialization for
+ * x1[4] clears that bit.
+ *
+ * Each limb takes 51 consecutive bits of the 255-bit value: limb j
+ * starts at bit 51*j, i.e. at byte offset (51*j)/8 (rounded down)
+ * plus a shift of a few bits. Offsets 19 and 24 are used for the
+ * last two limbs so that the 8-byte reads stay within G[0..31].
+ */
+ x1[0] = br_dec64le(&G[0]) & MASK51;
+ x1[1] = (br_dec64le(&G[6]) >> 3) & MASK51;
+ x1[2] = (br_dec64le(&G[12]) >> 6) & MASK51;
+ x1[3] = (br_dec64le(&G[19]) >> 1) & MASK51;
+ x1[4] = (br_dec64le(&G[24]) >> 12) & MASK51;
+
+ /*
+ * We can use memset() to clear values, because exact-width types
+ * like uint64_t are guaranteed to have no padding bits or
+ * trap representations.
+ *
+ * Ladder start: (x2:z2) = (1:0) and (x3:z3) = (x1:1).
+ */
+ memset(x2, 0, sizeof x2);
+ x2[0] = 1;
+ memset(z2, 0, sizeof z2);
+ memcpy(x3, x1, sizeof x1);
+ memcpy(z3, x2, sizeof x2);
+
+ /*
+ * The multiplier is provided in big-endian notation, and
+ * possibly shorter than 32 bytes.
+ *
+ * It is left-padded with zeros into k[]. Since k[] is big-endian,
+ * k[31] is the least significant byte (its three low bits are
+ * cleared) and k[0] is the most significant byte (bit 255 is
+ * cleared and bit 254 is set).
+ */
+ memset(k, 0, (sizeof k) - kblen);
+ memcpy(k + (sizeof k) - kblen, kb, kblen);
+ k[31] &= 0xF8;
+ k[0] &= 0x7F;
+ k[0] |= 0x40;
+
+ swap = 0;
+
+ for (i = 254; i >= 0; i --) {
+ uint64_t a[5], aa[5], b[5], bb[5], e[5];
+ uint64_t c[5], d[5], da[5], cb[5];
+ uint32_t kt;
+
+ /*
+ * kt is bit i of the multiplier. The swap is applied only
+ * when that bit differs from the previous one, and is done
+ * with masks (f255_cswap) rather than with a branch.
+ */
+ kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
+ swap ^= kt;
+ f255_cswap(x2, x3, swap);
+ f255_cswap(z2, z3, swap);
+ swap = kt;
+
+ /*
+ * At that point, limbs of x_2 and z_2 are assumed to fit
+ * on at most 52 bits each.
+ *
+ * Each f255_add() adds one bit to the maximum range of
+ * the values, but f255_sub() and f255_mul() bring back
+ * the limbs into 52 bits. All f255_add() outputs are
+ * used only as inputs for f255_mul(), which ensures
+ * that limbs remain in the proper range.
+ */
+
+ /* A = x_2 + z_2 -- limbs fit on 53 bits each */
+ f255_add(a, x2, z2);
+
+ /* AA = A^2 */
+ f255_mul(aa, a, a);
+
+ /* B = x_2 - z_2 */
+ f255_sub(b, x2, z2);
+
+ /* BB = B^2 */
+ f255_mul(bb, b, b);
+
+ /* E = AA - BB */
+ f255_sub(e, aa, bb);
+
+ /* C = x_3 + z_3 -- limbs fit on 53 bits each */
+ f255_add(c, x3, z3);
+
+ /* D = x_3 - z_3 */
+ f255_sub(d, x3, z3);
+
+ /* DA = D * A */
+ f255_mul(da, d, a);
+
+ /* CB = C * B */
+ f255_mul(cb, c, b);
+
+ /* x_3 = (DA + CB)^2 */
+ f255_add(x3, da, cb);
+ f255_mul(x3, x3, x3);
+
+ /* z_3 = x_1 * (DA - CB)^2 */
+ f255_sub(z3, da, cb);
+ f255_mul(z3, z3, z3);
+ f255_mul(z3, x1, z3);
+
+ /* x_2 = AA * BB */
+ f255_mul(x2, aa, bb);
+
+ /* z_2 = E * (AA + a24 * E) */
+ f255_mul_a24(z2, e);
+ f255_add(z2, aa, z2);
+ f255_mul(z2, e, z2);
+ }
+
+ /*
+ * Apply the swap still pending from the last multiplier bit.
+ */
+ f255_cswap(x2, x3, swap);
+ f255_cswap(z2, z3, swap);
+
+ /*
+ * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
+ * most non-squarings. We use x1 and x3, now useless, as temporaries.
+ *
+ * Exponent bookkeeping (p-2 = 2^255-21):
+ * - first loop: x1 = z2^(2^16-1);
+ * - second loop: x3 = z2^(2^240-1);
+ * - third loop: 15 more squarings, multiplying by z2 for each
+ * set bit among the 15 low bits of 0xFFEB, which yields
+ * exponent (2^240-1)*2^15 + 0x7FEB = 2^255-21.
+ */
+ memcpy(x1, z2, sizeof z2);
+ for (i = 0; i < 15; i ++) {
+ f255_mul(x1, x1, x1);
+ f255_mul(x1, x1, z2);
+ }
+ memcpy(x3, x1, sizeof x1);
+ for (i = 0; i < 14; i ++) {
+ int j;
+
+ for (j = 0; j < 16; j ++) {
+ f255_mul(x3, x3, x3);
+ }
+ f255_mul(x3, x3, x1);
+ }
+ for (i = 14; i >= 0; i --) {
+ f255_mul(x3, x3, x3);
+ if ((0xFFEB >> i) & 1) {
+ f255_mul(x3, z2, x3);
+ }
+ }
+
+ /*
+ * Compute x2/z2. We have 1/z2 in x3.
+ */
+ f255_mul(x2, x2, x3);
+ f255_final_reduce(x2);
+
+ /*
+ * Encode the final x2 value in little-endian. We first assemble
+ * the limbs into 64-bit values.
+ *
+ * NOTE(review): the OR-based packing below is exact only if every
+ * limb of x2 is lower than 2^51 at this point; this relies on
+ * f255_final_reduce() returning fully carry-propagated limbs.
+ */
+ x2[0] |= x2[1] << 51;
+ x2[1] = (x2[1] >> 13) | (x2[2] << 38);
+ x2[2] = (x2[2] >> 26) | (x2[3] << 25);
+ x2[3] = (x2[3] >> 39) | (x2[4] << 12);
+ br_enc64le(G, x2[0]);
+ br_enc64le(G + 8, x2[1]);
+ br_enc64le(G + 16, x2[2]);
+ br_enc64le(G + 24, x2[3]);
+ return 1;
+}
+
+/*
+ * Multiply the conventional generator by x[] (big-endian, xlen bytes)
+ * and write the encoded result into R[]. The generator is first
+ * copied into R[], then multiplied in place. The returned value is the
+ * length of the generator encoding; the status reported by api_mul()
+ * is not propagated.
+ */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	size_t gen_len;
+	const unsigned char *gen = api_generator(curve, &gen_len);
+
+	memcpy(R, gen, gen_len);
+	(void)api_mul(R, gen_len, x, xlen, curve);
+	return gen_len;
+}
+
+/*
+ * Combined multiply-and-add entry point: deliberately unimplemented.
+ * It only serves ECDSA, and there is no ECDSA over Curve25519 (EdDSA
+ * is used instead). Every argument is ignored and 0 is returned.
+ */
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/* Mark all parameters as intentionally unused. */
+	(void)curve;
+	(void)ylen;
+	(void)y;
+	(void)xlen;
+	(void)x;
+	(void)len;
+	(void)B;
+	(void)A;
+
+	return 0;
+}
+
+/* see bearssl_ec.h */
+/*
+ * Implementation descriptor for this file: a mask of supported curves
+ * followed by the entry points defined above.
+ * The mask has only bit 29 set; br_ec_keygen() tests bit number
+ * 'curve' of supported_curves, so a single curve identifier (29) is
+ * accepted -- presumably BR_EC_curve25519; confirm in bearssl_ec.h.
+ */
+const br_ec_impl br_ec_c25519_m62 = {
+ (uint32_t)0x20000000,
+ &api_generator,
+ &api_order,
+ &api_xoff,
+ &api_mul,
+ &api_mulgen,
+ &api_muladd
+};
+
+/* see bearssl_ec.h */
+const br_ec_impl *
+br_ec_c25519_m62_get(void)
+{
+	/* This variant is compiled in: hand out its descriptor. */
+	const br_ec_impl *impl = &br_ec_c25519_m62;
+
+	return impl;
+}
+
+#else
+
+/* see bearssl_ec.h */
+const br_ec_impl *
+br_ec_c25519_m62_get(void)
+{
+	/* This variant is not compiled in: report its absence. */
+	const br_ec_impl *none = 0;
+
+	return none;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_m64.c b/test/monniaux/BearSSL/src/ec/ec_c25519_m64.c
new file mode 100644
index 00000000..7e7f12f7
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_c25519_m64.c
@@ -0,0 +1,835 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+/*
+ * Encoded generator, as returned by api_generator(): 32 bytes, the
+ * first one being 9 and all others 0. Since api_mul() decodes points
+ * as little-endian integers, this is the value 9.
+ */
+static const unsigned char GEN[] = {
+ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+/*
+ * Value returned by api_order(): 32 bytes, 0x7F followed by 0xFF
+ * bytes. Read as a big-endian integer (the convention api_mul() uses
+ * for multipliers), this is 2^255-1.
+ * NOTE(review): this is not the order of the curve's prime subgroup;
+ * presumably it serves as an upper bound when generating multipliers
+ * -- confirm against bearssl_ec.h.
+ */
+static const unsigned char ORDER[] = {
+ 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+/*
+ * Get the encoded generator. The curve identifier is ignored; *len
+ * is set to the encoding length (32 bytes).
+ */
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	const unsigned char *gen = GEN;
+
+	*len = 32;
+	(void)curve;
+	return gen;
+}
+
+/*
+ * Get the ORDER[] array. The curve identifier is ignored; *len is
+ * set to the array length (32 bytes).
+ */
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	const unsigned char *ord = ORDER;
+
+	*len = 32;
+	(void)curve;
+	return ord;
+}
+
+/*
+ * Get the offset and length of the X coordinate within an encoded
+ * point: the offset (returned) is 0 and *len is set to 32. The curve
+ * identifier is ignored.
+ */
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	const size_t x_offset = 0;
+
+	*len = 32;
+	(void)curve;
+	return x_offset;
+}
+
+/*
+ * A field element is encoded as four 64-bit integers, in basis 2^64
+ * (least significant limb first: a[0] + a[1]*2^64 + a[2]*2^128
+ * + a[3]*2^192).
+ * Operations return partially reduced values, which may range up to
+ * 2^255+37.
+ */
+
+/* Mask for the 63 low bits of a 64-bit word (2^63-1). */
+#define MASK63 (((uint64_t)1 << 63) - (uint64_t)1)
+
+/*
+ * Conditional swap of two field elements (four limbs each): a[] and
+ * b[] are exchanged if ctl is 1, and left unchanged if ctl is 0.
+ * The exchange is done with an all-zeros or all-ones mask, without
+ * any branch on ctl.
+ */
+static inline void
+f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
+{
+	uint64_t mask, diff;
+	int j;
+
+	mask = -(uint64_t)ctl;
+	for (j = 0; j < 4; j ++) {
+		diff = (a[j] ^ b[j]) & mask;
+		a[j] ^= diff;
+		b[j] ^= diff;
+	}
+}
+
+/*
+ * Addition in the field: d = a + b, partially reduced.
+ * The range analysis in the body assumes that both operands are at
+ * most 2^255+37; the result is then at most 2^255+37 as well.
+ * Two variants are provided: one using the 128-bit integer type
+ * (BR_INT128), one using add-with-carry intrinsics (BR_UMUL128).
+ */
+static inline void
+f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+ uint64_t t0, t1, t2, t3, cc;
+ unsigned __int128 z;
+
+ /*
+ * Plain integer addition, limb by limb; the upper half of z is
+ * the carry into the next limb. For the top limb, the result is
+ * split at bit 63 of that limb (bit 255 of the whole sum): t3
+ * keeps the low 63 bits and cc gets everything above.
+ */
+ z = (unsigned __int128)a[0] + (unsigned __int128)b[0];
+ t0 = (uint64_t)z;
+ z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);
+ t1 = (uint64_t)z;
+ z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);
+ t2 = (uint64_t)z;
+ z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);
+ t3 = (uint64_t)z & MASK63;
+ cc = (uint64_t)(z >> 63);
+
+ /*
+ * Since operands are at most 2^255+37, the sum is at most
+ * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
+ *
+ * We use: 2^255 = 19 mod p.
+ * Since we add 0, 19 or 38 to a value that fits on 255 bits,
+ * the result is at most 2^255+37.
+ */
+ z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);
+ d[0] = (uint64_t)z;
+ z = (unsigned __int128)t1 + (z >> 64);
+ d[1] = (uint64_t)z;
+ z = (unsigned __int128)t2 + (z >> 64);
+ d[2] = (uint64_t)z;
+ d[3] = t3 + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+ uint64_t t0, t1, t2, t3, cc;
+ unsigned char k;
+
+ /*
+ * Same computation with carry-chain intrinsics: k is the carry
+ * out of the top limb (weight 2^256) and bit 63 of t3 has weight
+ * 2^255, hence cc = 2*k + (bit 63 of t3).
+ */
+ k = _addcarry_u64(0, a[0], b[0], &t0);
+ k = _addcarry_u64(k, a[1], b[1], &t1);
+ k = _addcarry_u64(k, a[2], b[2], &t2);
+ k = _addcarry_u64(k, a[3], b[3], &t3);
+ cc = (k << 1) + (t3 >> 63);
+ t3 &= MASK63;
+
+ /*
+ * Since operands are at most 2^255+37, the sum is at most
+ * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
+ *
+ * We use: 2^255 = 19 mod p.
+ * Since we add 0, 19 or 38 to a value that fits on 255 bits,
+ * the result is at most 2^255+37.
+ */
+ k = _addcarry_u64(0, t0, 19 * cc, &d[0]);
+ k = _addcarry_u64(k, t1, 0, &d[1]);
+ k = _addcarry_u64(k, t2, 0, &d[2]);
+ (void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Subtraction in the field: d = a - b, partially reduced.
+ * The range analysis in the body assumes that both operands are at
+ * most 2^255+37; the result is then at most 2^255+37 as well.
+ * Two variants are provided: one using the 128-bit integer type
+ * (BR_INT128), one using carry/borrow intrinsics (BR_UMUL128).
+ */
+static inline void
+f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+ /*
+ * We compute t = 2^256 - 38 + a - b, which is necessarily
+ * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
+ * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
+ * on the two upper bits of t (bits 255 and 256).
+ */
+
+ uint64_t t0, t1, t2, t3, t4, cc;
+ unsigned __int128 z;
+
+ /*
+ * Limb-wise subtraction. When a limb subtraction wraps around,
+ * the upper 64 bits of z are all ones, so -(uint64_t)(z >> 64)
+ * is the borrow (0 or 1) for the next limb.
+ */
+ z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;
+ t0 = (uint64_t)z;
+ cc = -(uint64_t)(z >> 64);
+ z = (unsigned __int128)a[1] - (unsigned __int128)b[1]
+ - (unsigned __int128)cc;
+ t1 = (uint64_t)z;
+ cc = -(uint64_t)(z >> 64);
+ z = (unsigned __int128)a[2] - (unsigned __int128)b[2]
+ - (unsigned __int128)cc;
+ t2 = (uint64_t)z;
+ cc = -(uint64_t)(z >> 64);
+ z = (unsigned __int128)a[3] - (unsigned __int128)b[3]
+ - (unsigned __int128)cc;
+ t3 = (uint64_t)z;
+ /* t4 is bit 256 of t: the added 2^256, minus the final borrow. */
+ t4 = 1 + (uint64_t)(z >> 64);
+
+ /*
+ * We have a 257-bit result. The two top bits can be 00, 01 or 10,
+ * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
+ * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
+ * This guarantees that the result is at most 2^255+37.
+ */
+ cc = (38 & -t4) + (19 & -(t3 >> 63));
+ t3 &= MASK63;
+ z = (unsigned __int128)t0 + (unsigned __int128)cc;
+ d[0] = (uint64_t)z;
+ z = (unsigned __int128)t1 + (z >> 64);
+ d[1] = (uint64_t)z;
+ z = (unsigned __int128)t2 + (z >> 64);
+ d[2] = (uint64_t)z;
+ d[3] = t3 + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+ /*
+ * We compute t = 2^256 - 38 + a - b, which is necessarily
+ * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
+ * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
+ * on the two upper bits of t (bits 255 and 256).
+ */
+
+ uint64_t t0, t1, t2, t3, t4;
+ unsigned char k;
+
+ /*
+ * First chain: a - b, with the final borrow taken out of the
+ * added 2^256 (t4 starts from 1).
+ */
+ k = _subborrow_u64(0, a[0], b[0], &t0);
+ k = _subborrow_u64(k, a[1], b[1], &t1);
+ k = _subborrow_u64(k, a[2], b[2], &t2);
+ k = _subborrow_u64(k, a[3], b[3], &t3);
+ (void)_subborrow_u64(k, 1, 0, &t4);
+
+ /*
+ * Second chain: subtract 38 from the five-word value.
+ */
+ k = _subborrow_u64(0, t0, 38, &t0);
+ k = _subborrow_u64(k, t1, 0, &t1);
+ k = _subborrow_u64(k, t2, 0, &t2);
+ k = _subborrow_u64(k, t3, 0, &t3);
+ (void)_subborrow_u64(k, t4, 0, &t4);
+
+ /*
+ * We have a 257-bit result. The two top bits can be 00, 01 or 10,
+ * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
+ * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
+ * This guarantees that the result is at most 2^255+37.
+ */
+ t4 = (38 & -t4) + (19 & -(t3 >> 63));
+ t3 &= MASK63;
+ k = _addcarry_u64(0, t0, t4, &d[0]);
+ k = _addcarry_u64(k, t1, 0, &d[1]);
+ k = _addcarry_u64(k, t2, 0, &d[2]);
+ (void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Multiplication in the field: d = a * b, partially reduced.
+ * The range analysis in the body assumes that both operands are at
+ * most 2^255+37; the result is then at most 2^255+37 as well.
+ * The operands are only read; d may be the same array as a and/or b,
+ * since d[] is written only after all reads of a[] and b[].
+ * Two variants are provided: one using the 128-bit integer type
+ * (BR_INT128), one using 64x64->128 multiplication and add-with-carry
+ * intrinsics (BR_UMUL128).
+ */
+static inline void
+f255_mul(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+	unsigned __int128 z;
+	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
+
+	/*
+	 * Compute the product a*b over plain integers.
+	 * This is a schoolbook multiplication, one row per limb of a;
+	 * the upper half of z is the carry into the next column.
+	 */
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[0];
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);
+	t3 = (uint64_t)z;
+	t4 = (uint64_t)(z >> 64);
+
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[0]
+		+ (unsigned __int128)t1;
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[1]
+		+ (unsigned __int128)t2 + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[2]
+		+ (unsigned __int128)t3 + (z >> 64);
+	t3 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[3]
+		+ (unsigned __int128)t4 + (z >> 64);
+	t4 = (uint64_t)z;
+	t5 = (uint64_t)(z >> 64);
+
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[0]
+		+ (unsigned __int128)t2;
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[1]
+		+ (unsigned __int128)t3 + (z >> 64);
+	t3 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[2]
+		+ (unsigned __int128)t4 + (z >> 64);
+	t4 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[3]
+		+ (unsigned __int128)t5 + (z >> 64);
+	t5 = (uint64_t)z;
+	t6 = (uint64_t)(z >> 64);
+
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[0]
+		+ (unsigned __int128)t3;
+	t3 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[1]
+		+ (unsigned __int128)t4 + (z >> 64);
+	t4 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[2]
+		+ (unsigned __int128)t5 + (z >> 64);
+	t5 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[3]
+		+ (unsigned __int128)t6 + (z >> 64);
+	t6 = (uint64_t)z;
+	t7 = (uint64_t)(z >> 64);
+
+	/*
+	 * Modulo p, we have:
+	 *
+	 *   2^255 = 19
+	 *   2^510 = 19*19 = 361
+	 *
+	 * We split the intermediate t into three parts, in basis
+	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
+	 * The upper one can only be a single bit (th), since the
+	 * multiplication operands are at most 2^255+37 each.
+	 *
+	 * The split point (bit 255) is bit 63 of t3, so the middle part
+	 * is obtained by shifting t3..t7 left by one bit.
+	 */
+	th = t7 >> 62;
+	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
+	t6 = (t6 << 1) | (t5 >> 63);
+	t5 = (t5 << 1) | (t4 >> 63);
+	t4 = (t4 << 1) | (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Multiply the middle part (t4..t7) by 19. We truncate it to
+	 * 255 bits; the extra bits will go along with th.
+	 */
+	z = (unsigned __int128)t4 * 19;
+	t4 = (uint64_t)z;
+	z = (unsigned __int128)t5 * 19 + (z >> 64);
+	t5 = (uint64_t)z;
+	z = (unsigned __int128)t6 * 19 + (z >> 64);
+	t6 = (uint64_t)z;
+	z = (unsigned __int128)t7 * 19 + (z >> 64);
+	t7 = (uint64_t)z & MASK63;
+
+	th = (361 & -th) + (19 * (uint64_t)(z >> 63));
+
+	/*
+	 * Add elements together.
+	 * At this point:
+	 *   t0..t3 fits on 255 bits.
+	 *   t4..t7 fits on 255 bits.
+	 *   th <= 361 + 342 = 703.
+	 */
+	z = (unsigned __int128)t0 + (unsigned __int128)t4
+		+ (unsigned __int128)th;
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);
+	t3 = (uint64_t)z & MASK63;
+	th = (uint64_t)(z >> 63);
+
+	/*
+	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
+	 * can only have value 0, 1 or 2. We just add th*19, which
+	 * guarantees a result of at most 2^255+37.
+	 */
+	z = (unsigned __int128)t0 + (19 * th);
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = t3 + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
+	uint64_t h0, h1, h2, h3;
+	unsigned char k;
+
+	/*
+	 * Compute the product a*b over plain integers.
+	 * For each row, the low halves of the four products are added
+	 * first (one carry chain), then the high halves, shifted by one
+	 * word (second carry chain).
+	 */
+	t0 = _umul128(a[0], b[0], &h0);
+	t1 = _umul128(a[0], b[1], &h1);
+	k = _addcarry_u64(0, t1, h0, &t1);
+	t2 = _umul128(a[0], b[2], &h2);
+	k = _addcarry_u64(k, t2, h1, &t2);
+	t3 = _umul128(a[0], b[3], &h3);
+	k = _addcarry_u64(k, t3, h2, &t3);
+	(void)_addcarry_u64(k, h3, 0, &t4);
+
+	k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);
+	k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);
+	k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);
+	k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);
+	t5 = k;
+	k = _addcarry_u64(0, t2, h0, &t2);
+	k = _addcarry_u64(k, t3, h1, &t3);
+	k = _addcarry_u64(k, t4, h2, &t4);
+	(void)_addcarry_u64(k, t5, h3, &t5);
+
+	k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);
+	k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);
+	k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);
+	k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);
+	t6 = k;
+	k = _addcarry_u64(0, t3, h0, &t3);
+	k = _addcarry_u64(k, t4, h1, &t4);
+	k = _addcarry_u64(k, t5, h2, &t5);
+	(void)_addcarry_u64(k, t6, h3, &t6);
+
+	k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);
+	k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);
+	k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);
+	k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);
+	t7 = k;
+	k = _addcarry_u64(0, t4, h0, &t4);
+	k = _addcarry_u64(k, t5, h1, &t5);
+	k = _addcarry_u64(k, t6, h2, &t6);
+	(void)_addcarry_u64(k, t7, h3, &t7);
+
+	/*
+	 * Modulo p, we have:
+	 *
+	 *   2^255 = 19
+	 *   2^510 = 19*19 = 361
+	 *
+	 * We split the intermediate t into three parts, in basis
+	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
+	 * The upper one can only be a single bit (th), since the
+	 * multiplication operands are at most 2^255+37 each.
+	 *
+	 * The split point (bit 255) is bit 63 of t3, so the middle part
+	 * is obtained by shifting t3..t7 left by one bit.
+	 */
+	th = t7 >> 62;
+	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
+	t6 = (t6 << 1) | (t5 >> 63);
+	t5 = (t5 << 1) | (t4 >> 63);
+	t4 = (t4 << 1) | (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Multiply the middle part (t4..t7) by 19. We truncate it to
+	 * 255 bits; the extra bits will go along with th.
+	 */
+	t4 = _umul128(t4, 19, &h0);
+	t5 = _umul128(t5, 19, &h1);
+	t6 = _umul128(t6, 19, &h2);
+	t7 = _umul128(t7, 19, &h3);
+	k = _addcarry_u64(0, t5, h0, &t5);
+	k = _addcarry_u64(k, t6, h1, &t6);
+	k = _addcarry_u64(k, t7, h2, &t7);
+	(void)_addcarry_u64(k, h3, 0, &h3);
+	th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));
+	t7 &= MASK63;
+
+	/*
+	 * Add elements together.
+	 * At this point:
+	 *   t0..t3 fits on 255 bits.
+	 *   t4..t7 fits on 255 bits.
+	 *   th <= 361 + 342 = 703.
+	 */
+	k = _addcarry_u64(0, t0, t4, &t0);
+	k = _addcarry_u64(k, t1, t5, &t1);
+	k = _addcarry_u64(k, t2, t6, &t2);
+	k = _addcarry_u64(k, t3, t7, &t3);
+	t4 = k;
+	k = _addcarry_u64(0, t0, th, &t0);
+	k = _addcarry_u64(k, t1, 0, &t1);
+	k = _addcarry_u64(k, t2, 0, &t2);
+	k = _addcarry_u64(k, t3, 0, &t3);
+	(void)_addcarry_u64(k, t4, 0, &t4);
+
+	th = (t4 << 1) + (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
+	 * can only have value 0, 1 or 2. We just add th*19, which
+	 * guarantees a result of at most 2^255+37.
+	 */
+	k = _addcarry_u64(0, t0, 19 * th, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Multiplication by A24 = 121665: d = 121665 * a, partially reduced.
+ * The multiplication is done over plain integers; the bits at and
+ * above position 255 are then folded back twice, using 2^255 = 19
+ * mod p.
+ * NOTE(review): the source input is presumably at most 2^255+37, like
+ * the output of the other operations of this file -- confirm.
+ */
+static inline void
+f255_mul_a24(uint64_t *d, const uint64_t *a)
+{
+#if BR_INT128
+
+ uint64_t t0, t1, t2, t3;
+ unsigned __int128 z;
+
+ /*
+ * Plain multiplication by the constant; the upper half of z is
+ * the carry into the next limb. The top limb is truncated to 63
+ * bits, the extra bits remaining in z.
+ */
+ z = (unsigned __int128)a[0] * 121665;
+ t0 = (uint64_t)z;
+ z = (unsigned __int128)a[1] * 121665 + (z >> 64);
+ t1 = (uint64_t)z;
+ z = (unsigned __int128)a[2] * 121665 + (z >> 64);
+ t2 = (uint64_t)z;
+ z = (unsigned __int128)a[3] * 121665 + (z >> 64);
+ t3 = (uint64_t)z & MASK63;
+
+ /*
+ * First folding: the bits of weight 2^255 and more (z >> 63) are
+ * multiplied by 19 and added to the low part.
+ */
+ z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63));
+ t0 = (uint64_t)z;
+ z = (unsigned __int128)t1 + (z >> 64);
+ t1 = (uint64_t)z;
+ z = (unsigned __int128)t2 + (z >> 64);
+ t2 = (uint64_t)z;
+ t3 = t3 + (uint64_t)(z >> 64);
+
+ /*
+ * Second folding: the previous addition may have set bit 63 of
+ * t3 (bit 255 of the value); if so, that bit is cleared and 19
+ * is added.
+ */
+ z = (unsigned __int128)t0 + (19 & -(t3 >> 63));
+ d[0] = (uint64_t)z;
+ z = (unsigned __int128)t1 + (z >> 64);
+ d[1] = (uint64_t)z;
+ z = (unsigned __int128)t2 + (z >> 64);
+ d[2] = (uint64_t)z;
+ d[3] = (t3 & MASK63) + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+ uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3;
+ unsigned char k;
+
+ /*
+ * Plain multiplication by the constant; t4 receives the extra
+ * top word.
+ */
+ t0 = _umul128(a[0], 121665, &h0);
+ t1 = _umul128(a[1], 121665, &h1);
+ k = _addcarry_u64(0, t1, h0, &t1);
+ t2 = _umul128(a[2], 121665, &h2);
+ k = _addcarry_u64(k, t2, h1, &t2);
+ t3 = _umul128(a[3], 121665, &h3);
+ k = _addcarry_u64(k, t3, h2, &t3);
+ (void)_addcarry_u64(k, h3, 0, &t4);
+
+ /*
+ * First folding: t4 is made to hold all bits of weight 2^255 and
+ * more, then 19 times that value is added to the low part.
+ */
+ t4 = (t4 << 1) + (t3 >> 63);
+ t3 &= MASK63;
+ k = _addcarry_u64(0, t0, 19 * t4, &t0);
+ k = _addcarry_u64(k, t1, 0, &t1);
+ k = _addcarry_u64(k, t2, 0, &t2);
+ (void)_addcarry_u64(k, t3, 0, &t3);
+
+ /*
+ * Second folding: if the previous addition set bit 63 of t3,
+ * that bit is cleared and 19 is added.
+ */
+ t4 = 19 & -(t3 >> 63);
+ t3 &= MASK63;
+ k = _addcarry_u64(0, t0, t4, &d[0]);
+ k = _addcarry_u64(k, t1, 0, &d[1]);
+ k = _addcarry_u64(k, t2, 0, &d[2]);
+ (void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Finalize reduction: a[] is replaced, in place, with its fully
+ * reduced representative (in the 0..p-1 range, with p = 2^255-19).
+ * The selection between the two candidate values is done with a
+ * mask, without any branch.
+ * NOTE(review): the input is presumably a partially reduced value (at
+ * most 2^255+37, as produced by the other operations of this file);
+ * a single conditional subtraction of p is performed.
+ */
+static inline void
+f255_final_reduce(uint64_t *a)
+{
+#if BR_INT128
+
+ uint64_t t0, t1, t2, t3, m;
+ unsigned __int128 z;
+
+ /*
+ * We add 19. If the result (in t) is below 2^255, then a[]
+ * is already less than 2^255-19, thus already reduced.
+ * Otherwise, we subtract 2^255 from t[], in which case we
+ * have t = a - (2^255-19), and that's our result.
+ */
+ z = (unsigned __int128)a[0] + 19;
+ t0 = (uint64_t)z;
+ z = (unsigned __int128)a[1] + (z >> 64);
+ t1 = (uint64_t)z;
+ z = (unsigned __int128)a[2] + (z >> 64);
+ t2 = (uint64_t)z;
+ t3 = a[3] + (uint64_t)(z >> 64);
+
+ /*
+ * Bit 63 of t3 is bit 255 of t. m is all-ones if that bit is
+ * set (a[] is then replaced with t, minus 2^255), zero
+ * otherwise (a[] is then kept unchanged).
+ */
+ m = -(t3 >> 63);
+ t3 &= MASK63;
+ a[0] ^= m & (a[0] ^ t0);
+ a[1] ^= m & (a[1] ^ t1);
+ a[2] ^= m & (a[2] ^ t2);
+ a[3] ^= m & (a[3] ^ t3);
+
+#elif BR_UMUL128
+
+ uint64_t t0, t1, t2, t3, m;
+ unsigned char k;
+
+ /*
+ * We add 19. If the result (in t) is below 2^255, then a[]
+ * is already less than 2^255-19, thus already reduced.
+ * Otherwise, we subtract 2^255 from t[], in which case we
+ * have t = a - (2^255-19), and that's our result.
+ */
+ k = _addcarry_u64(0, a[0], 19, &t0);
+ k = _addcarry_u64(k, a[1], 0, &t1);
+ k = _addcarry_u64(k, a[2], 0, &t2);
+ (void)_addcarry_u64(k, a[3], 0, &t3);
+
+ /*
+ * Bit 63 of t3 is bit 255 of t. m is all-ones if that bit is
+ * set (a[] is then replaced with t, minus 2^255), zero
+ * otherwise (a[] is then kept unchanged).
+ */
+ m = -(t3 >> 63);
+ t3 &= MASK63;
+ a[0] ^= m & (a[0] ^ t0);
+ a[1] ^= m & (a[1] ^ t1);
+ a[2] ^= m & (a[2] ^ t2);
+ a[3] ^= m & (a[3] ^ t3);
+
+#endif
+}
+
+/*
+ * Point multiplication (Montgomery ladder on the x coordinate).
+ *
+ * G[] holds the encoded point (exactly 32 bytes, decoded below as a
+ * little-endian integer) and receives the encoded result. kb[] holds
+ * the multiplier in big-endian notation, over at most 32 bytes. The
+ * curve parameter is ignored.
+ *
+ * Returned value is 0 if the lengths are invalid (G[] is then left
+ * untouched), 1 otherwise.
+ */
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+ const unsigned char *kb, size_t kblen, int curve)
+{
+ unsigned char k[32];
+ uint64_t x1[4], x2[4], z2[4], x3[4], z3[4];
+ uint32_t swap;
+ int i;
+
+ (void)curve;
+
+ /*
+ * Points are encoded over exactly 32 bytes. Multipliers must fit
+ * in 32 bytes as well.
+ */
+ if (Glen != 32 || kblen > 32) {
+ return 0;
+ }
+
+ /*
+ * RFC 7748 mandates that the high bit of the last point byte must
+ * be ignored/cleared.
+ * (This is what the "& MASK63" on the top limb does.)
+ */
+ x1[0] = br_dec64le(&G[ 0]);
+ x1[1] = br_dec64le(&G[ 8]);
+ x1[2] = br_dec64le(&G[16]);
+ x1[3] = br_dec64le(&G[24]) & MASK63;
+
+ /*
+ * We can use memset() to clear values, because exact-width types
+ * like uint64_t are guaranteed to have no padding bits or
+ * trap representations.
+ *
+ * Ladder start: (x2:z2) = (1:0) and (x3:z3) = (x1:1).
+ */
+ memset(x2, 0, sizeof x2);
+ x2[0] = 1;
+ memset(z2, 0, sizeof z2);
+ memcpy(x3, x1, sizeof x1);
+ memcpy(z3, x2, sizeof x2);
+
+ /*
+ * The multiplier is provided in big-endian notation, and
+ * possibly shorter than 32 bytes.
+ *
+ * It is left-padded with zeros into k[]. Since k[] is big-endian,
+ * k[31] is the least significant byte (its three low bits are
+ * cleared) and k[0] is the most significant byte (bit 255 is
+ * cleared and bit 254 is set).
+ */
+ memset(k, 0, (sizeof k) - kblen);
+ memcpy(k + (sizeof k) - kblen, kb, kblen);
+ k[31] &= 0xF8;
+ k[0] &= 0x7F;
+ k[0] |= 0x40;
+
+ swap = 0;
+
+ for (i = 254; i >= 0; i --) {
+ uint64_t a[4], aa[4], b[4], bb[4], e[4];
+ uint64_t c[4], d[4], da[4], cb[4];
+ uint32_t kt;
+
+ /*
+ * kt is bit i of the multiplier. The swap is applied only
+ * when that bit differs from the previous one, and is done
+ * with masks (f255_cswap) rather than with a branch.
+ */
+ kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
+ swap ^= kt;
+ f255_cswap(x2, x3, swap);
+ f255_cswap(z2, z3, swap);
+ swap = kt;
+
+ /* A = x_2 + z_2 */
+ f255_add(a, x2, z2);
+
+ /* AA = A^2 */
+ f255_mul(aa, a, a);
+
+ /* B = x_2 - z_2 */
+ f255_sub(b, x2, z2);
+
+ /* BB = B^2 */
+ f255_mul(bb, b, b);
+
+ /* E = AA - BB */
+ f255_sub(e, aa, bb);
+
+ /* C = x_3 + z_3 */
+ f255_add(c, x3, z3);
+
+ /* D = x_3 - z_3 */
+ f255_sub(d, x3, z3);
+
+ /* DA = D * A */
+ f255_mul(da, d, a);
+
+ /* CB = C * B */
+ f255_mul(cb, c, b);
+
+ /* x_3 = (DA + CB)^2 */
+ f255_add(x3, da, cb);
+ f255_mul(x3, x3, x3);
+
+ /* z_3 = x_1 * (DA - CB)^2 */
+ f255_sub(z3, da, cb);
+ f255_mul(z3, z3, z3);
+ f255_mul(z3, x1, z3);
+
+ /* x_2 = AA * BB */
+ f255_mul(x2, aa, bb);
+
+ /* z_2 = E * (AA + a24 * E) */
+ f255_mul_a24(z2, e);
+ f255_add(z2, aa, z2);
+ f255_mul(z2, e, z2);
+ }
+
+ /*
+ * Apply the swap still pending from the last multiplier bit.
+ */
+ f255_cswap(x2, x3, swap);
+ f255_cswap(z2, z3, swap);
+
+ /*
+ * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
+ * most non-squarings. We use x1 and x3, now useless, as temporaries.
+ *
+ * Exponent bookkeeping (p-2 = 2^255-21):
+ * - first loop: x1 = z2^(2^16-1);
+ * - second loop: x3 = z2^(2^240-1);
+ * - third loop: 15 more squarings, multiplying by z2 for each
+ * set bit among the 15 low bits of 0xFFEB, which yields
+ * exponent (2^240-1)*2^15 + 0x7FEB = 2^255-21.
+ */
+ memcpy(x1, z2, sizeof z2);
+ for (i = 0; i < 15; i ++) {
+ f255_mul(x1, x1, x1);
+ f255_mul(x1, x1, z2);
+ }
+ memcpy(x3, x1, sizeof x1);
+ for (i = 0; i < 14; i ++) {
+ int j;
+
+ for (j = 0; j < 16; j ++) {
+ f255_mul(x3, x3, x3);
+ }
+ f255_mul(x3, x3, x1);
+ }
+ for (i = 14; i >= 0; i --) {
+ f255_mul(x3, x3, x3);
+ if ((0xFFEB >> i) & 1) {
+ f255_mul(x3, z2, x3);
+ }
+ }
+
+ /*
+ * Compute x2/z2. We have 1/z2 in x3.
+ */
+ f255_mul(x2, x2, x3);
+ f255_final_reduce(x2);
+
+ /*
+ * Encode the final x2 value in little-endian.
+ * The four limbs are full 64-bit words, so they are written out
+ * directly, least significant limb first.
+ */
+ br_enc64le(G, x2[0]);
+ br_enc64le(G + 8, x2[1]);
+ br_enc64le(G + 16, x2[2]);
+ br_enc64le(G + 24, x2[3]);
+ return 1;
+}
+
+/*
+ * Multiply the conventional generator by x[] (big-endian, xlen bytes)
+ * and write the encoded result into R[]. The generator is first
+ * copied into R[], then multiplied in place. The returned value is the
+ * length of the generator encoding; the status reported by api_mul()
+ * is not propagated.
+ */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	size_t gen_len;
+	const unsigned char *gen = api_generator(curve, &gen_len);
+
+	memcpy(R, gen, gen_len);
+	(void)api_mul(R, gen_len, x, xlen, curve);
+	return gen_len;
+}
+
+/*
+ * Combined multiply-and-add entry point: deliberately unimplemented.
+ * It only serves ECDSA, and there is no ECDSA over Curve25519 (EdDSA
+ * is used instead). Every argument is ignored and 0 is returned.
+ */
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/* Mark all parameters as intentionally unused. */
+	(void)curve;
+	(void)ylen;
+	(void)y;
+	(void)xlen;
+	(void)x;
+	(void)len;
+	(void)B;
+	(void)A;
+
+	return 0;
+}
+
+/* see bearssl_ec.h */
+/*
+ * Implementation descriptor for this file: a mask of supported curves
+ * followed by the entry points defined above.
+ * The mask has only bit 29 set; br_ec_keygen() tests bit number
+ * 'curve' of supported_curves, so a single curve identifier (29) is
+ * accepted -- presumably BR_EC_curve25519; confirm in bearssl_ec.h.
+ */
+const br_ec_impl br_ec_c25519_m64 = {
+ (uint32_t)0x20000000,
+ &api_generator,
+ &api_order,
+ &api_xoff,
+ &api_mul,
+ &api_mulgen,
+ &api_muladd
+};
+
+/* see bearssl_ec.h */
+const br_ec_impl *
+br_ec_c25519_m64_get(void)
+{
+	/* This variant is compiled in: hand out its descriptor. */
+	const br_ec_impl *impl = &br_ec_c25519_m64;
+
+	return impl;
+}
+
+#else
+
+/* see bearssl_ec.h */
+const br_ec_impl *
+br_ec_c25519_m64_get(void)
+{
+	/* This variant is not compiled in: report its absence. */
+	const br_ec_impl *none = 0;
+
+	return none;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/ec/ec_curve25519.c b/test/monniaux/BearSSL/src/ec/ec_curve25519.c
new file mode 100644
index 00000000..a47d215e
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_curve25519.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Encoded generator referenced by br_curve25519: 32 bytes, the first
+ * one being 9 and all others 0.
+ * NOTE(review): presumably the little-endian encoding of u = 9, as in
+ * RFC 7748 -- the decoding code is not in this file.
+ */
+static const unsigned char GEN[] = {
+ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+/*
+ * "Order" value referenced by br_curve25519: 32 bytes, 0x7F followed
+ * by 0xFF bytes.
+ * NOTE(review): read as a big-endian integer this is 2^255-1, which is
+ * not the order of the curve's prime subgroup; presumably an upper
+ * bound for multipliers -- confirm against inner.h.
+ */
+static const unsigned char ORDER[] = {
+ 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};
+
+/* see inner.h */
+/*
+ * Curve definition record: the curve identifier, then the ORDER[]
+ * array with its size, then the GEN[] array with its size.
+ */
+const br_ec_curve_def br_curve25519 = {
+ BR_EC_curve25519,
+ ORDER, sizeof ORDER,
+ GEN, sizeof GEN
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_default.c b/test/monniaux/BearSSL/src/ec/ec_default.c
new file mode 100644
index 00000000..7bb6e0c7
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_default.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_ec.h */
+const br_ec_impl *
+br_ec_get_default(void)
+{
+	const br_ec_impl *impl;
+
+	/*
+	 * The choice is made at compile time: the "m15" aggregate
+	 * implementation when BR_LOMUL is set, the "m31" one otherwise.
+	 */
+#if BR_LOMUL
+	impl = &br_ec_all_m15;
+#else
+	impl = &br_ec_all_m31;
+#endif
+	return impl;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ec_keygen.c b/test/monniaux/BearSSL/src/ec/ec_keygen.c
new file mode 100644
index 00000000..02a30962
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_keygen.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * see bearssl_ec.h
+ *
+ * Generate a private key for the given curve by rejection sampling.
+ *
+ * rng_ctx   PRNG context; its generate() method is called once per
+ *           candidate value.
+ * impl      EC implementation; provides supported_curves and order().
+ * sk        if not NULL, receives the curve identifier and a
+ *           pointer/length pair designating the key bytes in kbuf.
+ * kbuf      destination buffer for the key bytes; if NULL, no key is
+ *           generated and only the length is returned.
+ * curve     curve identifier; must be in the 0..31 range and flagged
+ *           in impl->supported_curves.
+ *
+ * Returned value is the key length in bytes (the length of the curve
+ * order once its leading zero bytes are skipped), or 0 if the curve is
+ * out of range or not supported by the implementation.
+ */
+size_t
+br_ec_keygen(const br_prng_class **rng_ctx,
+ const br_ec_impl *impl, br_ec_private_key *sk,
+ void *kbuf, int curve)
+{
+ const unsigned char *order;
+ unsigned char *buf;
+ size_t len;
+ unsigned mask;
+
+ if (curve < 0 || curve >= 32
+ || ((impl->supported_curves >> curve) & 1) == 0)
+ {
+ return 0;
+ }
+ order = impl->order(curve, &len);
+ while (len > 0 && *order == 0) { /* skip leading zero bytes of the order */
+ order ++;
+ len --;
+ }
+ if (kbuf == NULL || len == 0) {
+ return len;
+ }
+ mask = order[0];
+ mask |= (mask >> 1);
+ mask |= (mask >> 2);
+ mask |= (mask >> 4); /* mask now has every bit set up to the top set bit of order[0] */
+
+ /*
+ * We generate sequences of random bits of the right size, until
+ * the value is strictly lower than the curve order (we also
+ * check for all-zero values, which are invalid).
+ */
+ buf = kbuf;
+ for (;;) {
+ size_t u;
+ unsigned cc, zz;
+
+ (*rng_ctx)->generate(rng_ctx, buf, len);
+ buf[0] &= mask; /* trim the top byte to the bit length of the order */
+ cc = 0;
+ u = len;
+ zz = 0;
+ while (u -- > 0) {
+ cc = ((unsigned)(buf[u] - order[u] - cc) >> 8) & 1; /* running borrow of buf - order */
+ zz |= buf[u]; /* stays 0 only if every byte is 0 */
+ }
+ if (cc != 0 && zz != 0) { /* accept: final borrow means buf < order, and buf != 0 */
+ break;
+ }
+ }
+
+ if (sk != NULL) {
+ sk->curve = curve;
+ sk->x = buf; /* points into the caller-provided kbuf */
+ sk->xlen = len;
+ }
+ return len;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ec_p256_m15.c b/test/monniaux/BearSSL/src/ec/ec_p256_m15.c
new file mode 100644
index 00000000..8d68d1d2
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_p256_m15.c
@@ -0,0 +1,2130 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
+ * that right-shifting a signed negative integer copies the sign bit
+ * (arithmetic right-shift). This is "implementation-defined behaviour",
+ * i.e. it is not undefined, but it may differ between compilers. Each
+ * compiler is supposed to document its behaviour in that respect. GCC
+ * explicitly defines that an arithmetic right shift is used. We expect
+ * all other compilers to do the same, because the underlying CPUs offer
+ * an arithmetic right shift opcode that could not be used otherwise.
+ *
+ * Usage notes: the portable variant evaluates 'x' twice and shifts by
+ * 32 - n, so 'n' must be in the 1..31 range; the default variant takes
+ * the address of 'x', which must therefore be a 32-bit lvalue.
+ */
+#if BR_NO_ARITH_SHIFT
+#define ARSH(x, n) (((uint32_t)(x) >> (n)) \
+ | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
+#else
+#define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
+#endif
+
+/*
+ * Convert an integer from unsigned big-endian encoding to a sequence of
+ * 13-bit words in little-endian order. The final "partial" word is
+ * returned.
+ *
+ * Source bytes are consumed from the last one (least significant) to
+ * the first; a destination word is written each time at least 13 bits
+ * are buffered. The bits left in the buffer at the end (fewer than 13)
+ * are NOT written to dst[]: they are the returned value.
+ */
+static uint32_t
+be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
+{
+ uint32_t acc;
+ int acc_len;
+
+ acc = 0; /* bit buffer, least significant bits first */
+ acc_len = 0; /* number of valid bits currently in acc */
+ while (len -- > 0) {
+ acc |= (uint32_t)src[len] << acc_len;
+ acc_len += 8;
+ if (acc_len >= 13) {
+ *dst ++ = acc & 0x1FFF;
+ acc >>= 13;
+ acc_len -= 13;
+ }
+ }
+ return acc;
+}
+
+/*
+ * Convert an integer (13-bit words, little-endian) to unsigned
+ * big-endian encoding. The total encoding length is provided; all
+ * the destination bytes will be filled.
+ *
+ * Bytes are produced from dst[len - 1] (least significant) down to
+ * dst[0]. A new source word is fetched whenever fewer than 8 bits
+ * remain in the bit buffer; source words are OR-ed into the buffer,
+ * so they are expected to fit on 13 bits.
+ */
+static void
+le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
+{
+ uint32_t acc;
+ int acc_len;
+
+ acc = 0; /* bit buffer, least significant bits first */
+ acc_len = 0; /* number of valid bits currently in acc */
+ while (len -- > 0) {
+ if (acc_len < 8) {
+ acc |= (*src ++) << acc_len;
+ acc_len += 13;
+ }
+ dst[len] = (unsigned char)acc;
+ acc >>= 8;
+ acc_len -= 8;
+ }
+}
+
+/*
+ * Normalise an array of words to a strict 13 bits per word. Returned
+ * value is the resulting carry. The source (w) and destination (d)
+ * arrays may be identical, but shall not overlap partially.
+ *
+ * Each input word is added to the running carry and handled as a
+ * signed 32-bit value; the carry is extracted with an arithmetic
+ * right shift (ARSH), so the carry (including the returned one) may
+ * be negative, in which case it is returned as its two's complement
+ * encoding in a uint32_t.
+ */
+static inline uint32_t
+norm13(uint32_t *d, const uint32_t *w, size_t len)
+{
+ size_t u;
+ uint32_t cc;
+
+ cc = 0;
+ for (u = 0; u < len; u ++) {
+ int32_t z;
+
+ z = w[u] + cc;
+ d[u] = z & 0x1FFF; /* keep the low 13 bits */
+ cc = ARSH(z, 13); /* signed carry into the next word */
+ }
+ return cc;
+}
+
+/*
+ * mul20() multiplies two 260-bit integers together. Each word must fit
+ * on 13 bits; source operands use 20 words, destination operand
+ * receives 40 words. All overlaps allowed.
+ *
+ * square20() computes the square of a 260-bit integer. Each word must
+ * fit on 13 bits; source operand uses 20 words, destination operand
+ * receives 40 words. All overlaps allowed.
+ */
+
+#if BR_SLOW_MUL15
+
+static void
+mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ /*
+ * Two-level Karatsuba: turns a 20x20 multiplication into
+ * nine 5x5 multiplications. We use 13-bit words but do not
+ * propagate carries immediately, so words may expand:
+ *
+ * - First Karatsuba decomposition turns the 20x20 mul on
+ * 13-bit words into three 10x10 muls, two on 13-bit words
+ * and one on 14-bit words.
+ *
+ * - Second Karatsuba decomposition further splits these into:
+ *
+ * * four 5x5 muls on 13-bit words
+ * * four 5x5 muls on 14-bit words
+ * * one 5x5 mul on 15-bit words
+ *
+ * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
+ * or 15-bit words, respectively.
+ *
+ * Layout: u[] and v[] each hold nine "slots" of 5 words. Slots
+ * 0..3 are the four 5-word chunks of the operand; slot 4 is
+ * chunk0+chunk1, slot 5 is chunk2+chunk3, slot 6 is
+ * chunk0+chunk2, slot 7 is chunk1+chunk3, and slot 8 is
+ * slot6+slot7. The product of slot k (u by v) is stored in the
+ * ten words w[10*k] .. w[10*k+9]. The ZADD/ZADDT/ZSUB2F macros
+ * take their offsets in units of 5 words; CPR takes its offset
+ * in words.
+ */
+ uint32_t u[45], v[45], w[90];
+ uint32_t cc;
+ int i;
+
+#define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
+ (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
+ + (s2w)[5 * (s2_off) + 0]; \
+ (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
+ + (s2w)[5 * (s2_off) + 1]; \
+ (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
+ + (s2w)[5 * (s2_off) + 2]; \
+ (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
+ + (s2w)[5 * (s2_off) + 3]; \
+ (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
+ + (s2w)[5 * (s2_off) + 4]; \
+ } while (0)
+
+#define ZADDT(dw, d_off, sw, s_off) do { \
+ (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
+ (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
+ (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
+ (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
+ (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
+ } while (0)
+
+#define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
+ (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
+ + (s2w)[5 * (s2_off) + 0]; \
+ (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
+ + (s2w)[5 * (s2_off) + 1]; \
+ (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
+ + (s2w)[5 * (s2_off) + 2]; \
+ (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
+ + (s2w)[5 * (s2_off) + 3]; \
+ (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
+ + (s2w)[5 * (s2_off) + 4]; \
+ } while (0)
+
+#define CPR1(w, cprcc) do { \
+ uint32_t cprz = (w) + cprcc; \
+ (w) = cprz & 0x1FFF; \
+ cprcc = cprz >> 13; \
+ } while (0)
+
+#define CPR(dw, d_off) do { \
+ uint32_t cprcc; \
+ cprcc = 0; \
+ CPR1((dw)[(d_off) + 0], cprcc); \
+ CPR1((dw)[(d_off) + 1], cprcc); \
+ CPR1((dw)[(d_off) + 2], cprcc); \
+ CPR1((dw)[(d_off) + 3], cprcc); \
+ CPR1((dw)[(d_off) + 4], cprcc); \
+ CPR1((dw)[(d_off) + 5], cprcc); \
+ CPR1((dw)[(d_off) + 6], cprcc); \
+ CPR1((dw)[(d_off) + 7], cprcc); \
+ CPR1((dw)[(d_off) + 8], cprcc); \
+ (dw)[(d_off) + 9] = cprcc; \
+ } while (0)
+
+ memcpy(u, a, 20 * sizeof *a);
+ ZADD(u, 4, a, 0, a, 1);
+ ZADD(u, 5, a, 2, a, 3);
+ ZADD(u, 6, a, 0, a, 2);
+ ZADD(u, 7, a, 1, a, 3);
+ ZADD(u, 8, u, 6, u, 7);
+
+ memcpy(v, b, 20 * sizeof *b);
+ ZADD(v, 4, b, 0, b, 1);
+ ZADD(v, 5, b, 2, b, 3);
+ ZADD(v, 6, b, 0, b, 2);
+ ZADD(v, 7, b, 1, b, 3);
+ ZADD(v, 8, v, 6, v, 7);
+
+ /*
+ * Do the eight first 5x5 muls. Source words are at most 16382
+ * each, so we can add product results together "as is" in 32-bit
+ * words.
+ */
+ for (i = 0; i < 40; i += 5) {
+ w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
+ w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
+ + MUL15(u[i + 1], v[i + 0]);
+ w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
+ + MUL15(u[i + 1], v[i + 1])
+ + MUL15(u[i + 2], v[i + 0]);
+ w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
+ + MUL15(u[i + 1], v[i + 2])
+ + MUL15(u[i + 2], v[i + 1])
+ + MUL15(u[i + 3], v[i + 0]);
+ w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
+ + MUL15(u[i + 1], v[i + 3])
+ + MUL15(u[i + 2], v[i + 2])
+ + MUL15(u[i + 3], v[i + 1])
+ + MUL15(u[i + 4], v[i + 0]);
+ w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
+ + MUL15(u[i + 2], v[i + 3])
+ + MUL15(u[i + 3], v[i + 2])
+ + MUL15(u[i + 4], v[i + 1]);
+ w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
+ + MUL15(u[i + 3], v[i + 3])
+ + MUL15(u[i + 4], v[i + 2]);
+ w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
+ + MUL15(u[i + 4], v[i + 3]);
+ w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
+ w[(i << 1) + 9] = 0;
+ }
+
+ /*
+ * For the 9th multiplication, source words are up to 32764,
+ * so we must do some carry propagation. If we add up to
+ * 4 products and the carry is no more than 524224, then the
+ * result fits in 32 bits, and the next carry will be no more
+ * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
+ *
+ * We thus just skip one of the products in the middle word,
+ * then do a carry propagation (this reduces words to 13 bits
+ * each, except possibly the last, which may use up to 17 bits
+ * or so), then add the missing product.
+ */
+ w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
+ w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
+ + MUL15(u[40 + 1], v[40 + 0]);
+ w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
+ + MUL15(u[40 + 1], v[40 + 1])
+ + MUL15(u[40 + 2], v[40 + 0]);
+ w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
+ + MUL15(u[40 + 1], v[40 + 2])
+ + MUL15(u[40 + 2], v[40 + 1])
+ + MUL15(u[40 + 3], v[40 + 0]);
+ w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
+ + MUL15(u[40 + 1], v[40 + 3])
+ + MUL15(u[40 + 2], v[40 + 2])
+ + MUL15(u[40 + 3], v[40 + 1]);
+ /* + MUL15(u[40 + 4], v[40 + 0]) */
+ w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
+ + MUL15(u[40 + 2], v[40 + 3])
+ + MUL15(u[40 + 3], v[40 + 2])
+ + MUL15(u[40 + 4], v[40 + 1]);
+ w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
+ + MUL15(u[40 + 3], v[40 + 3])
+ + MUL15(u[40 + 4], v[40 + 2]);
+ w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
+ + MUL15(u[40 + 4], v[40 + 3]);
+ w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
+
+ CPR(w, 80);
+
+ w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
+
+ /*
+ * The products on 14-bit words in slots 6 and 7 yield values
+ * up to 5*(16382^2) each, and we need to subtract two such
+ * values from the higher word. We need the subtraction to fit
+ * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
+ * However, 10*(16382^2) does not fit. So we must perform a
+ * bit of reduction here.
+ */
+ CPR(w, 60);
+ CPR(w, 70);
+
+ /*
+ * Recompose results.
+ */
+
+ /* 0..1*0..1 into 0..3 */
+ ZSUB2F(w, 8, w, 0, w, 2);
+ ZSUB2F(w, 9, w, 1, w, 3);
+ ZADDT(w, 1, w, 8);
+ ZADDT(w, 2, w, 9);
+
+ /* 2..3*2..3 into 4..7 */
+ ZSUB2F(w, 10, w, 4, w, 6);
+ ZSUB2F(w, 11, w, 5, w, 7);
+ ZADDT(w, 5, w, 10);
+ ZADDT(w, 6, w, 11);
+
+ /* (0..1+2..3)*(0..1+2..3) into 12..15 */
+ ZSUB2F(w, 16, w, 12, w, 14);
+ ZSUB2F(w, 17, w, 13, w, 15);
+ ZADDT(w, 13, w, 16);
+ ZADDT(w, 14, w, 17);
+
+ /* first-level recomposition */
+ ZSUB2F(w, 12, w, 0, w, 4);
+ ZSUB2F(w, 13, w, 1, w, 5);
+ ZSUB2F(w, 14, w, 2, w, 6);
+ ZSUB2F(w, 15, w, 3, w, 7);
+ ZADDT(w, 2, w, 12);
+ ZADDT(w, 3, w, 13);
+ ZADDT(w, 4, w, 14);
+ ZADDT(w, 5, w, 15);
+
+ /*
+ * Perform carry propagation to bring all words down to 13 bits.
+ */
+ cc = norm13(d, w, 40);
+ d[39] += (cc << 13); /* fold the final carry back into the top word */
+
+#undef ZADD
+#undef ZADDT
+#undef ZSUB2F
+#undef CPR1
+#undef CPR
+}
+
+static inline void
+square20(uint32_t *d, const uint32_t *a)
+{
+ mul20(d, a, a); /* no dedicated squaring in this variant: reuse the generic multiplication */
+}
+
+#else
+
+static void
+mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ uint32_t t[39]; /* t[k] accumulates every product a[i]*b[j] with i + j == k (fully unrolled schoolbook multiplication) */
+
+ t[ 0] = MUL15(a[ 0], b[ 0]);
+ t[ 1] = MUL15(a[ 0], b[ 1])
+ + MUL15(a[ 1], b[ 0]);
+ t[ 2] = MUL15(a[ 0], b[ 2])
+ + MUL15(a[ 1], b[ 1])
+ + MUL15(a[ 2], b[ 0]);
+ t[ 3] = MUL15(a[ 0], b[ 3])
+ + MUL15(a[ 1], b[ 2])
+ + MUL15(a[ 2], b[ 1])
+ + MUL15(a[ 3], b[ 0]);
+ t[ 4] = MUL15(a[ 0], b[ 4])
+ + MUL15(a[ 1], b[ 3])
+ + MUL15(a[ 2], b[ 2])
+ + MUL15(a[ 3], b[ 1])
+ + MUL15(a[ 4], b[ 0]);
+ t[ 5] = MUL15(a[ 0], b[ 5])
+ + MUL15(a[ 1], b[ 4])
+ + MUL15(a[ 2], b[ 3])
+ + MUL15(a[ 3], b[ 2])
+ + MUL15(a[ 4], b[ 1])
+ + MUL15(a[ 5], b[ 0]);
+ t[ 6] = MUL15(a[ 0], b[ 6])
+ + MUL15(a[ 1], b[ 5])
+ + MUL15(a[ 2], b[ 4])
+ + MUL15(a[ 3], b[ 3])
+ + MUL15(a[ 4], b[ 2])
+ + MUL15(a[ 5], b[ 1])
+ + MUL15(a[ 6], b[ 0]);
+ t[ 7] = MUL15(a[ 0], b[ 7])
+ + MUL15(a[ 1], b[ 6])
+ + MUL15(a[ 2], b[ 5])
+ + MUL15(a[ 3], b[ 4])
+ + MUL15(a[ 4], b[ 3])
+ + MUL15(a[ 5], b[ 2])
+ + MUL15(a[ 6], b[ 1])
+ + MUL15(a[ 7], b[ 0]);
+ t[ 8] = MUL15(a[ 0], b[ 8])
+ + MUL15(a[ 1], b[ 7])
+ + MUL15(a[ 2], b[ 6])
+ + MUL15(a[ 3], b[ 5])
+ + MUL15(a[ 4], b[ 4])
+ + MUL15(a[ 5], b[ 3])
+ + MUL15(a[ 6], b[ 2])
+ + MUL15(a[ 7], b[ 1])
+ + MUL15(a[ 8], b[ 0]);
+ t[ 9] = MUL15(a[ 0], b[ 9])
+ + MUL15(a[ 1], b[ 8])
+ + MUL15(a[ 2], b[ 7])
+ + MUL15(a[ 3], b[ 6])
+ + MUL15(a[ 4], b[ 5])
+ + MUL15(a[ 5], b[ 4])
+ + MUL15(a[ 6], b[ 3])
+ + MUL15(a[ 7], b[ 2])
+ + MUL15(a[ 8], b[ 1])
+ + MUL15(a[ 9], b[ 0]);
+ t[10] = MUL15(a[ 0], b[10])
+ + MUL15(a[ 1], b[ 9])
+ + MUL15(a[ 2], b[ 8])
+ + MUL15(a[ 3], b[ 7])
+ + MUL15(a[ 4], b[ 6])
+ + MUL15(a[ 5], b[ 5])
+ + MUL15(a[ 6], b[ 4])
+ + MUL15(a[ 7], b[ 3])
+ + MUL15(a[ 8], b[ 2])
+ + MUL15(a[ 9], b[ 1])
+ + MUL15(a[10], b[ 0]);
+ t[11] = MUL15(a[ 0], b[11])
+ + MUL15(a[ 1], b[10])
+ + MUL15(a[ 2], b[ 9])
+ + MUL15(a[ 3], b[ 8])
+ + MUL15(a[ 4], b[ 7])
+ + MUL15(a[ 5], b[ 6])
+ + MUL15(a[ 6], b[ 5])
+ + MUL15(a[ 7], b[ 4])
+ + MUL15(a[ 8], b[ 3])
+ + MUL15(a[ 9], b[ 2])
+ + MUL15(a[10], b[ 1])
+ + MUL15(a[11], b[ 0]);
+ t[12] = MUL15(a[ 0], b[12])
+ + MUL15(a[ 1], b[11])
+ + MUL15(a[ 2], b[10])
+ + MUL15(a[ 3], b[ 9])
+ + MUL15(a[ 4], b[ 8])
+ + MUL15(a[ 5], b[ 7])
+ + MUL15(a[ 6], b[ 6])
+ + MUL15(a[ 7], b[ 5])
+ + MUL15(a[ 8], b[ 4])
+ + MUL15(a[ 9], b[ 3])
+ + MUL15(a[10], b[ 2])
+ + MUL15(a[11], b[ 1])
+ + MUL15(a[12], b[ 0]);
+ t[13] = MUL15(a[ 0], b[13])
+ + MUL15(a[ 1], b[12])
+ + MUL15(a[ 2], b[11])
+ + MUL15(a[ 3], b[10])
+ + MUL15(a[ 4], b[ 9])
+ + MUL15(a[ 5], b[ 8])
+ + MUL15(a[ 6], b[ 7])
+ + MUL15(a[ 7], b[ 6])
+ + MUL15(a[ 8], b[ 5])
+ + MUL15(a[ 9], b[ 4])
+ + MUL15(a[10], b[ 3])
+ + MUL15(a[11], b[ 2])
+ + MUL15(a[12], b[ 1])
+ + MUL15(a[13], b[ 0]);
+ t[14] = MUL15(a[ 0], b[14])
+ + MUL15(a[ 1], b[13])
+ + MUL15(a[ 2], b[12])
+ + MUL15(a[ 3], b[11])
+ + MUL15(a[ 4], b[10])
+ + MUL15(a[ 5], b[ 9])
+ + MUL15(a[ 6], b[ 8])
+ + MUL15(a[ 7], b[ 7])
+ + MUL15(a[ 8], b[ 6])
+ + MUL15(a[ 9], b[ 5])
+ + MUL15(a[10], b[ 4])
+ + MUL15(a[11], b[ 3])
+ + MUL15(a[12], b[ 2])
+ + MUL15(a[13], b[ 1])
+ + MUL15(a[14], b[ 0]);
+ t[15] = MUL15(a[ 0], b[15])
+ + MUL15(a[ 1], b[14])
+ + MUL15(a[ 2], b[13])
+ + MUL15(a[ 3], b[12])
+ + MUL15(a[ 4], b[11])
+ + MUL15(a[ 5], b[10])
+ + MUL15(a[ 6], b[ 9])
+ + MUL15(a[ 7], b[ 8])
+ + MUL15(a[ 8], b[ 7])
+ + MUL15(a[ 9], b[ 6])
+ + MUL15(a[10], b[ 5])
+ + MUL15(a[11], b[ 4])
+ + MUL15(a[12], b[ 3])
+ + MUL15(a[13], b[ 2])
+ + MUL15(a[14], b[ 1])
+ + MUL15(a[15], b[ 0]);
+ t[16] = MUL15(a[ 0], b[16])
+ + MUL15(a[ 1], b[15])
+ + MUL15(a[ 2], b[14])
+ + MUL15(a[ 3], b[13])
+ + MUL15(a[ 4], b[12])
+ + MUL15(a[ 5], b[11])
+ + MUL15(a[ 6], b[10])
+ + MUL15(a[ 7], b[ 9])
+ + MUL15(a[ 8], b[ 8])
+ + MUL15(a[ 9], b[ 7])
+ + MUL15(a[10], b[ 6])
+ + MUL15(a[11], b[ 5])
+ + MUL15(a[12], b[ 4])
+ + MUL15(a[13], b[ 3])
+ + MUL15(a[14], b[ 2])
+ + MUL15(a[15], b[ 1])
+ + MUL15(a[16], b[ 0]);
+ t[17] = MUL15(a[ 0], b[17])
+ + MUL15(a[ 1], b[16])
+ + MUL15(a[ 2], b[15])
+ + MUL15(a[ 3], b[14])
+ + MUL15(a[ 4], b[13])
+ + MUL15(a[ 5], b[12])
+ + MUL15(a[ 6], b[11])
+ + MUL15(a[ 7], b[10])
+ + MUL15(a[ 8], b[ 9])
+ + MUL15(a[ 9], b[ 8])
+ + MUL15(a[10], b[ 7])
+ + MUL15(a[11], b[ 6])
+ + MUL15(a[12], b[ 5])
+ + MUL15(a[13], b[ 4])
+ + MUL15(a[14], b[ 3])
+ + MUL15(a[15], b[ 2])
+ + MUL15(a[16], b[ 1])
+ + MUL15(a[17], b[ 0]);
+ t[18] = MUL15(a[ 0], b[18])
+ + MUL15(a[ 1], b[17])
+ + MUL15(a[ 2], b[16])
+ + MUL15(a[ 3], b[15])
+ + MUL15(a[ 4], b[14])
+ + MUL15(a[ 5], b[13])
+ + MUL15(a[ 6], b[12])
+ + MUL15(a[ 7], b[11])
+ + MUL15(a[ 8], b[10])
+ + MUL15(a[ 9], b[ 9])
+ + MUL15(a[10], b[ 8])
+ + MUL15(a[11], b[ 7])
+ + MUL15(a[12], b[ 6])
+ + MUL15(a[13], b[ 5])
+ + MUL15(a[14], b[ 4])
+ + MUL15(a[15], b[ 3])
+ + MUL15(a[16], b[ 2])
+ + MUL15(a[17], b[ 1])
+ + MUL15(a[18], b[ 0]);
+ t[19] = MUL15(a[ 0], b[19])
+ + MUL15(a[ 1], b[18])
+ + MUL15(a[ 2], b[17])
+ + MUL15(a[ 3], b[16])
+ + MUL15(a[ 4], b[15])
+ + MUL15(a[ 5], b[14])
+ + MUL15(a[ 6], b[13])
+ + MUL15(a[ 7], b[12])
+ + MUL15(a[ 8], b[11])
+ + MUL15(a[ 9], b[10])
+ + MUL15(a[10], b[ 9])
+ + MUL15(a[11], b[ 8])
+ + MUL15(a[12], b[ 7])
+ + MUL15(a[13], b[ 6])
+ + MUL15(a[14], b[ 5])
+ + MUL15(a[15], b[ 4])
+ + MUL15(a[16], b[ 3])
+ + MUL15(a[17], b[ 2])
+ + MUL15(a[18], b[ 1])
+ + MUL15(a[19], b[ 0]);
+ t[20] = MUL15(a[ 1], b[19])
+ + MUL15(a[ 2], b[18])
+ + MUL15(a[ 3], b[17])
+ + MUL15(a[ 4], b[16])
+ + MUL15(a[ 5], b[15])
+ + MUL15(a[ 6], b[14])
+ + MUL15(a[ 7], b[13])
+ + MUL15(a[ 8], b[12])
+ + MUL15(a[ 9], b[11])
+ + MUL15(a[10], b[10])
+ + MUL15(a[11], b[ 9])
+ + MUL15(a[12], b[ 8])
+ + MUL15(a[13], b[ 7])
+ + MUL15(a[14], b[ 6])
+ + MUL15(a[15], b[ 5])
+ + MUL15(a[16], b[ 4])
+ + MUL15(a[17], b[ 3])
+ + MUL15(a[18], b[ 2])
+ + MUL15(a[19], b[ 1]);
+ t[21] = MUL15(a[ 2], b[19])
+ + MUL15(a[ 3], b[18])
+ + MUL15(a[ 4], b[17])
+ + MUL15(a[ 5], b[16])
+ + MUL15(a[ 6], b[15])
+ + MUL15(a[ 7], b[14])
+ + MUL15(a[ 8], b[13])
+ + MUL15(a[ 9], b[12])
+ + MUL15(a[10], b[11])
+ + MUL15(a[11], b[10])
+ + MUL15(a[12], b[ 9])
+ + MUL15(a[13], b[ 8])
+ + MUL15(a[14], b[ 7])
+ + MUL15(a[15], b[ 6])
+ + MUL15(a[16], b[ 5])
+ + MUL15(a[17], b[ 4])
+ + MUL15(a[18], b[ 3])
+ + MUL15(a[19], b[ 2]);
+ t[22] = MUL15(a[ 3], b[19])
+ + MUL15(a[ 4], b[18])
+ + MUL15(a[ 5], b[17])
+ + MUL15(a[ 6], b[16])
+ + MUL15(a[ 7], b[15])
+ + MUL15(a[ 8], b[14])
+ + MUL15(a[ 9], b[13])
+ + MUL15(a[10], b[12])
+ + MUL15(a[11], b[11])
+ + MUL15(a[12], b[10])
+ + MUL15(a[13], b[ 9])
+ + MUL15(a[14], b[ 8])
+ + MUL15(a[15], b[ 7])
+ + MUL15(a[16], b[ 6])
+ + MUL15(a[17], b[ 5])
+ + MUL15(a[18], b[ 4])
+ + MUL15(a[19], b[ 3]);
+ t[23] = MUL15(a[ 4], b[19])
+ + MUL15(a[ 5], b[18])
+ + MUL15(a[ 6], b[17])
+ + MUL15(a[ 7], b[16])
+ + MUL15(a[ 8], b[15])
+ + MUL15(a[ 9], b[14])
+ + MUL15(a[10], b[13])
+ + MUL15(a[11], b[12])
+ + MUL15(a[12], b[11])
+ + MUL15(a[13], b[10])
+ + MUL15(a[14], b[ 9])
+ + MUL15(a[15], b[ 8])
+ + MUL15(a[16], b[ 7])
+ + MUL15(a[17], b[ 6])
+ + MUL15(a[18], b[ 5])
+ + MUL15(a[19], b[ 4]);
+ t[24] = MUL15(a[ 5], b[19])
+ + MUL15(a[ 6], b[18])
+ + MUL15(a[ 7], b[17])
+ + MUL15(a[ 8], b[16])
+ + MUL15(a[ 9], b[15])
+ + MUL15(a[10], b[14])
+ + MUL15(a[11], b[13])
+ + MUL15(a[12], b[12])
+ + MUL15(a[13], b[11])
+ + MUL15(a[14], b[10])
+ + MUL15(a[15], b[ 9])
+ + MUL15(a[16], b[ 8])
+ + MUL15(a[17], b[ 7])
+ + MUL15(a[18], b[ 6])
+ + MUL15(a[19], b[ 5]);
+ t[25] = MUL15(a[ 6], b[19])
+ + MUL15(a[ 7], b[18])
+ + MUL15(a[ 8], b[17])
+ + MUL15(a[ 9], b[16])
+ + MUL15(a[10], b[15])
+ + MUL15(a[11], b[14])
+ + MUL15(a[12], b[13])
+ + MUL15(a[13], b[12])
+ + MUL15(a[14], b[11])
+ + MUL15(a[15], b[10])
+ + MUL15(a[16], b[ 9])
+ + MUL15(a[17], b[ 8])
+ + MUL15(a[18], b[ 7])
+ + MUL15(a[19], b[ 6]);
+ t[26] = MUL15(a[ 7], b[19])
+ + MUL15(a[ 8], b[18])
+ + MUL15(a[ 9], b[17])
+ + MUL15(a[10], b[16])
+ + MUL15(a[11], b[15])
+ + MUL15(a[12], b[14])
+ + MUL15(a[13], b[13])
+ + MUL15(a[14], b[12])
+ + MUL15(a[15], b[11])
+ + MUL15(a[16], b[10])
+ + MUL15(a[17], b[ 9])
+ + MUL15(a[18], b[ 8])
+ + MUL15(a[19], b[ 7]);
+ t[27] = MUL15(a[ 8], b[19])
+ + MUL15(a[ 9], b[18])
+ + MUL15(a[10], b[17])
+ + MUL15(a[11], b[16])
+ + MUL15(a[12], b[15])
+ + MUL15(a[13], b[14])
+ + MUL15(a[14], b[13])
+ + MUL15(a[15], b[12])
+ + MUL15(a[16], b[11])
+ + MUL15(a[17], b[10])
+ + MUL15(a[18], b[ 9])
+ + MUL15(a[19], b[ 8]);
+ t[28] = MUL15(a[ 9], b[19])
+ + MUL15(a[10], b[18])
+ + MUL15(a[11], b[17])
+ + MUL15(a[12], b[16])
+ + MUL15(a[13], b[15])
+ + MUL15(a[14], b[14])
+ + MUL15(a[15], b[13])
+ + MUL15(a[16], b[12])
+ + MUL15(a[17], b[11])
+ + MUL15(a[18], b[10])
+ + MUL15(a[19], b[ 9]);
+ t[29] = MUL15(a[10], b[19])
+ + MUL15(a[11], b[18])
+ + MUL15(a[12], b[17])
+ + MUL15(a[13], b[16])
+ + MUL15(a[14], b[15])
+ + MUL15(a[15], b[14])
+ + MUL15(a[16], b[13])
+ + MUL15(a[17], b[12])
+ + MUL15(a[18], b[11])
+ + MUL15(a[19], b[10]);
+ t[30] = MUL15(a[11], b[19])
+ + MUL15(a[12], b[18])
+ + MUL15(a[13], b[17])
+ + MUL15(a[14], b[16])
+ + MUL15(a[15], b[15])
+ + MUL15(a[16], b[14])
+ + MUL15(a[17], b[13])
+ + MUL15(a[18], b[12])
+ + MUL15(a[19], b[11]);
+ t[31] = MUL15(a[12], b[19])
+ + MUL15(a[13], b[18])
+ + MUL15(a[14], b[17])
+ + MUL15(a[15], b[16])
+ + MUL15(a[16], b[15])
+ + MUL15(a[17], b[14])
+ + MUL15(a[18], b[13])
+ + MUL15(a[19], b[12]);
+ t[32] = MUL15(a[13], b[19])
+ + MUL15(a[14], b[18])
+ + MUL15(a[15], b[17])
+ + MUL15(a[16], b[16])
+ + MUL15(a[17], b[15])
+ + MUL15(a[18], b[14])
+ + MUL15(a[19], b[13]);
+ t[33] = MUL15(a[14], b[19])
+ + MUL15(a[15], b[18])
+ + MUL15(a[16], b[17])
+ + MUL15(a[17], b[16])
+ + MUL15(a[18], b[15])
+ + MUL15(a[19], b[14]);
+ t[34] = MUL15(a[15], b[19])
+ + MUL15(a[16], b[18])
+ + MUL15(a[17], b[17])
+ + MUL15(a[18], b[16])
+ + MUL15(a[19], b[15]);
+ t[35] = MUL15(a[16], b[19])
+ + MUL15(a[17], b[18])
+ + MUL15(a[18], b[17])
+ + MUL15(a[19], b[16]);
+ t[36] = MUL15(a[17], b[19])
+ + MUL15(a[18], b[18])
+ + MUL15(a[19], b[17]);
+ t[37] = MUL15(a[18], b[19])
+ + MUL15(a[19], b[18]);
+ t[38] = MUL15(a[19], b[19]);
+ d[39] = norm13(d, t, 39); /* bring the 39 column sums down to 13-bit words; the final carry becomes the top word */
+}
+
+static void
+square20(uint32_t *d, const uint32_t *a)
+{
+ uint32_t t[39]; /* t[k] = sum of a[i]*a[j] with i + j == k; each cross product (i != j) is computed once and doubled */
+
+ t[ 0] = MUL15(a[ 0], a[ 0]);
+ t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
+ t[ 2] = MUL15(a[ 1], a[ 1])
+ + ((MUL15(a[ 0], a[ 2])) << 1);
+ t[ 3] = ((MUL15(a[ 0], a[ 3])
+ + MUL15(a[ 1], a[ 2])) << 1);
+ t[ 4] = MUL15(a[ 2], a[ 2])
+ + ((MUL15(a[ 0], a[ 4])
+ + MUL15(a[ 1], a[ 3])) << 1);
+ t[ 5] = ((MUL15(a[ 0], a[ 5])
+ + MUL15(a[ 1], a[ 4])
+ + MUL15(a[ 2], a[ 3])) << 1);
+ t[ 6] = MUL15(a[ 3], a[ 3])
+ + ((MUL15(a[ 0], a[ 6])
+ + MUL15(a[ 1], a[ 5])
+ + MUL15(a[ 2], a[ 4])) << 1);
+ t[ 7] = ((MUL15(a[ 0], a[ 7])
+ + MUL15(a[ 1], a[ 6])
+ + MUL15(a[ 2], a[ 5])
+ + MUL15(a[ 3], a[ 4])) << 1);
+ t[ 8] = MUL15(a[ 4], a[ 4])
+ + ((MUL15(a[ 0], a[ 8])
+ + MUL15(a[ 1], a[ 7])
+ + MUL15(a[ 2], a[ 6])
+ + MUL15(a[ 3], a[ 5])) << 1);
+ t[ 9] = ((MUL15(a[ 0], a[ 9])
+ + MUL15(a[ 1], a[ 8])
+ + MUL15(a[ 2], a[ 7])
+ + MUL15(a[ 3], a[ 6])
+ + MUL15(a[ 4], a[ 5])) << 1);
+ t[10] = MUL15(a[ 5], a[ 5])
+ + ((MUL15(a[ 0], a[10])
+ + MUL15(a[ 1], a[ 9])
+ + MUL15(a[ 2], a[ 8])
+ + MUL15(a[ 3], a[ 7])
+ + MUL15(a[ 4], a[ 6])) << 1);
+ t[11] = ((MUL15(a[ 0], a[11])
+ + MUL15(a[ 1], a[10])
+ + MUL15(a[ 2], a[ 9])
+ + MUL15(a[ 3], a[ 8])
+ + MUL15(a[ 4], a[ 7])
+ + MUL15(a[ 5], a[ 6])) << 1);
+ t[12] = MUL15(a[ 6], a[ 6])
+ + ((MUL15(a[ 0], a[12])
+ + MUL15(a[ 1], a[11])
+ + MUL15(a[ 2], a[10])
+ + MUL15(a[ 3], a[ 9])
+ + MUL15(a[ 4], a[ 8])
+ + MUL15(a[ 5], a[ 7])) << 1);
+ t[13] = ((MUL15(a[ 0], a[13])
+ + MUL15(a[ 1], a[12])
+ + MUL15(a[ 2], a[11])
+ + MUL15(a[ 3], a[10])
+ + MUL15(a[ 4], a[ 9])
+ + MUL15(a[ 5], a[ 8])
+ + MUL15(a[ 6], a[ 7])) << 1);
+ t[14] = MUL15(a[ 7], a[ 7])
+ + ((MUL15(a[ 0], a[14])
+ + MUL15(a[ 1], a[13])
+ + MUL15(a[ 2], a[12])
+ + MUL15(a[ 3], a[11])
+ + MUL15(a[ 4], a[10])
+ + MUL15(a[ 5], a[ 9])
+ + MUL15(a[ 6], a[ 8])) << 1);
+ t[15] = ((MUL15(a[ 0], a[15])
+ + MUL15(a[ 1], a[14])
+ + MUL15(a[ 2], a[13])
+ + MUL15(a[ 3], a[12])
+ + MUL15(a[ 4], a[11])
+ + MUL15(a[ 5], a[10])
+ + MUL15(a[ 6], a[ 9])
+ + MUL15(a[ 7], a[ 8])) << 1);
+ t[16] = MUL15(a[ 8], a[ 8])
+ + ((MUL15(a[ 0], a[16])
+ + MUL15(a[ 1], a[15])
+ + MUL15(a[ 2], a[14])
+ + MUL15(a[ 3], a[13])
+ + MUL15(a[ 4], a[12])
+ + MUL15(a[ 5], a[11])
+ + MUL15(a[ 6], a[10])
+ + MUL15(a[ 7], a[ 9])) << 1);
+ t[17] = ((MUL15(a[ 0], a[17])
+ + MUL15(a[ 1], a[16])
+ + MUL15(a[ 2], a[15])
+ + MUL15(a[ 3], a[14])
+ + MUL15(a[ 4], a[13])
+ + MUL15(a[ 5], a[12])
+ + MUL15(a[ 6], a[11])
+ + MUL15(a[ 7], a[10])
+ + MUL15(a[ 8], a[ 9])) << 1);
+ t[18] = MUL15(a[ 9], a[ 9])
+ + ((MUL15(a[ 0], a[18])
+ + MUL15(a[ 1], a[17])
+ + MUL15(a[ 2], a[16])
+ + MUL15(a[ 3], a[15])
+ + MUL15(a[ 4], a[14])
+ + MUL15(a[ 5], a[13])
+ + MUL15(a[ 6], a[12])
+ + MUL15(a[ 7], a[11])
+ + MUL15(a[ 8], a[10])) << 1);
+ t[19] = ((MUL15(a[ 0], a[19])
+ + MUL15(a[ 1], a[18])
+ + MUL15(a[ 2], a[17])
+ + MUL15(a[ 3], a[16])
+ + MUL15(a[ 4], a[15])
+ + MUL15(a[ 5], a[14])
+ + MUL15(a[ 6], a[13])
+ + MUL15(a[ 7], a[12])
+ + MUL15(a[ 8], a[11])
+ + MUL15(a[ 9], a[10])) << 1);
+ t[20] = MUL15(a[10], a[10])
+ + ((MUL15(a[ 1], a[19])
+ + MUL15(a[ 2], a[18])
+ + MUL15(a[ 3], a[17])
+ + MUL15(a[ 4], a[16])
+ + MUL15(a[ 5], a[15])
+ + MUL15(a[ 6], a[14])
+ + MUL15(a[ 7], a[13])
+ + MUL15(a[ 8], a[12])
+ + MUL15(a[ 9], a[11])) << 1);
+ t[21] = ((MUL15(a[ 2], a[19])
+ + MUL15(a[ 3], a[18])
+ + MUL15(a[ 4], a[17])
+ + MUL15(a[ 5], a[16])
+ + MUL15(a[ 6], a[15])
+ + MUL15(a[ 7], a[14])
+ + MUL15(a[ 8], a[13])
+ + MUL15(a[ 9], a[12])
+ + MUL15(a[10], a[11])) << 1);
+ t[22] = MUL15(a[11], a[11])
+ + ((MUL15(a[ 3], a[19])
+ + MUL15(a[ 4], a[18])
+ + MUL15(a[ 5], a[17])
+ + MUL15(a[ 6], a[16])
+ + MUL15(a[ 7], a[15])
+ + MUL15(a[ 8], a[14])
+ + MUL15(a[ 9], a[13])
+ + MUL15(a[10], a[12])) << 1);
+ t[23] = ((MUL15(a[ 4], a[19])
+ + MUL15(a[ 5], a[18])
+ + MUL15(a[ 6], a[17])
+ + MUL15(a[ 7], a[16])
+ + MUL15(a[ 8], a[15])
+ + MUL15(a[ 9], a[14])
+ + MUL15(a[10], a[13])
+ + MUL15(a[11], a[12])) << 1);
+ t[24] = MUL15(a[12], a[12])
+ + ((MUL15(a[ 5], a[19])
+ + MUL15(a[ 6], a[18])
+ + MUL15(a[ 7], a[17])
+ + MUL15(a[ 8], a[16])
+ + MUL15(a[ 9], a[15])
+ + MUL15(a[10], a[14])
+ + MUL15(a[11], a[13])) << 1);
+ t[25] = ((MUL15(a[ 6], a[19])
+ + MUL15(a[ 7], a[18])
+ + MUL15(a[ 8], a[17])
+ + MUL15(a[ 9], a[16])
+ + MUL15(a[10], a[15])
+ + MUL15(a[11], a[14])
+ + MUL15(a[12], a[13])) << 1);
+ t[26] = MUL15(a[13], a[13])
+ + ((MUL15(a[ 7], a[19])
+ + MUL15(a[ 8], a[18])
+ + MUL15(a[ 9], a[17])
+ + MUL15(a[10], a[16])
+ + MUL15(a[11], a[15])
+ + MUL15(a[12], a[14])) << 1);
+ t[27] = ((MUL15(a[ 8], a[19])
+ + MUL15(a[ 9], a[18])
+ + MUL15(a[10], a[17])
+ + MUL15(a[11], a[16])
+ + MUL15(a[12], a[15])
+ + MUL15(a[13], a[14])) << 1);
+ t[28] = MUL15(a[14], a[14])
+ + ((MUL15(a[ 9], a[19])
+ + MUL15(a[10], a[18])
+ + MUL15(a[11], a[17])
+ + MUL15(a[12], a[16])
+ + MUL15(a[13], a[15])) << 1);
+ t[29] = ((MUL15(a[10], a[19])
+ + MUL15(a[11], a[18])
+ + MUL15(a[12], a[17])
+ + MUL15(a[13], a[16])
+ + MUL15(a[14], a[15])) << 1);
+ t[30] = MUL15(a[15], a[15])
+ + ((MUL15(a[11], a[19])
+ + MUL15(a[12], a[18])
+ + MUL15(a[13], a[17])
+ + MUL15(a[14], a[16])) << 1);
+ t[31] = ((MUL15(a[12], a[19])
+ + MUL15(a[13], a[18])
+ + MUL15(a[14], a[17])
+ + MUL15(a[15], a[16])) << 1);
+ t[32] = MUL15(a[16], a[16])
+ + ((MUL15(a[13], a[19])
+ + MUL15(a[14], a[18])
+ + MUL15(a[15], a[17])) << 1);
+ t[33] = ((MUL15(a[14], a[19])
+ + MUL15(a[15], a[18])
+ + MUL15(a[16], a[17])) << 1);
+ t[34] = MUL15(a[17], a[17])
+ + ((MUL15(a[15], a[19])
+ + MUL15(a[16], a[18])) << 1);
+ t[35] = ((MUL15(a[16], a[19])
+ + MUL15(a[17], a[18])) << 1);
+ t[36] = MUL15(a[18], a[18])
+ + ((MUL15(a[17], a[19])) << 1);
+ t[37] = ((MUL15(a[18], a[19])) << 1);
+ t[38] = MUL15(a[19], a[19]);
+ d[39] = norm13(d, t, 39); /* bring the 39 column sums down to 13-bit words; the final carry becomes the top word */
+}
+
+#endif
+
+/*
+ * Modulus for field F256 (field for point coordinates in curve P-256).
+ *
+ * Value is p = 2^256 - 2^224 + 2^192 + 2^96 - 1, encoded as 20 words
+ * of 13 bits each, in little-endian order (word i holds bits 13*i to
+ * 13*i+12).
+ */
+static const uint32_t F256[] = {
+ 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
+ 0x0000, 0x1FF8, 0x1FFF, 0x01FF
+};
+
+/*
+ * The 'b' curve equation coefficient for P-256.
+ *
+ * The table has 20 entries, matching the 20-word layout of F256;
+ * presumably the same 13-bit little-endian encoding (to be confirmed
+ * against the code that uses it).
+ */
+static const uint32_t P256_B[] = {
+ 0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
+ 0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
+ 0x0A3A, 0x0EC5, 0x118D, 0x00B5
+};
+
+/*
+ * Perform a "short reduction" in field F256 (field for curve P-256).
+ * The source value should be less than 262 bits; on output, it will
+ * be at most 257 bits, and less than twice the modulus.
+ *
+ * The bits at offset 256 and above (x) are removed from the top word
+ * and reinjected using 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p; with
+ * 13-bit words, bit 224 is bit 3 of word 17, bit 192 is bit 10 of
+ * word 14, and bit 96 is bit 5 of word 7. A carry propagation follows;
+ * its final carry is not used.
+ */
+static void
+reduce_f256(uint32_t *d)
+{
+ uint32_t x;
+
+ x = d[19] >> 9; /* word 19 starts at bit 247, so this is the value above bit 255 */
+ d[19] &= 0x01FF;
+ d[17] += x << 3; /* + x * 2^224 */
+ d[14] -= x << 10; /* - x * 2^192 */
+ d[7] -= x << 5; /* - x * 2^96 */
+ d[0] += x; /* + x */
+ norm13(d, d, 20);
+}
+
+/*
+ * Perform a "final reduction" in field F256 (field for curve P-256).
+ * The source value must be less than twice the modulus. If the value
+ * is not lower than the modulus, then the modulus is subtracted and
+ * this function returns 1; otherwise, it leaves it untouched and it
+ * returns 0.
+ *
+ * The subtraction d - F256 is always computed into a temporary; the
+ * final borrow then decides, through CCOPY, whether the temporary
+ * replaces d.
+ */
+static uint32_t
+reduce_final_f256(uint32_t *d)
+{
+ uint32_t t[20];
+ uint32_t cc;
+ int i;
+
+ memcpy(t, d, sizeof t);
+ cc = 0;
+ for (i = 0; i < 20; i ++) {
+ uint32_t w;
+
+ w = t[i] - F256[i] - cc;
+ cc = w >> 31; /* borrow: 1 if this word subtraction went below zero */
+ t[i] = w & 0x1FFF;
+ }
+ cc ^= 1; /* cc is now 1 if there was no final borrow, i.e. d >= modulus */
+ CCOPY(cc, d, t, sizeof t); /* CCOPY is defined outside this file; presumably copies t into d only when cc is 1 (confirm in inner.h) */
+ return cc;
+}
+
+/*
+ * Perform a multiplication of two integers modulo
+ * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
+ * of 20 words, each containing 13 bits of data, in little-endian order.
+ * On input, upper word may be up to 13 bits (hence value up to 2^260-1);
+ * on output, value fits on 257 bits and is lower than twice the modulus.
+ */
+static void
+mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ uint32_t t[40], cc;
+ int i;
+
+ /*
+ * Compute raw multiplication. All result words fit in 13 bits
+ * each.
+ */
+ mul20(t, a, b);
+
+ /*
+ * Modular reduction: each high word is added/subtracted where
+ * necessary.
+ *
+ * The modulus is:
+ * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
+ * Therefore:
+ * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+ *
+ * For a word x at bit offset n (n >= 256), we have:
+ * x*2^n = x*2^(n-32) - x*2^(n-64)
+ * - x*2^(n - 160) + x*2^(n-256) mod p
+ *
+ * Thus, we can nullify the high word if we reinject it at some
+ * proper emplacements.
+ *
+ * Since words are 13 bits wide, each bit displacement is split
+ * into a word offset and a shift: 32 = 2*13 + 6, so x*2^(n-32)
+ * contributes (x >> 6) to word i-2 and ((x << 7) & 0x1FFF) to
+ * word i-3. The same split applies to 64 = 4*13 + 12,
+ * 160 = 12*13 + 4 and 256 = 19*13 + 9.
+ */
+ for (i = 39; i >= 20; i --) {
+ uint32_t x;
+
+ x = t[i];
+ t[i - 2] += ARSH(x, 6);
+ t[i - 3] += (x << 7) & 0x1FFF;
+ t[i - 4] -= ARSH(x, 12);
+ t[i - 5] -= (x << 1) & 0x1FFF;
+ t[i - 12] -= ARSH(x, 4);
+ t[i - 13] -= (x << 9) & 0x1FFF;
+ t[i - 19] += ARSH(x, 9);
+ t[i - 20] += (x << 4) & 0x1FFF;
+ }
+
+ /*
+ * Propagate carries. This is a signed propagation, and the
+ * result may be negative. The loop above may enlarge values,
+ * but not too much: worst case is the chain involving t[i - 3],
+ * in which a value may be added to itself up to 7 times. Since
+ * starting values are 13-bit each, all words fit on 20 bits
+ * (21 to account for the sign bit).
+ */
+ cc = norm13(t, t, 20);
+
+ /*
+ * Perform modular reduction again for the bits beyond 256 (the carry
+ * and the bits 256..259). Since the largest shift below is by 10
+ * bits, and the values fit on 21 bits, values fit in 32-bit words,
+ * thereby allowing injecting full word values.
+ */
+ cc = (cc << 4) | (t[19] >> 9);
+ t[19] &= 0x01FF;
+ t[17] += cc << 3;
+ t[14] -= cc << 10;
+ t[7] -= cc << 5;
+ t[0] += cc;
+
+ /*
+ * If the carry is negative, then after carry propagation, we may
+ * end up with a value which is negative, and we don't want that.
+ * Thus, in that case, we add the modulus. Note that the subtraction
+ * result, when the carry is negative, is always smaller than the
+ * modulus, so the extra addition will not make the value exceed
+ * twice the modulus.
+ */
+ cc >>= 31; /* cc is unsigned: this keeps only the sign bit, as 0 or 1 */
+ t[0] -= cc; /* - 1 */
+ t[7] += cc << 5; /* + 2^96 */
+ t[14] += cc << 10; /* + 2^192 */
+ t[17] -= cc << 3; /* - 2^224 */
+ t[19] += cc << 9; /* + 2^256 */
+
+ norm13(d, t, 20);
+}
+
+/*
+ * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve
+ * P-256). Operand is an array of 20 words, each containing 13 bits of
+ * data, in little-endian order. On input, upper word may be up to 13
+ * bits (hence value up to 2^260-1); on output, value fits on 257 bits
+ * and is lower than twice the modulus.
+ *
+ * d receives the square of a. The destination may be the same array
+ * as the operand: the raw square goes to a local 40-word buffer and d
+ * is only written by the last normalisation step. Apart from the call
+ * to square20() instead of mul20(), the reduction steps are the same
+ * as in mul_f256().
+ */
+static void
+square_f256(uint32_t *d, const uint32_t *a)
+{
+ uint32_t t[40], cc;
+ int i;
+
+ /*
+ * Compute raw square. All result words fit in 13 bits each.
+ */
+ square20(t, a);
+
+ /*
+ * Modular reduction: each high word is added/subtracted where
+ * necessary.
+ *
+ * The modulus is:
+ * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
+ * Therefore:
+ * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+ *
+ * For a word x at bit offset n (n >= 256), we have:
+ * x*2^n = x*2^(n-32) - x*2^(n-64)
+ * - x*2^(n - 160) + x*2^(n-256) mod p
+ *
+ * Thus, we can nullify the high word if we reinject it at some
+ * proper emplacements.
+ *
+ * Since words hold 13 bits, each of the four reinjection offsets
+ * falls in the middle of a word: the value is therefore split in
+ * two parts (an ARSH() part for the upper word and a masked left
+ * shift for the lower word), which gives the eight updates below.
+ */
+ for (i = 39; i >= 20; i --) {
+ uint32_t x;
+
+ x = t[i];
+ t[i - 2] += ARSH(x, 6);
+ t[i - 3] += (x << 7) & 0x1FFF;
+ t[i - 4] -= ARSH(x, 12);
+ t[i - 5] -= (x << 1) & 0x1FFF;
+ t[i - 12] -= ARSH(x, 4);
+ t[i - 13] -= (x << 9) & 0x1FFF;
+ t[i - 19] += ARSH(x, 9);
+ t[i - 20] += (x << 4) & 0x1FFF;
+ }
+
+ /*
+ * Propagate carries. This is a signed propagation, and the
+ * result may be negative. The loop above may enlarge values,
+ * but not too much: worst case is the chain involving t[i - 3],
+ * in which a value may be added to itself up to 7 times. Since
+ * starting values are 13-bit each, all words fit on 20 bits
+ * (21 to account for the sign bit).
+ */
+ cc = norm13(t, t, 20);
+
+ /*
+ * Perform modular reduction again for the bits beyond 256 (the carry
+ * and the bits 256..259). Since the largest shift below is by 10
+ * bits, and the values fit on 21 bits, values fit in 32-bit words,
+ * thereby allowing injecting full word values.
+ */
+ cc = (cc << 4) | (t[19] >> 9);
+ t[19] &= 0x01FF;
+ t[17] += cc << 3;
+ t[14] -= cc << 10;
+ t[7] -= cc << 5;
+ t[0] += cc;
+
+ /*
+ * If the carry is negative, then after carry propagation, we may
+ * end up with a value which is negative, and we don't want that.
+ * Thus, in that case, we add the modulus. Note that the subtraction
+ * result, when the carry is negative, is always smaller than the
+ * modulus, so the extra addition will not make the value exceed
+ * twice the modulus.
+ *
+ * After the shift, cc is 1 if the carry was negative, 0 otherwise;
+ * the five updates below are the words of the modulus scaled by
+ * that flag, so they are executed unconditionally.
+ */
+ cc >>= 31;
+ t[0] -= cc;
+ t[7] += cc << 5;
+ t[14] += cc << 10;
+ t[17] -= cc << 3;
+ t[19] += cc << 9;
+
+ norm13(d, t, 20);
+}
+
+/*
+ * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
+ * are such that:
+ * X = x / z^2
+ * Y = y / z^3
+ * For the point at infinity, z = 0.
+ * Each point thus admits many possible representations.
+ *
+ * Coordinates are represented in arrays of 32-bit integers, each holding
+ * 13 bits of data. Values may also be slightly greater than the modulus,
+ * but they will always be lower than twice the modulus.
+ *
+ * Each coordinate uses 20 words in little-endian order (word 0 holds
+ * the 13 least significant bits); for instance, the value 1 is encoded
+ * as word 0 set to 1 and all other words set to 0.
+ */
+typedef struct {
+ uint32_t x[20];
+ uint32_t y[20];
+ uint32_t z[20];
+} p256_jacobian;
+
+/*
+ * Convert a point to affine coordinates:
+ * - If the point is the point at infinity, then all three coordinates
+ * are set to 0.
+ * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
+ * coordinates are the 'X' and 'Y' affine coordinates.
+ * The coordinates are guaranteed to be lower than the modulus.
+ *
+ * The conversion is done in place on P. The sequence of squarings and
+ * multiplications is fixed: loop bounds and multiplication points
+ * depend only on loop counters, not on the coordinate values.
+ */
+static void
+p256_to_affine(p256_jacobian *P)
+{
+ uint32_t t1[20], t2[20];
+ int i;
+
+ /*
+ * Invert z with a modular exponentiation: the modulus is
+ * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
+ * p-2. Exponent bit pattern (from high to low) is:
+ * - 32 bits of value 1
+ * - 31 bits of value 0
+ * - 1 bit of value 1
+ * - 96 bits of value 0
+ * - 94 bits of value 1
+ * - 1 bit of value 0
+ * - 1 bit of value 1
+ * Thus, we precompute z^(2^31-1) to speed things up.
+ *
+ * If z = 0 (point at infinity) then the modular exponentiation
+ * will yield 0, which leads to the expected result (all three
+ * coordinates set to 0).
+ */
+
+ /*
+ * A simple square-and-multiply for z^(2^31-1). We could save about
+ * two dozen multiplications here with an addition chain, but
+ * this would require a bit more code, and extra stack buffers.
+ */
+ memcpy(t1, P->z, sizeof P->z);
+ for (i = 0; i < 30; i ++) {
+ square_f256(t1, t1);
+ mul_f256(t1, t1, P->z);
+ }
+
+ /*
+ * Square-and-multiply. Apart from the squarings, we have a few
+ * multiplications to set bits to 1; we multiply by the original z
+ * for setting 1 bit, and by t1 for setting 31 bits.
+ *
+ * t2 starts as z, which accounts for the top exponent bit; the
+ * loop index i then counts the remaining 255 bits, from high to
+ * low. Indices 31, 190, 221 and 252 are the positions at which a
+ * run of 31 consecutive 1 bits ends (multiplication by t1);
+ * indices 63, 253 and 255 are positions where a single extra 1
+ * bit is set (multiplication by z).
+ */
+ memcpy(t2, P->z, sizeof P->z);
+ for (i = 1; i < 256; i ++) {
+ square_f256(t2, t2);
+ switch (i) {
+ case 31:
+ case 190:
+ case 221:
+ case 252:
+ mul_f256(t2, t2, t1);
+ break;
+ case 63:
+ case 253:
+ case 255:
+ mul_f256(t2, t2, P->z);
+ break;
+ }
+ }
+
+ /*
+ * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
+ */
+ mul_f256(t1, t2, t2);
+ mul_f256(P->x, t1, P->x);
+ mul_f256(t1, t1, t2);
+ mul_f256(P->y, t1, P->y);
+ reduce_final_f256(P->x);
+ reduce_final_f256(P->y);
+
+ /*
+ * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
+ * this will set z to 1.
+ */
+ mul_f256(P->z, P->z, t2);
+ reduce_final_f256(P->z);
+}
+
+/*
+ * Double a point in P-256. This function works for all valid points,
+ * including the point at infinity.
+ *
+ * Q is replaced with 2*Q (in Jacobian coordinates). Wherever a value
+ * is subtracted below, a multiple of the modulus (F256[] words shifted
+ * left) is added first, presumably so that the intermediate value
+ * stays non-negative before norm13()/reduce_f256() are applied; both
+ * helpers are defined outside of this section.
+ */
+static void
+p256_double(p256_jacobian *Q)
+{
+ /*
+ * Doubling formulas are:
+ *
+ * s = 4*x*y^2
+ * m = 3*(x + z^2)*(x - z^2)
+ * x' = m^2 - 2*s
+ * y' = m*(s - x') - 8*y^4
+ * z' = 2*y*z
+ *
+ * These formulas work for all points, including points of order 2
+ * and points at infinity:
+ * - If y = 0 then z' = 0. But there is no such point in P-256
+ * anyway.
+ * - If z = 0 then z' = 0.
+ */
+ uint32_t t1[20], t2[20], t3[20], t4[20];
+ int i;
+
+ /*
+ * Compute z^2 in t1.
+ */
+ square_f256(t1, Q->z);
+
+ /*
+ * Compute x-z^2 in t2 and x+z^2 in t1.
+ */
+ for (i = 0; i < 20; i ++) {
+ t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
+ t1[i] += Q->x[i];
+ }
+ norm13(t1, t1, 20);
+ norm13(t2, t2, 20);
+
+ /*
+ * Compute 3*(x+z^2)*(x-z^2) in t1.
+ */
+ mul_f256(t3, t1, t2);
+ for (i = 0; i < 20; i ++) {
+ t1[i] = MUL15(3, t3[i]);
+ }
+ norm13(t1, t1, 20);
+
+ /*
+ * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
+ */
+ square_f256(t3, Q->y);
+ for (i = 0; i < 20; i ++) {
+ t3[i] <<= 1;
+ }
+ norm13(t3, t3, 20);
+ mul_f256(t2, Q->x, t3);
+ for (i = 0; i < 20; i ++) {
+ t2[i] <<= 1;
+ }
+ norm13(t2, t2, 20);
+ reduce_f256(t2);
+
+ /*
+ * Compute x' = m^2 - 2*s.
+ */
+ square_f256(Q->x, t1);
+ for (i = 0; i < 20; i ++) {
+ Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
+ }
+ norm13(Q->x, Q->x, 20);
+ reduce_f256(Q->x);
+
+ /*
+ * Compute z' = 2*y*z. This must be done before Q->y is
+ * overwritten below, since the old y is an operand here.
+ */
+ mul_f256(t4, Q->y, Q->z);
+ for (i = 0; i < 20; i ++) {
+ Q->z[i] = t4[i] << 1;
+ }
+ norm13(Q->z, Q->z, 20);
+ reduce_f256(Q->z);
+
+ /*
+ * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+ * 2*y^2 in t3.
+ */
+ for (i = 0; i < 20; i ++) {
+ t2[i] += (F256[i] << 1) - Q->x[i];
+ }
+ norm13(t2, t2, 20);
+ mul_f256(Q->y, t1, t2);
+ square_f256(t4, t3);
+ for (i = 0; i < 20; i ++) {
+ Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
+ }
+ norm13(Q->y, Q->y, 20);
+ reduce_f256(Q->y);
+}
+
+/*
+ * Add point P2 to point P1.
+ *
+ * P1 is replaced with the sum; P2 is only read.
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ * - If P1 == 0 but P2 != 0
+ * - If P1 != 0 but P2 == 0
+ * - If P1 == P2
+ *
+ * In all three cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ * - P1 and P2 have the same Y coordinate
+ * - P1 == 0 and P2 == 0
+ * - The Y coordinate of one of the points is 0 and the other point is
+ * the point at infinity.
+ *
+ * The third case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
+ * can apply the following:
+ *
+ * - If the result is not the point at infinity, then it is correct.
+ * - Otherwise, if the returned value is 1, then this is a case of
+ * P1+P2 == 0, so the result is indeed the point at infinity.
+ * - Otherwise, P1 == P2, so a "double" operation should have been
+ * performed.
+ */
+static uint32_t
+p256_add(p256_jacobian *P1, const p256_jacobian *P2)
+{
+ /*
+ * Addition formulas are:
+ *
+ * u1 = x1 * z2^2
+ * u2 = x2 * z1^2
+ * s1 = y1 * z2^3
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1 * z2
+ */
+ uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
+ uint32_t ret;
+ int i;
+
+ /*
+ * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+ */
+ square_f256(t3, P2->z);
+ mul_f256(t1, P1->x, t3);
+ mul_f256(t4, P2->z, t3);
+ mul_f256(t3, P1->y, t4);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ square_f256(t4, P1->z);
+ mul_f256(t2, P2->x, t4);
+ mul_f256(t5, P1->z, t4);
+ mul_f256(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ * We need to test whether r is zero, so we will do some extra
+ * reduce.
+ *
+ * The returned flag is derived from r: all words of the fully
+ * reduced r are ORed together, and (ret | -ret) >> 31 maps any
+ * non-zero value to 1 and zero to 0, without a branch.
+ */
+ for (i = 0; i < 20; i ++) {
+ t2[i] += (F256[i] << 1) - t1[i];
+ t4[i] += (F256[i] << 1) - t3[i];
+ }
+ norm13(t2, t2, 20);
+ norm13(t4, t4, 20);
+ reduce_f256(t4);
+ reduce_final_f256(t4);
+ ret = 0;
+ for (i = 0; i < 20; i ++) {
+ ret |= t4[i];
+ }
+ ret = (ret | -ret) >> 31;
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ square_f256(t7, t2);
+ mul_f256(t6, t1, t7);
+ mul_f256(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ square_f256(P1->x, t4);
+ for (i = 0; i < 20; i ++) {
+ P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
+ }
+ norm13(P1->x, P1->x, 20);
+ reduce_f256(P1->x);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ for (i = 0; i < 20; i ++) {
+ t6[i] += (F256[i] << 1) - P1->x[i];
+ }
+ norm13(t6, t6, 20);
+ mul_f256(P1->y, t4, t6);
+ mul_f256(t1, t5, t3);
+ for (i = 0; i < 20; i ++) {
+ P1->y[i] += (F256[i] << 1) - t1[i];
+ }
+ norm13(P1->y, P1->y, 20);
+ reduce_f256(P1->y);
+
+ /*
+ * Compute z3 = h*z1*z2.
+ */
+ mul_f256(t1, P1->z, P2->z);
+ mul_f256(P1->z, t1, t2);
+
+ return ret;
+}
+
+/*
+ * Add point P2 to point P1. This is a specialised function for the
+ * case when P2 is a non-zero point in affine coordinate.
+ *
+ * P1 is replaced with the sum. Only the x and y coordinates of P2 are
+ * read; its z coordinate is not accessed (it is implicitly 1).
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ * - If P1 == 0
+ * - If P1 == P2
+ *
+ * In both cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ * - P1 and P2 have the same Y coordinate
+ * - The Y coordinate of P2 is 0 and P1 is the point at infinity.
+ *
+ * The second case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 on input, then the caller
+ * can apply the following:
+ *
+ * - If the result is not the point at infinity, then it is correct.
+ * - Otherwise, if the returned value is 1, then this is a case of
+ * P1+P2 == 0, so the result is indeed the point at infinity.
+ * - Otherwise, P1 == P2, so a "double" operation should have been
+ * performed.
+ */
+static uint32_t
+p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
+{
+ /*
+ * Addition formulas are:
+ *
+ * u1 = x1
+ * u2 = x2 * z1^2
+ * s1 = y1
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1
+ */
+ uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
+ uint32_t ret;
+ int i;
+
+ /*
+ * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+ */
+ memcpy(t1, P1->x, sizeof t1);
+ memcpy(t3, P1->y, sizeof t3);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ square_f256(t4, P1->z);
+ mul_f256(t2, P2->x, t4);
+ mul_f256(t5, P1->z, t4);
+ mul_f256(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ * We need to test whether r is zero, so we will do some extra
+ * reduce.
+ *
+ * The returned flag is derived from r: all words of the fully
+ * reduced r are ORed together, and (ret | -ret) >> 31 maps any
+ * non-zero value to 1 and zero to 0, without a branch.
+ */
+ for (i = 0; i < 20; i ++) {
+ t2[i] += (F256[i] << 1) - t1[i];
+ t4[i] += (F256[i] << 1) - t3[i];
+ }
+ norm13(t2, t2, 20);
+ norm13(t4, t4, 20);
+ reduce_f256(t4);
+ reduce_final_f256(t4);
+ ret = 0;
+ for (i = 0; i < 20; i ++) {
+ ret |= t4[i];
+ }
+ ret = (ret | -ret) >> 31;
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ square_f256(t7, t2);
+ mul_f256(t6, t1, t7);
+ mul_f256(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ square_f256(P1->x, t4);
+ for (i = 0; i < 20; i ++) {
+ P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
+ }
+ norm13(P1->x, P1->x, 20);
+ reduce_f256(P1->x);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ for (i = 0; i < 20; i ++) {
+ t6[i] += (F256[i] << 1) - P1->x[i];
+ }
+ norm13(t6, t6, 20);
+ mul_f256(P1->y, t4, t6);
+ mul_f256(t1, t5, t3);
+ for (i = 0; i < 20; i ++) {
+ P1->y[i] += (F256[i] << 1) - t1[i];
+ }
+ norm13(P1->y, P1->y, 20);
+ reduce_f256(P1->y);
+
+ /*
+ * Compute z3 = h*z1 (z2 is implicitly 1, so no multiplication
+ * by P2->z is needed).
+ */
+ mul_f256(P1->z, P1->z, t2);
+
+ return ret;
+}
+
+/*
+ * Decode a P-256 point. This function does not support the point at
+ * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
+ *
+ * src must contain exactly 65 bytes: the format byte 0x04, followed by
+ * the X coordinate (32 bytes) and the Y coordinate (32 bytes).
+ *
+ * WARNING: if len is not 65, this function returns 0 immediately and
+ * does NOT write anything into P; the caller must not use P in that
+ * case. For any 65-byte input, P is always filled (x and y from the
+ * input, z set to 1), even when the returned value is 0: all other
+ * checks (format byte, coordinate range, curve equation) are
+ * accumulated in the 'bad' flag instead of using early exits.
+ */
+static uint32_t
+p256_decode(p256_jacobian *P, const void *src, size_t len)
+{
+ const unsigned char *buf;
+ uint32_t tx[20], ty[20], t1[20], t2[20];
+ uint32_t bad;
+ int i;
+
+ if (len != 65) {
+ return 0;
+ }
+ buf = src;
+
+ /*
+ * First byte must be 0x04 (uncompressed format). We could support
+ * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
+ * least significant bit of the Y coordinate), but it is explicitly
+ * forbidden by RFC 5480 (section 2.2).
+ */
+ bad = NEQ(buf[0], 0x04);
+
+ /*
+ * Decode the coordinates, and check that they are both lower
+ * than the modulus.
+ *
+ * reduce_final_f256() returns 1 when its operand was not lower
+ * than the modulus, which is why its result is ORed into 'bad'.
+ * The value returned by be8_to_le13() (defined outside of this
+ * section) is stored as the top word; presumably these are the
+ * remaining high bits of the decoded integer -- to be confirmed.
+ */
+ tx[19] = be8_to_le13(tx, buf + 1, 32);
+ ty[19] = be8_to_le13(ty, buf + 33, 32);
+ bad |= reduce_final_f256(tx);
+ bad |= reduce_final_f256(ty);
+
+ /*
+ * Check curve equation.
+ *
+ * The loop computes x^3 - 3*x + b - y^2 (with a multiple of the
+ * modulus added to each word before the subtractions); after
+ * full reduction the result must be zero, so any non-zero word
+ * is ORed into 'bad'.
+ */
+ square_f256(t1, tx);
+ mul_f256(t1, tx, t1);
+ square_f256(t2, ty);
+ for (i = 0; i < 20; i ++) {
+ t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
+ }
+ norm13(t1, t1, 20);
+ reduce_f256(t1);
+ reduce_final_f256(t1);
+ for (i = 0; i < 20; i ++) {
+ bad |= t1[i];
+ }
+
+ /*
+ * Copy coordinates to the point structure.
+ */
+ memcpy(P->x, tx, sizeof tx);
+ memcpy(P->y, ty, sizeof ty);
+ memset(P->z, 0, sizeof P->z);
+ P->z[0] = 1;
+ return EQ(bad, 0);
+}
+
+/*
+ * Serialise a point in uncompressed format: one byte of value 0x04,
+ * then the X coordinate over 32 bytes, then the Y coordinate over
+ * 32 bytes (65 bytes in total are written to dst). The point must be
+ * valid, already converted to affine coordinates, and must not be the
+ * point at infinity.
+ */
+static void
+p256_encode(void *dst, const p256_jacobian *P)
+{
+	unsigned char *out = dst;
+
+	*out = 0x04;
+	le13_to_be8(out + 1, 32, P->x);
+	le13_to_be8(out + 1 + 32, 32, P->y);
+}
+
+/*
+ * Multiply a curve point by an integer. The integer is assumed to be
+ * lower than the curve order, and the base point must not be the point
+ * at infinity.
+ *
+ * P is replaced with the product. The multiplier is read from x[0] to
+ * x[xlen - 1], most significant bits of each byte first (i.e. unsigned
+ * big-endian encoding). Every 2-bit group goes through the same
+ * sequence of doublings, one addition and conditional copies,
+ * whatever the bit values.
+ */
+static void
+p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
+{
+ /*
+ * qz is a flag that is initially 1, and remains equal to 1
+ * as long as the point is the point at infinity.
+ *
+ * We use a 2-bit window to handle multiplier bits by pairs.
+ * The precomputed window really is the points P2 and P3.
+ */
+ uint32_t qz;
+ p256_jacobian P2, P3, Q, T, U;
+
+ /*
+ * Compute window values.
+ */
+ P2 = *P;
+ p256_double(&P2);
+ P3 = *P;
+ p256_add(&P3, &P2);
+
+ /*
+ * We start with Q = 0. We process multiplier bits 2 by 2.
+ *
+ * For each pair of bits: Q is doubled twice; T is set to the
+ * window point selected by the bits (P, P2 or P3; P when the
+ * bits are 0, in which case T is not used); U receives Q + T.
+ * Then Q becomes T if the bits are non-zero and Q was still the
+ * point at infinity, U if the bits are non-zero and Q was not
+ * the point at infinity, and is left unchanged if the bits are 0.
+ */
+ memset(&Q, 0, sizeof Q);
+ qz = 1;
+ while (xlen -- > 0) {
+ int k;
+
+ for (k = 6; k >= 0; k -= 2) {
+ uint32_t bits;
+ uint32_t bnz;
+
+ p256_double(&Q);
+ p256_double(&Q);
+ T = *P;
+ U = Q;
+ bits = (*x >> k) & (uint32_t)3;
+ bnz = NEQ(bits, 0);
+ CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
+ CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
+ p256_add(&U, &T);
+ CCOPY(bnz & qz, &Q, &T, sizeof Q);
+ CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
+ qz &= ~bnz;
+ }
+ x ++;
+ }
+ *P = Q;
+}
+
+/*
+ * Precomputed window: k*G points, where G is the curve generator, and k
+ * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
+ * the point are encoded as 20 words of 13 bits each (little-endian
+ * order); 13-bit words are then grouped 2-by-2 into 32-bit words
+ * (little-endian order within each word).
+ *
+ * Layout of each row, as unpacked by lookup_Gwin(): Gwin[k - 1] is the
+ * point k*G; entries 0 to 9 hold the X coordinate and entries 10 to 19
+ * hold the Y coordinate. Within an entry, the low 16 bits contain the
+ * even-indexed 13-bit word and the high 16 bits contain the following
+ * odd-indexed word.
+ */
+static const uint32_t Gwin[15][20] = {
+
+ { 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC,
+ 0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2,
+ 0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D,
+ 0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785,
+ 0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 },
+
+ { 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8,
+ 0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29,
+ 0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748,
+ 0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36,
+ 0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 },
+
+ { 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6,
+ 0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323,
+ 0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8,
+ 0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB,
+ 0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 },
+
+ { 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156,
+ 0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80,
+ 0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6,
+ 0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D,
+ 0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 },
+
+ { 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F,
+ 0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E,
+ 0x1BD21151, 0x00A21642, 0x0D0B0DA4, 0x0176113F,
+ 0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49,
+ 0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F },
+
+ { 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8,
+ 0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B,
+ 0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3,
+ 0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE,
+ 0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 },
+
+ { 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F,
+ 0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896,
+ 0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0,
+ 0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400,
+ 0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 },
+
+ { 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A,
+ 0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01,
+ 0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83,
+ 0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8,
+ 0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 },
+
+ { 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7,
+ 0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E,
+ 0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293,
+ 0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832,
+ 0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 },
+
+ { 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120,
+ 0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964,
+ 0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91,
+ 0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304,
+ 0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 },
+
+ { 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541,
+ 0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418,
+ 0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A,
+ 0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2,
+ 0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 },
+
+ { 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0,
+ 0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918,
+ 0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3,
+ 0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55,
+ 0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D },
+
+ { 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB,
+ 0x03C90E28, 0x10BD18BF, 0x002E1F2D, 0x092B0986,
+ 0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB,
+ 0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C,
+ 0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 },
+
+ { 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A,
+ 0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9,
+ 0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8,
+ 0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0,
+ 0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C },
+
+ { 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8,
+ 0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7,
+ 0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783,
+ 0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9,
+ 0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F }
+};
+
+/*
+ * Fetch the window point idx*G (idx from 1 to 15) from Gwin[] into T,
+ * as an affine point (z is set to 1). All 15 rows are always read and
+ * combined through masks, so the memory access pattern does not depend
+ * on idx. If idx matches no row (e.g. idx = 0), the x and y
+ * coordinates of T are set to all-zeros.
+ */
+static void
+lookup_Gwin(p256_jacobian *T, uint32_t idx)
+{
+	uint32_t packed[20];
+	uint32_t row;
+	size_t j;
+
+	memset(packed, 0, sizeof packed);
+
+	/*
+	 * Accumulate every row under a mask which is all-ones for the
+	 * requested row only, and zero for the others.
+	 */
+	for (row = 1; row <= 15; row ++) {
+		const uint32_t *src = Gwin[row - 1];
+		uint32_t mask = -EQ(idx, row);
+
+		for (j = 0; j < 20; j ++) {
+			packed[j] |= mask & src[j];
+		}
+	}
+
+	/*
+	 * Unpack: each 32-bit entry carries two 13-bit words (low half
+	 * first); entries 0..9 are X, entries 10..19 are Y.
+	 */
+	for (j = 0; j < 10; j ++) {
+		uint32_t wx = packed[j];
+		uint32_t wy = packed[j + 10];
+
+		T->x[2 * j] = wx & 0xFFFF;
+		T->x[2 * j + 1] = wx >> 16;
+		T->y[2 * j] = wy & 0xFFFF;
+		T->y[2 * j + 1] = wy >> 16;
+	}
+
+	memset(T->z, 0, sizeof T->z);
+	T->z[0] = 1;
+}
+
+/*
+ * Multiply the generator by an integer. The integer is assumed non-zero
+ * and lower than the curve order.
+ *
+ * The result is written to P (previous contents of P are not read).
+ * The multiplier is read from x[0] to x[xlen - 1], high nibble of each
+ * byte first (i.e. unsigned big-endian encoding). Every nibble goes
+ * through the same sequence of four doublings, one table lookup, one
+ * mixed addition and conditional copies, whatever its value.
+ */
+static void
+p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
+{
+ /*
+ * qz is a flag that is initially 1, and remains equal to 1
+ * as long as the point is the point at infinity.
+ *
+ * We use a 4-bit window to handle multiplier bits by groups
+ * of 4. The precomputed window is constant static data, with
+ * points in affine coordinates; we use a constant-time lookup.
+ *
+ * For each nibble: T is the looked-up window point and U
+ * receives Q + T. Then Q becomes T if the nibble is non-zero and
+ * Q was still the point at infinity, U if the nibble is non-zero
+ * and Q was not the point at infinity, and is left unchanged if
+ * the nibble is 0.
+ */
+ p256_jacobian Q;
+ uint32_t qz;
+
+ memset(&Q, 0, sizeof Q);
+ qz = 1;
+ while (xlen -- > 0) {
+ int k;
+ unsigned bx;
+
+ bx = *x ++;
+ for (k = 0; k < 2; k ++) {
+ uint32_t bits;
+ uint32_t bnz;
+ p256_jacobian T, U;
+
+ p256_double(&Q);
+ p256_double(&Q);
+ p256_double(&Q);
+ p256_double(&Q);
+ bits = (bx >> 4) & 0x0F;
+ bnz = NEQ(bits, 0);
+ lookup_Gwin(&T, bits);
+ U = Q;
+ p256_add_mixed(&U, &T);
+ CCOPY(bnz & qz, &Q, &T, sizeof Q);
+ CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
+ qz &= ~bnz;
+ bx <<= 4;
+ }
+ }
+ *P = Q;
+}
+
+/*
+ * Encoded curve generator, as returned by api_generator(): 65 bytes,
+ * i.e. the format byte 0x04 followed by two 32-byte values (X then Y
+ * coordinate, in the same layout as produced by p256_encode()).
+ */
+static const unsigned char P256_G[] = {
+ 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
+ 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
+ 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
+ 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
+ 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
+ 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
+ 0x68, 0x37, 0xBF, 0x51, 0xF5
+};
+
+/*
+ * Value returned by api_order(): 32 bytes (presumably the curve
+ * subgroup order in unsigned big-endian encoding -- to be confirmed
+ * against the curve parameters).
+ */
+static const unsigned char P256_N[] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
+ 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
+ 0x25, 0x51
+};
+
+/*
+ * Return a pointer to the encoded generator (P256_G) and store its
+ * length in bytes in *len. The 'curve' parameter is ignored: the same
+ * constant is returned whatever its value.
+ */
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+ (void)curve;
+ *len = sizeof P256_G;
+ return P256_G;
+}
+
+/*
+ * Return a pointer to the P256_N constant and store its length in
+ * bytes in *len. The 'curve' parameter is ignored: the same constant
+ * is returned whatever its value.
+ */
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+ (void)curve;
+ *len = sizeof P256_N;
+ return P256_N;
+}
+
+/*
+ * Report where the X coordinate lies in an encoded point: the returned
+ * value is the offset (1, i.e. right after the 0x04 format byte) and
+ * *len receives the length (32 bytes). This matches the layout written
+ * by p256_encode(). The 'curve' parameter is ignored.
+ */
+static size_t
+api_xoff(int curve, size_t *len)
+{
+ (void)curve;
+ *len = 32;
+ return 1;
+}
+
+/*
+ * Multiply the point encoded in G[] by the integer x[] (xlen bytes,
+ * unsigned big-endian), and write the encoded result back into G[].
+ *
+ * G[] must be a 65-byte uncompressed point (0x04 || X || Y). Returned
+ * value is 1 if the source point was valid, 0 otherwise. The 'curve'
+ * parameter is ignored.
+ *
+ * If Glen is not 65, the function returns 0 right away and does not
+ * touch G[]: p256_decode() does not fill the point structure in that
+ * case, so going on would run the multiplication on an uninitialised
+ * point. The length is not secret data, hence the early exit is not a
+ * side-channel concern. For a 65-byte input which fails the other
+ * validity checks, the computation is still carried out in full and
+ * only the returned value reports the error.
+ */
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	uint32_t r;
+	p256_jacobian P;
+
+	(void)curve;
+	if (Glen != 65) {
+		return 0;
+	}
+	r = p256_decode(&P, G, Glen);
+	p256_mul(&P, x, xlen);
+	p256_to_affine(&P);
+	p256_encode(G, &P);
+	return r;
+}
+
+/*
+ * Multiply the curve generator by the integer x[] (xlen bytes) and
+ * write the encoded result (65 bytes, uncompressed format) into R[].
+ * Returned value is the length of the encoded point, i.e. 65. The
+ * 'curve' parameter is ignored.
+ *
+ * The dedicated p256_mulgen() routine is used; the generic alternative
+ * (copying the encoded generator into R[] and calling api_mul() on it)
+ * is not used here.
+ */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	p256_jacobian prod;
+
+	(void)curve;
+
+	p256_mulgen(&prod, x, xlen);
+	p256_to_affine(&prod);
+	p256_encode(R, &prod);
+
+	return 65;
+}
+
+/*
+ * Compute x*A + y*B and write the encoded result into A[]. If B is
+ * NULL, then the curve generator is used in its place (x*A + y*G).
+ *
+ * A[] and B[] (when not NULL) are encoded points of 'len' bytes each;
+ * x[] and y[] are the multipliers (unsigned big-endian, xlen and ylen
+ * bytes). The 'curve' parameter is ignored.
+ *
+ * Returned value is 1 on success, 0 on error. Errors are: a length
+ * which is not 65, an invalid source point, or a result equal to the
+ * point at infinity (which cannot be encoded).
+ *
+ * If len is not 65, the function returns 0 right away and does not
+ * touch A[]: p256_decode() does not fill the point structures in that
+ * case, so going on would compute on uninitialised points and then
+ * write 65 bytes into A[] regardless of its actual size. The length is
+ * not secret data, hence the early exit is not a side-channel concern.
+ * All other error cases go through the complete computation and are
+ * only reported through the returned value.
+ */
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	p256_jacobian P, Q;
+	uint32_t r, t, z;
+	int i;
+
+	(void)curve;
+	if (len != 65) {
+		return 0;
+	}
+	r = p256_decode(&P, A, len);
+	p256_mul(&P, x, xlen);
+	if (B == NULL) {
+		p256_mulgen(&Q, y, ylen);
+	} else {
+		r &= p256_decode(&Q, B, len);
+		p256_mul(&Q, y, ylen);
+	}
+
+	/*
+	 * The final addition may fail in case both points are equal.
+	 * z is set to 1 if the addition result has a zero 'z'
+	 * coordinate (point at infinity), 0 otherwise. The doubling of
+	 * Q is always computed, so that it is available for the
+	 * P == Q case.
+	 */
+	t = p256_add(&P, &Q);
+	reduce_final_f256(P.z);
+	z = 0;
+	for (i = 0; i < 20; i ++) {
+		z |= P.z[i];
+	}
+	z = EQ(z, 0);
+	p256_double(&Q);
+
+	/*
+	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+	 * have the following:
+	 *
+	 *   z = 0, t = 0   return P (normal addition)
+	 *   z = 0, t = 1   return P (normal addition)
+	 *   z = 1, t = 0   return Q (a 'double' case)
+	 *   z = 1, t = 1   report an error (P+Q = 0)
+	 */
+	CCOPY(z & ~t, &P, &Q, sizeof Q);
+	p256_to_affine(&P);
+	p256_encode(A, &P);
+	r &= ~(z & t);
+	return r;
+}
+
+/*
+ * see bearssl_ec.h
+ *
+ * Implementation descriptor which gathers the api_*() functions of
+ * this file. The first field is 0x00800000, i.e. a value with only
+ * bit 23 set (presumably the mask of supported curve identifiers --
+ * to be confirmed against the br_ec_impl declaration).
+ */
+const br_ec_impl br_ec_p256_m15 = {
+ (uint32_t)0x00800000,
+ &api_generator,
+ &api_order,
+ &api_xoff,
+ &api_mul,
+ &api_mulgen,
+ &api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_p256_m31.c b/test/monniaux/BearSSL/src/ec/ec_p256_m31.c
new file mode 100644
index 00000000..d57ef7b0
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_p256_m31.c
@@ -0,0 +1,1475 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
+ * that right-shifting a signed negative integer copies the sign bit
+ * (arithmetic right-shift). This is "implementation-defined behaviour",
+ * i.e. it is not undefined, but it may differ between compilers. Each
+ * compiler is supposed to document its behaviour in that respect. GCC
+ * explicitly defines that an arithmetic right shift is used. We expect
+ * all other compilers to do the same, because the underlying CPUs offer
+ * an arithmetic right shift opcode that could not be used otherwise.
+ *
+ * ARSH(x, n) is the 32-bit variant and ARSHW(x, n) the 64-bit variant.
+ * Usage constraints that follow from the definitions below:
+ * - in the default variant, x must be an lvalue (its address is taken);
+ * - in the BR_NO_ARITH_SHIFT variant, n must not be 0, since the
+ * sign-fill part is shifted left by (32 - n), resp. (64 - n).
+ */
+#if BR_NO_ARITH_SHIFT
+#define ARSH(x, n) (((uint32_t)(x) >> (n)) \
+ | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
+#define ARSHW(x, n) (((uint64_t)(x) >> (n)) \
+ | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
+#else
+#define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
+#define ARSHW(x, n) ((*(int64_t *)&(x)) >> (n))
+#endif
+
+/*
+ * Unpack an unsigned big-endian byte string into 30-bit limbs stored
+ * least significant limb first. Only complete limbs are written to
+ * dst[]; the bits left over once the last source byte has been
+ * consumed (the top, partial limb) are the returned value, and the
+ * caller stores them where needed.
+ */
+static uint32_t
+be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
+{
+	uint32_t pending = 0;
+	int pending_bits = 0;
+	size_t pos;
+
+	/* Walk the source from its last (least significant) byte. */
+	for (pos = len; pos > 0; pos --) {
+		uint32_t octet = src[pos - 1];
+
+		if (pending_bits >= 22) {
+			/*
+			 * This byte completes a limb: emit the low 30
+			 * bits, keep the overflowing high bits.
+			 */
+			*dst ++ = (pending | (octet << pending_bits))
+				& 0x3FFFFFFF;
+			pending = octet >> (30 - pending_bits);
+			pending_bits -= 22;
+			continue;
+		}
+		pending |= octet << pending_bits;
+		pending_bits += 8;
+	}
+	return pending;
+}
+
+/*
+ * Serialise an integer held as little-endian 30-bit limbs into an
+ * unsigned big-endian byte string of exactly 'len' bytes. Every byte
+ * of dst[] is written; limbs are fetched from src[] only when fewer
+ * than eight bits remain buffered.
+ */
+static void
+le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
+{
+	uint32_t pending = 0;
+	int pending_bits = 0;
+	size_t pos;
+
+	/* Fill the destination from its last (least significant) byte. */
+	for (pos = len; pos > 0; pos --) {
+		unsigned char *out = &dst[pos - 1];
+
+		if (pending_bits >= 8) {
+			*out = (unsigned char)pending;
+			pending >>= 8;
+			pending_bits -= 8;
+			continue;
+		}
+
+		/*
+		 * Not enough buffered bits: pull the next limb, complete
+		 * the current byte with its low bits and keep the rest.
+		 */
+		{
+			uint32_t limb = *src ++;
+
+			*out = (unsigned char)(pending
+				| (limb << pending_bits));
+			pending = limb >> (8 - pending_bits);
+			pending_bits += 22;
+		}
+	}
+}
+
+/*
+ * Multiply two integers. Source integers are represented as arrays of
+ * nine 30-bit words, for values up to 2^270-1. Result is encoded over
+ * 18 words of 30 bits each.
+ *
+ * Column t[k] receives the sum of all products a[i]*b[j] such that
+ * i + j == k. All columns are computed into the local t[] array
+ * before any word of d[] is written; the final loop then truncates
+ * each column to 30 bits and carries the excess into the next one.
+ */
+static void
+mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ /*
+ * Maximum intermediate result is no more than
+ * 10376293531797946367, which fits in 64 bits. Reason:
+ *
+ * 10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
+ * 10376293531797946367 < 9663676407 * 2^30
+ *
+ * Thus, adding together 9 products of 30-bit integers, with
+ * a carry of at most 9663676406, yields an integer that fits
+ * on 64 bits and generates a carry of at most 9663676406.
+ */
+ uint64_t t[17];
+ uint64_t cc;
+ int i;
+
+ t[ 0] = MUL31(a[0], b[0]);
+ t[ 1] = MUL31(a[0], b[1])
+ + MUL31(a[1], b[0]);
+ t[ 2] = MUL31(a[0], b[2])
+ + MUL31(a[1], b[1])
+ + MUL31(a[2], b[0]);
+ t[ 3] = MUL31(a[0], b[3])
+ + MUL31(a[1], b[2])
+ + MUL31(a[2], b[1])
+ + MUL31(a[3], b[0]);
+ t[ 4] = MUL31(a[0], b[4])
+ + MUL31(a[1], b[3])
+ + MUL31(a[2], b[2])
+ + MUL31(a[3], b[1])
+ + MUL31(a[4], b[0]);
+ t[ 5] = MUL31(a[0], b[5])
+ + MUL31(a[1], b[4])
+ + MUL31(a[2], b[3])
+ + MUL31(a[3], b[2])
+ + MUL31(a[4], b[1])
+ + MUL31(a[5], b[0]);
+ t[ 6] = MUL31(a[0], b[6])
+ + MUL31(a[1], b[5])
+ + MUL31(a[2], b[4])
+ + MUL31(a[3], b[3])
+ + MUL31(a[4], b[2])
+ + MUL31(a[5], b[1])
+ + MUL31(a[6], b[0]);
+ t[ 7] = MUL31(a[0], b[7])
+ + MUL31(a[1], b[6])
+ + MUL31(a[2], b[5])
+ + MUL31(a[3], b[4])
+ + MUL31(a[4], b[3])
+ + MUL31(a[5], b[2])
+ + MUL31(a[6], b[1])
+ + MUL31(a[7], b[0]);
+ t[ 8] = MUL31(a[0], b[8])
+ + MUL31(a[1], b[7])
+ + MUL31(a[2], b[6])
+ + MUL31(a[3], b[5])
+ + MUL31(a[4], b[4])
+ + MUL31(a[5], b[3])
+ + MUL31(a[6], b[2])
+ + MUL31(a[7], b[1])
+ + MUL31(a[8], b[0]);
+ t[ 9] = MUL31(a[1], b[8])
+ + MUL31(a[2], b[7])
+ + MUL31(a[3], b[6])
+ + MUL31(a[4], b[5])
+ + MUL31(a[5], b[4])
+ + MUL31(a[6], b[3])
+ + MUL31(a[7], b[2])
+ + MUL31(a[8], b[1]);
+ t[10] = MUL31(a[2], b[8])
+ + MUL31(a[3], b[7])
+ + MUL31(a[4], b[6])
+ + MUL31(a[5], b[5])
+ + MUL31(a[6], b[4])
+ + MUL31(a[7], b[3])
+ + MUL31(a[8], b[2]);
+ t[11] = MUL31(a[3], b[8])
+ + MUL31(a[4], b[7])
+ + MUL31(a[5], b[6])
+ + MUL31(a[6], b[5])
+ + MUL31(a[7], b[4])
+ + MUL31(a[8], b[3]);
+ t[12] = MUL31(a[4], b[8])
+ + MUL31(a[5], b[7])
+ + MUL31(a[6], b[6])
+ + MUL31(a[7], b[5])
+ + MUL31(a[8], b[4]);
+ t[13] = MUL31(a[5], b[8])
+ + MUL31(a[6], b[7])
+ + MUL31(a[7], b[6])
+ + MUL31(a[8], b[5]);
+ t[14] = MUL31(a[6], b[8])
+ + MUL31(a[7], b[7])
+ + MUL31(a[8], b[6]);
+ t[15] = MUL31(a[7], b[8])
+ + MUL31(a[8], b[7]);
+ t[16] = MUL31(a[8], b[8]);
+
+ /*
+ * Propagate carries. The carry out of the last column becomes
+ * the 18th output word.
+ */
+ cc = 0;
+ for (i = 0; i < 17; i ++) {
+ uint64_t w;
+
+ w = t[i] + cc;
+ d[i] = (uint32_t)w & 0x3FFFFFFF;
+ cc = w >> 30;
+ }
+ d[17] = (uint32_t)cc;
+}
+
+/*
+ * Square a 270-bit integer, represented as an array of nine 30-bit words.
+ * Result uses 18 words of 30 bits each.
+ *
+ * Column t[k] receives the products a[i]*a[j] with i + j == k. Each
+ * cross product (i != j) appears twice in the square, so it is computed
+ * once and doubled with a left shift by one; each diagonal product
+ * a[i]*a[i] is added once. All columns are computed before d[] is
+ * written.
+ */
+static void
+square9(uint32_t *d, const uint32_t *a)
+{
+ uint64_t t[17];
+ uint64_t cc;
+ int i;
+
+ t[ 0] = MUL31(a[0], a[0]);
+ t[ 1] = ((MUL31(a[0], a[1])) << 1);
+ t[ 2] = MUL31(a[1], a[1])
+ + ((MUL31(a[0], a[2])) << 1);
+ t[ 3] = ((MUL31(a[0], a[3])
+ + MUL31(a[1], a[2])) << 1);
+ t[ 4] = MUL31(a[2], a[2])
+ + ((MUL31(a[0], a[4])
+ + MUL31(a[1], a[3])) << 1);
+ t[ 5] = ((MUL31(a[0], a[5])
+ + MUL31(a[1], a[4])
+ + MUL31(a[2], a[3])) << 1);
+ t[ 6] = MUL31(a[3], a[3])
+ + ((MUL31(a[0], a[6])
+ + MUL31(a[1], a[5])
+ + MUL31(a[2], a[4])) << 1);
+ t[ 7] = ((MUL31(a[0], a[7])
+ + MUL31(a[1], a[6])
+ + MUL31(a[2], a[5])
+ + MUL31(a[3], a[4])) << 1);
+ t[ 8] = MUL31(a[4], a[4])
+ + ((MUL31(a[0], a[8])
+ + MUL31(a[1], a[7])
+ + MUL31(a[2], a[6])
+ + MUL31(a[3], a[5])) << 1);
+ t[ 9] = ((MUL31(a[1], a[8])
+ + MUL31(a[2], a[7])
+ + MUL31(a[3], a[6])
+ + MUL31(a[4], a[5])) << 1);
+ t[10] = MUL31(a[5], a[5])
+ + ((MUL31(a[2], a[8])
+ + MUL31(a[3], a[7])
+ + MUL31(a[4], a[6])) << 1);
+ t[11] = ((MUL31(a[3], a[8])
+ + MUL31(a[4], a[7])
+ + MUL31(a[5], a[6])) << 1);
+ t[12] = MUL31(a[6], a[6])
+ + ((MUL31(a[4], a[8])
+ + MUL31(a[5], a[7])) << 1);
+ t[13] = ((MUL31(a[5], a[8])
+ + MUL31(a[6], a[7])) << 1);
+ t[14] = MUL31(a[7], a[7])
+ + ((MUL31(a[6], a[8])) << 1);
+ t[15] = ((MUL31(a[7], a[8])) << 1);
+ t[16] = MUL31(a[8], a[8]);
+
+ /*
+ * Propagate carries. The carry out of the last column becomes
+ * the 18th output word.
+ */
+ cc = 0;
+ for (i = 0; i < 17; i ++) {
+ uint64_t w;
+
+ w = t[i] + cc;
+ d[i] = (uint32_t)w & 0x3FFFFFFF;
+ cc = w >> 30;
+ }
+ d[17] = (uint32_t)cc;
+}
+
+/*
+ * Base field modulus for P-256:
+ * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
+ * stored as nine 30-bit words, least significant word first (the last
+ * word holds the top 16 bits).
+ */
+static const uint32_t F256[] = {
+
+ 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
+ 0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
+};
+
+/*
+ * The 'b' curve equation coefficient for P-256, stored as nine 30-bit
+ * words, least significant word first (same layout as F256[]). It is
+ * used by p256_decode() to check the curve equation.
+ */
+static const uint32_t P256_B[] = {
+
+ 0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
+ 0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
+};
+
+/*
+ * Addition in the field. Source operands shall fit on 257 bits; output
+ * will be lower than twice the modulus.
+ *
+ * All arrays hold nine 30-bit words, least significant word first.
+ */
+static void
+add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ uint32_t w, cc;
+ int i;
+
+ cc = 0;
+ for (i = 0; i < 9; i ++) {
+ w = a[i] + b[i] + cc;
+ d[i] = w & 0x3FFFFFFF;
+ cc = w >> 30;
+ }
+ /*
+ * Word 8 starts at bit 240, so bits 16 and above of the last
+ * (untruncated) sum w are the part of the result at or beyond
+ * 2^256. That excess is folded back with
+ * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+ * where 2^96 is bit 6 of word 3, 2^192 is bit 12 of word 6,
+ * 2^224 is bit 14 of word 7, and the "+ 1" term enters as the
+ * initial carry of the second propagation loop.
+ */
+ w >>= 16;
+ d[8] &= 0xFFFF;
+ d[3] -= w << 6;
+ d[6] -= w << 12;
+ d[7] += w << 14;
+ cc = w;
+ for (i = 0; i < 9; i ++) {
+ w = d[i] + cc;
+ d[i] = w & 0x3FFFFFFF;
+ cc = ARSH(w, 30); /* signed: the subtractions above may borrow */
+ }
+}
+
+/*
+ * Subtraction in the field. Source operands shall be smaller than twice
+ * the modulus; the result will fulfil the same property.
+ *
+ * All arrays hold nine 30-bit words, least significant word first.
+ */
+static void
+sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+ uint32_t w, cc;
+ int i;
+
+ /*
+ * We really compute a - b + 2*p to make sure that the result is
+ * positive.
+ *
+ * 2*p = 2^257 - 2^225 + 2^193 + 2^97 - 2; the constants added
+ * below are these five terms placed in the words that contain
+ * them: -2 in word 0, 2^97 in word 3 (0x80), 2^193 in word 6
+ * (0x2000), -2^225 in word 7 (0x8000) and 2^257 in word 8
+ * (0x20000). Borrows are propagated with a signed shift.
+ */
+ w = a[0] - b[0] - 0x00002;
+ d[0] = w & 0x3FFFFFFF;
+ w = a[1] - b[1] + ARSH(w, 30);
+ d[1] = w & 0x3FFFFFFF;
+ w = a[2] - b[2] + ARSH(w, 30);
+ d[2] = w & 0x3FFFFFFF;
+ w = a[3] - b[3] + ARSH(w, 30) + 0x00080;
+ d[3] = w & 0x3FFFFFFF;
+ w = a[4] - b[4] + ARSH(w, 30);
+ d[4] = w & 0x3FFFFFFF;
+ w = a[5] - b[5] + ARSH(w, 30);
+ d[5] = w & 0x3FFFFFFF;
+ w = a[6] - b[6] + ARSH(w, 30) + 0x02000;
+ d[6] = w & 0x3FFFFFFF;
+ w = a[7] - b[7] + ARSH(w, 30) - 0x08000;
+ d[7] = w & 0x3FFFFFFF;
+ w = a[8] - b[8] + ARSH(w, 30) + 0x20000;
+ d[8] = w & 0xFFFF;
+ /*
+ * Bits 16 and above of w are the part of the value at or beyond
+ * 2^256; fold them back with
+ * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+ * (same folding as in add_f256()).
+ */
+ w >>= 16;
+ d[8] &= 0xFFFF; /* no effect: d[8] was already masked above */
+ d[3] -= w << 6;
+ d[6] -= w << 12;
+ d[7] += w << 14;
+ cc = w;
+ for (i = 0; i < 9; i ++) {
+ w = d[i] + cc;
+ d[i] = w & 0x3FFFFFFF;
+ cc = ARSH(w, 30);
+ }
+}
+
+/*
+ * Reduce a double-width product modulo the P-256 field modulus. The
+ * source t[] holds 18 words of 30 bits each, in the format produced by
+ * mul9() and square9(); nine 30-bit words are written to d[]. This is
+ * the reduction step shared by mul_f256() and square_f256(); when the
+ * operands of the product were lower than twice the modulus, so is the
+ * result.
+ */
+static void
+reduce_product_f256(uint32_t *d, const uint32_t *t)
+{
+	uint64_t s[18];
+	uint64_t cc, x;
+	uint32_t z, c;
+	int i;
+
+	/*
+	 * Modular reduction: each high word is added/subtracted where
+	 * necessary.
+	 *
+	 * The modulus is:
+	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
+	 * Therefore:
+	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+	 *
+	 * For a word x at bit offset n (n >= 256), we have:
+	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
+	 *            - x*2^(n - 160) + x*2^(n-256) mod p
+	 *
+	 * Thus, we can nullify the high word if we reinject it at some
+	 * proper places.
+	 *
+	 * We use 64-bit intermediate words to allow for carries to
+	 * accumulate easily, before performing the final propagation.
+	 */
+	for (i = 0; i < 18; i ++) {
+		s[i] = t[i];
+	}
+
+	/*
+	 * Word i sits at bit offset 30*i, so each of the four reinjected
+	 * terms straddles two lower words: the high part goes to one
+	 * word (signed shift right) and the low part to the word below
+	 * (shift left, masked to 30 bits).
+	 */
+	for (i = 17; i >= 9; i --) {
+		uint64_t y;
+
+		y = s[i];
+		s[i - 1] += ARSHW(y, 2);
+		s[i - 2] += (y << 28) & 0x3FFFFFFF;
+		s[i - 2] -= ARSHW(y, 4);
+		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
+		s[i - 5] -= ARSHW(y, 10);
+		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
+		s[i - 8] += ARSHW(y, 16);
+		s[i - 9] += (y << 14) & 0x3FFFFFFF;
+	}
+
+	/*
+	 * Carry propagation must be signed. Moreover, we may have overdone
+	 * it a bit, and obtain a negative result.
+	 *
+	 * The loop above ran 9 times; each time, each word was augmented
+	 * by at most one extra word (in absolute value). Thus, the top
+	 * word must in fine fit in 39 bits, so the carry below will fit
+	 * on 9 bits.
+	 */
+	cc = 0;
+	for (i = 0; i < 9; i ++) {
+		x = s[i] + cc;
+		d[i] = (uint32_t)x & 0x3FFFFFFF;
+		cc = ARSHW(x, 30);
+	}
+
+	/*
+	 * All nine words fit on 30 bits, but there may be an extra
+	 * carry for a few bits (at most 9), and that carry may be
+	 * negative. Moreover, we want the result to fit on 257 bits.
+	 * The two lines below ensure that the word in d[] has length
+	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
+	 * significant length of cc is less than 24 bits, so we will be
+	 * able to switch to 32-bit operations.
+	 */
+	cc = ARSHW(x, 16);
+	d[8] &= 0xFFFF;
+
+	/*
+	 * One extra round of reduction, for cc*2^256, which means
+	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
+	 * value. If cc is negative, then it may happen (rarely, but
+	 * not negligibly so) that the result would be negative. In
+	 * order to avoid that, if cc is negative, then we add the
+	 * modulus once. Note that if cc is negative, then propagating
+	 * that carry must yield a value lower than the modulus, so
+	 * adding the modulus once will keep the final result under
+	 * twice the modulus.
+	 */
+	z = (uint32_t)cc;
+	d[3] -= z << 6;
+	d[6] -= (z << 12) & 0x3FFFFFFF;
+	d[7] -= ARSH(z, 18);
+	d[7] += (z << 14) & 0x3FFFFFFF;
+	d[8] += ARSH(z, 16);
+	c = z >> 31;
+	d[0] -= c;
+	d[3] += c << 6;
+	d[6] += c << 12;
+	d[7] -= c << 14;
+	d[8] += c << 16;
+	for (i = 0; i < 9; i ++) {
+		uint32_t w;
+
+		w = d[i] + z;
+		d[i] = w & 0x3FFFFFFF;
+		z = ARSH(w, 30);
+	}
+}
+
+/*
+ * Compute a multiplication in F256. Source operands shall be less than
+ * twice the modulus. The destination may be one of the operands: the
+ * full product is computed into a local buffer before d[] is written.
+ */
+static void
+mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t t[18];
+
+	mul9(t, a, b);
+	reduce_product_f256(d, t);
+}
+
+/*
+ * Compute a square in F256. Source operand shall be less than
+ * twice the modulus. The destination may be the operand itself: the
+ * full square is computed into a local buffer before d[] is written.
+ */
+static void
+square_f256(uint32_t *d, const uint32_t *a)
+{
+	uint32_t t[18];
+
+	square9(t, a);
+	reduce_product_f256(d, t);
+}
+
+/*
+ * Final reduction in field F256 (field for curve P-256). The input
+ * must be lower than twice the modulus. The difference d - p is always
+ * computed into a scratch buffer; it replaces d[] only when it did not
+ * borrow, i.e. when d >= p. Returned value is 1 if the modulus was
+ * subtracted, 0 if d[] was left untouched.
+ */
+static uint32_t
+reduce_final_f256(uint32_t *d)
+{
+	uint32_t diff[9];
+	uint32_t borrow = 0;
+	int k;
+
+	for (k = 0; k < 9; k ++) {
+		uint32_t v = d[k] - F256[k] - borrow;
+
+		diff[k] = v & 0x3FFFFFFF;
+		borrow = v >> 31;
+	}
+
+	/* No final borrow means d >= p: keep the difference. */
+	borrow ^= 1;
+	CCOPY(borrow, d, diff, sizeof diff);
+	return borrow;
+}
+
+/*
+ * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
+ * are such that:
+ * X = x / z^2
+ * Y = y / z^3
+ * For the point at infinity, z = 0.
+ * Each point thus admits many possible representations.
+ *
+ * Coordinates are represented in arrays of 32-bit integers, each holding
+ * 30 bits of data. Values may also be slightly greater than the modulus,
+ * but they will always be lower than twice the modulus.
+ *
+ * Each array holds nine words, least significant word first.
+ */
+typedef struct {
+ uint32_t x[9]; /* Jacobian x coordinate */
+ uint32_t y[9]; /* Jacobian y coordinate */
+ uint32_t z[9]; /* Jacobian z coordinate (0 for the point at infinity) */
+} p256_jacobian;
+
+/*
+ * Convert a point to affine coordinates:
+ * - If the point is the point at infinity, then all three coordinates
+ * are set to 0.
+ * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
+ * coordinates are the 'X' and 'Y' affine coordinates.
+ * The coordinates are guaranteed to be lower than the modulus.
+ *
+ * The point is modified in place. The sequence of field operations is
+ * fixed: it depends only on loop counters, not on the coordinates.
+ */
+static void
+p256_to_affine(p256_jacobian *P)
+{
+ uint32_t t1[9], t2[9];
+ int i;
+
+ /*
+ * Invert z with a modular exponentiation: the modulus is
+ * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
+ * p-2. Exponent bit pattern (from high to low) is:
+ * - 32 bits of value 1
+ * - 31 bits of value 0
+ * - 1 bit of value 1
+ * - 96 bits of value 0
+ * - 94 bits of value 1
+ * - 1 bit of value 0
+ * - 1 bit of value 1
+ * Thus, we precompute z^(2^31-1) to speed things up.
+ *
+ * If z = 0 (point at infinity) then the modular exponentiation
+ * will yield 0, which leads to the expected result (all three
+ * coordinates set to 0).
+ */
+
+ /*
+ * A simple square-and-multiply for z^(2^31-1). We could save about
+ * two dozen multiplications here with an addition chain, but
+ * this would require a bit more code, and extra stack buffers.
+ */
+ memcpy(t1, P->z, sizeof P->z);
+ for (i = 0; i < 30; i ++) {
+ square_f256(t1, t1);
+ mul_f256(t1, t1, P->z);
+ }
+
+ /*
+ * Square-and-multiply. Apart from the squarings, we have a few
+ * multiplications to set bits to 1; we multiply by the original z
+ * for setting 1 bit, and by t1 for setting 31 bits.
+ *
+ * The case labels are the numbers of squarings done so far at
+ * which a multiplication is inserted.
+ */
+ memcpy(t2, P->z, sizeof P->z);
+ for (i = 1; i < 256; i ++) {
+ square_f256(t2, t2);
+ switch (i) {
+ case 31:
+ case 190:
+ case 221:
+ case 252:
+ mul_f256(t2, t2, t1);
+ break;
+ case 63:
+ case 253:
+ case 255:
+ mul_f256(t2, t2, P->z);
+ break;
+ }
+ }
+
+ /*
+ * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
+ * t1 is reused as scratch: first 1/z^2, then 1/z^3.
+ */
+ mul_f256(t1, t2, t2);
+ mul_f256(P->x, t1, P->x);
+ mul_f256(t1, t1, t2);
+ mul_f256(P->y, t1, P->y);
+ reduce_final_f256(P->x);
+ reduce_final_f256(P->y);
+
+ /*
+ * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
+ * this will set z to 1.
+ */
+ mul_f256(P->z, P->z, t2);
+ reduce_final_f256(P->z);
+}
+
+/*
+ * Double a point in P-256. This function works for all valid points,
+ * including the point at infinity.
+ *
+ * The point Q is replaced in place with 2*Q.
+ */
+static void
+p256_double(p256_jacobian *Q)
+{
+ /*
+ * Doubling formulas are:
+ *
+ * s = 4*x*y^2
+ * m = 3*(x + z^2)*(x - z^2)
+ * x' = m^2 - 2*s
+ * y' = m*(s - x') - 8*y^4
+ * z' = 2*y*z
+ *
+ * These formulas work for all points, including points of order 2
+ * and points at infinity:
+ * - If y = 0 then z' = 0. But there is no such point in P-256
+ * anyway.
+ * - If z = 0 then z' = 0.
+ */
+ uint32_t t1[9], t2[9], t3[9], t4[9];
+
+ /*
+ * Compute z^2 in t1.
+ */
+ square_f256(t1, Q->z);
+
+ /*
+ * Compute x+z^2 in t2 and x-z^2 in t1.
+ */
+ add_f256(t2, Q->x, t1);
+ sub_f256(t1, Q->x, t1);
+
+ /*
+ * Compute m = 3*(x+z^2)*(x-z^2) in t1: the product goes to t3,
+ * then t1 = 2*t3 and finally t1 = t3 + 2*t3.
+ */
+ mul_f256(t3, t1, t2);
+ add_f256(t1, t3, t3);
+ add_f256(t1, t3, t1);
+
+ /*
+ * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
+ */
+ square_f256(t3, Q->y);
+ add_f256(t3, t3, t3);
+ mul_f256(t2, Q->x, t3);
+ add_f256(t2, t2, t2);
+
+ /*
+ * Compute x' = m^2 - 2*s.
+ */
+ square_f256(Q->x, t1);
+ sub_f256(Q->x, Q->x, t2);
+ sub_f256(Q->x, Q->x, t2);
+
+ /*
+ * Compute z' = 2*y*z. This must be done before Q->y is
+ * overwritten below.
+ */
+ mul_f256(t4, Q->y, Q->z);
+ add_f256(Q->z, t4, t4);
+
+ /*
+ * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+ * 2*y^2 in t3, so 8*y^4 = 2*(2*y^2)^2.
+ */
+ sub_f256(t2, t2, Q->x);
+ mul_f256(Q->y, t1, t2);
+ square_f256(t4, t3);
+ add_f256(t4, t4, t4);
+ sub_f256(Q->y, Q->y, t4);
+}
+
+/*
+ * Add point P2 to point P1. The sum replaces P1; P2 is not modified.
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ * - If P1 == 0 but P2 != 0
+ * - If P1 != 0 but P2 == 0
+ * - If P1 == P2
+ *
+ * In all three cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ * - P1 and P2 have the same Y coordinate
+ * - P1 == 0 and P2 == 0
+ * - The Y coordinate of one of the points is 0 and the other point is
+ * the point at infinity.
+ *
+ * The third case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
+ * can apply the following:
+ *
+ * - If the result is not the point at infinity, then it is correct.
+ * - Otherwise, if the returned value is 1, then this is a case of
+ * P1+P2 == 0, so the result is indeed the point at infinity.
+ * - Otherwise, P1 == P2, so a "double" operation should have been
+ * performed.
+ */
+static uint32_t
+p256_add(p256_jacobian *P1, const p256_jacobian *P2)
+{
+ /*
+ * Addition formulas are:
+ *
+ * u1 = x1 * z2^2
+ * u2 = x2 * z1^2
+ * s1 = y1 * z2^3
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1 * z2
+ */
+ uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
+ uint32_t ret;
+ int i;
+
+ /*
+ * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+ */
+ square_f256(t3, P2->z);
+ mul_f256(t1, P1->x, t3);
+ mul_f256(t4, P2->z, t3);
+ mul_f256(t3, P1->y, t4);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ square_f256(t4, P1->z);
+ mul_f256(t2, P2->x, t4);
+ mul_f256(t5, P1->z, t4);
+ mul_f256(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ * We need to test whether r is zero, so we will do some extra
+ * reduce.
+ *
+ * After the loop, 'ret' is the OR of all words of r; the last
+ * expression maps it to 1 if r != 0 and to 0 if r == 0,
+ * without a conditional branch.
+ */
+ sub_f256(t2, t2, t1);
+ sub_f256(t4, t4, t3);
+ reduce_final_f256(t4);
+ ret = 0;
+ for (i = 0; i < 9; i ++) {
+ ret |= t4[i];
+ }
+ ret = (ret | -ret) >> 31;
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ square_f256(t7, t2);
+ mul_f256(t6, t1, t7);
+ mul_f256(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ square_f256(P1->x, t4);
+ sub_f256(P1->x, P1->x, t5);
+ sub_f256(P1->x, P1->x, t6);
+ sub_f256(P1->x, P1->x, t6);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ sub_f256(t6, t6, P1->x);
+ mul_f256(P1->y, t4, t6);
+ mul_f256(t1, t5, t3);
+ sub_f256(P1->y, P1->y, t1);
+
+ /*
+ * Compute z3 = h*z1*z2.
+ */
+ mul_f256(t1, P1->z, P2->z);
+ mul_f256(P1->z, t1, t2);
+
+ return ret;
+}
+
+/*
+ * Add point P2 to point P1. This is a specialised function for the
+ * case when P2 is a non-zero point in affine coordinate. The sum
+ * replaces P1; P2 is not modified and its 'z' field is not read.
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ * - If P1 == 0
+ * - If P1 == P2
+ *
+ * In both cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ * - P1 and P2 have the same Y coordinate
+ * - The Y coordinate of P2 is 0 and P1 is the point at infinity.
+ *
+ * The second case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 on input, then the caller
+ * can apply the following:
+ *
+ * - If the result is not the point at infinity, then it is correct.
+ * - Otherwise, if the returned value is 1, then this is a case of
+ * P1+P2 == 0, so the result is indeed the point at infinity.
+ * - Otherwise, P1 == P2, so a "double" operation should have been
+ * performed.
+ */
+static uint32_t
+p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
+{
+ /*
+ * Addition formulas are:
+ *
+ * u1 = x1
+ * u2 = x2 * z1^2
+ * s1 = y1
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1
+ */
+ uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
+ uint32_t ret;
+ int i;
+
+ /*
+ * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+ */
+ memcpy(t1, P1->x, sizeof t1);
+ memcpy(t3, P1->y, sizeof t3);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ square_f256(t4, P1->z);
+ mul_f256(t2, P2->x, t4);
+ mul_f256(t5, P1->z, t4);
+ mul_f256(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ * We need to test whether r is zero, so we will do some extra
+ * reduce.
+ *
+ * After the loop, 'ret' is the OR of all words of r; the last
+ * expression maps it to 1 if r != 0 and to 0 if r == 0,
+ * without a conditional branch.
+ */
+ sub_f256(t2, t2, t1);
+ sub_f256(t4, t4, t3);
+ reduce_final_f256(t4);
+ ret = 0;
+ for (i = 0; i < 9; i ++) {
+ ret |= t4[i];
+ }
+ ret = (ret | -ret) >> 31;
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ square_f256(t7, t2);
+ mul_f256(t6, t1, t7);
+ mul_f256(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ square_f256(P1->x, t4);
+ sub_f256(P1->x, P1->x, t5);
+ sub_f256(P1->x, P1->x, t6);
+ sub_f256(P1->x, P1->x, t6);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ sub_f256(t6, t6, P1->x);
+ mul_f256(P1->y, t4, t6);
+ mul_f256(t1, t5, t3);
+ sub_f256(P1->y, P1->y, t1);
+
+ /*
+ * Compute z3 = h*z1 (P2 is affine, so its z coordinate is an
+ * implicit 1 and does not appear in the product).
+ */
+ mul_f256(P1->z, P1->z, t2);
+
+ return ret;
+}
+
+/*
+ * Decode a P-256 point. This function does not support the point at
+ * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
+ *
+ * The source must be exactly 65 bytes: a 0x04 header byte followed by
+ * the 32-byte big-endian X and Y coordinates. For any other length,
+ * the function returns 0 immediately and *P is NOT written. For a
+ * 65-byte input, *P is always filled (with z = 1), even when the
+ * returned value is 0.
+ */
+static uint32_t
+p256_decode(p256_jacobian *P, const void *src, size_t len)
+{
+ const unsigned char *buf;
+ uint32_t tx[9], ty[9], t1[9], t2[9];
+ uint32_t bad;
+ int i;
+
+ if (len != 65) {
+ return 0;
+ }
+ buf = src;
+
+ /*
+ * First byte must be 0x04 (uncompressed format). We could support
+ * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
+ * least significant bit of the Y coordinate), but it is explicitly
+ * forbidden by RFC 5480 (section 2.2).
+ */
+ bad = NEQ(buf[0], 0x04);
+
+ /*
+ * Decode the coordinates, and check that they are both lower
+ * than the modulus.
+ *
+ * be8_to_le30() writes the eight complete words and returns the
+ * partial top word. reduce_final_f256() returns 1 when its input
+ * was not lower than the modulus, which marks the point as bad.
+ */
+ tx[8] = be8_to_le30(tx, buf + 1, 32);
+ ty[8] = be8_to_le30(ty, buf + 33, 32);
+ bad |= reduce_final_f256(tx);
+ bad |= reduce_final_f256(ty);
+
+ /*
+ * Check curve equation: t1 = x^3 - 3*x + b - y^2 must be zero
+ * after final reduction.
+ */
+ square_f256(t1, tx);
+ mul_f256(t1, tx, t1);
+ square_f256(t2, ty);
+ sub_f256(t1, t1, tx);
+ sub_f256(t1, t1, tx);
+ sub_f256(t1, t1, tx);
+ add_f256(t1, t1, P256_B);
+ sub_f256(t1, t1, t2);
+ reduce_final_f256(t1);
+ for (i = 0; i < 9; i ++) {
+ bad |= t1[i];
+ }
+
+ /*
+ * Copy coordinates to the point structure.
+ */
+ memcpy(P->x, tx, sizeof tx);
+ memcpy(P->y, ty, sizeof ty);
+ memset(P->z, 0, sizeof P->z);
+ P->z[0] = 1;
+ return EQ(bad, 0);
+}
+
+/*
+ * Serialise a point into a buffer, in uncompressed format: one header
+ * byte of value 0x04, then the x and y fields as 32 big-endian bytes
+ * each (65 bytes in total). The point is assumed to be valid, already
+ * in affine coordinates, and distinct from the point at infinity; the
+ * z field is not read.
+ */
+static void
+p256_encode(void *dst, const p256_jacobian *P)
+{
+	unsigned char *out = dst;
+
+	out[0] = 0x04;
+	le30_to_be8(&out[1], 32, P->x);
+	le30_to_be8(&out[33], 32, P->y);
+}
+
+/*
+ * Multiply a curve point by an integer. The integer is assumed to be
+ * lower than the curve order, and the base point must not be the point
+ * at infinity.
+ *
+ * The multiplier is read from x[0] to x[xlen-1], and within each byte
+ * from the top bit pair down to the bottom one, i.e. most significant
+ * bits first. The product replaces *P.
+ */
+static void
+p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
+{
+ /*
+ * qz is a flag that is initially 1, and remains equal to 1
+ * as long as the point is the point at infinity.
+ *
+ * We use a 2-bit window to handle multiplier bits by pairs.
+ * The precomputed window really is the points P2 and P3.
+ */
+ uint32_t qz;
+ p256_jacobian P2, P3, Q, T, U;
+
+ /*
+ * Compute window values.
+ */
+ P2 = *P;
+ p256_double(&P2);
+ P3 = *P;
+ p256_add(&P3, &P2);
+
+ /*
+ * We start with Q = 0. We process multiplier bits 2 by 2.
+ */
+ memset(&Q, 0, sizeof Q);
+ qz = 1;
+ while (xlen -- > 0) {
+ int k;
+
+ for (k = 6; k >= 0; k -= 2) {
+ uint32_t bits;
+ uint32_t bnz;
+
+ p256_double(&Q);
+ p256_double(&Q);
+ /*
+ * T is the window point selected by 'bits' (P, P2
+ * or P3; the value for bits == 0 is never used) and
+ * U is the candidate sum Q + T. Both are always
+ * computed, then the new Q is chosen with
+ * conditional copies:
+ * - bits != 0 and Q was still 0: Q = T
+ * - bits != 0 and Q was not 0: Q = U
+ * - bits == 0: Q is left unchanged
+ */
+ T = *P;
+ U = Q;
+ bits = (*x >> k) & (uint32_t)3;
+ bnz = NEQ(bits, 0);
+ CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
+ CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
+ p256_add(&U, &T);
+ CCOPY(bnz & qz, &Q, &T, sizeof Q);
+ CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
+ qz &= ~bnz;
+ }
+ x ++;
+ }
+ *P = Q;
+}
+
+/*
+ * Precomputed window: k*G points, where G is the curve generator, and k
+ * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
+ * the point are encoded as 9 words of 30 bits each (little-endian
+ * order).
+ *
+ * Entry Gwin[k - 1] holds k*G: words 0 to 8 are the X coordinate and
+ * words 9 to 17 are the Y coordinate (this is how lookup_Gwin() splits
+ * each entry).
+ */
+static const uint32_t Gwin[15][18] = {
+
+ { 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
+ 0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
+ 0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
+ 0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
+ 0x10B8BF86, 0x00004FE3 },
+
+ { 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
+ 0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
+ 0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
+ 0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
+ 0x154436E3, 0x00000777 },
+
+ { 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
+ 0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
+ 0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
+ 0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
+ 0x19031266, 0x00008734 },
+
+ { 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
+ 0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
+ 0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
+ 0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
+ 0x15D69318, 0x0000E0F1 },
+
+ { 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
+ 0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
+ 0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
+ 0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
+ 0x1F6A2412, 0x0000E0C1 },
+
+ { 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
+ 0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
+ 0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
+ 0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
+ 0x041D0C8D, 0x0000E85C },
+
+ { 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
+ 0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
+ 0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
+ 0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
+ 0x076F780C, 0x000073EB },
+
+ { 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
+ 0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
+ 0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
+ 0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
+ 0x332F647A, 0x0000AD5A },
+
+ { 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
+ 0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
+ 0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
+ 0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
+ 0x11325CB2, 0x00002A27 },
+
+ { 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
+ 0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
+ 0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
+ 0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
+ 0x18A88A6A, 0x00008786 },
+
+ { 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
+ 0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
+ 0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
+ 0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
+ 0x0826B331, 0x00009099 },
+
+ { 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
+ 0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
+ 0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
+ 0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
+ 0x2D1AA70E, 0x00000770 },
+
+ { 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
+ 0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
+ 0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
+ 0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
+ 0x163353AF, 0x000063BB },
+
+ { 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
+ 0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
+ 0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
+ 0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
+ 0x3C6ECA7D, 0x0000F599 },
+
+ { 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
+ 0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
+ 0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
+ 0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
+ 0x0FB8D64B, 0x0000B5B9 }
+};
+
+/*
+ * Constant-time fetch of a precomputed window point: row Gwin[idx - 1]
+ * is copied into T (x taken from the start of the row, y from word 9
+ * onward) and T->z is set to 1. Every row of the table is read whatever
+ * the value of idx; if idx matches no row (e.g. idx = 0), x and y come
+ * out as all-zero.
+ */
+static void
+lookup_Gwin(p256_jacobian *T, uint32_t idx)
+{
+	uint32_t sel[18];
+	uint32_t row;
+	size_t j;
+
+	for (j = 0; j < 18; j ++) {
+		sel[j] = 0;
+	}
+	for (row = 1; row <= 15; row ++) {
+		/* All-ones when this is the requested row, zero otherwise. */
+		uint32_t mask = -EQ(idx, row);
+
+		for (j = 0; j < 18; j ++) {
+			sel[j] |= Gwin[row - 1][j] & mask;
+		}
+	}
+	memset(T->z, 0, sizeof T->z);
+	T->z[0] = 1;
+	memcpy(T->x, sel, sizeof T->x);
+	memcpy(T->y, sel + 9, sizeof T->y);
+}
+
+/*
+ * Multiply the generator by an integer. The integer is assumed non-zero
+ * and lower than the curve order.
+ *
+ * P receives the result in Jacobian coordinates. The multiplier is read
+ * as xlen bytes starting at x; bytes are consumed in order and the high
+ * nibble of each byte is handled first, so x[0] is the most significant
+ * byte.
+ */
+static void
+p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
+{
+ /*
+ * qz is a flag that is initially 1, and remains equal to 1
+ * as long as the point is the point at infinity.
+ *
+ * We use a 4-bit window to handle multiplier bits by groups
+ * of 4. The precomputed window is constant static data, with
+ * points in affine coordinates; we use a constant-time lookup.
+ *
+ * For each nibble, Q is doubled four times, then the window
+ * point T and the sum U = Q + T are both computed
+ * unconditionally. The two CCOPY() (conditional copy) calls
+ * then select the new Q: T if the nibble is non-zero and Q was
+ * still at infinity, U if the nibble is non-zero and Q was not
+ * at infinity; Q is left unchanged if the nibble is zero.
+ */
+ p256_jacobian Q;
+ uint32_t qz;
+
+ memset(&Q, 0, sizeof Q);
+ qz = 1;
+ while (xlen -- > 0) {
+ int k;
+ unsigned bx;
+
+ bx = *x ++;
+ for (k = 0; k < 2; k ++) {
+ uint32_t bits;
+ uint32_t bnz;
+ p256_jacobian T, U;
+
+ p256_double(&Q);
+ p256_double(&Q);
+ p256_double(&Q);
+ p256_double(&Q);
+ bits = (bx >> 4) & 0x0F;
+ bnz = NEQ(bits, 0);
+ lookup_Gwin(&T, bits);
+ U = Q;
+ p256_add_mixed(&U, &T);
+ CCOPY(bnz & qz, &Q, &T, sizeof Q);
+ CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
+ qz &= ~bnz;
+ bx <<= 4;
+ }
+ }
+ *P = Q;
+}
+
+static const unsigned char P256_G[] = { /* 65 bytes: 0x04 header, then 64 coordinate bytes; returned by api_generator() */
+ 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
+ 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
+ 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
+ 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
+ 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
+ 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
+ 0x68, 0x37, 0xBF, 0x51, 0xF5
+};
+
+static const unsigned char P256_N[] = { /* 32 bytes; returned by api_order() as the curve order */
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
+ 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
+ 0x25, 0x51
+};
+
+/*
+ * Generator accessor: stores the size of P256_G in *len and returns a
+ * pointer to its first byte. The curve identifier is ignored.
+ */
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	*len = sizeof P256_G;
+	(void)curve;
+	return &P256_G[0];
+}
+
+/*
+ * Order accessor: stores the size of P256_N in *len and returns a
+ * pointer to its first byte. The curve identifier is ignored.
+ */
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	*len = sizeof P256_N;
+	(void)curve;
+	return &P256_N[0];
+}
+
+/*
+ * Reports where the X coordinate sits inside an encoded point: the
+ * returned offset is 1 and *len is set to 32. The curve identifier is
+ * ignored.
+ */
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	const size_t x_offset = 1;
+
+	(void)curve;
+	*len = 32;
+	return x_offset;
+}
+
+/*
+ * Multiply the encoded point G (Glen bytes) by the scalar x (xlen bytes).
+ * The status obtained from p256_decode() is returned unchanged. The
+ * product is converted to affine and written back over G only when Glen
+ * is at least 65; with a shorter buffer G is left untouched.
+ */
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	p256_jacobian pt;
+	uint32_t status;
+
+	(void)curve;
+	status = p256_decode(&pt, G, Glen);
+	p256_mul(&pt, x, xlen);
+	if (Glen < 65) {
+		return status;
+	}
+	p256_to_affine(&pt);
+	p256_encode(G, &pt);
+	return status;
+}
+
+/*
+ * Multiply the curve generator by the scalar x (xlen bytes), convert the
+ * result to affine coordinates and encode it into R. Always returns 65.
+ *
+ * This uses the dedicated p256_mulgen() routine rather than copying the
+ * generator into R and calling the generic api_mul().
+ */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	p256_jacobian prod;
+
+	(void)curve;
+	p256_mulgen(&prod, x, xlen);
+	p256_to_affine(&prod);
+	p256_encode(R, &prod);
+	return (size_t)65;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+ const unsigned char *x, size_t xlen,
+ const unsigned char *y, size_t ylen, int curve)
+{
+ p256_jacobian P, Q;
+ uint32_t r, t, z;
+ int i;
+
+ (void)curve;
+ r = p256_decode(&P, A, len);
+ p256_mul(&P, x, xlen);
+ if (B == NULL) {
+ p256_mulgen(&Q, y, ylen);
+ } else {
+ r &= p256_decode(&Q, B, len);
+ p256_mul(&Q, y, ylen);
+ }
+
+ /*
+ * At this point P = x*A and Q = y*B (or y times the generator
+ * when B is NULL); r accumulates the decoding status.
+ *
+ * The final addition may fail in case both points are equal.
+ * To detect this, z is set to 1 if all limbs of the reduced
+ * P.z are zero after the addition, and to 0 otherwise. Q is
+ * then doubled unconditionally so that it is available for
+ * the 'double' case below.
+ */
+ t = p256_add(&P, &Q);
+ reduce_final_f256(P.z);
+ z = 0;
+ for (i = 0; i < 9; i ++) {
+ z |= P.z[i];
+ }
+ z = EQ(z, 0);
+ p256_double(&Q);
+
+ /*
+ * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+ * have the following:
+ *
+ * z = 0, t = 0 return P (normal addition)
+ * z = 0, t = 1 return P (normal addition)
+ * z = 1, t = 0 return Q (a 'double' case)
+ * z = 1, t = 1 report an error (P+Q = 0)
+ *
+ * The result is encoded into A in every case; in the error
+ * case the returned status r is cleared.
+ */
+ CCOPY(z & ~t, &P, &Q, sizeof Q);
+ p256_to_affine(&P);
+ p256_encode(A, &P);
+ r &= ~(z & t);
+ return r;
+}
+
+/* see bearssl_ec.h -- first field is a 32-bit mask (NOTE(review): 0x00800000 is bit 23, presumably the secp256r1 curve identifier; confirm against the header), followed by the six api_* callbacks of this file */
+const br_ec_impl br_ec_p256_m31 = {
+ (uint32_t)0x00800000,
+ &api_generator,
+ &api_order,
+ &api_xoff,
+ &api_mul,
+ &api_mulgen,
+ &api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_p256_m62.c b/test/monniaux/BearSSL/src/ec/ec_p256_m62.c
new file mode 100644
index 00000000..3bcb95b5
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_p256_m62.c
@@ -0,0 +1,1765 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+static const unsigned char P256_G[] = { /* 65 bytes: 0x04 header, then 64 coordinate bytes; returned by api_generator() */
+ 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
+ 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
+ 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
+ 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
+ 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
+ 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
+ 0x68, 0x37, 0xBF, 0x51, 0xF5
+};
+
+static const unsigned char P256_N[] = { /* 32 bytes; returned by api_order() as the curve order */
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
+ 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
+ 0x25, 0x51
+};
+
+/*
+ * Generator accessor: stores the size of P256_G in *len and returns a
+ * pointer to its first byte. The curve identifier is ignored.
+ */
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	*len = sizeof P256_G;
+	(void)curve;
+	return &P256_G[0];
+}
+
+/*
+ * Order accessor: stores the size of P256_N in *len and returns a
+ * pointer to its first byte. The curve identifier is ignored.
+ */
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	*len = sizeof P256_N;
+	(void)curve;
+	return &P256_N[0];
+}
+
+/*
+ * Reports where the X coordinate sits inside an encoded point: the
+ * returned offset is 1 and *len is set to 32. The curve identifier is
+ * ignored.
+ */
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	const size_t x_offset = 1;
+
+	(void)curve;
+	*len = 32;
+	return x_offset;
+}
+
+/*
+ * A field element is encoded as five 64-bit integers, in basis 2^52.
+ * Limbs may occasionally exceed 2^52.
+ *
+ * A _partially reduced_ value is such that the following hold:
+ * - top limb is less than 2^48 + 2^30
+ * - the other limbs fit on 53 bits each
+ * In particular, such a value is less than twice the modulus p.
+ *
+ * BIT(n) is 2^n as a 64-bit unsigned value; MASK48 and MASK52 keep the
+ * low 48 and low 52 bits of a limb, respectively.
+ */
+
+#define BIT(n) ((uint64_t)1 << (n))
+#define MASK48 (BIT(48) - BIT(0))
+#define MASK52 (BIT(52) - BIT(0))
+
+/* R = 2^260 mod p, as five 52-bit limbs with the least significant limb first; copied into P->z by point_decode() to represent z = 1 */
+static const uint64_t F256_R[] = {
+ 0x0000000000010, 0xF000000000000, 0xFFFFFFFFFFFFF,
+ 0xFFEFFFFFFFFFF, 0x00000000FFFFF
+};
+
+/* Curve equation is y^2 = x^3 - 3*x + B. This constant is B*R mod p
+ (Montgomery representation of B). It is subtracted in the curve
+ equation check of point_decode(). */
+static const uint64_t P256_B_MONTY[] = {
+ 0xDF6229C4BDDFD, 0xCA8843090D89C, 0x212ED6ACF005C,
+ 0x83415A220ABF7, 0x0C30061DD4874
+};
+
+/*
+ * Field addition, limb by limb: d[i] = a[i] + b[i] for the five limbs.
+ * No carry is propagated between limbs and no reduction is done.
+ * Inputs may use up to 63 bits per limb; each output limb may be one
+ * bit longer than the inputs. d may be the same array as a and/or b.
+ */
+static inline void
+f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+	int i;
+
+	for (i = 0; i < 5; i ++) {
+		d[i] = a[i] + b[i];
+	}
+}
+
+/*
+ * Partially reduce the provided value.
+ * Input: limbs can go up to 61 bits each.
+ * Output: partially reduced.
+ * The value is modified in place.
+ */
+static inline void
+f256_partial_reduce(uint64_t *a)
+{
+ uint64_t w, cc, s;
+
+ /*
+ * Propagate carries. Afterwards, the part s of the top limb
+ * that lies above bit 48 (i.e. above 2^256 in the whole
+ * integer) is folded back into the lower limbs with the rule
+ * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p.
+ */
+ w = a[0];
+ a[0] = w & MASK52;
+ cc = w >> 52;
+ w = a[1] + cc;
+ a[1] = w & MASK52;
+ cc = w >> 52;
+ w = a[2] + cc;
+ a[2] = w & MASK52;
+ cc = w >> 52;
+ w = a[3] + cc;
+ a[3] = w & MASK52;
+ cc = w >> 52;
+ a[4] += cc;
+
+ s = a[4] >> 48; /* s < 2^14 */
+ a[0] += s; /* a[0] < 2^52 + 2^14 */
+ w = a[1] - (s << 44);
+ a[1] = w & MASK52; /* a[1] < 2^52 */
+ cc = -(w >> 52) & 0xFFF; /* cc < 16 */
+ w = a[2] - cc;
+ a[2] = w & MASK52; /* a[2] < 2^52 */
+ cc = w >> 63; /* cc = 0 or 1 */
+ w = a[3] - cc - (s << 36);
+ a[3] = w & MASK52; /* a[3] < 2^52 */
+ cc = w >> 63; /* cc = 0 or 1 */
+ w = a[4] & MASK48;
+ a[4] = w + (s << 16) - cc; /* a[4] < 2^48 + 2^30 */
+}
+
+/*
+ * Subtraction in the field: d = a - b mod p.
+ * Input: limbs must fit on 60 bits each; in particular, the complete
+ * integer will be less than 2^268 + 2^217.
+ * Output: partially reduced.
+ */
+static inline void
+f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+ uint64_t t[5], w, s, cc;
+
+ /*
+ * We compute d = 2^13*p + a - b; this ensures a positive
+ * intermediate value.
+ *
+ * Each individual addition/subtraction may yield a positive or
+ * negative result; thus, we need to handle a signed carry, thus
+ * with sign extension. We prefer not to use signed types (int64_t)
+ * because conversion from unsigned to signed is cumbersome (a
+ * direct cast with the top bit set is undefined behavior; instead,
+ * we have to use pointer aliasing, using the guaranteed properties
+ * of exact-width types, but this requires the compiler to optimize
+ * away the writes and reads from RAM), and right-shifting a
+ * signed negative value is implementation-defined. Therefore,
+ * we use a custom sign extension.
+ *
+ * After 'cc = w >> 52' the carry holds 12 bits; the statement
+ * 'cc |= -(cc & BIT(11))' copies bit 11 into all upper bits,
+ * which is that custom sign extension.
+ */
+
+ w = a[0] - b[0] - BIT(13);
+ t[0] = w & MASK52;
+ cc = w >> 52;
+ cc |= -(cc & BIT(11));
+ w = a[1] - b[1] + cc;
+ t[1] = w & MASK52;
+ cc = w >> 52;
+ cc |= -(cc & BIT(11));
+ w = a[2] - b[2] + cc;
+ t[2] = (w & MASK52) + BIT(5);
+ cc = w >> 52;
+ cc |= -(cc & BIT(11));
+ w = a[3] - b[3] + cc;
+ t[3] = (w & MASK52) + BIT(49);
+ cc = w >> 52;
+ cc |= -(cc & BIT(11));
+ t[4] = (BIT(61) - BIT(29)) + a[4] - b[4] + cc;
+
+ /*
+ * Perform partial reduction. Rule is:
+ * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+ *
+ * At that point:
+ * 0 <= t[0] <= 2^52 - 1
+ * 0 <= t[1] <= 2^52 - 1
+ * 2^5 <= t[2] <= 2^52 + 2^5 - 1
+ * 2^49 <= t[3] <= 2^52 + 2^49 - 1
+ * 2^59 < t[4] <= 2^61 + 2^60 - 2^29
+ *
+ * Thus, the value 's' (t[4] / 2^48) will be necessarily
+ * greater than 2048, and less than 12288.
+ */
+ s = t[4] >> 48;
+
+ d[0] = t[0] + s; /* d[0] <= 2^52 + 12287 */
+ w = t[1] - (s << 44);
+ d[1] = w & MASK52; /* d[1] <= 2^52 - 1 */
+ cc = -(w >> 52) & 0xFFF; /* cc <= 48 */
+ w = t[2] - cc;
+ cc = w >> 63; /* cc = 0 or 1 */
+ d[2] = w + (cc << 52); /* d[2] <= 2^52 + 31 */
+ w = t[3] - cc - (s << 36);
+ cc = w >> 63; /* cc = 0 or 1 */
+ d[3] = w + (cc << 52); /* d[3] <= 2^52 + 2^49 - 1 */
+ d[4] = (t[4] & MASK48) + (s << 16) - cc; /* d[4] < 2^48 + 2^30 */
+
+ /*
+ * If s = 0, then none of the limbs is modified, and there cannot
+ * be an overflow; if s != 0, then (s << 16) > cc, and there is
+ * no overflow either.
+ */
+}
+
+/*
+ * Montgomery multiplication in the field: d = a*b/2^260 mod p.
+ * Input: limbs must fit on 56 bits each.
+ * Output: partially reduced.
+ *
+ * Two equivalent implementations are provided: one using the
+ * 'unsigned __int128' type (BR_INT128) and one using the _umul128(),
+ * _addcarry_u64() and _subborrow_u64() intrinsics (BR_UMUL128).
+ * The result is accumulated in a local array, so d may be the same
+ * array as a or b.
+ */
+static void
+f256_montymul(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+ int i;
+ uint64_t t[5];
+
+ t[0] = 0;
+ t[1] = 0;
+ t[2] = 0;
+ t[3] = 0;
+ t[4] = 0;
+ for (i = 0; i < 5; i ++) {
+ uint64_t x, f, cc, w, s;
+ unsigned __int128 z;
+
+ /*
+ * Since limbs of a[] and b[] fit on 56 bits each,
+ * each individual product fits on 112 bits. Also,
+ * the factor f fits on 52 bits, so f<<48 fits on
+ * 112 bits too. This guarantees that carries (cc)
+ * will fit on 62 bits, thus no overflow.
+ *
+ * The operations below compute:
+ * t <- (t + x*b + f*p) / 2^52
+ * (one 52-bit limb is dropped per iteration; the five
+ * iterations together divide by 2^260).
+ */
+ x = a[i];
+ z = (unsigned __int128)b[0] * (unsigned __int128)x
+ + (unsigned __int128)t[0];
+ f = (uint64_t)z & MASK52;
+ cc = (uint64_t)(z >> 52);
+ z = (unsigned __int128)b[1] * (unsigned __int128)x
+ + (unsigned __int128)t[1] + cc
+ + ((unsigned __int128)f << 44);
+ t[0] = (uint64_t)z & MASK52;
+ cc = (uint64_t)(z >> 52);
+ z = (unsigned __int128)b[2] * (unsigned __int128)x
+ + (unsigned __int128)t[2] + cc;
+ t[1] = (uint64_t)z & MASK52;
+ cc = (uint64_t)(z >> 52);
+ z = (unsigned __int128)b[3] * (unsigned __int128)x
+ + (unsigned __int128)t[3] + cc
+ + ((unsigned __int128)f << 36);
+ t[2] = (uint64_t)z & MASK52;
+ cc = (uint64_t)(z >> 52);
+ z = (unsigned __int128)b[4] * (unsigned __int128)x
+ + (unsigned __int128)t[4] + cc
+ + ((unsigned __int128)f << 48)
+ - ((unsigned __int128)f << 16);
+ t[3] = (uint64_t)z & MASK52;
+ t[4] = (uint64_t)(z >> 52);
+
+ /*
+ * t[4] may be up to 62 bits here; we need to do a
+ * partial reduction. Note that limbs t[0] to t[3]
+ * fit on 52 bits each.
+ */
+ s = t[4] >> 48; /* s < 2^14 */
+ t[0] += s; /* t[0] < 2^52 + 2^14 */
+ w = t[1] - (s << 44);
+ t[1] = w & MASK52; /* t[1] < 2^52 */
+ cc = -(w >> 52) & 0xFFF; /* cc < 16 */
+ w = t[2] - cc;
+ t[2] = w & MASK52; /* t[2] < 2^52 */
+ cc = w >> 63; /* cc = 0 or 1 */
+ w = t[3] - cc - (s << 36);
+ t[3] = w & MASK52; /* t[3] < 2^52 */
+ cc = w >> 63; /* cc = 0 or 1 */
+ w = t[4] & MASK48;
+ t[4] = w + (s << 16) - cc; /* t[4] < 2^48 + 2^30 */
+
+ /*
+ * The final t[4] cannot overflow because cc is 0 or 1,
+ * and cc can be 1 only if s != 0.
+ */
+ }
+
+ d[0] = t[0];
+ d[1] = t[1];
+ d[2] = t[2];
+ d[3] = t[3];
+ d[4] = t[4];
+
+#elif BR_UMUL128
+
+ int i;
+ uint64_t t[5];
+
+ t[0] = 0;
+ t[1] = 0;
+ t[2] = 0;
+ t[3] = 0;
+ t[4] = 0;
+ for (i = 0; i < 5; i ++) {
+ uint64_t x, f, cc, w, s, zh, zl;
+ unsigned char k;
+
+ /*
+ * Since limbs of a[] and b[] fit on 56 bits each,
+ * each individual product fits on 112 bits. Also,
+ * the factor f fits on 52 bits, so f<<48 fits on
+ * 112 bits too. This guarantees that carries (cc)
+ * will fit on 62 bits, thus no overflow.
+ *
+ * The operations below compute:
+ * t <- (t + x*b + f*p) / 2^52
+ * (one 52-bit limb is dropped per iteration; the five
+ * iterations together divide by 2^260). Each 128-bit
+ * intermediate is held as the pair zh:zl.
+ */
+ x = a[i];
+ zl = _umul128(b[0], x, &zh);
+ k = _addcarry_u64(0, t[0], zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ f = zl & MASK52;
+ cc = (zl >> 52) | (zh << 12);
+
+ zl = _umul128(b[1], x, &zh);
+ k = _addcarry_u64(0, t[1], zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, cc, zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, f << 44, zl, &zl);
+ (void)_addcarry_u64(k, f >> 20, zh, &zh);
+ t[0] = zl & MASK52;
+ cc = (zl >> 52) | (zh << 12);
+
+ zl = _umul128(b[2], x, &zh);
+ k = _addcarry_u64(0, t[2], zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, cc, zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ t[1] = zl & MASK52;
+ cc = (zl >> 52) | (zh << 12);
+
+ zl = _umul128(b[3], x, &zh);
+ k = _addcarry_u64(0, t[3], zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, cc, zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, f << 36, zl, &zl);
+ (void)_addcarry_u64(k, f >> 28, zh, &zh);
+ t[2] = zl & MASK52;
+ cc = (zl >> 52) | (zh << 12);
+
+ zl = _umul128(b[4], x, &zh);
+ k = _addcarry_u64(0, t[4], zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, cc, zl, &zl);
+ (void)_addcarry_u64(k, 0, zh, &zh);
+ k = _addcarry_u64(0, f << 48, zl, &zl);
+ (void)_addcarry_u64(k, f >> 16, zh, &zh);
+ k = _subborrow_u64(0, zl, f << 16, &zl);
+ (void)_subborrow_u64(k, zh, f >> 48, &zh);
+ t[3] = zl & MASK52;
+ t[4] = (zl >> 52) | (zh << 12);
+
+ /*
+ * t[4] may be up to 62 bits here; we need to do a
+ * partial reduction. Note that limbs t[0] to t[3]
+ * fit on 52 bits each.
+ */
+ s = t[4] >> 48; /* s < 2^14 */
+ t[0] += s; /* t[0] < 2^52 + 2^14 */
+ w = t[1] - (s << 44);
+ t[1] = w & MASK52; /* t[1] < 2^52 */
+ cc = -(w >> 52) & 0xFFF; /* cc < 16 */
+ w = t[2] - cc;
+ t[2] = w & MASK52; /* t[2] < 2^52 */
+ cc = w >> 63; /* cc = 0 or 1 */
+ w = t[3] - cc - (s << 36);
+ t[3] = w & MASK52; /* t[3] < 2^52 */
+ cc = w >> 63; /* cc = 0 or 1 */
+ w = t[4] & MASK48;
+ t[4] = w + (s << 16) - cc; /* t[4] < 2^48 + 2^30 */
+
+ /*
+ * The final t[4] cannot overflow because cc is 0 or 1,
+ * and cc can be 1 only if s != 0.
+ */
+ }
+
+ d[0] = t[0];
+ d[1] = t[1];
+ d[2] = t[2];
+ d[3] = t[3];
+ d[4] = t[4];
+
+#endif
+}
+
+/*
+ * Montgomery squaring in the field: d = a*a/2^260 mod p. For now this
+ * simply forwards to f256_montymul() with both operands equal; being
+ * inline, the wrapper itself should cost nothing.
+ * TODO: see if a dedicated squaring routine would be faster.
+ */
+static inline void
+f256_montysquare(uint64_t *d, const uint64_t *a)
+{
+	const uint64_t *operand = a;
+
+	f256_montymul(d, operand, operand);
+}
+
+/*
+ * Convert a field element to Montgomery representation: d = a*R mod p,
+ * with R = 2^260 mod p.
+ */
+static void
+f256_tomonty(uint64_t *d, const uint64_t *a)
+{
+	/*
+	 * R^2 mod p = 2^520 mod p. A Montgomery multiplication divides
+	 * the product by R, so multiplying a by this constant gives
+	 * a*R^2/R = a*R mod p.
+	 */
+	static const uint64_t R_SQUARED[] = {
+		0x0000000000300, 0xFFFFFFFF00000, 0xFFFFEFFFFFFFB,
+		0xFDFFFFFFFFFFF, 0x0000004FFFFFF
+	};
+
+	f256_montymul(d, a, R_SQUARED);
+}
+
+/*
+ * Convert a field element back from Montgomery representation:
+ * d = a/2^260 mod p.
+ */
+static void
+f256_frommonty(uint64_t *d, const uint64_t *a)
+{
+	/*
+	 * The integer 1 as five limbs (only the lowest limb is set);
+	 * a Montgomery multiplication by it is a plain division by
+	 * 2^260 modulo p.
+	 */
+	static const uint64_t unit[5] = { 1 };
+
+	f256_montymul(d, a, unit);
+}
+
+/*
+ * Inversion in the field, operating on Montgomery representations.
+ * If the source value is 0 modulo p, the result is 0 or p.
+ */
+static void
+f256_invert(uint64_t *d, const uint64_t *a)
+{
+	/*
+	 * The inverse is a^(p-2) mod p. From the most significant bit
+	 * down, the exponent consists of:
+	 *   - 32 bits of value 1
+	 *   - 31 bits of value 0
+	 *   - 1 bit of value 1
+	 *   - 96 bits of value 0
+	 *   - 94 bits of value 1
+	 *   - 1 bit of value 0
+	 *   - 1 bit of value 1
+	 * The long runs of ones are handled 31 bits at a time with the
+	 * precomputed power a^(2^31-1).
+	 */
+	uint64_t acc[5], run31[5];
+	int bit, n;
+
+	/* run31 = a^(2^31-1): start from a, then append 30 one-bits. */
+	memcpy(run31, a, sizeof run31);
+	for (n = 1; n < 31; n ++) {
+		f256_montysquare(run31, run31);
+		f256_montymul(run31, run31, a);
+	}
+
+	/*
+	 * acc starts with the top 31 one-bits of the exponent; each
+	 * round squares it and, at selected positions, multiplies in
+	 * either a (one extra one-bit) or run31 (31 one-bits ending at
+	 * that position).
+	 */
+	memcpy(acc, run31, sizeof acc);
+	bit = 224;
+	while (bit >= 0) {
+		f256_montysquare(acc, acc);
+		if (bit == 224 || bit == 192 || bit == 2 || bit == 0) {
+			f256_montymul(acc, acc, a);
+		} else if (bit == 65 || bit == 34 || bit == 3) {
+			f256_montymul(acc, acc, run31);
+		}
+		bit --;
+	}
+	memcpy(d, acc, sizeof acc);
+}
+
+/*
+ * Finalize reduction.
+ * Input value should be partially reduced.
+ * On output, limbs a[0] to a[3] fit on 52 bits each, limb a[4] fits
+ * on 48 bits, and the integer is less than p.
+ *
+ * The value is modified in place. The choice between the two candidate
+ * results is made with a bit mask rather than a branch.
+ */
+static inline void
+f256_final_reduce(uint64_t *a)
+{
+	uint64_t r[5], t[5], w, cc;
+	int i;
+
+	/*
+	 * Propagate carries to ensure that limbs 0 to 3 fit on 52 bits.
+	 */
+	cc = 0;
+	for (i = 0; i < 5; i ++) {
+		w = a[i] + cc;
+		r[i] = w & MASK52;
+		cc = w >> 52;
+	}
+
+	/*
+	 * We compute t = r + (2^256 - p) = r + 2^224 - 2^192 - 2^96 + 1.
+	 * If t < 2^256, then r < p, and we return r. Otherwise, we
+	 * want to return r - p = t - 2^256.
+	 */
+
+	/*
+	 * Add 2^224 + 1, and propagate carries to ensure that limbs
+	 * t[0] to t[3] fit in 52 bits each.
+	 */
+	w = r[0] + 1;
+	t[0] = w & MASK52;
+	cc = w >> 52;
+	w = r[1] + cc;
+	t[1] = w & MASK52;
+	cc = w >> 52;
+	w = r[2] + cc;
+	t[2] = w & MASK52;
+	cc = w >> 52;
+	w = r[3] + cc;
+	t[3] = w & MASK52;
+	cc = w >> 52;
+	t[4] = r[4] + cc + BIT(16);
+
+	/*
+	 * Subtract 2^192 + 2^96. Since we just added 2^224 + 1, the
+	 * result cannot be negative.
+	 *
+	 * 2^96 is bit 44 of limb 1 and 2^192 is bit 36 of limb 3. The
+	 * borrow out of each limb must be carried into the next one;
+	 * in particular the borrow leaving limb 2 has to be subtracted
+	 * from limb 3 together with 2^36, otherwise the result is off
+	 * by 2^156 whenever the borrow chain crosses limb 2.
+	 */
+	w = t[1] - BIT(44);
+	t[1] = w & MASK52;
+	cc = w >> 63;
+	w = t[2] - cc;
+	t[2] = w & MASK52;
+	cc = w >> 63;
+	w = t[3] - BIT(36) - cc;
+	t[3] = w & MASK52;
+	cc = w >> 63;
+	t[4] -= cc;
+
+	/*
+	 * If the top limb t[4] fits on 48 bits, then r[] is already
+	 * in the proper range. Otherwise, t[] is the value to return
+	 * (truncated to 256 bits).
+	 */
+	cc = -(t[4] >> 48);
+	t[4] &= MASK48;
+	for (i = 0; i < 5; i ++) {
+		a[i] = r[i] ^ (cc & (r[i] ^ t[i]));
+	}
+}
+
+/*
+ * Points in affine and Jacobian coordinates.
+ *
+ * - In affine coordinates, the point-at-infinity cannot be encoded.
+ * - Jacobian coordinates (X,Y,Z) correspond to affine (X/Z^2,Y/Z^3);
+ * if Z = 0 then this is the point-at-infinity.
+ *
+ * Each coordinate is a field element of five 64-bit limbs. Jacobian
+ * points produced by point_decode() hold their coordinates in
+ * Montgomery representation.
+ */
+typedef struct {
+ uint64_t x[5];
+ uint64_t y[5];
+} p256_affine;
+
+typedef struct {
+ uint64_t x[5];
+ uint64_t y[5];
+ uint64_t z[5];
+} p256_jacobian;
+
+/*
+ * Decode a field element from 32 bytes in unsigned big-endian notation
+ * into five limbs: a[0] to a[3] receive 52 bits each and a[4] receives
+ * the top 48 bits. No reduction modulo p is performed.
+ */
+static void
+f256_decode(uint64_t *a, const unsigned char *buf)
+{
+	uint64_t q[4];
+	int i;
+
+	/*
+	 * Read four 64-bit words; q[0] is the least significant one
+	 * (the last 8 bytes of the buffer).
+	 */
+	for (i = 0; i < 4; i ++) {
+		q[3 - i] = br_dec64be(buf + 8 * i);
+	}
+
+	/* Re-slice the 256 bits into 52-bit limbs. */
+	a[0] = q[0] & MASK52;
+	a[1] = ((q[0] >> 52) | (q[1] << 12)) & MASK52;
+	a[2] = ((q[1] >> 40) | (q[2] << 24)) & MASK52;
+	a[3] = ((q[2] >> 28) | (q[3] << 36)) & MASK52;
+	a[4] = q[3] >> 16;
+}
+
+/*
+ * Encode a field element into 32 bytes, unsigned big-endian notation.
+ * The element MUST be fully reduced (limbs 0 to 3 on 52 bits, limb 4
+ * on 48 bits); otherwise limbs would overlap in the output words.
+ */
+static void
+f256_encode(unsigned char *buf, const uint64_t *a)
+{
+	uint64_t word_lo, word_ml, word_mh, word_hi;
+
+	/* Repack the five limbs into four 64-bit words. */
+	word_hi = (a[3] >> 36) | (a[4] << 16);
+	word_mh = (a[2] >> 24) | (a[3] << 28);
+	word_ml = (a[1] >> 12) | (a[2] << 40);
+	word_lo = a[0] | (a[1] << 52);
+
+	/* Most significant word goes first in the buffer. */
+	br_enc64be(buf + 0, word_hi);
+	br_enc64be(buf + 8, word_mh);
+	br_enc64be(buf + 16, word_ml);
+	br_enc64be(buf + 24, word_lo);
+}
+
+/*
+ * Decode a point. The returned point is in Jacobian coordinates, but
+ * with z = 1. If the encoding is invalid, or encodes a point which is
+ * not on the curve, or encodes the point at infinity, then this function
+ * returns 0. Otherwise, 1 is returned.
+ *
+ * The buffer is assumed to have length exactly 65 bytes.
+ *
+ * All steps are executed and P is written whatever the outcome; the
+ * checks only affect the returned value.
+ */
+static uint32_t
+point_decode(p256_jacobian *P, const unsigned char *buf)
+{
+ uint64_t x[5], y[5], t[5], x3[5], tt;
+ uint32_t r;
+
+ /*
+ * Header byte shall be 0x04.
+ */
+ r = EQ(buf[0], 0x04);
+
+ /*
+ * Decode X and Y coordinates, and convert them into
+ * Montgomery representation.
+ */
+ f256_decode(x, buf + 1);
+ f256_decode(y, buf + 33);
+ f256_tomonty(x, x);
+ f256_tomonty(y, y);
+
+ /*
+ * Verify y^2 = x^3 + A*x + B. In curve P-256, A = -3.
+ * Note that the Montgomery representation of 0 is 0. We must
+ * take care to apply the final reduction to make sure we have
+ * 0 and not p.
+ *
+ * The value computed in t is y^2 - x^3 + 3*x - B, which must
+ * be zero; tt is the OR of all its limbs, folded to 32 bits
+ * for the EQ() test.
+ */
+ f256_montysquare(t, y);
+ f256_montysquare(x3, x);
+ f256_montymul(x3, x3, x);
+ f256_sub(t, t, x3);
+ f256_add(t, t, x);
+ f256_add(t, t, x);
+ f256_add(t, t, x);
+ f256_sub(t, t, P256_B_MONTY);
+ f256_final_reduce(t);
+ tt = t[0] | t[1] | t[2] | t[3] | t[4];
+ r &= EQ((uint32_t)(tt | (tt >> 32)), 0);
+
+ /*
+ * Return the point in Jacobian coordinates (and Montgomery
+ * representation). F256_R is the Montgomery representation
+ * of 1.
+ */
+ memcpy(P->x, x, sizeof x);
+ memcpy(P->y, y, sizeof y);
+ memcpy(P->z, F256_R, sizeof F256_R);
+ return r;
+}
+
+/*
+ * Final conversion for a point:
+ * - The point is converted back to affine coordinates.
+ * - Final reduction is performed.
+ * - The point is encoded into the provided buffer.
+ *
+ * If the point is the point-at-infinity, all operations are performed,
+ * but the buffer contents are indeterminate, and 0 is returned. Otherwise,
+ * the encoded point is written in the buffer, and 1 is returned.
+ *
+ * The buffer receives 65 bytes: the 0x04 header, then X and Y on
+ * 32 bytes each.
+ */
+static uint32_t
+point_encode(unsigned char *buf, const p256_jacobian *P)
+{
+ uint64_t t1[5], t2[5], z;
+
+ /* Set t1 = 1/z^2 and t2 = 1/z^3. */
+ f256_invert(t2, P->z);
+ f256_montysquare(t1, t2);
+ f256_montymul(t2, t2, t1);
+
+ /* Compute affine coordinates x (in t1) and y (in t2). */
+ f256_montymul(t1, P->x, t1);
+ f256_montymul(t2, P->y, t2);
+
+ /* Convert back from Montgomery representation, and finalize
+ reductions. */
+ f256_frommonty(t1, t1);
+ f256_frommonty(t2, t2);
+ f256_final_reduce(t1);
+ f256_final_reduce(t2);
+
+ /* Encode. */
+ buf[0] = 0x04;
+ f256_encode(buf + 1, t1);
+ f256_encode(buf + 33, t2);
+
+ /* Return success if and only if P->z != 0. The limbs of P->z are
+ ORed together as stored and folded to 32 bits for NEQ(). */
+ z = P->z[0] | P->z[1] | P->z[2] | P->z[3] | P->z[4];
+ return NEQ((uint32_t)(z | z >> 32), 0);
+}
+
+/*
+ * Point doubling in Jacobian coordinates: point P is doubled.
+ * Note: if the source point is the point-at-infinity, then the result is
+ * still the point-at-infinity, which is correct. Moreover, if the three
+ * coordinates were zero, then they still are zero in the returned value.
+ */
+static void
+p256_double(p256_jacobian *P)
+{
+ /*
+ * Doubling formulas are:
+ *
+ * s = 4*x*y^2
+ * m = 3*(x + z^2)*(x - z^2)
+ * x' = m^2 - 2*s
+ * y' = m*(s - x') - 8*y^4
+ * z' = 2*y*z
+ *
+ * These formulas work for all points, including points of order 2
+ * and points at infinity:
+ * - If y = 0 then z' = 0. But there is no such point in P-256
+ * anyway.
+ * - If z = 0 then z' = 0.
+ */
+ uint64_t t1[5], t2[5], t3[5], t4[5];
+
+ /*
+ * Compute z^2 in t1.
+ */
+ f256_montysquare(t1, P->z);
+
+ /*
+ * Compute x+z^2 in t2 and x-z^2 in t1.
+ */
+ f256_add(t2, P->x, t1);
+ f256_sub(t1, P->x, t1);
+
+ /*
+ * Compute m = 3*(x+z^2)*(x-z^2) in t1.
+ */
+ f256_montymul(t3, t1, t2);
+ f256_add(t1, t3, t3);
+ f256_add(t1, t3, t1);
+
+ /*
+ * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3).
+ */
+ f256_montysquare(t3, P->y);
+ f256_add(t3, t3, t3);
+ f256_montymul(t2, P->x, t3);
+ f256_add(t2, t2, t2);
+
+ /*
+ * Compute x' = m^2 - 2*s.
+ */
+ f256_montysquare(P->x, t1);
+ f256_sub(P->x, P->x, t2);
+ f256_sub(P->x, P->x, t2);
+
+ /*
+ * Compute z' = 2*y*z. This must be done before P->y is
+ * overwritten below.
+ */
+ f256_montymul(t4, P->y, P->z);
+ f256_add(P->z, t4, t4);
+ f256_partial_reduce(P->z);
+
+ /*
+ * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+ * 2*y^2 in t3.
+ */
+ f256_sub(t2, t2, P->x);
+ f256_montymul(P->y, t1, t2);
+ f256_montysquare(t4, t3);
+ f256_add(t4, t4, t4);
+ f256_sub(P->y, P->y, t4);
+}
+
+/*
+ * Point addition (Jacobian coordinates): P1 is replaced with P1+P2.
+ * This function computes the wrong result in the following cases:
+ *
+ * - If P1 == 0 but P2 != 0
+ * - If P1 != 0 but P2 == 0
+ * - If P1 == P2
+ *
+ * In all three cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ * - P1 and P2 have the same Y coordinate.
+ * - P1 == 0 and P2 == 0.
+ * - The Y coordinate of one of the points is 0 and the other point is
+ * the point at infinity.
+ *
+ * The third case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
+ * can apply the following:
+ *
+ * - If the result is not the point at infinity, then it is correct.
+ * - Otherwise, if the returned value is 1, then this is a case of
+ * P1+P2 == 0, so the result is indeed the point at infinity.
+ * - Otherwise, P1 == P2, so a "double" operation should have been
+ * performed.
+ *
+ * Note that you can get a returned value of 0 with a correct result,
+ * e.g. if P1 and P2 have the same Y coordinate, but distinct X coordinates.
+ */
+static uint32_t
+p256_add(p256_jacobian *P1, const p256_jacobian *P2)
+{
+ /*
+ * Addition formulas are:
+ *
+ * u1 = x1 * z2^2
+ * u2 = x2 * z1^2
+ * s1 = y1 * z2^3
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1 * z2
+ */
+ uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt;
+ uint32_t ret;
+
+ /*
+ * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+ */
+ f256_montysquare(t3, P2->z);
+ f256_montymul(t1, P1->x, t3);
+ f256_montymul(t4, P2->z, t3);
+ f256_montymul(t3, P1->y, t4);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ f256_montysquare(t4, P1->z);
+ f256_montymul(t2, P2->x, t4);
+ f256_montymul(t5, P1->z, t4);
+ f256_montymul(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ * We need to test whether r is zero, so we will do some extra
+ * reduce. ret is set to 1 if r is non-zero, 0 otherwise.
+ */
+ f256_sub(t2, t2, t1);
+ f256_sub(t4, t4, t3);
+ f256_final_reduce(t4);
+ tt = t4[0] | t4[1] | t4[2] | t4[3] | t4[4];
+ ret = (uint32_t)(tt | (tt >> 32));
+ ret = (ret | -ret) >> 31;
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ f256_montysquare(t7, t2);
+ f256_montymul(t6, t1, t7);
+ f256_montymul(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ f256_montysquare(P1->x, t4);
+ f256_sub(P1->x, P1->x, t5);
+ f256_sub(P1->x, P1->x, t6);
+ f256_sub(P1->x, P1->x, t6);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ f256_sub(t6, t6, P1->x);
+ f256_montymul(P1->y, t4, t6);
+ f256_montymul(t1, t5, t3);
+ f256_sub(P1->y, P1->y, t1);
+
+ /*
+ * Compute z3 = h*z1*z2.
+ */
+ f256_montymul(t1, P1->z, P2->z);
+ f256_montymul(P1->z, t1, t2);
+
+ return ret;
+}
+
+/*
+ * Point addition (mixed coordinates): P1 is replaced with P1+P2.
+ * This is a specialised function for the case when P2 is a non-zero point
+ * in affine coordinates.
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ * - If P1 == 0
+ * - If P1 == P2
+ *
+ * In both cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ * - P1 and P2 have the same Y (affine) coordinate.
+ * - The Y coordinate of P2 is 0 and P1 is the point at infinity.
+ *
+ * The second case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 on input, then the caller
+ * can apply the following:
+ *
+ * - If the result is not the point at infinity, then it is correct.
+ * - Otherwise, if the returned value is 1, then this is a case of
+ * P1+P2 == 0, so the result is indeed the point at infinity.
+ * - Otherwise, P1 == P2, so a "double" operation should have been
+ * performed.
+ *
+ * Again, a value of 0 may be returned in some cases where the addition
+ * result is correct.
+ */
+static uint32_t
+p256_add_mixed(p256_jacobian *P1, const p256_affine *P2)
+{
+ /*
+ * Addition formulas are:
+ *
+ * u1 = x1
+ * u2 = x2 * z1^2
+ * s1 = y1
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1
+ */
+ uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt;
+ uint32_t ret;
+
+ /*
+ * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+ */
+ memcpy(t1, P1->x, sizeof t1);
+ memcpy(t3, P1->y, sizeof t3);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ f256_montysquare(t4, P1->z);
+ f256_montymul(t2, P2->x, t4);
+ f256_montymul(t5, P1->z, t4);
+ f256_montymul(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ * We need to test whether r is zero, so we will do some extra
+ * reduce. ret is set to 1 if r is non-zero, 0 otherwise.
+ */
+ f256_sub(t2, t2, t1);
+ f256_sub(t4, t4, t3);
+ f256_final_reduce(t4);
+ tt = t4[0] | t4[1] | t4[2] | t4[3] | t4[4];
+ ret = (uint32_t)(tt | (tt >> 32));
+ ret = (ret | -ret) >> 31;
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ f256_montysquare(t7, t2);
+ f256_montymul(t6, t1, t7);
+ f256_montymul(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ f256_montysquare(P1->x, t4);
+ f256_sub(P1->x, P1->x, t5);
+ f256_sub(P1->x, P1->x, t6);
+ f256_sub(P1->x, P1->x, t6);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ f256_sub(t6, t6, P1->x);
+ f256_montymul(P1->y, t4, t6);
+ f256_montymul(t1, t5, t3);
+ f256_sub(P1->y, P1->y, t1);
+
+ /*
+ * Compute z3 = h*z1 (P2 is affine, so z2 is implicitly 1).
+ */
+ f256_montymul(P1->z, P1->z, t2);
+
+ return ret;
+}
+
+#if 0
+/* unused */
+/*
+ * Point addition (mixed coordinates, complete): P1 is replaced with P1+P2.
+ * This is a specialised function for the case when P2 is a non-zero point
+ * in affine coordinates. The result in P1 is correct in all cases.
+ *
+ * NOTE(review): declared uint32_t, but no value is actually returned.
+ */
+static uint32_t
+p256_add_complete_mixed(p256_jacobian *P1, const p256_affine *P2)
+{
+ /*
+ * Addition formulas, in the general case, are:
+ *
+ * u1 = x1
+ * u2 = x2 * z1^2
+ * s1 = y1
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1
+ *
+ * These formulas mishandle the two following cases:
+ *
+ * - If P1 is the point-at-infinity (z1 = 0), then z3 is
+ * incorrectly set to 0.
+ *
+ * - If P1 = P2, then u1 = u2 and s1 = s2, and x3, y3 and z3
+ * are all set to 0.
+ *
+ * However, if P1 + P2 = 0, then u1 = u2 but s1 != s2, and then
+ * we correctly get z3 = 0 (the point-at-infinity).
+ *
+ * To fix the case P1 = 0, we perform at the end a copy of P2
+ * over P1, conditional to z1 = 0.
+ *
+ * For P1 = P2: in that case, both h and r are set to 0, and
+ * we get x3, y3 and z3 equal to 0. We can test for that
+ * occurrence to make a mask which will be all-one if P1 = P2,
+ * or all-zero otherwise; then we can compute the double of P2
+ * and add it, combined with the mask, to (x3,y3,z3).
+ *
+ * Using the doubling formulas in p256_double() on (x2,y2),
+ * simplifying since P2 is affine (i.e. z2 = 1, implicitly),
+ * we get:
+ * s = 4*x2*y2^2
+ * m = 3*(x2 + 1)*(x2 - 1)
+ * x' = m^2 - 2*s
+ * y' = m*(s - x') - 8*y2^4
+ * z' = 2*y2
+ * which requires only 6 multiplications. Added to the 11
+ * multiplications of the normal mixed addition in Jacobian
+ * coordinates, we get a cost of 17 multiplications in total.
+ */
+ uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt, zz;
+ int i;
+
+ /*
+ * Set zz to -1 if P1 is the point at infinity, 0 otherwise.
+ */
+ zz = P1->z[0] | P1->z[1] | P1->z[2] | P1->z[3] | P1->z[4];
+ zz = ((zz | -zz) >> 63) - (uint64_t)1;
+
+ /*
+ * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+ */
+ memcpy(t1, P1->x, sizeof t1);
+ memcpy(t3, P1->y, sizeof t3);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ f256_montysquare(t4, P1->z);
+ f256_montymul(t2, P2->x, t4);
+ f256_montymul(t5, P1->z, t4);
+ f256_montymul(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ * Both get a final reduction below, for the zero tests.
+ */
+ f256_sub(t2, t2, t1);
+ f256_sub(t4, t4, t3);
+
+ /*
+ * If both h = 0 and r = 0, then P1 = P2, and we want to set
+ * the mask tt to -1; otherwise, the mask will be 0.
+ */
+ f256_final_reduce(t2);
+ f256_final_reduce(t4);
+ tt = t2[0] | t2[1] | t2[2] | t2[3] | t2[4]
+ | t4[0] | t4[1] | t4[2] | t4[3] | t4[4];
+ tt = ((tt | -tt) >> 63) - (uint64_t)1;
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ f256_montysquare(t7, t2);
+ f256_montymul(t6, t1, t7);
+ f256_montymul(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ f256_montysquare(P1->x, t4);
+ f256_sub(P1->x, P1->x, t5);
+ f256_sub(P1->x, P1->x, t6);
+ f256_sub(P1->x, P1->x, t6);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ f256_sub(t6, t6, P1->x);
+ f256_montymul(P1->y, t4, t6);
+ f256_montymul(t1, t5, t3);
+ f256_sub(P1->y, P1->y, t1);
+
+ /*
+ * Compute z3 = h*z1.
+ */
+ f256_montymul(P1->z, P1->z, t2);
+
+ /*
+ * The "double" result, in case P1 = P2.
+ */
+
+ /*
+ * Compute z' = 2*y2 (in t1).
+ */
+ f256_add(t1, P2->y, P2->y);
+ f256_partial_reduce(t1);
+
+ /*
+ * Compute 2*(y2^2) (in t2) and s = 4*x2*(y2^2) (in t3).
+ */
+ f256_montysquare(t2, P2->y);
+ f256_add(t2, t2, t2);
+ f256_add(t3, t2, t2);
+ f256_montymul(t3, P2->x, t3);
+
+ /*
+ * Compute m = 3*(x2^2 - 1) (in t4).
+ */
+ f256_montysquare(t4, P2->x);
+ f256_sub(t4, t4, F256_R);
+ f256_add(t5, t4, t4);
+ f256_add(t4, t4, t5);
+
+ /*
+ * Compute x' = m^2 - 2*s (in t5).
+ */
+ f256_montysquare(t5, t4);
+ f256_sub(t5, t5, t3);
+ f256_sub(t5, t5, t3);
+
+ /*
+ * Compute y' = m*(s - x') - 8*y2^4 (in t6).
+ */
+ f256_sub(t6, t3, t5);
+ f256_montymul(t6, t6, t4);
+ f256_montysquare(t7, t2);
+ f256_sub(t6, t6, t7);
+ f256_sub(t6, t6, t7);
+
+ /*
+ * We now have the alternate (doubling) coordinates in (t5,t6,t1).
+ * We combine them with (x3,y3,z3).
+ */
+ for (i = 0; i < 5; i ++) {
+ P1->x[i] |= tt & t5[i];
+ P1->y[i] |= tt & t6[i];
+ P1->z[i] |= tt & t1[i];
+ }
+
+ /*
+ * If P1 = 0, then we get z3 = 0 (which is invalid); if z1 is 0,
+ * then we want to replace the result with a copy of P2. The
+ * test on z1 was done at the start, in the zz mask.
+ */
+ for (i = 0; i < 5; i ++) {
+ P1->x[i] ^= zz & (P1->x[i] ^ P2->x[i]);
+ P1->y[i] ^= zz & (P1->y[i] ^ P2->y[i]);
+ P1->z[i] ^= zz & (P1->z[i] ^ F256_R[i]);
+ }
+}
+#endif
+
+/*
+ * Inner function for computing a point multiplication: R <- k*P. A window
+ * W is provided, with W[n] = (n+1)*P for n = 0 to 14 (affine coordinates).
+ *
+ * Assumptions:
+ * - All provided points are valid points on the curve.
+ * - Multiplier is non-zero, and smaller than the curve order.
+ * - Everything is in Montgomery representation.
+ */
+static void
+point_mul_inner(p256_jacobian *R, const p256_affine *W,
+ const unsigned char *k, size_t klen)
+{
+ p256_jacobian Q;
+ uint32_t qz;
+
+ memset(&Q, 0, sizeof Q);
+ qz = 1; /* 1 as long as Q is still all-zeros */
+ while (klen -- > 0) {
+ int i;
+ unsigned bk;
+
+ bk = *k ++;
+ for (i = 0; i < 2; i ++) { /* two 4-bit chunks per byte, high one first */
+ uint32_t bits;
+ uint32_t bnz;
+ p256_affine T;
+ p256_jacobian U;
+ uint32_t n;
+ int j;
+ uint64_t m;
+
+ p256_double(&Q);
+ p256_double(&Q);
+ p256_double(&Q);
+ p256_double(&Q);
+ bits = (bk >> 4) & 0x0F;
+ bnz = NEQ(bits, 0);
+
+ /*
+ * Lookup point in window. If the bits are 0,
+ * we get something invalid, which is not a
+ * problem because we will use it only if the
+ * bits are non-zero. Every entry is read and
+ * masked, whatever the value of the bits.
+ */
+ memset(&T, 0, sizeof T);
+ for (n = 0; n < 15; n ++) {
+ m = -(uint64_t)EQ(bits, n + 1);
+ T.x[0] |= m & W[n].x[0];
+ T.x[1] |= m & W[n].x[1];
+ T.x[2] |= m & W[n].x[2];
+ T.x[3] |= m & W[n].x[3];
+ T.x[4] |= m & W[n].x[4];
+ T.y[0] |= m & W[n].y[0];
+ T.y[1] |= m & W[n].y[1];
+ T.y[2] |= m & W[n].y[2];
+ T.y[3] |= m & W[n].y[3];
+ T.y[4] |= m & W[n].y[4];
+ }
+
+ U = Q;
+ p256_add_mixed(&U, &T); /* return value is not used here */
+
+ /*
+ * If qz is still 1, then Q was all-zeros, and this
+ * is conserved through p256_double(). In that case,
+ * with non-zero bits, Q is set to (T.x, T.y, F256_R).
+ */
+ m = -(uint64_t)(bnz & qz);
+ for (j = 0; j < 5; j ++) {
+ Q.x[j] ^= m & (Q.x[j] ^ T.x[j]);
+ Q.y[j] ^= m & (Q.y[j] ^ T.y[j]);
+ Q.z[j] ^= m & (Q.z[j] ^ F256_R[j]);
+ }
+ CCOPY(bnz & ~qz, &Q, &U, sizeof Q); /* presumably Q <- U when the flag is 1 (CCOPY defined elsewhere) */
+ qz &= ~bnz;
+ bk <<= 4;
+ }
+ }
+ *R = Q;
+}
+
+/*
+ * Convert a window from Jacobian to affine coordinates. A single
+ * field inversion is used. This function works for windows up to
+ * 32 elements.
+ *
+ * The destination array (aff[]) and the source array (jac[]) may
+ * overlap, provided that the start of aff[] is not after the start of
+ * jac[]. Even if the arrays do _not_ overlap, the source array is
+ * modified.
+ */
+static void
+window_to_affine(p256_affine *aff, p256_jacobian *jac, int num)
+{
+ /*
+ * Convert the window points to affine coordinates. We use the
+ * following trick to share a single inversion computation: if
+ * we have z1, z2, z3, and z4, and want to invert all of them,
+ * we compute u = 1/(z1*z2*z3*z4), and then we have:
+ * 1/z1 = u*z2*z3*z4
+ * 1/z2 = u*z1*z3*z4
+ * 1/z3 = u*z1*z2*z4
+ * 1/z4 = u*z1*z2*z3
+ *
+ * The partial products are computed recursively:
+ *
+ * - on input (z_1,z_2), return (z_2,z_1) and z_1*z_2
+ * - on input (z_1,z_2,... z_n):
+ * recurse on (z_1,z_2,... z_(n/2)) -> r1 and m1
+ * recurse on (z_(n/2+1),z_(n/2+2)... z_n) -> r2 and m2
+ * multiply elements of r1 by m2 -> s1
+ * multiply elements of r2 by m1 -> s2
+ * return s1||s2 and m1*m2
+ *
+ * In the example below, we suppose that we have 14 elements.
+ * Let z1, z2,... zE be the 14 values to invert (index noted in
+ * hexadecimal, starting at 1).
+ *
+ * - Depth 1:
+ * swap(z1, z2); z12 = z1*z2
+ * swap(z3, z4); z34 = z3*z4
+ * swap(z5, z6); z56 = z5*z6
+ * swap(z7, z8); z78 = z7*z8
+ * swap(z9, zA); z9A = z9*zA
+ * swap(zB, zC); zBC = zB*zC
+ * swap(zD, zE); zDE = zD*zE
+ *
+ * - Depth 2:
+ * z1 <- z1*z34, z2 <- z2*z34, z3 <- z3*z12, z4 <- z4*z12
+ * z1234 = z12*z34
+ * z5 <- z5*z78, z6 <- z6*z78, z7 <- z7*z56, z8 <- z8*z56
+ * z5678 = z56*z78
+ * z9 <- z9*zBC, zA <- zA*zBC, zB <- zB*z9A, zC <- zC*z9A
+ * z9ABC = z9A*zBC
+ *
+ * - Depth 3:
+ * z1 <- z1*z5678, z2 <- z2*z5678, z3 <- z3*z5678, z4 <- z4*z5678
+ * z5 <- z5*z1234, z6 <- z6*z1234, z7 <- z7*z1234, z8 <- z8*z1234
+ * z12345678 = z1234*z5678
+ * z9 <- z9*zDE, zA <- zA*zDE, zB <- zB*zDE, zC <- zC*zDE
+ * zD <- zD*z9ABC, zE <- zE*z9ABC
+ * z9ABCDE = z9ABC*zDE
+ *
+ * - Depth 4:
+ * multiply z1..z8 by z9ABCDE
+ * multiply z9..zE by z12345678
+ * final z = z12345678*z9ABCDE
+ */
+
+ uint64_t z[16][5];
+ int i, k, s;
+#define zt (z[15])
+#define zu (z[14])
+#define zv (z[13])
+
+ /*
+ * First recursion step (pairwise swapping and multiplication).
+ * If there is an odd number of elements, then we "invent" an
+ * extra one with coordinate Z = 1 (in Montgomery representation).
+ */
+ for (i = 0; (i + 1) < num; i += 2) {
+ memcpy(zt, jac[i].z, sizeof zt);
+ memcpy(jac[i].z, jac[i + 1].z, sizeof zt);
+ memcpy(jac[i + 1].z, zt, sizeof zt);
+ f256_montymul(z[i >> 1], jac[i].z, jac[i + 1].z);
+ }
+ if ((num & 1) != 0) {
+ memcpy(z[num >> 1], jac[num - 1].z, sizeof zt);
+ memcpy(jac[num - 1].z, F256_R, sizeof F256_R);
+ }
+
+ /*
+ * Further recursion steps. On entry to each step, groups of 's'
+ * points are done; k = log2(s). NOTE(review): the caller uses num = 15;
+ * for e.g. num = 14, z[(i >> k) ^ 1] looks unwritten -- confirm.
+ */
+ for (k = 1, s = 2; s < num; k ++, s <<= 1) {
+ int n;
+
+ for (i = 0; i < num; i ++) {
+ f256_montymul(jac[i].z, jac[i].z, z[(i >> k) ^ 1]);
+ }
+ n = (num + s - 1) >> k;
+ for (i = 0; i < (n >> 1); i ++) {
+ f256_montymul(z[i], z[i << 1], z[(i << 1) + 1]);
+ }
+ if ((n & 1) != 0) {
+ memmove(z[n >> 1], z[n], sizeof zt);
+ }
+ }
+
+ /*
+ * Invert the final result, and convert all points.
+ */
+ f256_invert(zt, z[0]);
+ for (i = 0; i < num; i ++) {
+ f256_montymul(zv, jac[i].z, zt); /* zv = 1/z_i */
+ f256_montysquare(zu, zv); /* zu = 1/z_i^2 */
+ f256_montymul(zv, zv, zu); /* zv = 1/z_i^3 */
+ f256_montymul(aff[i].x, jac[i].x, zu); /* x / z^2 */
+ f256_montymul(aff[i].y, jac[i].y, zv); /* y / z^3 */
+ }
+}
+
+/*
+ * Multiply the provided point by an integer (P is replaced with k*P).
+ * Assumptions:
+ * - Source point is a valid curve point.
+ * - Source point is not the point-at-infinity.
+ * - Integer is not 0, and is lower than the curve order.
+ * If these conditions are not met, then the result is indeterminate
+ * (but the process is still constant-time).
+ */
+static void
+p256_mul(p256_jacobian *P, const unsigned char *k, size_t klen)
+{
+ union {
+ p256_affine aff[15];
+ p256_jacobian jac[15];
+ } window;
+ int i;
+
+ /*
+ * Compute window, in Jacobian coordinates: jac[i - 1] = i*P.
+ */
+ window.jac[0] = *P;
+ for (i = 2; i < 16; i ++) {
+ window.jac[i - 1] = window.jac[(i >> 1) - 1];
+ if ((i & 1) == 0) {
+ p256_double(&window.jac[i - 1]); /* i*P = 2*((i/2)*P) */
+ } else {
+ p256_add(&window.jac[i - 1], &window.jac[i >> 1]); /* (i>>1)*P + ((i>>1)+1)*P */
+ }
+ }
+
+ /*
+ * Convert the window points to affine coordinates, in place (aff[]
+ * overlays jac[] in the union). window[0] is the source point.
+ */
+ window_to_affine(window.aff, window.jac, 15);
+
+ /*
+ * Perform point multiplication.
+ */
+ point_mul_inner(P, window.aff, k, klen);
+}
+
+/*
+ * Precomputed window for the conventional generator: P256_Gwin[n]
+ * contains (n+1)*G (affine coordinates, in Montgomery representation).
+ */
+static const p256_affine P256_Gwin[] = {
+ {
+ { 0x30D418A9143C1, 0xC4FEDB60179E7, 0x62251075BA95F,
+ 0x5C669FB732B77, 0x08905F76B5375 },
+ { 0x5357CE95560A8, 0x43A19E45CDDF2, 0x21F3258B4AB8E,
+ 0xD8552E88688DD, 0x0571FF18A5885 }
+ },
+ {
+ { 0x46D410DDD64DF, 0x0B433827D8500, 0x1490D9AA6AE3C,
+ 0xA3A832205038D, 0x06BB32E52DCF3 },
+ { 0x48D361BEE1A57, 0xB7B236FF82F36, 0x042DBE152CD7C,
+ 0xA3AA9A8FB0E92, 0x08C577517A5B8 }
+ },
+ {
+ { 0x3F904EEBC1272, 0x9E87D81FBFFAC, 0xCBBC98B027F84,
+ 0x47E46AD77DD87, 0x06936A3FD6FF7 },
+ { 0x5C1FC983A7EBD, 0xC3861FE1AB04C, 0x2EE98E583E47A,
+ 0xC06A88208311A, 0x05F06A2AB587C }
+ },
+ {
+ { 0xB50D46918DCC5, 0xD7623C17374B0, 0x100AF24650A6E,
+ 0x76ABCDAACACE8, 0x077362F591B01 },
+ { 0xF24CE4CBABA68, 0x17AD6F4472D96, 0xDDD22E1762847,
+ 0x862EB6C36DEE5, 0x04B14C39CC5AB }
+ },
+ {
+ { 0x8AAEC45C61F5C, 0x9D4B9537DBE1B, 0x76C20C90EC649,
+ 0x3C7D41CB5AAD0, 0x0907960649052 },
+ { 0x9B4AE7BA4F107, 0xF75EB882BEB30, 0x7A1F6873C568E,
+ 0x915C540A9877E, 0x03A076BB9DD1E }
+ },
+ {
+ { 0x47373E77664A1, 0xF246CEE3E4039, 0x17A3AD55AE744,
+ 0x673C50A961A5B, 0x03074B5964213 },
+ { 0x6220D377E44BA, 0x30DFF14B593D3, 0x639F11299C2B5,
+ 0x75F5424D44CEF, 0x04C9916DEA07F }
+ },
+ {
+ { 0x354EA0173B4F1, 0x3C23C00F70746, 0x23BB082BD2021,
+ 0xE03E43EAAB50C, 0x03BA5119D3123 },
+ { 0xD0303F5B9D4DE, 0x17DA67BDD2847, 0xC941956742F2F,
+ 0x8670F933BDC77, 0x0AEDD9164E240 }
+ },
+ {
+ { 0x4CD19499A78FB, 0x4BF9B345527F1, 0x2CFC6B462AB5C,
+ 0x30CDF90F02AF0, 0x0763891F62652 },
+ { 0xA3A9532D49775, 0xD7F9EBA15F59D, 0x60BBF021E3327,
+ 0xF75C23C7B84BE, 0x06EC12F2C706D }
+ },
+ {
+ { 0x6E8F264E20E8E, 0xC79A7A84175C9, 0xC8EB00ABE6BFE,
+ 0x16A4CC09C0444, 0x005B3081D0C4E },
+ { 0x777AA45F33140, 0xDCE5D45E31EB7, 0xB12F1A56AF7BE,
+ 0xF9B2B6E019A88, 0x086659CDFD835 }
+ },
+ {
+ { 0xDBD19DC21EC8C, 0x94FCF81392C18, 0x250B4998F9868,
+ 0x28EB37D2CD648, 0x0C61C947E4B34 },
+ { 0x407880DD9E767, 0x0C83FBE080C2B, 0x9BE5D2C43A899,
+ 0xAB4EF7D2D6577, 0x08719A555B3B4 }
+ },
+ {
+ { 0x260A6245E4043, 0x53E7FDFE0EA7D, 0xAC1AB59DE4079,
+ 0x072EFF3A4158D, 0x0E7090F1949C9 },
+ { 0x85612B944E886, 0xE857F61C81A76, 0xAD643D250F939,
+ 0x88DAC0DAA891E, 0x089300244125B }
+ },
+ {
+ { 0x1AA7D26977684, 0x58A345A3304B7, 0x37385EABDEDEF,
+ 0x155E409D29DEE, 0x0EE1DF780B83E },
+ { 0x12D91CBB5B437, 0x65A8956370CAC, 0xDE6D66170ED2F,
+ 0xAC9B8228CFA8A, 0x0FF57C95C3238 }
+ },
+ {
+ { 0x25634B2ED7097, 0x9156FD30DCCC4, 0x9E98110E35676,
+ 0x7594CBCD43F55, 0x038477ACC395B },
+ { 0x2B90C00EE17FF, 0xF842ED2E33575, 0x1F5BC16874838,
+ 0x7968CD06422BD, 0x0BC0876AB9E7B }
+ },
+ {
+ { 0xA35BB0CF664AF, 0x68F9707E3A242, 0x832660126E48F,
+ 0x72D2717BF54C6, 0x0AAE7333ED12C },
+ { 0x2DB7995D586B1, 0xE732237C227B5, 0x65E7DBBE29569,
+ 0xBBBD8E4193E2A, 0x052706DC3EAA1 }
+ },
+ {
+ { 0xD8B7BC60055BE, 0xD76E27E4B72BC, 0x81937003CC23E,
+ 0xA090E337424E4, 0x02AA0E43EAD3D },
+ { 0x524F6383C45D2, 0x422A41B2540B8, 0x8A4797D766355,
+ 0xDF444EFA6DE77, 0x0042170A9079A }
+ },
+};
+
+/*
+ * Multiply the conventional generator of the curve by the provided
+ * integer, using the precomputed window P256_Gwin. Result is written in *P.
+ *
+ * Assumptions:
+ * - Integer is not 0, and is lower than the curve order.
+ * If this condition is not met, then the result is indeterminate
+ * (but the process is still constant-time).
+ */
+static void
+p256_mulgen(p256_jacobian *P, const unsigned char *k, size_t klen)
+{
+ point_mul_inner(P, P256_Gwin, k, klen);
+}
+
+/*
+ * Return 1 if all of the following hold:
+ * - klen <= 32
+ * - k != 0
+ * - k is lower than the curve order
+ * Otherwise, return 0.
+ *
+ * Constant-time behaviour: only klen may be observable.
+ */
+static uint32_t
+check_scalar(const unsigned char *k, size_t klen)
+{
+ uint32_t z;
+ int32_t c;
+ size_t u;
+
+ if (klen > 32) {
+ return 0;
+ }
+ z = 0;
+ for (u = 0; u < klen; u ++) {
+ z |= k[u]; /* z stays 0 only if every byte of k is 0 */
+ }
+ if (klen == 32) {
+ c = 0;
+ for (u = 0; u < klen; u ++) {
+ c |= -(int32_t)EQ0(c) & CMP(k[u], P256_N[u]); /* once c is non-zero it is kept (EQ0/CMP defined elsewhere) */
+ }
+ } else {
+ c = -1; /* fewer than 32 bytes: treated as lower than the order */
+ }
+ return NEQ(z, 0) & LT0(c);
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+ const unsigned char *k, size_t klen, int curve)
+{
+ uint32_t r;
+ p256_jacobian P;
+
+ (void)curve; /* the curve identifier is ignored */
+ if (Glen != 65) { /* only 65-byte encoded points are accepted */
+ return 0;
+ }
+ r = check_scalar(k, klen);
+ r &= point_decode(&P, G);
+ p256_mul(&P, k, klen); /* done even if r is already 0 (no early exit) */
+ r &= point_encode(G, &P); /* result overwrites the input buffer G */
+ return r;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+ const unsigned char *k, size_t klen, int curve)
+{
+ p256_jacobian P;
+
+ (void)curve; /* the curve identifier is ignored */
+ p256_mulgen(&P, k, klen); /* k is not validated here (no check_scalar call) */
+ point_encode(R, &P); /* return value of point_encode() is ignored */
+ return 65; /* presumably the length of the encoded point written to R */
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+ const unsigned char *x, size_t xlen,
+ const unsigned char *y, size_t ylen, int curve)
+{
+ /*
+ * We might want to use Shamir's trick here: make a composite
+ * window of u*P+v*Q points, to merge the two doubling-ladders
+ * into one. This, however, has some complications:
+ *
+ * - During the computation, we may hit the point-at-infinity.
+ * Thus, we would need p256_add_complete_mixed() (complete
+ * formulas for point addition), with a higher cost (17 muls
+ * instead of 11).
+ *
+ * - A 4-bit window would be too large, since it would involve
+ * 16*16-1 = 255 points. To keep the same number of window
+ * points as in the p256_mul() case, we would need to reduce
+ * the window size to 2 bits, and thus perform twice as many
+ * non-doubling point additions.
+ *
+ * - The window may itself contain the point-at-infinity, and
+ * thus cannot, in all generality, be made of affine points.
+ * Instead, we would need to make it a window of points in
+ * Jacobian coordinates. Even p256_add_complete_mixed() would
+ * be inappropriate.
+ *
+ * For these reasons, the code below performs two separate
+ * point multiplications, then computes the final point addition
+ * (which is both a "normal" addition, and a doubling, to handle
+ * all cases).
+ */
+
+ p256_jacobian P, Q;
+ uint32_t r, t, s;
+ uint64_t z;
+
+ (void)curve; /* the curve identifier is ignored */
+ if (len != 65) {
+ return 0;
+ }
+ r = point_decode(&P, A);
+ p256_mul(&P, x, xlen); /* P <- x*A */
+ if (B == NULL) {
+ p256_mulgen(&Q, y, ylen); /* Q <- y*G (conventional generator) */
+ } else {
+ r &= point_decode(&Q, B);
+ p256_mul(&Q, y, ylen); /* Q <- y*B */
+ }
+
+ /*
+ * The final addition may fail in case both points are equal.
+ */
+ t = p256_add(&P, &Q);
+ f256_final_reduce(P.z);
+ z = P.z[0] | P.z[1] | P.z[2] | P.z[3] | P.z[4];
+ s = EQ((uint32_t)(z | (z >> 32)), 0); /* s = 1 if the reduced Z of the sum is 0 */
+ p256_double(&Q); /* candidate result for the P = Q case */
+
+ /*
+ * If s is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+ * have the following:
+ *
+ * s = 0, t = 0 return P (normal addition)
+ * s = 0, t = 1 return P (normal addition)
+ * s = 1, t = 0 return Q (a 'double' case)
+ * s = 1, t = 1 report an error (P+Q = 0)
+ */
+ CCOPY(s & ~t, &P, &Q, sizeof Q);
+ point_encode(A, &P); /* result overwrites the input buffer A */
+ r &= ~(s & t);
+ return r;
+}
+
+/* see bearssl_ec.h (the br_ec_impl field meanings are declared there) */
+const br_ec_impl br_ec_p256_m62 = {
+ (uint32_t)0x00800000, /* only bit 23 set; presumably a supported-curves mask -- confirm in bearssl_ec.h */
+ &api_generator,
+ &api_order,
+ &api_xoff,
+ &api_mul,
+ &api_mulgen,
+ &api_muladd
+};
+
+/* see bearssl_ec.h; returns a pointer to the br_ec_p256_m62 implementation */
+const br_ec_impl *
+br_ec_p256_m62_get(void)
+{
+ return &br_ec_p256_m62;
+}
+
+#else
+
+/* see bearssl_ec.h; #else variant: no implementation here, so 0 is returned */
+const br_ec_impl *
+br_ec_p256_m62_get(void)
+{
+ return 0;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/ec/ec_p256_m64.c b/test/monniaux/BearSSL/src/ec/ec_p256_m64.c
new file mode 100644
index 00000000..5a7ea177
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_p256_m64.c
@@ -0,0 +1,1730 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+static const unsigned char P256_G[] = {
+ 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
+ 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
+ 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
+ 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
+ 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
+ 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
+ 0x68, 0x37, 0xBF, 0x51, 0xF5
+};
+
+static const unsigned char P256_N[] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
+ 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
+ 0x25, 0x51
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+ (void)curve; /* the curve identifier is ignored */
+ *len = sizeof P256_G; /* length of the P256_G array, in bytes */
+ return P256_G;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+ (void)curve; /* the curve identifier is ignored */
+ *len = sizeof P256_N; /* length of the P256_N array, in bytes */
+ return P256_N;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+ (void)curve; /* the curve identifier is ignored */
+ *len = 32; /* presumably the byte length of the X coordinate -- see bearssl_ec.h */
+ return 1; /* presumably the offset of X in an encoded point -- see bearssl_ec.h */
+}
+
+/*
+ * A field element is four 64-bit limbs in basis 2^64 (least significant
+ * first). Values may reach up to 2^256-1. Montgomery multiplication is used.
+ */
+
+/* R = 2^256 mod p, i.e. the value 1 in Montgomery representation */
+static const uint64_t F256_R[] = {
+ 0x0000000000000001, 0xFFFFFFFF00000000,
+ 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFE
+};
+
+/* Curve equation is y^2 = x^3 - 3*x + B. This constant is B*R mod p
+ (Montgomery representation of B). */
+static const uint64_t P256_B_MONTY[] = {
+ 0xD89CDF6229C4BDDF, 0xACF005CD78843090,
+ 0xE5A220ABF7212ED6, 0xDC30061D04874834
+};
+
+/*
+ * Addition in the field: d <- a + b; 2^256 - p is added on carry out.
+ */
+static inline void
+f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+ unsigned __int128 w;
+ uint64_t t;
+
+ w = (unsigned __int128)a[0] + b[0];
+ d[0] = (uint64_t)w;
+ w = (unsigned __int128)a[1] + b[1] + (w >> 64);
+ d[1] = (uint64_t)w;
+ w = (unsigned __int128)a[2] + b[2] + (w >> 64);
+ d[2] = (uint64_t)w;
+ w = (unsigned __int128)a[3] + b[3] + (w >> 64);
+ d[3] = (uint64_t)w;
+ t = (uint64_t)(w >> 64); /* carry out of the 256-bit sum (0 or 1) */
+
+ /*
+ * 2^256 = 2^224 - 2^192 - 2^96 + 1 in the field.
+ */
+ w = (unsigned __int128)d[0] + t;
+ d[0] = (uint64_t)w;
+ w = (unsigned __int128)d[1] + (w >> 64) - (t << 32);
+ d[1] = (uint64_t)w;
+ /* Here, carry "w >> 64" can only be 0 or -1 */
+ w = (unsigned __int128)d[2] - ((w >> 64) & 1);
+ d[2] = (uint64_t)w;
+ /* Again, carry is 0 or -1 */
+ d[3] += (uint64_t)(w >> 64) + (t << 32) - t;
+
+#elif BR_UMUL128
+
+ unsigned char cc;
+ uint64_t t;
+
+ cc = _addcarry_u64(0, a[0], b[0], &d[0]);
+ cc = _addcarry_u64(cc, a[1], b[1], &d[1]);
+ cc = _addcarry_u64(cc, a[2], b[2], &d[2]);
+ cc = _addcarry_u64(cc, a[3], b[3], &d[3]);
+
+ /*
+ * If there is a carry, then we want to subtract p, which we
+ * do by adding 2^256 - p.
+ */
+ t = cc;
+ cc = _addcarry_u64(cc, d[0], 0, &d[0]);
+ cc = _addcarry_u64(cc, d[1], -(t << 32), &d[1]);
+ cc = _addcarry_u64(cc, d[2], -t, &d[2]);
+ (void)_addcarry_u64(cc, d[3], (t << 32) - (t << 1), &d[3]);
+
+#endif
+}
+
+/*
+ * Subtraction in the field: d <- a - b; p is added back on borrow.
+ */
+static inline void
+f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+ unsigned __int128 w;
+ uint64_t t;
+
+ w = (unsigned __int128)a[0] - b[0];
+ d[0] = (uint64_t)w;
+ w = (unsigned __int128)a[1] - b[1] - ((w >> 64) & 1);
+ d[1] = (uint64_t)w;
+ w = (unsigned __int128)a[2] - b[2] - ((w >> 64) & 1);
+ d[2] = (uint64_t)w;
+ w = (unsigned __int128)a[3] - b[3] - ((w >> 64) & 1);
+ d[3] = (uint64_t)w;
+ t = (uint64_t)(w >> 64) & 1; /* borrow out of the 256-bit difference */
+
+ /*
+ * p = 2^256 - 2^224 + 2^192 + 2^96 - 1.
+ */
+ w = (unsigned __int128)d[0] - t;
+ d[0] = (uint64_t)w;
+ w = (unsigned __int128)d[1] + (t << 32) - ((w >> 64) & 1);
+ d[1] = (uint64_t)w;
+ /* Here, carry "w >> 64" can only be 0 or +1 */
+ w = (unsigned __int128)d[2] + (w >> 64);
+ d[2] = (uint64_t)w;
+ /* Again, carry is 0 or +1 */
+ d[3] += (uint64_t)(w >> 64) - (t << 32) + t;
+
+#elif BR_UMUL128
+
+ unsigned char cc;
+ uint64_t t;
+
+ cc = _subborrow_u64(0, a[0], b[0], &d[0]);
+ cc = _subborrow_u64(cc, a[1], b[1], &d[1]);
+ cc = _subborrow_u64(cc, a[2], b[2], &d[2]);
+ cc = _subborrow_u64(cc, a[3], b[3], &d[3]);
+
+ /*
+ * If there is a borrow, then we need to add p.
+ */
+ t = cc;
+ cc = _addcarry_u64(0, d[0], -t, &d[0]);
+ cc = _addcarry_u64(cc, d[1], (-t) >> 32, &d[1]);
+ cc = _addcarry_u64(cc, d[2], 0, &d[2]);
+ (void)_addcarry_u64(cc, d[3], t - (t << 32), &d[3]);
+
+#endif
+}
+
+/*
+ * Montgomery multiplication in the field: d <- a*b/2^256 mod p.
+ */
+static void
+f256_montymul(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+ uint64_t x, f, t0, t1, t2, t3, t4;
+ unsigned __int128 z, ff;
+ int i;
+
+ /*
+ * When computing d <- d + a[u]*b, we also add f*p such
+ * that d + a[u]*b + f*p is a multiple of 2^64. Since
+ * p = -1 mod 2^64, we can compute f = d[0] + a[u]*b[0] mod 2^64.
+ */
+
+ /*
+ * Step 1: t <- (a[0]*b + f*p) / 2^64
+ * We have f = a[0]*b[0] mod 2^64. Since p = -1 mod 2^64, this
+ * ensures that (a[0]*b + f*p) is a multiple of 2^64.
+ *
+ * We also have: f*p = f*2^256 - f*2^224 + f*2^192 + f*2^96 - f.
+ */
+ x = a[0];
+ z = (unsigned __int128)b[0] * x;
+ f = (uint64_t)z;
+ z = (unsigned __int128)b[1] * x + (z >> 64) + (uint64_t)(f << 32);
+ t0 = (uint64_t)z;
+ z = (unsigned __int128)b[2] * x + (z >> 64) + (uint64_t)(f >> 32);
+ t1 = (uint64_t)z;
+ z = (unsigned __int128)b[3] * x + (z >> 64) + f;
+ t2 = (uint64_t)z;
+ t3 = (uint64_t)(z >> 64);
+ ff = ((unsigned __int128)f << 64) - ((unsigned __int128)f << 32); /* the f*2^256 - f*2^224 terms of f*p */
+ z = (unsigned __int128)t2 + (uint64_t)ff;
+ t2 = (uint64_t)z;
+ z = (unsigned __int128)t3 + (z >> 64) + (ff >> 64);
+ t3 = (uint64_t)z;
+ t4 = (uint64_t)(z >> 64);
+
+ /*
+ * Steps 2 to 4: t <- (t + a[i]*b + f*p) / 2^64
+ */
+ for (i = 1; i < 4; i ++) {
+ x = a[i];
+
+ /* t <- (t + x*b - f) / 2^64 */
+ z = (unsigned __int128)b[0] * x + t0;
+ f = (uint64_t)z;
+ z = (unsigned __int128)b[1] * x + t1 + (z >> 64);
+ t0 = (uint64_t)z;
+ z = (unsigned __int128)b[2] * x + t2 + (z >> 64);
+ t1 = (uint64_t)z;
+ z = (unsigned __int128)b[3] * x + t3 + (z >> 64);
+ t2 = (uint64_t)z;
+ z = t4 + (z >> 64);
+ t3 = (uint64_t)z;
+ t4 = (uint64_t)(z >> 64);
+
+ /* t <- t + f*2^32, carry in the upper half of z */
+ z = (unsigned __int128)t0 + (uint64_t)(f << 32);
+ t0 = (uint64_t)z;
+ z = (z >> 64) + (unsigned __int128)t1 + (uint64_t)(f >> 32);
+ t1 = (uint64_t)z;
+
+ /* t <- t + f*2^192 - f*2^160 + f*2^128 */
+ ff = ((unsigned __int128)f << 64)
+ - ((unsigned __int128)f << 32) + f;
+ z = (z >> 64) + (unsigned __int128)t2 + (uint64_t)ff;
+ t2 = (uint64_t)z;
+ z = (unsigned __int128)t3 + (z >> 64) + (ff >> 64);
+ t3 = (uint64_t)z;
+ t4 += (uint64_t)(z >> 64);
+ }
+
+ /*
+ * At that point, we have computed t = (a*b + F*p) / 2^256, where
+ * F is a 256-bit integer whose limbs are the "f" coefficients
+ * in the steps above. We have:
+ * a <= 2^256-1
+ * b <= 2^256-1
+ * F <= 2^256-1
+ * Hence:
+ * a*b + F*p <= (2^256-1)*(2^256-1) + p*(2^256-1)
+ * a*b + F*p <= 2^256*(2^256 - 2 + p) + 1 - p
+ * Therefore:
+ * t < 2^256 + p - 2
+ * Since p < 2^256, it follows that:
+ * t4 can be only 0 or 1
+ * t - p < 2^256
+ * We can therefore subtract p from t, conditionally on t4, to
+ * get a nonnegative result that fits on 256 bits.
+ */
+ z = (unsigned __int128)t0 + t4;
+ t0 = (uint64_t)z;
+ z = (unsigned __int128)t1 - (t4 << 32) + (z >> 64);
+ t1 = (uint64_t)z;
+ z = (unsigned __int128)t2 - (z >> 127);
+ t2 = (uint64_t)z;
+ t3 = t3 - (uint64_t)(z >> 127) - t4 + (t4 << 32);
+
+ d[0] = t0;
+ d[1] = t1;
+ d[2] = t2;
+ d[3] = t3;
+
+#elif BR_UMUL128
+
+ uint64_t x, f, t0, t1, t2, t3, t4;
+ uint64_t zl, zh, ffl, ffh;
+ unsigned char k, m;
+ int i;
+
+ /*
+ * When computing d <- d + a[u]*b, we also add f*p such
+ * that d + a[u]*b + f*p is a multiple of 2^64. Since
+ * p = -1 mod 2^64, we can compute f = d[0] + a[u]*b[0] mod 2^64.
+ */
+
+ /*
+ * Step 1: t <- (a[0]*b + f*p) / 2^64
+ * We have f = a[0]*b[0] mod 2^64. Since p = -1 mod 2^64, this
+ * ensures that (a[0]*b + f*p) is a multiple of 2^64.
+ *
+ * We also have: f*p = f*2^256 - f*2^224 + f*2^192 + f*2^96 - f.
+ */
+ x = a[0];
+
+ zl = _umul128(b[0], x, &zh);
+ f = zl;
+ t0 = zh;
+
+ zl = _umul128(b[1], x, &zh);
+ k = _addcarry_u64(0, zl, t0, &zl);
+ (void)_addcarry_u64(k, zh, 0, &zh);
+ k = _addcarry_u64(0, zl, f << 32, &zl);
+ (void)_addcarry_u64(k, zh, 0, &zh);
+ t0 = zl;
+ t1 = zh;
+
+ zl = _umul128(b[2], x, &zh);
+ k = _addcarry_u64(0, zl, t1, &zl);
+ (void)_addcarry_u64(k, zh, 0, &zh);
+ k = _addcarry_u64(0, zl, f >> 32, &zl);
+ (void)_addcarry_u64(k, zh, 0, &zh);
+ t1 = zl;
+ t2 = zh;
+
+ zl = _umul128(b[3], x, &zh);
+ k = _addcarry_u64(0, zl, t2, &zl);
+ (void)_addcarry_u64(k, zh, 0, &zh);
+ k = _addcarry_u64(0, zl, f, &zl);
+ (void)_addcarry_u64(k, zh, 0, &zh);
+ t2 = zl;
+ t3 = zh;
+
+ t4 = _addcarry_u64(0, t3, f, &t3); /* the f*2^256 term of f*p */
+ k = _subborrow_u64(0, t2, f << 32, &t2); /* the -f*2^224 term of f*p (low part) */
+ k = _subborrow_u64(k, t3, f >> 32, &t3); /* the -f*2^224 term of f*p (high part) */
+ (void)_subborrow_u64(k, t4, 0, &t4);
+
+ /*
+ * Steps 2 to 4: t <- (t + a[i]*b + f*p) / 2^64
+ */
+ for (i = 1; i < 4; i ++) {
+ x = a[i];
+ /* f = t0 + x * b[0]; -- computed below */
+
+ /* t <- (t + x*b - f) / 2^64 */
+ zl = _umul128(b[0], x, &zh);
+ k = _addcarry_u64(0, zl, t0, &f);
+ (void)_addcarry_u64(k, zh, 0, &t0);
+
+ zl = _umul128(b[1], x, &zh);
+ k = _addcarry_u64(0, zl, t0, &zl);
+ (void)_addcarry_u64(k, zh, 0, &zh);
+ k = _addcarry_u64(0, zl, t1, &t0);
+ (void)_addcarry_u64(k, zh, 0, &t1);
+
+ zl = _umul128(b[2], x, &zh);
+ k = _addcarry_u64(0, zl, t1, &zl);
+ (void)_addcarry_u64(k, zh, 0, &zh);
+ k = _addcarry_u64(0, zl, t2, &t1);
+ (void)_addcarry_u64(k, zh, 0, &t2);
+
+ zl = _umul128(b[3], x, &zh);
+ k = _addcarry_u64(0, zl, t2, &zl);
+ (void)_addcarry_u64(k, zh, 0, &zh);
+ k = _addcarry_u64(0, zl, t3, &t2);
+ (void)_addcarry_u64(k, zh, 0, &t3);
+
+ t4 = _addcarry_u64(0, t3, t4, &t3);
+
+ /* t <- t + f*2^32, carry in k */
+ k = _addcarry_u64(0, t0, f << 32, &t0);
+ k = _addcarry_u64(k, t1, f >> 32, &t1);
+
+ /* t <- t + f*2^192 - f*2^160 + f*2^128 */
+ m = _subborrow_u64(0, f, f << 32, &ffl);
+ (void)_subborrow_u64(m, f, f >> 32, &ffh);
+ k = _addcarry_u64(k, t2, ffl, &t2);
+ k = _addcarry_u64(k, t3, ffh, &t3);
+ (void)_addcarry_u64(k, t4, 0, &t4);
+ }
+
+ /*
+ * At that point, we have computed t = (a*b + F*p) / 2^256, where
+ * F is a 256-bit integer whose limbs are the "f" coefficients
+ * in the steps above. We have:
+ * a <= 2^256-1
+ * b <= 2^256-1
+ * F <= 2^256-1
+ * Hence:
+ * a*b + F*p <= (2^256-1)*(2^256-1) + p*(2^256-1)
+ * a*b + F*p <= 2^256*(2^256 - 2 + p) + 1 - p
+ * Therefore:
+ * t < 2^256 + p - 2
+ * Since p < 2^256, it follows that:
+ * t4 can be only 0 or 1
+ * t - p < 2^256
+ * We can therefore subtract p from t, conditionally on t4, to
+ * get a nonnegative result that fits on 256 bits.
+ */
+ k = _addcarry_u64(0, t0, t4, &t0);
+ k = _addcarry_u64(k, t1, -(t4 << 32), &t1);
+ k = _addcarry_u64(k, t2, -t4, &t2);
+ (void)_addcarry_u64(k, t3, (t4 << 32) - (t4 << 1), &t3);
+
+ d[0] = t0;
+ d[1] = t1;
+ d[2] = t2;
+ d[3] = t3;
+
+#endif
+}
+
+/*
+ * Montgomery squaring in the field (d <- a*a/2^256 mod p); currently a
+ * basic wrapper around f256_montymul() (inline, should be optimized away).
+ * TODO: see if some extra speed can be gained here.
+ */
+static inline void
+f256_montysquare(uint64_t *d, const uint64_t *a)
+{
+ f256_montymul(d, a, a);
+}
+
+/*
+ * Convert to Montgomery representation: d <- a*R mod p, with R = 2^256.
+ */
+static void
+f256_tomonty(uint64_t *d, const uint64_t *a)
+{
+ /*
+ * R2 = 2^512 mod p.
+ * If R = 2^256 mod p, then R2 = R^2 mod p; and the Montgomery
+ * multiplication of a by R2 is: a*R2/R = a*R mod p, i.e. the
+ * conversion to Montgomery representation.
+ */
+ static const uint64_t R2[] = {
+ 0x0000000000000003,
+ 0xFFFFFFFBFFFFFFFF,
+ 0xFFFFFFFFFFFFFFFE,
+ 0x00000004FFFFFFFD
+ };
+
+ f256_montymul(d, a, R2);
+}
+
+/*
+ * Convert from Montgomery representation (d <- a/2^256 mod p).
+ */
+static void
+f256_frommonty(uint64_t *d, const uint64_t *a)
+{
+ /*
+ * Montgomery multiplication by 1 is division by 2^256 modulo p.
+ */
+ static const uint64_t one[] = { 1, 0, 0, 0 };
+
+ f256_montymul(d, a, one); /* point_encode() follows up with f256_final_reduce() */
+}
+
+/*
+ * Inversion in the field. If the source value is 0 modulo p, then this
+ * returns 0 or p. This function uses Montgomery representation.
+ */
+static void
+f256_invert(uint64_t *d, const uint64_t *a)
+{
+ /*
+ * We compute a^(p-2) mod p. The exponent pattern (from high to
+ * low) is:
+ * - 32 bits of value 1
+ * - 31 bits of value 0
+ * - 1 bit of value 1
+ * - 96 bits of value 0
+ * - 94 bits of value 1
+ * - 1 bit of value 0
+ * - 1 bit of value 1
+ * To speed up the square-and-multiply algorithm, we precompute
+ * a^(2^31-1).
+ */
+
+ uint64_t r[4], t[4];
+ int i;
+
+ memcpy(t, a, sizeof t);
+ for (i = 0; i < 30; i ++) { /* t <- a^(2^31-1), i.e. a run of 31 one bits */
+ f256_montysquare(t, t);
+ f256_montymul(t, t, a);
+ }
+
+ memcpy(r, t, sizeof t); /* r starts with 31 of the 32 leading one bits */
+ for (i = 224; i >= 0; i --) { /* one squaring per remaining exponent bit */
+ f256_montysquare(r, r);
+ switch (i) {
+ case 0: /* single one bit at position i: multiply by a */
+ case 2:
+ case 192:
+ case 224:
+ f256_montymul(r, r, a);
+ break;
+ case 3: /* run of 31 one bits ending at position i: multiply by t */
+ case 34:
+ case 65:
+ f256_montymul(r, r, t);
+ break;
+ }
+ }
+ memcpy(d, r, sizeof r); /* d is written only here, after the last read of a */
+}
+
+/*
+ * Finalize reduction.
+ * Input value fits on 256 bits. This function subtracts p if and only
+ * if the input is greater than or equal to p.
+ */
+static inline void
+f256_final_reduce(uint64_t *a)
+{
+#if BR_INT128
+
+ uint64_t t0, t1, t2, t3, cc;
+ unsigned __int128 z;
+
+ /*
+ * We add 2^256 - p = 2^224 - 2^192 - 2^96 + 1 to a. If there is no
+ * carry, then a < p; otherwise, the addition result we computed
+ * (truncated to 256 bits) is the value we must return.
+ */
+ z = (unsigned __int128)a[0] + 1;
+ t0 = (uint64_t)z;
+ z = (unsigned __int128)a[1] + (z >> 64) - ((uint64_t)1 << 32); /* bit 127 set on borrow */
+ t1 = (uint64_t)z;
+ z = (unsigned __int128)a[2] - (z >> 127); /* propagate the borrow */
+ t2 = (uint64_t)z;
+ z = (unsigned __int128)a[3] - (z >> 127) + 0xFFFFFFFF;
+ t3 = (uint64_t)z;
+ cc = -(uint64_t)(z >> 64); /* all-ones if the addition carried out of 256 bits */
+
+ a[0] ^= cc & (a[0] ^ t0); /* branchless select: a <- t when cc is all-ones */
+ a[1] ^= cc & (a[1] ^ t1);
+ a[2] ^= cc & (a[2] ^ t2);
+ a[3] ^= cc & (a[3] ^ t3);
+
+#elif BR_UMUL128
+
+ uint64_t t0, t1, t2, t3, m;
+ unsigned char k;
+
+ k = _addcarry_u64(0, a[0], (uint64_t)1, &t0); /* same constant, as four limbs */
+ k = _addcarry_u64(k, a[1], -((uint64_t)1 << 32), &t1);
+ k = _addcarry_u64(k, a[2], -(uint64_t)1, &t2);
+ k = _addcarry_u64(k, a[3], ((uint64_t)1 << 32) - 2, &t3);
+ m = -(uint64_t)k; /* all-ones if the addition carried out of 256 bits */
+
+ a[0] ^= m & (a[0] ^ t0); /* branchless select: a <- t when m is all-ones */
+ a[1] ^= m & (a[1] ^ t1);
+ a[2] ^= m & (a[2] ^ t2);
+ a[3] ^= m & (a[3] ^ t3);
+
+#endif
+}
+
+/*
+ * Points in affine and Jacobian coordinates.
+ *
+ * - In affine coordinates, the point-at-infinity cannot be encoded.
+ * - Jacobian coordinates (X,Y,Z) correspond to affine (X/Z^2,Y/Z^3);
+ * if Z = 0 then this is the point-at-infinity.
+ */
+typedef struct {
+ uint64_t x[4]; /* four 64-bit limbs, least significant limb first */
+ uint64_t y[4];
+} p256_affine;
+
+typedef struct {
+ uint64_t x[4]; /* four 64-bit limbs, least significant limb first */
+ uint64_t y[4];
+ uint64_t z[4]; /* all limbs zero encodes the point-at-infinity */
+} p256_jacobian;
+
+/*
+ * Decode a point. The returned point is in Jacobian coordinates, but
+ * with z = 1. If the encoding is invalid, or encodes a point which is
+ * not on the curve, or encodes the point at infinity, then this function
+ * returns 0. Otherwise, 1 is returned.
+ *
+ * The buffer is assumed to have length exactly 65 bytes; *P is always set.
+ */
+static uint32_t
+point_decode(p256_jacobian *P, const unsigned char *buf)
+{
+ uint64_t x[4], y[4], t[4], x3[4], tt;
+ uint32_t r;
+
+ /*
+ * Header byte shall be 0x04.
+ */
+ r = EQ(buf[0], 0x04);
+
+ /*
+ * Decode X and Y coordinates, and convert them into
+ * Montgomery representation.
+ */
+ x[3] = br_dec64be(buf + 1); /* big-endian encoding: top limb comes first */
+ x[2] = br_dec64be(buf + 9);
+ x[1] = br_dec64be(buf + 17);
+ x[0] = br_dec64be(buf + 25);
+ y[3] = br_dec64be(buf + 33);
+ y[2] = br_dec64be(buf + 41);
+ y[1] = br_dec64be(buf + 49);
+ y[0] = br_dec64be(buf + 57);
+ f256_tomonty(x, x);
+ f256_tomonty(y, y);
+
+ /*
+ * Verify y^2 = x^3 + A*x + B. In curve P-256, A = -3.
+ * Note that the Montgomery representation of 0 is 0. We must
+ * take care to apply the final reduction to make sure we have
+ * 0 and not p.
+ */
+ f256_montysquare(t, y);
+ f256_montysquare(x3, x);
+ f256_montymul(x3, x3, x);
+ f256_sub(t, t, x3); /* t = y^2 - x^3 */
+ f256_add(t, t, x); /* three additions: t = y^2 - x^3 + 3*x */
+ f256_add(t, t, x);
+ f256_add(t, t, x);
+ f256_sub(t, t, P256_B_MONTY); /* t = y^2 - (x^3 - 3*x + B) */
+ f256_final_reduce(t);
+ tt = t[0] | t[1] | t[2] | t[3];
+ r &= EQ((uint32_t)(tt | (tt >> 32)), 0); /* fold 64 bits into 32 for EQ() */
+
+ /*
+ * Return the point in Jacobian coordinates (and Montgomery
+ * representation).
+ */
+ memcpy(P->x, x, sizeof x);
+ memcpy(P->y, y, sizeof y);
+ memcpy(P->z, F256_R, sizeof F256_R); /* z = 1, as stated in the header comment */
+ return r;
+}
+
+/*
+ * Final conversion for a point:
+ * - The point is converted back to affine coordinates.
+ * - Final reduction is performed.
+ * - The point is encoded into the provided buffer (65 bytes are written).
+ *
+ * If the point is the point-at-infinity, all operations are performed,
+ * but the buffer contents are indeterminate, and 0 is returned. Otherwise,
+ * the encoded point is written in the buffer, and 1 is returned.
+ */
+static uint32_t
+point_encode(unsigned char *buf, const p256_jacobian *P)
+{
+ uint64_t t1[4], t2[4], z;
+
+ /* Set t1 = 1/z^2 and t2 = 1/z^3. */
+ f256_invert(t2, P->z); /* t2 = 1/z */
+ f256_montysquare(t1, t2);
+ f256_montymul(t2, t2, t1);
+
+ /* Compute affine coordinates x (in t1) and y (in t2). */
+ f256_montymul(t1, P->x, t1);
+ f256_montymul(t2, P->y, t2);
+
+ /* Convert back from Montgomery representation, and finalize
+ reductions. */
+ f256_frommonty(t1, t1);
+ f256_frommonty(t2, t2);
+ f256_final_reduce(t1);
+ f256_final_reduce(t2);
+
+ /* Encode. */
+ buf[0] = 0x04; /* same header byte that point_decode() requires */
+ br_enc64be(buf + 1, t1[3]); /* big-endian: most significant limb first */
+ br_enc64be(buf + 9, t1[2]);
+ br_enc64be(buf + 17, t1[1]);
+ br_enc64be(buf + 25, t1[0]);
+ br_enc64be(buf + 33, t2[3]);
+ br_enc64be(buf + 41, t2[2]);
+ br_enc64be(buf + 49, t2[1]);
+ br_enc64be(buf + 57, t2[0]);
+
+ /* Return success if and only if P->z != 0. */
+ z = P->z[0] | P->z[1] | P->z[2] | P->z[3];
+ return NEQ((uint32_t)(z | z >> 32), 0); /* fold 64 bits into 32 for NEQ() */
+}
+
+/*
+ * Point doubling in Jacobian coordinates: point P is doubled.
+ * Note: if the source point is the point-at-infinity, then the result is
+ * still the point-at-infinity, which is correct. Moreover, if the three
+ * coordinates were zero, then they still are zero in the returned value.
+ *
+ * (Note: this is true even without the final reduction: if the three
+ * coordinates are encoded as four words of value zero each, then the
+ * result will also have all-zero coordinate encodings, not the alternate
+ * encoding as the integer p.)
+ */
+static void
+p256_double(p256_jacobian *P)
+{
+ /*
+ * Doubling formulas are:
+ *
+ * s = 4*x*y^2
+ * m = 3*(x + z^2)*(x - z^2)
+ * x' = m^2 - 2*s
+ * y' = m*(s - x') - 8*y^4
+ * z' = 2*y*z
+ *
+ * These formulas work for all points, including points of order 2
+ * and points at infinity:
+ * - If y = 0 then z' = 0. But there is no such point in P-256
+ * anyway.
+ * - If z = 0 then z' = 0.
+ */
+ uint64_t t1[4], t2[4], t3[4], t4[4];
+
+ /*
+ * Compute z^2 in t1.
+ */
+ f256_montysquare(t1, P->z);
+
+ /*
+ * Compute x+z^2 in t2 and x-z^2 in t1.
+ */
+ f256_add(t2, P->x, t1);
+ f256_sub(t1, P->x, t1);
+
+ /*
+ * Compute m = 3*(x+z^2)*(x-z^2) in t1.
+ */
+ f256_montymul(t3, t1, t2);
+ f256_add(t1, t3, t3);
+ f256_add(t1, t3, t1);
+
+ /*
+ * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3).
+ */
+ f256_montysquare(t3, P->y);
+ f256_add(t3, t3, t3);
+ f256_montymul(t2, P->x, t3);
+ f256_add(t2, t2, t2);
+
+ /*
+ * Compute x' = m^2 - 2*s.
+ */
+ f256_montysquare(P->x, t1);
+ f256_sub(P->x, P->x, t2);
+ f256_sub(P->x, P->x, t2);
+
+ /*
+ * Compute z' = 2*y*z.
+ */
+ f256_montymul(t4, P->y, P->z); /* must read P->y before y' overwrites it below */
+ f256_add(P->z, t4, t4);
+
+ /*
+ * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+ * 2*y^2 in t3.
+ */
+ f256_sub(t2, t2, P->x);
+ f256_montymul(P->y, t1, t2);
+ f256_montysquare(t4, t3); /* t4 = (2*y^2)^2 = 4*y^4 */
+ f256_add(t4, t4, t4); /* t4 = 8*y^4 */
+ f256_sub(P->y, P->y, t4);
+}
+
+/*
+ * Point addition (Jacobian coordinates): P1 is replaced with P1+P2.
+ * This function computes the wrong result in the following cases:
+ *
+ * - If P1 == 0 but P2 != 0
+ * - If P1 != 0 but P2 == 0
+ * - If P1 == P2
+ *
+ * In all three cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ * - P1 and P2 have the same Y coordinate.
+ * - P1 == 0 and P2 == 0.
+ * - The Y coordinate of one of the points is 0 and the other point is
+ * the point at infinity.
+ *
+ * The third case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
+ * can apply the following:
+ *
+ * - If the result is not the point at infinity, then it is correct.
+ * - Otherwise, if the returned value is 1, then this is a case of
+ * P1+P2 == 0, so the result is indeed the point at infinity.
+ * - Otherwise, P1 == P2, so a "double" operation should have been
+ * performed.
+ *
+ * Note that you can get a returned value of 0 with a correct result,
+ * e.g. if P1 and P2 have the same Y coordinate, but distinct X coordinates.
+ */
+static uint32_t
+p256_add(p256_jacobian *P1, const p256_jacobian *P2)
+{
+ /*
+ * Addition formulas are:
+ *
+ * u1 = x1 * z2^2
+ * u2 = x2 * z1^2
+ * s1 = y1 * z2^3
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1 * z2
+ */
+ uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt;
+ uint32_t ret;
+
+ /*
+ * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+ */
+ f256_montysquare(t3, P2->z);
+ f256_montymul(t1, P1->x, t3);
+ f256_montymul(t4, P2->z, t3);
+ f256_montymul(t3, P1->y, t4);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ f256_montysquare(t4, P1->z);
+ f256_montymul(t2, P2->x, t4);
+ f256_montymul(t5, P1->z, t4);
+ f256_montymul(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ * We need to test whether r is zero, so we will do some extra
+ * reduce.
+ */
+ f256_sub(t2, t2, t1);
+ f256_sub(t4, t4, t3);
+ f256_final_reduce(t4);
+ tt = t4[0] | t4[1] | t4[2] | t4[3];
+ ret = (uint32_t)(tt | (tt >> 32));
+ ret = (ret | -ret) >> 31; /* ret = 1 if r != 0, 0 if r == 0 */
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ f256_montysquare(t7, t2);
+ f256_montymul(t6, t1, t7);
+ f256_montymul(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ f256_montysquare(P1->x, t4);
+ f256_sub(P1->x, P1->x, t5);
+ f256_sub(P1->x, P1->x, t6);
+ f256_sub(P1->x, P1->x, t6);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ f256_sub(t6, t6, P1->x);
+ f256_montymul(P1->y, t4, t6);
+ f256_montymul(t1, t5, t3);
+ f256_sub(P1->y, P1->y, t1);
+
+ /*
+ * Compute z3 = h*z1*z2.
+ */
+ f256_montymul(t1, P1->z, P2->z);
+ f256_montymul(P1->z, t1, t2);
+
+ return ret;
+}
+
+/*
+ * Point addition (mixed coordinates): P1 is replaced with P1+P2.
+ * This is a specialised function for the case when P2 is a non-zero point
+ * in affine coordinates.
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ * - If P1 == 0
+ * - If P1 == P2
+ *
+ * In both cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ * - P1 and P2 have the same Y (affine) coordinate.
+ * - The Y coordinate of P2 is 0 and P1 is the point at infinity.
+ *
+ * The second case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 on input, then the caller
+ * can apply the following:
+ *
+ * - If the result is not the point at infinity, then it is correct.
+ * - Otherwise, if the returned value is 1, then this is a case of
+ * P1+P2 == 0, so the result is indeed the point at infinity.
+ * - Otherwise, P1 == P2, so a "double" operation should have been
+ * performed.
+ *
+ * Again, a value of 0 may be returned in some cases where the addition
+ * result is correct.
+ */
+static uint32_t
+p256_add_mixed(p256_jacobian *P1, const p256_affine *P2)
+{
+ /*
+ * Addition formulas are:
+ *
+ * u1 = x1
+ * u2 = x2 * z1^2
+ * s1 = y1
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1
+ */
+ uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt;
+ uint32_t ret;
+
+ /*
+ * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+ */
+ memcpy(t1, P1->x, sizeof t1);
+ memcpy(t3, P1->y, sizeof t3);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ f256_montysquare(t4, P1->z);
+ f256_montymul(t2, P2->x, t4);
+ f256_montymul(t5, P1->z, t4);
+ f256_montymul(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ * We need to test whether r is zero, so we will do some extra
+ * reduce.
+ */
+ f256_sub(t2, t2, t1);
+ f256_sub(t4, t4, t3);
+ f256_final_reduce(t4);
+ tt = t4[0] | t4[1] | t4[2] | t4[3];
+ ret = (uint32_t)(tt | (tt >> 32));
+ ret = (ret | -ret) >> 31; /* ret = 1 if r != 0, 0 if r == 0 */
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ f256_montysquare(t7, t2);
+ f256_montymul(t6, t1, t7);
+ f256_montymul(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ f256_montysquare(P1->x, t4);
+ f256_sub(P1->x, P1->x, t5);
+ f256_sub(P1->x, P1->x, t6);
+ f256_sub(P1->x, P1->x, t6);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ f256_sub(t6, t6, P1->x);
+ f256_montymul(P1->y, t4, t6);
+ f256_montymul(t1, t5, t3);
+ f256_sub(P1->y, P1->y, t1);
+
+ /*
+ * Compute z3 = h*z1 (P2 is affine, so there is no z2 factor).
+ */
+ f256_montymul(P1->z, P1->z, t2);
+
+ return ret;
+}
+
+#if 0
+/* unused */
+/*
+ * Point addition (mixed coordinates, complete): P1 is replaced with P1+P2.
+ * This is a specialised function for the case when P2 is a non-zero point
+ * in affine coordinates. The sum is meant to be correct in all cases.
+ * NOTE(review): declared uint32_t but no value is returned; choose a return
+ * value (or make the function void) before enabling this disabled code.
+ */
+static uint32_t
+p256_add_complete_mixed(p256_jacobian *P1, const p256_affine *P2)
+{
+ /*
+ * Addition formulas, in the general case, are:
+ *
+ * u1 = x1
+ * u2 = x2 * z1^2
+ * s1 = y1
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1
+ *
+ * These formulas mishandle the two following cases:
+ *
+ * - If P1 is the point-at-infinity (z1 = 0), then z3 is
+ * incorrectly set to 0.
+ *
+ * - If P1 = P2, then u1 = u2 and s1 = s2, and x3, y3 and z3
+ * are all set to 0.
+ *
+ * However, if P1 + P2 = 0, then u1 = u2 but s1 != s2, and then
+ * we correctly get z3 = 0 (the point-at-infinity).
+ *
+ * To fix the case P1 = 0, we perform at the end a copy of P2
+ * over P1, conditional to z1 = 0.
+ *
+ * For P1 = P2: in that case, both h and r are set to 0, and
+ * we get x3, y3 and z3 equal to 0. We can test for that
+ * occurrence to make a mask which will be all-one if P1 = P2,
+ * or all-zero otherwise; then we can compute the double of P2
+ * and add it, combined with the mask, to (x3,y3,z3).
+ *
+ * Using the doubling formulas in p256_double() on (x2,y2),
+ * simplifying since P2 is affine (i.e. z2 = 1, implicitly),
+ * we get:
+ * s = 4*x2*y2^2
+ * m = 3*(x2 + 1)*(x2 - 1)
+ * x' = m^2 - 2*s
+ * y' = m*(s - x') - 8*y2^4
+ * z' = 2*y2
+ * which requires only 6 multiplications. Added to the 11
+ * multiplications of the normal mixed addition in Jacobian
+ * coordinates, we get a cost of 17 multiplications in total.
+ */
+ uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt, zz;
+ int i;
+
+ /*
+ * Set zz to -1 if P1 is the point at infinity, 0 otherwise.
+ */
+ zz = P1->z[0] | P1->z[1] | P1->z[2] | P1->z[3];
+ zz = ((zz | -zz) >> 63) - (uint64_t)1;
+
+ /*
+ * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+ */
+ memcpy(t1, P1->x, sizeof t1);
+ memcpy(t3, P1->y, sizeof t3);
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ f256_montysquare(t4, P1->z);
+ f256_montymul(t2, P2->x, t4);
+ f256_montymul(t5, P1->z, t4);
+ f256_montymul(t4, P2->y, t5);
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4); both are
+ * fully reduced below so that they can be tested against zero.
+ */
+ f256_sub(t2, t2, t1);
+ f256_sub(t4, t4, t3);
+
+ /*
+ * If both h = 0 and r = 0, then P1 = P2, and we want to set
+ * the mask tt to -1; otherwise, the mask will be 0.
+ */
+ f256_final_reduce(t2);
+ f256_final_reduce(t4);
+ tt = t2[0] | t2[1] | t2[2] | t2[3] | t4[0] | t4[1] | t4[2] | t4[3];
+ tt = ((tt | -tt) >> 63) - (uint64_t)1;
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5);
+ */
+ f256_montysquare(t7, t2);
+ f256_montymul(t6, t1, t7);
+ f256_montymul(t5, t7, t2);
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ */
+ f256_montysquare(P1->x, t4);
+ f256_sub(P1->x, P1->x, t5);
+ f256_sub(P1->x, P1->x, t6);
+ f256_sub(P1->x, P1->x, t6);
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ f256_sub(t6, t6, P1->x);
+ f256_montymul(P1->y, t4, t6);
+ f256_montymul(t1, t5, t3);
+ f256_sub(P1->y, P1->y, t1);
+
+ /*
+ * Compute z3 = h*z1.
+ */
+ f256_montymul(P1->z, P1->z, t2);
+
+ /*
+ * The "double" result, in case P1 = P2.
+ */
+
+ /*
+ * Compute z' = 2*y2 (in t1).
+ */
+ f256_add(t1, P2->y, P2->y);
+
+ /*
+ * Compute 2*(y2^2) (in t2) and s = 4*x2*(y2^2) (in t3).
+ */
+ f256_montysquare(t2, P2->y);
+ f256_add(t2, t2, t2);
+ f256_add(t3, t2, t2);
+ f256_montymul(t3, P2->x, t3);
+
+ /*
+ * Compute m = 3*(x2^2 - 1) (in t4).
+ */
+ f256_montysquare(t4, P2->x);
+ f256_sub(t4, t4, F256_R);
+ f256_add(t5, t4, t4);
+ f256_add(t4, t4, t5);
+
+ /*
+ * Compute x' = m^2 - 2*s (in t5).
+ */
+ f256_montysquare(t5, t4);
+ f256_sub(t5, t5, t3); /* f256_sub() takes (d, a, b): t5 <- t5 - s */
+ f256_sub(t5, t5, t3);
+
+ /*
+ * Compute y' = m*(s - x') - 8*y2^4 (in t6).
+ */
+ f256_sub(t6, t3, t5);
+ f256_montymul(t6, t6, t4);
+ f256_montysquare(t7, t2); /* t7 = (2*y2^2)^2 = 4*y2^4, subtracted twice */
+ f256_sub(t6, t6, t7);
+ f256_sub(t6, t6, t7);
+
+ /*
+ * We now have the alternate (doubling) coordinates in (t5,t6,t1).
+ * We combine them with (x3,y3,z3).
+ */
+ for (i = 0; i < 4; i ++) {
+ P1->x[i] |= tt & t5[i];
+ P1->y[i] |= tt & t6[i];
+ P1->z[i] |= tt & t1[i];
+ }
+
+ /*
+ * If P1 = 0, then we get z3 = 0 (which is invalid); if z1 is 0,
+ * then we want to replace the result with a copy of P2. The
+ * test on z1 was done at the start, in the zz mask.
+ */
+ for (i = 0; i < 4; i ++) {
+ P1->x[i] ^= zz & (P1->x[i] ^ P2->x[i]);
+ P1->y[i] ^= zz & (P1->y[i] ^ P2->y[i]);
+ P1->z[i] ^= zz & (P1->z[i] ^ F256_R[i]);
+ }
+}
+#endif
+
+/*
+ * Inner function for computing a point multiplication. A window is
+ * provided, with points 1*P to 15*P in affine coordinates.
+ *
+ * Assumptions:
+ * - All provided points are valid points on the curve.
+ * - Multiplier is non-zero, and smaller than the curve order.
+ * - Everything is in Montgomery representation.
+ */
+static void
+point_mul_inner(p256_jacobian *R, const p256_affine *W,
+ const unsigned char *k, size_t klen)
+{
+ p256_jacobian Q;
+ uint32_t qz;
+
+ memset(&Q, 0, sizeof Q);
+ qz = 1; /* 1 while Q is still the all-zero point at infinity */
+ while (klen -- > 0) {
+ int i;
+ unsigned bk;
+
+ bk = *k ++; /* multiplier bytes are consumed first to last */
+ for (i = 0; i < 2; i ++) { /* two 4-bit windows per byte, high nibble first */
+ uint32_t bits;
+ uint32_t bnz;
+ p256_affine T;
+ p256_jacobian U;
+ uint32_t n;
+ int j;
+ uint64_t m;
+
+ p256_double(&Q); /* four doublings: Q <- 16*Q */
+ p256_double(&Q);
+ p256_double(&Q);
+ p256_double(&Q);
+ bits = (bk >> 4) & 0x0F;
+ bnz = NEQ(bits, 0);
+
+ /*
+ * Lookup point in window. If the bits are 0,
+ * we get something invalid, which is not a
+ * problem because we will use it only if the
+ * bits are non-zero.
+ */
+ memset(&T, 0, sizeof T);
+ for (n = 0; n < 15; n ++) { /* every entry is read; a mask keeps only W[bits - 1] */
+ m = -(uint64_t)EQ(bits, n + 1);
+ T.x[0] |= m & W[n].x[0];
+ T.x[1] |= m & W[n].x[1];
+ T.x[2] |= m & W[n].x[2];
+ T.x[3] |= m & W[n].x[3];
+ T.y[0] |= m & W[n].y[0];
+ T.y[1] |= m & W[n].y[1];
+ T.y[2] |= m & W[n].y[2];
+ T.y[3] |= m & W[n].y[3];
+ }
+
+ U = Q;
+ p256_add_mixed(&U, &T); /* U = Q + T; return value is not used here */
+
+ /*
+ * If qz is still 1, then Q was all-zeros, and this
+ * is conserved through p256_double().
+ */
+ m = -(uint64_t)(bnz & qz);
+ for (j = 0; j < 4; j ++) { /* first non-zero window: Q <- T with z = F256_R */
+ Q.x[j] |= m & T.x[j];
+ Q.y[j] |= m & T.y[j];
+ Q.z[j] |= m & F256_R[j];
+ }
+ CCOPY(bnz & ~qz, &Q, &U, sizeof Q); /* otherwise Q <- Q + T when the window is non-zero */
+ qz &= ~bnz;
+ bk <<= 4; /* bring the low nibble into position for the second pass */
+ }
+ }
+ *R = Q;
+}
+
+/*
+ * Convert a window from Jacobian to affine coordinates. A single
+ * field inversion is used. This function is meant for windows of up to
+ * 32 elements. NOTE(review): the only caller seen here uses num = 15; when
+ * a level has an odd group count, z[(i >> k) ^ 1] and z[n] look unset - verify.
+ * The destination array (aff[]) and the source array (jac[]) may
+ * overlap, provided that the start of aff[] is not after the start of
+ * jac[]. Even if the arrays do _not_ overlap, the source array is
+ * modified.
+ */
+static void
+window_to_affine(p256_affine *aff, p256_jacobian *jac, int num)
+{
+ /*
+ * Convert the window points to affine coordinates. We use the
+ * following trick to share one inversion among all points: if
+ * we have z1, z2, z3, and z4, and want to inverse all of them,
+ * we compute u = 1/(z1*z2*z3*z4), and then we have:
+ * 1/z1 = u*z2*z3*z4
+ * 1/z2 = u*z1*z3*z4
+ * 1/z3 = u*z1*z2*z4
+ * 1/z4 = u*z1*z2*z3
+ *
+ * The partial products are computed recursively:
+ *
+ * - on input (z_1,z_2), return (z_2,z_1) and z_1*z_2
+ * - on input (z_1,z_2,... z_n):
+ * recurse on (z_1,z_2,... z_(n/2)) -> r1 and m1
+ * recurse on (z_(n/2+1),z_(n/2+2)... z_n) -> r2 and m2
+ * multiply elements of r1 by m2 -> s1
+ * multiply elements of r2 by m1 -> s2
+ * return s1||s2 and m1*m2
+ *
+ * In the example below, we suppose that we have 14 elements.
+ * Let z1, z2,... zE be the 14 values to invert (index noted in
+ * hexadecimal, starting at 1).
+ *
+ * - Depth 1:
+ * swap(z1, z2); z12 = z1*z2
+ * swap(z3, z4); z34 = z3*z4
+ * swap(z5, z6); z56 = z5*z6
+ * swap(z7, z8); z78 = z7*z8
+ * swap(z9, zA); z9A = z9*zA
+ * swap(zB, zC); zBC = zB*zC
+ * swap(zD, zE); zDE = zD*zE
+ *
+ * - Depth 2:
+ * z1 <- z1*z34, z2 <- z2*z34, z3 <- z3*z12, z4 <- z4*z12
+ * z1234 = z12*z34
+ * z5 <- z5*z78, z6 <- z6*z78, z7 <- z7*z56, z8 <- z8*z56
+ * z5678 = z56*z78
+ * z9 <- z9*zBC, zA <- zA*zBC, zB <- zB*z9A, zC <- zC*z9A
+ * z9ABC = z9A*zBC
+ *
+ * - Depth 3:
+ * z1 <- z1*z5678, z2 <- z2*z5678, z3 <- z3*z5678, z4 <- z4*z5678
+ * z5 <- z5*z1234, z6 <- z6*z1234, z7 <- z7*z1234, z8 <- z8*z1234
+ * z12345678 = z1234*z5678
+ * z9 <- z9*zDE, zA <- zA*zDE, zB <- zB*zDE, zC <- zC*zDE
+ * zD <- zD*z9ABC, zE <- zE*z9ABC
+ * z9ABCDE = z9ABC*zDE
+ *
+ * - Depth 4:
+ * multiply z1..z8 by z9ABCDE
+ * multiply z9..zE by z12345678
+ * final z = z12345678*z9ABCDE
+ */
+
+ uint64_t z[16][4]; /* partial products; z[13..15] also serve as scratch */
+ int i, k, s;
+#define zt (z[15])
+#define zu (z[14])
+#define zv (z[13])
+
+ /*
+ * First recursion step (pairwise swapping and multiplication).
+ * If there is an odd number of elements, then we "invent" an
+ * extra one with coordinate Z = 1 (in Montgomery representation).
+ */
+ for (i = 0; (i + 1) < num; i += 2) {
+ memcpy(zt, jac[i].z, sizeof zt); /* three copies: swap jac[i].z and jac[i + 1].z */
+ memcpy(jac[i].z, jac[i + 1].z, sizeof zt);
+ memcpy(jac[i + 1].z, zt, sizeof zt);
+ f256_montymul(z[i >> 1], jac[i].z, jac[i + 1].z);
+ }
+ if ((num & 1) != 0) {
+ memcpy(z[num >> 1], jac[num - 1].z, sizeof zt);
+ memcpy(jac[num - 1].z, F256_R, sizeof F256_R);
+ }
+
+ /*
+ * Perform further recursion steps. At the entry of each step,
+ * the process has been done for groups of 's' points. The
+ * integer k is the log2 of s.
+ */
+ for (k = 1, s = 2; s < num; k ++, s <<= 1) {
+ int n;
+
+ for (i = 0; i < num; i ++) { /* multiply by the sibling group's product */
+ f256_montymul(jac[i].z, jac[i].z, z[(i >> k) ^ 1]);
+ }
+ n = (num + s - 1) >> k; /* number of groups of s points, rounded up */
+ for (i = 0; i < (n >> 1); i ++) {
+ f256_montymul(z[i], z[i << 1], z[(i << 1) + 1]);
+ }
+ if ((n & 1) != 0) {
+ memmove(z[n >> 1], z[n], sizeof zt); /* NOTE(review): last group is z[n - 1]; confirm index */
+ }
+ }
+
+ /*
+ * Invert the final result, and convert all points.
+ */
+ f256_invert(zt, z[0]);
+ for (i = 0; i < num; i ++) {
+ f256_montymul(zv, jac[i].z, zt); /* zv = 1/z_i */
+ f256_montysquare(zu, zv); /* zu = 1/z_i^2 */
+ f256_montymul(zv, zv, zu); /* zv = 1/z_i^3 */
+ f256_montymul(aff[i].x, jac[i].x, zu);
+ f256_montymul(aff[i].y, jac[i].y, zv);
+ }
+}
+
+/*
+ * Multiply the provided point by an integer.
+ * Assumptions:
+ * - Source point is a valid curve point.
+ * - Source point is not the point-at-infinity.
+ * - Integer is not 0, and is lower than the curve order.
+ * If these conditions are not met, then the result is indeterminate
+ * (but the process is still constant-time).
+ */
+static void
+p256_mul(p256_jacobian *P, const unsigned char *k, size_t klen)
+{
+ union { /* aff[] and jac[] share storage; converted in place below */
+ p256_affine aff[15];
+ p256_jacobian jac[15];
+ } window;
+ int i;
+
+ /*
+ * Compute window, in Jacobian coordinates.
+ */
+ window.jac[0] = *P;
+ for (i = 2; i < 16; i ++) { /* entry i - 1 receives i*P */
+ window.jac[i - 1] = window.jac[(i >> 1) - 1]; /* start from (i >> 1)*P */
+ if ((i & 1) == 0) {
+ p256_double(&window.jac[i - 1]);
+ } else {
+ p256_add(&window.jac[i - 1], &window.jac[i >> 1]); /* add ((i >> 1) + 1)*P */
+ }
+ }
+
+ /*
+ * Convert the window points to affine coordinates. Point
+ * window[0] is the source point, already in affine coordinates.
+ */
+ window_to_affine(window.aff, window.jac, 15);
+
+ /*
+ * Perform point multiplication.
+ */
+ point_mul_inner(P, window.aff, k, klen);
+}
+
+/*
+ * Precomputed window for the conventional generator: P256_Gwin[n]
+ * contains (n+1)*G (affine coordinates, in Montgomery representation).
+ */
+static const p256_affine P256_Gwin[] = {
+ { /* [0] = 1*G: x limbs, then y limbs */
+ { 0x79E730D418A9143C, 0x75BA95FC5FEDB601,
+ 0x79FB732B77622510, 0x18905F76A53755C6 },
+ { 0xDDF25357CE95560A, 0x8B4AB8E4BA19E45C,
+ 0xD2E88688DD21F325, 0x8571FF1825885D85 }
+ },
+ { /* [1] = 2*G */
+ { 0x850046D410DDD64D, 0xAA6AE3C1A433827D,
+ 0x732205038D1490D9, 0xF6BB32E43DCF3A3B },
+ { 0x2F3648D361BEE1A5, 0x152CD7CBEB236FF8,
+ 0x19A8FB0E92042DBE, 0x78C577510A5B8A3B }
+ },
+ { /* [2] = 3*G */
+ { 0xFFAC3F904EEBC127, 0xB027F84A087D81FB,
+ 0x66AD77DD87CBBC98, 0x26936A3FB6FF747E },
+ { 0xB04C5C1FC983A7EB, 0x583E47AD0861FE1A,
+ 0x788208311A2EE98E, 0xD5F06A29E587CC07 }
+ },
+ { /* [3] = 4*G */
+ { 0x74B0B50D46918DCC, 0x4650A6EDC623C173,
+ 0x0CDAACACE8100AF2, 0x577362F541B0176B },
+ { 0x2D96F24CE4CBABA6, 0x17628471FAD6F447,
+ 0x6B6C36DEE5DDD22E, 0x84B14C394C5AB863 }
+ },
+ { /* [4] = 5*G */
+ { 0xBE1B8AAEC45C61F5, 0x90EC649A94B9537D,
+ 0x941CB5AAD076C20C, 0xC9079605890523C8 },
+ { 0xEB309B4AE7BA4F10, 0x73C568EFE5EB882B,
+ 0x3540A9877E7A1F68, 0x73A076BB2DD1E916 }
+ },
+ { /* [5] = 6*G */
+ { 0x403947373E77664A, 0x55AE744F346CEE3E,
+ 0xD50A961A5B17A3AD, 0x13074B5954213673 },
+ { 0x93D36220D377E44B, 0x299C2B53ADFF14B5,
+ 0xF424D44CEF639F11, 0xA4C9916D4A07F75F }
+ },
+ { /* [6] = 7*G */
+ { 0x0746354EA0173B4F, 0x2BD20213D23C00F7,
+ 0xF43EAAB50C23BB08, 0x13BA5119C3123E03 },
+ { 0x2847D0303F5B9D4D, 0x6742F2F25DA67BDD,
+ 0xEF933BDC77C94195, 0xEAEDD9156E240867 }
+ },
+ { /* [7] = 8*G */
+ { 0x27F14CD19499A78F, 0x462AB5C56F9B3455,
+ 0x8F90F02AF02CFC6B, 0xB763891EB265230D },
+ { 0xF59DA3A9532D4977, 0x21E3327DCF9EBA15,
+ 0x123C7B84BE60BBF0, 0x56EC12F27706DF76 }
+ },
+ { /* [8] = 9*G */
+ { 0x75C96E8F264E20E8, 0xABE6BFED59A7A841,
+ 0x2CC09C0444C8EB00, 0xE05B3080F0C4E16B },
+ { 0x1EB7777AA45F3314, 0x56AF7BEDCE5D45E3,
+ 0x2B6E019A88B12F1A, 0x086659CDFD835F9B }
+ },
+ { /* [9] = 10*G */
+ { 0x2C18DBD19DC21EC8, 0x98F9868A0FCF8139,
+ 0x737D2CD648250B49, 0xCC61C94724B3428F },
+ { 0x0C2B407880DD9E76, 0xC43A8991383FBE08,
+ 0x5F7D2D65779BE5D2, 0x78719A54EB3B4AB5 }
+ },
+ { /* [10] = 11*G */
+ { 0xEA7D260A6245E404, 0x9DE407956E7FDFE0,
+ 0x1FF3A4158DAC1AB5, 0x3E7090F1649C9073 },
+ { 0x1A7685612B944E88, 0x250F939EE57F61C8,
+ 0x0C0DAA891EAD643D, 0x68930023E125B88E }
+ },
+ { /* [11] = 12*G */
+ { 0x04B71AA7D2697768, 0xABDEDEF5CA345A33,
+ 0x2409D29DEE37385E, 0x4EE1DF77CB83E156 },
+ { 0x0CAC12D91CBB5B43, 0x170ED2F6CA895637,
+ 0x28228CFA8ADE6D66, 0x7FF57C9553238ACA }
+ },
+ { /* [12] = 13*G */
+ { 0xCCC425634B2ED709, 0x0E356769856FD30D,
+ 0xBCBCD43F559E9811, 0x738477AC5395B759 },
+ { 0x35752B90C00EE17F, 0x68748390742ED2E3,
+ 0x7CD06422BD1F5BC1, 0xFBC08769C9E7B797 }
+ },
+ { /* [13] = 14*G */
+ { 0xA242A35BB0CF664A, 0x126E48F77F9707E3,
+ 0x1717BF54C6832660, 0xFAAE7332FD12C72E },
+ { 0x27B52DB7995D586B, 0xBE29569E832237C2,
+ 0xE8E4193E2A65E7DB, 0x152706DC2EAA1BBB }
+ },
+ { /* [14] = 15*G */
+ { 0x72BCD8B7BC60055B, 0x03CC23EE56E27E4B,
+ 0xEE337424E4819370, 0xE2AA0E430AD3DA09 },
+ { 0x40B8524F6383C45D, 0xD766355442A41B25,
+ 0x64EFA6DE778A4797, 0x2042170A7079ADF4 }
+ }
+};
+
+/*
+ * Multiply the conventional generator of the curve by the provided
+ * integer. Return is written in *P.
+ *
+ * Assumptions:
+ * - Integer is not 0, and is lower than the curve order.
+ * If this condition is not met, then the result is indeterminate
+ * (but the process is still constant-time).
+ */
+static void
+p256_mulgen(p256_jacobian *P, const unsigned char *k, size_t klen)
+{
+ point_mul_inner(P, P256_Gwin, k, klen); /* uses the precomputed 1*G..15*G window */
+}
+
+/*
+ * Return 1 if all of the following hold:
+ * - klen <= 32
+ * - k != 0
+ * - k is lower than the curve order
+ * Otherwise, return 0.
+ *
+ * Constant-time behaviour: only klen may be observable.
+ */
+static uint32_t
+check_scalar(const unsigned char *k, size_t klen)
+{
+ uint32_t z;
+ int32_t c;
+ size_t u;
+
+ if (klen > 32) { /* branches depend on klen only, never on the bytes of k */
+ return 0;
+ }
+ z = 0;
+ for (u = 0; u < klen; u ++) {
+ z |= k[u]; /* z stays 0 only if every byte of k is 0 */
+ }
+ if (klen == 32) {
+ c = 0;
+ for (u = 0; u < klen; u ++) { /* once c is non-zero, later bytes no longer change it */
+ c |= -(int32_t)EQ0(c) & CMP(k[u], P256_N[u]);
+ }
+ } else {
+ c = -1; /* shorter than 32 bytes: treated as lower than the order */
+ }
+ return NEQ(z, 0) & LT0(c);
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+ const unsigned char *k, size_t klen, int curve)
+{
+ uint32_t r; /* accumulated status: 1 on success, 0 on any failure */
+ p256_jacobian P;
+
+ (void)curve; /* the curve argument is not used by this function */
+ if (Glen != 65) { /* point_decode() assumes a buffer of exactly 65 bytes */
+ return 0;
+ }
+ r = check_scalar(k, klen);
+ r &= point_decode(&P, G);
+ p256_mul(&P, k, klen); /* executed even when r is already 0 */
+ r &= point_encode(G, &P); /* the result overwrites the input point in G */
+ return r;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+ const unsigned char *k, size_t klen, int curve)
+{
+ p256_jacobian P;
+
+ (void)curve; /* the curve argument is not used by this function */
+ p256_mulgen(&P, k, klen); /* NOTE(review): k is not run through check_scalar() here - confirm caller contract */
+ point_encode(R, &P); /* status ignored; 65 bytes are written to R */
+ return 65; /* number of bytes written by point_encode() */
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+ const unsigned char *x, size_t xlen,
+ const unsigned char *y, size_t ylen, int curve)
+{
+ /*
+ * We might want to use Shamir's trick here: make a composite
+ * window of u*P+v*Q points, to merge the two doubling-ladders
+ * into one. This, however, has some complications:
+ *
+ * - During the computation, we may hit the point-at-infinity.
+ * Thus, we would need p256_add_complete_mixed() (complete
+ * formulas for point addition), with a higher cost (17 muls
+ * instead of 11).
+ *
+ * - A 4-bit window would be too large, since it would involve
+ * 16*16-1 = 255 points. For the same window size as in the
+ * p256_mul() case, we would need to reduce the window size
+ * to 2 bits, and thus perform twice as many non-doubling
+ * point additions.
+ *
+ * - The window may itself contain the point-at-infinity, and
+ * thus cannot, in all generality, be made of affine points.
+ * Instead, we would need to make it a window of points in
+ * Jacobian coordinates. Even p256_add_complete_mixed() would
+ * be inappropriate.
+ *
+ * For these reasons, the code below performs two separate
+ * point multiplications, then computes the final point addition
+ * (which is both a "normal" addition, and a doubling, to handle
+ * all cases).
+ */
+
+ p256_jacobian P, Q;
+ uint32_t r, t, s;
+ uint64_t z;
+
+ (void)curve; /* the curve argument is not used by this function */
+ if (len != 65) {
+ return 0;
+ }
+ r = point_decode(&P, A);
+ p256_mul(&P, x, xlen); /* NOTE(review): x and y are not run through check_scalar() here - confirm */
+ if (B == NULL) { /* a NULL B selects the conventional generator */
+ p256_mulgen(&Q, y, ylen);
+ } else {
+ r &= point_decode(&Q, B);
+ p256_mul(&Q, y, ylen);
+ }
+
+ /*
+ * The final addition may fail in case both points are equal.
+ */
+ t = p256_add(&P, &Q);
+ f256_final_reduce(P.z); /* so that a zero z is encoded as all-zero words */
+ z = P.z[0] | P.z[1] | P.z[2] | P.z[3];
+ s = EQ((uint32_t)(z | (z >> 32)), 0);
+ p256_double(&Q); /* always computed; used below only when s = 1 and t = 0 */
+
+ /*
+ * If s is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+ * have the following:
+ *
+ * s = 0, t = 0 return P (normal addition)
+ * s = 0, t = 1 return P (normal addition)
+ * s = 1, t = 0 return Q (a 'double' case)
+ * s = 1, t = 1 report an error (P+Q = 0)
+ */
+ CCOPY(s & ~t, &P, &Q, sizeof Q);
+ point_encode(A, &P); /* status ignored; the P+Q = 0 case is reported through r */
+ r &= ~(s & t);
+ return r;
+}
+
+/* see bearssl_ec.h */
+const br_ec_impl br_ec_p256_m64 = {
+ (uint32_t)0x00800000, /* presumably the supported-curves mask (only bit 23 set) - confirm in bearssl_ec.h */
+ &api_generator,
+ &api_order,
+ &api_xoff,
+ &api_mul,
+ &api_mulgen,
+ &api_muladd
+};
+
+/* see bearssl_ec.h; variant used when the implementation above is compiled in */
+const br_ec_impl *
+br_ec_p256_m64_get(void)
+{
+ return &br_ec_p256_m64;
+}
+
+#else
+
+/* see bearssl_ec.h; fallback used when the implementation above is compiled out */
+const br_ec_impl *
+br_ec_p256_m64_get(void)
+{
+ return 0; /* null pointer: no implementation to hand out in this build */
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/ec/ec_prime_i15.c b/test/monniaux/BearSSL/src/ec/ec_prime_i15.c
new file mode 100644
index 00000000..0f210f24
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_prime_i15.c
@@ -0,0 +1,820 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Parameters for supported curves:
+ * - field modulus p
+ * - R^2 mod p (R = 2^(15k) for the smallest k such that R >= p)
+ * - b*R mod p (b is the second curve equation parameter)
+ */
+
+static const uint16_t P256_P[] = {
+ 0x0111,
+ 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x003F, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x1000, 0x0000, 0x4000, 0x7FFF,
+ 0x7FFF, 0x0001
+};
+
+static const uint16_t P256_R2[] = {
+ 0x0111,
+ 0x0000, 0x6000, 0x0000, 0x0000, 0x0000, 0x0000, 0x7FFC, 0x7FFF,
+ 0x7FBF, 0x7FFF, 0x7FBF, 0x7FFF, 0x7FFF, 0x7FFF, 0x77FF, 0x7FFF,
+ 0x4FFF, 0x0000
+};
+
+static const uint16_t P256_B[] = {
+ 0x0111,
+ 0x770C, 0x5EEF, 0x29C4, 0x3EC4, 0x6273, 0x0486, 0x4543, 0x3993,
+ 0x3C01, 0x6B56, 0x212E, 0x57EE, 0x4882, 0x204B, 0x7483, 0x3C16,
+ 0x0187, 0x0000
+};
+
+static const uint16_t P384_P[] = {
+ 0x0199,
+ 0x7FFF, 0x7FFF, 0x0003, 0x0000, 0x0000, 0x0000, 0x7FC0, 0x7FFF,
+ 0x7EFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+ 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+ 0x7FFF, 0x01FF
+};
+
+static const uint16_t P384_R2[] = {
+ 0x0199,
+ 0x1000, 0x0000, 0x0000, 0x7FFF, 0x7FFF, 0x0001, 0x0000, 0x0010,
+ 0x0000, 0x0000, 0x0000, 0x7F00, 0x7FFF, 0x01FF, 0x0000, 0x1000,
+ 0x0000, 0x2000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000
+};
+
+static const uint16_t P384_B[] = {
+ 0x0199,
+ 0x7333, 0x2096, 0x70D1, 0x2310, 0x3020, 0x6197, 0x1464, 0x35BB,
+ 0x70CA, 0x0117, 0x1920, 0x4136, 0x5FC8, 0x5713, 0x4938, 0x7DD2,
+ 0x4DD2, 0x4A71, 0x0220, 0x683E, 0x2C87, 0x4DB1, 0x7BFF, 0x6C09,
+ 0x0452, 0x0084
+};
+
+static const uint16_t P521_P[] = {
+ 0x022B,
+ 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+ 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+ 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+ 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+ 0x7FFF, 0x7FFF, 0x07FF
+};
+
+static const uint16_t P521_R2[] = {
+ 0x022B,
+ 0x0100, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+ 0x0000, 0x0000, 0x0000
+};
+
+static const uint16_t P521_B[] = {
+ 0x022B,
+ 0x7002, 0x6A07, 0x751A, 0x228F, 0x71EF, 0x5869, 0x20F4, 0x1EFC,
+ 0x7357, 0x37E0, 0x4EEC, 0x605E, 0x1652, 0x26F6, 0x31FA, 0x4A8F,
+ 0x6193, 0x3C2A, 0x3C42, 0x48C7, 0x3489, 0x6771, 0x4C57, 0x5CCD,
+ 0x2725, 0x545B, 0x503B, 0x5B42, 0x21A0, 0x2534, 0x687E, 0x70E4,
+ 0x1618, 0x27D7, 0x0465
+};
+
+typedef struct { /* constants describing one supported curve */
+ const uint16_t *p; /* field modulus */
+ const uint16_t *b; /* curve equation parameter b, stored as b*R mod p */
+ const uint16_t *R2; /* R^2 mod p; code_check multiplies by it to enter Montgomery representation */
+ uint16_t p0i; /* handed, along with p, to br_i15_montymul() and br_i15_modpow() */
+ size_t point_len; /* length in bytes of an encoded point; api_mul() compares it with the input length */
+} curve_params;
+
+/*
+ * Map a curve identifier to its set of constants. The table is indexed
+ * with the offset of the identifier from BR_EC_secp256r1; no range check
+ * is done, hence the caller must provide one of the three curves listed
+ * in the table.
+ */
+static inline const curve_params *
+id_to_curve(int curve)
+{
+	static const curve_params curves[] = {
+		{ P256_P, P256_B, P256_R2, 0x0001, 65 },
+		{ P384_P, P384_B, P384_R2, 0x0001, 97 },
+		{ P521_P, P521_B, P521_R2, 0x0001, 133 }
+	};
+	int idx;
+
+	idx = curve - BR_EC_secp256r1;
+	return curves + idx;
+}
+
+#define I15_LEN ((BR_MAX_EC_SIZE + 29) / 15) /* number of 16-bit words in each coordinate buffer (jacobian type, run_code() registers) */
+
+/*
+ * Type for a point in Jacobian coordinates:
+ * -- three values, x, y and z, in Montgomery representation
+ * -- affine coordinates are X = x / z^2 and Y = y / z^3
+ * -- for the point at infinity, z = 0
+ */
+typedef struct {
+ uint16_t c[3][I15_LEN];
+} jacobian;
+
+/*
+ * We use a custom interpreter that uses a dozen registers, and
+ * only six operations:
+ * MSET(d, a) copy a into d
+ * MADD(d, a) d = d+a (modular)
+ * MSUB(d, a) d = d-a (modular)
+ * MMUL(d, a, b) d = a*b (Montgomery multiplication)
+ * MINV(d, a, b) invert d modulo p; a and b are used as scratch registers
+ * MTZ(d) clear return value if d = 0
+ * Destination of MMUL (d) must be distinct from operands (a and b).
+ * There is no such constraint for MSUB and MADD.
+ *
+ * Registers include the operand coordinates, and temporaries.
+ *
+ * Each instruction is a 16-bit word, which run_code() decodes as:
+ * bits 12 to 15 operation code
+ * bits 8 to 11 register d
+ * bits 4 to 7 register a
+ * bits 0 to 3 register b
+ * A word equal to 0 (ENDCODE) stops the interpreter. Zero is also the
+ * encoding of MSET(0, 0), so that specific instruction cannot appear in
+ * a program.
+ */
+#define MSET(d, a) (0x0000 + ((d) << 8) + ((a) << 4))
+#define MADD(d, a) (0x1000 + ((d) << 8) + ((a) << 4))
+#define MSUB(d, a) (0x2000 + ((d) << 8) + ((a) << 4))
+#define MMUL(d, a, b) (0x3000 + ((d) << 8) + ((a) << 4) + (b))
+#define MINV(d, a, b) (0x4000 + ((d) << 8) + ((a) << 4) + (b))
+#define MTZ(d) (0x5000 + ((d) << 8))
+#define ENDCODE 0
+
+/*
+ * Registers for the input operands.
+ */
+#define P1x 0
+#define P1y 1
+#define P1z 2
+#define P2x 3
+#define P2y 4
+#define P2z 5
+
+/*
+ * Alternate names for the first input operand.
+ */
+#define Px 0
+#define Py 1
+#define Pz 2
+
+/*
+ * Temporaries.
+ */
+#define t1 6
+#define t2 7
+#define t3 8
+#define t4 9
+#define t5 10
+#define t6 11
+#define t7 12
+
+/*
+ * Extra scratch registers available when there is no second operand (e.g.
+ * for "double" and "affine").
+ */
+#define t8 3
+#define t9 4
+#define t10 5
+
+/*
+ * Doubling formulas are:
+ *
+ * s = 4*x*y^2
+ * m = 3*(x + z^2)*(x - z^2)
+ * x' = m^2 - 2*s
+ * y' = m*(s - x') - 8*y^4
+ * z' = 2*y*z
+ *
+ * If y = 0 (P has order 2) then this yields infinity (z' = 0), as it
+ * should. This case should not happen anyway, because our curves have
+ * prime order, and thus do not contain any point of order 2.
+ *
+ * If P is infinity (z = 0), then again the formulas yield infinity,
+ * which is correct. Thus, this code works for all points.
+ *
+ * Cost: 8 multiplications
+ */
+static const uint16_t code_double[] = {
+ /*
+ * Compute z^2 (in t1).
+ */
+ MMUL(t1, Pz, Pz),
+
+ /*
+ * Compute x-z^2 (in t2) and then x+z^2 (in t1).
+ */
+ MSET(t2, Px),
+ MSUB(t2, t1),
+ MADD(t1, Px),
+
+ /*
+ * Compute m = 3*(x+z^2)*(x-z^2) (in t1).
+ */
+ MMUL(t3, t1, t2),
+ MSET(t1, t3),
+ MADD(t1, t3),
+ MADD(t1, t3),
+
+ /*
+ * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3).
+ */
+ MMUL(t3, Py, Py),
+ MADD(t3, t3),
+ MMUL(t2, Px, t3),
+ MADD(t2, t2),
+
+ /*
+ * Compute x' = m^2 - 2*s.
+ */
+ MMUL(Px, t1, t1),
+ MSUB(Px, t2),
+ MSUB(Px, t2),
+
+ /*
+ * Compute z' = 2*y*z.
+ */
+ MMUL(t4, Py, Pz),
+ MSET(Pz, t4),
+ MADD(Pz, t4),
+
+ /*
+ * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+ * 2*y^2 in t3.
+ */
+ MSUB(t2, Px),
+ MMUL(Py, t1, t2),
+ MMUL(t4, t3, t3),
+ MSUB(Py, t4),
+ MSUB(Py, t4),
+
+ ENDCODE
+};
+
+/*
+ * Addition formulas are:
+ *
+ * u1 = x1 * z2^2
+ * u2 = x2 * z1^2
+ * s1 = y1 * z2^3
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1 * z2
+ *
+ * If both P1 and P2 are infinity, then z1 == 0 and z2 == 0, implying that
+ * z3 == 0, so the result is correct.
+ * If either of P1 or P2 is infinity, but not both, then z3 == 0, which is
+ * not correct.
+ * h == 0 only if u1 == u2; this happens in two cases:
+ * -- if s1 == s2 then P1 and/or P2 is infinity, or P1 == P2
+ * -- if s1 != s2 then P1 + P2 == infinity (but neither P1 nor P2 is infinity)
+ *
+ * Thus, the following situations are not handled correctly:
+ * -- P1 = 0 and P2 != 0
+ * -- P1 != 0 and P2 = 0
+ * -- P1 = P2
+ * All other cases are properly computed. However, even in "incorrect"
+ * situations, the three coordinates still are properly formed field
+ * elements.
+ *
+ * The returned flag is cleared if r == 0. This happens in the following
+ * cases:
+ * -- Both points are on the same horizontal line (same Y coordinate).
+ * -- Both points are infinity.
+ * -- One point is infinity and the other is on line Y = 0.
+ * The third case cannot happen with our curves (there is no valid point
+ * on line Y = 0 since that would be a point of order 2). If the two
+ * source points are non-infinity, then remains only the case where the
+ * two points are on the same horizontal line.
+ *
+ * This allows us to detect the "P1 == P2" case, assuming that P1 != 0 and
+ * P2 != 0:
+ * -- If the returned value is not the point at infinity, then it was properly
+ * computed.
+ * -- Otherwise, if the returned flag is 1, then P1+P2 = 0, and the result
+ * is indeed the point at infinity.
+ * -- Otherwise (result is infinity, flag is 0), then P1 = P2 and we should
+ * use the 'double' code.
+ *
+ * Cost: 16 multiplications
+ */
+static const uint16_t code_add[] = {
+ /*
+ * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+ */
+ MMUL(t3, P2z, P2z),
+ MMUL(t1, P1x, t3),
+ MMUL(t4, P2z, t3),
+ MMUL(t3, P1y, t4),
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ MMUL(t4, P1z, P1z),
+ MMUL(t2, P2x, t4),
+ MMUL(t5, P1z, t4),
+ MMUL(t4, P2y, t5),
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ */
+ MSUB(t2, t1),
+ MSUB(t4, t3),
+
+ /*
+ * Report cases where r = 0 through the returned flag.
+ */
+ MTZ(t4),
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5).
+ */
+ MMUL(t7, t2, t2),
+ MMUL(t6, t1, t7),
+ MMUL(t5, t7, t2),
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ * t1 and t7 can be used as scratch registers.
+ */
+ MMUL(P1x, t4, t4),
+ MSUB(P1x, t5),
+ MSUB(P1x, t6),
+ MSUB(P1x, t6),
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ MSUB(t6, P1x),
+ MMUL(P1y, t4, t6),
+ MMUL(t1, t5, t3),
+ MSUB(P1y, t1),
+
+ /*
+ * Compute z3 = h*z1*z2.
+ */
+ MMUL(t1, P1z, P2z),
+ MMUL(P1z, t1, t2),
+
+ ENDCODE
+};
+
+/*
+ * Check that the point is on the curve. This code snippet assumes the
+ * following conventions:
+ * -- Coordinates x and y have been freshly decoded in P1 (but not
+ * converted to Montgomery coordinates yet).
+ * -- P2x, P2y and P2z are set to, respectively, R^2, b*R and 1.
+ */
+static const uint16_t code_check[] = {
+
+ /* Convert x and y to Montgomery representation. */
+ MMUL(t1, P1x, P2x),
+ MMUL(t2, P1y, P2x),
+ MSET(P1x, t1),
+ MSET(P1y, t2),
+
+ /* Compute x^3 in t1. */
+ MMUL(t2, P1x, P1x),
+ MMUL(t1, P1x, t2),
+
+ /* Subtract 3*x from t1. */
+ MSUB(t1, P1x),
+ MSUB(t1, P1x),
+ MSUB(t1, P1x),
+
+ /* Add b. */
+ MADD(t1, P2y),
+
+ /* Compute y^2 in t2. */
+ MMUL(t2, P1y, P1y),
+
+ /* Compare y^2 with x^3 - 3*x + b; they must match. */
+ MSUB(t1, t2),
+ MTZ(t1),
+
+ /* Set z to 1 (in Montgomery representation). */
+ MMUL(P1z, P2x, P2z),
+
+ ENDCODE
+};
+
+/*
+ * Conversion back to affine coordinates. This code snippet assumes that
+ * the z coordinate of P2 is set to 1 (not in Montgomery representation).
+ */
+static const uint16_t code_affine[] = {
+
+ /* Save z*R in t1. */
+ MSET(t1, P1z),
+
+ /* Compute z^3 in t2. */
+ MMUL(t2, P1z, P1z),
+ MMUL(t3, P1z, t2),
+ MMUL(t2, t3, P2z),
+
+ /* Invert to (1/z^3) in t2. */
+ MINV(t2, t3, t4),
+
+ /* Compute y. */
+ MSET(t3, P1y),
+ MMUL(P1y, t2, t3),
+
+ /* Compute (1/z^2) in t3. */
+ MMUL(t3, t2, t1),
+
+ /* Compute x. */
+ MSET(t2, P1x),
+ MMUL(P1x, t2, t3),
+
+ ENDCODE
+};
+
+/*
+ * Execute one of the code_ programs on a machine with 13 registers.
+ *
+ * Registers P1x..P1z are loaded from P1 and registers P2x..P2z from P2;
+ * the other registers are temporaries with no defined initial contents.
+ * Instructions are processed in order until a zero word (ENDCODE) is
+ * reached; registers P1x..P1z are then written back into P1.
+ *
+ * Returned value is 1, unless at least one MTZ instruction found its
+ * register equal to zero, in which case 0 is returned.
+ */
+static uint32_t
+run_code(jacobian *P1, const jacobian *P2,
+	const curve_params *cc, const uint16_t *code)
+{
+	uint16_t regs[13][I15_LEN];
+	const uint16_t *pc;
+	uint32_t flag;
+
+	/*
+	 * Each copy fills three consecutive registers (x, y and z).
+	 */
+	memcpy(regs[P1x], P1->c, 3 * I15_LEN * sizeof(uint16_t));
+	memcpy(regs[P2x], P2->c, 3 * I15_LEN * sizeof(uint16_t));
+
+	flag = 1;
+	for (pc = code; *pc != 0; pc ++) {
+		unsigned insn, d, a, b;
+		unsigned char e[(BR_MAX_EC_SIZE + 7) >> 3];
+		size_t elen;
+		uint32_t ctl;
+
+		insn = *pc;
+		d = (insn >> 8) & 0x0F;
+		a = (insn >> 4) & 0x0F;
+		b = insn & 0x0F;
+		switch (insn >> 12) {
+		case 0:
+			/* MSET: plain register copy. */
+			memcpy(regs[d], regs[a], I15_LEN * sizeof(uint16_t));
+			break;
+		case 1:
+			/*
+			 * MADD: add, then subtract p if the addition
+			 * produced a carry or if the sum is not lower
+			 * than p (the call with a zero control only
+			 * reports the borrow).
+			 */
+			ctl = br_i15_add(regs[d], regs[a], 1);
+			ctl |= NOT(br_i15_sub(regs[d], cc->p, 0));
+			br_i15_sub(regs[d], cc->p, ctl);
+			break;
+		case 2:
+			/*
+			 * MSUB: subtract, then add p back if the
+			 * subtraction reported a borrow.
+			 */
+			ctl = br_i15_sub(regs[d], regs[a], 1);
+			br_i15_add(regs[d], cc->p, ctl);
+			break;
+		case 3:
+			/* MMUL: Montgomery multiplication. */
+			br_i15_montymul(regs[d], regs[a], regs[b],
+				cc->p, cc->p0i);
+			break;
+		case 4:
+			/*
+			 * MINV: raise d to the power p-2. The exponent
+			 * is the encoded modulus with 2 subtracted from
+			 * its last byte.
+			 */
+			elen = (cc->p[0] - (cc->p[0] >> 4) + 7) >> 3;
+			br_i15_encode(e, elen, cc->p);
+			e[elen - 1] -= 2;
+			br_i15_modpow(regs[d], e, elen,
+				cc->p, cc->p0i, regs[a], regs[b]);
+			break;
+		default:
+			/* MTZ: clear the flag if d is zero. */
+			flag &= ~br_i15_iszero(regs[d]);
+			break;
+		}
+	}
+
+	/*
+	 * Registers P1x..P1z hold the result.
+	 */
+	memcpy(P1->c, regs[P1x], 3 * I15_LEN * sizeof(uint16_t));
+	return flag;
+}
+
+/*
+ * Set x to the integer 1 (plain, not Montgomery representation), with
+ * the same header word as the modulus p. The number of words cleared is
+ * derived from that header word.
+ */
+static void
+set_one(uint16_t *x, const uint16_t *p)
+{
+	size_t nwords;
+
+	nwords = (p[0] + 31) >> 4;
+	memset(x, 0, nwords * sizeof(uint16_t));
+	x[0] = p[0];
+	x[1] = 1;
+}
+
+/*
+ * Set P to the point at infinity: all three coordinates are zero (in
+ * particular z = 0) and carry the header word of the curve modulus.
+ */
+static void
+point_zero(jacobian *P, const curve_params *cc)
+{
+	int i;
+
+	memset(P, 0, sizeof *P);
+	for (i = 0; i < 3; i ++) {
+		P->c[i][0] = cc->p[0];
+	}
+}
+
+/*
+ * Double P in place by running code_double. P is given as both
+ * operands; the returned flag is not used.
+ */
+static inline void
+point_double(jacobian *P, const curve_params *cc)
+{
+	(void)run_code(P, P, cc, code_double);
+}
+
+/*
+ * Add P2 to P1 (result in P1) by running code_add. The flag from
+ * run_code() is returned as is: it is 0 when the MTZ instruction of
+ * code_add found r = s2 - s1 equal to zero, 1 otherwise.
+ */
+static inline uint32_t
+point_add(jacobian *P1, const jacobian *P2, const curve_params *cc)
+{
+	uint32_t flag;
+
+	flag = run_code(P1, P2, cc, code_add);
+	return flag;
+}
+
+/*
+ * Multiply point P by the unsigned big-endian integer x (xlen bytes);
+ * the product replaces P.
+ *
+ * The multiplier is consumed two bits at a time, most significant bits
+ * first. Every step runs the same sequence of operations: two doublings
+ * of the accumulator, one addition, and copies selected through masks.
+ */
+static void
+point_mul(jacobian *P, const unsigned char *x, size_t xlen,
+	const curve_params *cc)
+{
+	jacobian dbl, tpl, acc, pick, sum;
+	uint32_t acc_inf;
+	size_t i;
+
+	/*
+	 * The window is made of P, 2P (dbl) and 3P (tpl).
+	 */
+	memcpy(&dbl, P, sizeof dbl);
+	point_double(&dbl, cc);
+	memcpy(&tpl, P, sizeof tpl);
+	point_add(&tpl, &dbl, cc);
+
+	/*
+	 * The addition code does not handle an operand equal to
+	 * infinity, which is what the accumulator contains until the
+	 * first non-zero window value. acc_inf tracks that situation.
+	 */
+	point_zero(&acc, cc);
+	acc_inf = 1;
+	for (i = 0; i < xlen; i ++) {
+		int sh;
+
+		for (sh = 6; sh >= 0; sh -= 2) {
+			uint32_t w, nz;
+
+			point_double(&acc, cc);
+			point_double(&acc, cc);
+
+			/*
+			 * pick receives P, 2P or 3P depending on the
+			 * window value; sum is acc + pick, computed in
+			 * all cases.
+			 */
+			memcpy(&pick, P, sizeof pick);
+			memcpy(&sum, &acc, sizeof sum);
+			w = (x[i] >> sh) & (uint32_t)3;
+			nz = NEQ(w, 0);
+			CCOPY(EQ(w, 2), &pick, &dbl, sizeof pick);
+			CCOPY(EQ(w, 3), &pick, &tpl, sizeof pick);
+			point_add(&sum, &pick, cc);
+
+			/*
+			 * Non-zero window: the accumulator becomes pick
+			 * if it still was infinity, sum otherwise. Zero
+			 * window: the accumulator is kept.
+			 */
+			CCOPY(nz & acc_inf, &acc, &pick, sizeof acc);
+			CCOPY(nz & ~acc_inf, &acc, &sum, sizeof acc);
+			acc_inf &= ~nz;
+		}
+	}
+	memcpy(P, &acc, sizeof acc);
+}
+
+/*
+ * Decode a point into Jacobian coordinates, with x and y converted to
+ * Montgomery representation and z set to one. The point at infinity is
+ * not supported.
+ *
+ * Returned value is 1 for a valid point, 0 otherwise. If the length is
+ * wrong, P is left as set by point_zero(); on other failures the
+ * coordinates still are properly formed field elements.
+ */
+static uint32_t
+point_decode(jacobian *P, const void *src, size_t len, const curve_params *cc)
+{
+	/*
+	 * Only the uncompressed format is accepted:
+	 * -- first byte is 0x04;
+	 * -- coordinates X and Y follow, in unsigned big-endian, each
+	 *    with the same length as the field modulus.
+	 *
+	 * The hybrid format (first byte 0x06 or 0x07, depending on the
+	 * least significant bit of Y) is rejected: it is rather useless,
+	 * and explicitly forbidden by PKIX (RFC 5480, section 2.2).
+	 *
+	 * The compressed format is rejected as well: it is not much used
+	 * in practice (there are or were patent-related concerns about
+	 * point compression), and supporting it would need more code.
+	 */
+	const unsigned char *in;
+	size_t flen, wlen;
+	uint32_t ok;
+	jacobian aux;
+
+	in = src;
+	point_zero(P, cc);
+	flen = (cc->p[0] - (cc->p[0] >> 4) + 7) >> 3;
+	if (len != 1 + (flen << 1)) {
+		return 0;
+	}
+
+	/*
+	 * Both coordinates must decode properly, and the first byte
+	 * must designate the uncompressed format.
+	 */
+	ok = br_i15_decode_mod(P->c[0], in + 1, flen, cc->p);
+	ok &= br_i15_decode_mod(P->c[1], in + 1 + flen, flen, cc->p);
+	ok &= EQ(in[0], 0x04);
+
+	/*
+	 * code_check expects R^2, b*R and 1 in the second operand.
+	 */
+	wlen = ((cc->p[0] + 31) >> 4) * sizeof(uint16_t);
+	memcpy(aux.c[0], cc->R2, wlen);
+	memcpy(aux.c[1], cc->b, wlen);
+	set_one(aux.c[2], cc->p);
+
+	/*
+	 * The MTZ instruction of code_check clears the flag of
+	 * run_code() when y^2 - (x^3 - 3*x + b) is zero, i.e. when the
+	 * point is on the curve; hence the complement.
+	 */
+	ok &= ~run_code(P, &aux, cc, code_check);
+	return ok;
+}
+
+/*
+ * Encode a point in uncompressed format (0x04, then X, then Y). The
+ * point is assumed to be correct and distinct from the point at
+ * infinity. Exactly 1+2*plen bytes are written into dst, where plen is
+ * the length of the field modulus, in bytes; dst must be large enough.
+ */
+static void
+point_encode(void *dst, const jacobian *P, const curve_params *cc)
+{
+	unsigned char *out;
+	size_t flen;
+	jacobian aff, aux;
+
+	out = dst;
+	flen = (cc->p[0] - (cc->p[0] >> 4) + 7) >> 3;
+
+	/*
+	 * code_affine works on a copy of P, and only needs the z
+	 * coordinate of its second operand, set to 1 (not in Montgomery
+	 * representation).
+	 */
+	memcpy(&aff, P, sizeof *P);
+	set_one(aux.c[2], cc->p);
+	run_code(&aff, &aux, cc, code_affine);
+
+	out[0] = 0x04;
+	br_i15_encode(out + 1, flen, aff.c[0]);
+	br_i15_encode(out + 1 + flen, flen, aff.c[1]);
+}
+
+/*
+ * Get the curve definition structure for a curve identifier; NULL is
+ * returned for any identifier other than the three curves listed here.
+ */
+static const br_ec_curve_def *
+id_to_curve_def(int curve)
+{
+	const br_ec_curve_def *def;
+
+	switch (curve) {
+	case BR_EC_secp256r1:
+		def = &br_secp256r1;
+		break;
+	case BR_EC_secp384r1:
+		def = &br_secp384r1;
+		break;
+	case BR_EC_secp521r1:
+		def = &br_secp521r1;
+		break;
+	default:
+		def = NULL;
+		break;
+	}
+	return def;
+}
+
+/*
+ * Return the encoded generator of the curve, and write its length (in
+ * bytes) into *len. The curve must be known to id_to_curve_def(): its
+ * result is dereferenced without a NULL check.
+ */
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	const br_ec_curve_def *def = id_to_curve_def(curve);
+
+	*len = def->generator_len;
+	return def->generator;
+}
+
+/*
+ * Return the encoded order of the curve, and write its length (in
+ * bytes) into *len. The curve must be known to id_to_curve_def(): its
+ * result is dereferenced without a NULL check.
+ */
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	const br_ec_curve_def *def = id_to_curve_def(curve);
+
+	*len = def->order_len;
+	return def->order;
+}
+
+/*
+ * Locate the X coordinate within an encoded point: the returned offset
+ * is always 1 (the byte that follows the 0x04 header), and *len is set
+ * to half the encoded generator length, rounded down.
+ */
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	size_t glen;
+
+	(void)api_generator(curve, &glen);
+	*len = glen >> 1;
+	return 1;
+}
+
+/*
+ * Multiply the encoded point G (Glen bytes) by the integer x (xlen
+ * bytes), and write the encoded result over G.
+ *
+ * Returned value is the flag from point_decode(): 1 if G was a valid
+ * point, 0 otherwise. The multiplication is performed in all cases, but
+ * G is overwritten only when Glen is the encoded point length of the
+ * curve, since point_encode() always writes that many bytes.
+ */
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	const curve_params *cc = id_to_curve(curve);
+	jacobian pt;
+	uint32_t ok;
+
+	ok = point_decode(&pt, G, Glen, cc);
+	point_mul(&pt, x, xlen, cc);
+	if (Glen != cc->point_len) {
+		return ok;
+	}
+	point_encode(G, &pt, cc);
+	return ok;
+}
+
+/*
+ * Multiply the curve generator by the integer x (xlen bytes). The
+ * encoded generator is first copied into R, then multiplied in place
+ * with api_mul(), whose returned flag is ignored. Returned value is the
+ * encoded point length; R must have room for that many bytes.
+ */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	size_t glen;
+	const unsigned char *gen = api_generator(curve, &glen);
+
+	memcpy(R, gen, glen);
+	(void)api_mul(R, glen, x, xlen, curve);
+	return glen;
+}
+
+/*
+ * Compute x*A + y*B, and write the encoded result over A. If B is NULL
+ * then the curve generator is used in its place.
+ *
+ * Returned value is 1 on success, 0 on error. Errors are:
+ * -- len is not the encoded point length of the curve (A is then left
+ *    untouched);
+ * -- A or B does not decode as a valid curve point;
+ * -- the result is the point at infinity.
+ */
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	uint32_t r, t, z;
+	const curve_params *cc;
+	jacobian P, Q;
+
+	/*
+	 * TODO: see about merging the two ladders. Right now, we do
+	 * two independent point multiplications, which is a bit
+	 * wasteful of CPU resources (but yields short code).
+	 */
+
+	cc = id_to_curve(curve);
+
+	/*
+	 * point_encode() below always writes a complete encoded point
+	 * into A, so a wrong length must be rejected before any output
+	 * is produced; otherwise a short buffer would be overrun.
+	 * api_mul() guards its own point_encode() call with the same
+	 * comparison.
+	 */
+	if (len != cc->point_len) {
+		return 0;
+	}
+	r = point_decode(&P, A, len, cc);
+	if (B == NULL) {
+		size_t Glen;
+
+		B = api_generator(curve, &Glen);
+	}
+	r &= point_decode(&Q, B, len, cc);
+	point_mul(&P, x, xlen, cc);
+	point_mul(&Q, y, ylen, cc);
+
+	/*
+	 * We want to compute P+Q. Since the base points A and B are distinct
+	 * from infinity, and the multipliers are non-zero and lower than the
+	 * curve order, then we know that P and Q are non-infinity. This
+	 * leaves two special situations to test for:
+	 * -- If P = Q then we must use point_double().
+	 * -- If P+Q = 0 then we must report an error.
+	 */
+	t = point_add(&P, &Q, cc);
+	point_double(&Q, cc);
+	z = br_i15_iszero(P.c[2]);
+
+	/*
+	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+	 * have the following:
+	 *
+	 * z = 0, t = 0 return P (normal addition)
+	 * z = 0, t = 1 return P (normal addition)
+	 * z = 1, t = 0 return Q (a 'double' case)
+	 * z = 1, t = 1 report an error (P+Q = 0)
+	 */
+	CCOPY(z & ~t, &P, &Q, sizeof Q);
+	point_encode(A, &P, cc);
+	r &= ~(z & t);
+
+	return r;
+}
+
+/*
+ * see bearssl_ec.h
+ *
+ * Implementation descriptor for this file: a curve bit mask followed by
+ * pointers to the api_ functions defined above. The mask has bits 23, 24
+ * and 25 set; presumably these are the identifiers of the three curves
+ * known to id_to_curve() -- confirm against bearssl_ec.h.
+ */
+const br_ec_impl br_ec_prime_i15 = {
+ (uint32_t)0x03800000,
+ &api_generator,
+ &api_order,
+ &api_xoff,
+ &api_mul,
+ &api_mulgen,
+ &api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_prime_i31.c b/test/monniaux/BearSSL/src/ec/ec_prime_i31.c
new file mode 100644
index 00000000..0586a3b5
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_prime_i31.c
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Parameters for supported curves: field modulus p, R^2 mod p, and the
+ * 'b' equation parameter. All values use the 'i31' format; 'b' is in
+ * Montgomery representation (i.e. it is stored as b*R mod p).
+ */
+
+static const uint32_t P256_P[] = {
+ 0x00000108,
+ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x00000007,
+ 0x00000000, 0x00000000, 0x00000040, 0x7FFFFF80,
+ 0x000000FF
+};
+
+static const uint32_t P256_R2[] = {
+ 0x00000108,
+ 0x00014000, 0x00018000, 0x00000000, 0x7FF40000,
+ 0x7FEFFFFF, 0x7FF7FFFF, 0x7FAFFFFF, 0x005FFFFF,
+ 0x00000000
+};
+
+static const uint32_t P256_B[] = {
+ 0x00000108,
+ 0x6FEE1803, 0x6229C4BD, 0x21B139BE, 0x327150AA,
+ 0x3567802E, 0x3F7212ED, 0x012E4355, 0x782DD38D,
+ 0x0000000E
+};
+
+static const uint32_t P384_P[] = {
+ 0x0000018C,
+ 0x7FFFFFFF, 0x00000001, 0x00000000, 0x7FFFFFF8,
+ 0x7FFFFFEF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+ 0x00000FFF
+};
+
+static const uint32_t P384_R2[] = {
+ 0x0000018C,
+ 0x00000000, 0x00000080, 0x7FFFFE00, 0x000001FF,
+ 0x00000800, 0x00000000, 0x7FFFE000, 0x00001FFF,
+ 0x00008000, 0x00008000, 0x00000000, 0x00000000,
+ 0x00000000
+};
+
+static const uint32_t P384_B[] = {
+ 0x0000018C,
+ 0x6E666840, 0x070D0392, 0x5D810231, 0x7651D50C,
+ 0x17E218D6, 0x1B192002, 0x44EFE441, 0x3A524E2B,
+ 0x2719BA5F, 0x41F02209, 0x36C5643E, 0x5813EFFE,
+ 0x000008A5
+};
+
+static const uint32_t P521_P[] = {
+ 0x00000219,
+ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+ 0x01FFFFFF
+};
+
+static const uint32_t P521_R2[] = {
+ 0x00000219,
+ 0x00001000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000
+};
+
+static const uint32_t P521_B[] = {
+ 0x00000219,
+ 0x540FC00A, 0x228FEA35, 0x2C34F1EF, 0x67BF107A,
+ 0x46FC1CD5, 0x1605E9DD, 0x6937B165, 0x272A3D8F,
+ 0x42785586, 0x44C8C778, 0x15F3B8B4, 0x64B73366,
+ 0x03BA8B69, 0x0D05B42A, 0x21F929A2, 0x2C31C393,
+ 0x00654FAE
+};
+
+typedef struct { /* constants describing one supported curve */
+ const uint32_t *p; /* field modulus */
+ const uint32_t *b; /* curve equation parameter b, in Montgomery representation */
+ const uint32_t *R2; /* R^2 value; code_check multiplies by it to enter Montgomery representation */
+ uint32_t p0i; /* handed, along with p, to br_i31_montymul() and br_i31_modpow() */
+} curve_params;
+
+/*
+ * Map a curve identifier to its set of constants. The table is indexed
+ * with the offset of the identifier from BR_EC_secp256r1; no range check
+ * is done, hence the caller must provide one of the three curves listed
+ * in the table.
+ */
+static inline const curve_params *
+id_to_curve(int curve)
+{
+	static const curve_params curves[] = {
+		{ P256_P, P256_B, P256_R2, 0x00000001 },
+		{ P384_P, P384_B, P384_R2, 0x00000001 },
+		{ P521_P, P521_B, P521_R2, 0x00000001 }
+	};
+	int idx;
+
+	idx = curve - BR_EC_secp256r1;
+	return curves + idx;
+}
+
+#define I31_LEN ((BR_MAX_EC_SIZE + 61) / 31) /* number of 32-bit words in each coordinate buffer (jacobian type, run_code() registers) */
+
+/*
+ * Type for a point in Jacobian coordinates:
+ * -- three values, x, y and z, in Montgomery representation
+ * -- affine coordinates are X = x / z^2 and Y = y / z^3
+ * -- for the point at infinity, z = 0
+ */
+typedef struct {
+ uint32_t c[3][I31_LEN];
+} jacobian;
+
+/*
+ * We use a custom interpreter that uses a dozen registers, and
+ * only six operations:
+ * MSET(d, a) copy a into d
+ * MADD(d, a) d = d+a (modular)
+ * MSUB(d, a) d = d-a (modular)
+ * MMUL(d, a, b) d = a*b (Montgomery multiplication)
+ * MINV(d, a, b) invert d modulo p; a and b are used as scratch registers
+ * MTZ(d) clear return value if d = 0
+ * Destination of MMUL (d) must be distinct from operands (a and b).
+ * There is no such constraint for MSUB and MADD.
+ *
+ * Registers include the operand coordinates, and temporaries.
+ *
+ * Each instruction is a 16-bit word, which run_code() decodes as:
+ * bits 12 to 15 operation code
+ * bits 8 to 11 register d
+ * bits 4 to 7 register a
+ * bits 0 to 3 register b
+ * A word equal to 0 (ENDCODE) stops the interpreter. Zero is also the
+ * encoding of MSET(0, 0), so that specific instruction cannot appear in
+ * a program.
+ */
+#define MSET(d, a) (0x0000 + ((d) << 8) + ((a) << 4))
+#define MADD(d, a) (0x1000 + ((d) << 8) + ((a) << 4))
+#define MSUB(d, a) (0x2000 + ((d) << 8) + ((a) << 4))
+#define MMUL(d, a, b) (0x3000 + ((d) << 8) + ((a) << 4) + (b))
+#define MINV(d, a, b) (0x4000 + ((d) << 8) + ((a) << 4) + (b))
+#define MTZ(d) (0x5000 + ((d) << 8))
+#define ENDCODE 0
+
+/*
+ * Registers for the input operands.
+ */
+#define P1x 0
+#define P1y 1
+#define P1z 2
+#define P2x 3
+#define P2y 4
+#define P2z 5
+
+/*
+ * Alternate names for the first input operand.
+ */
+#define Px 0
+#define Py 1
+#define Pz 2
+
+/*
+ * Temporaries.
+ */
+#define t1 6
+#define t2 7
+#define t3 8
+#define t4 9
+#define t5 10
+#define t6 11
+#define t7 12
+
+/*
+ * Extra scratch registers available when there is no second operand (e.g.
+ * for "double" and "affine").
+ */
+#define t8 3
+#define t9 4
+#define t10 5
+
+/*
+ * Doubling formulas are:
+ *
+ * s = 4*x*y^2
+ * m = 3*(x + z^2)*(x - z^2)
+ * x' = m^2 - 2*s
+ * y' = m*(s - x') - 8*y^4
+ * z' = 2*y*z
+ *
+ * If y = 0 (P has order 2) then this yields infinity (z' = 0), as it
+ * should. This case should not happen anyway, because our curves have
+ * prime order, and thus do not contain any point of order 2.
+ *
+ * If P is infinity (z = 0), then again the formulas yield infinity,
+ * which is correct. Thus, this code works for all points.
+ *
+ * Cost: 8 multiplications
+ */
+static const uint16_t code_double[] = {
+ /*
+ * Compute z^2 (in t1).
+ */
+ MMUL(t1, Pz, Pz),
+
+ /*
+ * Compute x-z^2 (in t2) and then x+z^2 (in t1).
+ */
+ MSET(t2, Px),
+ MSUB(t2, t1),
+ MADD(t1, Px),
+
+ /*
+ * Compute m = 3*(x+z^2)*(x-z^2) (in t1).
+ */
+ MMUL(t3, t1, t2),
+ MSET(t1, t3),
+ MADD(t1, t3),
+ MADD(t1, t3),
+
+ /*
+ * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3).
+ */
+ MMUL(t3, Py, Py),
+ MADD(t3, t3),
+ MMUL(t2, Px, t3),
+ MADD(t2, t2),
+
+ /*
+ * Compute x' = m^2 - 2*s.
+ */
+ MMUL(Px, t1, t1),
+ MSUB(Px, t2),
+ MSUB(Px, t2),
+
+ /*
+ * Compute z' = 2*y*z.
+ */
+ MMUL(t4, Py, Pz),
+ MSET(Pz, t4),
+ MADD(Pz, t4),
+
+ /*
+ * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+ * 2*y^2 in t3.
+ */
+ MSUB(t2, Px),
+ MMUL(Py, t1, t2),
+ MMUL(t4, t3, t3),
+ MSUB(Py, t4),
+ MSUB(Py, t4),
+
+ ENDCODE
+};
+
+/*
+ * Addition formulas are:
+ *
+ * u1 = x1 * z2^2
+ * u2 = x2 * z1^2
+ * s1 = y1 * z2^3
+ * s2 = y2 * z1^3
+ * h = u2 - u1
+ * r = s2 - s1
+ * x3 = r^2 - h^3 - 2 * u1 * h^2
+ * y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ * z3 = h * z1 * z2
+ *
+ * If both P1 and P2 are infinity, then z1 == 0 and z2 == 0, implying that
+ * z3 == 0, so the result is correct.
+ * If either of P1 or P2 is infinity, but not both, then z3 == 0, which is
+ * not correct.
+ * h == 0 only if u1 == u2; this happens in two cases:
+ * -- if s1 == s2 then P1 and/or P2 is infinity, or P1 == P2
+ * -- if s1 != s2 then P1 + P2 == infinity (but neither P1 nor P2 is infinity)
+ *
+ * Thus, the following situations are not handled correctly:
+ * -- P1 = 0 and P2 != 0
+ * -- P1 != 0 and P2 = 0
+ * -- P1 = P2
+ * All other cases are properly computed. However, even in "incorrect"
+ * situations, the three coordinates still are properly formed field
+ * elements.
+ *
+ * The returned flag is cleared if r == 0. This happens in the following
+ * cases:
+ * -- Both points are on the same horizontal line (same Y coordinate).
+ * -- Both points are infinity.
+ * -- One point is infinity and the other is on line Y = 0.
+ * The third case cannot happen with our curves (there is no valid point
+ * on line Y = 0 since that would be a point of order 2). If the two
+ * source points are non-infinity, then remains only the case where the
+ * two points are on the same horizontal line.
+ *
+ * This allows us to detect the "P1 == P2" case, assuming that P1 != 0 and
+ * P2 != 0:
+ * -- If the returned value is not the point at infinity, then it was properly
+ * computed.
+ * -- Otherwise, if the returned flag is 1, then P1+P2 = 0, and the result
+ * is indeed the point at infinity.
+ * -- Otherwise (result is infinity, flag is 0), then P1 = P2 and we should
+ * use the 'double' code.
+ *
+ * Cost: 16 multiplications
+ */
+static const uint16_t code_add[] = {
+ /*
+ * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+ */
+ MMUL(t3, P2z, P2z),
+ MMUL(t1, P1x, t3),
+ MMUL(t4, P2z, t3),
+ MMUL(t3, P1y, t4),
+
+ /*
+ * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+ */
+ MMUL(t4, P1z, P1z),
+ MMUL(t2, P2x, t4),
+ MMUL(t5, P1z, t4),
+ MMUL(t4, P2y, t5),
+
+ /*
+ * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+ */
+ MSUB(t2, t1),
+ MSUB(t4, t3),
+
+ /*
+ * Report cases where r = 0 through the returned flag.
+ */
+ MTZ(t4),
+
+ /*
+ * Compute u1*h^2 (in t6) and h^3 (in t5).
+ */
+ MMUL(t7, t2, t2),
+ MMUL(t6, t1, t7),
+ MMUL(t5, t7, t2),
+
+ /*
+ * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+ * t1 and t7 can be used as scratch registers.
+ */
+ MMUL(P1x, t4, t4),
+ MSUB(P1x, t5),
+ MSUB(P1x, t6),
+ MSUB(P1x, t6),
+
+ /*
+ * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+ */
+ MSUB(t6, P1x),
+ MMUL(P1y, t4, t6),
+ MMUL(t1, t5, t3),
+ MSUB(P1y, t1),
+
+ /*
+ * Compute z3 = h*z1*z2.
+ */
+ MMUL(t1, P1z, P2z),
+ MMUL(P1z, t1, t2),
+
+ ENDCODE
+};
+
+/*
+ * Check that the point is on the curve. This code snippet assumes the
+ * following conventions:
+ * -- Coordinates x and y have been freshly decoded in P1 (but not
+ * converted to Montgomery coordinates yet).
+ * -- P2x, P2y and P2z are set to, respectively, R^2, b*R and 1.
+ */
+static const uint16_t code_check[] = {
+
+ /* Convert x and y to Montgomery representation. */
+ MMUL(t1, P1x, P2x),
+ MMUL(t2, P1y, P2x),
+ MSET(P1x, t1),
+ MSET(P1y, t2),
+
+ /* Compute x^3 in t1. */
+ MMUL(t2, P1x, P1x),
+ MMUL(t1, P1x, t2),
+
+ /* Subtract 3*x from t1. */
+ MSUB(t1, P1x),
+ MSUB(t1, P1x),
+ MSUB(t1, P1x),
+
+ /* Add b. */
+ MADD(t1, P2y),
+
+ /* Compute y^2 in t2. */
+ MMUL(t2, P1y, P1y),
+
+ /* Compare y^2 with x^3 - 3*x + b; they must match. */
+ MSUB(t1, t2),
+ MTZ(t1),
+
+ /* Set z to 1 (in Montgomery representation). */
+ MMUL(P1z, P2x, P2z),
+
+ ENDCODE
+};
+
+/*
+ * Conversion back to affine coordinates. This code snippet assumes that
+ * the z coordinate of P2 is set to 1 (not in Montgomery representation).
+ */
+static const uint16_t code_affine[] = {
+
+ /* Save z*R in t1. */
+ MSET(t1, P1z),
+
+ /* Compute z^3 in t2. */
+ MMUL(t2, P1z, P1z),
+ MMUL(t3, P1z, t2),
+ MMUL(t2, t3, P2z),
+
+ /* Invert to (1/z^3) in t2. */
+ MINV(t2, t3, t4),
+
+ /* Compute y. */
+ MSET(t3, P1y),
+ MMUL(P1y, t2, t3),
+
+ /* Compute (1/z^2) in t3. */
+ MMUL(t3, t2, t1),
+
+ /* Compute x. */
+ MSET(t2, P1x),
+ MMUL(P1x, t2, t3),
+
+ ENDCODE
+};
+
+/*
+ * Execute one of the code_ programs on a machine with 13 registers.
+ *
+ * Registers P1x..P1z are loaded from P1 and registers P2x..P2z from P2;
+ * the other registers are temporaries with no defined initial contents.
+ * Instructions are processed in order until a zero word (ENDCODE) is
+ * reached; registers P1x..P1z are then written back into P1.
+ *
+ * Returned value is 1, unless at least one MTZ instruction found its
+ * register equal to zero, in which case 0 is returned.
+ */
+static uint32_t
+run_code(jacobian *P1, const jacobian *P2,
+	const curve_params *cc, const uint16_t *code)
+{
+	uint32_t regs[13][I31_LEN];
+	const uint16_t *pc;
+	uint32_t flag;
+
+	/*
+	 * Each copy fills three consecutive registers (x, y and z).
+	 */
+	memcpy(regs[P1x], P1->c, 3 * I31_LEN * sizeof(uint32_t));
+	memcpy(regs[P2x], P2->c, 3 * I31_LEN * sizeof(uint32_t));
+
+	flag = 1;
+	for (pc = code; *pc != 0; pc ++) {
+		unsigned insn, d, a, b;
+		unsigned char e[(BR_MAX_EC_SIZE + 7) >> 3];
+		size_t elen;
+		uint32_t ctl;
+
+		insn = *pc;
+		d = (insn >> 8) & 0x0F;
+		a = (insn >> 4) & 0x0F;
+		b = insn & 0x0F;
+		switch (insn >> 12) {
+		case 0:
+			/* MSET: plain register copy. */
+			memcpy(regs[d], regs[a], I31_LEN * sizeof(uint32_t));
+			break;
+		case 1:
+			/*
+			 * MADD: add, then subtract p if the addition
+			 * produced a carry or if the sum is not lower
+			 * than p (the call with a zero control only
+			 * reports the borrow).
+			 */
+			ctl = br_i31_add(regs[d], regs[a], 1);
+			ctl |= NOT(br_i31_sub(regs[d], cc->p, 0));
+			br_i31_sub(regs[d], cc->p, ctl);
+			break;
+		case 2:
+			/*
+			 * MSUB: subtract, then add p back if the
+			 * subtraction reported a borrow.
+			 */
+			ctl = br_i31_sub(regs[d], regs[a], 1);
+			br_i31_add(regs[d], cc->p, ctl);
+			break;
+		case 3:
+			/* MMUL: Montgomery multiplication. */
+			br_i31_montymul(regs[d], regs[a], regs[b],
+				cc->p, cc->p0i);
+			break;
+		case 4:
+			/*
+			 * MINV: raise d to the power p-2. The exponent
+			 * is the encoded modulus with 2 subtracted from
+			 * its last byte.
+			 */
+			elen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3;
+			br_i31_encode(e, elen, cc->p);
+			e[elen - 1] -= 2;
+			br_i31_modpow(regs[d], e, elen,
+				cc->p, cc->p0i, regs[a], regs[b]);
+			break;
+		default:
+			/* MTZ: clear the flag if d is zero. */
+			flag &= ~br_i31_iszero(regs[d]);
+			break;
+		}
+	}
+
+	/*
+	 * Registers P1x..P1z hold the result.
+	 */
+	memcpy(P1->c, regs[P1x], 3 * I31_LEN * sizeof(uint32_t));
+	return flag;
+}
+
+/*
+ * Set x to the value 1, with the same header word as the modulus p.
+ * The number of words written is derived from p[0]; words beyond the
+ * first value word are cleared.
+ */
+static void
+set_one(uint32_t *x, const uint32_t *p)
+{
+	size_t nwords, i;
+
+	nwords = (p[0] + 63) >> 5;
+	x[0] = p[0];
+	x[1] = 0x00000001;
+	for (i = 2; i < nwords; i ++) {
+		x[i] = 0;
+	}
+}
+
+/*
+ * Clear all three coordinates of P, then give each of them the header
+ * word of the field modulus so that they are well-formed zero values.
+ */
+static void
+point_zero(jacobian *P, const curve_params *cc)
+{
+	size_t i;
+
+	memset(P, 0, sizeof *P);
+	for (i = 0; i < 3; i ++) {
+		P->c[i][0] = cc->p[0];
+	}
+}
+
+/*
+ * Double P in place: runs the code_double snippet with P used as both
+ * operands. The status returned by run_code() is not used.
+ */
+static inline void
+point_double(jacobian *P, const curve_params *cc)
+{
+	(void)run_code(P, P, cc, code_double);
+}
+
+/*
+ * Add P2 to P1 (result in P1) by running the code_add snippet. The
+ * status returned by run_code() is passed through unchanged;
+ * api_muladd() uses it, together with the z coordinate of the result,
+ * to tell the "P1 = P2" case from the "P1 + P2 = 0" case.
+ */
+static inline uint32_t
+point_add(jacobian *P1, const jacobian *P2, const curve_params *cc)
+{
+	uint32_t status;
+
+	status = run_code(P1, P2, cc, code_add);
+	return status;
+}
+
+/*
+ * Multiply point P by the scalar x (xlen bytes), result in P. The bytes
+ * of x are consumed from first to last, and within each byte from the
+ * top two bits down to the low two bits, i.e. x is read as an unsigned
+ * big-endian integer. The same sequence of doublings, additions and
+ * conditional copies is executed whatever the value of the scalar bits.
+ */
+static void
+point_mul(jacobian *P, const unsigned char *x, size_t xlen,
+ const curve_params *cc)
+{
+ /*
+ * We do a simple double-and-add ladder with a 2-bit window
+ * to make only one add every two doublings. We thus first
+ * precompute 2P and 3P in some local buffers.
+ *
+ * We always perform two doublings and one addition; the
+ * addition is with P, 2P and 3P and is done in a temporary
+ * array.
+ *
+ * The addition code cannot handle cases where one of the
+ * operands is infinity, which is the case at the start of the
+ * ladder. We therefore need to maintain a flag that controls
+ * this situation.
+ */
+ uint32_t qz;
+ jacobian P2, P3, Q, T, U;
+
+ /* Precompute P2 = 2*P and P3 = P + 2*P. */
+ memcpy(&P2, P, sizeof P2);
+ point_double(&P2, cc);
+ memcpy(&P3, P, sizeof P3);
+ point_add(&P3, &P2, cc);
+
+ /* Q is the accumulator; qz is 1 as long as Q is still infinity. */
+ point_zero(&Q, cc);
+ qz = 1;
+ while (xlen -- > 0) {
+ int k;
+
+ for (k = 6; k >= 0; k -= 2) {
+ uint32_t bits;
+ uint32_t bnz;
+
+ point_double(&Q, cc);
+ point_double(&Q, cc);
+ memcpy(&T, P, sizeof T);
+ memcpy(&U, &Q, sizeof U);
+ bits = (*x >> k) & (uint32_t)3;
+ bnz = NEQ(bits, 0);
+ /* T <- P, 2P or 3P depending on the window value. */
+ CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
+ CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
+ /* U <- Q + T, always computed, used only if relevant. */
+ point_add(&U, &T, cc);
+ /*
+ * Non-zero window: Q becomes T if Q was still infinity,
+ * Q + T otherwise. Zero window: Q is left unchanged.
+ */
+ CCOPY(bnz & qz, &Q, &T, sizeof Q);
+ CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
+ qz &= ~bnz;
+ }
+ x ++;
+ }
+ memcpy(P, &Q, sizeof Q);
+}
+
+/*
+ * Decode point into Jacobian coordinates. This function does not support
+ * the point at infinity. If the point is invalid then this returns 0, but
+ * the coordinates are still set to properly formed field elements.
+ *
+ * Returned value is 1 when the length, the first byte, the range of
+ * both coordinates and the curve equation are all correct; 0 otherwise.
+ * If the length is wrong, the function returns 0 immediately, with P
+ * set by point_zero() and without reading the source buffer.
+ */
+static uint32_t
+point_decode(jacobian *P, const void *src, size_t len, const curve_params *cc)
+{
+ /*
+ * Points must use uncompressed format:
+ * -- first byte is 0x04;
+ * -- coordinates X and Y use unsigned big-endian, with the same
+ * length as the field modulus.
+ *
+ * We don't support hybrid format (uncompressed, but first byte
+ * has value 0x06 or 0x07, depending on the least significant bit
+ * of Y) because it is rather useless, and explicitly forbidden
+ * by PKIX (RFC 5480, section 2.2).
+ *
+ * We don't support compressed format either, because it is not
+ * much used in practice (there are or were patent-related
+ * concerns about point compression, which explains the lack of
+ * generalised support). Also, point compression support would
+ * need a bit more code.
+ */
+ const unsigned char *buf;
+ size_t plen, zlen;
+ uint32_t r;
+ jacobian Q;
+
+ buf = src;
+ point_zero(P, cc);
+ /* plen is the length, in bytes, of one encoded coordinate. */
+ plen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3;
+ if (len != 1 + (plen << 1)) {
+ return 0;
+ }
+ r = br_i31_decode_mod(P->c[0], buf + 1, plen, cc->p);
+ r &= br_i31_decode_mod(P->c[1], buf + 1 + plen, plen, cc->p);
+
+ /*
+ * Check first byte.
+ */
+ r &= EQ(buf[0], 0x04);
+ /* obsolete: former check, which also accepted the hybrid format
+ r &= EQ(buf[0], 0x04) | (EQ(buf[0] & 0xFE, 0x06)
+ & ~(uint32_t)(buf[0] ^ buf[plen << 1]));
+ */
+
+ /*
+ * Convert coordinates and check that the point is valid. Q carries
+ * the constants expected by code_check: R^2, b and 1. run_code()
+ * returns 0 on that snippet when the curve equation holds, hence
+ * the inversion before merging into r.
+ */
+ zlen = ((cc->p[0] + 63) >> 5) * sizeof(uint32_t);
+ memcpy(Q.c[0], cc->R2, zlen);
+ memcpy(Q.c[1], cc->b, zlen);
+ set_one(Q.c[2], cc->p);
+ r &= ~run_code(P, &Q, cc, code_check);
+ return r;
+}
+
+/*
+ * Write the uncompressed encoding of P into dst: one 0x04 byte, then
+ * the affine x and y coordinates, each over plen bytes (plen being the
+ * length of the field modulus in bytes), for a total of 1+2*plen bytes.
+ * The point is assumed to be valid and distinct from the point at
+ * infinity. P itself is not modified; the conversion to affine
+ * coordinates is done on a local copy.
+ */
+static void
+point_encode(void *dst, const jacobian *P, const curve_params *cc)
+{
+	unsigned char *out;
+	size_t flen;
+	jacobian A, one;
+
+	/* Length of one encoded coordinate, in bytes. */
+	flen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3;
+
+	/* Convert a copy of P to affine; code_affine needs P2z = 1. */
+	memcpy(&A, P, sizeof A);
+	set_one(one.c[2], cc->p);
+	run_code(&A, &one, cc, code_affine);
+
+	out = dst;
+	out[0] = 0x04;
+	br_i31_encode(out + 1, flen, A.c[0]);
+	br_i31_encode(out + 1 + flen, flen, A.c[1]);
+}
+
+/*
+ * Map a curve identifier to its static definition (order and
+ * generator). Returns NULL for any identifier other than secp256r1,
+ * secp384r1 and secp521r1.
+ */
+static const br_ec_curve_def *
+id_to_curve_def(int curve)
+{
+	if (curve == BR_EC_secp256r1) {
+		return &br_secp256r1;
+	}
+	if (curve == BR_EC_secp384r1) {
+		return &br_secp384r1;
+	}
+	if (curve == BR_EC_secp521r1) {
+		return &br_secp521r1;
+	}
+	return NULL;
+}
+
+/*
+ * Return the encoded generator of the curve and store its length in
+ * *len. The curve identifier must be one that id_to_curve_def() knows:
+ * the definition pointer is dereferenced without a NULL check.
+ */
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	const br_ec_curve_def *def = id_to_curve_def(curve);
+
+	*len = def->generator_len;
+	return def->generator;
+}
+
+/*
+ * Return the encoded subgroup order of the curve and store its length
+ * in *len. The curve identifier must be one that id_to_curve_def()
+ * knows: the definition pointer is dereferenced without a NULL check.
+ */
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	const br_ec_curve_def *def = id_to_curve_def(curve);
+
+	*len = def->order_len;
+	return def->order;
+}
+
+/*
+ * Locate the X coordinate inside an encoded point: the returned offset
+ * is 1 (the byte right after the leading format byte) and *len is set
+ * to half the generator length, rounded down.
+ */
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	size_t glen;
+
+	(void)api_generator(curve, &glen);
+	*len = glen >> 1;
+	return 1;
+}
+
+/*
+ * Multiply the encoded point G (Glen bytes) by the scalar x (xlen
+ * bytes, unsigned big-endian) and write the encoded result back over G.
+ *
+ * Returned value is 1 on success, 0 on error. If Glen is not the
+ * encoded point length for the curve, 0 is returned and G is left
+ * untouched. If the length is right but the point is invalid, 0 is
+ * returned as well; the multiplication is still carried out and G is
+ * overwritten with a value that the caller must discard.
+ */
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	uint32_t r;
+	size_t plen;
+	const curve_params *cc;
+	jacobian P;
+
+	cc = id_to_curve(curve);
+
+	/*
+	 * point_encode() always writes 1+2*plen bytes. point_decode()
+	 * merely returns 0 on a length mismatch, so the length must be
+	 * checked here, before anything is written: otherwise a buffer
+	 * shorter than an encoded point would be overflowed.
+	 */
+	plen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3;
+	if (Glen != 1 + (plen << 1)) {
+		return 0;
+	}
+
+	r = point_decode(&P, G, Glen, cc);
+	point_mul(&P, x, xlen, cc);
+	point_encode(G, &P, cc);
+	return r;
+}
+
+/*
+ * Multiply the curve generator by the scalar x: the encoded generator
+ * is copied into R, then multiplied in place with api_mul(). The
+ * returned value is the length of the encoded generator, which is also
+ * the number of bytes copied into R. The status of api_mul() is not
+ * reported.
+ */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	size_t glen;
+	const unsigned char *gen = api_generator(curve, &glen);
+
+	memcpy(R, gen, glen);
+	api_mul(R, glen, x, xlen, curve);
+	return glen;
+}
+
+/*
+ * Compute x*A + y*B and write the encoded result over A. A and B are
+ * encoded points of 'len' bytes each; if B is NULL, the curve generator
+ * is used instead. x and y are scalars of xlen and ylen bytes.
+ *
+ * Returned value is 1 on success, 0 on error. If len is not the encoded
+ * point length for the curve, 0 is returned and A is left untouched.
+ * Other errors (invalid point, result equal to the point at infinity)
+ * also yield 0, but A is then overwritten with a value that the caller
+ * must discard.
+ */
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	uint32_t r, t, z;
+	size_t plen;
+	const curve_params *cc;
+	jacobian P, Q;
+
+	/*
+	 * TODO: see about merging the two ladders. Right now, we do
+	 * two independent point multiplications, which is a bit
+	 * wasteful of CPU resources (but yields short code).
+	 */
+
+	cc = id_to_curve(curve);
+
+	/*
+	 * point_encode() always writes 1+2*plen bytes into A.
+	 * point_decode() merely returns 0 on a length mismatch, so the
+	 * length must be checked here, before anything is written:
+	 * otherwise a buffer shorter than an encoded point would be
+	 * overflowed.
+	 */
+	plen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3;
+	if (len != 1 + (plen << 1)) {
+		return 0;
+	}
+
+	r = point_decode(&P, A, len, cc);
+	if (B == NULL) {
+		size_t Glen;
+
+		/* Only the pointer is needed; len was validated above. */
+		B = api_generator(curve, &Glen);
+	}
+	r &= point_decode(&Q, B, len, cc);
+	point_mul(&P, x, xlen, cc);
+	point_mul(&Q, y, ylen, cc);
+
+	/*
+	 * We want to compute P+Q. Since the base points A and B are distinct
+	 * from infinity, and the multipliers are non-zero and lower than the
+	 * curve order, then we know that P and Q are non-infinity. This
+	 * leaves two special situations to test for:
+	 * -- If P = Q then we must use point_double().
+	 * -- If P+Q = 0 then we must report an error.
+	 */
+	t = point_add(&P, &Q, cc);
+	point_double(&Q, cc);
+	z = br_i31_iszero(P.c[2]);
+
+	/*
+	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+	 * have the following:
+	 *
+	 * z = 0, t = 0 return P (normal addition)
+	 * z = 0, t = 1 return P (normal addition)
+	 * z = 1, t = 0 return Q (a 'double' case)
+	 * z = 1, t = 1 report an error (P+Q = 0)
+	 */
+	CCOPY(z & ~t, &P, &Q, sizeof Q);
+	point_encode(A, &P, cc);
+	r &= ~(z & t);
+
+	return r;
+}
+
+/*
+ * see bearssl_ec.h
+ *
+ * The first field is presumably the bit mask of supported curves, i.e.
+ * the field that br_ec_compute_pub() tests as
+ * (supported_curves >> curve) & 1. The value 0x03800000 has bits 23,
+ * 24 and 25 set; these are the indices of secp256r1, secp384r1 and
+ * secp521r1 in the POINT_LEN table of ec_pubkey.c, and the three
+ * curves that id_to_curve_def() handles. The remaining fields are the
+ * functions of this implementation.
+ */
+const br_ec_impl br_ec_prime_i31 = {
+ (uint32_t)0x03800000,
+ &api_generator,
+ &api_order,
+ &api_xoff,
+ &api_mul,
+ &api_mulgen,
+ &api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_pubkey.c b/test/monniaux/BearSSL/src/ec/ec_pubkey.c
new file mode 100644
index 00000000..383ff286
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_pubkey.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Length (in bytes) of an encoded public key, indexed by curve
+ * identifier. br_ec_compute_pub() returns the entry for the requested
+ * curve when it is called without an output buffer. Index 0 is not a
+ * valid curve identifier.
+ */
+static const unsigned char POINT_LEN[] = {
+ 0, /* 0: not a valid curve ID */
+ 43, /* sect163k1 */
+ 43, /* sect163r1 */
+ 43, /* sect163r2 */
+ 51, /* sect193r1 */
+ 51, /* sect193r2 */
+ 61, /* sect233k1 */
+ 61, /* sect233r1 */
+ 61, /* sect239k1 */
+ 73, /* sect283k1 */
+ 73, /* sect283r1 */
+ 105, /* sect409k1 */
+ 105, /* sect409r1 */
+ 145, /* sect571k1 */
+ 145, /* sect571r1 */
+ 41, /* secp160k1 */
+ 41, /* secp160r1 */
+ 41, /* secp160r2 */
+ 49, /* secp192k1 */
+ 49, /* secp192r1 */
+ 57, /* secp224k1 */
+ 57, /* secp224r1 */
+ 65, /* secp256k1 */
+ 65, /* secp256r1 */
+ 97, /* secp384r1 */
+ 133, /* secp521r1 */
+ 65, /* brainpoolP256r1 */
+ 97, /* brainpoolP384r1 */
+ 129, /* brainpoolP512r1 */
+ 32, /* curve25519 */
+ 56, /* curve448 */
+};
+
+/* see bearssl_ec.h */
+size_t
+br_ec_compute_pub(const br_ec_impl *impl, br_ec_public_key *pk,
+	void *kbuf, const br_ec_private_key *sk)
+{
+	int id;
+	size_t qlen;
+
+	/*
+	 * The curve must have an entry in POINT_LEN and be flagged as
+	 * supported by the implementation. The range test comes first
+	 * so that the shift count below is always valid.
+	 */
+	id = sk->curve;
+	if (id < 0 || id >= 32) {
+		return 0;
+	}
+	if (id >= (int)(sizeof POINT_LEN)) {
+		return 0;
+	}
+	if (((impl->supported_curves >> id) & 1) == 0) {
+		return 0;
+	}
+
+	/* Without an output buffer, only report the public key length. */
+	if (kbuf == NULL) {
+		return POINT_LEN[id];
+	}
+
+	/* Compute the public point into kbuf; fill pk if provided. */
+	qlen = impl->mulgen(kbuf, sk->x, sk->xlen, id);
+	if (pk != NULL) {
+		pk->curve = id;
+		pk->q = kbuf;
+		pk->qlen = qlen;
+	}
+	return qlen;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ec_secp256r1.c b/test/monniaux/BearSSL/src/ec/ec_secp256r1.c
new file mode 100644
index 00000000..a9d6c456
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_secp256r1.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* Subgroup order of secp256r1 (32 bytes, unsigned big-endian). */
+static const unsigned char P256_N[] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xBC, 0xE6, 0xFA, 0xAD, 0xA7, 0x17, 0x9E, 0x84,
+ 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63, 0x25, 0x51
+};
+
+/*
+ * Generator of secp256r1 in uncompressed format (65 bytes): the 0x04
+ * format byte, then X and Y over 32 bytes each.
+ */
+static const unsigned char P256_G[] = {
+ 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42,
+ 0x47, 0xF8, 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40,
+ 0xF2, 0x77, 0x03, 0x7D, 0x81, 0x2D, 0xEB, 0x33,
+ 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8, 0x98, 0xC2,
+ 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
+ 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E,
+ 0x16, 0x2B, 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E,
+ 0xCE, 0xCB, 0xB6, 0x40, 0x68, 0x37, 0xBF, 0x51,
+ 0xF5
+};
+
+/*
+ * see inner.h
+ *
+ * Static definition of secp256r1: curve identifier, then the subgroup
+ * order and the generator, each followed by its length in bytes.
+ */
+const br_ec_curve_def br_secp256r1 = {
+ BR_EC_secp256r1,
+ P256_N, sizeof P256_N,
+ P256_G, sizeof P256_G
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_secp384r1.c b/test/monniaux/BearSSL/src/ec/ec_secp384r1.c
new file mode 100644
index 00000000..693d93e4
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_secp384r1.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* Subgroup order of secp384r1 (48 bytes, unsigned big-endian). */
+static const unsigned char P384_N[] = {
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xC7, 0x63, 0x4D, 0x81, 0xF4, 0x37, 0x2D, 0xDF,
+ 0x58, 0x1A, 0x0D, 0xB2, 0x48, 0xB0, 0xA7, 0x7A,
+ 0xEC, 0xEC, 0x19, 0x6A, 0xCC, 0xC5, 0x29, 0x73
+};
+
+/*
+ * Generator of secp384r1 in uncompressed format (97 bytes): the 0x04
+ * format byte, then X and Y over 48 bytes each.
+ */
+static const unsigned char P384_G[] = {
+ 0x04, 0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05,
+ 0x37, 0x8E, 0xB1, 0xC7, 0x1E, 0xF3, 0x20, 0xAD,
+ 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B,
+ 0x98, 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A,
+ 0x38, 0x55, 0x02, 0xF2, 0x5D, 0xBF, 0x55, 0x29,
+ 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A,
+ 0xB7, 0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C,
+ 0x6F, 0x5D, 0x9E, 0x98, 0xBF, 0x92, 0x92, 0xDC,
+ 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14,
+ 0x7C, 0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8,
+ 0xC0, 0x0A, 0x60, 0xB1, 0xCE, 0x1D, 0x7E, 0x81,
+ 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E,
+ 0x5F
+};
+
+/*
+ * see inner.h
+ *
+ * Static definition of secp384r1: curve identifier, then the subgroup
+ * order and the generator, each followed by its length in bytes.
+ */
+const br_ec_curve_def br_secp384r1 = {
+ BR_EC_secp384r1,
+ P384_N, sizeof P384_N,
+ P384_G, sizeof P384_G
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_secp521r1.c b/test/monniaux/BearSSL/src/ec/ec_secp521r1.c
new file mode 100644
index 00000000..161acd0e
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_secp521r1.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* Subgroup order of secp521r1 (66 bytes, unsigned big-endian). */
+static const unsigned char P521_N[] = {
+ 0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+ 0xFF, 0xFA, 0x51, 0x86, 0x87, 0x83, 0xBF, 0x2F,
+ 0x96, 0x6B, 0x7F, 0xCC, 0x01, 0x48, 0xF7, 0x09,
+ 0xA5, 0xD0, 0x3B, 0xB5, 0xC9, 0xB8, 0x89, 0x9C,
+ 0x47, 0xAE, 0xBB, 0x6F, 0xB7, 0x1E, 0x91, 0x38,
+ 0x64, 0x09
+};
+
+/*
+ * Generator of secp521r1 in uncompressed format (133 bytes): the 0x04
+ * format byte, then X and Y over 66 bytes each.
+ */
+static const unsigned char P521_G[] = {
+ 0x04, 0x00, 0xC6, 0x85, 0x8E, 0x06, 0xB7, 0x04,
+ 0x04, 0xE9, 0xCD, 0x9E, 0x3E, 0xCB, 0x66, 0x23,
+ 0x95, 0xB4, 0x42, 0x9C, 0x64, 0x81, 0x39, 0x05,
+ 0x3F, 0xB5, 0x21, 0xF8, 0x28, 0xAF, 0x60, 0x6B,
+ 0x4D, 0x3D, 0xBA, 0xA1, 0x4B, 0x5E, 0x77, 0xEF,
+ 0xE7, 0x59, 0x28, 0xFE, 0x1D, 0xC1, 0x27, 0xA2,
+ 0xFF, 0xA8, 0xDE, 0x33, 0x48, 0xB3, 0xC1, 0x85,
+ 0x6A, 0x42, 0x9B, 0xF9, 0x7E, 0x7E, 0x31, 0xC2,
+ 0xE5, 0xBD, 0x66, 0x01, 0x18, 0x39, 0x29, 0x6A,
+ 0x78, 0x9A, 0x3B, 0xC0, 0x04, 0x5C, 0x8A, 0x5F,
+ 0xB4, 0x2C, 0x7D, 0x1B, 0xD9, 0x98, 0xF5, 0x44,
+ 0x49, 0x57, 0x9B, 0x44, 0x68, 0x17, 0xAF, 0xBD,
+ 0x17, 0x27, 0x3E, 0x66, 0x2C, 0x97, 0xEE, 0x72,
+ 0x99, 0x5E, 0xF4, 0x26, 0x40, 0xC5, 0x50, 0xB9,
+ 0x01, 0x3F, 0xAD, 0x07, 0x61, 0x35, 0x3C, 0x70,
+ 0x86, 0xA2, 0x72, 0xC2, 0x40, 0x88, 0xBE, 0x94,
+ 0x76, 0x9F, 0xD1, 0x66, 0x50
+};
+
+/*
+ * see inner.h
+ *
+ * Static definition of secp521r1: curve identifier, then the subgroup
+ * order and the generator, each followed by its length in bytes.
+ */
+const br_ec_curve_def br_secp521r1 = {
+ BR_EC_secp521r1,
+ P521_N, sizeof P521_N,
+ P521_G, sizeof P521_G
+};
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_atr.c b/test/monniaux/BearSSL/src/ec/ecdsa_atr.c
new file mode 100644
index 00000000..3a11226e
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_atr.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_ec.h */
+size_t
+br_ecdsa_asn1_to_raw(void *sig, size_t sig_len)
+{
+	/*
+	 * The input is an ASN.1 SEQUENCE of two INTEGER values (r and
+	 * s); it is converted, in place, into the "raw" format: r and s
+	 * as unsigned big-endian integers of the same length, one after
+	 * the other. The length of the raw signature is returned, or 0
+	 * if the input could not be parsed.
+	 *
+	 * Parsing is a bit lenient with regards to DER: non-minimal
+	 * encodings of the SEQUENCE length and of the integer values
+	 * are accepted, and a first length byte equal to 0x80 is taken
+	 * as a one-byte length of 128.
+	 *
+	 * Each INTEGER must be shorter than 128 bytes, hence the raw
+	 * signature never exceeds 254 bytes.
+	 */
+	unsigned char *in;
+	const unsigned char *rp, *sp;
+	size_t hdr, pos, rn, sn, half, total;
+	unsigned char out[254];
+
+	in = sig;
+
+	/*
+	 * Inputs shorter than 8 bytes are rejected outright; the first
+	 * byte must be the SEQUENCE tag.
+	 */
+	if (sig_len < 8 || in[0] != 0x30) {
+		return 0;
+	}
+
+	/*
+	 * SEQUENCE length: either a single byte, or 0x81 followed by
+	 * one byte. In both cases, the announced length must match
+	 * exactly what remains of the input.
+	 */
+	if (in[1] > 0x80) {
+		if (in[1] != 0x81) {
+			return 0;
+		}
+		hdr = 3;
+	} else {
+		hdr = 2;
+	}
+	if ((size_t)in[hdr - 1] != sig_len - hdr) {
+		return 0;
+	}
+	pos = hdr;
+
+	/*
+	 * First INTEGER (r): tag, one-byte length, value.
+	 */
+	if (in[pos] != 0x02) {
+		return 0;
+	}
+	rn = in[pos + 1];
+	if (rn >= 0x80) {
+		return 0;
+	}
+	rp = in + pos + 2;
+	pos += 2 + rn;
+
+	/*
+	 * Second INTEGER (s): there must be room for its tag and
+	 * length, and its value must extend exactly to the end of the
+	 * input.
+	 */
+	if (pos + 2 > sig_len) {
+		return 0;
+	}
+	if (in[pos] != 0x02) {
+		return 0;
+	}
+	sn = in[pos + 1];
+	pos += 2;
+	if (sn >= 0x80 || sn != sig_len - pos) {
+		return 0;
+	}
+	sp = in + pos;
+
+	/*
+	 * Skip the leading zeros of both values.
+	 */
+	for (; rn > 0 && *rp == 0; rn --) {
+		rp ++;
+	}
+	for (; sn > 0 && *sp == 0; sn --) {
+		sp ++;
+	}
+
+	/*
+	 * Both values are right-aligned in fields of the same length
+	 * (the larger of the two). The result is assembled in a local
+	 * buffer because r and s still point into the signature buffer.
+	 */
+	half = sn;
+	if (rn > sn) {
+		half = rn;
+	}
+	total = half << 1;
+	memset(out, 0, total);
+	memcpy(out + half - rn, rp, rn);
+	memcpy(out + total - sn, sp, sn);
+	memcpy(sig, out, total);
+	return total;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_asn1.c
new file mode 100644
index 00000000..afbf8acb
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_asn1.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_ec.h */
+br_ecdsa_sign
+br_ecdsa_sign_asn1_get_default(void)
+{
+	br_ecdsa_sign fn;
+
+	/* The i15 code is selected when BR_LOMUL is set, i31 otherwise. */
+#if BR_LOMUL
+	fn = &br_ecdsa_i15_sign_asn1;
+#else
+	fn = &br_ecdsa_i31_sign_asn1;
+#endif
+	return fn;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_raw.c
new file mode 100644
index 00000000..287c9704
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_raw.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_ec.h */
+br_ecdsa_sign
+br_ecdsa_sign_raw_get_default(void)
+{
+	br_ecdsa_sign fn;
+
+	/* The i15 code is selected when BR_LOMUL is set, i31 otherwise. */
+#if BR_LOMUL
+	fn = &br_ecdsa_i15_sign_raw;
+#else
+	fn = &br_ecdsa_i31_sign_raw;
+#endif
+	return fn;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_asn1.c
new file mode 100644
index 00000000..fe0996e8
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_asn1.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_ec.h */
+br_ecdsa_vrfy
+br_ecdsa_vrfy_asn1_get_default(void)
+{
+	br_ecdsa_vrfy fn;
+
+	/* The i15 code is selected when BR_LOMUL is set, i31 otherwise. */
+#if BR_LOMUL
+	fn = &br_ecdsa_i15_vrfy_asn1;
+#else
+	fn = &br_ecdsa_i31_vrfy_asn1;
+#endif
+	return fn;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_raw.c
new file mode 100644
index 00000000..e564a105
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_raw.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_ec.h */
+br_ecdsa_vrfy
+br_ecdsa_vrfy_raw_get_default(void)
+{
+	br_ecdsa_vrfy fn;
+
+	/* The i15 code is selected when BR_LOMUL is set, i31 otherwise. */
+#if BR_LOMUL
+	fn = &br_ecdsa_i15_vrfy_raw;
+#else
+	fn = &br_ecdsa_i31_vrfy_raw;
+#endif
+	return fn;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_bits.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_bits.c
new file mode 100644
index 00000000..402d14a6
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_bits.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+br_ecdsa_i15_bits2int(uint16_t *x,
+	const void *src, size_t len, uint32_t ebitlen)
+{
+	uint32_t order_bits, input_bits;
+	int shift;
+
+	/*
+	 * order_bits is the bit length that corresponds to the header
+	 * word ebitlen; input_bits is the size of the source, in bits.
+	 * If the source is longer, only its first bytes are decoded,
+	 * and the value is then shifted right so that exactly the
+	 * order_bits leftmost bits of the source are kept.
+	 */
+	shift = 0;
+	order_bits = ebitlen - (ebitlen >> 4);
+	input_bits = (uint32_t)len << 3;
+	if (input_bits > order_bits) {
+		shift = (int)((input_bits - order_bits) & 7);
+		len = (order_bits + 7) >> 3;
+	}
+
+	br_i15_zero(x, ebitlen);
+	br_i15_decode(x, src, len);
+	br_i15_rshift(x, shift);
+	x[0] = ebitlen;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_asn1.c
new file mode 100644
index 00000000..ab4a283c
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_asn1.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define ORDER_LEN ((BR_MAX_EC_SIZE + 7) >> 3) /* BR_MAX_EC_SIZE bits, rounded up to whole bytes */
+
+/* see bearssl_ec.h -- sign in raw format, convert to ASN.1 in a local buffer, copy to sig */
+size_t
+br_ecdsa_i15_sign_asn1(const br_ec_impl *impl,
+ const br_hash_class *hf, const void *hash_value,
+ const br_ec_private_key *sk, void *sig)
+{
+ unsigned char rsig[(ORDER_LEN << 1) + 12]; /* two ORDER_LEN values plus 12 spare bytes, presumably for the ASN.1 framing */
+ size_t sig_len;
+
+ sig_len = br_ecdsa_i15_sign_raw(impl, hf, hash_value, sk, rsig);
+ if (sig_len == 0) { /* the raw signer returns 0 on error; propagate it */
+ return 0;
+ }
+ sig_len = br_ecdsa_raw_to_asn1(rsig, sig_len); /* works on rsig itself and returns the new length */
+ memcpy(sig, rsig, sig_len);
+ return sig_len;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_raw.c
new file mode 100644
index 00000000..39b2e1d7
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_raw.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define I15_LEN ((BR_MAX_EC_SIZE + 29) / 15) /* ceil(BR_MAX_EC_SIZE / 15) + 1 words */
+#define POINT_LEN (1 + (((BR_MAX_EC_SIZE + 7) >> 3) << 1)) /* 1 byte + twice ceil(BR_MAX_EC_SIZE / 8) bytes */
+#define ORDER_LEN ((BR_MAX_EC_SIZE + 7) >> 3) /* ceil(BR_MAX_EC_SIZE / 8) bytes */
+
+/* see bearssl_ec.h -- writes r then s (nlen bytes each) into sig; returns 2*nlen, or 0 on error */
+size_t
+br_ecdsa_i15_sign_raw(const br_ec_impl *impl,
+ const br_hash_class *hf, const void *hash_value,
+ const br_ec_private_key *sk, void *sig)
+{
+ /*
+ * IMPORTANT: this code is fit only for curves with a prime
+ * order. This is needed so that modular reduction of the X
+ * coordinate of a point can be done with a simple subtraction.
+ * We also rely on the last byte of the curve order to be distinct
+ * from 0 and 1.
+ */
+ const br_ec_curve_def *cd;
+ uint16_t n[I15_LEN], r[I15_LEN], s[I15_LEN], x[I15_LEN];
+ uint16_t m[I15_LEN], k[I15_LEN], t1[I15_LEN], t2[I15_LEN];
+ unsigned char tt[ORDER_LEN << 1]; /* scratch: DRBG seed (x || m), then k, then the exponent n-2 */
+ unsigned char eU[POINT_LEN]; /* receives the encoded point k*G */
+ size_t hash_len, nlen, ulen;
+ uint16_t n0i;
+ uint32_t ctl;
+ br_hmac_drbg_context drbg;
+
+ /*
+ * If the curve is not supported, then exit with an error.
+ */
+ if (((impl->supported_curves >> sk->curve) & 1) == 0) {
+ return 0;
+ }
+
+ /*
+ * Get the curve parameters (generator and order).
+ */
+ switch (sk->curve) {
+ case BR_EC_secp256r1:
+ cd = &br_secp256r1;
+ break;
+ case BR_EC_secp384r1:
+ cd = &br_secp384r1;
+ break;
+ case BR_EC_secp521r1:
+ cd = &br_secp521r1;
+ break;
+ default:
+ return 0; /* no curve definition is known here for this identifier */
+ }
+
+ /*
+ * Get modulus.
+ */
+ nlen = cd->order_len;
+ br_i15_decode(n, cd->order, nlen);
+ n0i = br_i15_ninv15(n[1]); /* computed from n[1]; handed to every Montgomery call below */
+
+ /*
+ * Get private key as an i15 integer. This also checks that the
+ * private key is well-defined (not zero, and less than the
+ * curve order).
+ */
+ if (!br_i15_decode_mod(x, sk->x, sk->xlen, n)) {
+ return 0;
+ }
+ if (br_i15_iszero(x)) {
+ return 0;
+ }
+
+ /*
+ * Get hash length.
+ */
+ hash_len = (hf->desc >> BR_HASHDESC_OUT_OFF) & BR_HASHDESC_OUT_MASK;
+
+ /*
+ * Truncate and reduce the hash value modulo the curve order.
+ */
+ br_ecdsa_i15_bits2int(m, hash_value, hash_len, n[0]);
+ br_i15_sub(m, n, br_i15_sub(m, n, 0) ^ 1); /* inner call (ctl = 0) presumably only reports the borrow; n is subtracted when there is none */
+
+ /*
+ * RFC 6979 generation of the "k" value.
+ *
+ * The process uses HMAC_DRBG (with the hash function used to
+ * process the message that is to be signed). The seed is the
+ * concatenation of the encodings of the private key and
+ * the hash value (after truncation and modular reduction).
+ */
+ br_i15_encode(tt, nlen, x);
+ br_i15_encode(tt + nlen, nlen, m);
+ br_hmac_drbg_init(&drbg, hf, tt, nlen << 1);
+ for (;;) {
+ br_hmac_drbg_generate(&drbg, tt, nlen);
+ br_ecdsa_i15_bits2int(k, tt, nlen, n[0]);
+ if (br_i15_iszero(k)) {
+ continue; /* a zero candidate is rejected; draw again */
+ }
+ if (br_i15_sub(k, n, 0)) { /* nonzero result taken as "k - n borrows", i.e. k < n */
+ break;
+ }
+ }
+
+ /*
+ * Compute k*G and extract the X coordinate, then reduce it
+ * modulo the curve order. Since we support only curves with
+ * prime order, that reduction is only a matter of computing
+ * a subtraction.
+ */
+ br_i15_encode(tt, nlen, k);
+ ulen = impl->mulgen(eU, tt, nlen, sk->curve);
+ br_i15_zero(r, n[0]);
+ br_i15_decode(r, &eU[1], ulen >> 1); /* skips eU[0] and reads ulen/2 bytes as the X coordinate */
+ r[0] = n[0];
+ br_i15_sub(r, n, br_i15_sub(r, n, 0) ^ 1); /* same conditional subtraction as used for m above */
+
+ /*
+ * Compute 1/k in double-Montgomery representation. We do so by
+ * first converting _from_ Montgomery representation (twice),
+ * then using a modular exponentiation.
+ */
+ br_i15_from_monty(k, n, n0i);
+ br_i15_from_monty(k, n, n0i);
+ memcpy(tt, cd->order, nlen);
+ tt[nlen - 1] -= 2; /* exponent n - 2; no borrow given the last-byte assumption stated at the top */
+ br_i15_modpow(k, tt, nlen, n, n0i, t1, t2);
+
+ /*
+ * Compute s = (m+xr)/k (mod n).
+ * The k[] array contains R^2/k (double-Montgomery representation);
+ * we thus can use direct Montgomery multiplications and conversions
+ * from Montgomery, avoiding any call to br_i15_to_monty() (which
+ * is slower).
+ */
+ br_i15_from_monty(m, n, n0i);
+ br_i15_montymul(t1, x, r, n, n0i);
+ ctl = br_i15_add(t1, m, 1); /* t1 += m; ctl receives the returned carry */
+ ctl |= br_i15_sub(t1, n, 0) ^ 1; /* ... or the trial subtraction of n did not borrow */
+ br_i15_sub(t1, n, ctl); /* subtract n only when ctl is set */
+ br_i15_montymul(s, t1, k, n, n0i);
+
+ /*
+ * Encode r and s in the signature.
+ */
+ br_i15_encode(sig, nlen, r);
+ br_i15_encode((unsigned char *)sig + nlen, nlen, s);
+ return nlen << 1;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_asn1.c
new file mode 100644
index 00000000..f4bef997
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_asn1.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define FIELD_LEN ((BR_MAX_EC_SIZE + 7) >> 3) /* BR_MAX_EC_SIZE bits, rounded up to whole bytes */
+
+/* see bearssl_ec.h -- copy the ASN.1 signature, convert the copy to raw format, verify it */
+uint32_t
+br_ecdsa_i15_vrfy_asn1(const br_ec_impl *impl,
+ const void *hash, size_t hash_len,
+ const br_ec_public_key *pk,
+ const void *sig, size_t sig_len)
+{
+ /*
+ * We use a double-sized buffer because a malformed ASN.1 signature
+ * may trigger a size expansion when converting to "raw" format.
+ */
+ unsigned char rsig[(FIELD_LEN << 2) + 24];
+
+ if (sig_len > ((sizeof rsig) >> 1)) { /* the input may fill at most half of rsig */
+ return 0;
+ }
+ memcpy(rsig, sig, sig_len); /* the caller's buffer is const; convert a private copy */
+ sig_len = br_ecdsa_asn1_to_raw(rsig, sig_len); /* works on rsig itself and returns the raw length */
+ return br_ecdsa_i15_vrfy_raw(impl, hash, hash_len, pk, rsig, sig_len);
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_raw.c
new file mode 100644
index 00000000..14dd5e46
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_raw.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define I15_LEN ((BR_MAX_EC_SIZE + 29) / 15) /* ceil(BR_MAX_EC_SIZE / 15) + 1 words */
+#define POINT_LEN (1 + (((BR_MAX_EC_SIZE + 7) >> 3) << 1)) /* 1 byte + twice ceil(BR_MAX_EC_SIZE / 8) bytes */
+
+/* see bearssl_ec.h -- verify a raw (r then s, equal lengths) signature; returns 0 on any rejection */
+uint32_t
+br_ecdsa_i15_vrfy_raw(const br_ec_impl *impl,
+ const void *hash, size_t hash_len,
+ const br_ec_public_key *pk,
+ const void *sig, size_t sig_len)
+{
+ /*
+ * IMPORTANT: this code is fit only for curves with a prime
+ * order. This is needed so that modular reduction of the X
+ * coordinate of a point can be done with a simple subtraction.
+ */
+ const br_ec_curve_def *cd;
+ uint16_t n[I15_LEN], r[I15_LEN], s[I15_LEN], t1[I15_LEN], t2[I15_LEN];
+ unsigned char tx[(BR_MAX_EC_SIZE + 7) >> 3]; /* first the exponent n-2, later the encoded r/s */
+ unsigned char ty[(BR_MAX_EC_SIZE + 7) >> 3]; /* encoded hash/s */
+ unsigned char eU[POINT_LEN]; /* copy of the public point, overwritten by muladd */
+ size_t nlen, rlen, ulen;
+ uint16_t n0i;
+ uint32_t res;
+
+ /*
+ * If the curve is not supported, then report an error.
+ */
+ if (((impl->supported_curves >> pk->curve) & 1) == 0) {
+ return 0;
+ }
+
+ /*
+ * Get the curve parameters (generator and order).
+ */
+ switch (pk->curve) {
+ case BR_EC_secp256r1:
+ cd = &br_secp256r1;
+ break;
+ case BR_EC_secp384r1:
+ cd = &br_secp384r1;
+ break;
+ case BR_EC_secp521r1:
+ cd = &br_secp521r1;
+ break;
+ default:
+ return 0; /* no curve definition is known here for this identifier */
+ }
+
+ /*
+ * Signature length must be even.
+ */
+ if (sig_len & 1) {
+ return 0;
+ }
+ rlen = sig_len >> 1; /* r and s each occupy half of the signature */
+
+ /*
+ * Public key point must have the proper size for this curve.
+ */
+ if (pk->qlen != cd->generator_len) {
+ return 0;
+ }
+
+ /*
+ * Get modulus; then decode the r and s values. They must be
+ * lower than the modulus, and s must not be null.
+ */
+ nlen = cd->order_len;
+ br_i15_decode(n, cd->order, nlen);
+ n0i = br_i15_ninv15(n[1]); /* computed from n[1]; handed to every Montgomery call below */
+ if (!br_i15_decode_mod(r, sig, rlen, n)) {
+ return 0;
+ }
+ if (!br_i15_decode_mod(s, (const unsigned char *)sig + rlen, rlen, n)) {
+ return 0;
+ }
+ if (br_i15_iszero(s)) {
+ return 0;
+ }
+
+ /*
+ * Invert s. We do that with a modular exponentiation; we use
+ * the fact that for all the curves we support, the least
+ * significant byte is not 0 or 1, so we can subtract 2 without
+ * any carry to process.
+ * We also want 1/s in Montgomery representation, which can be
+ * done by converting _from_ Montgomery representation before
+ * the inversion (because (1/s)*R = 1/(s/R)).
+ */
+ br_i15_from_monty(s, n, n0i);
+ memcpy(tx, cd->order, nlen);
+ tx[nlen - 1] -= 2; /* exponent n - 2 */
+ br_i15_modpow(s, tx, nlen, n, n0i, t1, t2);
+
+ /*
+ * Truncate the hash to the modulus length (in bits) and reduce
+ * it modulo the curve order. The modular reduction can be done
+ * with a subtraction since the truncation already reduced the
+ * value to the modulus bit length.
+ */
+ br_ecdsa_i15_bits2int(t1, hash, hash_len, n[0]);
+ br_i15_sub(t1, n, br_i15_sub(t1, n, 0) ^ 1); /* inner call (ctl = 0) presumably only reports the borrow; n is subtracted when there is none */
+
+ /*
+ * Multiply the (truncated, reduced) hash value with 1/s, result in
+ * t2, encoded in ty.
+ */
+ br_i15_montymul(t2, t1, s, n, n0i);
+ br_i15_encode(ty, nlen, t2);
+
+ /*
+ * Multiply r with 1/s, result in t1, encoded in tx.
+ */
+ br_i15_montymul(t1, r, s, n, n0i);
+ br_i15_encode(tx, nlen, t1);
+
+ /*
+ * Compute the point x*Q + y*G.
+ */
+ ulen = cd->generator_len;
+ memcpy(eU, pk->q, ulen); /* qlen was checked equal to generator_len above */
+ res = impl->muladd(eU, NULL, ulen,
+ tx, nlen, ty, nlen, cd->curve); /* NULL second point: presumably selects the generator G -- confirm in bearssl_ec.h */
+
+ /*
+ * Get the X coordinate, reduce modulo the curve order, and
+ * compare with the 'r' value.
+ *
+ * The modular reduction can be done with subtractions because
+ * we work with curves of prime order, so the curve order is
+ * close to the field order (Hasse's theorem).
+ */
+ br_i15_zero(t1, n[0]);
+ br_i15_decode(t1, &eU[1], ulen >> 1); /* skips eU[0] and reads ulen/2 bytes as the X coordinate */
+ t1[0] = n[0];
+ br_i15_sub(t1, n, br_i15_sub(t1, n, 0) ^ 1);
+ res &= ~br_i15_sub(t1, r, 1); /* t1 -= r; a returned borrow clears bit 0 of res */
+ res &= br_i15_iszero(t1); /* and the difference itself must be zero */
+ return res;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_bits.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_bits.c
new file mode 100644
index 00000000..9a8d6730
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_bits.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see inner.h -- decode a byte string into x[], keeping no more bits than ebitlen announces */
+void
+br_ecdsa_i31_bits2int(uint32_t *x,
+ const void *src, size_t len, uint32_t ebitlen)
+{
+ uint32_t bitlen, hbitlen;
+ int sc;
+
+ bitlen = ebitlen - (ebitlen >> 5); /* presumably the true bit count behind the "encoded" length; confirm in inner.h */
+ hbitlen = (uint32_t)len << 3; /* size of the input, in bits */
+ if (hbitlen > bitlen) {
+ len = (bitlen + 7) >> 3; /* only the first bytes covering bitlen bits are read */
+ sc = (int)((hbitlen - bitlen) & 7); /* surplus bits (0..7) in the last byte kept */
+ } else {
+ sc = 0; /* input is short enough: nothing to shift out */
+ }
+ br_i31_zero(x, ebitlen);
+ br_i31_decode(x, src, len);
+ br_i31_rshift(x, sc); /* discard the surplus bits */
+ x[0] = ebitlen; /* word 0 is (re)set to the requested encoded bit length */
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_asn1.c
new file mode 100644
index 00000000..cf0d351d
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_asn1.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define ORDER_LEN ((BR_MAX_EC_SIZE + 7) >> 3) /* BR_MAX_EC_SIZE bits, rounded up to whole bytes */
+
+/* see bearssl_ec.h -- sign in raw format, convert to ASN.1 in a local buffer, copy to sig */
+size_t
+br_ecdsa_i31_sign_asn1(const br_ec_impl *impl,
+ const br_hash_class *hf, const void *hash_value,
+ const br_ec_private_key *sk, void *sig)
+{
+ unsigned char rsig[(ORDER_LEN << 1) + 12]; /* two ORDER_LEN values plus 12 spare bytes, presumably for the ASN.1 framing */
+ size_t sig_len;
+
+ sig_len = br_ecdsa_i31_sign_raw(impl, hf, hash_value, sk, rsig);
+ if (sig_len == 0) { /* the raw signer returns 0 on error; propagate it */
+ return 0;
+ }
+ sig_len = br_ecdsa_raw_to_asn1(rsig, sig_len); /* works on rsig itself and returns the new length */
+ memcpy(sig, rsig, sig_len);
+ return sig_len;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_raw.c
new file mode 100644
index 00000000..1df98fed
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_raw.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define I31_LEN ((BR_MAX_EC_SIZE + 61) / 31) /* ceil(BR_MAX_EC_SIZE / 31) + 1 words */
+#define POINT_LEN (1 + (((BR_MAX_EC_SIZE + 7) >> 3) << 1)) /* 1 byte + twice ceil(BR_MAX_EC_SIZE / 8) bytes */
+#define ORDER_LEN ((BR_MAX_EC_SIZE + 7) >> 3) /* ceil(BR_MAX_EC_SIZE / 8) bytes */
+
+/* see bearssl_ec.h -- writes r then s (nlen bytes each) into sig; returns 2*nlen, or 0 on error */
+size_t
+br_ecdsa_i31_sign_raw(const br_ec_impl *impl,
+ const br_hash_class *hf, const void *hash_value,
+ const br_ec_private_key *sk, void *sig)
+{
+ /*
+ * IMPORTANT: this code is fit only for curves with a prime
+ * order. This is needed so that modular reduction of the X
+ * coordinate of a point can be done with a simple subtraction.
+ * We also rely on the last byte of the curve order to be distinct
+ * from 0 and 1.
+ */
+ const br_ec_curve_def *cd;
+ uint32_t n[I31_LEN], r[I31_LEN], s[I31_LEN], x[I31_LEN];
+ uint32_t m[I31_LEN], k[I31_LEN], t1[I31_LEN], t2[I31_LEN];
+ unsigned char tt[ORDER_LEN << 1]; /* scratch: DRBG seed (x || m), then k, then the exponent n-2 */
+ unsigned char eU[POINT_LEN]; /* receives the encoded point k*G */
+ size_t hash_len, nlen, ulen;
+ uint32_t n0i, ctl;
+ br_hmac_drbg_context drbg;
+
+ /*
+ * If the curve is not supported, then exit with an error.
+ */
+ if (((impl->supported_curves >> sk->curve) & 1) == 0) {
+ return 0;
+ }
+
+ /*
+ * Get the curve parameters (generator and order).
+ */
+ switch (sk->curve) {
+ case BR_EC_secp256r1:
+ cd = &br_secp256r1;
+ break;
+ case BR_EC_secp384r1:
+ cd = &br_secp384r1;
+ break;
+ case BR_EC_secp521r1:
+ cd = &br_secp521r1;
+ break;
+ default:
+ return 0; /* no curve definition is known here for this identifier */
+ }
+
+ /*
+ * Get modulus.
+ */
+ nlen = cd->order_len;
+ br_i31_decode(n, cd->order, nlen);
+ n0i = br_i31_ninv31(n[1]); /* computed from n[1]; handed to every Montgomery call below */
+
+ /*
+ * Get private key as an i31 integer. This also checks that the
+ * private key is well-defined (not zero, and less than the
+ * curve order).
+ */
+ if (!br_i31_decode_mod(x, sk->x, sk->xlen, n)) {
+ return 0;
+ }
+ if (br_i31_iszero(x)) {
+ return 0;
+ }
+
+ /*
+ * Get hash length.
+ */
+ hash_len = (hf->desc >> BR_HASHDESC_OUT_OFF) & BR_HASHDESC_OUT_MASK;
+
+ /*
+ * Truncate and reduce the hash value modulo the curve order.
+ */
+ br_ecdsa_i31_bits2int(m, hash_value, hash_len, n[0]);
+ br_i31_sub(m, n, br_i31_sub(m, n, 0) ^ 1); /* inner call (ctl = 0) presumably only reports the borrow; n is subtracted when there is none */
+
+ /*
+ * RFC 6979 generation of the "k" value.
+ *
+ * The process uses HMAC_DRBG (with the hash function used to
+ * process the message that is to be signed). The seed is the
+ * concatenation of the encodings of the private key and
+ * the hash value (after truncation and modular reduction).
+ */
+ br_i31_encode(tt, nlen, x);
+ br_i31_encode(tt + nlen, nlen, m);
+ br_hmac_drbg_init(&drbg, hf, tt, nlen << 1);
+ for (;;) {
+ br_hmac_drbg_generate(&drbg, tt, nlen);
+ br_ecdsa_i31_bits2int(k, tt, nlen, n[0]);
+ if (br_i31_iszero(k)) {
+ continue; /* a zero candidate is rejected; draw again */
+ }
+ if (br_i31_sub(k, n, 0)) { /* nonzero result taken as "k - n borrows", i.e. k < n */
+ break;
+ }
+ }
+
+ /*
+ * Compute k*G and extract the X coordinate, then reduce it
+ * modulo the curve order. Since we support only curves with
+ * prime order, that reduction is only a matter of computing
+ * a subtraction.
+ */
+ br_i31_encode(tt, nlen, k);
+ ulen = impl->mulgen(eU, tt, nlen, sk->curve);
+ br_i31_zero(r, n[0]);
+ br_i31_decode(r, &eU[1], ulen >> 1); /* skips eU[0] and reads ulen/2 bytes as the X coordinate */
+ r[0] = n[0];
+ br_i31_sub(r, n, br_i31_sub(r, n, 0) ^ 1); /* same conditional subtraction as used for m above */
+
+ /*
+ * Compute 1/k in double-Montgomery representation. We do so by
+ * first converting _from_ Montgomery representation (twice),
+ * then using a modular exponentiation.
+ */
+ br_i31_from_monty(k, n, n0i);
+ br_i31_from_monty(k, n, n0i);
+ memcpy(tt, cd->order, nlen);
+ tt[nlen - 1] -= 2; /* exponent n - 2; no borrow given the last-byte assumption stated at the top */
+ br_i31_modpow(k, tt, nlen, n, n0i, t1, t2);
+
+ /*
+ * Compute s = (m+xr)/k (mod n).
+ * The k[] array contains R^2/k (double-Montgomery representation);
+ * we thus can use direct Montgomery multiplications and conversions
+ * from Montgomery, avoiding any call to br_i31_to_monty() (which
+ * is slower).
+ */
+ br_i31_from_monty(m, n, n0i);
+ br_i31_montymul(t1, x, r, n, n0i);
+ ctl = br_i31_add(t1, m, 1); /* t1 += m; ctl receives the returned carry */
+ ctl |= br_i31_sub(t1, n, 0) ^ 1; /* ... or the trial subtraction of n did not borrow */
+ br_i31_sub(t1, n, ctl); /* subtract n only when ctl is set */
+ br_i31_montymul(s, t1, k, n, n0i);
+
+ /*
+ * Encode r and s in the signature.
+ */
+ br_i31_encode(sig, nlen, r);
+ br_i31_encode((unsigned char *)sig + nlen, nlen, s);
+ return nlen << 1;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_asn1.c
new file mode 100644
index 00000000..4161aaaa
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_asn1.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define FIELD_LEN ((BR_MAX_EC_SIZE + 7) >> 3) /* BR_MAX_EC_SIZE bits, rounded up to whole bytes */
+
+/* see bearssl_ec.h -- copy the ASN.1 signature, convert the copy to raw format, verify it */
+uint32_t
+br_ecdsa_i31_vrfy_asn1(const br_ec_impl *impl,
+ const void *hash, size_t hash_len,
+ const br_ec_public_key *pk,
+ const void *sig, size_t sig_len)
+{
+ /*
+ * We use a double-sized buffer because a malformed ASN.1 signature
+ * may trigger a size expansion when converting to "raw" format.
+ */
+ unsigned char rsig[(FIELD_LEN << 2) + 24];
+
+ if (sig_len > ((sizeof rsig) >> 1)) { /* the input may fill at most half of rsig */
+ return 0;
+ }
+ memcpy(rsig, sig, sig_len); /* the caller's buffer is const; convert a private copy */
+ sig_len = br_ecdsa_asn1_to_raw(rsig, sig_len); /* works on rsig itself and returns the raw length */
+ return br_ecdsa_i31_vrfy_raw(impl, hash, hash_len, pk, rsig, sig_len);
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_raw.c
new file mode 100644
index 00000000..259477fd
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_raw.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define I31_LEN ((BR_MAX_EC_SIZE + 61) / 31) /* ceil(BR_MAX_EC_SIZE / 31) + 1 words */
+#define POINT_LEN (1 + (((BR_MAX_EC_SIZE + 7) >> 3) << 1)) /* 1 byte + twice ceil(BR_MAX_EC_SIZE / 8) bytes */
+
+/* see bearssl_ec.h -- verify a raw (r then s, equal lengths) signature; returns 0 on any rejection */
+uint32_t
+br_ecdsa_i31_vrfy_raw(const br_ec_impl *impl,
+ const void *hash, size_t hash_len,
+ const br_ec_public_key *pk,
+ const void *sig, size_t sig_len)
+{
+ /*
+ * IMPORTANT: this code is fit only for curves with a prime
+ * order. This is needed so that modular reduction of the X
+ * coordinate of a point can be done with a simple subtraction.
+ */
+ const br_ec_curve_def *cd;
+ uint32_t n[I31_LEN], r[I31_LEN], s[I31_LEN], t1[I31_LEN], t2[I31_LEN];
+ unsigned char tx[(BR_MAX_EC_SIZE + 7) >> 3]; /* first the exponent n-2, later the encoded r/s */
+ unsigned char ty[(BR_MAX_EC_SIZE + 7) >> 3]; /* encoded hash/s */
+ unsigned char eU[POINT_LEN]; /* copy of the public point, overwritten by muladd */
+ size_t nlen, rlen, ulen;
+ uint32_t n0i, res;
+
+ /*
+ * If the curve is not supported, then report an error.
+ */
+ if (((impl->supported_curves >> pk->curve) & 1) == 0) {
+ return 0;
+ }
+
+ /*
+ * Get the curve parameters (generator and order).
+ */
+ switch (pk->curve) {
+ case BR_EC_secp256r1:
+ cd = &br_secp256r1;
+ break;
+ case BR_EC_secp384r1:
+ cd = &br_secp384r1;
+ break;
+ case BR_EC_secp521r1:
+ cd = &br_secp521r1;
+ break;
+ default:
+ return 0; /* no curve definition is known here for this identifier */
+ }
+
+ /*
+ * Signature length must be even.
+ */
+ if (sig_len & 1) {
+ return 0;
+ }
+ rlen = sig_len >> 1; /* r and s each occupy half of the signature */
+
+ /*
+ * Public key point must have the proper size for this curve.
+ */
+ if (pk->qlen != cd->generator_len) {
+ return 0;
+ }
+
+ /*
+ * Get modulus; then decode the r and s values. They must be
+ * lower than the modulus, and s must not be null.
+ */
+ nlen = cd->order_len;
+ br_i31_decode(n, cd->order, nlen);
+ n0i = br_i31_ninv31(n[1]); /* computed from n[1]; handed to every Montgomery call below */
+ if (!br_i31_decode_mod(r, sig, rlen, n)) {
+ return 0;
+ }
+ if (!br_i31_decode_mod(s, (const unsigned char *)sig + rlen, rlen, n)) {
+ return 0;
+ }
+ if (br_i31_iszero(s)) {
+ return 0;
+ }
+
+ /*
+ * Invert s. We do that with a modular exponentiation; we use
+ * the fact that for all the curves we support, the least
+ * significant byte is not 0 or 1, so we can subtract 2 without
+ * any carry to process.
+ * We also want 1/s in Montgomery representation, which can be
+ * done by converting _from_ Montgomery representation before
+ * the inversion (because (1/s)*R = 1/(s/R)).
+ */
+ br_i31_from_monty(s, n, n0i);
+ memcpy(tx, cd->order, nlen);
+ tx[nlen - 1] -= 2; /* exponent n - 2 */
+ br_i31_modpow(s, tx, nlen, n, n0i, t1, t2);
+
+ /*
+ * Truncate the hash to the modulus length (in bits) and reduce
+ * it modulo the curve order. The modular reduction can be done
+ * with a subtraction since the truncation already reduced the
+ * value to the modulus bit length.
+ */
+ br_ecdsa_i31_bits2int(t1, hash, hash_len, n[0]);
+ br_i31_sub(t1, n, br_i31_sub(t1, n, 0) ^ 1); /* inner call (ctl = 0) presumably only reports the borrow; n is subtracted when there is none */
+
+ /*
+ * Multiply the (truncated, reduced) hash value with 1/s, result in
+ * t2, encoded in ty.
+ */
+ br_i31_montymul(t2, t1, s, n, n0i);
+ br_i31_encode(ty, nlen, t2);
+
+ /*
+ * Multiply r with 1/s, result in t1, encoded in tx.
+ */
+ br_i31_montymul(t1, r, s, n, n0i);
+ br_i31_encode(tx, nlen, t1);
+
+ /*
+ * Compute the point x*Q + y*G.
+ */
+ ulen = cd->generator_len;
+ memcpy(eU, pk->q, ulen); /* qlen was checked equal to generator_len above */
+ res = impl->muladd(eU, NULL, ulen,
+ tx, nlen, ty, nlen, cd->curve); /* NULL second point: presumably selects the generator G -- confirm in bearssl_ec.h */
+
+ /*
+ * Get the X coordinate, reduce modulo the curve order, and
+ * compare with the 'r' value.
+ *
+ * The modular reduction can be done with subtractions because
+ * we work with curves of prime order, so the curve order is
+ * close to the field order (Hasse's theorem).
+ */
+ br_i31_zero(t1, n[0]);
+ br_i31_decode(t1, &eU[1], ulen >> 1); /* skips eU[0] and reads ulen/2 bytes as the X coordinate */
+ t1[0] = n[0];
+ br_i31_sub(t1, n, br_i31_sub(t1, n, 0) ^ 1);
+ res &= ~br_i31_sub(t1, r, 1); /* t1 -= r; a returned borrow clears bit 0 of res */
+ res &= br_i31_iszero(t1); /* and the difference itself must be zero */
+ return res;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_rta.c b/test/monniaux/BearSSL/src/ec/ecdsa_rta.c
new file mode 100644
index 00000000..005c62c2
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_rta.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Length in bytes of the ASN.1 INTEGER contents for the unsigned
+ * big-endian value x[0..xlen-1]: leading zero bytes are dropped, then
+ * one 0x00 byte is counted back if the first remaining byte has its
+ * top bit set (ASN.1 integers are signed) or if nothing remains.
+ */
+static size_t
+asn1_int_length(const unsigned char *x, size_t xlen)
+{
+	size_t lead;
+
+	for (lead = 0; lead < xlen && x[lead] == 0; lead ++) {
+		/* Skip leading zero bytes. */
+	}
+	if (lead == xlen) {
+		return 1;
+	}
+	return (xlen - lead) + (size_t)(x[lead] >> 7);
+}
+
+/* see bearssl_ec.h */
+size_t
+br_ecdsa_raw_to_asn1(void *sig, size_t sig_len)
+{
+	/*
+	 * The encoding is assembled in a local buffer and copied back
+	 * over the input at the end. Each INTEGER is limited to 125
+	 * content bytes (curve order of up to 999 bits), so that the
+	 * SEQUENCE contents span at most 254 bytes and the complete
+	 * signature at most 3 + 2 * (2 + 125) = 257 bytes.
+	 */
+	unsigned char out[257];
+	const unsigned char *half[2];
+	size_t ilen[2];
+	size_t half_len, seq_len, pos;
+	int i;
+
+	/*
+	 * A raw signature is the concatenation of r and s, both with
+	 * the same length; an odd total length is thus invalid.
+	 */
+	if ((sig_len & 1) != 0) {
+		return 0;
+	}
+	half_len = sig_len >> 1;
+	half[0] = sig;
+	half[1] = half[0] + half_len;
+
+	/*
+	 * Encoded length of each integer; give up if either is too
+	 * large for the single-byte length encodings used below.
+	 */
+	ilen[0] = asn1_int_length(half[0], half_len);
+	ilen[1] = asn1_int_length(half[1], half_len);
+	if (ilen[0] > 125 || ilen[1] > 125) {
+		return 0;
+	}
+
+	/*
+	 * SEQUENCE header. Contents are the two integers plus 2 bytes
+	 * (tag, length) for each. A length below 0x80 is encoded as
+	 * is; otherwise as 0x81 followed by the length byte.
+	 */
+	seq_len = ilen[0] + ilen[1] + 4;
+	pos = 0;
+	out[pos ++] = 0x30;
+	if (seq_len < 0x80) {
+		out[pos ++] = (unsigned char)seq_len;
+	} else {
+		out[pos ++] = 0x81;
+		out[pos ++] = (unsigned char)seq_len;
+	}
+
+	/*
+	 * The two INTEGER elements: r first, then s.
+	 */
+	for (i = 0; i < 2; i ++) {
+		size_t n = ilen[i];
+
+		out[pos ++] = 0x02;
+		out[pos ++] = (unsigned char)n;
+		if (n > half_len) {
+			/* Extra leading zero to keep the value positive. */
+			out[pos ++] = 0x00;
+			n = half_len;
+		}
+		/* Copy the last n bytes, skipping removed leading zeros. */
+		memcpy(out + pos, half[i] + (half_len - n), n);
+		pos += n;
+	}
+
+	/* Overwrite the input with the ASN.1 signature. */
+	memcpy(sig, out, pos);
+	return pos;
+}