37 files changed, 15319 insertions, 0 deletions
diff --git a/test/monniaux/BearSSL/src/ec/ec_all_m15.c b/test/monniaux/BearSSL/src/ec/ec_all_m15.c
new file mode 100644
index 00000000..bb550e18
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_all_m15.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	/* Forward to the p256/c25519 m15 backend, else to br_ec_prime_i15. */
+	if (curve == BR_EC_secp256r1) {
+		return br_ec_p256_m15.generator(curve, len);
+	}
+	if (curve == BR_EC_curve25519) {
+		return br_ec_c25519_m15.generator(curve, len);
+	}
+	return br_ec_prime_i15.generator(curve, len);
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	/* Forward to the p256/c25519 m15 backend, else to br_ec_prime_i15. */
+	if (curve == BR_EC_secp256r1) {
+		return br_ec_p256_m15.order(curve, len);
+	}
+	if (curve == BR_EC_curve25519) {
+		return br_ec_c25519_m15.order(curve, len);
+	}
+	return br_ec_prime_i15.order(curve, len);
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	/* Forward to the p256/c25519 m15 backend, else to br_ec_prime_i15. */
+	if (curve == BR_EC_secp256r1) {
+		return br_ec_p256_m15.xoff(curve, len);
+	}
+	if (curve == BR_EC_curve25519) {
+		return br_ec_c25519_m15.xoff(curve, len);
+	}
+	return br_ec_prime_i15.xoff(curve, len);
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	/* Forward to the p256/c25519 m15 backend, else to br_ec_prime_i15. */
+	if (curve == BR_EC_secp256r1) {
+		return br_ec_p256_m15.mul(G, Glen, kb, kblen, curve);
+	}
+	if (curve == BR_EC_curve25519) {
+		return br_ec_c25519_m15.mul(G, Glen, kb, kblen, curve);
+	}
+	return br_ec_prime_i15.mul(G, Glen, kb, kblen, curve);
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	/* Forward to the p256/c25519 m15 backend, else to br_ec_prime_i15. */
+	if (curve == BR_EC_secp256r1) {
+		return br_ec_p256_m15.mulgen(R, x, xlen, curve);
+	}
+	if (curve == BR_EC_curve25519) {
+		return br_ec_c25519_m15.mulgen(R, x, xlen, curve);
+	}
+	return br_ec_prime_i15.mulgen(R, x, xlen, curve);
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/* Forward to the p256/c25519 m15 backend, else to br_ec_prime_i15. */
+	if (curve == BR_EC_secp256r1) {
+		return br_ec_p256_m15.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+	}
+	if (curve == BR_EC_curve25519) {
+		return br_ec_c25519_m15.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+	}
+	return br_ec_prime_i15.muladd(A, B, len,
+		x, xlen, y, ylen, curve);
+}
+
+/* see bearssl_ec.h -- aggregate "m15" dispatcher built from the wrappers above */
+const br_ec_impl br_ec_all_m15 = {
+	(uint32_t)0x23800000,	/* bits 23, 24, 25, 29 (presumably curve IDs) */
+	api_generator,
+	api_order,
+	api_xoff,
+	api_mul,
+	api_mulgen,
+	api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_all_m31.c b/test/monniaux/BearSSL/src/ec/ec_all_m31.c
new file mode 100644
index 00000000..8fd8c3c0
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_all_m31.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	/* m64 backends are used when BR_INT128 or BR_UMUL128 is set, else m31. */
+	if (curve == BR_EC_secp256r1) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.generator(curve, len);
+#else
+		return br_ec_p256_m31.generator(curve, len);
+#endif
+	}
+	if (curve == BR_EC_curve25519) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.generator(curve, len);
+#else
+		return br_ec_c25519_m31.generator(curve, len);
+#endif
+	}
+	return br_ec_prime_i31.generator(curve, len);
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	/* m64 backends are used when BR_INT128 or BR_UMUL128 is set, else m31. */
+	if (curve == BR_EC_secp256r1) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.order(curve, len);
+#else
+		return br_ec_p256_m31.order(curve, len);
+#endif
+	}
+	if (curve == BR_EC_curve25519) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.order(curve, len);
+#else
+		return br_ec_c25519_m31.order(curve, len);
+#endif
+	}
+	return br_ec_prime_i31.order(curve, len);
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	/* m64 backends are used when BR_INT128 or BR_UMUL128 is set, else m31. */
+	if (curve == BR_EC_secp256r1) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.xoff(curve, len);
+#else
+		return br_ec_p256_m31.xoff(curve, len);
+#endif
+	}
+	if (curve == BR_EC_curve25519) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.xoff(curve, len);
+#else
+		return br_ec_c25519_m31.xoff(curve, len);
+#endif
+	}
+	return br_ec_prime_i31.xoff(curve, len);
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	/* m64 backends are used when BR_INT128 or BR_UMUL128 is set, else m31. */
+	if (curve == BR_EC_secp256r1) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.mul(G, Glen, kb, kblen, curve);
+#else
+		return br_ec_p256_m31.mul(G, Glen, kb, kblen, curve);
+#endif
+	}
+	if (curve == BR_EC_curve25519) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.mul(G, Glen, kb, kblen, curve);
+#else
+		return br_ec_c25519_m31.mul(G, Glen, kb, kblen, curve);
+#endif
+	}
+	return br_ec_prime_i31.mul(G, Glen, kb, kblen, curve);
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	/* m64 backends are used when BR_INT128 or BR_UMUL128 is set, else m31. */
+	if (curve == BR_EC_secp256r1) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.mulgen(R, x, xlen, curve);
+#else
+		return br_ec_p256_m31.mulgen(R, x, xlen, curve);
+#endif
+	}
+	if (curve == BR_EC_curve25519) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.mulgen(R, x, xlen, curve);
+#else
+		return br_ec_c25519_m31.mulgen(R, x, xlen, curve);
+#endif
+	}
+	return br_ec_prime_i31.mulgen(R, x, xlen, curve);
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/* m64 backends are used when BR_INT128 or BR_UMUL128 is set, else m31. */
+	if (curve == BR_EC_secp256r1) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_p256_m64.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+#else
+		return br_ec_p256_m31.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+#endif
+	}
+	if (curve == BR_EC_curve25519) {
+#if BR_INT128 || BR_UMUL128
+		return br_ec_c25519_m64.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+#else
+		return br_ec_c25519_m31.muladd(A, B, len,
+			x, xlen, y, ylen, curve);
+#endif
+	}
+	return br_ec_prime_i31.muladd(A, B, len,
+		x, xlen, y, ylen, curve);
+}
+
+/* see bearssl_ec.h -- aggregate "m31" dispatcher built from the wrappers above */
+const br_ec_impl br_ec_all_m31 = {
+	(uint32_t)0x23800000,	/* bits 23, 24, 25, 29 (presumably curve IDs) */
+	api_generator,
+	api_order,
+	api_xoff,
+	api_mul,
+	api_mulgen,
+	api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_i15.c b/test/monniaux/BearSSL/src/ec/ec_c25519_i15.c
new file mode 100644
index 00000000..8fadcf48
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_c25519_i15.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Parameters for the field:
+ *   - field modulus p = 2^255-19
+ *   - R^2 mod p (R = 2^(15k) for the smallest k such that R >= p)
+ */
+
+static const uint16_t C255_P[] = {
+	0x0110,	/* header word (presumably the encoded bit length; see inner.h) */
+	0x7FED, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+	0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+	0x7FFF	/* 17 value words of 15 bits, least significant first */
+};
+
+#define P0I   0x4A1B	/* P0I * C255_P[1] = -1 mod 2^15 */
+
+static const uint16_t C255_R2[] = {
+	0x0110,	/* same header word as C255_P */
+	0x0169, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000	/* value 0x169 = 361 = 19^2, with 19 = 2^255 mod p */
+};
+
+/* obsolete
+#include <stdio.h>
+#include <stdlib.h>
+static void
+print_int_mont(const char *name, const uint16_t *x)
+{
+	uint16_t y[18];
+	unsigned char tmp[32];
+	size_t u;
+
+	printf("%s = ", name);
+	memcpy(y, x, sizeof y);
+	br_i15_from_monty(y, C255_P, P0I);
+	br_i15_encode(tmp, sizeof tmp, y);
+	for (u = 0; u < sizeof tmp; u ++) {
+		printf("%02X", tmp[u]);
+	}
+	printf("\n");
+}
+*/
+
+static const uint16_t C255_A24[] = {
+	0x0110,	/* same header word as C255_P */
+	0x45D3, 0x0046, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000	/* value 0x2345D3 = 121665 * 19, i.e. 121665 * 2^255 mod p */
+};
+
+static const unsigned char GEN[] = {	/* returned by api_generator() */
+	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};	/* 32 bytes: the value 9 in little-endian order */
+
+static const unsigned char ORDER[] = {	/* returned by api_order() */
+	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};	/* NOTE(review): 0x7F then 31 x 0xFF, not the subgroup order -- confirm */
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	*len = sizeof GEN;	/* GEN[] holds 32 bytes */
+	(void)curve;		/* the curve identifier is not consulted */
+	return GEN;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	*len = sizeof ORDER;	/* ORDER[] holds 32 bytes */
+	(void)curve;		/* the curve identifier is not consulted */
+	return ORDER;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	*len = sizeof GEN;	/* reports length 32, same as GEN[] */
+	(void)curve;		/* the curve identifier is not consulted */
+	return 0;		/* reported offset is always 0 */
+}
+
+static void
+cswap(uint16_t *a, uint16_t *b, uint32_t ctl)
+{
+	uint32_t mask;
+	size_t u;
+
+	/* Branch-free swap of two 18-word values; api_mul() passes 0 or 1. */
+	mask = -ctl;
+	for (u = 0; u < 18; u ++) {
+		uint32_t delta;
+
+		delta = mask & ((uint32_t)a[u] ^ (uint32_t)b[u]);
+		a[u] = (uint16_t)(a[u] ^ delta);
+		b[u] = (uint16_t)(b[u] ^ delta);
+	}
+}
+
+static void
+c255_add(uint16_t *d, const uint16_t *a, const uint16_t *b)
+{
+	uint16_t sum[18];	/* scratch copy; written back to d at the end */
+	uint32_t fix;		/* becomes the ctl of the final subtraction */
+
+	memcpy(sum, a, sizeof sum);
+	fix = br_i15_add(sum, b, 1);
+	fix |= NOT(br_i15_sub(sum, C255_P, 0));	/* presumably a compare-only call */
+	br_i15_sub(sum, C255_P, fix);
+	memcpy(d, sum, sizeof sum);
+}
+
+static void
+c255_sub(uint16_t *d, const uint16_t *a, const uint16_t *b)
+{
+	uint16_t diff[18];	/* scratch copy; written back to d at the end */
+
+	memcpy(diff, a, sizeof diff);
+	br_i15_add(diff, C255_P, br_i15_sub(diff, b, 1));	/* sub runs first */
+	memcpy(d, diff, sizeof diff);
+}
+
+static void
+c255_mul(uint16_t *d, const uint16_t *a, const uint16_t *b)
+{
+	uint16_t prod[18];	/* separate output: callers pass d == a and/or b */
+
+	br_i15_montymul(prod, a, b, C255_P, P0I);
+	memcpy(d, prod, sizeof prod);
+}
+
+static void
+byteswap(unsigned char *G)
+{
+	unsigned char *lo, *hi;
+
+	/* Reverse the 32-byte buffer in place, walking in from both ends. */
+	for (lo = G, hi = G + 31; lo < hi; lo ++, hi --) {
+		unsigned char t = *lo;
+
+		*lo = *hi;
+		*hi = t;
+	}
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+#define ILEN   (18 * sizeof(uint16_t))	/* bytes in one 18-word value */
+
+	/*
+	 * Montgomery ladder on the 'u' coordinate; G is replaced with k*G.
+	 * The a[] and b[] arrays have an extra word for the decoding below.
+	 */
+	uint16_t x1[18], x2[18], x3[18], z2[18], z3[18];
+	uint16_t a[19], aa[18], b[19], bb[18];
+	uint16_t c[18], d[18], e[18], da[18], cb[18];
+	unsigned char k[32];
+	uint32_t swap;
+	int i;
+
+	(void)curve;
+
+	/*
+	 * Points are encoded over exactly 32 bytes. Multipliers must fit
+	 * in 32 bytes as well; otherwise 0 is returned and G is untouched.
+	 * RFC 7748 mandates that the high bit of the last point byte must
+	 * be ignored/cleared.
+	 */
+	if (Glen != 32 || kblen > 32) {
+		return 0;
+	}
+	G[31] &= 0x7F;
+
+	/*
+	 * Byteswap the point encoding, because it uses little-endian, and
+	 * the generic decoding routine uses big-endian.
+	 */
+	byteswap(G);
+
+	/*
+	 * Decode the point ('u' coordinate). This should be reduced
+	 * modulo p, but we prefer to avoid the dependency on
+	 * br_i15_decode_reduce(). Instead, we use br_i15_decode_mod()
+	 * with a synthetic modulus of value 2^255 (this must work
+	 * since G was truncated to 255 bits), then use a conditional
+	 * subtraction. We use br_i15_decode_mod() and not
+	 * br_i15_decode(), because the ec_prime_i15 implementation uses
+	 * the former but not the latter.
+	 *    br_i15_decode_reduce(a, G, 32, C255_P);
+	 */
+	br_i15_zero(b, 0x111);
+	b[18] = 1;	/* b = 2^255: 17 zero words of 15 bits below this one */
+	br_i15_decode_mod(a, G, 32, b);
+	a[0] = 0x110;	/* same header word as C255_P */
+	br_i15_sub(a, C255_P, NOT(br_i15_sub(a, C255_P, 0)));
+
+	/*
+	 * Initialise variables x1, x2, z2, x3 and z3. We set all of them
+	 * into Montgomery representation.
+	 */
+	br_i15_montymul(x1, a, C255_R2, C255_P, P0I);
+	memcpy(x3, x1, ILEN);
+	br_i15_zero(z2, C255_P[0]);
+	memcpy(x2, z2, ILEN);
+	x2[1] = 19;	/* 1 in Montgomery representation: 2^255 mod p = 19 */
+	memcpy(z3, x2, ILEN);
+
+	memset(k, 0, (sizeof k) - kblen);	/* kb[] is big-endian: left-pad */
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;	/* clear the three least significant bits */
+	k[0] &= 0x7F;	/* clear bit 255 */
+	k[0] |= 0x40;	/* set bit 254 */
+
+	/* obsolete
+	print_int_mont("x1", x1);
+	*/
+
+	swap = 0;
+	for (i = 254; i >= 0; i --) {	/* one ladder step per multiplier bit */
+		uint32_t kt;
+
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;	/* bit i of k */
+		swap ^= kt;	/* swap only when this bit differs from the last */
+		cswap(x2, x3, swap);
+		cswap(z2, z3, swap);
+		swap = kt;
+
+		/* obsolete
+		print_int_mont("x2", x2);
+		print_int_mont("z2", z2);
+		print_int_mont("x3", x3);
+		print_int_mont("z3", z3);
+		*/
+
+		c255_add(a, x2, z2);
+		c255_mul(aa, a, a);
+		c255_sub(b, x2, z2);
+		c255_mul(bb, b, b);
+		c255_sub(e, aa, bb);
+		c255_add(c, x3, z3);
+		c255_sub(d, x3, z3);
+		c255_mul(da, d, a);
+		c255_mul(cb, c, b);
+
+		/* obsolete
+		print_int_mont("a ", a);
+		print_int_mont("aa", aa);
+		print_int_mont("b ", b);
+		print_int_mont("bb", bb);
+		print_int_mont("e ", e);
+		print_int_mont("c ", c);
+		print_int_mont("d ", d);
+		print_int_mont("da", da);
+		print_int_mont("cb", cb);
+		*/
+
+		c255_add(x3, da, cb);
+		c255_mul(x3, x3, x3);	/* x3 = (da + cb)^2 */
+		c255_sub(z3, da, cb);
+		c255_mul(z3, z3, z3);
+		c255_mul(z3, z3, x1);	/* z3 = x1 * (da - cb)^2 */
+		c255_mul(x2, aa, bb);	/* x2 = aa * bb */
+		c255_mul(z2, C255_A24, e);
+		c255_add(z2, z2, aa);
+		c255_mul(z2, e, z2);	/* z2 = e * (aa + a24 * e) */
+
+		/* obsolete
+		print_int_mont("x2", x2);
+		print_int_mont("z2", z2);
+		print_int_mont("x3", x3);
+		print_int_mont("z3", z3);
+		*/
+	}
+	cswap(x2, x3, swap);
+	cswap(z2, z3, swap);
+
+	/*
+	 * Invert z2 with a modular exponentiation (exponent 2^255-21 = p-2).
+	 * This is a simple square-and-multiply algorithm; most non-squarings
+	 * are shared, since the exponent contains almost only ones.
+	 */
+	memcpy(a, z2, ILEN);
+	for (i = 0; i < 15; i ++) {	/* a = z2^(2^16-1) */
+		c255_mul(a, a, a);
+		c255_mul(a, a, z2);
+	}
+	memcpy(b, a, ILEN);
+	for (i = 0; i < 14; i ++) {	/* b = z2^(2^240-1) */
+		int j;
+
+		for (j = 0; j < 16; j ++) {
+			c255_mul(b, b, b);
+		}
+		c255_mul(b, b, a);
+	}
+	for (i = 14; i >= 0; i --) {	/* low 15 exponent bits: 0x7FEB */
+		c255_mul(b, b, b);
+		if ((0xFFEB >> i) & 1) {
+			c255_mul(b, z2, b);
+		}
+	}
+	c255_mul(b, x2, b);	/* b = x2 / z2 */
+
+	/*
+	 * To avoid a dependency on br_i15_from_monty(), we use a
+	 * Montgomery multiplication with 1.
+	 *    memcpy(x2, b, ILEN);
+	 *    br_i15_from_monty(x2, C255_P, P0I);
+	 */
+	br_i15_zero(a, C255_P[0]);
+	a[1] = 1;
+	br_i15_montymul(x2, a, b, C255_P, P0I);
+
+	br_i15_encode(G, 32, x2);
+	byteswap(G);	/* back to the little-endian point encoding */
+	return 1;
+
+#undef ILEN
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	size_t point_len;
+	const unsigned char *base = api_generator(curve, &point_len);
+
+	/* R = generator, then multiplied in place; api_mul() status unused. */
+	memcpy(R, base, point_len);
+	api_mul(R, point_len, x, xlen, curve);
+	return point_len;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * Deliberate stub: this method is used for ECDSA only, and there
+	 * is no ECDSA over Curve25519 (which uses EdDSA instead). Every
+	 * argument is ignored and 0 is returned unconditionally.
+	 */
+	(void)curve;
+	(void)ylen;
+	(void)y;
+	(void)xlen;
+	(void)x;
+	(void)len;
+	(void)B;
+	(void)A;
+	return 0;
+}
+
+/* see bearssl_ec.h -- Curve25519 implementation over 15-bit words */
+const br_ec_impl br_ec_c25519_i15 = {
+	(uint32_t)0x20000000,	/* only bit 29 set (presumably the curve ID) */
+	api_generator,
+	api_order,
+	api_xoff,
+	api_mul,
+	api_mulgen,
+	api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_i31.c b/test/monniaux/BearSSL/src/ec/ec_c25519_i31.c
new file mode 100644
index 00000000..f8ffc2c2
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_c25519_i31.c
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Parameters for the field:
+ *   - field modulus p = 2^255-19
+ *   - R^2 mod p (R = 2^(31k) for the smallest k such that R >= p)
+ */
+
+static const uint32_t C255_P[] = {
+	0x00000107,	/* header word (presumably the encoded bit length) */
+	0x7FFFFFED, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0000007F	/* 8*31 + 7 bits */
+};
+
+#define P0I   0x286BCA1B	/* P0I * C255_P[1] = -1 mod 2^31 */
+
+static const uint32_t C255_R2[] = {
+	0x00000107,	/* value below: 361 * 2^48 = (19 * 2^24)^2 */
+	0x00000000, 0x02D20000, 0x00000000, 0x00000000, 0x00000000,
+	0x00000000, 0x00000000, 0x00000000, 0x00000000
+};
+
+static const uint32_t C255_A24[] = {
+	0x00000107,	/* value below: 121665 * 19 * 2^24 (Montgomery form) */
+	0x53000000, 0x0000468B, 0x00000000, 0x00000000, 0x00000000,
+	0x00000000, 0x00000000, 0x00000000, 0x00000000
+};
+
+/* obsolete
+#include <stdio.h>
+#include <stdlib.h>
+static void
+print_int_mont(const char *name, const uint32_t *x)
+{
+	uint32_t y[10];
+	unsigned char tmp[32];
+	size_t u;
+
+	printf("%s = ", name);
+	memcpy(y, x, sizeof y);
+	br_i31_from_monty(y, C255_P, P0I);
+	br_i31_encode(tmp, sizeof tmp, y);
+	for (u = 0; u < sizeof tmp; u ++) {
+		printf("%02X", tmp[u]);
+	}
+	printf("\n");
+}
+*/
+
+static const unsigned char GEN[] = {	/* returned by api_generator() */
+	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};	/* 32 bytes: the value 9 in little-endian order */
+
+static const unsigned char ORDER[] = {	/* returned by api_order() */
+	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};	/* NOTE(review): 0x7F then 31 x 0xFF, not the subgroup order -- confirm */
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	*len = sizeof GEN;	/* GEN[] holds 32 bytes */
+	(void)curve;		/* the curve identifier is not consulted */
+	return GEN;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	*len = sizeof ORDER;	/* ORDER[] holds 32 bytes */
+	(void)curve;		/* the curve identifier is not consulted */
+	return ORDER;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	*len = sizeof GEN;	/* reports length 32, same as GEN[] */
+	(void)curve;		/* the curve identifier is not consulted */
+	return 0;		/* reported offset is always 0 */
+}
+
+static void
+cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
+{
+	uint32_t mask;
+	size_t u;
+
+	/* Branch-free swap of two 10-word values; api_mul() passes 0 or 1. */
+	mask = -ctl;
+	for (u = 0; u < 10; u ++) {
+		uint32_t delta;
+
+		delta = mask & (a[u] ^ b[u]);
+		a[u] ^= delta;
+		b[u] ^= delta;
+	}
+}
+
+static void
+c255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t sum[10];	/* scratch copy; written back to d at the end */
+	uint32_t fix;		/* becomes the ctl of the final subtraction */
+
+	memcpy(sum, a, sizeof sum);
+	fix = br_i31_add(sum, b, 1);
+	fix |= NOT(br_i31_sub(sum, C255_P, 0));	/* presumably a compare-only call */
+	br_i31_sub(sum, C255_P, fix);
+	memcpy(d, sum, sizeof sum);
+}
+
+static void
+c255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t diff[10];	/* scratch copy; written back to d at the end */
+
+	memcpy(diff, a, sizeof diff);
+	br_i31_add(diff, C255_P, br_i31_sub(diff, b, 1));	/* sub runs first */
+	memcpy(d, diff, sizeof diff);
+}
+
+static void
+c255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t prod[10];	/* separate output: callers pass d == a and/or b */
+
+	br_i31_montymul(prod, a, b, C255_P, P0I);
+	memcpy(d, prod, sizeof prod);
+}
+
+static void
+byteswap(unsigned char *G)
+{
+	unsigned char *lo, *hi;
+
+	/* Reverse the 32-byte buffer in place, walking in from both ends. */
+	for (lo = G, hi = G + 31; lo < hi; lo ++, hi --) {
+		unsigned char t = *lo;
+
+		*lo = *hi;
+		*hi = t;
+	}
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	uint32_t x1[10], x2[10], x3[10], z2[10], z3[10];
+	uint32_t a[10], aa[10], b[10], bb[10];
+	uint32_t c[10], d[10], e[10], da[10], cb[10];
+	unsigned char k[32];
+	uint32_t swap;
+	int i;
+
+	(void)curve;
+
+	/*
+	 * Points are encoded over exactly 32 bytes. Multipliers must fit
+	 * in 32 bytes as well; otherwise 0 is returned and G is untouched.
+	 * RFC 7748 mandates that the high bit of the last point byte must
+	 * be ignored/cleared.
+	 */
+	if (Glen != 32 || kblen > 32) {
+		return 0;
+	}
+	G[31] &= 0x7F;
+
+	/*
+	 * Byteswap the point encoding, because it uses little-endian, and
+	 * the generic decoding routine uses big-endian.
+	 */
+	byteswap(G);
+
+	/*
+	 * Decode the point ('u' coordinate). This should be reduced
+	 * modulo p, but we prefer to avoid the dependency on
+	 * br_i31_decode_reduce(). Instead, we use br_i31_decode_mod()
+	 * with a synthetic modulus of value 2^255 (this must work
+	 * since G was truncated to 255 bits), then use a conditional
+	 * subtraction. We use br_i31_decode_mod() and not
+	 * br_i31_decode(), because the ec_prime_i31 implementation uses
+	 * the former but not the latter.
+	 *    br_i31_decode_reduce(a, G, 32, C255_P);
+	 */
+	br_i31_zero(b, 0x108);
+	b[9] = 0x0080;	/* b = 2^255: bit 7 above 8 words of 31 bits */
+	br_i31_decode_mod(a, G, 32, b);
+	a[0] = 0x107;	/* same header word as C255_P */
+	br_i31_sub(a, C255_P, NOT(br_i31_sub(a, C255_P, 0)));
+
+	/*
+	 * Initialise variables x1, x2, z2, x3 and z3. We set all of them
+	 * into Montgomery representation.
+	 */
+	br_i31_montymul(x1, a, C255_R2, C255_P, P0I);
+	memcpy(x3, x1, sizeof x1);
+	br_i31_zero(z2, C255_P[0]);
+	memcpy(x2, z2, sizeof z2);
+	x2[1] = 0x13000000;	/* 1 in Montgomery form: 2^279 mod p = 19 * 2^24 */
+	memcpy(z3, x2, sizeof x2);
+
+	/*
+	 * kb[] is in big-endian notation, but possibly shorter than k[].
+	 */
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;	/* clear the three least significant bits */
+	k[0] &= 0x7F;	/* clear bit 255 */
+	k[0] |= 0x40;	/* set bit 254 */
+
+	/* obsolete
+	print_int_mont("x1", x1);
+	*/
+
+	swap = 0;
+	for (i = 254; i >= 0; i --) {	/* one ladder step per multiplier bit */
+		uint32_t kt;
+
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;	/* bit i of k */
+		swap ^= kt;	/* swap only when this bit differs from the last */
+		cswap(x2, x3, swap);
+		cswap(z2, z3, swap);
+		swap = kt;
+
+		/* obsolete
+		print_int_mont("x2", x2);
+		print_int_mont("z2", z2);
+		print_int_mont("x3", x3);
+		print_int_mont("z3", z3);
+		*/
+
+		c255_add(a, x2, z2);
+		c255_mul(aa, a, a);
+		c255_sub(b, x2, z2);
+		c255_mul(bb, b, b);
+		c255_sub(e, aa, bb);
+		c255_add(c, x3, z3);
+		c255_sub(d, x3, z3);
+		c255_mul(da, d, a);
+		c255_mul(cb, c, b);
+
+		/* obsolete
+		print_int_mont("a ", a);
+		print_int_mont("aa", aa);
+		print_int_mont("b ", b);
+		print_int_mont("bb", bb);
+		print_int_mont("e ", e);
+		print_int_mont("c ", c);
+		print_int_mont("d ", d);
+		print_int_mont("da", da);
+		print_int_mont("cb", cb);
+		*/
+
+		c255_add(x3, da, cb);
+		c255_mul(x3, x3, x3);	/* x3 = (da + cb)^2 */
+		c255_sub(z3, da, cb);
+		c255_mul(z3, z3, z3);
+		c255_mul(z3, z3, x1);	/* z3 = x1 * (da - cb)^2 */
+		c255_mul(x2, aa, bb);	/* x2 = aa * bb */
+		c255_mul(z2, C255_A24, e);
+		c255_add(z2, z2, aa);
+		c255_mul(z2, e, z2);	/* z2 = e * (aa + a24 * e) */
+
+		/* obsolete
+		print_int_mont("x2", x2);
+		print_int_mont("z2", z2);
+		print_int_mont("x3", x3);
+		print_int_mont("z3", z3);
+		*/
+	}
+	cswap(x2, x3, swap);
+	cswap(z2, z3, swap);
+
+	/*
+	 * Invert z2 with a modular exponentiation (exponent 2^255-21 = p-2).
+	 * This is a simple square-and-multiply algorithm; most non-squarings
+	 * are shared, since the exponent contains almost only ones.
+	 */
+	memcpy(a, z2, sizeof z2);
+	for (i = 0; i < 15; i ++) {	/* a = z2^(2^16-1) */
+		c255_mul(a, a, a);
+		c255_mul(a, a, z2);
+	}
+	memcpy(b, a, sizeof a);
+	for (i = 0; i < 14; i ++) {	/* b = z2^(2^240-1) */
+		int j;
+
+		for (j = 0; j < 16; j ++) {
+			c255_mul(b, b, b);
+		}
+		c255_mul(b, b, a);
+	}
+	for (i = 14; i >= 0; i --) {	/* low 15 exponent bits: 0x7FEB */
+		c255_mul(b, b, b);
+		if ((0xFFEB >> i) & 1) {
+			c255_mul(b, z2, b);
+		}
+	}
+	c255_mul(b, x2, b);	/* b = x2 / z2 */
+
+	/*
+	 * To avoid a dependency on br_i31_from_monty(), we use
+	 * a Montgomery multiplication with 1.
+	 *    memcpy(x2, b, sizeof b);
+	 *    br_i31_from_monty(x2, C255_P, P0I);
+	 */
+	br_i31_zero(a, C255_P[0]);
+	a[1] = 1;
+	br_i31_montymul(x2, a, b, C255_P, P0I);
+
+	br_i31_encode(G, 32, x2);
+	byteswap(G);	/* back to the little-endian point encoding */
+	return 1;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	size_t point_len;
+	const unsigned char *base = api_generator(curve, &point_len);
+
+	/* R = generator, then multiplied in place; api_mul() status unused. */
+	memcpy(R, base, point_len);
+	api_mul(R, point_len, x, xlen, curve);
+	return point_len;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * Deliberate stub: this method is used for ECDSA only, and there
+	 * is no ECDSA over Curve25519 (which uses EdDSA instead). Every
+	 * argument is ignored and 0 is returned unconditionally.
+	 */
+	(void)curve;
+	(void)ylen;
+	(void)y;
+	(void)xlen;
+	(void)x;
+	(void)len;
+	(void)B;
+	(void)A;
+	return 0;
+}
+
+/* see bearssl_ec.h -- Curve25519 implementation over 31-bit words */
+const br_ec_impl br_ec_c25519_i31 = {
+	(uint32_t)0x20000000,	/* only bit 29 set (presumably the curve ID) */
+	api_generator,
+	api_order,
+	api_xoff,
+	api_mul,
+	api_mulgen,
+	api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_m15.c b/test/monniaux/BearSSL/src/ec/ec_c25519_m15.c
new file mode 100644
index 00000000..deff55b3
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_c25519_m15.c
@@ -0,0 +1,1478 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* obsolete
+#include <stdio.h>
+#include <stdlib.h>
+static void
+print_int(const char *name, const uint32_t *x)
+{
+	size_t u;
+	unsigned char tmp[36];
+
+	printf("%s = ", name);
+	for (u = 0; u < 20; u ++) {
+		if (x[u] > 0x1FFF) {
+			printf("INVALID:");
+			for (u = 0; u < 20; u ++) {
+				printf(" %04X", x[u]);
+			}
+			printf("\n");
+			return;
+		}
+	}
+	memset(tmp, 0, sizeof tmp);
+	for (u = 0; u < 20; u ++) {
+		uint32_t w;
+		int j, k;
+
+		w = x[u];
+		j = 13 * (int)u;
+		k = j & 7;
+		if (k != 0) {
+			w <<= k;
+			j -= k;
+		}
+		k = j >> 3;
+		tmp[35 - k] |= (unsigned char)w;
+		tmp[34 - k] |= (unsigned char)(w >> 8);
+		tmp[33 - k] |= (unsigned char)(w >> 16);
+		tmp[32 - k] |= (unsigned char)(w >> 24);
+	}
+	for (u = 4; u < 36; u ++) {
+		printf("%02X", tmp[u]);
+	}
+	printf("\n");
+}
+*/
+
+/*
+ * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
+ * that right-shifting a signed negative integer copies the sign bit
+ * (arithmetic right-shift). This is "implementation-defined behaviour",
+ * i.e. it is not undefined, but it may differ between compilers. Each
+ * compiler is supposed to document its behaviour in that respect. GCC
+ * explicitly defines that an arithmetic right shift is used. We expect
+ * all other compilers to do the same, because the underlying CPUs offer
+ * an arithmetic right shift opcode that could not be used otherwise.
+ *
+ * ARSH(x, n) shifts the 32-bit value x right by n bits, replicating
+ * bit 31 into the vacated upper bits.
+ *
+ *  - Portable variant (BR_NO_ARITH_SHIFT non-zero): a logical shift of
+ *    the value cast to uint32_t, ORed with the sign bit spread over the
+ *    top n bits. Since it shifts left by (32 - n), n must be in the
+ *    1..31 range.
+ *
+ *  - Default variant: the object x is reinterpreted as an int32_t and
+ *    shifted with the native operator. The address of x is taken, so x
+ *    must be an lvalue.
+ */
+#if BR_NO_ARITH_SHIFT
+#define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
+                    | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
+#else
+#define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
+#endif
+
+/*
+ * Repack an unsigned little-endian byte string (src, len bytes) into
+ * 13-bit words, least significant word first. A word is written to dst
+ * every time at least 13 bits have been gathered; the bits that remain
+ * once all source bytes have been consumed (the final "partial" word)
+ * are not stored but returned.
+ */
+static uint32_t
+le8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
+{
+	uint32_t bits;
+	int nbits;
+	size_t u;
+
+	bits = 0;
+	nbits = 0;
+	for (u = 0; u < len; u ++) {
+		/* Append the next byte above the bits still pending. */
+		bits |= (uint32_t)src[u] << nbits;
+		nbits += 8;
+		if (nbits < 13) {
+			continue;
+		}
+
+		/* A full 13-bit word is available: emit it. */
+		*dst ++ = bits & 0x1FFF;
+		bits >>= 13;
+		nbits -= 13;
+	}
+	return bits;
+}
+
+/*
+ * Repack an integer held as little-endian 13-bit words (src) into an
+ * unsigned little-endian byte string. Exactly len bytes are written to
+ * dst; a new source word is fetched only when fewer than 8 bits are
+ * pending.
+ */
+static void
+le13_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
+{
+	uint32_t bits;
+	int nbits;
+	size_t u;
+
+	bits = 0;
+	nbits = 0;
+	for (u = 0; u < len; u ++) {
+		if (nbits < 8) {
+			/* Not enough pending bits for a byte: refill. */
+			bits |= (*src ++) << nbits;
+			nbits += 13;
+		}
+		dst[u] = (unsigned char)bits;
+		bits >>= 8;
+		nbits -= 8;
+	}
+}
+
+/*
+ * Carry propagation over len words: each output word d[u] receives the
+ * low 13 bits of (w[u] + incoming carry), and the remainder, shifted
+ * with sign extension (ARSH), becomes the carry into the next word.
+ * The carry out of the last word is returned. d and w may designate
+ * the same array, but must not overlap partially.
+ */
+static inline uint32_t
+norm13(uint32_t *d, const uint32_t *w, size_t len)
+{
+	uint32_t carry;
+	size_t u;
+
+	carry = 0;
+	u = 0;
+	while (u < len) {
+		int32_t s;
+
+		/* Signed view, so that a "negative" word borrows. */
+		s = w[u] + carry;
+		d[u] = s & 0x1FFF;
+		carry = ARSH(s, 13);
+		u ++;
+	}
+	return carry;
+}
+
+/*
+ * mul20() multiplies two 260-bit integers together. Each word must fit
+ * on 13 bits; source operands use 20 words, destination operand
+ * receives 40 words. All overlaps allowed.
+ *
+ * square20() computes the square of a 260-bit integer. Each word must
+ * fit on 13 bits; source operand uses 20 words, destination operand
+ * receives 40 words. All overlaps allowed.
+ */
+
+#if BR_SLOW_MUL15
+
+/*
+ * Variant of mul20() selected when BR_SLOW_MUL15 is non-zero: the
+ * 20x20-word product is assembled from nine 5x5-word products
+ * (two-level Karatsuba) instead of the 400 word products of the
+ * schoolbook method. The result (40 words) is written to d after a
+ * final carry propagation.
+ */
+static void
+mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	/*
+	 * Two-level Karatsuba: turns a 20x20 multiplication into
+	 * nine 5x5 multiplications. We use 13-bit words but do not
+	 * propagate carries immediately, so words may expand:
+	 *
+	 *  - First Karatsuba decomposition turns the 20x20 mul on
+	 *    13-bit words into three 10x10 muls, two on 13-bit words
+	 *    and one on 14-bit words.
+	 *
+	 *  - Second Karatsuba decomposition further splits these into:
+	 *
+	 *     * four 5x5 muls on 13-bit words
+	 *     * four 5x5 muls on 14-bit words
+	 *     * one 5x5 mul on 15-bit words
+	 *
+	 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
+	 * or 15-bit words, respectively.
+	 *
+	 * u[] and v[] each hold nine 5-word operand slots (the four
+	 * slots of the source operand, then five sums of slots); w[]
+	 * holds the nine corresponding 10-word products.
+	 */
+	uint32_t u[45], v[45], w[90];
+	uint32_t cc;
+	int i;
+
+	/*
+	 * ZADD: slot d_off of dw <- slot s1_off of s1w + slot s2_off of
+	 * s2w. Offsets are counted in 5-word slots; no carry is
+	 * propagated between words.
+	 */
+#define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
+		(dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
+			+ (s2w)[5 * (s2_off) + 0]; \
+		(dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
+			+ (s2w)[5 * (s2_off) + 1]; \
+		(dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
+			+ (s2w)[5 * (s2_off) + 2]; \
+		(dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
+			+ (s2w)[5 * (s2_off) + 3]; \
+		(dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
+			+ (s2w)[5 * (s2_off) + 4]; \
+	} while (0)
+
+	/*
+	 * ZADDT: slot d_off of dw += slot s_off of sw (offsets in
+	 * 5-word slots, word-wise, no carry propagation).
+	 */
+#define ZADDT(dw, d_off, sw, s_off)   do { \
+		(dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
+		(dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
+		(dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
+		(dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
+		(dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
+	} while (0)
+
+	/*
+	 * ZSUB2F: slot d_off of dw -= (slot s1_off of s1w + slot s2_off
+	 * of s2w). Words are unsigned, so a word-wise result may wrap
+	 * around; this is resolved by the final norm13() call.
+	 */
+#define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
+		(dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
+			+ (s2w)[5 * (s2_off) + 0]; \
+		(dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
+			+ (s2w)[5 * (s2_off) + 1]; \
+		(dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
+			+ (s2w)[5 * (s2_off) + 2]; \
+		(dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
+			+ (s2w)[5 * (s2_off) + 3]; \
+		(dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
+			+ (s2w)[5 * (s2_off) + 4]; \
+	} while (0)
+
+	/*
+	 * CPR1: one step of unsigned carry propagation. Word w receives
+	 * the low 13 bits of (w + cprcc), and cprcc receives the rest.
+	 */
+#define CPR1(w, cprcc)   do { \
+		uint32_t cprz = (w) + cprcc; \
+		(w) = cprz & 0x1FFF; \
+		cprcc = cprz >> 13; \
+	} while (0)
+
+	/*
+	 * CPR: unsigned carry propagation over the nine words starting
+	 * at index d_off of dw (here d_off is a word index, not a slot
+	 * index); the carry out overwrites the tenth word.
+	 */
+#define CPR(dw, d_off)   do { \
+		uint32_t cprcc; \
+		cprcc = 0; \
+		CPR1((dw)[(d_off) + 0], cprcc); \
+		CPR1((dw)[(d_off) + 1], cprcc); \
+		CPR1((dw)[(d_off) + 2], cprcc); \
+		CPR1((dw)[(d_off) + 3], cprcc); \
+		CPR1((dw)[(d_off) + 4], cprcc); \
+		CPR1((dw)[(d_off) + 5], cprcc); \
+		CPR1((dw)[(d_off) + 6], cprcc); \
+		CPR1((dw)[(d_off) + 7], cprcc); \
+		CPR1((dw)[(d_off) + 8], cprcc); \
+		(dw)[(d_off) + 9] = cprcc; \
+	} while (0)
+
+	/*
+	 * Operand slots: 0..3 are the four 5-word quarters of the source
+	 * operand; 4 = 0+1, 5 = 2+3, 6 = 0+2, 7 = 1+3, 8 = 6+7.
+	 */
+	memcpy(u, a, 20 * sizeof *a);
+	ZADD(u, 4, a, 0, a, 1);
+	ZADD(u, 5, a, 2, a, 3);
+	ZADD(u, 6, a, 0, a, 2);
+	ZADD(u, 7, a, 1, a, 3);
+	ZADD(u, 8, u, 6, u, 7);
+
+	memcpy(v, b, 20 * sizeof *b);
+	ZADD(v, 4, b, 0, b, 1);
+	ZADD(v, 5, b, 2, b, 3);
+	ZADD(v, 6, b, 0, b, 2);
+	ZADD(v, 7, b, 1, b, 3);
+	ZADD(v, 8, v, 6, v, 7);
+
+	/*
+	 * Do the first eight 5x5 muls (operand slots 0 to 7). Source
+	 * words are at most 16382 each, so we can add product results
+	 * together "as is" in 32-bit words. Each product occupies ten
+	 * words of w[]; the tenth word is set to zero.
+	 */
+	for (i = 0; i < 40; i += 5) {
+		w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
+		w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
+			+ MUL15(u[i + 1], v[i + 0]);
+		w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
+			+ MUL15(u[i + 1], v[i + 1])
+			+ MUL15(u[i + 2], v[i + 0]);
+		w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
+			+ MUL15(u[i + 1], v[i + 2])
+			+ MUL15(u[i + 2], v[i + 1])
+			+ MUL15(u[i + 3], v[i + 0]);
+		w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
+			+ MUL15(u[i + 1], v[i + 3])
+			+ MUL15(u[i + 2], v[i + 2])
+			+ MUL15(u[i + 3], v[i + 1])
+			+ MUL15(u[i + 4], v[i + 0]);
+		w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
+			+ MUL15(u[i + 2], v[i + 3])
+			+ MUL15(u[i + 3], v[i + 2])
+			+ MUL15(u[i + 4], v[i + 1]);
+		w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
+			+ MUL15(u[i + 3], v[i + 3])
+			+ MUL15(u[i + 4], v[i + 2]);
+		w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
+			+ MUL15(u[i + 4], v[i + 3]);
+		w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
+		w[(i << 1) + 9] = 0;
+	}
+
+	/*
+	 * For the 9th multiplication, source words are up to 32764,
+	 * so we must do some carry propagation. If we add up to
+	 * 4 products and the carry is no more than 524224, then the
+	 * result fits in 32 bits, and the next carry will be no more
+	 * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
+	 *
+	 * We thus just skip one of the products in the middle word,
+	 * then do a carry propagation (this reduces words to 13 bits
+	 * each, except possibly the last, which may use up to 17 bits
+	 * or so), then add the missing product.
+	 */
+	w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
+	w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
+		+ MUL15(u[40 + 1], v[40 + 0]);
+	w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
+		+ MUL15(u[40 + 1], v[40 + 1])
+		+ MUL15(u[40 + 2], v[40 + 0]);
+	w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
+		+ MUL15(u[40 + 1], v[40 + 2])
+		+ MUL15(u[40 + 2], v[40 + 1])
+		+ MUL15(u[40 + 3], v[40 + 0]);
+	w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
+		+ MUL15(u[40 + 1], v[40 + 3])
+		+ MUL15(u[40 + 2], v[40 + 2])
+		+ MUL15(u[40 + 3], v[40 + 1]);
+		/* + MUL15(u[40 + 4], v[40 + 0]) */
+	w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
+		+ MUL15(u[40 + 2], v[40 + 3])
+		+ MUL15(u[40 + 3], v[40 + 2])
+		+ MUL15(u[40 + 4], v[40 + 1]);
+	w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
+		+ MUL15(u[40 + 3], v[40 + 3])
+		+ MUL15(u[40 + 4], v[40 + 2]);
+	w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
+		+ MUL15(u[40 + 4], v[40 + 3]);
+	w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
+
+	CPR(w, 80);
+
+	/* The product that was left out of the middle word above. */
+	w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
+
+	/*
+	 * The products on 14-bit words in slots 6 and 7 yield values
+	 * up to 5*(16382^2) each, and we need to subtract two such
+	 * values from the higher word. We need the subtraction to fit
+	 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
+	 * However, 10*(16382^2) does not fit. So we must perform a
+	 * bit of reduction here.
+	 */
+	CPR(w, 60);
+	CPR(w, 70);
+
+	/*
+	 * Recompose results. From here on, offsets given to ZSUB2F and
+	 * ZADDT designate 5-word slices of w[], i.e. half-products.
+	 */
+
+	/* 0..1*0..1 into 0..3 */
+	ZSUB2F(w, 8, w, 0, w, 2);
+	ZSUB2F(w, 9, w, 1, w, 3);
+	ZADDT(w, 1, w, 8);
+	ZADDT(w, 2, w, 9);
+
+	/* 2..3*2..3 into 4..7 */
+	ZSUB2F(w, 10, w, 4, w, 6);
+	ZSUB2F(w, 11, w, 5, w, 7);
+	ZADDT(w, 5, w, 10);
+	ZADDT(w, 6, w, 11);
+
+	/* (0..1+2..3)*(0..1+2..3) into 12..15 */
+	ZSUB2F(w, 16, w, 12, w, 14);
+	ZSUB2F(w, 17, w, 13, w, 15);
+	ZADDT(w, 13, w, 16);
+	ZADDT(w, 14, w, 17);
+
+	/* first-level recomposition */
+	ZSUB2F(w, 12, w, 0, w, 4);
+	ZSUB2F(w, 13, w, 1, w, 5);
+	ZSUB2F(w, 14, w, 2, w, 6);
+	ZSUB2F(w, 15, w, 3, w, 7);
+	ZADDT(w, 2, w, 12);
+	ZADDT(w, 3, w, 13);
+	ZADDT(w, 4, w, 14);
+	ZADDT(w, 5, w, 15);
+
+	/*
+	 * Perform carry propagation to bring all words down to 13 bits.
+	 * The carry out of word 39 is folded back into that word.
+	 */
+	cc = norm13(d, w, 40);
+	d[39] += (cc << 13);
+
+#undef ZADD
+#undef ZADDT
+#undef ZSUB2F
+#undef CPR1
+#undef CPR
+}
+
+/*
+ * Variant of square20() selected when BR_SLOW_MUL15 is non-zero: there
+ * is no dedicated squaring code, the operand is simply handed to
+ * mul20() as both sources.
+ */
+static inline void
+square20(uint32_t *d, const uint32_t *a)
+{
+	mul20(d, a, a);
+}
+
+#else
+
+/*
+ * Schoolbook variant of mul20(): for each k in 0..38, the column sum
+ * t[k] accumulates MUL15(a[i], b[k - i]) over every i such that both
+ * indices fall in the 0..19 range (at most 20 products per column).
+ * Column sums are gathered in a local array, so d may overlap with a
+ * and/or b. norm13() then brings the 39 columns down to 13 bits each,
+ * and its carry out becomes the 40th result word. Loop bounds depend
+ * only on the column index, never on operand values.
+ */
+static void
+mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t t[39];
+	int k;
+
+	for (k = 0; k < 39; k ++) {
+		uint32_t col;
+		int i, first, last;
+
+		/* Range of 'i' for which 0 <= i <= 19 and 0 <= k-i <= 19. */
+		first = (k > 19) ? (k - 19) : 0;
+		last = (k < 19) ? k : 19;
+
+		col = 0;
+		for (i = first; i <= last; i ++) {
+			col += MUL15(a[i], b[k - i]);
+		}
+		t[k] = col;
+	}
+
+	d[39] = norm13(d, t, 39);
+}
+
+/*
+ * Schoolbook variant of square20(). For each column k in 0..38, the
+ * cross products MUL15(a[i], a[j]) with i < j and i + j = k are summed
+ * once and then doubled; when k is even, the square of the middle word
+ * a[k/2] is added on top. Column sums are gathered in a local array, so
+ * d may overlap with a. norm13() then brings the 39 columns down to
+ * 13 bits each, and its carry out becomes the 40th result word. Loop
+ * bounds and the parity test depend only on the column index.
+ */
+static void
+square20(uint32_t *d, const uint32_t *a)
+{
+	uint32_t t[39];
+	int k;
+
+	for (k = 0; k < 39; k ++) {
+		uint32_t col;
+		int i, j;
+
+		/* Walk index pairs (i, j) inwards, starting from the widest. */
+		i = (k > 19) ? (k - 19) : 0;
+		j = k - i;
+		col = 0;
+		while (i < j) {
+			col += MUL15(a[i], a[j]);
+			i ++;
+			j --;
+		}
+		col <<= 1;
+
+		/* Indices meet on the same word only for even k. */
+		if (i == j) {
+			col += MUL15(a[i], a[i]);
+		}
+		t[k] = col;
+	}
+
+	d[39] = norm13(d, t, 39);
+}
+
+#endif
+
+/*
+ * Final reduction in F255, the field of integers modulo 2^255-19 used
+ * by Curve25519. The source value must be less than twice the modulus.
+ *
+ * A copy of the value plus 19 is computed; bit 255 of that sum is set
+ * exactly when the source value is not lower than the modulus. In that
+ * case the sum, truncated to 255 bits (i.e. the source minus the
+ * modulus), is copied back into d with CCOPY() and 1 is returned.
+ * Otherwise d is left untouched and 0 is returned.
+ */
+static uint32_t
+reduce_final_f255(uint32_t *d)
+{
+	uint32_t tmp[20];
+	uint32_t carry, over;
+	size_t u;
+
+	memcpy(tmp, d, sizeof tmp);
+
+	/* tmp <- d + 19, with carry propagation on 13-bit words. */
+	carry = 19;
+	for (u = 0; u < 20; u ++) {
+		uint32_t sum;
+
+		sum = tmp[u] + carry;
+		tmp[u] = sum & 0x1FFF;
+		carry = sum >> 13;
+	}
+
+	/* Word 19 starts at bit 247: bits 8 and up are bits 255 and up. */
+	over = tmp[19] >> 8;
+	tmp[19] &= 0xFF;
+	CCOPY(over, d, tmp, sizeof tmp);
+	return over;
+}
+
+/*
+ * Common engine for f255_mul() and f255_square(): computes a*b (or a*a
+ * if 'square' is non-zero, in which case b is not used) and partially
+ * reduces it modulo 2^255-19. Operands and result are arrays of 20
+ * words of 13 bits each, in little-endian order.
+ */
+static void
+f255_mulgen(uint32_t *d, const uint32_t *a, const uint32_t *b, int square)
+{
+	uint32_t t[40], cc, w;
+	int i;
+
+	/*
+	 * Raw 40-word product. All result words fit in 13 bits each;
+	 * the upper word (t[39]) must fit on 5 bits, since the product
+	 * of two 256-bit integers must fit on 512 bits.
+	 */
+	if (!square) {
+		mul20(t, a, b);
+	} else {
+		square20(t, a);
+	}
+
+	/*
+	 * First reduction pass. The modulus is 2^255-19 and word 20
+	 * starts at bit 20*13 = 260, hence word 20+k is folded into
+	 * word k with a factor of 19*2^5 = 608. The bits of word 19
+	 * beyond bit 255 are folded as well, with a factor of 19,
+	 * through the initial carry.
+	 */
+	cc = MUL15(t[19] >> 8, 19);
+	t[19] &= 0xFF;
+	for (i = 0; i < 20; i ++) {
+		w = t[i] + cc + MUL15(t[i + 20], 608);
+		t[i] = w & 0x1FFF;
+		cc = w >> 13;
+	}
+
+	/*
+	 * Second pass: whatever went beyond bit 255 in the top word of
+	 * the first pass (w still holds that unmasked word) is folded
+	 * back with a factor of 19, and the result is written to d.
+	 */
+	cc = MUL15(w >> 8, 19);
+	t[19] &= 0xFF;
+	for (i = 0; i < 20; i ++) {
+		w = t[i] + cc;
+		d[i] = w & 0x1FFF;
+		cc = w >> 13;
+	}
+}
+
+/*
+ * Perform a multiplication of two integers modulo 2^255-19.
+ * Operands are arrays of 20 words, each containing 13 bits of data, in
+ * little-endian order. Input value may be up to 2^256-1; on output, value
+ * fits on 256 bits and is lower than twice the modulus.
+ *
+ * f255_mul() is the general multiplication, f255_square() is specialised
+ * for squarings.
+ *
+ * Both are thin wrappers around f255_mulgen(): f255_mul() passes 0 as
+ * the last argument, while f255_square() passes 1 and uses its source
+ * operand for both 'a' and 'b'. Since f255_square() expands its second
+ * argument twice, that argument should be free of side effects.
+ */
+#define f255_mul(d, a, b)   f255_mulgen(d, a, b, 0)
+#define f255_square(d, a)   f255_mulgen(d, a, a, 1)
+
+/*
+ * Addition in F255: d <- a + b, partially reduced (the result is lower
+ * than twice the modulus). Operands are arrays of 20 words of 13 bits.
+ */
+static void
+f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t carry, sum;
+	size_t u;
+
+	/* Plain addition on the 19 low words. */
+	carry = 0;
+	for (u = 0; u < 19; u ++) {
+		sum = a[u] + b[u] + carry;
+		d[u] = sum & 0x1FFF;
+		carry = sum >> 13;
+	}
+
+	/*
+	 * Top word: only its 8 low bits are below bit 255. Everything
+	 * above is multiplied by 19 and becomes the carry injected
+	 * into the second pass.
+	 */
+	sum = a[19] + b[19] + carry;
+	d[19] = sum & 0xFF;
+	carry = MUL15(sum >> 8, 19);
+
+	for (u = 0; u < 20; u ++) {
+		sum = d[u] + carry;
+		d[u] = sum & 0x1FFF;
+		carry = sum >> 13;
+	}
+}
+
+/*
+ * Subtraction in F255: d <- a - b, partially reduced (the result is
+ * lower than twice the modulus). Operands are arrays of 20 words of
+ * 13 bits.
+ */
+static void
+f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	/*
+	 * The value actually computed is a - b + 2*p, which cannot be
+	 * negative. 2*p = 2^256 - 38: the "-38" part is the initial
+	 * carry, and the "2^256" part is the 0x200 added to the top
+	 * word (bit 9 of word 19 is bit 256 of the integer).
+	 */
+	uint32_t carry, diff;
+	size_t u;
+
+	carry = (uint32_t)-38;
+	for (u = 0; u < 19; u ++) {
+		diff = a[u] - b[u] + carry;
+		d[u] = diff & 0x1FFF;
+		/* Sign-extending shift: the carry may be a borrow. */
+		carry = ARSH(diff, 13);
+	}
+
+	/*
+	 * Top word: keep the 8 bits below bit 255; the rest (with 2^256
+	 * added) is multiplied by 19 and injected into the second pass.
+	 */
+	diff = a[19] - b[19] + carry;
+	d[19] = diff & 0xFF;
+	carry = MUL15((diff + 0x200) >> 8, 19);
+
+	for (u = 0; u < 20; u ++) {
+		diff = d[u] + carry;
+		d[u] = diff & 0x1FFF;
+		carry = diff >> 13;
+	}
+}
+
+/*
+ * Multiplication by the 'A24' constant (121665) in F255: d <- 121665*a,
+ * partially reduced (the result is lower than twice the modulus).
+ * Operands are arrays of 20 words of 13 bits.
+ */
+static void
+f255_mul_a24(uint32_t *d, const uint32_t *a)
+{
+	uint32_t carry, acc;
+	size_t u;
+
+	/* Word-by-constant products on the 19 low words. */
+	carry = 0;
+	for (u = 0; u < 19; u ++) {
+		acc = MUL15(a[u], 121665) + carry;
+		d[u] = acc & 0x1FFF;
+		carry = acc >> 13;
+	}
+
+	/*
+	 * Top word: keep the 8 bits below bit 255; the rest is
+	 * multiplied by 19 and injected into the second pass.
+	 */
+	acc = MUL15(a[19], 121665) + carry;
+	d[19] = acc & 0xFF;
+	carry = MUL15(acc >> 8, 19);
+
+	for (u = 0; u < 20; u ++) {
+		acc = d[u] + carry;
+		d[u] = acc & 0x1FFF;
+		carry = acc >> 13;
+	}
+}
+
+static const unsigned char GEN[] = {  /* generator: 0x09 then 31 zero bytes */
+	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const unsigned char ORDER[] = {  /* 0x7F then 31 bytes of 0xFF */
+	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};  /* NOTE(review): 2^255-1 if big-endian; confirm meaning in bearssl_ec.h */
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	(void)curve;  /* the curve identifier is ignored */
+	*len = 32;  /* size of GEN, in bytes */
+	return GEN;  /* pointer to the static table, not a copy */
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	(void)curve;  /* the curve identifier is ignored */
+	*len = 32;  /* size of ORDER, in bytes */
+	return ORDER;  /* pointer to the static table, not a copy */
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	(void)curve;  /* the curve identifier is ignored */
+	*len = 32;  /* presumably the X coordinate length; see bearssl_ec.h */
+	return 0;  /* presumably the X coordinate offset in an encoded point */
+}
+
+static void
+cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
+{
+	int i;
+
+	ctl = -ctl;  /* 1 -> all-ones mask, 0 -> 0 (api_mul passes 0 or 1) */
+	for (i = 0; i < 20; i ++) {
+		uint32_t aw, bw, tw;
+
+		aw = a[i];
+		bw = b[i];
+		tw = ctl & (aw ^ bw);  /* a^b when swapping, else 0 */
+		a[i] = aw ^ tw;  /* becomes bw when swapping, else unchanged */
+		b[i] = bw ^ tw;  /* becomes aw when swapping, else unchanged */
+	}
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	uint32_t x1[20], x2[20], x3[20], z2[20], z3[20];
+	uint32_t a[20], aa[20], b[20], bb[20];
+	uint32_t c[20], d[20], e[20], da[20], cb[20];
+	unsigned char k[32];
+	uint32_t swap;
+	int i;
+
+	(void)curve;
+
+	/*
+	 * Compute k*G in place (the result overwrites G); returns 1 on
+	 * success, 0 on bad lengths. Points are exactly 32 bytes and
+	 * multipliers must fit in 32 bytes. RFC 7748 mandates that the
+	 * high bit of the last point byte be ignored: we clear it in G.
+	 */
+	if (Glen != 32 || kblen > 32) {
+		return 0;
+	}
+	G[31] &= 0x7F;
+
+	/*
+	 * Initialise x1 = x3 = decoded point, x2 = 1, z2 = 0 and z3 = 1.
+	 * Values are plain integers (no Montgomery representation here).
+	 */
+	x1[19] = le8_to_le13(x1, G, 32);  /* returned value becomes the top word */
+	memcpy(x3, x1, sizeof x1);
+	memset(z2, 0, sizeof z2);
+	memset(x2, 0, sizeof x2);
+	x2[0] = 1;
+	memset(z3, 0, sizeof z3);
+	z3[0] = 1;
+
+	memset(k, 0, (sizeof k) - kblen);  /* left-pad kb to 32 bytes */
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;  /* k[31] holds bits 0..7: clear bits 0..2 */
+	k[0] &= 0x7F;  /* clear bit 255 */
+	k[0] |= 0x40;  /* set bit 254 */
+
+	/* obsolete
+	print_int("x1", x1);
+	*/
+
+	swap = 0;
+	for (i = 254; i >= 0; i --) {  /* scalar bits, from bit 254 down to 0 */
+		uint32_t kt;
+
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;  /* bit i of k */
+		swap ^= kt;  /* 1 only when this bit differs from the previous one */
+		cswap(x2, x3, swap);
+		cswap(z2, z3, swap);
+		swap = kt;
+
+		/* obsolete
+		print_int("x2", x2);
+		print_int("z2", z2);
+		print_int("x3", x3);
+		print_int("z3", z3);
+		*/
+
+		f255_add(a, x2, z2);
+		f255_square(aa, a);
+		f255_sub(b, x2, z2);
+		f255_square(bb, b);
+		f255_sub(e, aa, bb);
+		f255_add(c, x3, z3);
+		f255_sub(d, x3, z3);
+		f255_mul(da, d, a);
+		f255_mul(cb, c, b);
+
+		/* obsolete
+		print_int("a ", a);
+		print_int("aa", aa);
+		print_int("b ", b);
+		print_int("bb", bb);
+		print_int("e ", e);
+		print_int("c ", c);
+		print_int("d ", d);
+		print_int("da", da);
+		print_int("cb", cb);
+		*/
+
+		f255_add(x3, da, cb);
+		f255_square(x3, x3);  /* x3 = (da + cb)^2 */
+		f255_sub(z3, da, cb);
+		f255_square(z3, z3);
+		f255_mul(z3, z3, x1);  /* z3 = x1 * (da - cb)^2 */
+		f255_mul(x2, aa, bb);  /* x2 = aa * bb */
+		f255_mul_a24(z2, e);
+		f255_add(z2, z2, aa);
+		f255_mul(z2, e, z2);  /* z2 = e * (aa + 121665 * e) */
+
+		/* obsolete
+		print_int("x2", x2);
+		print_int("z2", z2);
+		print_int("x3", x3);
+		print_int("z3", z3);
+		*/
+	}
+	cswap(x2, x3, swap);  /* apply the swap left pending by the last bit */
+	cswap(z2, z3, swap);
+
+	/*
+	 * Invert z2 by raising it to the power p-2 = 2^255-21. This is a
+	 * square-and-multiply; since the exponent is almost all ones, we
+	 * first build z2^(2^16-1) and reuse it for each 16-bit chunk.
+	 */
+	memcpy(a, z2, sizeof z2);
+	for (i = 0; i < 15; i ++) {  /* a = z2^(2^16-1) */
+		f255_square(a, a);
+		f255_mul(a, a, z2);
+	}
+	memcpy(b, a, sizeof a);
+	for (i = 0; i < 14; i ++) {  /* b = z2^(2^240-1) */
+		int j;
+
+		for (j = 0; j < 16; j ++) {
+			f255_square(b, b);
+		}
+		f255_mul(b, b, a);
+	}
+	for (i = 14; i >= 0; i --) {  /* low 15 exponent bits: 0x7FEB */
+		f255_square(b, b);
+		if ((0xFFEB >> i) & 1) {
+			f255_mul(b, z2, b);
+		}
+	}
+	f255_mul(x2, x2, b);  /* x2 * z2^(p-2) */
+	reduce_final_f255(x2);
+	le13_to_le8(G, 32, x2);  /* the result is written over G */
+	return 1;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	const unsigned char *G;
+	size_t Glen;
+
+	G = api_generator(curve, &Glen);
+	memcpy(R, G, Glen);  /* api_mul() works in place, so start from a copy */
+	api_mul(R, Glen, x, xlen, curve);  /* its status is not checked */
+	return Glen;  /* always 32, even if api_mul() rejected xlen */
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * We don't implement this method, since it is used for ECDSA
+	 * only, and there is no ECDSA over Curve25519 (which instead
+	 * uses EdDSA). All arguments are ignored and 0 is returned.
+	 */
+	(void)A;
+	(void)B;
+	(void)len;
+	(void)x;
+	(void)xlen;
+	(void)y;
+	(void)ylen;
+	(void)curve;
+	return 0;  /* same value api_mul() returns on invalid input */
+}
+
+/* see bearssl_ec.h (declares br_ec_impl; fields are filled positionally) */
+const br_ec_impl br_ec_c25519_m15 = {
+	(uint32_t)0x20000000,  /* only bit 29 set; field meaning: see bearssl_ec.h */
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd  /* stub: always returns 0 */
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_m31.c b/test/monniaux/BearSSL/src/ec/ec_c25519_m31.c
new file mode 100644
index 00000000..1dd6d514
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_c25519_m31.c
@@ -0,0 +1,800 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* obsolete
+#include <stdio.h>
+#include <stdlib.h>
+static void
+print_int(const char *name, const uint32_t *x)
+{
+	size_t u;
+	unsigned char tmp[40];
+
+	printf("%s = ", name);
+	for (u = 0; u < 9; u ++) {
+		if (x[u] > 0x3FFFFFFF) {
+			printf("INVALID:");
+			for (u = 0; u < 9; u ++) {
+				printf(" %08X", x[u]);
+			}
+			printf("\n");
+			return;
+		}
+	}
+	memset(tmp, 0, sizeof tmp);
+	for (u = 0; u < 9; u ++) {
+		uint64_t w;
+		int j, k;
+
+		w = x[u];
+		j = 30 * (int)u;
+		k = j & 7;
+		if (k != 0) {
+			w <<= k;
+			j -= k;
+		}
+		k = j >> 3;
+		for (j = 0; j < 8; j ++) {
+			tmp[39 - k - j] |= (unsigned char)w;
+			w >>= 8;
+		}
+	}
+	for (u = 8; u < 40; u ++) {
+		printf("%02X", tmp[u]);
+	}
+	printf("\n");
+}
+*/
+
+/*
+ * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
+ * that right-shifting a signed negative integer copies the sign bit
+ * (arithmetic right-shift). This is "implementation-defined behaviour",
+ * i.e. it is not undefined, but it may differ between compilers. Each
+ * compiler is supposed to document its behaviour in that respect. GCC
+ * explicitly defines that an arithmetic right shift is used. We expect
+ * all other compilers to do the same, because the underlying CPUs offer
+ * an arithmetic right shift opcode that could not be used otherwise.
+ */
+#if BR_NO_ARITH_SHIFT
+#define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
+                    | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
+#else  /* this variant takes the address of x: x must be an lvalue */
+#define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
+#endif
+
+/*
+ * Convert an integer from unsigned little-endian encoding to a sequence of
+ * 30-bit words in little-endian order. The final "partial" word is
+ * returned and NOT written to dst[]; the caller must store it.
+ */
+static uint32_t
+le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
+{
+	uint32_t acc;
+	int acc_len;
+
+	acc = 0;
+	acc_len = 0;
+	while (len -- > 0) {
+		uint32_t b;
+
+		b = *src ++;
+		if (acc_len < 22) {  /* byte fits without completing a word */
+			acc |= b << acc_len;
+			acc_len += 8;
+		} else {  /* byte completes a 30-bit word */
+			*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
+			acc = b >> (30 - acc_len);  /* leftover high bits of b */
+			acc_len -= 22;  /* i.e. acc_len + 8 - 30 */
+		}
+	}
+	return acc;
+}
+
+/*
+ * Convert an integer (30-bit words, little-endian) to unsigned
+ * little-endian encoding. The total encoding length is provided; all
+ * the destination bytes will be filled.
+ */
+static void
+le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
+{
+	uint32_t acc;
+	int acc_len;
+
+	acc = 0;
+	acc_len = 0;
+	while (len -- > 0) {
+		if (acc_len < 8) {  /* fewer than 8 bits buffered: read a word */
+			uint32_t w;
+
+			w = *src ++;
+			*dst ++ = (unsigned char)(acc | (w << acc_len));
+			acc = w >> (8 - acc_len);  /* bits of w not yet emitted */
+			acc_len += 22;  /* +30 from the word, -8 just emitted */
+		} else {  /* a full byte is already buffered */
+			*dst ++ = (unsigned char)acc;
+			acc >>= 8;
+			acc_len -= 8;
+		}
+	}
+}
+
+/*
+ * Multiply two integers. Source integers are represented as arrays of
+ * nine 30-bit words, for values up to 2^270-1. Result is encoded over
+ * 18 words of 30 bits each. d[] is written only after all products.
+ */
+static void
+mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	/*
+	 * Maximum intermediate result is no more than
+	 * 10376293531797946367, which fits in 64 bits. Reason:
+	 *
+	 *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
+	 *   10376293531797946367 < 9663676407 * 2^30
+	 *
+	 * Thus, adding together 9 products of 30-bit integers, with
+	 * a carry of at most 9663676406, yields an integer that fits
+	 * on 64 bits and generates a carry of at most 9663676406.
+	 */
+	uint64_t t[17];  /* t[i] = sum of a[j] * b[i - j], before carries */
+	uint64_t cc;
+	int i;
+
+	t[ 0] = MUL31(a[0], b[0]);
+	t[ 1] = MUL31(a[0], b[1])
+		+ MUL31(a[1], b[0]);
+	t[ 2] = MUL31(a[0], b[2])
+		+ MUL31(a[1], b[1])
+		+ MUL31(a[2], b[0]);
+	t[ 3] = MUL31(a[0], b[3])
+		+ MUL31(a[1], b[2])
+		+ MUL31(a[2], b[1])
+		+ MUL31(a[3], b[0]);
+	t[ 4] = MUL31(a[0], b[4])
+		+ MUL31(a[1], b[3])
+		+ MUL31(a[2], b[2])
+		+ MUL31(a[3], b[1])
+		+ MUL31(a[4], b[0]);
+	t[ 5] = MUL31(a[0], b[5])
+		+ MUL31(a[1], b[4])
+		+ MUL31(a[2], b[3])
+		+ MUL31(a[3], b[2])
+		+ MUL31(a[4], b[1])
+		+ MUL31(a[5], b[0]);
+	t[ 6] = MUL31(a[0], b[6])
+		+ MUL31(a[1], b[5])
+		+ MUL31(a[2], b[4])
+		+ MUL31(a[3], b[3])
+		+ MUL31(a[4], b[2])
+		+ MUL31(a[5], b[1])
+		+ MUL31(a[6], b[0]);
+	t[ 7] = MUL31(a[0], b[7])
+		+ MUL31(a[1], b[6])
+		+ MUL31(a[2], b[5])
+		+ MUL31(a[3], b[4])
+		+ MUL31(a[4], b[3])
+		+ MUL31(a[5], b[2])
+		+ MUL31(a[6], b[1])
+		+ MUL31(a[7], b[0]);
+	t[ 8] = MUL31(a[0], b[8])
+		+ MUL31(a[1], b[7])
+		+ MUL31(a[2], b[6])
+		+ MUL31(a[3], b[5])
+		+ MUL31(a[4], b[4])
+		+ MUL31(a[5], b[3])
+		+ MUL31(a[6], b[2])
+		+ MUL31(a[7], b[1])
+		+ MUL31(a[8], b[0]);
+	t[ 9] = MUL31(a[1], b[8])
+		+ MUL31(a[2], b[7])
+		+ MUL31(a[3], b[6])
+		+ MUL31(a[4], b[5])
+		+ MUL31(a[5], b[4])
+		+ MUL31(a[6], b[3])
+		+ MUL31(a[7], b[2])
+		+ MUL31(a[8], b[1]);
+	t[10] = MUL31(a[2], b[8])
+		+ MUL31(a[3], b[7])
+		+ MUL31(a[4], b[6])
+		+ MUL31(a[5], b[5])
+		+ MUL31(a[6], b[4])
+		+ MUL31(a[7], b[3])
+		+ MUL31(a[8], b[2]);
+	t[11] = MUL31(a[3], b[8])
+		+ MUL31(a[4], b[7])
+		+ MUL31(a[5], b[6])
+		+ MUL31(a[6], b[5])
+		+ MUL31(a[7], b[4])
+		+ MUL31(a[8], b[3]);
+	t[12] = MUL31(a[4], b[8])
+		+ MUL31(a[5], b[7])
+		+ MUL31(a[6], b[6])
+		+ MUL31(a[7], b[5])
+		+ MUL31(a[8], b[4]);
+	t[13] = MUL31(a[5], b[8])
+		+ MUL31(a[6], b[7])
+		+ MUL31(a[7], b[6])
+		+ MUL31(a[8], b[5]);
+	t[14] = MUL31(a[6], b[8])
+		+ MUL31(a[7], b[7])
+		+ MUL31(a[8], b[6]);
+	t[15] = MUL31(a[7], b[8])
+		+ MUL31(a[8], b[7]);
+	t[16] = MUL31(a[8], b[8]);
+
+	/*
+	 * Propagate carries.
+	 */
+	cc = 0;
+	for (i = 0; i < 17; i ++) {
+		uint64_t w;
+
+		w = t[i] + cc;
+		d[i] = (uint32_t)w & 0x3FFFFFFF;
+		cc = w >> 30;
+	}
+	d[17] = (uint32_t)cc;  /* the last carry is the top output word */
+}
+
+/*
+ * Square a 270-bit integer (nine 30-bit words) into 18 words of 30 bits
+ * each. Cross products a[i]*a[j], i != j, are computed once and doubled.
+ */
+static void
+square9(uint32_t *d, const uint32_t *a)
+{
+	uint64_t t[17];  /* t[i] = sum of a[j] * a[i - j], before carries */
+	uint64_t cc;
+	int i;
+
+	t[ 0] = MUL31(a[0], a[0]);
+	t[ 1] = ((MUL31(a[0], a[1])) << 1);
+	t[ 2] = MUL31(a[1], a[1])
+		+ ((MUL31(a[0], a[2])) << 1);
+	t[ 3] = ((MUL31(a[0], a[3])
+		+ MUL31(a[1], a[2])) << 1);
+	t[ 4] = MUL31(a[2], a[2])
+		+ ((MUL31(a[0], a[4])
+		+ MUL31(a[1], a[3])) << 1);
+	t[ 5] = ((MUL31(a[0], a[5])
+		+ MUL31(a[1], a[4])
+		+ MUL31(a[2], a[3])) << 1);
+	t[ 6] = MUL31(a[3], a[3])
+		+ ((MUL31(a[0], a[6])
+		+ MUL31(a[1], a[5])
+		+ MUL31(a[2], a[4])) << 1);
+	t[ 7] = ((MUL31(a[0], a[7])
+		+ MUL31(a[1], a[6])
+		+ MUL31(a[2], a[5])
+		+ MUL31(a[3], a[4])) << 1);
+	t[ 8] = MUL31(a[4], a[4])
+		+ ((MUL31(a[0], a[8])
+		+ MUL31(a[1], a[7])
+		+ MUL31(a[2], a[6])
+		+ MUL31(a[3], a[5])) << 1);
+	t[ 9] = ((MUL31(a[1], a[8])
+		+ MUL31(a[2], a[7])
+		+ MUL31(a[3], a[6])
+		+ MUL31(a[4], a[5])) << 1);
+	t[10] = MUL31(a[5], a[5])
+		+ ((MUL31(a[2], a[8])
+		+ MUL31(a[3], a[7])
+		+ MUL31(a[4], a[6])) << 1);
+	t[11] = ((MUL31(a[3], a[8])
+		+ MUL31(a[4], a[7])
+		+ MUL31(a[5], a[6])) << 1);
+	t[12] = MUL31(a[6], a[6])
+		+ ((MUL31(a[4], a[8])
+		+ MUL31(a[5], a[7])) << 1);
+	t[13] = ((MUL31(a[5], a[8])
+		+ MUL31(a[6], a[7])) << 1);
+	t[14] = MUL31(a[7], a[7])
+		+ ((MUL31(a[6], a[8])) << 1);
+	t[15] = ((MUL31(a[7], a[8])) << 1);
+	t[16] = MUL31(a[8], a[8]);
+
+	/*
+	 * Propagate carries (same scheme as in mul9()).
+	 */
+	cc = 0;
+	for (i = 0; i < 17; i ++) {
+		uint64_t w;
+
+		w = t[i] + cc;
+		d[i] = (uint32_t)w & 0x3FFFFFFF;
+		cc = w >> 30;
+	}
+	d[17] = (uint32_t)cc;  /* the last carry is the top output word */
+}
+
+/*
+ * Perform a "final reduction" in field F255 (field for Curve25519).
+ * The source value must be less than twice the modulus. If the value
+ * is not lower than the modulus, then the modulus is subtracted and
+ * this function returns 1; otherwise, it leaves it untouched and it
+ * returns 0.
+ */
+static uint32_t
+reduce_final_f255(uint32_t *d)
+{
+	uint32_t t[9];
+	uint32_t cc;
+	int i;
+
+	memcpy(t, d, sizeof t);
+	cc = 19;  /* the loop below computes t = d + 19 */
+	for (i = 0; i < 9; i ++) {
+		uint32_t w;
+
+		w = t[i] + cc;
+		cc = w >> 30;
+		t[i] = w & 0x3FFFFFFF;
+	}
+	cc = t[8] >> 15;  /* bit 255 of d + 19: set when d >= 2^255-19 */
+	t[8] &= 0x7FFF;  /* drop 2^255, leaving t = d - (2^255-19) */
+	CCOPY(cc, d, t, sizeof t);  /* presumably copies t to d iff cc != 0 */
+	return cc;
+}
+
+/*
+ * Perform a multiplication of two integers modulo 2^255-19. Operands are
+ * arrays of 9 words of 30 bits, in little-endian order. Input values may
+ * be up to 2^256-1; on output, value fits on 256 bits and is lower than
+ * twice the modulus. d[] is written last, so it may alias a[] or b[].
+ */
+static void
+f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t t[18], cc;
+	int i;
+
+	/*
+	 * Compute raw multiplication. All result words fit in 30 bits
+	 * each; upper word (t[17]) must fit on 2 bits, since the product
+	 * of two 256-bit integers must fit on 512 bits.
+	 */
+	mul9(t, a, b);
+
+	/*
+	 * Modular reduction: each high word is added where necessary.
+	 * Since the modulus is 2^255-19 and word 9 corresponds to
+	 * offset 9*30 = 270, word 9+k must be added to word k with
+	 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
+	 * added that way.
+	 *
+	 * Keeping the carry on 32 bits helps with 32-bit architectures,
+	 * and does not noticeably impact performance on 64-bit systems.
+	 */
+	cc = MUL15(t[8] >> 15, 19);  /* at most 19*(2^15-1) = 622573 */
+	t[8] &= 0x7FFF;
+	for (i = 0; i < 9; i ++) {
+		uint64_t w;
+
+		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
+		t[i] = (uint32_t)w & 0x3FFFFFFF;
+		cc = (uint32_t)(w >> 30);  /* at most 622592 */
+	}
+
+	/*
+	 * Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
+	 * This was split into two parts (upper of 257 bits, lower of 255
+	 * bits), and the upper was added to the lower with a factor 19,
+	 * which means that the intermediate value is less than 77*2^255
+	 * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
+	 * less than 77, and the initial carry cc is at most 76*19 = 1444.
+	 */
+	cc = MUL15(t[8] >> 15, 19);
+	t[8] &= 0x7FFF;
+	for (i = 0; i < 9; i ++) {
+		uint32_t z;
+
+		z = t[i] + cc;
+		d[i] = z & 0x3FFFFFFF;
+		cc = z >> 30;
+	}
+
+	/*
+	 * Final result is at most 2^255 + 1443. In particular, the last
+	 * carry is necessarily 0, since t[8] was truncated to 15 bits.
+	 */
+}
+
+/*
+ * Perform a squaring of an integer modulo 2^255-19. The operand is an
+ * array of 9 words of 30 bits, in little-endian order. Input value may
+ * be up to 2^256-1; on output, value fits on 256 bits and is lower than
+ * twice the modulus. d[] is written last, so it may alias a[].
+ */
+static void
+f255_square(uint32_t *d, const uint32_t *a)
+{
+	uint32_t t[18], cc;
+	int i;
+
+	/*
+	 * Compute raw squaring. All result words fit in 30 bits
+	 * each; upper word (t[17]) must fit on 2 bits, since the square
+	 * of a 256-bit integer must fit on 512 bits.
+	 */
+	square9(t, a);
+
+	/*
+	 * Modular reduction: each high word is added where necessary.
+	 * See f255_mul() for details on the reduction and carry limits.
+	 */
+	cc = MUL15(t[8] >> 15, 19);
+	t[8] &= 0x7FFF;
+	for (i = 0; i < 9; i ++) {
+		uint64_t w;
+
+		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
+		t[i] = (uint32_t)w & 0x3FFFFFFF;
+		cc = (uint32_t)(w >> 30);
+	}
+	cc = MUL15(t[8] >> 15, 19);  /* second pass: fold the new bits >= 255 */
+	t[8] &= 0x7FFF;
+	for (i = 0; i < 9; i ++) {
+		uint32_t z;
+
+		z = t[i] + cc;
+		d[i] = z & 0x3FFFFFFF;
+		cc = z >> 30;
+	}
+}
+
+/*
+ * Add two values in F255. Partial reduction (down to less than twice
+ * the modulus) is done by folding bits 255 and up back in, times 19.
+ */
+static void
+f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	/*
+	 * Since operand words fit on 30 bits, we can use 32-bit
+	 * variables throughout.
+	 */
+	int i;
+	uint32_t cc, w;
+
+	cc = 0;
+	for (i = 0; i < 9; i ++) {
+		w = a[i] + b[i] + cc;
+		d[i] = w & 0x3FFFFFFF;
+		cc = w >> 30;
+	}
+	cc = MUL15(w >> 15, 19);  /* w is the top sum; its bit 15 is bit 255 */
+	d[8] &= 0x7FFF;  /* word 8 keeps bits 240..254 only */
+	for (i = 0; i < 9; i ++) {
+		w = d[i] + cc;
+		d[i] = w & 0x3FFFFFFF;
+		cc = w >> 30;
+	}
+}
+
+/*
+ * Subtract one value from another in F255 (d = a - b). Partial reduction
+ * is performed (down to less than twice the modulus).
+ */
+static void
+f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	/*
+	 * We actually compute a - b + 2*p, so that the final value is
+	 * necessarily positive.
+	 */
+	int i;
+	uint32_t cc, w;
+
+	cc = (uint32_t)-38;  /* low part of 2*p = 2^256 - 38 */
+	for (i = 0; i < 9; i ++) {
+		w = a[i] - b[i] + cc;
+		d[i] = w & 0x3FFFFFFF;
+		cc = ARSH(w, 30);  /* arithmetic shift: a borrow stays negative */
+	}
+	cc = MUL15((w + 0x10000) >> 15, 19);  /* 0x10000 in word 8 is the 2^256 */
+	d[8] &= 0x7FFF;  /* word 8 keeps bits 240..254 only */
+	for (i = 0; i < 9; i ++) {
+		w = d[i] + cc;
+		d[i] = w & 0x3FFFFFFF;
+		cc = w >> 30;
+	}
+}
+
+/*
+ * Multiply an integer by the 'A24' constant (121665). Partial reduction
+ * is performed (down to less than twice the modulus).
+ */
+static void
+f255_mul_a24(uint32_t *d, const uint32_t *a)
+{
+	int i;
+	uint64_t w;
+	uint32_t cc;
+
+	/*
+	 * a[] is over 256 bits, thus a[8] has length at most 16 bits.
+	 * We single out the processing of the last word: intermediate
+	 * value w is up to 121665*2^16, yielding a carry for the next
+	 * loop of at most 19*(121665*2^16/2^15) = 4623270.
+	 */
+	cc = 0;
+	for (i = 0; i < 8; i ++) {
+		w = MUL31(a[i], 121665) + (uint64_t)cc;
+		d[i] = (uint32_t)w & 0x3FFFFFFF;
+		cc = (uint32_t)(w >> 30);
+	}
+	w = MUL31(a[8], 121665) + (uint64_t)cc;
+	d[8] = (uint32_t)w & 0x7FFF;  /* keep bits 240..254 */
+	cc = MUL15((uint32_t)(w >> 15), 19);  /* bits 255 and up, times 19 */
+
+	for (i = 0; i < 9; i ++) {
+		uint32_t z;
+
+		z = d[i] + cc;
+		d[i] = z & 0x3FFFFFFF;
+		cc = z >> 30;
+	}
+}
+
+static const unsigned char GEN[] = {  /* generator: 0x09 then 31 zero bytes */
+	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const unsigned char ORDER[] = {  /* 0x7F then 31 bytes of 0xFF */
+	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};  /* NOTE(review): 2^255-1 if big-endian; confirm meaning in bearssl_ec.h */
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	(void)curve;  /* the curve identifier is ignored */
+	*len = 32;  /* size of GEN, in bytes */
+	return GEN;  /* pointer to the static table, not a copy */
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	(void)curve;  /* the curve identifier is ignored */
+	*len = 32;  /* size of ORDER, in bytes */
+	return ORDER;  /* pointer to the static table, not a copy */
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	(void)curve;  /* the curve identifier is ignored */
+	*len = 32;  /* presumably the X coordinate length; see bearssl_ec.h */
+	return 0;  /* presumably the X coordinate offset in an encoded point */
+}
+
+static void
+cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
+{
+	int i;
+
+	ctl = -ctl;  /* 1 -> all-ones mask, 0 -> 0 (api_mul passes 0 or 1) */
+	for (i = 0; i < 9; i ++) {
+		uint32_t aw, bw, tw;
+
+		aw = a[i];
+		bw = b[i];
+		tw = ctl & (aw ^ bw);  /* a^b when swapping, else 0 */
+		a[i] = aw ^ tw;  /* becomes bw when swapping, else unchanged */
+		b[i] = bw ^ tw;  /* becomes aw when swapping, else unchanged */
+	}
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	uint32_t x1[9], x2[9], x3[9], z2[9], z3[9];
+	uint32_t a[9], aa[9], b[9], bb[9];
+	uint32_t c[9], d[9], e[9], da[9], cb[9];
+	unsigned char k[32];
+	uint32_t swap;
+	int i;
+
+	(void)curve;
+
+	/*
+	 * Compute k*G in place (the result overwrites G); returns 1 on
+	 * success, 0 on bad lengths. Points are exactly 32 bytes and
+	 * multipliers must fit in 32 bytes. RFC 7748 mandates that the
+	 * high bit of the last point byte be ignored: we clear it in G.
+	 */
+	if (Glen != 32 || kblen > 32) {
+		return 0;
+	}
+	G[31] &= 0x7F;
+
+	/*
+	 * Initialise x1 = x3 = decoded point, x2 = 1, z2 = 0 and z3 = 1.
+	 * Values are plain integers (no Montgomery representation here).
+	 */
+	x1[8] = le8_to_le30(x1, G, 32);  /* returned partial word is the top word */
+	memcpy(x3, x1, sizeof x1);
+	memset(z2, 0, sizeof z2);
+	memset(x2, 0, sizeof x2);
+	x2[0] = 1;
+	memset(z3, 0, sizeof z3);
+	z3[0] = 1;
+
+	memset(k, 0, (sizeof k) - kblen);  /* left-pad kb to 32 bytes */
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;  /* k[31] holds bits 0..7: clear bits 0..2 */
+	k[0] &= 0x7F;  /* clear bit 255 */
+	k[0] |= 0x40;  /* set bit 254 */
+
+	/* obsolete
+	print_int("x1", x1);
+	*/
+
+	swap = 0;
+	for (i = 254; i >= 0; i --) {  /* scalar bits, from bit 254 down to 0 */
+		uint32_t kt;
+
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;  /* bit i of k */
+		swap ^= kt;  /* 1 only when this bit differs from the previous one */
+		cswap(x2, x3, swap);
+		cswap(z2, z3, swap);
+		swap = kt;
+
+		/* obsolete
+		print_int("x2", x2);
+		print_int("z2", z2);
+		print_int("x3", x3);
+		print_int("z3", z3);
+		*/
+
+		f255_add(a, x2, z2);
+		f255_square(aa, a);
+		f255_sub(b, x2, z2);
+		f255_square(bb, b);
+		f255_sub(e, aa, bb);
+		f255_add(c, x3, z3);
+		f255_sub(d, x3, z3);
+		f255_mul(da, d, a);
+		f255_mul(cb, c, b);
+
+		/* obsolete
+		print_int("a ", a);
+		print_int("aa", aa);
+		print_int("b ", b);
+		print_int("bb", bb);
+		print_int("e ", e);
+		print_int("c ", c);
+		print_int("d ", d);
+		print_int("da", da);
+		print_int("cb", cb);
+		*/
+
+		f255_add(x3, da, cb);
+		f255_square(x3, x3);  /* x3 = (da + cb)^2 */
+		f255_sub(z3, da, cb);
+		f255_square(z3, z3);
+		f255_mul(z3, z3, x1);  /* z3 = x1 * (da - cb)^2 */
+		f255_mul(x2, aa, bb);  /* x2 = aa * bb */
+		f255_mul_a24(z2, e);
+		f255_add(z2, z2, aa);
+		f255_mul(z2, e, z2);  /* z2 = e * (aa + 121665 * e) */
+
+		/* obsolete
+		print_int("x2", x2);
+		print_int("z2", z2);
+		print_int("x3", x3);
+		print_int("z3", z3);
+		*/
+	}
+	cswap(x2, x3, swap);  /* apply the swap left pending by the last bit */
+	cswap(z2, z3, swap);
+
+	/*
+	 * Invert z2 by raising it to the power p-2 = 2^255-21. This is a
+	 * square-and-multiply; since the exponent is almost all ones, we
+	 * first build z2^(2^16-1) and reuse it for each 16-bit chunk.
+	 */
+	memcpy(a, z2, sizeof z2);
+	for (i = 0; i < 15; i ++) {  /* a = z2^(2^16-1) */
+		f255_square(a, a);
+		f255_mul(a, a, z2);
+	}
+	memcpy(b, a, sizeof a);
+	for (i = 0; i < 14; i ++) {  /* b = z2^(2^240-1) */
+		int j;
+
+		for (j = 0; j < 16; j ++) {
+			f255_square(b, b);
+		}
+		f255_mul(b, b, a);
+	}
+	for (i = 14; i >= 0; i --) {  /* low 15 exponent bits: 0x7FEB */
+		f255_square(b, b);
+		if ((0xFFEB >> i) & 1) {
+			f255_mul(b, z2, b);
+		}
+	}
+	f255_mul(x2, x2, b);  /* x2 * z2^(p-2) */
+	reduce_final_f255(x2);  /* bring x2 below the modulus before encoding */
+	le30_to_le8(G, 32, x2);  /* the result is written over G */
+	return 1;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	const unsigned char *G;
+	size_t Glen;
+
+	G = api_generator(curve, &Glen);
+	memcpy(R, G, Glen);  /* api_mul() works in place, so start from a copy */
+	api_mul(R, Glen, x, xlen, curve);  /* its status is not checked */
+	return Glen;  /* always 32, even if api_mul() rejected xlen */
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * We don't implement this method, since it is used for ECDSA
+	 * only, and there is no ECDSA over Curve25519 (which instead
+	 * uses EdDSA). All arguments are ignored and 0 is returned.
+	 */
+	(void)A;
+	(void)B;
+	(void)len;
+	(void)x;
+	(void)xlen;
+	(void)y;
+	(void)ylen;
+	(void)curve;
+	return 0;  /* same value api_mul() returns on invalid input */
+}
+
+/* see bearssl_ec.h (declares br_ec_impl; fields are filled positionally) */
+const br_ec_impl br_ec_c25519_m31 = {
+	(uint32_t)0x20000000,  /* only bit 29 set; field meaning: see bearssl_ec.h */
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd  /* stub: always returns 0 */
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_m62.c b/test/monniaux/BearSSL/src/ec/ec_c25519_m62.c
new file mode 100644
index 00000000..6b058eb1
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_c25519_m62.c
@@ -0,0 +1,605 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+static const unsigned char GEN[] = {  /* generator: 0x09 then 31 zero bytes */
+	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const unsigned char ORDER[] = {  /* 0x7F then 31 bytes of 0xFF */
+	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
+};  /* NOTE(review): 2^255-1 if big-endian; confirm meaning in bearssl_ec.h */
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	(void)curve;  /* the curve identifier is ignored */
+	*len = 32;  /* size of GEN, in bytes */
+	return GEN;  /* pointer to the static table, not a copy */
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	(void)curve;  /* the curve identifier is ignored */
+	*len = 32;  /* size of ORDER, in bytes */
+	return ORDER;  /* pointer to the static table, not a copy */
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	(void)curve;  /* the curve identifier is ignored */
+	*len = 32;  /* presumably the X coordinate length; see bearssl_ec.h */
+	return 0;  /* presumably the X coordinate offset in an encoded point */
+}
+
+/*
+ * A field element is encoded as five 64-bit limbs, in base 2^51, least
+ * significant limb first. Limbs may occasionally be larger than 2^51, to
+ * save on carry propagation costs. MASK51 selects the low 51 bits.
+ */
+
+#define MASK51   (((uint64_t)1 << 51) - (uint64_t)1)
+
+/*
+ * Swap two field elements if ctl is 1; leave both unchanged if ctl is 0.
+ */
+static inline void
+f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
+{
+	uint64_t m, w;
+
+	m = -(uint64_t)ctl;  /* 1 -> all-ones mask, 0 -> 0 */
+	w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
+	w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
+	w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
+	w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
+	w = m & (a[4] ^ b[4]); a[4] ^= w; b[4] ^= w;
+}
+
+/*
+ * Limb-wise addition, no carry propagation: limbs may double in value.
+ */
+static inline void
+f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+	d[0] = a[0] + b[0];
+	d[1] = a[1] + b[1];
+	d[2] = a[2] + b[2];
+	d[3] = a[3] + b[3];
+	d[4] = a[4] + b[4];
+}
+
+/*
+ * Subtraction (d = a - b).
+ * On input, limbs must fit on 60 bits each. On output, result is
+ * partially reduced, with max value 2^255+19456; moreover, all
+ * limbs will fit on 51 bits, except the low limb, which may have
+ * value up to 2^51+19455.
+ */
+static inline void
+f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+	uint64_t cc, w;
+
+	/*
+	 * We compute d = (2^255-19)*1024 + a - b. Since the limbs
+	 * fit on 60 bits, the maximum values of the operands are slightly
+	 * more than 2^264, but much less than 2^265-19456. This
+	 * ensures that the result is positive.
+	 */
+
+	/*
+	 * Initial carry is 19456, since we add 2^265-19456. Each
+	 * individual subtraction may yield a carry up to 513.
+	 */
+	w = a[0] - b[0] - 19456;
+	d[0] = w & MASK51;
+	cc = -(w >> 51) & 0x3FF;  /* borrow to take from the next limb */
+	w = a[1] - b[1] - cc;
+	d[1] = w & MASK51;
+	cc = -(w >> 51) & 0x3FF;
+	w = a[2] - b[2] - cc;
+	d[2] = w & MASK51;
+	cc = -(w >> 51) & 0x3FF;
+	w = a[3] - b[3] - cc;
+	d[3] = w & MASK51;
+	cc = -(w >> 51) & 0x3FF;
+	d[4] = ((uint64_t)1 << 61) + a[4] - b[4] - cc;  /* 2^61 in limb 4 is 2^265 */
+
+	/*
+	 * Partial reduction. The intermediate result may be up to
+	 * slightly above 2^265, but less than 2^265+2^255. When we
+	 * truncate to 255 bits, the upper bits will be at most 1024.
+	 */
+	d[0] += 19 * (d[4] >> 51);
+	d[4] &= MASK51;
+}
+
+/*
+ * UMUL51(hi, lo, x, y) computes:
+ *
+ *   hi = floor((x * y) / (2^51))
+ *   lo = x * y mod 2^51
+ *
+ * Note that lo < 2^51, but "hi" may exceed 2^51 if the input operands do;
+ * "hi" is truncated to 64 bits, so x * y must stay below 2^115.
+ */
+#if BR_INT128
+
+#define UMUL51(hi, lo, x, y)   do { \
+		unsigned __int128 umul_tmp; \
+		umul_tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \
+		(hi) = (uint64_t)(umul_tmp >> 51); \
+		(lo) = (uint64_t)umul_tmp & MASK51; \
+	} while (0)
+
+#elif BR_UMUL128
+
+#define UMUL51(hi, lo, x, y)   do { \
+		uint64_t umul_hi, umul_lo; \
+		umul_lo = _umul128((x), (y), &umul_hi); \
+		(hi) = (umul_hi << 13) | (umul_lo >> 51); \
+		(lo) = umul_lo & MASK51; \
+	} while (0)
+
+#endif
+
+/*
+ * Multiplication: d = a * b in the field; a[] and b[] are only read.
+ * On input, limbs must fit on 54 bits each.
+ * On output, limb 0 is at most 2^51 + 155647, and other limbs fit
+ * on 51 bits each. d[] is written last, so it may alias a[] or b[].
+ */
+static inline void
+f255_mul(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+	uint64_t t[10], hi, lo, w, cc;
+
+	/*
+	 * Perform cross products, accumulating values without carry
+	 * propagation.
+	 *
+	 * Since input limbs fit on 54 bits each, each individual
+	 * UMUL51 will produce a "hi" of less than 2^57. The maximum
+	 * sum will be at most 5*(2^57-1) + 4*(2^51-1) (for t[5]),
+	 * i.e. less than 324*2^51.
+	 */
+
+	UMUL51(t[1], t[0], a[0], b[0]);
+
+	UMUL51(t[2], lo, a[1], b[0]); t[1] += lo;
+	UMUL51(hi, lo, a[0], b[1]); t[1] += lo; t[2] += hi;
+
+	UMUL51(t[3], lo, a[2], b[0]); t[2] += lo;
+	UMUL51(hi, lo, a[1], b[1]); t[2] += lo; t[3] += hi;
+	UMUL51(hi, lo, a[0], b[2]); t[2] += lo; t[3] += hi;
+
+	UMUL51(t[4], lo, a[3], b[0]); t[3] += lo;
+	UMUL51(hi, lo, a[2], b[1]); t[3] += lo; t[4] += hi;
+	UMUL51(hi, lo, a[1], b[2]); t[3] += lo; t[4] += hi;
+	UMUL51(hi, lo, a[0], b[3]); t[3] += lo; t[4] += hi;
+
+	UMUL51(t[5], lo, a[4], b[0]); t[4] += lo;
+	UMUL51(hi, lo, a[3], b[1]); t[4] += lo; t[5] += hi;
+	UMUL51(hi, lo, a[2], b[2]); t[4] += lo; t[5] += hi;
+	UMUL51(hi, lo, a[1], b[3]); t[4] += lo; t[5] += hi;
+	UMUL51(hi, lo, a[0], b[4]); t[4] += lo; t[5] += hi;
+
+	UMUL51(t[6], lo, a[4], b[1]); t[5] += lo;
+	UMUL51(hi, lo, a[3], b[2]); t[5] += lo; t[6] += hi;
+	UMUL51(hi, lo, a[2], b[3]); t[5] += lo; t[6] += hi;
+	UMUL51(hi, lo, a[1], b[4]); t[5] += lo; t[6] += hi;
+
+	UMUL51(t[7], lo, a[4], b[2]); t[6] += lo;
+	UMUL51(hi, lo, a[3], b[3]); t[6] += lo; t[7] += hi;
+	UMUL51(hi, lo, a[2], b[4]); t[6] += lo; t[7] += hi;
+
+	UMUL51(t[8], lo, a[4], b[3]); t[7] += lo;
+	UMUL51(hi, lo, a[3], b[4]); t[7] += lo; t[8] += hi;
+
+	UMUL51(t[9], lo, a[4], b[4]); t[8] += lo;
+
+	/*
+	 * The upper words t[5]..t[9] are folded back into the lower
+	 * words, using the rule that 2^255 = 19 in the field.
+	 *
+	 * Since each t[i] is less than 324*2^51, the additions below
+	 * will yield less than 6480*2^51 in each limb; this fits in
+	 * 64 bits (6480*2^51 < 8192*2^51 = 2^64), hence there is
+	 * no overflow.
+	 */
+	t[0] += 19 * t[5];
+	t[1] += 19 * t[6];
+	t[2] += 19 * t[7];
+	t[3] += 19 * t[8];
+	t[4] += 19 * t[9];
+
+	/*
+	 * Propagate carries.
+	 */
+	w = t[0];
+	d[0] = w & MASK51;
+	cc = w >> 51;
+	w = t[1] + cc;
+	d[1] = w & MASK51;
+	cc = w >> 51;
+	w = t[2] + cc;
+	d[2] = w & MASK51;
+	cc = w >> 51;
+	w = t[3] + cc;
+	d[3] = w & MASK51;
+	cc = w >> 51;
+	w = t[4] + cc;
+	d[4] = w & MASK51;
+	cc = w >> 51;
+
+	/*
+	 * Since the limbs were 64-bit values, the top carry is at
+	 * most 8192 (in practice, that cannot be reached). We simply
+	 * performed a partial reduction.
+	 */
+	d[0] += 19 * cc;
+}
+
+/*
+ * Multiplication by A24 = 121665. Input must have limbs of 60 bits at
+ * most; output is partially reduced (d[0] may slightly exceed 2^51).
+ */
+static inline void
+f255_mul_a24(uint64_t *d, const uint64_t *a)
+{
+	uint64_t t[5], cc, w;
+
+	/*
+	 * 121665 = 15 * 8111. We first multiply by 15, with carry
+	 * propagation and partial reduction.
+	 */
+	w = a[0] * 15;
+	t[0] = w & MASK51;
+	cc = w >> 51;
+	w = a[1] * 15 + cc;
+	t[1] = w & MASK51;
+	cc = w >> 51;
+	w = a[2] * 15 + cc;
+	t[2] = w & MASK51;
+	cc = w >> 51;
+	w = a[3] * 15 + cc;
+	t[3] = w & MASK51;
+	cc = w >> 51;
+	w = a[4] * 15 + cc;
+	t[4] = w & MASK51;
+	t[0] += 19 * (w >> 51);	/* top carry folded back: 2^255 = 19 mod p */
+
+	/*
+	 * Then multiplication by 8111. At that point, we know that
+	 * t[0] is less than 2^51 + 19*8192, and other limbs are less
+	 * than 2^51; thus, there will be no overflow.
+	 */
+	w = t[0] * 8111;
+	d[0] = w & MASK51;
+	cc = w >> 51;
+	w = t[1] * 8111 + cc;
+	d[1] = w & MASK51;
+	cc = w >> 51;
+	w = t[2] * 8111 + cc;
+	d[2] = w & MASK51;
+	cc = w >> 51;
+	w = t[3] * 8111 + cc;
+	d[3] = w & MASK51;
+	cc = w >> 51;
+	w = t[4] * 8111 + cc;
+	d[4] = w & MASK51;
+	d[0] += 19 * (w >> 51);	/* same folding; d[0] is not re-masked */
+}
+
+/*
+ * Finalize reduction: a[] is replaced in place with its reduced value.
+ * On input, limbs must fit on 51 bits, except possibly the low limb,
+ * which may be slightly above 2^51.
+ */
+static inline void
+f255_final_reduce(uint64_t *a)
+{
+	uint64_t t[5], cc, w;
+
+	/*
+	 * We add 19. If the result (in t[]) is below 2^255, then a[]
+	 * is already less than 2^255-19, thus already reduced.
+	 * Otherwise, we subtract 2^255 from t[], in which case we
+	 * have t = a - (2^255-19), and that's our result.
+	 */
+	w = a[0] + 19;
+	t[0] = w & MASK51;
+	cc = w >> 51;
+	w = a[1] + cc;
+	t[1] = w & MASK51;
+	cc = w >> 51;
+	w = a[2] + cc;
+	t[2] = w & MASK51;
+	cc = w >> 51;
+	w = a[3] + cc;
+	t[3] = w & MASK51;
+	cc = w >> 51;
+	w = a[4] + cc;
+	t[4] = w & MASK51;	/* masking drops bit 255, i.e. subtracts 2^255 */
+	cc = w >> 51;
+
+	/*
+	 * The bit 255 of t is in cc. If that bit is 0, then a[] must
+	 * be unchanged; otherwise, it must be replaced with t[].
+	 */
+	cc = -cc;	/* 0 stays 0, 1 becomes an all-ones selection mask */
+	a[0] ^= cc & (a[0] ^ t[0]);
+	a[1] ^= cc & (a[1] ^ t[1]);
+	a[2] ^= cc & (a[2] ^ t[2]);
+	a[3] ^= cc & (a[3] ^ t[3]);
+	a[4] ^= cc & (a[4] ^ t[4]);
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	unsigned char k[32];
+	uint64_t x1[5], x2[5], z2[5], x3[5], z3[5];
+	uint32_t swap;
+	int i;
+
+	(void)curve;
+
+	/*
+	 * Points are encoded over exactly 32 bytes. Multipliers must fit
+	 * in 32 bytes as well; otherwise 0 is returned and G is untouched.
+	 */
+	if (Glen != 32 || kblen > 32) {
+		return 0;
+	}
+
+	/*
+	 * RFC 7748 mandates that the high bit of the last point byte must
+	 * be ignored/cleared; the "& MASK51" in the initialization for
+	 * x1[4] clears that bit.
+	 */
+	x1[0] = br_dec64le(&G[0]) & MASK51;
+	x1[1] = (br_dec64le(&G[6]) >> 3) & MASK51;
+	x1[2] = (br_dec64le(&G[12]) >> 6) & MASK51;
+	x1[3] = (br_dec64le(&G[19]) >> 1) & MASK51;
+	x1[4] = (br_dec64le(&G[24]) >> 12) & MASK51;
+
+	/*
+	 * We can use memset() to clear values, because exact-width types
+	 * like uint64_t are guaranteed to have no padding bits or
+	 * trap representations.
+	 */
+	memset(x2, 0, sizeof x2);
+	x2[0] = 1;
+	memset(z2, 0, sizeof z2);
+	memcpy(x3, x1, sizeof x1);
+	memcpy(z3, x2, sizeof x2);
+
+	/*
+	 * The multiplier is provided in big-endian notation, and
+	 * possibly shorter than 32 bytes.
+	 */
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;	/* clear the three low bits of the multiplier */
+	k[0] &= 0x7F;	/* clear bit 255 */
+	k[0] |= 0x40;	/* force bit 254 to 1 */
+
+	swap = 0;
+
+	for (i = 254; i >= 0; i --) {
+		uint64_t a[5], aa[5], b[5], bb[5], e[5];
+		uint64_t c[5], d[5], da[5], cb[5];
+		uint32_t kt;
+
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;	/* multiplier bit i */
+		swap ^= kt;
+		f255_cswap(x2, x3, swap);
+		f255_cswap(z2, z3, swap);
+		swap = kt;
+
+		/*
+		 * At that point, limbs of x_2 and z_2 are assumed to fit
+		 * on at most 52 bits each.
+		 *
+		 * Each f255_add() adds one bit to the maximum range of
+		 * the values, but f255_sub() and f255_mul() bring back
+		 * the limbs into 52 bits. All f255_add() outputs are
+		 * used only as inputs for f255_mul(), which ensures
+		 * that limbs remain in the proper range.
+		 */
+
+		/* A = x_2 + z_2   -- limbs fit on 53 bits each */
+		f255_add(a, x2, z2);
+
+		/* AA = A^2 */
+		f255_mul(aa, a, a);
+
+		/* B = x_2 - z_2 */
+		f255_sub(b, x2, z2);
+
+		/* BB = B^2 */
+		f255_mul(bb, b, b);
+
+		/* E = AA - BB */
+		f255_sub(e, aa, bb);
+
+		/* C = x_3 + z_3   -- limbs fit on 53 bits each */
+		f255_add(c, x3, z3);
+
+		/* D = x_3 - z_3 */
+		f255_sub(d, x3, z3);
+
+		/* DA = D * A */
+		f255_mul(da, d, a);
+
+		/* CB = C * B */
+		f255_mul(cb, c, b);
+
+		/* x_3 = (DA + CB)^2 */
+		f255_add(x3, da, cb);
+		f255_mul(x3, x3, x3);
+
+		/* z_3 = x_1 * (DA - CB)^2 */
+		f255_sub(z3, da, cb);
+		f255_mul(z3, z3, z3);
+		f255_mul(z3, x1, z3);
+
+		/* x_2 = AA * BB */
+		f255_mul(x2, aa, bb);
+
+		/* z_2 = E * (AA + a24 * E) */
+		f255_mul_a24(z2, e);
+		f255_add(z2, aa, z2);
+		f255_mul(z2, e, z2);
+	}
+
+	f255_cswap(x2, x3, swap);	/* apply the swap left pending by bit 0 */
+	f255_cswap(z2, z3, swap);
+
+	/*
+	 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, most of the
+	 * non-squaring multiplications can be shared: x1 receives
+	 * z2^(2^16-1), which is then reused for each 16-bit chunk of
+	 * ones. x1 and x3, now useless, serve as temporaries.
+	 */
+	memcpy(x1, z2, sizeof z2);
+	for (i = 0; i < 15; i ++) {
+		f255_mul(x1, x1, x1);
+		f255_mul(x1, x1, z2);
+	}
+	memcpy(x3, x1, sizeof x1);
+	for (i = 0; i < 14; i ++) {
+		int j;
+
+		for (j = 0; j < 16; j ++) {
+			f255_mul(x3, x3, x3);
+		}
+		f255_mul(x3, x3, x1);
+	}
+	for (i = 14; i >= 0; i --) {	/* low 15 exponent bits: 0x7FEB */
+		f255_mul(x3, x3, x3);
+		if ((0xFFEB >> i) & 1) {
+			f255_mul(x3, z2, x3);
+		}
+	}
+
+	/*
+	 * Compute x2/z2. We have 1/z2 in x3.
+	 */
+	f255_mul(x2, x2, x3);
+	f255_final_reduce(x2);
+
+	/*
+	 * Encode the final x2 value in little-endian. We first assemble
+	 * the limbs into 64-bit values.
+	 */
+	x2[0] |= x2[1] << 51;
+	x2[1] = (x2[1] >> 13) | (x2[2] << 38);
+	x2[2] = (x2[2] >> 26) | (x2[3] << 25);
+	x2[3] = (x2[3] >> 39) | (x2[4] << 12);
+	br_enc64le(G, x2[0]);
+	br_enc64le(G + 8, x2[1]);
+	br_enc64le(G + 16, x2[2]);
+	br_enc64le(G + 24, x2[3]);
+	return 1;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	size_t gen_len;
+	const unsigned char *gen = api_generator(curve, &gen_len);
+
+	/* Seed R with the generator, then multiply it in place by x. */
+	memcpy(R, gen, gen_len);
+	(void)api_mul(R, gen_len, x, xlen, curve);
+	return gen_len;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * Deliberately unsupported: only ECDSA needs this operation, and
+	 * Curve25519 has no ECDSA (its signature scheme is EdDSA). Every
+	 * argument is ignored and failure (0) is always reported.
+	 */
+	(void)curve;
+	(void)ylen;
+	(void)y;
+	(void)xlen;
+	(void)x;
+	(void)len;
+	(void)B;
+	(void)A;
+	return 0;
+}
+
+/* see bearssl_ec.h; function table for this Curve25519 implementation */
+const br_ec_impl br_ec_c25519_m62 = {
+	(uint32_t)0x20000000,
+	api_generator,
+	api_order,
+	api_xoff,
+	api_mul,
+	api_mulgen,
+	api_muladd
+};
+
+/* see bearssl_ec.h; returns the address of the br_ec_c25519_m62 table */
+const br_ec_impl *
+br_ec_c25519_m62_get(void)
+{
+	return &br_ec_c25519_m62;
+}
+
+#else
+
+/* see bearssl_ec.h; #else fallback: always returns a null pointer */
+const br_ec_impl *
+br_ec_c25519_m62_get(void)
+{
+	return 0;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_m64.c b/test/monniaux/BearSSL/src/ec/ec_c25519_m64.c
new file mode 100644
index 00000000..7e7f12f7
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_c25519_m64.c
@@ -0,0 +1,835 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+static const unsigned char GEN[] = {	/* returned by api_generator() */
+	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,	/* value 9 when */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,	/* read as a    */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,	/* 32-byte      */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00	/* little-endian */
+};
+
+static const unsigned char ORDER[] = {	/* returned by api_order() */
+	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,	/* read as big-endian, */
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,	/* this is 2^255 - 1;  */
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,	/* NOTE(review): looks */
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF	/* like a bound; confirm */
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	*len = 32;	/* length in bytes of the encoded generator */
+	(void)curve;	/* the curve identifier plays no role here */
+	return GEN;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	*len = 32;	/* length in bytes of the encoded order */
+	(void)curve;	/* the curve identifier plays no role here */
+	return ORDER;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	*len = 32;	/* reported length: 32 bytes */
+	(void)curve;	/* the curve identifier plays no role here */
+	return 0;	/* reported offset: 0 */
+}
+
+/*
+ * A field element is encoded as four 64-bit integers, in basis 2^64
+ * (low limb first). Operations return partially reduced values, which
+ * may range up to 2^255+37; MASK63 keeps bits 0..62 of the top limb.
+ */
+
+#define MASK63   (((uint64_t)1 << 63) - (uint64_t)1)
+
+/*
+ * Exchange two field elements when ctl is 1; leave them as is when 0.
+ */
+static inline void
+f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
+{
+	uint64_t mask, diff;
+	int i;
+
+	mask = -(uint64_t)ctl;
+	for (i = 0; i < 4; i ++) {
+		diff = mask & (a[i] ^ b[i]); a[i] ^= diff; b[i] ^= diff;
+	}
+}
+
+/*
+ * Addition in the field: d = a + b, partially reduced (<= 2^255+37).
+ */
+static inline void
+f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+	uint64_t t0, t1, t2, t3, cc;
+	unsigned __int128 z;
+
+	z = (unsigned __int128)a[0] + (unsigned __int128)b[0];
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);
+	t3 = (uint64_t)z & MASK63;
+	cc = (uint64_t)(z >> 63);	/* bits 255 and above of the sum */
+
+	/*
+	 * Since operands are at most 2^255+37, the sum is at most
+	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
+	 *
+	 * We use: 2^255 = 19 mod p.
+	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
+	 * the result is at most 2^255+37.
+	 */
+	z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = t3 + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, cc;
+	unsigned char k;
+
+	k = _addcarry_u64(0, a[0], b[0], &t0);
+	k = _addcarry_u64(k, a[1], b[1], &t1);
+	k = _addcarry_u64(k, a[2], b[2], &t2);
+	k = _addcarry_u64(k, a[3], b[3], &t3);
+	cc = (k << 1) + (t3 >> 63);	/* bits 255 and above of the sum */
+	t3 &= MASK63;
+
+	/*
+	 * Since operands are at most 2^255+37, the sum is at most
+	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
+	 *
+	 * We use: 2^255 = 19 mod p.
+	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
+	 * the result is at most 2^255+37.
+	 */
+	k = _addcarry_u64(0, t0, 19 * cc, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Subtraction.
+ * On input, both operands must be partially reduced values, i.e.
+ * at most 2^255+37 each. On output, the result is again partially
+ * reduced, with max value 2^255+37: 2*p is added to the difference
+ * so that the intermediate value never becomes negative.
+ */
+static inline void
+f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+	/*
+	 * We compute t = 2^256 - 38 + a - b, which is necessarily
+	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
+	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
+	 * on the two upper bits of t (bits 255 and 256).
+	 */
+
+	uint64_t t0, t1, t2, t3, t4, cc;
+	unsigned __int128 z;
+
+	z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;
+	t0 = (uint64_t)z;
+	cc = -(uint64_t)(z >> 64);	/* 1 if the limb borrowed, else 0 */
+	z = (unsigned __int128)a[1] - (unsigned __int128)b[1]
+		- (unsigned __int128)cc;
+	t1 = (uint64_t)z;
+	cc = -(uint64_t)(z >> 64);
+	z = (unsigned __int128)a[2] - (unsigned __int128)b[2]
+		- (unsigned __int128)cc;
+	t2 = (uint64_t)z;
+	cc = -(uint64_t)(z >> 64);
+	z = (unsigned __int128)a[3] - (unsigned __int128)b[3]
+		- (unsigned __int128)cc;
+	t3 = (uint64_t)z;
+	t4 = 1 + (uint64_t)(z >> 64);	/* bit 256: the added 2^256, minus borrow */
+
+	/*
+	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
+	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
+	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
+	 * This guarantees that the result is at most 2^255+37.
+	 */
+	cc = (38 & -t4) + (19 & -(t3 >> 63));
+	t3 &= MASK63;
+	z = (unsigned __int128)t0 + (unsigned __int128)cc;
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = t3 + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	/*
+	 * We compute t = 2^256 - 38 + a - b, which is necessarily
+	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
+	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
+	 * on the two upper bits of t (bits 255 and 256).
+	 */
+
+	uint64_t t0, t1, t2, t3, t4;
+	unsigned char k;
+
+	k = _subborrow_u64(0, a[0], b[0], &t0);
+	k = _subborrow_u64(k, a[1], b[1], &t1);
+	k = _subborrow_u64(k, a[2], b[2], &t2);
+	k = _subborrow_u64(k, a[3], b[3], &t3);
+	(void)_subborrow_u64(k, 1, 0, &t4);	/* t4 = 1 - borrow (bit 256) */
+
+	k = _subborrow_u64(0, t0, 38, &t0);
+	k = _subborrow_u64(k, t1, 0, &t1);
+	k = _subborrow_u64(k, t2, 0, &t2);
+	k = _subborrow_u64(k, t3, 0, &t3);
+	(void)_subborrow_u64(k, t4, 0, &t4);
+
+	/*
+	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
+	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
+	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
+	 * This guarantees that the result is at most 2^255+37.
+	 */
+	t4 = (38 & -t4) + (19 & -(t3 >> 63));
+	t3 &= MASK63;
+	k = _addcarry_u64(0, t0, t4, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Multiplication: d = a * b, partially reduced (at most 2^255+37).
+ */
+static inline void
+f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
+{
+#if BR_INT128
+
+	unsigned __int128 z;
+	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
+
+	/*
+	 * Compute the product a*b over plain integers.
+	 */
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[0];
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);
+	t3 = (uint64_t)z;
+	t4 = (uint64_t)(z >> 64);
+
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[0]
+		+ (unsigned __int128)t1;
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[1]
+		+ (unsigned __int128)t2 + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[2]
+		+ (unsigned __int128)t3 + (z >> 64);
+	t3 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * (unsigned __int128)b[3]
+		+ (unsigned __int128)t4 + (z >> 64);
+	t4 = (uint64_t)z;
+	t5 = (uint64_t)(z >> 64);
+
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[0]
+		+ (unsigned __int128)t2;
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[1]
+		+ (unsigned __int128)t3 + (z >> 64);
+	t3 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[2]
+		+ (unsigned __int128)t4 + (z >> 64);
+	t4 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * (unsigned __int128)b[3]
+		+ (unsigned __int128)t5 + (z >> 64);
+	t5 = (uint64_t)z;
+	t6 = (uint64_t)(z >> 64);
+
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[0]
+		+ (unsigned __int128)t3;
+	t3 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[1]
+		+ (unsigned __int128)t4 + (z >> 64);
+	t4 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[2]
+		+ (unsigned __int128)t5 + (z >> 64);
+	t5 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * (unsigned __int128)b[3]
+		+ (unsigned __int128)t6 + (z >> 64);
+	t6 = (uint64_t)z;
+	t7 = (uint64_t)(z >> 64);
+
+	/*
+	 * Modulo p, we have:
+	 *
+	 *   2^255 = 19
+	 *   2^510 = 19*19 = 361
+	 *
+	 * We split the intermediate t into three parts, in basis
+	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
+	 * The upper one can only be a single bit (th), since the
+	 * multiplication operands are at most 2^255+37 each.
+	 */
+	th = t7 >> 62;	/* bits 510 and above of the product */
+	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
+	t6 = (t6 << 1) | (t5 >> 63);
+	t5 = (t5 << 1) | (t4 >> 63);
+	t4 = (t4 << 1) | (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Multiply the middle part (t4..t7) by 19. We truncate it to
+	 * 255 bits; the extra bits will go along with th.
+	 */
+	z = (unsigned __int128)t4 * 19;
+	t4 = (uint64_t)z;
+	z = (unsigned __int128)t5 * 19 + (z >> 64);
+	t5 = (uint64_t)z;
+	z = (unsigned __int128)t6 * 19 + (z >> 64);
+	t6 = (uint64_t)z;
+	z = (unsigned __int128)t7 * 19 + (z >> 64);
+	t7 = (uint64_t)z & MASK63;
+
+	th = (361 & -th) + (19 * (uint64_t)(z >> 63));
+
+	/*
+	 * Add elements together.
+	 * At this point:
+	 *   t0..t3 fits on 255 bits.
+	 *   t4..t7 fits on 255 bits.
+	 *   th <= 361 + 342 = 703.
+	 */
+	z = (unsigned __int128)t0 + (unsigned __int128)t4
+		+ (unsigned __int128)th;
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);
+	t3 = (uint64_t)z & MASK63;
+	th = (uint64_t)(z >> 63);
+
+	/*
+	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
+	 * can only have value 0, 1 or 2. We just add th*19, which
+	 * guarantees a result of at most 2^255+37.
+	 */
+	z = (unsigned __int128)t0 + (19 * th);
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = t3 + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
+	uint64_t h0, h1, h2, h3;
+	unsigned char k;
+
+	/*
+	 * Compute the product a*b over plain integers.
+	 */
+	t0 = _umul128(a[0], b[0], &h0);
+	t1 = _umul128(a[0], b[1], &h1);
+	k = _addcarry_u64(0, t1, h0, &t1);
+	t2 = _umul128(a[0], b[2], &h2);
+	k = _addcarry_u64(k, t2, h1, &t2);
+	t3 = _umul128(a[0], b[3], &h3);
+	k = _addcarry_u64(k, t3, h2, &t3);
+	(void)_addcarry_u64(k, h3, 0, &t4);
+
+	k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);
+	k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);
+	k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);
+	k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);
+	t5 = k;
+	k = _addcarry_u64(0, t2, h0, &t2);
+	k = _addcarry_u64(k, t3, h1, &t3);
+	k = _addcarry_u64(k, t4, h2, &t4);
+	(void)_addcarry_u64(k, t5, h3, &t5);
+
+	k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);
+	k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);
+	k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);
+	k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);
+	t6 = k;
+	k = _addcarry_u64(0, t3, h0, &t3);
+	k = _addcarry_u64(k, t4, h1, &t4);
+	k = _addcarry_u64(k, t5, h2, &t5);
+	(void)_addcarry_u64(k, t6, h3, &t6);
+
+	k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);
+	k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);
+	k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);
+	k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);
+	t7 = k;
+	k = _addcarry_u64(0, t4, h0, &t4);
+	k = _addcarry_u64(k, t5, h1, &t5);
+	k = _addcarry_u64(k, t6, h2, &t6);
+	(void)_addcarry_u64(k, t7, h3, &t7);
+
+	/*
+	 * Modulo p, we have:
+	 *
+	 *   2^255 = 19
+	 *   2^510 = 19*19 = 361
+	 *
+	 * We split the intermediate t into three parts, in basis
+	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
+	 * The upper one can only be a single bit (th), since the
+	 * multiplication operands are at most 2^255+37 each.
+	 */
+	th = t7 >> 62;	/* bits 510 and above of the product */
+	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
+	t6 = (t6 << 1) | (t5 >> 63);
+	t5 = (t5 << 1) | (t4 >> 63);
+	t4 = (t4 << 1) | (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Multiply the middle part (t4..t7) by 19. We truncate it to
+	 * 255 bits; the extra bits will go along with th.
+	 */
+	t4 = _umul128(t4, 19, &h0);
+	t5 = _umul128(t5, 19, &h1);
+	t6 = _umul128(t6, 19, &h2);
+	t7 = _umul128(t7, 19, &h3);
+	k = _addcarry_u64(0, t5, h0, &t5);
+	k = _addcarry_u64(k, t6, h1, &t6);
+	k = _addcarry_u64(k, t7, h2, &t7);
+	(void)_addcarry_u64(k, h3, 0, &h3);
+	th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));
+	t7 &= MASK63;
+
+	/*
+	 * Add elements together.
+	 * At this point:
+	 *   t0..t3 fits on 255 bits.
+	 *   t4..t7 fits on 255 bits.
+	 *   th <= 361 + 342 = 703.
+	 */
+	k = _addcarry_u64(0, t0, t4, &t0);
+	k = _addcarry_u64(k, t1, t5, &t1);
+	k = _addcarry_u64(k, t2, t6, &t2);
+	k = _addcarry_u64(k, t3, t7, &t3);
+	t4 = k;
+	k = _addcarry_u64(0, t0, th, &t0);
+	k = _addcarry_u64(k, t1, 0, &t1);
+	k = _addcarry_u64(k, t2, 0, &t2);
+	k = _addcarry_u64(k, t3, 0, &t3);
+	(void)_addcarry_u64(k, t4, 0, &t4);
+
+	th = (t4 << 1) + (t3 >> 63);
+	t3 &= MASK63;
+
+	/*
+	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
+	 * can only have value 0, 1 or 2. We just add th*19, which
+	 * guarantees a result of at most 2^255+37.
+	 */
+	k = _addcarry_u64(0, t0, 19 * th, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Multiplication by A24 = 121665: d = 121665 * a, folded below 2^255+37.
+ */
+static inline void
+f255_mul_a24(uint64_t *d, const uint64_t *a)
+{
+#if BR_INT128
+
+	uint64_t t0, t1, t2, t3;
+	unsigned __int128 z;
+
+	z = (unsigned __int128)a[0] * 121665;
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[1] * 121665 + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[2] * 121665 + (z >> 64);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[3] * 121665 + (z >> 64);
+	t3 = (uint64_t)z & MASK63;
+
+	/* First folding: bits 255 and above, times 19 (2^255 = 19 mod p). */
+	z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63));
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	t2 = (uint64_t)z;
+	t3 = t3 + (uint64_t)(z >> 64);
+
+	/* Second folding: the first one may have carried into bit 255. */
+	z = (unsigned __int128)t0 + (19 & -(t3 >> 63));
+	d[0] = (uint64_t)z;
+	z = (unsigned __int128)t1 + (z >> 64);
+	d[1] = (uint64_t)z;
+	z = (unsigned __int128)t2 + (z >> 64);
+	d[2] = (uint64_t)z;
+	d[3] = (t3 & MASK63) + (uint64_t)(z >> 64);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3;
+	unsigned char k;
+
+	t0 = _umul128(a[0], 121665, &h0);
+	t1 = _umul128(a[1], 121665, &h1);
+	k = _addcarry_u64(0, t1, h0, &t1);
+	t2 = _umul128(a[2], 121665, &h2);
+	k = _addcarry_u64(k, t2, h1, &t2);
+	t3 = _umul128(a[3], 121665, &h3);
+	k = _addcarry_u64(k, t3, h2, &t3);
+	(void)_addcarry_u64(k, h3, 0, &t4);
+
+	/* First folding: bits 255 and above, times 19 (2^255 = 19 mod p). */
+	t4 = (t4 << 1) + (t3 >> 63);
+	t3 &= MASK63;
+	k = _addcarry_u64(0, t0, 19 * t4, &t0);
+	k = _addcarry_u64(k, t1, 0, &t1);
+	k = _addcarry_u64(k, t2, 0, &t2);
+	(void)_addcarry_u64(k, t3, 0, &t3);
+
+	/* Second folding: the first one may have carried into bit 255. */
+	t4 = 19 & -(t3 >> 63);
+	t3 &= MASK63;
+	k = _addcarry_u64(0, t0, t4, &d[0]);
+	k = _addcarry_u64(k, t1, 0, &d[1]);
+	k = _addcarry_u64(k, t2, 0, &d[2]);
+	(void)_addcarry_u64(k, t3, 0, &d[3]);
+
+#endif
+}
+
+/*
+ * Finalize reduction: a[] is replaced in place with its reduced value.
+ */
+static inline void
+f255_final_reduce(uint64_t *a)
+{
+#if BR_INT128
+
+	uint64_t t0, t1, t2, t3, m;
+	unsigned __int128 z;
+
+	/*
+	 * We add 19. If the result (in t) is below 2^255, then a[]
+	 * is already less than 2^255-19, thus already reduced.
+	 * Otherwise, we subtract 2^255 from t[], in which case we
+	 * have t = a - (2^255-19), and that's our result.
+	 */
+	z = (unsigned __int128)a[0] + 19;
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[1] + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[2] + (z >> 64);
+	t2 = (uint64_t)z;
+	t3 = a[3] + (uint64_t)(z >> 64);
+
+	m = -(t3 >> 63);	/* all-ones if bit 255 of t is set, else 0 */
+	t3 &= MASK63;		/* drop bit 255, i.e. subtract 2^255 */
+	a[0] ^= m & (a[0] ^ t0);
+	a[1] ^= m & (a[1] ^ t1);
+	a[2] ^= m & (a[2] ^ t2);
+	a[3] ^= m & (a[3] ^ t3);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, m;
+	unsigned char k;
+
+	/*
+	 * We add 19. If the result (in t) is below 2^255, then a[]
+	 * is already less than 2^255-19, thus already reduced.
+	 * Otherwise, we subtract 2^255 from t[], in which case we
+	 * have t = a - (2^255-19), and that's our result.
+	 */
+	k = _addcarry_u64(0, a[0], 19, &t0);
+	k = _addcarry_u64(k, a[1], 0, &t1);
+	k = _addcarry_u64(k, a[2], 0, &t2);
+	(void)_addcarry_u64(k, a[3], 0, &t3);
+
+	m = -(t3 >> 63);	/* all-ones if bit 255 of t is set, else 0 */
+	t3 &= MASK63;		/* drop bit 255, i.e. subtract 2^255 */
+	a[0] ^= m & (a[0] ^ t0);
+	a[1] ^= m & (a[1] ^ t1);
+	a[2] ^= m & (a[2] ^ t2);
+	a[3] ^= m & (a[3] ^ t3);
+
+#endif
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *kb, size_t kblen, int curve)
+{
+	unsigned char k[32];
+	uint64_t x1[4], x2[4], z2[4], x3[4], z3[4];
+	uint32_t swap;
+	int i;
+
+	(void)curve;
+
+	/*
+	 * Points are encoded over exactly 32 bytes. Multipliers must fit
+	 * in 32 bytes as well; otherwise 0 is returned and G is untouched.
+	 */
+	if (Glen != 32 || kblen > 32) {
+		return 0;
+	}
+
+	/*
+	 * RFC 7748 mandates that the high bit of the last point byte must
+	 * be ignored/cleared; the "& MASK63" on x1[3] clears that bit.
+	 */
+	x1[0] = br_dec64le(&G[ 0]);
+	x1[1] = br_dec64le(&G[ 8]);
+	x1[2] = br_dec64le(&G[16]);
+	x1[3] = br_dec64le(&G[24]) & MASK63;
+
+	/*
+	 * We can use memset() to clear values, because exact-width types
+	 * like uint64_t are guaranteed to have no padding bits or
+	 * trap representations.
+	 */
+	memset(x2, 0, sizeof x2);
+	x2[0] = 1;
+	memset(z2, 0, sizeof z2);
+	memcpy(x3, x1, sizeof x1);
+	memcpy(z3, x2, sizeof x2);
+
+	/*
+	 * The multiplier is provided in big-endian notation, and
+	 * possibly shorter than 32 bytes.
+	 */
+	memset(k, 0, (sizeof k) - kblen);
+	memcpy(k + (sizeof k) - kblen, kb, kblen);
+	k[31] &= 0xF8;	/* clear the three low bits of the multiplier */
+	k[0] &= 0x7F;	/* clear bit 255 */
+	k[0] |= 0x40;	/* force bit 254 to 1 */
+
+	swap = 0;
+
+	for (i = 254; i >= 0; i --) {
+		uint64_t a[4], aa[4], b[4], bb[4], e[4];
+		uint64_t c[4], d[4], da[4], cb[4];
+		uint32_t kt;
+
+		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;	/* multiplier bit i */
+		swap ^= kt;
+		f255_cswap(x2, x3, swap);
+		f255_cswap(z2, z3, swap);
+		swap = kt;
+
+		/* A = x_2 + z_2 */
+		f255_add(a, x2, z2);
+
+		/* AA = A^2 */
+		f255_mul(aa, a, a);
+
+		/* B = x_2 - z_2 */
+		f255_sub(b, x2, z2);
+
+		/* BB = B^2 */
+		f255_mul(bb, b, b);
+
+		/* E = AA - BB */
+		f255_sub(e, aa, bb);
+
+		/* C = x_3 + z_3 */
+		f255_add(c, x3, z3);
+
+		/* D = x_3 - z_3 */
+		f255_sub(d, x3, z3);
+
+		/* DA = D * A */
+		f255_mul(da, d, a);
+
+		/* CB = C * B */
+		f255_mul(cb, c, b);
+
+		/* x_3 = (DA + CB)^2 */
+		f255_add(x3, da, cb);
+		f255_mul(x3, x3, x3);
+
+		/* z_3 = x_1 * (DA - CB)^2 */
+		f255_sub(z3, da, cb);
+		f255_mul(z3, z3, z3);
+		f255_mul(z3, x1, z3);
+
+		/* x_2 = AA * BB */
+		f255_mul(x2, aa, bb);
+
+		/* z_2 = E * (AA + a24 * E) */
+		f255_mul_a24(z2, e);
+		f255_add(z2, aa, z2);
+		f255_mul(z2, e, z2);
+	}
+
+	f255_cswap(x2, x3, swap);	/* apply the swap left pending by bit 0 */
+	f255_cswap(z2, z3, swap);
+
+	/*
+	 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, most of the
+	 * non-squaring multiplications can be shared: x1 receives
+	 * z2^(2^16-1), which is then reused for each 16-bit chunk of
+	 * ones. x1 and x3, now useless, serve as temporaries.
+	 */
+	memcpy(x1, z2, sizeof z2);
+	for (i = 0; i < 15; i ++) {
+		f255_mul(x1, x1, x1);
+		f255_mul(x1, x1, z2);
+	}
+	memcpy(x3, x1, sizeof x1);
+	for (i = 0; i < 14; i ++) {
+		int j;
+
+		for (j = 0; j < 16; j ++) {
+			f255_mul(x3, x3, x3);
+		}
+		f255_mul(x3, x3, x1);
+	}
+	for (i = 14; i >= 0; i --) {	/* low 15 exponent bits: 0x7FEB */
+		f255_mul(x3, x3, x3);
+		if ((0xFFEB >> i) & 1) {
+			f255_mul(x3, z2, x3);
+		}
+	}
+
+	/*
+	 * Compute x2/z2. We have 1/z2 in x3.
+	 */
+	f255_mul(x2, x2, x3);
+	f255_final_reduce(x2);
+
+	/*
+	 * Encode the final x2 value in little-endian.
+	 */
+	br_enc64le(G,      x2[0]);
+	br_enc64le(G +  8, x2[1]);
+	br_enc64le(G + 16, x2[2]);
+	br_enc64le(G + 24, x2[3]);
+	return 1;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	size_t gen_len;
+	const unsigned char *gen = api_generator(curve, &gen_len);
+
+	/* Seed R with the generator, then multiply it in place by x. */
+	memcpy(R, gen, gen_len);
+	(void)api_mul(R, gen_len, x, xlen, curve);
+	return gen_len;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * Deliberately unsupported: only ECDSA needs this operation, and
+	 * Curve25519 has no ECDSA (its signature scheme is EdDSA). Every
+	 * argument is ignored and failure (0) is always reported.
+	 */
+	(void)curve;
+	(void)ylen;
+	(void)y;
+	(void)xlen;
+	(void)x;
+	(void)len;
+	(void)B;
+	(void)A;
+	return 0;
+}
+
+/* see bearssl_ec.h; function table for this Curve25519 implementation */
+const br_ec_impl br_ec_c25519_m64 = {
+	(uint32_t)0x20000000,
+	api_generator,
+	api_order,
+	api_xoff,
+	api_mul,
+	api_mulgen,
+	api_muladd
+};
+
+/* see bearssl_ec.h; returns the address of the br_ec_c25519_m64 table */
+const br_ec_impl *
+br_ec_c25519_m64_get(void)
+{
+	return &br_ec_c25519_m64;
+}
+
+#else
+
+/* see bearssl_ec.h; built without BR_INT128/BR_UMUL128: returns null */
+const br_ec_impl *
+br_ec_c25519_m64_get(void)
+{
+	return 0;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/ec/ec_curve25519.c b/test/monniaux/BearSSL/src/ec/ec_curve25519.c
new file mode 100644
index 00000000..a47d215e
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_curve25519.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+static const unsigned char GEN[] = {	/* generator bytes of br_curve25519 */
+	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,	/* 32 bytes: 0x09 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,	/* followed by    */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,	/* 31 zero bytes  */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static const unsigned char ORDER[] = {	/* order bytes of br_curve25519 */
+	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,	/* read as big-endian, */
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,	/* this is 2^255 - 1;  */
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,	/* NOTE(review): looks */
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF	/* like a bound; confirm */
+};
+
+/* see inner.h; field meanings below are inferred from the values, confirm */
+const br_ec_curve_def br_curve25519 = {
+	BR_EC_curve25519,	/* curve identifier */
+	ORDER, sizeof ORDER,	/* order bytes and their length (32) */
+	GEN, sizeof GEN		/* generator bytes and their length (32) */
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_default.c b/test/monniaux/BearSSL/src/ec/ec_default.c
new file mode 100644
index 00000000..7bb6e0c7
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_default.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_ec.h; the choice is made at compile time from BR_LOMUL */
+const br_ec_impl *
+br_ec_get_default(void)
+{
+#if !BR_LOMUL
+	return &br_ec_all_m31;
+#else
+	return &br_ec_all_m15;
+#endif
+}
diff --git a/test/monniaux/BearSSL/src/ec/ec_keygen.c b/test/monniaux/BearSSL/src/ec/ec_keygen.c
new file mode 100644
index 00000000..02a30962
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_keygen.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_ec.h */
+size_t
+br_ec_keygen(const br_prng_class **rng_ctx,
+	const br_ec_impl *impl, br_ec_private_key *sk,
+	void *kbuf, int curve)
+{
+	const unsigned char *ord;
+	unsigned char *key;
+	size_t klen, i;
+	unsigned top, bw, nz;
+
+	/* Reject curve identifiers outside 0..31 or not supported by impl. */
+	if (curve < 0 || curve >= 32) {
+		return 0;
+	}
+	if (((impl->supported_curves >> curve) & 1) == 0) {
+		return 0;
+	}
+
+	/* Fetch the curve order and skip its leading zero bytes. */
+	ord = impl->order(curve, &klen);
+	for (; klen > 0 && *ord == 0; klen --) {
+		ord ++;
+	}
+	/* Without an output buffer, only the key length is reported. */
+	if (kbuf == NULL || klen == 0) {
+		return klen;
+	}
+
+	/* Smear the highest set bit of the top order byte over lower bits. */
+	top = ord[0];
+	top |= (top >> 1);
+	top |= (top >> 2);
+	top |= (top >> 4);
+
+	/*
+	 * Draw random strings of the right bit length until one is
+	 * non-zero and strictly below the order. The comparison is a
+	 * byte-wise subtraction running from the last byte to the
+	 * first; its final borrow (bw) is 1 exactly when key < order.
+	 */
+	key = kbuf;
+	do {
+		(*rng_ctx)->generate(rng_ctx, key, klen);
+		key[0] &= top;
+		bw = 0;
+		nz = 0;
+		for (i = klen; i -- > 0;) {
+			bw = ((unsigned)(key[i] - ord[i] - bw) >> 8) & 1;
+			nz |= key[i];
+		}
+	} while (bw == 0 || nz == 0);
+
+	if (sk != NULL) {
+		sk->curve = curve;
+		sk->x = key;
+		sk->xlen = klen;
+	}
+	return klen;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ec_p256_m15.c b/test/monniaux/BearSSL/src/ec/ec_p256_m15.c
new file mode 100644
index 00000000..8d68d1d2
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_p256_m15.c
@@ -0,0 +1,2130 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
+ * that right-shifting a signed negative integer copies the sign bit
+ * (arithmetic right-shift). This is "implementation-defined behaviour",
+ * i.e. it is not undefined, but it may differ between compilers. Each
+ * compiler is supposed to document its behaviour in that respect. GCC
+ * explicitly defines that an arithmetic right shift is used. We expect
+ * all other compilers to do the same, because the underlying CPUs offer
+ * an arithmetic right shift opcode that could not be used otherwise.
+ */
+#if BR_NO_ARITH_SHIFT	/* emulation with unsigned shifts; needs 0 < n < 32 */
+#define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
+                    | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
+#else	/* x must be a 32-bit lvalue: it is re-read through an int32_t pointer */
+#define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
+#endif
+
+/*
+ * Decode an unsigned big-endian integer (len bytes at src) into 13-bit
+ * words stored at dst in little-endian order. Only complete words are
+ * written; the leftover high bits (the "partial" word) are returned.
+ */
+static uint32_t
+be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
+{
+	uint32_t bits = 0;
+	int nbits = 0;
+	size_t i;
+
+	for (i = len; i > 0; i --) {
+		bits |= (uint32_t)src[i - 1] << nbits;
+		nbits += 8;
+		if (nbits < 13) {
+			continue;
+		}
+		*dst ++ = bits & 0x1FFF;
+		bits >>= 13;
+		nbits -= 13;
+	}
+	return bits;
+}
+
+/*
+ * Encode an integer held as little-endian 13-bit words (src) into
+ * exactly len bytes of unsigned big-endian output (dst). Every byte
+ * of dst is written, starting with the last (least significant) one.
+ */
+static void
+le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
+{
+	uint32_t bits = 0;
+	int nbits = 0;
+	size_t i;
+
+	for (i = len; i > 0; i --) {
+		/* Refill from the next word when under one byte is buffered. */
+		if (nbits < 8) {
+			bits |= *src ++ << nbits;
+			nbits += 13;
+		}
+		dst[i - 1] = (unsigned char)bits;
+		bits >>= 8;
+		nbits -= 8;
+	}
+}
+
+/*
+ * Carry-propagate len words (read as signed) from w into d, leaving
+ * exactly 13 bits per word, and return the final carry. d and w may
+ * be the very same array, but must not overlap only partially.
+ */
+static inline uint32_t
+norm13(uint32_t *d, const uint32_t *w, size_t len)
+{
+	uint32_t carry = 0;
+	size_t i = 0;
+
+	while (i < len) {
+		int32_t acc;
+
+		/* Read w[i] before writing d[i]: d and w may alias. */
+		acc = w[i] + carry;
+		carry = ARSH(acc, 13);
+		d[i ++] = acc & 0x1FFF;
+	}
+	return carry;
+}
+
+/*
+ * mul20() multiplies two 260-bit integers together. Each word must fit
+ * on 13 bits; source operands use 20 words, destination operand
+ * receives 40 words. All overlaps allowed.
+ *
+ * square20() computes the square of a 260-bit integer. Each word must
+ * fit on 13 bits; source operand uses 20 words, destination operand
+ * receives 40 words. All overlaps allowed.
+ */
+
+#if BR_SLOW_MUL15
+
+static void
+mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	/*
+	 * Two-level Karatsuba: turns a 20x20 multiplication into
+	 * nine 5x5 multiplications. We use 13-bit words but do not
+	 * propagate carries immediately, so words may expand:
+	 *
+	 *  - First Karatsuba decomposition turns the 20x20 mul on
+	 *    13-bit words into three 10x10 muls, two on 13-bit words
+	 *    and one on 14-bit words.
+	 *
+	 *  - Second Karatsuba decomposition further splits these into:
+	 *
+	 *     * four 5x5 muls on 13-bit words
+	 *     * four 5x5 muls on 14-bit words
+	 *     * one 5x5 mul on 15-bit words
+	 *
+	 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
+	 * or 15-bit words, respectively.
+	 */
+	uint32_t u[45], v[45], w[90];	/* 9 operands x 5 words; 9 products x 10 words */
+	uint32_t cc;
+	int i;
+
+#define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
+		(dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
+			+ (s2w)[5 * (s2_off) + 0]; \
+		(dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
+			+ (s2w)[5 * (s2_off) + 1]; \
+		(dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
+			+ (s2w)[5 * (s2_off) + 2]; \
+		(dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
+			+ (s2w)[5 * (s2_off) + 3]; \
+		(dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
+			+ (s2w)[5 * (s2_off) + 4]; \
+	} while (0)
+
+#define ZADDT(dw, d_off, sw, s_off)   do { \
+		(dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
+		(dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
+		(dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
+		(dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
+		(dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
+	} while (0)
+
+#define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
+		(dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
+			+ (s2w)[5 * (s2_off) + 0]; \
+		(dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
+			+ (s2w)[5 * (s2_off) + 1]; \
+		(dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
+			+ (s2w)[5 * (s2_off) + 2]; \
+		(dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
+			+ (s2w)[5 * (s2_off) + 3]; \
+		(dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
+			+ (s2w)[5 * (s2_off) + 4]; \
+	} while (0)
+
+#define CPR1(w, cprcc)   do { \
+		uint32_t cprz = (w) + cprcc; \
+		(w) = cprz & 0x1FFF; \
+		cprcc = cprz >> 13; \
+	} while (0)
+
+#define CPR(dw, d_off)   do { \
+		uint32_t cprcc; \
+		cprcc = 0; \
+		CPR1((dw)[(d_off) + 0], cprcc); \
+		CPR1((dw)[(d_off) + 1], cprcc); \
+		CPR1((dw)[(d_off) + 2], cprcc); \
+		CPR1((dw)[(d_off) + 3], cprcc); \
+		CPR1((dw)[(d_off) + 4], cprcc); \
+		CPR1((dw)[(d_off) + 5], cprcc); \
+		CPR1((dw)[(d_off) + 6], cprcc); \
+		CPR1((dw)[(d_off) + 7], cprcc); \
+		CPR1((dw)[(d_off) + 8], cprcc); \
+		(dw)[(d_off) + 9] = cprcc; \
+	} while (0)
+
+	memcpy(u, a, 20 * sizeof *a);
+	ZADD(u, 4, a, 0, a, 1);
+	ZADD(u, 5, a, 2, a, 3);
+	ZADD(u, 6, a, 0, a, 2);
+	ZADD(u, 7, a, 1, a, 3);
+	ZADD(u, 8, u, 6, u, 7);
+
+	memcpy(v, b, 20 * sizeof *b);
+	ZADD(v, 4, b, 0, b, 1);
+	ZADD(v, 5, b, 2, b, 3);
+	ZADD(v, 6, b, 0, b, 2);
+	ZADD(v, 7, b, 1, b, 3);
+	ZADD(v, 8, v, 6, v, 7);
+
+	/*
+	 * Do the first eight 5x5 muls. Source words are at most 16382
+	 * each, so we can add product results together "as is" in 32-bit
+	 * words.
+	 */
+	for (i = 0; i < 40; i += 5) {
+		w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
+		w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
+			+ MUL15(u[i + 1], v[i + 0]);
+		w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
+			+ MUL15(u[i + 1], v[i + 1])
+			+ MUL15(u[i + 2], v[i + 0]);
+		w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
+			+ MUL15(u[i + 1], v[i + 2])
+			+ MUL15(u[i + 2], v[i + 1])
+			+ MUL15(u[i + 3], v[i + 0]);
+		w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
+			+ MUL15(u[i + 1], v[i + 3])
+			+ MUL15(u[i + 2], v[i + 2])
+			+ MUL15(u[i + 3], v[i + 1])
+			+ MUL15(u[i + 4], v[i + 0]);
+		w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
+			+ MUL15(u[i + 2], v[i + 3])
+			+ MUL15(u[i + 3], v[i + 2])
+			+ MUL15(u[i + 4], v[i + 1]);
+		w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
+			+ MUL15(u[i + 3], v[i + 3])
+			+ MUL15(u[i + 4], v[i + 2]);
+		w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
+			+ MUL15(u[i + 4], v[i + 3]);
+		w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
+		w[(i << 1) + 9] = 0;
+	}
+
+	/*
+	 * For the 9th multiplication, source words are up to 32764,
+	 * so we must do some carry propagation. If we add up to
+	 * 4 products and the carry is no more than 524224, then the
+	 * result fits in 32 bits, and the next carry will be no more
+	 * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
+	 *
+	 * We thus just skip one of the products in the middle word,
+	 * then do a carry propagation (this reduces words to 13 bits
+	 * each, except possibly the last, which may use up to 17 bits
+	 * or so), then add the missing product.
+	 */
+	w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
+	w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
+		+ MUL15(u[40 + 1], v[40 + 0]);
+	w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
+		+ MUL15(u[40 + 1], v[40 + 1])
+		+ MUL15(u[40 + 2], v[40 + 0]);
+	w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
+		+ MUL15(u[40 + 1], v[40 + 2])
+		+ MUL15(u[40 + 2], v[40 + 1])
+		+ MUL15(u[40 + 3], v[40 + 0]);
+	w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
+		+ MUL15(u[40 + 1], v[40 + 3])
+		+ MUL15(u[40 + 2], v[40 + 2])
+		+ MUL15(u[40 + 3], v[40 + 1]);
+		/* + MUL15(u[40 + 4], v[40 + 0]) */
+	w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
+		+ MUL15(u[40 + 2], v[40 + 3])
+		+ MUL15(u[40 + 3], v[40 + 2])
+		+ MUL15(u[40 + 4], v[40 + 1]);
+	w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
+		+ MUL15(u[40 + 3], v[40 + 3])
+		+ MUL15(u[40 + 4], v[40 + 2]);
+	w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
+		+ MUL15(u[40 + 4], v[40 + 3]);
+	w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
+
+	CPR(w, 80);
+
+	w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
+
+	/*
+	 * The products on 14-bit words in slots 6 and 7 yield values
+	 * up to 5*(16382^2) each, and we need to subtract two such
+	 * values from the higher word. We need the subtraction to fit
+	 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
+	 * However, 10*(16382^2) does not fit. So we must perform a
+	 * bit of reduction here.
+	 */
+	CPR(w, 60);
+	CPR(w, 70);
+
+	/*
+	 * Recompose results.
+	 */
+
+	/* 0..1*0..1 into 0..3 */
+	ZSUB2F(w, 8, w, 0, w, 2);
+	ZSUB2F(w, 9, w, 1, w, 3);
+	ZADDT(w, 1, w, 8);
+	ZADDT(w, 2, w, 9);
+
+	/* 2..3*2..3 into 4..7 */
+	ZSUB2F(w, 10, w, 4, w, 6);
+	ZSUB2F(w, 11, w, 5, w, 7);
+	ZADDT(w, 5, w, 10);
+	ZADDT(w, 6, w, 11);
+
+	/* (0..1+2..3)*(0..1+2..3) into 12..15 */
+	ZSUB2F(w, 16, w, 12, w, 14);
+	ZSUB2F(w, 17, w, 13, w, 15);
+	ZADDT(w, 13, w, 16);
+	ZADDT(w, 14, w, 17);
+
+	/* first-level recomposition */
+	ZSUB2F(w, 12, w, 0, w, 4);
+	ZSUB2F(w, 13, w, 1, w, 5);
+	ZSUB2F(w, 14, w, 2, w, 6);
+	ZSUB2F(w, 15, w, 3, w, 7);
+	ZADDT(w, 2, w, 12);
+	ZADDT(w, 3, w, 13);
+	ZADDT(w, 4, w, 14);
+	ZADDT(w, 5, w, 15);
+
+	/*
+	 * Perform carry propagation to bring all words down to 13 bits.
+	 */
+	cc = norm13(d, w, 40);
+	d[39] += (cc << 13);	/* fold the final carry back into the top word */
+
+#undef ZADD
+#undef ZADDT
+#undef ZSUB2F
+#undef CPR1
+#undef CPR
+}
+
+static inline void
+square20(uint32_t *d, const uint32_t *a)
+{
+	mul20(d, a, a);	/* BR_SLOW_MUL15 build: squaring reuses the generic multiply */
+}
+
+#else
+
+static void
+mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t t[39];	/* t[k]: sum of all products a[i]*b[j] with i + j == k */
+
+	t[ 0] = MUL15(a[ 0], b[ 0]);
+	t[ 1] = MUL15(a[ 0], b[ 1])
+		+ MUL15(a[ 1], b[ 0]);
+	t[ 2] = MUL15(a[ 0], b[ 2])
+		+ MUL15(a[ 1], b[ 1])
+		+ MUL15(a[ 2], b[ 0]);
+	t[ 3] = MUL15(a[ 0], b[ 3])
+		+ MUL15(a[ 1], b[ 2])
+		+ MUL15(a[ 2], b[ 1])
+		+ MUL15(a[ 3], b[ 0]);
+	t[ 4] = MUL15(a[ 0], b[ 4])
+		+ MUL15(a[ 1], b[ 3])
+		+ MUL15(a[ 2], b[ 2])
+		+ MUL15(a[ 3], b[ 1])
+		+ MUL15(a[ 4], b[ 0]);
+	t[ 5] = MUL15(a[ 0], b[ 5])
+		+ MUL15(a[ 1], b[ 4])
+		+ MUL15(a[ 2], b[ 3])
+		+ MUL15(a[ 3], b[ 2])
+		+ MUL15(a[ 4], b[ 1])
+		+ MUL15(a[ 5], b[ 0]);
+	t[ 6] = MUL15(a[ 0], b[ 6])
+		+ MUL15(a[ 1], b[ 5])
+		+ MUL15(a[ 2], b[ 4])
+		+ MUL15(a[ 3], b[ 3])
+		+ MUL15(a[ 4], b[ 2])
+		+ MUL15(a[ 5], b[ 1])
+		+ MUL15(a[ 6], b[ 0]);
+	t[ 7] = MUL15(a[ 0], b[ 7])
+		+ MUL15(a[ 1], b[ 6])
+		+ MUL15(a[ 2], b[ 5])
+		+ MUL15(a[ 3], b[ 4])
+		+ MUL15(a[ 4], b[ 3])
+		+ MUL15(a[ 5], b[ 2])
+		+ MUL15(a[ 6], b[ 1])
+		+ MUL15(a[ 7], b[ 0]);
+	t[ 8] = MUL15(a[ 0], b[ 8])
+		+ MUL15(a[ 1], b[ 7])
+		+ MUL15(a[ 2], b[ 6])
+		+ MUL15(a[ 3], b[ 5])
+		+ MUL15(a[ 4], b[ 4])
+		+ MUL15(a[ 5], b[ 3])
+		+ MUL15(a[ 6], b[ 2])
+		+ MUL15(a[ 7], b[ 1])
+		+ MUL15(a[ 8], b[ 0]);
+	t[ 9] = MUL15(a[ 0], b[ 9])
+		+ MUL15(a[ 1], b[ 8])
+		+ MUL15(a[ 2], b[ 7])
+		+ MUL15(a[ 3], b[ 6])
+		+ MUL15(a[ 4], b[ 5])
+		+ MUL15(a[ 5], b[ 4])
+		+ MUL15(a[ 6], b[ 3])
+		+ MUL15(a[ 7], b[ 2])
+		+ MUL15(a[ 8], b[ 1])
+		+ MUL15(a[ 9], b[ 0]);
+	t[10] = MUL15(a[ 0], b[10])
+		+ MUL15(a[ 1], b[ 9])
+		+ MUL15(a[ 2], b[ 8])
+		+ MUL15(a[ 3], b[ 7])
+		+ MUL15(a[ 4], b[ 6])
+		+ MUL15(a[ 5], b[ 5])
+		+ MUL15(a[ 6], b[ 4])
+		+ MUL15(a[ 7], b[ 3])
+		+ MUL15(a[ 8], b[ 2])
+		+ MUL15(a[ 9], b[ 1])
+		+ MUL15(a[10], b[ 0]);
+	t[11] = MUL15(a[ 0], b[11])
+		+ MUL15(a[ 1], b[10])
+		+ MUL15(a[ 2], b[ 9])
+		+ MUL15(a[ 3], b[ 8])
+		+ MUL15(a[ 4], b[ 7])
+		+ MUL15(a[ 5], b[ 6])
+		+ MUL15(a[ 6], b[ 5])
+		+ MUL15(a[ 7], b[ 4])
+		+ MUL15(a[ 8], b[ 3])
+		+ MUL15(a[ 9], b[ 2])
+		+ MUL15(a[10], b[ 1])
+		+ MUL15(a[11], b[ 0]);
+	t[12] = MUL15(a[ 0], b[12])
+		+ MUL15(a[ 1], b[11])
+		+ MUL15(a[ 2], b[10])
+		+ MUL15(a[ 3], b[ 9])
+		+ MUL15(a[ 4], b[ 8])
+		+ MUL15(a[ 5], b[ 7])
+		+ MUL15(a[ 6], b[ 6])
+		+ MUL15(a[ 7], b[ 5])
+		+ MUL15(a[ 8], b[ 4])
+		+ MUL15(a[ 9], b[ 3])
+		+ MUL15(a[10], b[ 2])
+		+ MUL15(a[11], b[ 1])
+		+ MUL15(a[12], b[ 0]);
+	t[13] = MUL15(a[ 0], b[13])
+		+ MUL15(a[ 1], b[12])
+		+ MUL15(a[ 2], b[11])
+		+ MUL15(a[ 3], b[10])
+		+ MUL15(a[ 4], b[ 9])
+		+ MUL15(a[ 5], b[ 8])
+		+ MUL15(a[ 6], b[ 7])
+		+ MUL15(a[ 7], b[ 6])
+		+ MUL15(a[ 8], b[ 5])
+		+ MUL15(a[ 9], b[ 4])
+		+ MUL15(a[10], b[ 3])
+		+ MUL15(a[11], b[ 2])
+		+ MUL15(a[12], b[ 1])
+		+ MUL15(a[13], b[ 0]);
+	t[14] = MUL15(a[ 0], b[14])
+		+ MUL15(a[ 1], b[13])
+		+ MUL15(a[ 2], b[12])
+		+ MUL15(a[ 3], b[11])
+		+ MUL15(a[ 4], b[10])
+		+ MUL15(a[ 5], b[ 9])
+		+ MUL15(a[ 6], b[ 8])
+		+ MUL15(a[ 7], b[ 7])
+		+ MUL15(a[ 8], b[ 6])
+		+ MUL15(a[ 9], b[ 5])
+		+ MUL15(a[10], b[ 4])
+		+ MUL15(a[11], b[ 3])
+		+ MUL15(a[12], b[ 2])
+		+ MUL15(a[13], b[ 1])
+		+ MUL15(a[14], b[ 0]);
+	t[15] = MUL15(a[ 0], b[15])
+		+ MUL15(a[ 1], b[14])
+		+ MUL15(a[ 2], b[13])
+		+ MUL15(a[ 3], b[12])
+		+ MUL15(a[ 4], b[11])
+		+ MUL15(a[ 5], b[10])
+		+ MUL15(a[ 6], b[ 9])
+		+ MUL15(a[ 7], b[ 8])
+		+ MUL15(a[ 8], b[ 7])
+		+ MUL15(a[ 9], b[ 6])
+		+ MUL15(a[10], b[ 5])
+		+ MUL15(a[11], b[ 4])
+		+ MUL15(a[12], b[ 3])
+		+ MUL15(a[13], b[ 2])
+		+ MUL15(a[14], b[ 1])
+		+ MUL15(a[15], b[ 0]);
+	t[16] = MUL15(a[ 0], b[16])
+		+ MUL15(a[ 1], b[15])
+		+ MUL15(a[ 2], b[14])
+		+ MUL15(a[ 3], b[13])
+		+ MUL15(a[ 4], b[12])
+		+ MUL15(a[ 5], b[11])
+		+ MUL15(a[ 6], b[10])
+		+ MUL15(a[ 7], b[ 9])
+		+ MUL15(a[ 8], b[ 8])
+		+ MUL15(a[ 9], b[ 7])
+		+ MUL15(a[10], b[ 6])
+		+ MUL15(a[11], b[ 5])
+		+ MUL15(a[12], b[ 4])
+		+ MUL15(a[13], b[ 3])
+		+ MUL15(a[14], b[ 2])
+		+ MUL15(a[15], b[ 1])
+		+ MUL15(a[16], b[ 0]);
+	t[17] = MUL15(a[ 0], b[17])
+		+ MUL15(a[ 1], b[16])
+		+ MUL15(a[ 2], b[15])
+		+ MUL15(a[ 3], b[14])
+		+ MUL15(a[ 4], b[13])
+		+ MUL15(a[ 5], b[12])
+		+ MUL15(a[ 6], b[11])
+		+ MUL15(a[ 7], b[10])
+		+ MUL15(a[ 8], b[ 9])
+		+ MUL15(a[ 9], b[ 8])
+		+ MUL15(a[10], b[ 7])
+		+ MUL15(a[11], b[ 6])
+		+ MUL15(a[12], b[ 5])
+		+ MUL15(a[13], b[ 4])
+		+ MUL15(a[14], b[ 3])
+		+ MUL15(a[15], b[ 2])
+		+ MUL15(a[16], b[ 1])
+		+ MUL15(a[17], b[ 0]);
+	t[18] = MUL15(a[ 0], b[18])
+		+ MUL15(a[ 1], b[17])
+		+ MUL15(a[ 2], b[16])
+		+ MUL15(a[ 3], b[15])
+		+ MUL15(a[ 4], b[14])
+		+ MUL15(a[ 5], b[13])
+		+ MUL15(a[ 6], b[12])
+		+ MUL15(a[ 7], b[11])
+		+ MUL15(a[ 8], b[10])
+		+ MUL15(a[ 9], b[ 9])
+		+ MUL15(a[10], b[ 8])
+		+ MUL15(a[11], b[ 7])
+		+ MUL15(a[12], b[ 6])
+		+ MUL15(a[13], b[ 5])
+		+ MUL15(a[14], b[ 4])
+		+ MUL15(a[15], b[ 3])
+		+ MUL15(a[16], b[ 2])
+		+ MUL15(a[17], b[ 1])
+		+ MUL15(a[18], b[ 0]);
+	t[19] = MUL15(a[ 0], b[19])
+		+ MUL15(a[ 1], b[18])
+		+ MUL15(a[ 2], b[17])
+		+ MUL15(a[ 3], b[16])
+		+ MUL15(a[ 4], b[15])
+		+ MUL15(a[ 5], b[14])
+		+ MUL15(a[ 6], b[13])
+		+ MUL15(a[ 7], b[12])
+		+ MUL15(a[ 8], b[11])
+		+ MUL15(a[ 9], b[10])
+		+ MUL15(a[10], b[ 9])
+		+ MUL15(a[11], b[ 8])
+		+ MUL15(a[12], b[ 7])
+		+ MUL15(a[13], b[ 6])
+		+ MUL15(a[14], b[ 5])
+		+ MUL15(a[15], b[ 4])
+		+ MUL15(a[16], b[ 3])
+		+ MUL15(a[17], b[ 2])
+		+ MUL15(a[18], b[ 1])
+		+ MUL15(a[19], b[ 0]);
+	t[20] = MUL15(a[ 1], b[19])
+		+ MUL15(a[ 2], b[18])
+		+ MUL15(a[ 3], b[17])
+		+ MUL15(a[ 4], b[16])
+		+ MUL15(a[ 5], b[15])
+		+ MUL15(a[ 6], b[14])
+		+ MUL15(a[ 7], b[13])
+		+ MUL15(a[ 8], b[12])
+		+ MUL15(a[ 9], b[11])
+		+ MUL15(a[10], b[10])
+		+ MUL15(a[11], b[ 9])
+		+ MUL15(a[12], b[ 8])
+		+ MUL15(a[13], b[ 7])
+		+ MUL15(a[14], b[ 6])
+		+ MUL15(a[15], b[ 5])
+		+ MUL15(a[16], b[ 4])
+		+ MUL15(a[17], b[ 3])
+		+ MUL15(a[18], b[ 2])
+		+ MUL15(a[19], b[ 1]);
+	t[21] = MUL15(a[ 2], b[19])
+		+ MUL15(a[ 3], b[18])
+		+ MUL15(a[ 4], b[17])
+		+ MUL15(a[ 5], b[16])
+		+ MUL15(a[ 6], b[15])
+		+ MUL15(a[ 7], b[14])
+		+ MUL15(a[ 8], b[13])
+		+ MUL15(a[ 9], b[12])
+		+ MUL15(a[10], b[11])
+		+ MUL15(a[11], b[10])
+		+ MUL15(a[12], b[ 9])
+		+ MUL15(a[13], b[ 8])
+		+ MUL15(a[14], b[ 7])
+		+ MUL15(a[15], b[ 6])
+		+ MUL15(a[16], b[ 5])
+		+ MUL15(a[17], b[ 4])
+		+ MUL15(a[18], b[ 3])
+		+ MUL15(a[19], b[ 2]);
+	t[22] = MUL15(a[ 3], b[19])
+		+ MUL15(a[ 4], b[18])
+		+ MUL15(a[ 5], b[17])
+		+ MUL15(a[ 6], b[16])
+		+ MUL15(a[ 7], b[15])
+		+ MUL15(a[ 8], b[14])
+		+ MUL15(a[ 9], b[13])
+		+ MUL15(a[10], b[12])
+		+ MUL15(a[11], b[11])
+		+ MUL15(a[12], b[10])
+		+ MUL15(a[13], b[ 9])
+		+ MUL15(a[14], b[ 8])
+		+ MUL15(a[15], b[ 7])
+		+ MUL15(a[16], b[ 6])
+		+ MUL15(a[17], b[ 5])
+		+ MUL15(a[18], b[ 4])
+		+ MUL15(a[19], b[ 3]);
+	t[23] = MUL15(a[ 4], b[19])
+		+ MUL15(a[ 5], b[18])
+		+ MUL15(a[ 6], b[17])
+		+ MUL15(a[ 7], b[16])
+		+ MUL15(a[ 8], b[15])
+		+ MUL15(a[ 9], b[14])
+		+ MUL15(a[10], b[13])
+		+ MUL15(a[11], b[12])
+		+ MUL15(a[12], b[11])
+		+ MUL15(a[13], b[10])
+		+ MUL15(a[14], b[ 9])
+		+ MUL15(a[15], b[ 8])
+		+ MUL15(a[16], b[ 7])
+		+ MUL15(a[17], b[ 6])
+		+ MUL15(a[18], b[ 5])
+		+ MUL15(a[19], b[ 4]);
+	t[24] = MUL15(a[ 5], b[19])
+		+ MUL15(a[ 6], b[18])
+		+ MUL15(a[ 7], b[17])
+		+ MUL15(a[ 8], b[16])
+		+ MUL15(a[ 9], b[15])
+		+ MUL15(a[10], b[14])
+		+ MUL15(a[11], b[13])
+		+ MUL15(a[12], b[12])
+		+ MUL15(a[13], b[11])
+		+ MUL15(a[14], b[10])
+		+ MUL15(a[15], b[ 9])
+		+ MUL15(a[16], b[ 8])
+		+ MUL15(a[17], b[ 7])
+		+ MUL15(a[18], b[ 6])
+		+ MUL15(a[19], b[ 5]);
+	t[25] = MUL15(a[ 6], b[19])
+		+ MUL15(a[ 7], b[18])
+		+ MUL15(a[ 8], b[17])
+		+ MUL15(a[ 9], b[16])
+		+ MUL15(a[10], b[15])
+		+ MUL15(a[11], b[14])
+		+ MUL15(a[12], b[13])
+		+ MUL15(a[13], b[12])
+		+ MUL15(a[14], b[11])
+		+ MUL15(a[15], b[10])
+		+ MUL15(a[16], b[ 9])
+		+ MUL15(a[17], b[ 8])
+		+ MUL15(a[18], b[ 7])
+		+ MUL15(a[19], b[ 6]);
+	t[26] = MUL15(a[ 7], b[19])
+		+ MUL15(a[ 8], b[18])
+		+ MUL15(a[ 9], b[17])
+		+ MUL15(a[10], b[16])
+		+ MUL15(a[11], b[15])
+		+ MUL15(a[12], b[14])
+		+ MUL15(a[13], b[13])
+		+ MUL15(a[14], b[12])
+		+ MUL15(a[15], b[11])
+		+ MUL15(a[16], b[10])
+		+ MUL15(a[17], b[ 9])
+		+ MUL15(a[18], b[ 8])
+		+ MUL15(a[19], b[ 7]);
+	t[27] = MUL15(a[ 8], b[19])
+		+ MUL15(a[ 9], b[18])
+		+ MUL15(a[10], b[17])
+		+ MUL15(a[11], b[16])
+		+ MUL15(a[12], b[15])
+		+ MUL15(a[13], b[14])
+		+ MUL15(a[14], b[13])
+		+ MUL15(a[15], b[12])
+		+ MUL15(a[16], b[11])
+		+ MUL15(a[17], b[10])
+		+ MUL15(a[18], b[ 9])
+		+ MUL15(a[19], b[ 8]);
+	t[28] = MUL15(a[ 9], b[19])
+		+ MUL15(a[10], b[18])
+		+ MUL15(a[11], b[17])
+		+ MUL15(a[12], b[16])
+		+ MUL15(a[13], b[15])
+		+ MUL15(a[14], b[14])
+		+ MUL15(a[15], b[13])
+		+ MUL15(a[16], b[12])
+		+ MUL15(a[17], b[11])
+		+ MUL15(a[18], b[10])
+		+ MUL15(a[19], b[ 9]);
+	t[29] = MUL15(a[10], b[19])
+		+ MUL15(a[11], b[18])
+		+ MUL15(a[12], b[17])
+		+ MUL15(a[13], b[16])
+		+ MUL15(a[14], b[15])
+		+ MUL15(a[15], b[14])
+		+ MUL15(a[16], b[13])
+		+ MUL15(a[17], b[12])
+		+ MUL15(a[18], b[11])
+		+ MUL15(a[19], b[10]);
+	t[30] = MUL15(a[11], b[19])
+		+ MUL15(a[12], b[18])
+		+ MUL15(a[13], b[17])
+		+ MUL15(a[14], b[16])
+		+ MUL15(a[15], b[15])
+		+ MUL15(a[16], b[14])
+		+ MUL15(a[17], b[13])
+		+ MUL15(a[18], b[12])
+		+ MUL15(a[19], b[11]);
+	t[31] = MUL15(a[12], b[19])
+		+ MUL15(a[13], b[18])
+		+ MUL15(a[14], b[17])
+		+ MUL15(a[15], b[16])
+		+ MUL15(a[16], b[15])
+		+ MUL15(a[17], b[14])
+		+ MUL15(a[18], b[13])
+		+ MUL15(a[19], b[12]);
+	t[32] = MUL15(a[13], b[19])
+		+ MUL15(a[14], b[18])
+		+ MUL15(a[15], b[17])
+		+ MUL15(a[16], b[16])
+		+ MUL15(a[17], b[15])
+		+ MUL15(a[18], b[14])
+		+ MUL15(a[19], b[13]);
+	t[33] = MUL15(a[14], b[19])
+		+ MUL15(a[15], b[18])
+		+ MUL15(a[16], b[17])
+		+ MUL15(a[17], b[16])
+		+ MUL15(a[18], b[15])
+		+ MUL15(a[19], b[14]);
+	t[34] = MUL15(a[15], b[19])
+		+ MUL15(a[16], b[18])
+		+ MUL15(a[17], b[17])
+		+ MUL15(a[18], b[16])
+		+ MUL15(a[19], b[15]);
+	t[35] = MUL15(a[16], b[19])
+		+ MUL15(a[17], b[18])
+		+ MUL15(a[18], b[17])
+		+ MUL15(a[19], b[16]);
+	t[36] = MUL15(a[17], b[19])
+		+ MUL15(a[18], b[18])
+		+ MUL15(a[19], b[17]);
+	t[37] = MUL15(a[18], b[19])
+		+ MUL15(a[19], b[18]);
+	t[38] = MUL15(a[19], b[19]);
+	d[39] = norm13(d, t, 39);	/* 13-bit words in d[0..38], last carry in d[39] */
+}
+
+static void
+square20(uint32_t *d, const uint32_t *a)
+{
+	uint32_t t[39];	/* t[k]: sum of a[i]*a[j], i + j == k; i < j terms are doubled */
+
+	t[ 0] = MUL15(a[ 0], a[ 0]);
+	t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
+	t[ 2] = MUL15(a[ 1], a[ 1])
+		+ ((MUL15(a[ 0], a[ 2])) << 1);
+	t[ 3] = ((MUL15(a[ 0], a[ 3])
+		+ MUL15(a[ 1], a[ 2])) << 1);
+	t[ 4] = MUL15(a[ 2], a[ 2])
+		+ ((MUL15(a[ 0], a[ 4])
+		+ MUL15(a[ 1], a[ 3])) << 1);
+	t[ 5] = ((MUL15(a[ 0], a[ 5])
+		+ MUL15(a[ 1], a[ 4])
+		+ MUL15(a[ 2], a[ 3])) << 1);
+	t[ 6] = MUL15(a[ 3], a[ 3])
+		+ ((MUL15(a[ 0], a[ 6])
+		+ MUL15(a[ 1], a[ 5])
+		+ MUL15(a[ 2], a[ 4])) << 1);
+	t[ 7] = ((MUL15(a[ 0], a[ 7])
+		+ MUL15(a[ 1], a[ 6])
+		+ MUL15(a[ 2], a[ 5])
+		+ MUL15(a[ 3], a[ 4])) << 1);
+	t[ 8] = MUL15(a[ 4], a[ 4])
+		+ ((MUL15(a[ 0], a[ 8])
+		+ MUL15(a[ 1], a[ 7])
+		+ MUL15(a[ 2], a[ 6])
+		+ MUL15(a[ 3], a[ 5])) << 1);
+	t[ 9] = ((MUL15(a[ 0], a[ 9])
+		+ MUL15(a[ 1], a[ 8])
+		+ MUL15(a[ 2], a[ 7])
+		+ MUL15(a[ 3], a[ 6])
+		+ MUL15(a[ 4], a[ 5])) << 1);
+	t[10] = MUL15(a[ 5], a[ 5])
+		+ ((MUL15(a[ 0], a[10])
+		+ MUL15(a[ 1], a[ 9])
+		+ MUL15(a[ 2], a[ 8])
+		+ MUL15(a[ 3], a[ 7])
+		+ MUL15(a[ 4], a[ 6])) << 1);
+	t[11] = ((MUL15(a[ 0], a[11])
+		+ MUL15(a[ 1], a[10])
+		+ MUL15(a[ 2], a[ 9])
+		+ MUL15(a[ 3], a[ 8])
+		+ MUL15(a[ 4], a[ 7])
+		+ MUL15(a[ 5], a[ 6])) << 1);
+	t[12] = MUL15(a[ 6], a[ 6])
+		+ ((MUL15(a[ 0], a[12])
+		+ MUL15(a[ 1], a[11])
+		+ MUL15(a[ 2], a[10])
+		+ MUL15(a[ 3], a[ 9])
+		+ MUL15(a[ 4], a[ 8])
+		+ MUL15(a[ 5], a[ 7])) << 1);
+	t[13] = ((MUL15(a[ 0], a[13])
+		+ MUL15(a[ 1], a[12])
+		+ MUL15(a[ 2], a[11])
+		+ MUL15(a[ 3], a[10])
+		+ MUL15(a[ 4], a[ 9])
+		+ MUL15(a[ 5], a[ 8])
+		+ MUL15(a[ 6], a[ 7])) << 1);
+	t[14] = MUL15(a[ 7], a[ 7])
+		+ ((MUL15(a[ 0], a[14])
+		+ MUL15(a[ 1], a[13])
+		+ MUL15(a[ 2], a[12])
+		+ MUL15(a[ 3], a[11])
+		+ MUL15(a[ 4], a[10])
+		+ MUL15(a[ 5], a[ 9])
+		+ MUL15(a[ 6], a[ 8])) << 1);
+	t[15] = ((MUL15(a[ 0], a[15])
+		+ MUL15(a[ 1], a[14])
+		+ MUL15(a[ 2], a[13])
+		+ MUL15(a[ 3], a[12])
+		+ MUL15(a[ 4], a[11])
+		+ MUL15(a[ 5], a[10])
+		+ MUL15(a[ 6], a[ 9])
+		+ MUL15(a[ 7], a[ 8])) << 1);
+	t[16] = MUL15(a[ 8], a[ 8])
+		+ ((MUL15(a[ 0], a[16])
+		+ MUL15(a[ 1], a[15])
+		+ MUL15(a[ 2], a[14])
+		+ MUL15(a[ 3], a[13])
+		+ MUL15(a[ 4], a[12])
+		+ MUL15(a[ 5], a[11])
+		+ MUL15(a[ 6], a[10])
+		+ MUL15(a[ 7], a[ 9])) << 1);
+	t[17] = ((MUL15(a[ 0], a[17])
+		+ MUL15(a[ 1], a[16])
+		+ MUL15(a[ 2], a[15])
+		+ MUL15(a[ 3], a[14])
+		+ MUL15(a[ 4], a[13])
+		+ MUL15(a[ 5], a[12])
+		+ MUL15(a[ 6], a[11])
+		+ MUL15(a[ 7], a[10])
+		+ MUL15(a[ 8], a[ 9])) << 1);
+	t[18] = MUL15(a[ 9], a[ 9])
+		+ ((MUL15(a[ 0], a[18])
+		+ MUL15(a[ 1], a[17])
+		+ MUL15(a[ 2], a[16])
+		+ MUL15(a[ 3], a[15])
+		+ MUL15(a[ 4], a[14])
+		+ MUL15(a[ 5], a[13])
+		+ MUL15(a[ 6], a[12])
+		+ MUL15(a[ 7], a[11])
+		+ MUL15(a[ 8], a[10])) << 1);
+	t[19] = ((MUL15(a[ 0], a[19])
+		+ MUL15(a[ 1], a[18])
+		+ MUL15(a[ 2], a[17])
+		+ MUL15(a[ 3], a[16])
+		+ MUL15(a[ 4], a[15])
+		+ MUL15(a[ 5], a[14])
+		+ MUL15(a[ 6], a[13])
+		+ MUL15(a[ 7], a[12])
+		+ MUL15(a[ 8], a[11])
+		+ MUL15(a[ 9], a[10])) << 1);
+	t[20] = MUL15(a[10], a[10])
+		+ ((MUL15(a[ 1], a[19])
+		+ MUL15(a[ 2], a[18])
+		+ MUL15(a[ 3], a[17])
+		+ MUL15(a[ 4], a[16])
+		+ MUL15(a[ 5], a[15])
+		+ MUL15(a[ 6], a[14])
+		+ MUL15(a[ 7], a[13])
+		+ MUL15(a[ 8], a[12])
+		+ MUL15(a[ 9], a[11])) << 1);
+	t[21] = ((MUL15(a[ 2], a[19])
+		+ MUL15(a[ 3], a[18])
+		+ MUL15(a[ 4], a[17])
+		+ MUL15(a[ 5], a[16])
+		+ MUL15(a[ 6], a[15])
+		+ MUL15(a[ 7], a[14])
+		+ MUL15(a[ 8], a[13])
+		+ MUL15(a[ 9], a[12])
+		+ MUL15(a[10], a[11])) << 1);
+	t[22] = MUL15(a[11], a[11])
+		+ ((MUL15(a[ 3], a[19])
+		+ MUL15(a[ 4], a[18])
+		+ MUL15(a[ 5], a[17])
+		+ MUL15(a[ 6], a[16])
+		+ MUL15(a[ 7], a[15])
+		+ MUL15(a[ 8], a[14])
+		+ MUL15(a[ 9], a[13])
+		+ MUL15(a[10], a[12])) << 1);
+	t[23] = ((MUL15(a[ 4], a[19])
+		+ MUL15(a[ 5], a[18])
+		+ MUL15(a[ 6], a[17])
+		+ MUL15(a[ 7], a[16])
+		+ MUL15(a[ 8], a[15])
+		+ MUL15(a[ 9], a[14])
+		+ MUL15(a[10], a[13])
+		+ MUL15(a[11], a[12])) << 1);
+	t[24] = MUL15(a[12], a[12])
+		+ ((MUL15(a[ 5], a[19])
+		+ MUL15(a[ 6], a[18])
+		+ MUL15(a[ 7], a[17])
+		+ MUL15(a[ 8], a[16])
+		+ MUL15(a[ 9], a[15])
+		+ MUL15(a[10], a[14])
+		+ MUL15(a[11], a[13])) << 1);
+	t[25] = ((MUL15(a[ 6], a[19])
+		+ MUL15(a[ 7], a[18])
+		+ MUL15(a[ 8], a[17])
+		+ MUL15(a[ 9], a[16])
+		+ MUL15(a[10], a[15])
+		+ MUL15(a[11], a[14])
+		+ MUL15(a[12], a[13])) << 1);
+	t[26] = MUL15(a[13], a[13])
+		+ ((MUL15(a[ 7], a[19])
+		+ MUL15(a[ 8], a[18])
+		+ MUL15(a[ 9], a[17])
+		+ MUL15(a[10], a[16])
+		+ MUL15(a[11], a[15])
+		+ MUL15(a[12], a[14])) << 1);
+	t[27] = ((MUL15(a[ 8], a[19])
+		+ MUL15(a[ 9], a[18])
+		+ MUL15(a[10], a[17])
+		+ MUL15(a[11], a[16])
+		+ MUL15(a[12], a[15])
+		+ MUL15(a[13], a[14])) << 1);
+	t[28] = MUL15(a[14], a[14])
+		+ ((MUL15(a[ 9], a[19])
+		+ MUL15(a[10], a[18])
+		+ MUL15(a[11], a[17])
+		+ MUL15(a[12], a[16])
+		+ MUL15(a[13], a[15])) << 1);
+	t[29] = ((MUL15(a[10], a[19])
+		+ MUL15(a[11], a[18])
+		+ MUL15(a[12], a[17])
+		+ MUL15(a[13], a[16])
+		+ MUL15(a[14], a[15])) << 1);
+	t[30] = MUL15(a[15], a[15])
+		+ ((MUL15(a[11], a[19])
+		+ MUL15(a[12], a[18])
+		+ MUL15(a[13], a[17])
+		+ MUL15(a[14], a[16])) << 1);
+	t[31] = ((MUL15(a[12], a[19])
+		+ MUL15(a[13], a[18])
+		+ MUL15(a[14], a[17])
+		+ MUL15(a[15], a[16])) << 1);
+	t[32] = MUL15(a[16], a[16])
+		+ ((MUL15(a[13], a[19])
+		+ MUL15(a[14], a[18])
+		+ MUL15(a[15], a[17])) << 1);
+	t[33] = ((MUL15(a[14], a[19])
+		+ MUL15(a[15], a[18])
+		+ MUL15(a[16], a[17])) << 1);
+	t[34] = MUL15(a[17], a[17])
+		+ ((MUL15(a[15], a[19])
+		+ MUL15(a[16], a[18])) << 1);
+	t[35] = ((MUL15(a[16], a[19])
+		+ MUL15(a[17], a[18])) << 1);
+	t[36] = MUL15(a[18], a[18])
+		+ ((MUL15(a[17], a[19])) << 1);
+	t[37] = ((MUL15(a[18], a[19])) << 1);
+	t[38] = MUL15(a[19], a[19]);
+	d[39] = norm13(d, t, 39);	/* 13-bit words in d[0..38], last carry in d[39] */
+}
+
+#endif
+
+/*
+ * Modulus of field F256 (P-256 coordinates): 20 little-endian 13-bit words.
+ */
+static const uint32_t F256[] = {
+	0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
+	0x0000, 0x1FF8, 0x1FFF, 0x01FF
+};
+
+/*
+ * The 'b' curve equation coefficient for P-256 (20 words, each below 2^13).
+ */
+static const uint32_t P256_B[] = {
+	0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
+	0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
+	0x0A3A, 0x0EC5, 0x118D, 0x00B5
+};
+
+/*
+ * "Short reduction" in field F256 (field for curve P-256): the bits of
+ * d[] located at bit 256 and above are folded back into the low words,
+ * using:
+ *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+ * The source value should be less than 262 bits; on output, it will
+ * be at most 257 bits, and less than twice the modulus.
+ */
+static void
+reduce_f256(uint32_t *d)
+{
+	uint32_t hi;
+
+	/*
+	 * Word 19 starts at bit 247 (19*13), hence its bits 9 and above
+	 * are the excess beyond 2^256.
+	 */
+	hi = d[19] >> 9;
+
+	/*
+	 * Reinject the excess: +1 (word 0), -2^96 (word 7, bit 5),
+	 * -2^192 (word 14, bit 10) and +2^224 (word 17, bit 3); then
+	 * clear the excess bits and propagate carries.
+	 */
+	d[0] += hi;
+	d[7] -= hi << 5;
+	d[14] -= hi << 10;
+	d[17] += hi << 3;
+	d[19] &= 0x01FF;
+	norm13(d, d, 20);
+}
+
+/*
+ * "Final reduction" in field F256 (field for curve P-256). The source
+ * value must be less than twice the modulus. The modulus is subtracted
+ * word by word into a scratch buffer; if that subtraction does not
+ * borrow (i.e. the value was not lower than the modulus), the scratch
+ * buffer replaces d[] and 1 is returned. Otherwise d[] is left
+ * untouched and 0 is returned.
+ */
+static uint32_t
+reduce_final_f256(uint32_t *d)
+{
+	uint32_t diff[20];
+	uint32_t borrow;
+	size_t u;
+
+	borrow = 0;
+	for (u = 0; u < 20; u ++) {
+		uint32_t w;
+
+		/*
+		 * Top bit of the 32-bit difference is the borrow out of
+		 * this 13-bit word.
+		 */
+		w = d[u] - F256[u] - borrow;
+		borrow = w >> 31;
+		diff[u] = w & 0x1FFF;
+	}
+
+	/*
+	 * No final borrow means d >= p: keep the subtracted value.
+	 */
+	borrow ^= 1;
+	CCOPY(borrow, d, diff, sizeof diff);
+	return borrow;
+}
+
+/*
+ * Perform a multiplication of two integers modulo
+ * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
+ * of 20 words, each containing 13 bits of data, in little-endian order.
+ * On input, upper word may be up to 13 bits (hence value up to 2^260-1);
+ * on output, value fits on 257 bits and is lower than twice the modulus.
+ *
+ * The product is built in a local buffer and d[] is written only by the
+ * last step, so d may be the same array as a and/or b (callers in this
+ * file use it that way).
+ */
+static void
+mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t t[40], cc;
+	int i;
+
+	/*
+	 * Compute raw multiplication. All result words fit in 13 bits
+	 * each.
+	 */
+	mul20(t, a, b);
+
+	/*
+	 * Modular reduction: each high word is added/subtracted where
+	 * necessary.
+	 *
+	 * The modulus is:
+	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
+	 * Therefore:
+	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+	 *
+	 * For a word x at bit offset n (n >= 256), we have:
+	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
+	 *            - x*2^(n - 160) + x*2^(n-256) mod p
+	 *
+	 * Thus, we can nullify the high word if we reinject it at some
+	 * proper emplacements.
+	 *
+	 * Word i is at bit offset n = 13*i. Each of the four offsets is
+	 * not a multiple of 13, so each reinjection is split over two
+	 * adjacent words:
+	 *    n-32  = 13*(i-3)  + 7   (words i-3 and i-2)
+	 *    n-64  = 13*(i-5)  + 1   (words i-5 and i-4)
+	 *    n-160 = 13*(i-13) + 9   (words i-13 and i-12)
+	 *    n-256 = 13*(i-20) + 4   (words i-20 and i-19)
+	 * The loop runs from the top word downwards because a reinjected
+	 * value may land in a word of index 20 or more, which is then
+	 * processed by a later iteration.
+	 */
+	for (i = 39; i >= 20; i --) {
+		uint32_t x;
+
+		x = t[i];
+		t[i - 2] += ARSH(x, 6);
+		t[i - 3] += (x << 7) & 0x1FFF;
+		t[i - 4] -= ARSH(x, 12);
+		t[i - 5] -= (x << 1) & 0x1FFF;
+		t[i - 12] -= ARSH(x, 4);
+		t[i - 13] -= (x << 9) & 0x1FFF;
+		t[i - 19] += ARSH(x, 9);
+		t[i - 20] += (x << 4) & 0x1FFF;
+	}
+
+	/*
+	 * Propagate carries. This is a signed propagation, and the
+	 * result may be negative. The loop above may enlarge values,
+	 * but not too much: worst case is the chain involving t[i - 3],
+	 * in which a value may be added to itself up to 7 times. Since
+	 * starting values are 13-bit each, all words fit on 20 bits
+	 * (21 to account for the sign bit).
+	 */
+	cc = norm13(t, t, 20);
+
+	/*
+	 * Perform modular reduction again for the bits beyond 256 (the carry
+	 * and the bits 256..259). Since the largest shift below is by 10
+	 * bits, and the values fit on 21 bits, values fit in 32-bit words,
+	 * thereby allowing injecting full word values.
+	 *
+	 * The injection points are the same as in reduce_f256(): +2^224
+	 * (word 17, bit 3), -2^192 (word 14, bit 10), -2^96 (word 7,
+	 * bit 5) and +1 (word 0).
+	 */
+	cc = (cc << 4) | (t[19] >> 9);
+	t[19] &= 0x01FF;
+	t[17] += cc << 3;
+	t[14] -= cc << 10;
+	t[7] -= cc << 5;
+	t[0] += cc;
+
+	/*
+	 * If the carry is negative, then after carry propagation, we may
+	 * end up with a value which is negative, and we don't want that.
+	 * Thus, in that case, we add the modulus. Note that the subtraction
+	 * result, when the carry is negative, is always smaller than the
+	 * modulus, so the extra addition will not make the value exceed
+	 * twice the modulus.
+	 *
+	 * After the shift, cc is 1 if the carry was negative (top bit
+	 * set) and 0 otherwise, so the five updates below add either p
+	 * or nothing, without any conditional jump.
+	 */
+	cc >>= 31;
+	t[0] -= cc;
+	t[7] += cc << 5;
+	t[14] += cc << 10;
+	t[17] -= cc << 3;
+	t[19] += cc << 9;
+
+	norm13(d, t, 20);
+}
+
+/*
+ * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve
+ * P-256). Operand is an array of 20 words, each containing 13 bits of
+ * data, in little-endian order. On input, upper word may be up to 13
+ * bits (hence value up to 2^260-1); on output, value fits on 257 bits
+ * and is lower than twice the modulus.
+ *
+ * The square is built in a local buffer and d[] is written only by the
+ * last step, so d may be the same array as a (callers in this file use
+ * it that way). The reduction steps are the same as in mul_f256().
+ */
+static void
+square_f256(uint32_t *d, const uint32_t *a)
+{
+	uint32_t t[40], cc;
+	int i;
+
+	/*
+	 * Compute raw square. All result words fit in 13 bits each.
+	 */
+	square20(t, a);
+
+	/*
+	 * Modular reduction: each high word is added/subtracted where
+	 * necessary.
+	 *
+	 * The modulus is:
+	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
+	 * Therefore:
+	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+	 *
+	 * For a word x at bit offset n (n >= 256), we have:
+	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
+	 *            - x*2^(n - 160) + x*2^(n-256) mod p
+	 *
+	 * Thus, we can nullify the high word if we reinject it at some
+	 * proper emplacements.
+	 *
+	 * Word i is at bit offset n = 13*i. Each of the four offsets is
+	 * not a multiple of 13, so each reinjection is split over two
+	 * adjacent words:
+	 *    n-32  = 13*(i-3)  + 7   (words i-3 and i-2)
+	 *    n-64  = 13*(i-5)  + 1   (words i-5 and i-4)
+	 *    n-160 = 13*(i-13) + 9   (words i-13 and i-12)
+	 *    n-256 = 13*(i-20) + 4   (words i-20 and i-19)
+	 */
+	for (i = 39; i >= 20; i --) {
+		uint32_t x;
+
+		x = t[i];
+		t[i - 2] += ARSH(x, 6);
+		t[i - 3] += (x << 7) & 0x1FFF;
+		t[i - 4] -= ARSH(x, 12);
+		t[i - 5] -= (x << 1) & 0x1FFF;
+		t[i - 12] -= ARSH(x, 4);
+		t[i - 13] -= (x << 9) & 0x1FFF;
+		t[i - 19] += ARSH(x, 9);
+		t[i - 20] += (x << 4) & 0x1FFF;
+	}
+
+	/*
+	 * Propagate carries. This is a signed propagation, and the
+	 * result may be negative. The loop above may enlarge values,
+	 * but not too much: worst case is the chain involving t[i - 3],
+	 * in which a value may be added to itself up to 7 times. Since
+	 * starting values are 13-bit each, all words fit on 20 bits
+	 * (21 to account for the sign bit).
+	 */
+	cc = norm13(t, t, 20);
+
+	/*
+	 * Perform modular reduction again for the bits beyond 256 (the carry
+	 * and the bits 256..259). Since the largest shift below is by 10
+	 * bits, and the values fit on 21 bits, values fit in 32-bit words,
+	 * thereby allowing injecting full word values.
+	 *
+	 * The injection points are: +2^224 (word 17, bit 3), -2^192
+	 * (word 14, bit 10), -2^96 (word 7, bit 5) and +1 (word 0).
+	 */
+	cc = (cc << 4) | (t[19] >> 9);
+	t[19] &= 0x01FF;
+	t[17] += cc << 3;
+	t[14] -= cc << 10;
+	t[7] -= cc << 5;
+	t[0] += cc;
+
+	/*
+	 * If the carry is negative, then after carry propagation, we may
+	 * end up with a value which is negative, and we don't want that.
+	 * Thus, in that case, we add the modulus. Note that the subtraction
+	 * result, when the carry is negative, is always smaller than the
+	 * modulus, so the extra addition will not make the value exceed
+	 * twice the modulus.
+	 *
+	 * After the shift, cc is 1 if the carry was negative (top bit
+	 * set) and 0 otherwise, so the five updates below add either p
+	 * or nothing, without any conditional jump.
+	 */
+	cc >>= 31;
+	t[0] -= cc;
+	t[7] += cc << 5;
+	t[14] += cc << 10;
+	t[17] -= cc << 3;
+	t[19] += cc << 9;
+
+	norm13(d, t, 20);
+}
+
+/*
+ * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
+ * are such that:
+ *   X = x / z^2
+ *   Y = y / z^3
+ * For the point at infinity, z = 0.
+ * Each point thus admits many possible representations.
+ *
+ * Coordinates are represented in arrays of 32-bit integers, each holding
+ * 13 bits of data. Values may also be slightly greater than the modulus,
+ * but they will always be lower than twice the modulus.
+ *
+ * Each coordinate uses 20 words, least significant word first (the
+ * same layout as the F256[] modulus). A point in affine form is stored
+ * with z set to 1 (z[0] = 1, other words zero).
+ */
+typedef struct {
+	uint32_t x[20];
+	uint32_t y[20];
+	uint32_t z[20];
+} p256_jacobian;
+
+/*
+ * Convert a point to affine coordinates:
+ *  - If the point is the point at infinity, then all three coordinates
+ *    are set to 0.
+ *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
+ *    coordinates are the 'X' and 'Y' affine coordinates.
+ * The coordinates are guaranteed to be lower than the modulus.
+ *
+ * The conversion is done in place, on the structure pointed to by P.
+ * The sequence of field operations does not depend on the value of P.
+ */
+static void
+p256_to_affine(p256_jacobian *P)
+{
+	uint32_t t1[20], t2[20];
+	int i;
+
+	/*
+	 * Invert z with a modular exponentiation: the modulus is
+	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
+	 * p-2. Exponent bit pattern (from high to low) is:
+	 *  - 32 bits of value 1
+	 *  - 31 bits of value 0
+	 *  - 1 bit of value 1
+	 *  - 96 bits of value 0
+	 *  - 94 bits of value 1
+	 *  - 1 bit of value 0
+	 *  - 1 bit of value 1
+	 * Thus, we precompute z^(2^31-1) to speed things up.
+	 *
+	 * If z = 0 (point at infinity) then the modular exponentiation
+	 * will yield 0, which leads to the expected result (all three
+	 * coordinates set to 0).
+	 */
+
+	/*
+	 * A simple square-and-multiply for z^(2^31-1). We could save about
+	 * two dozen multiplications here with an addition chain, but
+	 * this would require a bit more code, and extra stack buffers.
+	 *
+	 * t1 starts as z (one exponent bit set); each of the 30
+	 * iterations appends one more bit of value 1.
+	 */
+	memcpy(t1, P->z, sizeof P->z);
+	for (i = 0; i < 30; i ++) {
+		square_f256(t1, t1);
+		mul_f256(t1, t1, P->z);
+	}
+
+	/*
+	 * Square-and-multiply. Apart from the squarings, we have a few
+	 * multiplications to set bits to 1; we multiply by the original z
+	 * for setting 1 bit, and by t1 for setting 31 bits.
+	 *
+	 * t2 starts as z, which accounts for the top exponent bit. The
+	 * case labels below are the number of squarings performed so
+	 * far: after i squarings, the lowest exponent bit currently held
+	 * in t2 is bit 255-i of p-2.
+	 */
+	memcpy(t2, P->z, sizeof P->z);
+	for (i = 1; i < 256; i ++) {
+		square_f256(t2, t2);
+		switch (i) {
+		case 31:
+		case 190:
+		case 221:
+		case 252:
+			mul_f256(t2, t2, t1);
+			break;
+		case 63:
+		case 253:
+		case 255:
+			mul_f256(t2, t2, P->z);
+			break;
+		}
+	}
+
+	/*
+	 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
+	 * The final reductions bring both coordinates below the modulus.
+	 */
+	mul_f256(t1, t2, t2);
+	mul_f256(P->x, t1, P->x);
+	mul_f256(t1, t1, t2);
+	mul_f256(P->y, t1, P->y);
+	reduce_final_f256(P->x);
+	reduce_final_f256(P->y);
+
+	/*
+	 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
+	 * this will set z to 1.
+	 */
+	mul_f256(P->z, P->z, t2);
+	reduce_final_f256(P->z);
+}
+
+/*
+ * Double a point in P-256. This function works for all valid points,
+ * including the point at infinity.
+ *
+ * The point Q is replaced in place with 2*Q, in Jacobian coordinates.
+ */
+static void
+p256_double(p256_jacobian *Q)
+{
+	/*
+	 * Doubling formulas are:
+	 *
+	 *   s = 4*x*y^2
+	 *   m = 3*(x + z^2)*(x - z^2)
+	 *   x' = m^2 - 2*s
+	 *   y' = m*(s - x') - 8*y^4
+	 *   z' = 2*y*z
+	 *
+	 * These formulas work for all points, including points of order 2
+	 * and points at infinity:
+	 *   - If y = 0 then z' = 0. But there is no such point in P-256
+	 *     anyway.
+	 *   - If z = 0 then z' = 0.
+	 *
+	 * Wherever a value is subtracted below, a multiple of the modulus
+	 * (F256[] shifted left by 1, or by 2) is added to the same
+	 * expression; this does not change the result modulo p, and
+	 * presumably serves to keep the intermediate value non-negative.
+	 */
+	uint32_t t1[20], t2[20], t3[20], t4[20];
+	int i;
+
+	/*
+	 * Compute z^2 in t1.
+	 */
+	square_f256(t1, Q->z);
+
+	/*
+	 * Compute x-z^2 in t2 and x+z^2 in t1.
+	 */
+	for (i = 0; i < 20; i ++) {
+		t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
+		t1[i] += Q->x[i];
+	}
+	norm13(t1, t1, 20);
+	norm13(t2, t2, 20);
+
+	/*
+	 * Compute 3*(x+z^2)*(x-z^2) in t1.
+	 */
+	mul_f256(t3, t1, t2);
+	for (i = 0; i < 20; i ++) {
+		t1[i] = MUL15(3, t3[i]);
+	}
+	norm13(t1, t1, 20);
+
+	/*
+	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
+	 */
+	square_f256(t3, Q->y);
+	for (i = 0; i < 20; i ++) {
+		t3[i] <<= 1;
+	}
+	norm13(t3, t3, 20);
+	mul_f256(t2, Q->x, t3);
+	for (i = 0; i < 20; i ++) {
+		t2[i] <<= 1;
+	}
+	norm13(t2, t2, 20);
+	reduce_f256(t2);
+
+	/*
+	 * Compute x' = m^2 - 2*s.
+	 */
+	square_f256(Q->x, t1);
+	for (i = 0; i < 20; i ++) {
+		Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
+	}
+	norm13(Q->x, Q->x, 20);
+	reduce_f256(Q->x);
+
+	/*
+	 * Compute z' = 2*y*z. This must happen before Q->y is
+	 * overwritten below, since it reads the original y.
+	 */
+	mul_f256(t4, Q->y, Q->z);
+	for (i = 0; i < 20; i ++) {
+		Q->z[i] = t4[i] << 1;
+	}
+	norm13(Q->z, Q->z, 20);
+	reduce_f256(Q->z);
+
+	/*
+	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+	 * 2*y^2 in t3.
+	 */
+	for (i = 0; i < 20; i ++) {
+		t2[i] += (F256[i] << 1) - Q->x[i];
+	}
+	norm13(t2, t2, 20);
+	mul_f256(Q->y, t1, t2);
+	square_f256(t4, t3);
+	for (i = 0; i < 20; i ++) {
+		Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
+	}
+	norm13(Q->y, Q->y, 20);
+	reduce_f256(Q->y);
+}
+
+/*
+ * Add point P2 to point P1.
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ *   - If P1 == 0 but P2 != 0
+ *   - If P1 != 0 but P2 == 0
+ *   - If P1 == P2
+ *
+ * In all three cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ *   - P1 and P2 have the same Y coordinate
+ *   - P1 == 0 and P2 == 0
+ *   - The Y coordinate of one of the points is 0 and the other point is
+ *     the point at infinity.
+ *
+ * The third case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
+ * can apply the following:
+ *
+ *   - If the result is not the point at infinity, then it is correct.
+ *   - Otherwise, if the returned value is 1, then this is a case of
+ *     P1+P2 == 0, so the result is indeed the point at infinity.
+ *   - Otherwise, P1 == P2, so a "double" operation should have been
+ *     performed.
+ *
+ * The sum is written into P1; P2 is only read. The returned flag is
+ * computed as 1 if r = s2 - s1 (see formulas below) is non-zero modulo
+ * p, and 0 otherwise.
+ */
+static uint32_t
+p256_add(p256_jacobian *P1, const p256_jacobian *P2)
+{
+	/*
+	 * Addition formulas are:
+	 *
+	 *   u1 = x1 * z2^2
+	 *   u2 = x2 * z1^2
+	 *   s1 = y1 * z2^3
+	 *   s2 = y2 * z1^3
+	 *   h = u2 - u1
+	 *   r = s2 - s1
+	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
+	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+	 *   z3 = h * z1 * z2
+	 */
+	uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
+	uint32_t ret;
+	int i;
+
+	/*
+	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+	 */
+	square_f256(t3, P2->z);
+	mul_f256(t1, P1->x, t3);
+	mul_f256(t4, P2->z, t3);
+	mul_f256(t3, P1->y, t4);
+
+	/*
+	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	 */
+	square_f256(t4, P1->z);
+	mul_f256(t2, P2->x, t4);
+	mul_f256(t5, P1->z, t4);
+	mul_f256(t4, P2->y, t5);
+
+	/*
+	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+	 * We need to test whether r is zero, so we will do some extra
+	 * reduce.
+	 *
+	 * After the full reduction, r is zero modulo p exactly when all
+	 * of its words are zero; the words are OR'ed together and the
+	 * result is turned into a 0/1 flag without any conditional jump.
+	 */
+	for (i = 0; i < 20; i ++) {
+		t2[i] += (F256[i] << 1) - t1[i];
+		t4[i] += (F256[i] << 1) - t3[i];
+	}
+	norm13(t2, t2, 20);
+	norm13(t4, t4, 20);
+	reduce_f256(t4);
+	reduce_final_f256(t4);
+	ret = 0;
+	for (i = 0; i < 20; i ++) {
+		ret |= t4[i];
+	}
+	ret = (ret | -ret) >> 31;
+
+	/*
+	 * Compute u1*h^2 (in t6) and h^3 (in t5);
+	 */
+	square_f256(t7, t2);
+	mul_f256(t6, t1, t7);
+	mul_f256(t5, t7, t2);
+
+	/*
+	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	 */
+	square_f256(P1->x, t4);
+	for (i = 0; i < 20; i ++) {
+		P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
+	}
+	norm13(P1->x, P1->x, 20);
+	reduce_f256(P1->x);
+
+	/*
+	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	 */
+	for (i = 0; i < 20; i ++) {
+		t6[i] += (F256[i] << 1) - P1->x[i];
+	}
+	norm13(t6, t6, 20);
+	mul_f256(P1->y, t4, t6);
+	mul_f256(t1, t5, t3);
+	for (i = 0; i < 20; i ++) {
+		P1->y[i] += (F256[i] << 1) - t1[i];
+	}
+	norm13(P1->y, P1->y, 20);
+	reduce_f256(P1->y);
+
+	/*
+	 * Compute z3 = h*z1*z2.
+	 */
+	mul_f256(t1, P1->z, P2->z);
+	mul_f256(P1->z, t1, t2);
+
+	return ret;
+}
+
+/*
+ * Add point P2 to point P1. This is a specialised function for the
+ * case when P2 is a non-zero point in affine coordinate.
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ *   - If P1 == 0
+ *   - If P1 == P2
+ *
+ * In both cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ *   - P1 and P2 have the same Y coordinate
+ *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
+ *
+ * The second case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 on input, then the caller
+ * can apply the following:
+ *
+ *   - If the result is not the point at infinity, then it is correct.
+ *   - Otherwise, if the returned value is 1, then this is a case of
+ *     P1+P2 == 0, so the result is indeed the point at infinity.
+ *   - Otherwise, P1 == P2, so a "double" operation should have been
+ *     performed.
+ *
+ * The sum is written into P1; P2 is only read, and its z coordinate is
+ * never accessed (it is implicitly 1).
+ */
+static uint32_t
+p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
+{
+	/*
+	 * Addition formulas are:
+	 *
+	 *   u1 = x1
+	 *   u2 = x2 * z1^2
+	 *   s1 = y1
+	 *   s2 = y2 * z1^3
+	 *   h = u2 - u1
+	 *   r = s2 - s1
+	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
+	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+	 *   z3 = h * z1
+	 */
+	uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
+	uint32_t ret;
+	int i;
+
+	/*
+	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+	 */
+	memcpy(t1, P1->x, sizeof t1);
+	memcpy(t3, P1->y, sizeof t3);
+
+	/*
+	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	 */
+	square_f256(t4, P1->z);
+	mul_f256(t2, P2->x, t4);
+	mul_f256(t5, P1->z, t4);
+	mul_f256(t4, P2->y, t5);
+
+	/*
+	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+	 * We need to test whether r is zero, so we will do some extra
+	 * reduce.
+	 *
+	 * After the full reduction, r is zero modulo p exactly when all
+	 * of its words are zero; the words are OR'ed together and the
+	 * result is turned into a 0/1 flag without any conditional jump.
+	 */
+	for (i = 0; i < 20; i ++) {
+		t2[i] += (F256[i] << 1) - t1[i];
+		t4[i] += (F256[i] << 1) - t3[i];
+	}
+	norm13(t2, t2, 20);
+	norm13(t4, t4, 20);
+	reduce_f256(t4);
+	reduce_final_f256(t4);
+	ret = 0;
+	for (i = 0; i < 20; i ++) {
+		ret |= t4[i];
+	}
+	ret = (ret | -ret) >> 31;
+
+	/*
+	 * Compute u1*h^2 (in t6) and h^3 (in t5);
+	 */
+	square_f256(t7, t2);
+	mul_f256(t6, t1, t7);
+	mul_f256(t5, t7, t2);
+
+	/*
+	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	 */
+	square_f256(P1->x, t4);
+	for (i = 0; i < 20; i ++) {
+		P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
+	}
+	norm13(P1->x, P1->x, 20);
+	reduce_f256(P1->x);
+
+	/*
+	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	 */
+	for (i = 0; i < 20; i ++) {
+		t6[i] += (F256[i] << 1) - P1->x[i];
+	}
+	norm13(t6, t6, 20);
+	mul_f256(P1->y, t4, t6);
+	mul_f256(t1, t5, t3);
+	for (i = 0; i < 20; i ++) {
+		P1->y[i] += (F256[i] << 1) - t1[i];
+	}
+	norm13(P1->y, P1->y, 20);
+	reduce_f256(P1->y);
+
+	/*
+	 * Compute z3 = h*z1 (P2 is in affine coordinates, hence z2 = 1).
+	 */
+	mul_f256(P1->z, P1->z, t2);
+
+	return ret;
+}
+
+/*
+ * Decode a P-256 point. This function does not support the point at
+ * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
+ *
+ * The source must be exactly 65 bytes long (one format byte, then two
+ * 32-byte coordinates); for any other length, 0 is returned right away
+ * and *P is NOT written to. When the length is 65, *P is always filled
+ * (with z = 1), even if the point then turns out to be invalid.
+ */
+static uint32_t
+p256_decode(p256_jacobian *P, const void *src, size_t len)
+{
+	const unsigned char *buf;
+	uint32_t tx[20], ty[20], t1[20], t2[20];
+	uint32_t bad;
+	int i;
+
+	if (len != 65) {
+		return 0;
+	}
+	buf = src;
+
+	/*
+	 * First byte must be 0x04 (uncompressed format). We could support
+	 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
+	 * least significant bit of the Y coordinate), but it is explicitly
+	 * forbidden by RFC 5480 (section 2.2).
+	 */
+	bad = NEQ(buf[0], 0x04);
+
+	/*
+	 * Decode the coordinates, and check that they are both lower
+	 * than the modulus.
+	 *
+	 * reduce_final_f256() returns 1 when it had to subtract the
+	 * modulus, i.e. when the decoded value was not lower than the
+	 * modulus; this is why its result is accumulated into 'bad'.
+	 */
+	tx[19] = be8_to_le13(tx, buf + 1, 32);
+	ty[19] = be8_to_le13(ty, buf + 33, 32);
+	bad |= reduce_final_f256(tx);
+	bad |= reduce_final_f256(ty);
+
+	/*
+	 * Check curve equation.
+	 *
+	 * t1 receives x^3 - 3*x + b - y^2 (plus 8 times the modulus,
+	 * which does not change the value modulo p). After a full
+	 * reduction, all words of t1 are zero if and only if the
+	 * equation y^2 = x^3 - 3*x + b holds.
+	 */
+	square_f256(t1, tx);
+	mul_f256(t1, tx, t1);
+	square_f256(t2, ty);
+	for (i = 0; i < 20; i ++) {
+		t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
+	}
+	norm13(t1, t1, 20);
+	reduce_f256(t1);
+	reduce_final_f256(t1);
+	for (i = 0; i < 20; i ++) {
+		bad |= t1[i];
+	}
+
+	/*
+	 * Copy coordinates to the point structure.
+	 */
+	memcpy(P->x, tx, sizeof tx);
+	memcpy(P->y, ty, sizeof ty);
+	memset(P->z, 0, sizeof P->z);
+	P->z[0] = 1;
+	return EQ(bad, 0);
+}
+
+/*
+ * Encode a point into a buffer, in uncompressed format: one byte of
+ * value 0x04, then the X and Y coordinates over 32 bytes each (65
+ * bytes are written in total). The point is assumed to be valid, in
+ * affine coordinates, and distinct from the point at infinity.
+ */
+static void
+p256_encode(void *dst, const p256_jacobian *P)
+{
+	unsigned char *out = dst;
+
+	*out = 0x04;
+	le13_to_be8(out + 1, 32, P->x);
+	le13_to_be8(out + 1 + 32, 32, P->y);
+}
+
+/*
+ * Multiply a curve point by an integer. The integer is assumed to be
+ * lower than the curve order, and the base point must not be the point
+ * at infinity.
+ *
+ * The multiplier is read from x[] (xlen bytes), first byte first and,
+ * within each byte, from the top bits down, i.e. as an unsigned
+ * big-endian integer. The point P is replaced with the product, in
+ * Jacobian coordinates. For each pair of multiplier bits, the same
+ * sequence of doublings, addition and conditional copies is executed,
+ * whatever the bit values.
+ */
+static void
+p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
+{
+	/*
+	 * qz is a flag that is initially 1, and remains equal to 1
+	 * as long as the point is the point at infinity.
+	 *
+	 * We use a 2-bit window to handle multiplier bits by pairs.
+	 * The precomputed window really is the points P2 and P3.
+	 */
+	uint32_t qz;
+	p256_jacobian P2, P3, Q, T, U;
+
+	/*
+	 * Compute window values.
+	 */
+	P2 = *P;
+	p256_double(&P2);
+	P3 = *P;
+	p256_add(&P3, &P2);
+
+	/*
+	 * We start with Q = 0. We process multiplier bits 2 by 2.
+	 *
+	 * For each pair of bits: Q is doubled twice; T is set to P, P2
+	 * or P3 (for bit values 1, 2 or 3; for value 0, T is P but is
+	 * then ignored); U receives Q + T. Since p256_add() does not
+	 * handle Q = 0, the result is T itself while qz is still set.
+	 * When the bits are 0, Q is left unchanged.
+	 */
+	memset(&Q, 0, sizeof Q);
+	qz = 1;
+	while (xlen -- > 0) {
+		int k;
+
+		for (k = 6; k >= 0; k -= 2) {
+			uint32_t bits;
+			uint32_t bnz;
+
+			p256_double(&Q);
+			p256_double(&Q);
+			T = *P;
+			U = Q;
+			bits = (*x >> k) & (uint32_t)3;
+			bnz = NEQ(bits, 0);
+			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
+			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
+			p256_add(&U, &T);
+			CCOPY(bnz & qz, &Q, &T, sizeof Q);
+			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
+			qz &= ~bnz;
+		}
+		x ++;
+	}
+	*P = Q;
+}
+
+/*
+ * Precomputed window: k*G points, where G is the curve generator, and k
+ * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
+ * the point are encoded as 20 words of 13 bits each (little-endian
+ * order); 13-bit words are then grouped 2-by-2 into 32-bit words
+ * (little-endian order within each word).
+ *
+ * The point k*G is in entry Gwin[k - 1]. In each entry, the first ten
+ * 32-bit words hold the X coordinate, and the last ten hold the Y
+ * coordinate; within a 32-bit word, the low 16 bits hold the 13-bit
+ * word of even index, and the high 16 bits the word of odd index (see
+ * lookup_Gwin() for the decoding).
+ */
+static const uint32_t Gwin[15][20] = {
+
+	{ 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC,
+	  0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2,
+	  0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D,
+	  0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785,
+	  0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 },
+
+	{ 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8,
+	  0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29,
+	  0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748,
+	  0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36,
+	  0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 },
+
+	{ 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6,
+	  0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323,
+	  0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8,
+	  0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB,
+	  0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 },
+
+	{ 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156,
+	  0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80,
+	  0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6,
+	  0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D,
+	  0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 },
+
+	{ 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F,
+	  0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E,
+	  0x1BD21151, 0x00A21642, 0x0D0B0DA4, 0x0176113F,
+	  0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49,
+	  0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F },
+
+	{ 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8,
+	  0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B,
+	  0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3,
+	  0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE,
+	  0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 },
+
+	{ 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F,
+	  0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896,
+	  0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0,
+	  0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400,
+	  0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 },
+
+	{ 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A,
+	  0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01,
+	  0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83,
+	  0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8,
+	  0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 },
+
+	{ 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7,
+	  0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E,
+	  0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293,
+	  0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832,
+	  0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 },
+
+	{ 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120,
+	  0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964,
+	  0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91,
+	  0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304,
+	  0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 },
+
+	{ 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541,
+	  0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418,
+	  0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A,
+	  0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2,
+	  0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 },
+
+	{ 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0,
+	  0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918,
+	  0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3,
+	  0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55,
+	  0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D },
+
+	{ 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB,
+	  0x03C90E28, 0x10BD18BF, 0x002E1F2D, 0x092B0986,
+	  0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB,
+	  0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C,
+	  0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 },
+
+	{ 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A,
+	  0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9,
+	  0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8,
+	  0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0,
+	  0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C },
+
+	{ 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8,
+	  0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7,
+	  0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783,
+	  0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9,
+	  0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F }
+};
+
+/*
+ * Fetch the window point idx*G (idx from 1 to 15) from Gwin[] into T,
+ * as an affine point (z = 1). This is constant-time: all 15 entries
+ * are read and combined under a mask, instead of indexing the table
+ * with idx. If idx matches no entry (e.g. idx = 0), the x and y
+ * coordinates of T are all-zero.
+ */
+static void
+lookup_Gwin(p256_jacobian *T, uint32_t idx)
+{
+	uint32_t packed[20];
+	uint32_t n;
+	size_t j;
+
+	/*
+	 * Accumulate the selected entry: the mask is all-ones for the
+	 * entry whose number equals idx, and zero for all others.
+	 */
+	memset(packed, 0, sizeof packed);
+	for (n = 1; n <= 15; n ++) {
+		const uint32_t *row;
+		uint32_t mask;
+
+		row = Gwin[n - 1];
+		mask = -EQ(idx, n);
+		for (j = 0; j < 20; j ++) {
+			packed[j] |= row[j] & mask;
+		}
+	}
+
+	/*
+	 * Unpack: each 32-bit word contains two coordinate words (low
+	 * half first); X comes from the first ten packed words, Y from
+	 * the last ten.
+	 */
+	for (j = 0; j < 10; j ++) {
+		uint32_t wx, wy;
+
+		wx = packed[j];
+		wy = packed[j + 10];
+		T->x[2 * j] = wx & 0xFFFF;
+		T->x[2 * j + 1] = wx >> 16;
+		T->y[2 * j] = wy & 0xFFFF;
+		T->y[2 * j + 1] = wy >> 16;
+	}
+	memset(T->z, 0, sizeof T->z);
+	T->z[0] = 1;
+}
+
+/*
+ * Multiply the generator by an integer. The integer is assumed non-zero
+ * and lower than the curve order.
+ *
+ * The multiplier is read from x[] (xlen bytes), first byte first and,
+ * within each byte, high nibble first, i.e. as an unsigned big-endian
+ * integer. The product is written into *P, in Jacobian coordinates;
+ * the previous contents of *P are not used.
+ */
+static void
+p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
+{
+	/*
+	 * qz is a flag that is initially 1, and remains equal to 1
+	 * as long as the point is the point at infinity.
+	 *
+	 * We use a 4-bit window to handle multiplier bits by groups
+	 * of 4. The precomputed window is constant static data, with
+	 * points in affine coordinates; we use a constant-time lookup.
+	 *
+	 * For each nibble: Q is doubled four times; T receives the
+	 * window point for the nibble value; U receives Q + T. Since
+	 * p256_add_mixed() does not handle Q = 0, the result is T itself
+	 * while qz is still set. When the nibble is 0, Q is left
+	 * unchanged (both conditional copies are disabled).
+	 */
+	p256_jacobian Q;
+	uint32_t qz;
+
+	memset(&Q, 0, sizeof Q);
+	qz = 1;
+	while (xlen -- > 0) {
+		int k;
+		unsigned bx;
+
+		bx = *x ++;
+		for (k = 0; k < 2; k ++) {
+			uint32_t bits;
+			uint32_t bnz;
+			p256_jacobian T, U;
+
+			p256_double(&Q);
+			p256_double(&Q);
+			p256_double(&Q);
+			p256_double(&Q);
+			bits = (bx >> 4) & 0x0F;
+			bnz = NEQ(bits, 0);
+			lookup_Gwin(&T, bits);
+			U = Q;
+			p256_add_mixed(&U, &T);
+			CCOPY(bnz & qz, &Q, &T, sizeof Q);
+			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
+			qz &= ~bnz;
+			bx <<= 4;
+		}
+	}
+	*P = Q;
+}
+
+/*
+ * Encoded curve generator, as returned by api_generator(): 65 bytes in
+ * uncompressed format (one byte of value 0x04, then the X and Y
+ * coordinates over 32 bytes each).
+ */
+static const unsigned char P256_G[] = {
+	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
+	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
+	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
+	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
+	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
+	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
+	0x68, 0x37, 0xBF, 0x51, 0xF5
+};
+
+/*
+ * Curve subgroup order, as returned by api_order(): 32 bytes, unsigned
+ * big-endian encoding.
+ */
+static const unsigned char P256_N[] = {
+	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
+	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
+	0x25, 0x51
+};
+
+/*
+ * Return a pointer to the encoded generator (P256_G), and write its
+ * length (in bytes) into *len. The 'curve' parameter is ignored: this
+ * implementation supports a single curve.
+ */
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	const unsigned char *gen = P256_G;
+
+	*len = sizeof(P256_G);
+	(void)curve;
+	return gen;
+}
+
+/*
+ * Return a pointer to the encoded curve order (P256_N), and write its
+ * length (in bytes) into *len. The 'curve' parameter is ignored: this
+ * implementation supports a single curve.
+ */
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	const unsigned char *order = P256_N;
+
+	*len = sizeof(P256_N);
+	(void)curve;
+	return order;
+}
+
+/*
+ * Report where the X coordinate lies within an encoded point: the
+ * returned value is its offset (1, right after the format byte), and
+ * its length (32 bytes) is written into *len. The 'curve' parameter is
+ * ignored.
+ */
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	enum { X_OFFSET = 1, X_LENGTH = 32 };
+
+	(void)curve;
+	*len = X_LENGTH;
+	return X_OFFSET;
+}
+
+/*
+ * Multiply the encoded point G by the integer x (xlen bytes, unsigned
+ * big-endian), and write the encoded result back over G. The 'curve'
+ * parameter is ignored.
+ *
+ * G must be in uncompressed format, hence Glen must be exactly 65.
+ * Returned value is 1 on success, 0 on error (wrong length, or the
+ * source point was found invalid by p256_decode()).
+ *
+ * If Glen is not 65, then 0 is returned immediately and G[] is left
+ * untouched. p256_decode() rejects such a length without initialising
+ * the point structure, so going on with the multiplication would read
+ * indeterminate data (and, for Glen > 65, write a meaningless result
+ * into G[]). When Glen is 65 but the point is invalid, the computation
+ * is still performed and G[] is overwritten, but 0 is returned.
+ */
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	uint32_t r;
+	p256_jacobian P;
+
+	(void)curve;
+	if (Glen != 65) {
+		return 0;
+	}
+	r = p256_decode(&P, G, Glen);
+	p256_mul(&P, x, xlen);
+	p256_to_affine(&P);
+	p256_encode(G, &P);
+	return r;
+}
+
+/*
+ * Multiply the curve generator by the integer x (xlen bytes), and
+ * write the encoded result (uncompressed format) into R. Returned
+ * value is the encoded length, which is always 65 bytes. The 'curve'
+ * parameter is ignored.
+ */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	p256_jacobian Q;
+
+	(void)curve;
+	p256_mulgen(&Q, x, xlen);
+	p256_to_affine(&Q);
+	p256_encode(R, &Q);
+
+	/*
+	 * An encoded point has the same length as the encoded generator
+	 * (65 bytes).
+	 */
+	return sizeof P256_G;
+}
+
+/*
+ * Compute x*A + y*B, and write the encoded result over A. If B is
+ * NULL, then the curve generator is used in its place. A and B (when
+ * not NULL) are encoded points of 'len' bytes each; x and y are
+ * integers (xlen and ylen bytes, unsigned big-endian). The 'curve'
+ * parameter is ignored.
+ *
+ * Points must be in uncompressed format, hence len must be exactly 65.
+ * Returned value is 1 on success, 0 on error. Errors are: a wrong
+ * length, a source point found invalid by p256_decode(), or a result
+ * equal to the point at infinity (which cannot be encoded).
+ *
+ * If len is not 65, then 0 is returned immediately and A[] is left
+ * untouched. p256_decode() rejects such a length without initialising
+ * the point structure, and the final encoding step always writes 65
+ * bytes, so going on would both read indeterminate data and overflow
+ * a shorter A[] buffer.
+ */
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	p256_jacobian P, Q;
+	uint32_t r, t, z;
+	int i;
+
+	(void)curve;
+	if (len != 65) {
+		return 0;
+	}
+	r = p256_decode(&P, A, len);
+	p256_mul(&P, x, xlen);
+	if (B == NULL) {
+		p256_mulgen(&Q, y, ylen);
+	} else {
+		r &= p256_decode(&Q, B, len);
+		p256_mul(&Q, y, ylen);
+	}
+
+	/*
+	 * The final addition may fail in case both points are equal.
+	 * We detect a failed addition with z (set to 1 if the sum has a
+	 * zero z coordinate), and we unconditionally compute 2*Q so that
+	 * it is available for the "double" case.
+	 */
+	t = p256_add(&P, &Q);
+	reduce_final_f256(P.z);
+	z = 0;
+	for (i = 0; i < 20; i ++) {
+		z |= P.z[i];
+	}
+	z = EQ(z, 0);
+	p256_double(&Q);
+
+	/*
+	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+	 * have the following:
+	 *
+	 *   z = 0, t = 0   return P (normal addition)
+	 *   z = 0, t = 1   return P (normal addition)
+	 *   z = 1, t = 0   return Q (a 'double' case)
+	 *   z = 1, t = 1   report an error (P+Q = 0)
+	 */
+	CCOPY(z & ~t, &P, &Q, sizeof Q);
+	p256_to_affine(&P);
+	p256_encode(A, &P);
+	r &= ~(z & t);
+	return r;
+}
+
+/* see bearssl_ec.h
+ *
+ * Implementation descriptor for P-256 with the 13-bit word arithmetic
+ * above. The first field is presumably the bit mask of supported curve
+ * identifiers (only bit 23 is set here; to be confirmed against
+ * bearssl_ec.h); the other fields are the six API functions defined
+ * in this file.
+ */
+const br_ec_impl br_ec_p256_m15 = {
+	(uint32_t)0x00800000,
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_p256_m31.c b/test/monniaux/BearSSL/src/ec/ec_p256_m31.c
new file mode 100644
index 00000000..d57ef7b0
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_p256_m31.c
@@ -0,0 +1,1475 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * ARSH(x, n) and ARSHW(x, n) are arithmetic (sign-extending) right
+ * shifts of a 32-bit and a 64-bit word, respectively. If
+ * BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
+ * that right-shifting a signed negative integer copies the sign bit;
+ * this is "implementation-defined behaviour" (not undefined), which
+ * GCC documents as an arithmetic shift. That variant takes the address
+ * of 'x', hence 'x' must then be an lvalue. Otherwise, the shift is
+ * emulated with unsigned operations only (n must not be 0).
+ */
+#if BR_NO_ARITH_SHIFT
+#define ARSH(x, n)    (((uint32_t)(x) >> (n)) \
+                      | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
+#define ARSHW(x, n)   (((uint64_t)(x) >> (n)) \
+                      | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
+#else
+#define ARSH(x, n)    ((*(int32_t *)&(x)) >> (n))
+#define ARSHW(x, n)   ((*(int64_t *)&(x)) >> (n))
+#endif
+
+/*
+ * Decode a big-endian unsigned integer of 'len' bytes into 30-bit limbs,
+ * least significant limb first. Complete limbs are stored in dst[]; the
+ * leftover top bits, which do not fill a whole limb, are the returned
+ * value.
+ */
+static uint32_t
+be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
+{
+	uint32_t pending = 0;	/* bits not yet emitted */
+	int npending = 0;	/* number of valid bits in 'pending' */
+	size_t u;
+
+	/* Walk the bytes from least significant (last) to most (first). */
+	for (u = len; u > 0; u --) {
+		uint32_t octet = src[u - 1];
+
+		if (npending >= 22) {
+			/* This byte completes a limb; its high bits carry over. */
+			*dst ++ = (pending | (octet << npending)) & 0x3FFFFFFF;
+			pending = octet >> (30 - npending);
+			npending -= 22;
+			continue;
+		}
+		pending |= octet << npending;
+		npending += 8;
+	}
+	return pending;
+}
+
+/*
+ * Encode an integer held as 30-bit limbs (least significant limb first)
+ * into exactly 'len' bytes of unsigned big-endian notation. Every byte
+ * of dst[] is written; limbs are read from src[] as they are needed.
+ */
+static void
+le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
+{
+	uint32_t pending = 0;	/* bits read but not yet written out */
+	int npending = 0;	/* number of valid bits in 'pending' */
+	size_t u;
+
+	/* Bytes are produced from the last one (least significant) up. */
+	for (u = len; u > 0; u --) {
+		uint32_t limb;
+
+		if (npending >= 8) {
+			dst[u - 1] = (unsigned char)pending;
+			pending >>= 8;
+			npending -= 8;
+			continue;
+		}
+		/* Not enough bits left: top up from the next limb. */
+		limb = *src ++;
+		dst[u - 1] = (unsigned char)(pending | (limb << npending));
+		pending = limb >> (8 - npending);
+		npending += 22;
+	}
+}
+
+/*
+ * Multiply two integers a[] and b[], each held as nine 30-bit words
+ * (least significant first), for values up to 2^270-1. The product is
+ * written into d[] as 18 words of 30 bits each.
+ */
+static void
+mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	/*
+	 * Maximum intermediate result is no more than
+	 * 10376293531797946367, which fits in 64 bits. Reason:
+	 *
+	 *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
+	 *   10376293531797946367 < 9663676407 * 2^30
+	 *
+	 * Thus, adding together 9 products of 30-bit integers, with
+	 * a carry of at most 9663676406, yields an integer that fits
+	 * on 64 bits and generates a carry of at most 9663676406.
+	 */
+	uint64_t t[17];
+	uint64_t cc;
+	int i;
+	/* t[k] receives the sum of all products a[i]*b[j] with i + j = k. */
+	t[ 0] = MUL31(a[0], b[0]);
+	t[ 1] = MUL31(a[0], b[1])
+		+ MUL31(a[1], b[0]);
+	t[ 2] = MUL31(a[0], b[2])
+		+ MUL31(a[1], b[1])
+		+ MUL31(a[2], b[0]);
+	t[ 3] = MUL31(a[0], b[3])
+		+ MUL31(a[1], b[2])
+		+ MUL31(a[2], b[1])
+		+ MUL31(a[3], b[0]);
+	t[ 4] = MUL31(a[0], b[4])
+		+ MUL31(a[1], b[3])
+		+ MUL31(a[2], b[2])
+		+ MUL31(a[3], b[1])
+		+ MUL31(a[4], b[0]);
+	t[ 5] = MUL31(a[0], b[5])
+		+ MUL31(a[1], b[4])
+		+ MUL31(a[2], b[3])
+		+ MUL31(a[3], b[2])
+		+ MUL31(a[4], b[1])
+		+ MUL31(a[5], b[0]);
+	t[ 6] = MUL31(a[0], b[6])
+		+ MUL31(a[1], b[5])
+		+ MUL31(a[2], b[4])
+		+ MUL31(a[3], b[3])
+		+ MUL31(a[4], b[2])
+		+ MUL31(a[5], b[1])
+		+ MUL31(a[6], b[0]);
+	t[ 7] = MUL31(a[0], b[7])
+		+ MUL31(a[1], b[6])
+		+ MUL31(a[2], b[5])
+		+ MUL31(a[3], b[4])
+		+ MUL31(a[4], b[3])
+		+ MUL31(a[5], b[2])
+		+ MUL31(a[6], b[1])
+		+ MUL31(a[7], b[0]);
+	t[ 8] = MUL31(a[0], b[8])
+		+ MUL31(a[1], b[7])
+		+ MUL31(a[2], b[6])
+		+ MUL31(a[3], b[5])
+		+ MUL31(a[4], b[4])
+		+ MUL31(a[5], b[3])
+		+ MUL31(a[6], b[2])
+		+ MUL31(a[7], b[1])
+		+ MUL31(a[8], b[0]);
+	t[ 9] = MUL31(a[1], b[8])
+		+ MUL31(a[2], b[7])
+		+ MUL31(a[3], b[6])
+		+ MUL31(a[4], b[5])
+		+ MUL31(a[5], b[4])
+		+ MUL31(a[6], b[3])
+		+ MUL31(a[7], b[2])
+		+ MUL31(a[8], b[1]);
+	t[10] = MUL31(a[2], b[8])
+		+ MUL31(a[3], b[7])
+		+ MUL31(a[4], b[6])
+		+ MUL31(a[5], b[5])
+		+ MUL31(a[6], b[4])
+		+ MUL31(a[7], b[3])
+		+ MUL31(a[8], b[2]);
+	t[11] = MUL31(a[3], b[8])
+		+ MUL31(a[4], b[7])
+		+ MUL31(a[5], b[6])
+		+ MUL31(a[6], b[5])
+		+ MUL31(a[7], b[4])
+		+ MUL31(a[8], b[3]);
+	t[12] = MUL31(a[4], b[8])
+		+ MUL31(a[5], b[7])
+		+ MUL31(a[6], b[6])
+		+ MUL31(a[7], b[5])
+		+ MUL31(a[8], b[4]);
+	t[13] = MUL31(a[5], b[8])
+		+ MUL31(a[6], b[7])
+		+ MUL31(a[7], b[6])
+		+ MUL31(a[8], b[5]);
+	t[14] = MUL31(a[6], b[8])
+		+ MUL31(a[7], b[7])
+		+ MUL31(a[8], b[6]);
+	t[15] = MUL31(a[7], b[8])
+		+ MUL31(a[8], b[7]);
+	t[16] = MUL31(a[8], b[8]);
+
+	/*
+	 * Propagate carries: each d[i] keeps 30 bits, the rest moves up.
+	 */
+	cc = 0;
+	for (i = 0; i < 17; i ++) {
+		uint64_t w;
+
+		w = t[i] + cc;
+		d[i] = (uint32_t)w & 0x3FFFFFFF;
+		cc = w >> 30;
+	}
+	d[17] = (uint32_t)cc;	/* final carry becomes the top word */
+}
+
+/*
+ * Square a 270-bit integer a[], held as nine 30-bit words (least
+ * significant first). Result d[] uses 18 words of 30 bits each.
+ */
+static void
+square9(uint32_t *d, const uint32_t *a)
+{
+	uint64_t t[17];
+	uint64_t cc;
+	int i;
+	/* Cross products a[i]*a[j] (i != j) count twice, hence the "<< 1". */
+	t[ 0] = MUL31(a[0], a[0]);
+	t[ 1] = ((MUL31(a[0], a[1])) << 1);
+	t[ 2] = MUL31(a[1], a[1])
+		+ ((MUL31(a[0], a[2])) << 1);
+	t[ 3] = ((MUL31(a[0], a[3])
+		+ MUL31(a[1], a[2])) << 1);
+	t[ 4] = MUL31(a[2], a[2])
+		+ ((MUL31(a[0], a[4])
+		+ MUL31(a[1], a[3])) << 1);
+	t[ 5] = ((MUL31(a[0], a[5])
+		+ MUL31(a[1], a[4])
+		+ MUL31(a[2], a[3])) << 1);
+	t[ 6] = MUL31(a[3], a[3])
+		+ ((MUL31(a[0], a[6])
+		+ MUL31(a[1], a[5])
+		+ MUL31(a[2], a[4])) << 1);
+	t[ 7] = ((MUL31(a[0], a[7])
+		+ MUL31(a[1], a[6])
+		+ MUL31(a[2], a[5])
+		+ MUL31(a[3], a[4])) << 1);
+	t[ 8] = MUL31(a[4], a[4])
+		+ ((MUL31(a[0], a[8])
+		+ MUL31(a[1], a[7])
+		+ MUL31(a[2], a[6])
+		+ MUL31(a[3], a[5])) << 1);
+	t[ 9] = ((MUL31(a[1], a[8])
+		+ MUL31(a[2], a[7])
+		+ MUL31(a[3], a[6])
+		+ MUL31(a[4], a[5])) << 1);
+	t[10] = MUL31(a[5], a[5])
+		+ ((MUL31(a[2], a[8])
+		+ MUL31(a[3], a[7])
+		+ MUL31(a[4], a[6])) << 1);
+	t[11] = ((MUL31(a[3], a[8])
+		+ MUL31(a[4], a[7])
+		+ MUL31(a[5], a[6])) << 1);
+	t[12] = MUL31(a[6], a[6])
+		+ ((MUL31(a[4], a[8])
+		+ MUL31(a[5], a[7])) << 1);
+	t[13] = ((MUL31(a[5], a[8])
+		+ MUL31(a[6], a[7])) << 1);
+	t[14] = MUL31(a[7], a[7])
+		+ ((MUL31(a[6], a[8])) << 1);
+	t[15] = ((MUL31(a[7], a[8])) << 1);
+	t[16] = MUL31(a[8], a[8]);
+
+	/*
+	 * Propagate carries: each d[i] keeps 30 bits, the rest moves up.
+	 */
+	cc = 0;
+	for (i = 0; i < 17; i ++) {
+		uint64_t w;
+
+		w = t[i] + cc;
+		d[i] = (uint32_t)w & 0x3FFFFFFF;
+		cc = w >> 30;
+	}
+	d[17] = (uint32_t)cc;	/* final carry becomes the top word */
+}
+
+/*
+ * Base field modulus for P-256: p = 2^256 - 2^224 + 2^192 + 2^96 - 1.
+ */
+static const uint32_t F256[] = {
+	/* nine 30-bit words, least significant word first */
+	0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
+	0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
+};
+
+/*
+ * The 'b' curve equation coefficient for P-256 (y^2 = x^3 - 3*x + b).
+ */
+static const uint32_t P256_B[] = {
+	/* nine 30-bit words, least significant word first */
+	0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
+	0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
+};
+
+/*
+ * Addition in the field: d = a + b. Source operands shall fit on 257
+ * bits; output will be lower than twice the modulus. d[] may be a[], b[].
+ */
+static void
+add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t w, cc;
+	int i;
+	/* Plain word-by-word addition with carry propagation. */
+	cc = 0;
+	for (i = 0; i < 9; i ++) {
+		w = a[i] + b[i] + cc;
+		d[i] = w & 0x3FFFFFFF;
+		cc = w >> 30;
+	}
+	w >>= 16;		/* w = part of the sum beyond bit 255 */
+	d[8] &= 0xFFFF;		/* keep the low 256 bits in d[] */
+	d[3] -= w << 6;		/* fold back: - w*2^96 */
+	d[6] -= w << 12;	/*            - w*2^192 */
+	d[7] += w << 14;	/*            + w*2^224 */
+	cc = w;			/*            + w */
+	for (i = 0; i < 9; i ++) {
+		w = d[i] + cc;
+		d[i] = w & 0x3FFFFFFF;
+		cc = ARSH(w, 30);	/* signed: words may have gone negative */
+	}
+}
+
+/*
+ * Subtraction in the field: d = a - b. Source operands shall be smaller
+ * than twice the modulus; the result will fulfil the same property.
+ */
+static void
+sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t w, cc;
+	int i;
+
+	/*
+	 * We really compute a - b + 2*p to make sure that the result is
+	 * positive. 2*p = 2^257 - 2^225 + 2^193 + 2^97 - 2.
+	 */
+	w = a[0] - b[0] - 0x00002;	/* - 2 */
+	d[0] = w & 0x3FFFFFFF;
+	w = a[1] - b[1] + ARSH(w, 30);
+	d[1] = w & 0x3FFFFFFF;
+	w = a[2] - b[2] + ARSH(w, 30);
+	d[2] = w & 0x3FFFFFFF;
+	w = a[3] - b[3] + ARSH(w, 30) + 0x00080;	/* + 2^97 */
+	d[3] = w & 0x3FFFFFFF;
+	w = a[4] - b[4] + ARSH(w, 30);
+	d[4] = w & 0x3FFFFFFF;
+	w = a[5] - b[5] + ARSH(w, 30);
+	d[5] = w & 0x3FFFFFFF;
+	w = a[6] - b[6] + ARSH(w, 30) + 0x02000;	/* + 2^193 */
+	d[6] = w & 0x3FFFFFFF;
+	w = a[7] - b[7] + ARSH(w, 30) - 0x08000;	/* - 2^225 */
+	d[7] = w & 0x3FFFFFFF;
+	w = a[8] - b[8] + ARSH(w, 30) + 0x20000;	/* + 2^257 */
+	d[8] = w & 0xFFFF;
+	w >>= 16;		/* w = part of the value beyond bit 255 */
+	d[8] &= 0xFFFF;		/* no effect: d[8] was masked just above */
+	d[3] -= w << 6;		/* fold back: - w*2^96 */
+	d[6] -= w << 12;	/*            - w*2^192 */
+	d[7] += w << 14;	/*            + w*2^224 */
+	cc = w;			/*            + w */
+	for (i = 0; i < 9; i ++) {
+		w = d[i] + cc;
+		d[i] = w & 0x3FFFFFFF;
+		cc = ARSH(w, 30);
+	}
+}
+
+/*
+ * Compute d = a*b in F256. Source operands shall be less than twice
+ * the modulus; d[] may be the same array as a[] and/or b[].
+ */
+static void
+mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
+{
+	uint32_t t[18];
+	uint64_t s[18];
+	uint64_t cc, x;
+	uint32_t z, c;
+	int i;
+
+	mul9(t, a, b);
+
+	/*
+	 * Modular reduction: each high word is added/subtracted where
+	 * necessary.
+	 *
+	 * The modulus is:
+	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
+	 * Therefore:
+	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+	 *
+	 * For a word x at bit offset n (n >= 256), we have:
+	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
+	 *            - x*2^(n - 160) + x*2^(n-256) mod p
+	 *
+	 * Thus, we can nullify the high word if we reinject it at the
+	 * proper places.
+	 *
+	 * We use 64-bit intermediate words to allow for carries to
+	 * accumulate easily, before performing the final propagation.
+	 */
+	for (i = 0; i < 18; i ++) {
+		s[i] = t[i];
+	}
+
+	for (i = 17; i >= 9; i --) {
+		uint64_t y;
+
+		y = s[i];
+		s[i - 1] += ARSHW(y, 2);		/* + y*2^(n-32), high */
+		s[i - 2] += (y << 28) & 0x3FFFFFFF;	/* + y*2^(n-32), low */
+		s[i - 2] -= ARSHW(y, 4);		/* - y*2^(n-64), high */
+		s[i - 3] -= (y << 26) & 0x3FFFFFFF;	/* - y*2^(n-64), low */
+		s[i - 5] -= ARSHW(y, 10);		/* - y*2^(n-160), high */
+		s[i - 6] -= (y << 20) & 0x3FFFFFFF;	/* - y*2^(n-160), low */
+		s[i - 8] += ARSHW(y, 16);		/* + y*2^(n-256), high */
+		s[i - 9] += (y << 14) & 0x3FFFFFFF;	/* + y*2^(n-256), low */
+	}
+
+	/*
+	 * Carry propagation must be signed. Moreover, we may have overdone
+	 * it a bit, and obtain a negative result.
+	 *
+	 * The loop above ran 9 times; each time, each word was augmented
+	 * by at most one extra word (in absolute value). Thus, the top
+	 * word must in fine fit in 39 bits, so the carry below will fit
+	 * on 9 bits.
+	 */
+	cc = 0;
+	for (i = 0; i < 9; i ++) {
+		x = s[i] + cc;
+		d[i] = (uint32_t)x & 0x3FFFFFFF;
+		cc = ARSHW(x, 30);
+	}
+
+	/*
+	 * All nine words fit on 30 bits, but there may be an extra
+	 * carry for a few bits (at most 9), and that carry may be
+	 * negative. Moreover, we want the result to fit on 257 bits.
+	 * The two lines below ensure that the word in d[] has length
+	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
+	 * significant length of cc is less than 24 bits, so we will be
+	 * able to switch to 32-bit operations.
+	 */
+	cc = ARSHW(x, 16);
+	d[8] &= 0xFFFF;
+
+	/*
+	 * One extra round of reduction, for cc*2^256, which means
+	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
+	 * value. If cc is negative, then it may happen (rarely, but
+	 * not negligibly so) that the result would be negative. In
+	 * order to avoid that, if cc is negative, then we add the
+	 * modulus once. Note that if cc is negative, then propagating
+	 * that carry must yield a value lower than the modulus, so
+	 * adding the modulus once will keep the final result under
+	 * twice the modulus.
+	 */
+	z = (uint32_t)cc;
+	d[3] -= z << 6;
+	d[6] -= (z << 12) & 0x3FFFFFFF;
+	d[7] -= ARSH(z, 18);
+	d[7] += (z << 14) & 0x3FFFFFFF;
+	d[8] += ARSH(z, 16);
+	c = z >> 31;		/* c = 1 if the carry is negative, else 0 */
+	d[0] -= c;		/* the five lines from here add c*p */
+	d[3] += c << 6;
+	d[6] += c << 12;
+	d[7] -= c << 14;
+	d[8] += c << 16;
+	for (i = 0; i < 9; i ++) {
+		uint32_t w;
+
+		w = d[i] + z;
+		d[i] = w & 0x3FFFFFFF;
+		z = ARSH(w, 30);
+	}
+}
+
+/*
+ * Compute d = a*a in F256. Source operand shall be less than twice
+ * the modulus; d[] may be the same array as a[].
+ */
+static void
+square_f256(uint32_t *d, const uint32_t *a)
+{
+	uint32_t t[18];
+	uint64_t s[18];
+	uint64_t cc, x;
+	uint32_t z, c;
+	int i;
+
+	square9(t, a);
+
+	/*
+	 * Modular reduction: each high word is added/subtracted where
+	 * necessary.
+	 *
+	 * The modulus is:
+	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
+	 * Therefore:
+	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+	 *
+	 * For a word x at bit offset n (n >= 256), we have:
+	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
+	 *            - x*2^(n - 160) + x*2^(n-256) mod p
+	 *
+	 * Thus, we can nullify the high word if we reinject it at the
+	 * proper places.
+	 *
+	 * We use 64-bit intermediate words to allow for carries to
+	 * accumulate easily, before performing the final propagation.
+	 */
+	for (i = 0; i < 18; i ++) {
+		s[i] = t[i];
+	}
+
+	for (i = 17; i >= 9; i --) {
+		uint64_t y;
+
+		y = s[i];
+		s[i - 1] += ARSHW(y, 2);		/* + y*2^(n-32), high */
+		s[i - 2] += (y << 28) & 0x3FFFFFFF;	/* + y*2^(n-32), low */
+		s[i - 2] -= ARSHW(y, 4);		/* - y*2^(n-64), high */
+		s[i - 3] -= (y << 26) & 0x3FFFFFFF;	/* - y*2^(n-64), low */
+		s[i - 5] -= ARSHW(y, 10);		/* - y*2^(n-160), high */
+		s[i - 6] -= (y << 20) & 0x3FFFFFFF;	/* - y*2^(n-160), low */
+		s[i - 8] += ARSHW(y, 16);		/* + y*2^(n-256), high */
+		s[i - 9] += (y << 14) & 0x3FFFFFFF;	/* + y*2^(n-256), low */
+	}
+
+	/*
+	 * Carry propagation must be signed. Moreover, we may have overdone
+	 * it a bit, and obtain a negative result.
+	 *
+	 * The loop above ran 9 times; each time, each word was augmented
+	 * by at most one extra word (in absolute value). Thus, the top
+	 * word must in fine fit in 39 bits, so the carry below will fit
+	 * on 9 bits.
+	 */
+	cc = 0;
+	for (i = 0; i < 9; i ++) {
+		x = s[i] + cc;
+		d[i] = (uint32_t)x & 0x3FFFFFFF;
+		cc = ARSHW(x, 30);
+	}
+
+	/*
+	 * All nine words fit on 30 bits, but there may be an extra
+	 * carry for a few bits (at most 9), and that carry may be
+	 * negative. Moreover, we want the result to fit on 257 bits.
+	 * The two lines below ensure that the word in d[] has length
+	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
+	 * significant length of cc is less than 24 bits, so we will be
+	 * able to switch to 32-bit operations.
+	 */
+	cc = ARSHW(x, 16);
+	d[8] &= 0xFFFF;
+
+	/*
+	 * One extra round of reduction, for cc*2^256, which means
+	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
+	 * value. If cc is negative, then it may happen (rarely, but
+	 * not negligibly so) that the result would be negative. In
+	 * order to avoid that, if cc is negative, then we add the
+	 * modulus once. Note that if cc is negative, then propagating
+	 * that carry must yield a value lower than the modulus, so
+	 * adding the modulus once will keep the final result under
+	 * twice the modulus.
+	 */
+	z = (uint32_t)cc;
+	d[3] -= z << 6;
+	d[6] -= (z << 12) & 0x3FFFFFFF;
+	d[7] -= ARSH(z, 18);
+	d[7] += (z << 14) & 0x3FFFFFFF;
+	d[8] += ARSH(z, 16);
+	c = z >> 31;		/* c = 1 if the carry is negative, else 0 */
+	d[0] -= c;		/* the five lines from here add c*p */
+	d[3] += c << 6;
+	d[6] += c << 12;
+	d[7] -= c << 14;
+	d[8] += c << 16;
+	for (i = 0; i < 9; i ++) {
+		uint32_t w;
+
+		w = d[i] + z;
+		d[i] = w & 0x3FFFFFFF;
+		z = ARSH(w, 30);
+	}
+}
+
+/*
+ * Normalise a field element into the 0..p-1 range, p being the P-256
+ * base field modulus. The input must be lower than 2*p. The difference
+ * d-p is computed in all cases; it replaces d[] only if no borrow came
+ * out of the subtraction (i.e. d >= p). Returned value is 1 if the
+ * subtraction was kept, 0 if d[] was already reduced and is unchanged.
+ */
+static uint32_t
+reduce_final_f256(uint32_t *d)
+{
+	uint32_t diff[9];
+	uint32_t borrow, keep;
+	int k;
+
+	borrow = 0;
+	for (k = 0; k < 9; k ++) {
+		uint32_t v;
+
+		v = d[k] - F256[k] - borrow;
+		diff[k] = v & 0x3FFFFFFF;
+		borrow = v >> 31;
+	}
+	keep = borrow ^ 1;
+	CCOPY(keep, d, diff, sizeof diff);
+	return keep;
+}
+
+/*
+ * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
+ * are such that:
+ *   X = x / z^2
+ *   Y = y / z^3
+ * For the point at infinity, z = 0.
+ * Each point thus admits many possible representations.
+ *
+ * Coordinates are represented in arrays of 32-bit integers, each holding
+ * 30 bits of data. Values may also be slightly greater than the modulus,
+ * but they will always be lower than twice the modulus.
+ */
+typedef struct {
+	uint32_t x[9];	/* X numerator; nine 30-bit words, low word first */
+	uint32_t y[9];	/* Y numerator; same format */
+	uint32_t z[9];	/* denominator base; all-zero for the point at infinity */
+} p256_jacobian;
+
+/*
+ * Convert a point to affine coordinates (in place):
+ *  - If the point is the point at infinity, then all three coordinates
+ *    are set to 0.
+ *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
+ *    coordinates are the 'X' and 'Y' affine coordinates.
+ * The coordinates are guaranteed to be lower than the modulus.
+ */
+static void
+p256_to_affine(p256_jacobian *P)
+{
+	uint32_t t1[9], t2[9];
+	int i;
+
+	/*
+	 * Invert z with a modular exponentiation: the modulus is
+	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
+	 * p-2. Exponent bit pattern (from high to low) is:
+	 *  - 32 bits of value 1
+	 *  - 31 bits of value 0
+	 *  - 1 bit of value 1
+	 *  - 96 bits of value 0
+	 *  - 94 bits of value 1
+	 *  - 1 bit of value 0
+	 *  - 1 bit of value 1
+	 * Thus, we precompute z^(2^31-1) to speed things up.
+	 *
+	 * If z = 0 (point at infinity) then the modular exponentiation
+	 * will yield 0, which leads to the expected result (all three
+	 * coordinates set to 0).
+	 */
+
+	/*
+	 * A simple square-and-multiply for z^(2^31-1). We could save about
+	 * two dozen multiplications here with an addition chain, but
+	 * this would require a bit more code, and extra stack buffers.
+	 */
+	memcpy(t1, P->z, sizeof P->z);
+	for (i = 0; i < 30; i ++) {
+		square_f256(t1, t1);
+		mul_f256(t1, t1, P->z);
+	}
+
+	/*
+	 * Square-and-multiply. Apart from the squarings, we have a few
+	 * multiplications to set bits to 1; we multiply by the original z
+	 * for setting 1 bit, and by t1 for setting 31 bits.
+	 */
+	memcpy(t2, P->z, sizeof P->z);
+	for (i = 1; i < 256; i ++) {
+		square_f256(t2, t2);
+		switch (i) {
+		case 31:	/* completes the top run of 32 ones */
+		case 190:	/* the run of 94 ones: 31 bits ... */
+		case 221:	/* ... 31 more bits ... */
+		case 252:	/* ... and 31 more bits (93 so far) */
+			mul_f256(t2, t2, t1);
+			break;
+		case 63:	/* the isolated 1 after 31 zeros */
+		case 253:	/* 94th bit of the run of ones */
+		case 255:	/* lowest exponent bit */
+			mul_f256(t2, t2, P->z);
+			break;
+		}
+	}
+
+	/*
+	 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
+	 */
+	mul_f256(t1, t2, t2);
+	mul_f256(P->x, t1, P->x);
+	mul_f256(t1, t1, t2);
+	mul_f256(P->y, t1, P->y);
+	reduce_final_f256(P->x);
+	reduce_final_f256(P->y);
+
+	/*
+	 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
+	 * this will set z to 1.
+	 */
+	mul_f256(P->z, P->z, t2);
+	reduce_final_f256(P->z);
+}
+
+/*
+ * Double a point in P-256 (in place). This function works for all valid
+ * points, including the point at infinity.
+ */
+static void
+p256_double(p256_jacobian *Q)
+{
+	/*
+	 * Doubling formulas are:
+	 *
+	 *   s = 4*x*y^2
+	 *   m = 3*(x + z^2)*(x - z^2)
+	 *   x' = m^2 - 2*s
+	 *   y' = m*(s - x') - 8*y^4
+	 *   z' = 2*y*z
+	 *
+	 * These formulas work for all points, including points of order 2
+	 * and points at infinity:
+	 *   - If y = 0 then z' = 0. But there is no such point in P-256
+	 *     anyway.
+	 *   - If z = 0 then z' = 0.
+	 */
+	uint32_t t1[9], t2[9], t3[9], t4[9];
+
+	/*
+	 * Compute z^2 in t1.
+	 */
+	square_f256(t1, Q->z);
+
+	/*
+	 * Compute x+z^2 in t2 and x-z^2 in t1.
+	 */
+	add_f256(t2, Q->x, t1);
+	sub_f256(t1, Q->x, t1);
+
+	/*
+	 * Compute m = 3*(x+z^2)*(x-z^2) in t1.
+	 */
+	mul_f256(t3, t1, t2);
+	add_f256(t1, t3, t3);
+	add_f256(t1, t3, t1);
+
+	/*
+	 * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3).
+	 */
+	square_f256(t3, Q->y);
+	add_f256(t3, t3, t3);
+	mul_f256(t2, Q->x, t3);
+	add_f256(t2, t2, t2);
+
+	/*
+	 * Compute x' = m^2 - 2*s.
+	 */
+	square_f256(Q->x, t1);
+	sub_f256(Q->x, Q->x, t2);
+	sub_f256(Q->x, Q->x, t2);
+
+	/*
+	 * Compute z' = 2*y*z (before y is overwritten below).
+	 */
+	mul_f256(t4, Q->y, Q->z);
+	add_f256(Q->z, t4, t4);
+
+	/*
+	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+	 * 2*y^2 in t3.
+	 */
+	sub_f256(t2, t2, Q->x);
+	mul_f256(Q->y, t1, t2);
+	square_f256(t4, t3);		/* (2*y^2)^2 = 4*y^4 */
+	add_f256(t4, t4, t4);		/* 8*y^4 */
+	sub_f256(Q->y, Q->y, t4);
+}
+
+/*
+ * Add point P2 to point P1 (result in P1).
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ *   - If P1 == 0 but P2 != 0
+ *   - If P1 != 0 but P2 == 0
+ *   - If P1 == P2
+ *
+ * In all three cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ *   - P1 and P2 have the same Y coordinate
+ *   - P1 == 0 and P2 == 0
+ *   - The Y coordinate of one of the points is 0 and the other point is
+ *     the point at infinity.
+ *
+ * The third case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
+ * can apply the following:
+ *
+ *   - If the result is not the point at infinity, then it is correct.
+ *   - Otherwise, if the returned value is 1, then this is a case of
+ *     P1+P2 == 0, so the result is indeed the point at infinity.
+ *   - Otherwise, P1 == P2, so a "double" operation should have been
+ *     performed.
+ */
+static uint32_t
+p256_add(p256_jacobian *P1, const p256_jacobian *P2)
+{
+	/*
+	 * Addition formulas are:
+	 *
+	 *   u1 = x1 * z2^2
+	 *   u2 = x2 * z1^2
+	 *   s1 = y1 * z2^3
+	 *   s2 = y2 * z1^3
+	 *   h = u2 - u1
+	 *   r = s2 - s1
+	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
+	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+	 *   z3 = h * z1 * z2
+	 */
+	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
+	uint32_t ret;
+	int i;
+
+	/*
+	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+	 */
+	square_f256(t3, P2->z);
+	mul_f256(t1, P1->x, t3);
+	mul_f256(t4, P2->z, t3);
+	mul_f256(t3, P1->y, t4);
+
+	/*
+	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	 */
+	square_f256(t4, P1->z);
+	mul_f256(t2, P2->x, t4);
+	mul_f256(t5, P1->z, t4);
+	mul_f256(t4, P2->y, t5);
+
+	/*
+	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+	 * We need to test whether r is zero, so we will do some extra
+	 * reduce.
+	 */
+	sub_f256(t2, t2, t1);
+	sub_f256(t4, t4, t3);
+	reduce_final_f256(t4);
+	ret = 0;
+	for (i = 0; i < 9; i ++) {
+		ret |= t4[i];
+	}
+	ret = (ret | -ret) >> 31;	/* 1 if r != 0, 0 if r == 0 */
+
+	/*
+	 * Compute u1*h^2 (in t6) and h^3 (in t5);
+	 */
+	square_f256(t7, t2);
+	mul_f256(t6, t1, t7);
+	mul_f256(t5, t7, t2);
+
+	/*
+	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	 */
+	square_f256(P1->x, t4);
+	sub_f256(P1->x, P1->x, t5);
+	sub_f256(P1->x, P1->x, t6);
+	sub_f256(P1->x, P1->x, t6);
+
+	/*
+	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	 */
+	sub_f256(t6, t6, P1->x);
+	mul_f256(P1->y, t4, t6);
+	mul_f256(t1, t5, t3);
+	sub_f256(P1->y, P1->y, t1);
+
+	/*
+	 * Compute z3 = h*z1*z2.
+	 */
+	mul_f256(t1, P1->z, P2->z);
+	mul_f256(P1->z, t1, t2);
+
+	return ret;
+}
+
+/*
+ * Add point P2 to point P1 (result in P1). This is a specialised
+ * function for the case when P2 is a non-zero point in affine coordinate.
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ *   - If P1 == 0
+ *   - If P1 == P2
+ *
+ * In both cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ *   - P1 and P2 have the same Y coordinate
+ *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
+ *
+ * The second case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 on input, then the caller
+ * can apply the following:
+ *
+ *   - If the result is not the point at infinity, then it is correct.
+ *   - Otherwise, if the returned value is 1, then this is a case of
+ *     P1+P2 == 0, so the result is indeed the point at infinity.
+ *   - Otherwise, P1 == P2, so a "double" operation should have been
+ *     performed.
+ */
+static uint32_t
+p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
+{
+	/*
+	 * Addition formulas are:
+	 *
+	 *   u1 = x1
+	 *   u2 = x2 * z1^2
+	 *   s1 = y1
+	 *   s2 = y2 * z1^3
+	 *   h = u2 - u1
+	 *   r = s2 - s1
+	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
+	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+	 *   z3 = h * z1
+	 */
+	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
+	uint32_t ret;
+	int i;
+
+	/*
+	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+	 */
+	memcpy(t1, P1->x, sizeof t1);
+	memcpy(t3, P1->y, sizeof t3);
+
+	/*
+	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	 */
+	square_f256(t4, P1->z);
+	mul_f256(t2, P2->x, t4);
+	mul_f256(t5, P1->z, t4);
+	mul_f256(t4, P2->y, t5);
+
+	/*
+	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+	 * We need to test whether r is zero, so we will do some extra
+	 * reduce.
+	 */
+	sub_f256(t2, t2, t1);
+	sub_f256(t4, t4, t3);
+	reduce_final_f256(t4);
+	ret = 0;
+	for (i = 0; i < 9; i ++) {
+		ret |= t4[i];
+	}
+	ret = (ret | -ret) >> 31;	/* 1 if r != 0, 0 if r == 0 */
+
+	/*
+	 * Compute u1*h^2 (in t6) and h^3 (in t5);
+	 */
+	square_f256(t7, t2);
+	mul_f256(t6, t1, t7);
+	mul_f256(t5, t7, t2);
+
+	/*
+	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	 */
+	square_f256(P1->x, t4);
+	sub_f256(P1->x, P1->x, t5);
+	sub_f256(P1->x, P1->x, t6);
+	sub_f256(P1->x, P1->x, t6);
+
+	/*
+	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	 */
+	sub_f256(t6, t6, P1->x);
+	mul_f256(P1->y, t4, t6);
+	mul_f256(t1, t5, t3);
+	sub_f256(P1->y, P1->y, t1);
+
+	/*
+	 * Compute z3 = h*z1 (the z coordinate of P2 is not read).
+	 */
+	mul_f256(P1->z, P1->z, t2);
+
+	return ret;
+}
+
+/*
+ * Decode a P-256 point. This function does not support the point at
+ * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
+ */
+static uint32_t
+p256_decode(p256_jacobian *P, const void *src, size_t len)
+{
+	const unsigned char *buf;
+	uint32_t tx[9], ty[9], t1[9], t2[9];
+	uint32_t bad;
+	int i;
+
+	if (len != 65) {
+		return 0;	/* note: *P is left unmodified in that case */
+	}
+	buf = src;
+
+	/*
+	 * First byte must be 0x04 (uncompressed format). We could support
+	 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
+	 * least significant bit of the Y coordinate), but it is explicitly
+	 * forbidden by RFC 5480 (section 2.2).
+	 */
+	bad = NEQ(buf[0], 0x04);
+
+	/*
+	 * Decode the coordinates, and check that they are both lower
+	 * than the modulus (reduce_final_f256() returns 1 otherwise).
+	 */
+	tx[8] = be8_to_le30(tx, buf + 1, 32);
+	ty[8] = be8_to_le30(ty, buf + 33, 32);
+	bad |= reduce_final_f256(tx);
+	bad |= reduce_final_f256(ty);
+
+	/*
+	 * Check curve equation: x^3 - 3*x + b - y^2 must be zero.
+	 */
+	square_f256(t1, tx);
+	mul_f256(t1, tx, t1);		/* x^3 */
+	square_f256(t2, ty);		/* y^2 */
+	sub_f256(t1, t1, tx);
+	sub_f256(t1, t1, tx);
+	sub_f256(t1, t1, tx);		/* x^3 - 3*x */
+	add_f256(t1, t1, P256_B);
+	sub_f256(t1, t1, t2);
+	reduce_final_f256(t1);
+	for (i = 0; i < 9; i ++) {
+		bad |= t1[i];
+	}
+
+	/*
+	 * Copy coordinates to the point structure, with z = 1; this is
+	 * done even when the point was found to be invalid.
+	 */
+	memcpy(P->x, tx, sizeof tx);
+	memcpy(P->y, ty, sizeof ty);
+	memset(P->z, 0, sizeof P->z);
+	P->z[0] = 1;
+	return EQ(bad, 0);
+}
+
+/*
+ * Serialise an affine, valid, non-infinity point into dst[] as 65 bytes:
+ * the 0x04 header byte, then X and Y as 32-byte big-endian integers.
+ */
+static void
+p256_encode(void *dst, const p256_jacobian *P)
+{
+	unsigned char *out = dst;
+	unsigned char *coord = out + 1;
+
+	*out = 0x04;
+	le30_to_be8(coord, 32, P->x);
+	le30_to_be8(coord + 32, 32, P->y);
+}
+
+/*
+ * Multiply a curve point by an integer (x[], xlen bytes, most significant
+ * byte first), in place. The integer is assumed to be lower than the
+ * curve order, and the base point must not be the point at infinity.
+ */
+static void
+p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
+{
+	/*
+	 * qz is a flag that is initially 1, and remains equal to 1
+	 * as long as the point is the point at infinity.
+	 *
+	 * We use a 2-bit window to handle multiplier bits by pairs.
+	 * The precomputed window really is the points P2 and P3.
+	 */
+	uint32_t qz;
+	p256_jacobian P2, P3, Q, T, U;
+
+	/*
+	 * Compute window values: P2 = 2*P and P3 = 3*P.
+	 */
+	P2 = *P;
+	p256_double(&P2);
+	P3 = *P;
+	p256_add(&P3, &P2);
+
+	/*
+	 * We start with Q = 0. We process multiplier bits 2 by 2.
+	 */
+	memset(&Q, 0, sizeof Q);
+	qz = 1;
+	while (xlen -- > 0) {
+		int k;
+
+		for (k = 6; k >= 0; k -= 2) {
+			uint32_t bits;
+			uint32_t bnz;
+
+			p256_double(&Q);
+			p256_double(&Q);
+			T = *P;		/* T = P, 2*P or 3*P, as per 'bits' */
+			U = Q;
+			bits = (*x >> k) & (uint32_t)3;
+			bnz = NEQ(bits, 0);
+			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
+			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
+			p256_add(&U, &T);	/* always computed; kept below or not */
+			CCOPY(bnz & qz, &Q, &T, sizeof Q);	/* Q was 0: Q = T */
+			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);	/* else Q = Q + T */
+			qz &= ~bnz;
+		}
+		x ++;
+	}
+	*P = Q;
+}
+
+/*
+ * Precomputed window: k*G points, where G is the curve generator, and k
+ * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
+ * the point are encoded as 9 words of 30 bits each (little-endian
+ * order).
+ */
+static const uint32_t Gwin[15][18] = {
+	/* Gwin[k-1] holds k*G: words 0..8 are X, words 9..17 are Y. */
+	{ 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
+	  0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
+	  0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
+	  0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
+	  0x10B8BF86, 0x00004FE3 },
+
+	{ 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
+	  0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
+	  0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
+	  0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
+	  0x154436E3, 0x00000777 },
+
+	{ 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
+	  0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
+	  0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
+	  0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
+	  0x19031266, 0x00008734 },
+
+	{ 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
+	  0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
+	  0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
+	  0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
+	  0x15D69318, 0x0000E0F1 },
+
+	{ 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
+	  0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
+	  0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
+	  0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
+	  0x1F6A2412, 0x0000E0C1 },
+
+	{ 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
+	  0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
+	  0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
+	  0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
+	  0x041D0C8D, 0x0000E85C },
+
+	{ 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
+	  0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
+	  0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
+	  0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
+	  0x076F780C, 0x000073EB },
+
+	{ 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
+	  0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
+	  0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
+	  0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
+	  0x332F647A, 0x0000AD5A },
+
+	{ 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
+	  0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
+	  0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
+	  0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
+	  0x11325CB2, 0x00002A27 },
+
+	{ 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
+	  0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
+	  0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
+	  0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
+	  0x18A88A6A, 0x00008786 },
+
+	{ 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
+	  0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
+	  0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
+	  0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
+	  0x0826B331, 0x00009099 },
+
+	{ 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
+	  0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
+	  0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
+	  0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
+	  0x2D1AA70E, 0x00000770 },
+
+	{ 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
+	  0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
+	  0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
+	  0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
+	  0x163353AF, 0x000063BB },
+
+	{ 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
+	  0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
+	  0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
+	  0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
+	  0x3C6ECA7D, 0x0000F599 },
+
+	{ 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
+	  0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
+	  0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
+	  0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
+	  0x0FB8D64B, 0x0000B5B9 }
+};
+
+/*
+ * Constant-time lookup: T <- Gwin[idx - 1] for idx in 1..15, with T->z = 1.
+ */
+static void
+lookup_Gwin(p256_jacobian *T, uint32_t idx)
+{
+	uint32_t xy[18];	/* x words start at xy[0], y words at xy[9] */
+	uint32_t k;
+	size_t u;
+
+	memset(xy, 0, sizeof xy);	/* stays all-zero if no entry matches idx */
+	for (k = 0; k < 15; k ++) {	/* every entry is read, whatever idx is */
+		uint32_t m;
+
+		m = -EQ(idx, k + 1);	/* all-ones iff idx == k + 1 (EQ assumed 0/1) */
+		for (u = 0; u < 18; u ++) {
+			xy[u] |= m & Gwin[k][u];
+		}
+	}
+	memcpy(T->x, &xy[0], sizeof T->x);
+	memcpy(T->y, &xy[9], sizeof T->y);
+	memset(T->z, 0, sizeof T->z);
+	T->z[0] = 1;	/* only the lowest word of z is non-zero */
+}
+
+/*
+ * Set *P to x*G, x being xlen bytes, most significant first. The integer
+ * is assumed non-zero and lower than the curve order.
+ */
+static void
+p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
+{
+	/*
+	 * qz is a flag that is initially 1, and remains equal to 1
+	 * as long as the point is the point at infinity.
+	 *
+	 * We use a 4-bit window to handle multiplier bits by groups
+	 * of 4. The precomputed window is constant static data, with
+	 * points in affine coordinates; we use a constant-time lookup.
+	 */
+	p256_jacobian Q;
+	uint32_t qz;
+
+	memset(&Q, 0, sizeof Q);	/* accumulator starts with all coordinates 0 */
+	qz = 1;
+	while (xlen -- > 0) {
+		int k;
+		unsigned bx;
+
+		bx = *x ++;
+		for (k = 0; k < 2; k ++) {	/* two 4-bit windows per byte */
+			uint32_t bits;
+			uint32_t bnz;
+			p256_jacobian T, U;
+
+			p256_double(&Q);	/* Q <- 16*Q, one window further */
+			p256_double(&Q);
+			p256_double(&Q);
+			p256_double(&Q);
+			bits = (bx >> 4) & 0x0F;	/* high nibble is handled first */
+			bnz = NEQ(bits, 0);
+			lookup_Gwin(&T, bits);
+			U = Q;
+			p256_add_mixed(&U, &T);	/* U = Q + T, always computed */
+			CCOPY(bnz & qz, &Q, &T, sizeof Q);	/* first non-zero window: Q <- T */
+			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);	/* later non-zero window: Q <- U */
+			qz &= ~bnz;	/* cleared once a non-zero window was seen */
+			bx <<= 4;	/* bring the low nibble up for the next round */
+		}
+	}
+	*P = Q;
+}
+
+static const unsigned char P256_G[] = {	/* generator: 0x04, X, Y (65 bytes) */
+	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
+	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
+	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
+	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
+	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
+	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
+	0x68, 0x37, 0xBF, 0x51, 0xF5
+};
+
+static const unsigned char P256_N[] = {	/* curve order, 32 bytes, big-endian */
+	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
+	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
+	0x25, 0x51
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	(void)curve;		/* curve id is not examined */
+	*len = sizeof P256_G;	/* 65: header byte plus two coordinates */
+	return P256_G;		/* static storage; caller must not modify it */
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	(void)curve;		/* curve id is not examined */
+	*len = sizeof P256_N;	/* 32 bytes */
+	return P256_N;		/* static storage; caller must not modify it */
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	(void)curve;	/* curve id is not examined */
+	*len = 32;	/* presumably the byte length of the X coordinate */
+	return 1;	/* presumably X's offset, right after the 0x04 header */
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	uint32_t r;
+	p256_jacobian P;
+
+	(void)curve;	/* curve id is not examined */
+	r = p256_decode(&P, G, Glen);	/* decode status, returned unchanged */
+	p256_mul(&P, x, xlen);		/* runs whatever the value of r */
+	if (Glen >= 65) {	/* presumably: p256_encode writes 65 bytes into G */
+		p256_to_affine(&P);
+		p256_encode(G, &P);	/* result overwrites the input point */
+	}
+	return r;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	p256_jacobian P;
+
+	(void)curve;	/* curve id is not examined */
+	p256_mulgen(&P, x, xlen);	/* P <- x*G using the precomputed window */
+	p256_to_affine(&P);
+	p256_encode(R, &P);
+	return 65;	/* reported length of the encoded point in R */
+
+	/* Unreachable: generic variant (copy G into R, then call api_mul).
+	const unsigned char *G;
+	size_t Glen;
+
+	G = api_generator(curve, &Glen);
+	memcpy(R, G, Glen);
+	api_mul(R, Glen, x, xlen, curve);
+	return Glen;
+	*/
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	p256_jacobian P, Q;
+	uint32_t r, t, z;
+	int i;
+
+	(void)curve;	/* curve id is not examined */
+	r = p256_decode(&P, A, len);
+	p256_mul(&P, x, xlen);		/* P <- x*A */
+	if (B == NULL) {		/* NULL B selects the generator */
+		p256_mulgen(&Q, y, ylen);	/* Q <- y*G */
+	} else {
+		r &= p256_decode(&Q, B, len);
+		p256_mul(&Q, y, ylen);	/* Q <- y*B */
+	}
+
+	/*
+	 * The final addition may fail in case both points are equal.
+	 */
+	t = p256_add(&P, &Q);
+	reduce_final_f256(P.z);	/* presumably canonicalizes z for the zero test */
+	z = 0;
+	for (i = 0; i < 9; i ++) {
+		z |= P.z[i];
+	}
+	z = EQ(z, 0);	/* z = 1 iff every word of P.z is zero */
+	p256_double(&Q);	/* 2*Q: used only when P == Q, but always computed */
+
+	/*
+	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+	 * have the following:
+	 *
+	 *   z = 0, t = 0   return P (normal addition)
+	 *   z = 0, t = 1   return P (normal addition)
+	 *   z = 1, t = 0   return Q (a 'double' case)
+	 *   z = 1, t = 1   report an error (P+Q = 0)
+	 */
+	CCOPY(z & ~t, &P, &Q, sizeof Q);
+	p256_to_affine(&P);
+	p256_encode(A, &P);	/* result overwrites A */
+	r &= ~(z & t);	/* clear the status when P+Q = 0 */
+	return r;
+}
+
+/* see bearssl_ec.h -- implementation table built from the functions above */
+const br_ec_impl br_ec_p256_m31 = {
+	(uint32_t)0x00800000,	/* only bit 23 set; presumably the curve mask */
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_p256_m62.c b/test/monniaux/BearSSL/src/ec/ec_p256_m62.c
new file mode 100644
index 00000000..3bcb95b5
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_p256_m62.c
@@ -0,0 +1,1765 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+static const unsigned char P256_G[] = {	/* generator: 0x04, X, Y (65 bytes) */
+	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
+	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
+	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
+	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
+	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
+	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
+	0x68, 0x37, 0xBF, 0x51, 0xF5
+};
+
+static const unsigned char P256_N[] = {	/* curve order, 32 bytes, big-endian */
+	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
+	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
+	0x25, 0x51
+};
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	(void)curve;		/* curve id is not examined */
+	*len = sizeof P256_G;	/* 65: header byte plus two coordinates */
+	return P256_G;		/* static storage; caller must not modify it */
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	(void)curve;		/* curve id is not examined */
+	*len = sizeof P256_N;	/* 32 bytes */
+	return P256_N;		/* static storage; caller must not modify it */
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	(void)curve;	/* curve id is not examined */
+	*len = 32;	/* byte length of X (point_decode reads 32 bytes at buf + 1) */
+	return 1;	/* X follows the one-byte 0x04 header */
+}
+
+/*
+ * A field element is encoded as five 64-bit integers, in base 2^52,
+ * least significant limb first. Limbs may occasionally exceed 2^52.
+ *
+ * A _partially reduced_ value is such that the following hold:
+ *   - top limb is less than 2^48 + 2^30
+ *   - the other limbs fit on 53 bits each
+ * In particular, such a value is less than twice the modulus p.
+ */
+
+#define BIT(n)   ((uint64_t)1 << (n))	/* 2^n as a 64-bit value */
+#define MASK48   (BIT(48) - BIT(0))	/* low 48 bits set (top limb) */
+#define MASK52   (BIT(52) - BIT(0))	/* low 52 bits set (other limbs) */
+
+/* R = 2^260 mod p (Montgomery representation of 1), 52-bit limbs */
+static const uint64_t F256_R[] = {
+	0x0000000000010, 0xF000000000000, 0xFFFFFFFFFFFFF,
+	0xFFEFFFFFFFFFF, 0x00000000FFFFF
+};
+
+/* Curve equation is y^2 = x^3 - 3*x + B. This constant is B*R mod p
+   (Montgomery representation of B, with R = 2^260), in 52-bit limbs. */
+static const uint64_t P256_B_MONTY[] = {
+	0xDF6229C4BDDFD, 0xCA8843090D89C, 0x212ED6ACF005C,
+	0x83415A220ABF7, 0x0C30061DD4874
+};
+
+/*
+ * Addition in the field: d <- a + b, limb by limb, with no carry
+ * propagation and no reduction; d may be the same array as a and/or b.
+ * On input, limbs may be up to 63 bits each; on output, they will
+ * be up to one bit more than on input.
+ */
+static inline void
+f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+	d[0] = a[0] + b[0];
+	d[1] = a[1] + b[1];
+	d[2] = a[2] + b[2];
+	d[3] = a[3] + b[3];
+	d[4] = a[4] + b[4];
+}
+
+/*
+ * Partially reduce the provided value, in place.
+ * Input: limbs can go up to 61 bits each.
+ * Output: partially reduced.
+ */
+static inline void
+f256_partial_reduce(uint64_t *a)
+{
+	uint64_t w, cc, s;
+
+	/*
+	 * Propagate carries, so that a[0] to a[3] fit on 52 bits each.
+	 */
+	w = a[0];
+	a[0] = w & MASK52;
+	cc = w >> 52;
+	w = a[1] + cc;
+	a[1] = w & MASK52;
+	cc = w >> 52;
+	w = a[2] + cc;
+	a[2] = w & MASK52;
+	cc = w >> 52;
+	w = a[3] + cc;
+	a[3] = w & MASK52;
+	cc = w >> 52;
+	a[4] += cc;
+
+	s = a[4] >> 48;             /* s < 2^14; fold s*2^256 back mod p */
+	a[0] += s;                  /* a[0] < 2^52 + 2^14 */
+	w = a[1] - (s << 44);
+	a[1] = w & MASK52;          /* a[1] < 2^52 */
+	cc = -(w >> 52) & 0xFFF;    /* borrow; cc <= 64 since s < 2^14 */
+	w = a[2] - cc;
+	a[2] = w & MASK52;          /* a[2] < 2^52 */
+	cc = w >> 63;               /* cc = 0 or 1 */
+	w = a[3] - cc - (s << 36);
+	a[3] = w & MASK52;          /* a[3] < 2^52 */
+	cc = w >> 63;               /* cc = 0 or 1 */
+	w = a[4] & MASK48;
+	a[4] = w + (s << 16) - cc;  /* a[4] < 2^48 + 2^30 */
+}
+
+/*
+ * Subtraction in the field: d <- a - b (d may alias a or b).
+ * Input: limbs must fit on 60 bits each; in particular, the complete
+ * integer will be less than 2^268 + 2^217.
+ * Output: partially reduced.
+ */
+static inline void
+f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+	uint64_t t[5], w, s, cc;
+
+	/*
+	 * We compute d = 2^13*p + a - b; this ensures a positive
+	 * intermediate value.
+	 *
+	 * Each individual addition/subtraction may yield a positive or
+	 * negative result; thus, we need to handle a signed carry, thus
+	 * with sign extension. We prefer not to use signed types (int64_t)
+	 * because conversion from unsigned to signed is cumbersome (a
+	 * direct cast with the top bit set is undefined behavior; instead,
+	 * we have to use pointer aliasing, using the guaranteed properties
+	 * of exact-width types, but this requires the compiler to optimize
+	 * away the writes and reads from RAM), and right-shifting a
+	 * signed negative value is implementation-defined. Therefore,
+	 * we use a custom sign extension.
+	 */
+
+	w = a[0] - b[0] - BIT(13);
+	t[0] = w & MASK52;
+	cc = w >> 52;
+	cc |= -(cc & BIT(11));	/* sign-extend the 12-bit carry to 64 bits */
+	w = a[1] - b[1] + cc;
+	t[1] = w & MASK52;
+	cc = w >> 52;
+	cc |= -(cc & BIT(11));
+	w = a[2] - b[2] + cc;
+	t[2] = (w & MASK52) + BIT(5);
+	cc = w >> 52;
+	cc |= -(cc & BIT(11));
+	w = a[3] - b[3] + cc;
+	t[3] = (w & MASK52) + BIT(49);
+	cc = w >> 52;
+	cc |= -(cc & BIT(11));
+	t[4] = (BIT(61) - BIT(29)) + a[4] - b[4] + cc;
+
+	/*
+	 * Perform partial reduction. Rule is:
+	 *  2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+	 *
+	 * At that point:
+	 *    0 <= t[0] <= 2^52 - 1
+	 *    0 <= t[1] <= 2^52 - 1
+	 *    2^5 <= t[2] <= 2^52 + 2^5 - 1
+	 *    2^49 <= t[3] <= 2^52 + 2^49 - 1
+	 *    2^59 < t[4] <= 2^61 + 2^60 - 2^29
+	 *
+	 * Thus, the value 's' (t[4] / 2^48) will be necessarily
+	 * greater than 2048, and less than 12288.
+	 */
+	s = t[4] >> 48;
+
+	d[0] = t[0] + s;             /* d[0] <= 2^52 + 12287 */
+	w = t[1] - (s << 44);
+	d[1] = w & MASK52;           /* d[1] <= 2^52 - 1 */
+	cc = -(w >> 52) & 0xFFF;     /* cc <= 48 */
+	w = t[2] - cc;
+	cc = w >> 63;                /* cc = 0 or 1 */
+	d[2] = w + (cc << 52);       /* d[2] <= 2^52 + 31 */
+	w = t[3] - cc - (s << 36);
+	cc = w >> 63;                /* cc = 0 or 1 */
+	d[3] = w + (cc << 52);       /* d[3] <= 2^52 + 2^49 - 1 */
+	d[4] = (t[4] & MASK48) + (s << 16) - cc;  /* d[4] < 2^48 + 2^30 */
+
+	/*
+	 * If s = 0, then none of the limbs is modified, and there cannot
+	 * be an overflow; if s != 0, then (s << 16) > cc, and there is
+	 * no overflow either.
+	 */
+}
+
+/*
+ * Montgomery multiplication in the field: d <- a*b/2^260 mod p.
+ * Input: limbs must fit on 56 bits each.
+ * Output: partially reduced.
+ */
+static void
+f256_montymul(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+	int i;
+	uint64_t t[5];
+
+	t[0] = 0;
+	t[1] = 0;
+	t[2] = 0;
+	t[3] = 0;
+	t[4] = 0;
+	for (i = 0; i < 5; i ++) {
+		uint64_t x, f, cc, w, s;
+		unsigned __int128 z;
+
+		/*
+		 * Since limbs of a[] and b[] fit on 56 bits each,
+		 * each individual product fits on 112 bits. Also,
+		 * the factor f fits on 52 bits, so f<<48 fits on
+		 * 112 bits too. This guarantees that carries (cc)
+		 * will fit on 62 bits, thus no overflow.
+		 *
+		 * The operations below compute:
+		 *   t <- (t + x*b + f*p) / 2^52
+		 */
+		x = a[i];
+		z = (unsigned __int128)b[0] * (unsigned __int128)x
+			+ (unsigned __int128)t[0];
+		f = (uint64_t)z & MASK52;
+		cc = (uint64_t)(z >> 52);
+		z = (unsigned __int128)b[1] * (unsigned __int128)x
+			+ (unsigned __int128)t[1] + cc
+			+ ((unsigned __int128)f << 44);
+		t[0] = (uint64_t)z & MASK52;
+		cc = (uint64_t)(z >> 52);
+		z = (unsigned __int128)b[2] * (unsigned __int128)x
+			+ (unsigned __int128)t[2] + cc;
+		t[1] = (uint64_t)z & MASK52;
+		cc = (uint64_t)(z >> 52);
+		z = (unsigned __int128)b[3] * (unsigned __int128)x
+			+ (unsigned __int128)t[3] + cc
+			+ ((unsigned __int128)f << 36);
+		t[2] = (uint64_t)z & MASK52;
+		cc = (uint64_t)(z >> 52);
+		z = (unsigned __int128)b[4] * (unsigned __int128)x
+			+ (unsigned __int128)t[4] + cc
+			+ ((unsigned __int128)f << 48)
+			- ((unsigned __int128)f << 16);
+		t[3] = (uint64_t)z & MASK52;
+		t[4] = (uint64_t)(z >> 52);
+
+		/*
+		 * t[4] may be up to 62 bits here; we need to do a
+		 * partial reduction. Note that limbs t[0] to t[3]
+		 * fit on 52 bits each.
+		 */
+		s = t[4] >> 48;             /* s < 2^14 */
+		t[0] += s;                  /* t[0] < 2^52 + 2^14 */
+		w = t[1] - (s << 44);
+		t[1] = w & MASK52;          /* t[1] < 2^52 */
+		cc = -(w >> 52) & 0xFFF;    /* cc <= 64 */
+		w = t[2] - cc;
+		t[2] = w & MASK52;          /* t[2] < 2^52 */
+		cc = w >> 63;               /* cc = 0 or 1 */
+		w = t[3] - cc - (s << 36);
+		t[3] = w & MASK52;          /* t[3] < 2^52 */
+		cc = w >> 63;               /* cc = 0 or 1 */
+		w = t[4] & MASK48;
+		t[4] = w + (s << 16) - cc;  /* t[4] < 2^48 + 2^30 */
+
+		/*
+		 * The final t[4] cannot overflow because cc is 0 or 1,
+		 * and cc can be 1 only if s != 0.
+		 */
+	}
+
+	d[0] = t[0];
+	d[1] = t[1];
+	d[2] = t[2];
+	d[3] = t[3];
+	d[4] = t[4];
+
+#elif BR_UMUL128
+
+	int i;
+	uint64_t t[5];
+
+	t[0] = 0;
+	t[1] = 0;
+	t[2] = 0;
+	t[3] = 0;
+	t[4] = 0;
+	for (i = 0; i < 5; i ++) {
+		uint64_t x, f, cc, w, s, zh, zl;
+		unsigned char k;
+
+		/*
+		 * Since limbs of a[] and b[] fit on 56 bits each,
+		 * each individual product fits on 112 bits. Also,
+		 * the factor f fits on 52 bits, so f<<48 fits on
+		 * 112 bits too. This guarantees that carries (cc)
+		 * will fit on 62 bits, thus no overflow.
+		 *
+		 * The operations below compute:
+		 *   t <- (t + x*b + f*p) / 2^52
+		 */
+		x = a[i];
+		zl = _umul128(b[0], x, &zh);
+		k = _addcarry_u64(0, t[0], zl, &zl);
+		(void)_addcarry_u64(k, 0, zh, &zh);
+		f = zl & MASK52;
+		cc = (zl >> 52) | (zh << 12);
+
+		zl = _umul128(b[1], x, &zh);
+		k = _addcarry_u64(0, t[1], zl, &zl);
+		(void)_addcarry_u64(k, 0, zh, &zh);
+		k = _addcarry_u64(0, cc, zl, &zl);
+		(void)_addcarry_u64(k, 0, zh, &zh);
+		k = _addcarry_u64(0, f << 44, zl, &zl);
+		(void)_addcarry_u64(k, f >> 20, zh, &zh);
+		t[0] = zl & MASK52;
+		cc = (zl >> 52) | (zh << 12);
+
+		zl = _umul128(b[2], x, &zh);
+		k = _addcarry_u64(0, t[2], zl, &zl);
+		(void)_addcarry_u64(k, 0, zh, &zh);
+		k = _addcarry_u64(0, cc, zl, &zl);
+		(void)_addcarry_u64(k, 0, zh, &zh);
+		t[1] = zl & MASK52;
+		cc = (zl >> 52) | (zh << 12);
+
+		zl = _umul128(b[3], x, &zh);
+		k = _addcarry_u64(0, t[3], zl, &zl);
+		(void)_addcarry_u64(k, 0, zh, &zh);
+		k = _addcarry_u64(0, cc, zl, &zl);
+		(void)_addcarry_u64(k, 0, zh, &zh);
+		k = _addcarry_u64(0, f << 36, zl, &zl);
+		(void)_addcarry_u64(k, f >> 28, zh, &zh);
+		t[2] = zl & MASK52;
+		cc = (zl >> 52) | (zh << 12);
+
+		zl = _umul128(b[4], x, &zh);
+		k = _addcarry_u64(0, t[4], zl, &zl);
+		(void)_addcarry_u64(k, 0, zh, &zh);
+		k = _addcarry_u64(0, cc, zl, &zl);
+		(void)_addcarry_u64(k, 0, zh, &zh);
+		k = _addcarry_u64(0, f << 48, zl, &zl);
+		(void)_addcarry_u64(k, f >> 16, zh, &zh);
+		k = _subborrow_u64(0, zl, f << 16, &zl);
+		(void)_subborrow_u64(k, zh, f >> 48, &zh);
+		t[3] = zl & MASK52;
+		t[4] = (zl >> 52) | (zh << 12);
+
+		/*
+		 * t[4] may be up to 62 bits here; we need to do a
+		 * partial reduction. Note that limbs t[0] to t[3]
+		 * fit on 52 bits each.
+		 */
+		s = t[4] >> 48;             /* s < 2^14 */
+		t[0] += s;                  /* t[0] < 2^52 + 2^14 */
+		w = t[1] - (s << 44);
+		t[1] = w & MASK52;          /* t[1] < 2^52 */
+		cc = -(w >> 52) & 0xFFF;    /* cc <= 64 */
+		w = t[2] - cc;
+		t[2] = w & MASK52;          /* t[2] < 2^52 */
+		cc = w >> 63;               /* cc = 0 or 1 */
+		w = t[3] - cc - (s << 36);
+		t[3] = w & MASK52;          /* t[3] < 2^52 */
+		cc = w >> 63;               /* cc = 0 or 1 */
+		w = t[4] & MASK48;
+		t[4] = w + (s << 16) - cc;  /* t[4] < 2^48 + 2^30 */
+
+		/*
+		 * The final t[4] cannot overflow because cc is 0 or 1,
+		 * and cc can be 1 only if s != 0.
+		 */
+	}
+
+	d[0] = t[0];
+	d[1] = t[1];
+	d[2] = t[2];
+	d[3] = t[3];
+	d[4] = t[4];
+
+#endif
+}
+
+/*
+ * Montgomery squaring in the field: d <- a*a/2^260 mod p. Currently a
+ * plain call to f256_montymul(), with the same input/output ranges.
+ * TODO: see if some extra speed can be gained here.
+ */
+static inline void
+f256_montysquare(uint64_t *d, const uint64_t *a)
+{
+	f256_montymul(d, a, a);
+}
+
+/*
+ * Convert to Montgomery representation: d <- a*R mod p, R = 2^260.
+ */
+static void
+f256_tomonty(uint64_t *d, const uint64_t *a)
+{
+	/*
+	 * R2 = 2^520 mod p.
+	 * If R = 2^260 mod p, then R2 = R^2 mod p; and the Montgomery
+	 * multiplication of a by R2 is: a*R2/R = a*R mod p, i.e. the
+	 * conversion to Montgomery representation.
+	 */
+	static const uint64_t R2[] = {
+		0x0000000000300, 0xFFFFFFFF00000, 0xFFFFEFFFFFFFB,
+		0xFDFFFFFFFFFFF, 0x0000004FFFFFF
+	};
+
+	f256_montymul(d, a, R2);	/* output is partially reduced */
+}
+
+/*
+ * Convert from Montgomery representation: d <- a/R mod p, R = 2^260.
+ */
+static void
+f256_frommonty(uint64_t *d, const uint64_t *a)
+{
+	/*
+	 * Montgomery multiplication by 1 is division by 2^260 modulo p.
+	 */
+	static const uint64_t one[] = { 1, 0, 0, 0, 0 };
+
+	f256_montymul(d, a, one);	/* output is partially reduced */
+}
+
+/*
+ * Inversion in the field. If the source value is 0 modulo p, then this
+ * returns 0 or p. This function uses Montgomery representation.
+ */
+static void
+f256_invert(uint64_t *d, const uint64_t *a)
+{
+	/*
+	 * We compute a^(p-2) mod p. The exponent pattern (from high to
+	 * low) is:
+	 *  - 32 bits of value 1
+	 *  - 31 bits of value 0
+	 *  - 1 bit of value 1
+	 *  - 96 bits of value 0
+	 *  - 94 bits of value 1
+	 *  - 1 bit of value 0
+	 *  - 1 bit of value 1
+	 * To speed up the square-and-multiply algorithm, we precompute
+	 * a^(2^31-1).
+	 */
+
+	uint64_t r[5], t[5];
+	int i;
+
+	memcpy(t, a, sizeof t);
+	for (i = 0; i < 30; i ++) {	/* after the loop, t = a^(2^31-1) */
+		f256_montysquare(t, t);
+		f256_montymul(t, t, a);
+	}
+
+	memcpy(r, t, sizeof t);	/* r holds the top 31 exponent bits */
+	for (i = 224; i >= 0; i --) {	/* the 225 remaining exponent bits */
+		f256_montysquare(r, r);
+		switch (i) {
+		case 0:
+		case 2:
+		case 192:
+		case 224:
+			f256_montymul(r, r, a);	/* sets a single exponent bit */
+			break;
+		case 3:
+		case 34:
+		case 65:
+			f256_montymul(r, r, t);	/* sets a run of 31 one-bits */
+			break;
+		}
+	}
+	memcpy(d, r, sizeof r);
+}
+
+/*
+ * Finalize reduction (in place, with no data-dependent branch).
+ * Input value should be partially reduced.
+ * On output, limbs a[0] to a[3] fit on 52 bits each, limb a[4] fits
+ * on 48 bits, and the integer is less than p.
+ */
+static inline void
+f256_final_reduce(uint64_t *a)
+{
+	uint64_t r[5], t[5], w, cc;
+	int i;
+
+	/*
+	 * Propagate carries to ensure that limbs 0 to 3 fit on 52 bits.
+	 */
+	cc = 0;
+	for (i = 0; i < 5; i ++) {
+		w = a[i] + cc;
+		r[i] = w & MASK52;
+		cc = w >> 52;
+	}
+
+	/*
+	 * We compute t = r + (2^256 - p) = r + 2^224 - 2^192 - 2^96 + 1.
+	 * If t < 2^256, then r < p, and we return r. Otherwise, we
+	 * want to return r - p = t - 2^256.
+	 */
+
+	/*
+	 * Add 2^224 + 1, and propagate carries to ensure that limbs
+	 * t[0] to t[3] fit in 52 bits each.
+	 */
+	w = r[0] + 1;
+	t[0] = w & MASK52;
+	cc = w >> 52;
+	w = r[1] + cc;
+	t[1] = w & MASK52;
+	cc = w >> 52;
+	w = r[2] + cc;
+	t[2] = w & MASK52;
+	cc = w >> 52;
+	w = r[3] + cc;
+	t[3] = w & MASK52;
+	cc = w >> 52;
+	t[4] = r[4] + cc + BIT(16);
+
+	/*
+	 * Subtract 2^192 + 2^96, chaining each borrow into the next limb.
+	 * Since we just added 2^224 + 1, the result cannot be negative.
+	 */
+	w = t[1] - BIT(44);
+	t[1] = w & MASK52;
+	cc = w >> 63;
+	w = t[2] - cc;
+	t[2] = w & MASK52;
+	cc = w >> 63;
+	w = t[3] - BIT(36) - cc;	/* cc is the borrow out of t[2] */
+	t[3] = w & MASK52;
+	cc = w >> 63;
+	t[4] -= cc;
+
+	/*
+	 * If the top limb t[4] fits on 48 bits, then r[] is already
+	 * in the proper range. Otherwise, t[] is the value to return
+	 * (truncated to 256 bits).
+	 */
+	cc = -(t[4] >> 48);	/* all-ones mask iff t >= 2^256 */
+	t[4] &= MASK48;
+	for (i = 0; i < 5; i ++) {
+		a[i] = r[i] ^ (cc & (r[i] ^ t[i]));	/* branch-free select */
+	}
+}
+
+/*
+ * Points in affine and Jacobian coordinates.
+ *
+ *  - In affine coordinates, the point-at-infinity cannot be encoded.
+ *  - Jacobian coordinates (X,Y,Z) correspond to affine (X/Z^2,Y/Z^3);
+ *    if Z = 0 then this is the point-at-infinity.
+ */
+typedef struct {
+	uint64_t x[5];	/* field element: five limbs, base 2^52 */
+	uint64_t y[5];
+} p256_affine;
+
+typedef struct {
+	uint64_t x[5];	/* field element: five limbs, base 2^52 */
+	uint64_t y[5];
+	uint64_t z[5];	/* all limbs zero encodes the point-at-infinity */
+} p256_jacobian;
+
+/*
+ * Decode a field element (32 bytes, unsigned big endian) into a[0..4].
+ */
+static void
+f256_decode(uint64_t *a, const unsigned char *buf)
+{
+	uint64_t w0, w1, w2, w3;
+
+	w3 = br_dec64be(buf +  0);	/* most significant 64 bits */
+	w2 = br_dec64be(buf +  8);
+	w1 = br_dec64be(buf + 16);
+	w0 = br_dec64be(buf + 24);	/* least significant 64 bits */
+	a[0] = w0 & MASK52;
+	a[1] = ((w0 >> 52) | (w1 << 12)) & MASK52;
+	a[2] = ((w1 >> 40) | (w2 << 24)) & MASK52;
+	a[3] = ((w2 >> 28) | (w3 << 36)) & MASK52;
+	a[4] = w3 >> 16;	/* top limb: the remaining 48 bits */
+}
+
+/*
+ * Encode a field element (32 bytes, unsigned big endian). The field
+ * element MUST be fully reduced (no limb carries excess bits).
+ */
+static void
+f256_encode(unsigned char *buf, const uint64_t *a)
+{
+	uint64_t w0, w1, w2, w3;
+
+	w0 = a[0] | (a[1] << 52);	/* repack 52-bit limbs into 64-bit words */
+	w1 = (a[1] >> 12) | (a[2] << 40);
+	w2 = (a[2] >> 24) | (a[3] << 28);
+	w3 = (a[3] >> 36) | (a[4] << 16);
+	br_enc64be(buf +  0, w3);	/* most significant word comes first */
+	br_enc64be(buf +  8, w2);
+	br_enc64be(buf + 16, w1);
+	br_enc64be(buf + 24, w0);
+}
+
+/*
+ * Decode a point. The returned point is in Jacobian coordinates, but
+ * with z = 1. If the encoding is invalid, or encodes a point which is
+ * not on the curve, or encodes the point at infinity, then this function
+ * returns 0. Otherwise, 1 is returned.
+ *
+ * The buffer is assumed to have length exactly 65 bytes.
+ */
+static uint32_t
+point_decode(p256_jacobian *P, const unsigned char *buf)
+{
+	uint64_t x[5], y[5], t[5], x3[5], tt;
+	uint32_t r;
+
+	/*
+	 * Header byte shall be 0x04.
+	 */
+	r = EQ(buf[0], 0x04);
+
+	/*
+	 * Decode X and Y coordinates, and convert them into
+	 * Montgomery representation.
+	 */
+	f256_decode(x, buf +  1);
+	f256_decode(y, buf + 33);
+	f256_tomonty(x, x);
+	f256_tomonty(y, y);
+
+	/*
+	 * Verify y^2 = x^3 + A*x + B. In curve P-256, A = -3.
+	 * Note that the Montgomery representation of 0 is 0. We must
+	 * take care to apply the final reduction to make sure we have
+	 * 0 and not p.
+	 */
+	f256_montysquare(t, y);
+	f256_montysquare(x3, x);
+	f256_montymul(x3, x3, x);
+	f256_sub(t, t, x3);	/* t = y^2 - x^3 */
+	f256_add(t, t, x);
+	f256_add(t, t, x);
+	f256_add(t, t, x);	/* t = y^2 - x^3 + 3*x */
+	f256_sub(t, t, P256_B_MONTY);	/* t is 0 mod p iff on the curve */
+	f256_final_reduce(t);
+	tt = t[0] | t[1] | t[2] | t[3] | t[4];
+	r &= EQ((uint32_t)(tt | (tt >> 32)), 0);	/* fold 64 bits into 32 */
+
+	/*
+	 * Return the point in Jacobian coordinates (and Montgomery
+	 * representation).
+	 */
+	memcpy(P->x, x, sizeof x);
+	memcpy(P->y, y, sizeof y);
+	memcpy(P->z, F256_R, sizeof F256_R);	/* z = 1, Montgomery form */
+	return r;
+}
+
+/*
+ * Final conversion for a point:
+ *  - The point is converted back to affine coordinates.
+ *  - Final reduction is performed.
+ *  - The point is encoded into the provided buffer.
+ *
+ * If the point is the point-at-infinity, all operations are performed,
+ * but the buffer contents are indeterminate, and 0 is returned. Otherwise,
+ * the encoded point is written in the buffer, and 1 is returned.
+ */
+static uint32_t
+point_encode(unsigned char *buf, const p256_jacobian *P)
+{
+	uint64_t t1[5], t2[5], z;
+
+	/* Set t1 = 1/z^2 and t2 = 1/z^3. */
+	f256_invert(t2, P->z);
+	f256_montysquare(t1, t2);
+	f256_montymul(t2, t2, t1);
+
+	/* Compute affine coordinates x (in t1) and y (in t2). */
+	f256_montymul(t1, P->x, t1);
+	f256_montymul(t2, P->y, t2);
+
+	/* Convert back from Montgomery representation, and finalize
+	   reductions. */
+	f256_frommonty(t1, t1);
+	f256_frommonty(t2, t2);
+	f256_final_reduce(t1);
+	f256_final_reduce(t2);
+
+	/* Encode: 0x04 header, then x and y on 32 bytes each (65 total). */
+	buf[0] = 0x04;
+	f256_encode(buf +  1, t1);
+	f256_encode(buf + 33, t2);
+
+	/* Return success if and only if P->z != 0. */
+	z = P->z[0] | P->z[1] | P->z[2] | P->z[3] | P->z[4];
+	return NEQ((uint32_t)(z | z >> 32), 0);
+}
+
+/*
+ * Point doubling in Jacobian coordinates: point P is doubled.
+ * Note: if the source point is the point-at-infinity, then the result is
+ * still the point-at-infinity, which is correct. Moreover, if the three
+ * coordinates were zero, then they still are zero in the returned value.
+ */
+static void
+p256_double(p256_jacobian *P)
+{
+	/*
+	 * Doubling formulas are:
+	 *
+	 *   s = 4*x*y^2
+	 *   m = 3*(x + z^2)*(x - z^2)
+	 *   x' = m^2 - 2*s
+	 *   y' = m*(s - x') - 8*y^4
+	 *   z' = 2*y*z
+	 *
+	 * These formulas work for all points, including points of order 2
+	 * and points at infinity:
+	 *   - If y = 0 then z' = 0. But there is no such point in P-256
+	 *     anyway.
+	 *   - If z = 0 then z' = 0.
+	 */
+	uint64_t t1[5], t2[5], t3[5], t4[5];
+
+	/*
+	 * Compute z^2 in t1.
+	 */
+	f256_montysquare(t1, P->z);
+
+	/*
+	 * Compute x+z^2 in t2 and x-z^2 in t1.
+	 */
+	f256_add(t2, P->x, t1);
+	f256_sub(t1, P->x, t1);
+
+	/*
+	 * Compute m = 3*(x+z^2)*(x-z^2) in t1.
+	 */
+	f256_montymul(t3, t1, t2);
+	f256_add(t1, t3, t3);
+	f256_add(t1, t3, t1);
+
+	/*
+	 * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3).
+	 */
+	f256_montysquare(t3, P->y);
+	f256_add(t3, t3, t3);
+	f256_montymul(t2, P->x, t3);
+	f256_add(t2, t2, t2);
+
+	/*
+	 * Compute x' = m^2 - 2*s.
+	 */
+	f256_montysquare(P->x, t1);
+	f256_sub(P->x, P->x, t2);
+	f256_sub(P->x, P->x, t2);
+
+	/*
+	 * Compute z' = 2*y*z (P->y still holds the original y here).
+	 */
+	f256_montymul(t4, P->y, P->z);
+	f256_add(P->z, t4, t4);
+	f256_partial_reduce(P->z);
+
+	/*
+	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+	 * 2*y^2 in t3.
+	 */
+	f256_sub(t2, t2, P->x);
+	f256_montymul(P->y, t1, t2);
+	f256_montysquare(t4, t3);
+	f256_add(t4, t4, t4);
+	f256_sub(P->y, P->y, t4);
+}
+
+/*
+ * Point addition (Jacobian coordinates): P1 is replaced with P1+P2.
+ * This function computes the wrong result in the following cases:
+ *
+ *   - If P1 == 0 but P2 != 0
+ *   - If P1 != 0 but P2 == 0
+ *   - If P1 == P2
+ *
+ * In all three cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ *   - P1 and P2 have the same Y coordinate.
+ *   - P1 == 0 and P2 == 0.
+ *   - The Y coordinate of one of the points is 0 and the other point is
+ *     the point at infinity.
+ *
+ * The third case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
+ * can apply the following:
+ *
+ *   - If the result is not the point at infinity, then it is correct.
+ *   - Otherwise, if the returned value is 1, then this is a case of
+ *     P1+P2 == 0, so the result is indeed the point at infinity.
+ *   - Otherwise, P1 == P2, so a "double" operation should have been
+ *     performed.
+ *
+ * Note that you can get a returned value of 0 with a correct result,
+ * e.g. if P1 and P2 have the same Y coordinate, but distinct X coordinates.
+ */
+static uint32_t
+p256_add(p256_jacobian *P1, const p256_jacobian *P2)
+{
+	/*
+	 * Addition formulas are:
+	 *
+	 *   u1 = x1 * z2^2
+	 *   u2 = x2 * z1^2
+	 *   s1 = y1 * z2^3
+	 *   s2 = y2 * z1^3
+	 *   h = u2 - u1
+	 *   r = s2 - s1
+	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
+	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+	 *   z3 = h * z1 * z2
+	 */
+	uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt;
+	uint32_t ret;
+
+	/*
+	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+	 */
+	f256_montysquare(t3, P2->z);
+	f256_montymul(t1, P1->x, t3);
+	f256_montymul(t4, P2->z, t3);
+	f256_montymul(t3, P1->y, t4);
+
+	/*
+	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	 */
+	f256_montysquare(t4, P1->z);
+	f256_montymul(t2, P2->x, t4);
+	f256_montymul(t5, P1->z, t4);
+	f256_montymul(t4, P2->y, t5);
+
+	/*
+	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+	 * We need to test whether r is zero, so we will do some extra
+	 * reduce.
+	 */
+	f256_sub(t2, t2, t1);
+	f256_sub(t4, t4, t3);
+	f256_final_reduce(t4);
+	tt = t4[0] | t4[1] | t4[2] | t4[3] | t4[4];
+	ret = (uint32_t)(tt | (tt >> 32));
+	ret = (ret | -ret) >> 31;	/* ret = 1 iff r != 0, without branching */
+
+	/*
+	 * Compute u1*h^2 (in t6) and h^3 (in t5);
+	 */
+	f256_montysquare(t7, t2);
+	f256_montymul(t6, t1, t7);
+	f256_montymul(t5, t7, t2);
+
+	/*
+	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	 */
+	f256_montysquare(P1->x, t4);
+	f256_sub(P1->x, P1->x, t5);
+	f256_sub(P1->x, P1->x, t6);
+	f256_sub(P1->x, P1->x, t6);
+
+	/*
+	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	 */
+	f256_sub(t6, t6, P1->x);
+	f256_montymul(P1->y, t4, t6);
+	f256_montymul(t1, t5, t3);
+	f256_sub(P1->y, P1->y, t1);
+
+	/*
+	 * Compute z3 = h*z1*z2.
+	 */
+	f256_montymul(t1, P1->z, P2->z);
+	f256_montymul(P1->z, t1, t2);
+
+	return ret;
+}
+
+/*
+ * Point addition (mixed coordinates): P1 is replaced with P1+P2.
+ * This is a specialised function for the case when P2 is a non-zero point
+ * in affine coordinates.
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ *   - If P1 == 0
+ *   - If P1 == P2
+ *
+ * In both cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ *   - P1 and P2 have the same Y (affine) coordinate.
+ *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
+ *
+ * The second case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 on input, then the caller
+ * can apply the following:
+ *
+ *   - If the result is not the point at infinity, then it is correct.
+ *   - Otherwise, if the returned value is 1, then this is a case of
+ *     P1+P2 == 0, so the result is indeed the point at infinity.
+ *   - Otherwise, P1 == P2, so a "double" operation should have been
+ *     performed.
+ *
+ * Again, a value of 0 may be returned in some cases where the addition
+ * result is correct.
+ */
+static uint32_t
+p256_add_mixed(p256_jacobian *P1, const p256_affine *P2)
+{
+	/*
+	 * Addition formulas are:
+	 *
+	 *   u1 = x1
+	 *   u2 = x2 * z1^2
+	 *   s1 = y1
+	 *   s2 = y2 * z1^3
+	 *   h = u2 - u1
+	 *   r = s2 - s1
+	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
+	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+	 *   z3 = h * z1
+	 */
+	uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt;
+	uint32_t ret;
+
+	/*
+	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+	 */
+	memcpy(t1, P1->x, sizeof t1);
+	memcpy(t3, P1->y, sizeof t3);
+
+	/*
+	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	 */
+	f256_montysquare(t4, P1->z);
+	f256_montymul(t2, P2->x, t4);
+	f256_montymul(t5, P1->z, t4);
+	f256_montymul(t4, P2->y, t5);
+
+	/*
+	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+	 * We need to test whether r is zero, so we will do some extra
+	 * reduce.
+	 */
+	f256_sub(t2, t2, t1);
+	f256_sub(t4, t4, t3);
+	f256_final_reduce(t4);
+	tt = t4[0] | t4[1] | t4[2] | t4[3] | t4[4];
+	ret = (uint32_t)(tt | (tt >> 32));
+	ret = (ret | -ret) >> 31;	/* ret = 1 iff r != 0, without branching */
+
+	/*
+	 * Compute u1*h^2 (in t6) and h^3 (in t5);
+	 */
+	f256_montysquare(t7, t2);
+	f256_montymul(t6, t1, t7);
+	f256_montymul(t5, t7, t2);
+
+	/*
+	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	 */
+	f256_montysquare(P1->x, t4);
+	f256_sub(P1->x, P1->x, t5);
+	f256_sub(P1->x, P1->x, t6);
+	f256_sub(P1->x, P1->x, t6);
+
+	/*
+	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	 */
+	f256_sub(t6, t6, P1->x);
+	f256_montymul(P1->y, t4, t6);
+	f256_montymul(t1, t5, t3);
+	f256_sub(P1->y, P1->y, t1);
+
+	/*
+	 * Compute z3 = h*z1 (no z2 factor in the mixed formulas).
+	 */
+	f256_montymul(P1->z, P1->z, t2);
+
+	return ret;
+}
+
+#if 0
+/* unused */
+/*
+ * Point addition (mixed coordinates, complete): P1 is replaced with P1+P2.
+ * This is a specialised function for the case when P2 is a non-zero point
+ * in affine coordinates. The result stored in P1 is correct in all cases.
+ * NOTE(review): the function is declared uint32_t but has no return
+ * statement; settle the return type before enabling this code.
+ */
+static uint32_t
+p256_add_complete_mixed(p256_jacobian *P1, const p256_affine *P2)
+{
+	/*
+	 * Addition formulas, in the general case, are:
+	 *
+	 *   u1 = x1
+	 *   u2 = x2 * z1^2
+	 *   s1 = y1
+	 *   s2 = y2 * z1^3
+	 *   h = u2 - u1
+	 *   r = s2 - s1
+	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
+	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+	 *   z3 = h * z1
+	 *
+	 * These formulas mishandle the two following cases:
+	 *
+	 *  - If P1 is the point-at-infinity (z1 = 0), then z3 is
+	 *    incorrectly set to 0.
+	 *
+	 *  - If P1 = P2, then u1 = u2 and s1 = s2, and x3, y3 and z3
+	 *    are all set to 0.
+	 *
+	 * However, if P1 + P2 = 0, then u1 = u2 but s1 != s2, and then
+	 * we correctly get z3 = 0 (the point-at-infinity).
+	 *
+	 * To fix the case P1 = 0, we perform at the end a copy of P2
+	 * over P1, conditional to z1 = 0.
+	 *
+	 * For P1 = P2: in that case, both h and r are set to 0, and
+	 * we get x3, y3 and z3 equal to 0. We can test for that
+	 * occurrence to make a mask which will be all-one if P1 = P2,
+	 * or all-zero otherwise; then we can compute the double of P2
+	 * and add it, combined with the mask, to (x3,y3,z3).
+	 *
+	 * Using the doubling formulas in p256_double() on (x2,y2),
+	 * simplifying since P2 is affine (i.e. z2 = 1, implicitly),
+	 * we get:
+	 *   s = 4*x2*y2^2
+	 *   m = 3*(x2 + 1)*(x2 - 1)
+	 *   x' = m^2 - 2*s
+	 *   y' = m*(s - x') - 8*y2^4
+	 *   z' = 2*y2
+	 * which requires only 6 multiplications. Added to the 11
+	 * multiplications of the normal mixed addition in Jacobian
+	 * coordinates, we get a cost of 17 multiplications in total.
+	 */
+	uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt, zz;
+	int i;
+
+	/*
+	 * Set zz to -1 if P1 is the point at infinity, 0 otherwise.
+	 */
+	zz = P1->z[0] | P1->z[1] | P1->z[2] | P1->z[3] | P1->z[4];
+	zz = ((zz | -zz) >> 63) - (uint64_t)1;
+
+	/*
+	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+	 */
+	memcpy(t1, P1->x, sizeof t1);
+	memcpy(t3, P1->y, sizeof t3);
+
+	/*
+	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	 */
+	f256_montysquare(t4, P1->z);
+	f256_montymul(t2, P2->x, t4);
+	f256_montymul(t5, P1->z, t4);
+	f256_montymul(t4, P2->y, t5);
+
+	/*
+	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4); both
+	 * get a final reduction just below, before being tested.
+	 */
+	f256_sub(t2, t2, t1);
+	f256_sub(t4, t4, t3);
+
+	/*
+	 * If both h = 0 and r = 0, then P1 = P2, and we want to set
+	 * the mask tt to -1; otherwise, the mask will be 0.
+	 */
+	f256_final_reduce(t2);
+	f256_final_reduce(t4);
+	tt = t2[0] | t2[1] | t2[2] | t2[3] | t2[4]
+		| t4[0] | t4[1] | t4[2] | t4[3] | t4[4];
+	tt = ((tt | -tt) >> 63) - (uint64_t)1;
+
+	/*
+	 * Compute u1*h^2 (in t6) and h^3 (in t5);
+	 */
+	f256_montysquare(t7, t2);
+	f256_montymul(t6, t1, t7);
+	f256_montymul(t5, t7, t2);
+
+	/*
+	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	 */
+	f256_montysquare(P1->x, t4);
+	f256_sub(P1->x, P1->x, t5);
+	f256_sub(P1->x, P1->x, t6);
+	f256_sub(P1->x, P1->x, t6);
+
+	/*
+	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	 */
+	f256_sub(t6, t6, P1->x);
+	f256_montymul(P1->y, t4, t6);
+	f256_montymul(t1, t5, t3);
+	f256_sub(P1->y, P1->y, t1);
+
+	/*
+	 * Compute z3 = h*z1.
+	 */
+	f256_montymul(P1->z, P1->z, t2);
+
+	/*
+	 * The "double" result, in case P1 = P2.
+	 */
+
+	/*
+	 * Compute z' = 2*y2 (in t1).
+	 */
+	f256_add(t1, P2->y, P2->y);
+	f256_partial_reduce(t1);
+
+	/*
+	 * Compute 2*(y2^2) (in t2) and s = 4*x2*(y2^2) (in t3).
+	 */
+	f256_montysquare(t2, P2->y);
+	f256_add(t2, t2, t2);
+	f256_add(t3, t2, t2);
+	f256_montymul(t3, P2->x, t3);
+
+	/*
+	 * Compute m = 3*(x2^2 - 1) (in t4).
+	 */
+	f256_montysquare(t4, P2->x);
+	f256_sub(t4, t4, F256_R);
+	f256_add(t5, t4, t4);
+	f256_add(t4, t4, t5);
+
+	/*
+	 * Compute x' = m^2 - 2*s (in t5).
+	 */
+	f256_montysquare(t5, t4);
+	f256_sub(t5, t5, t3);
+	f256_sub(t5, t5, t3);
+
+	/*
+	 * Compute y' = m*(s - x') - 8*y2^4 (in t6).
+	 */
+	f256_sub(t6, t3, t5);
+	f256_montymul(t6, t6, t4);
+	f256_montysquare(t7, t2);
+	f256_sub(t6, t6, t7);
+	f256_sub(t6, t6, t7);
+
+	/*
+	 * We now have the alternate (doubling) coordinates in (t5,t6,t1).
+	 * We combine them with (x3,y3,z3).
+	 */
+	for (i = 0; i < 5; i ++) {
+		P1->x[i] |= tt & t5[i];
+		P1->y[i] |= tt & t6[i];
+		P1->z[i] |= tt & t1[i];
+	}
+
+	/*
+	 * If P1 = 0, then we get z3 = 0 (which is invalid); if z1 is 0,
+	 * then we want to replace the result with a copy of P2. The
+	 * test on z1 was done at the start, in the zz mask.
+	 */
+	for (i = 0; i < 5; i ++) {
+		P1->x[i] ^= zz & (P1->x[i] ^ P2->x[i]);
+		P1->y[i] ^= zz & (P1->y[i] ^ P2->y[i]);
+		P1->z[i] ^= zz & (P1->z[i] ^ F256_R[i]);
+	}
+}
+#endif
+
+/*
+ * Inner function for computing a point multiplication. A window is
+ * provided, with points 1*P to 15*P in affine coordinates (W[n] = (n+1)*P).
+ * k[] is consumed 4 bits at a time, high nibble first; result goes to *R.
+ * Assumptions:
+ *  - All provided points are valid points on the curve.
+ *  - Multiplier is non-zero, and smaller than the curve order.
+ *  - Everything is in Montgomery representation.
+ */
+static void
+point_mul_inner(p256_jacobian *R, const p256_affine *W,
+	const unsigned char *k, size_t klen)
+{
+	p256_jacobian Q;
+	uint32_t qz;
+
+	memset(&Q, 0, sizeof Q);
+	qz = 1;	/* 1 for as long as Q is still all-zeros */
+	while (klen -- > 0) {
+		int i;
+		unsigned bk;
+
+		bk = *k ++;
+		for (i = 0; i < 2; i ++) {	/* two 4-bit digits per byte */
+			uint32_t bits;
+			uint32_t bnz;
+			p256_affine T;
+			p256_jacobian U;
+			uint32_t n;
+			int j;
+			uint64_t m;
+
+			p256_double(&Q);
+			p256_double(&Q);
+			p256_double(&Q);
+			p256_double(&Q);
+			bits = (bk >> 4) & 0x0F;
+			bnz = NEQ(bits, 0);
+
+			/*
+			 * Lookup point in window. If the bits are 0,
+			 * we get something invalid, which is not a
+			 * problem because we will use it only if the
+			 * bits are non-zero. All 15 entries are read,
+			 * whatever the value of the bits.
+			 */
+			memset(&T, 0, sizeof T);
+			for (n = 0; n < 15; n ++) {
+				m = -(uint64_t)EQ(bits, n + 1);
+				T.x[0] |= m & W[n].x[0];
+				T.x[1] |= m & W[n].x[1];
+				T.x[2] |= m & W[n].x[2];
+				T.x[3] |= m & W[n].x[3];
+				T.x[4] |= m & W[n].x[4];
+				T.y[0] |= m & W[n].y[0];
+				T.y[1] |= m & W[n].y[1];
+				T.y[2] |= m & W[n].y[2];
+				T.y[3] |= m & W[n].y[3];
+				T.y[4] |= m & W[n].y[4];
+			}
+
+			U = Q;
+			p256_add_mixed(&U, &T);	/* returned flag is not used here */
+
+			/*
+			 * If qz is still 1, then Q was all-zeros, and this
+			 * is conserved through p256_double(). In that case
+			 * (and if the bits are non-zero) Q is set to T, with
+			 * F256_R as Z coordinate.
+			 */
+			m = -(uint64_t)(bnz & qz);
+			for (j = 0; j < 5; j ++) {
+				Q.x[j] ^= m & (Q.x[j] ^ T.x[j]);
+				Q.y[j] ^= m & (Q.y[j] ^ T.y[j]);
+				Q.z[j] ^= m & (Q.z[j] ^ F256_R[j]);
+			}
+			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
+			qz &= ~bnz;	/* cleared by the first non-zero digit */
+			bk <<= 4;	/* bring the low nibble in position for the next pass */
+		}
+	}
+	*R = Q;
+}
+
+/*
+ * Convert a window from Jacobian to affine coordinates. A single
+ * field inversion is used. This function works for windows up to
+ * 32 elements.
+ *
+ * The destination array (aff[]) and the source array (jac[]) may
+ * overlap, provided that the start of aff[] is not after the start of
+ * jac[]. Even if the arrays do _not_ overlap, the source array is
+ * modified.
+ */
+static void
+window_to_affine(p256_affine *aff, p256_jacobian *jac, int num)
+{
+	/*
+	 * Convert the window points to affine coordinates. We use the
+	 * following trick to mutualize the inversion computation: if
+	 * we have z1, z2, z3, and z4, and want to invert all of them,
+	 * we compute u = 1/(z1*z2*z3*z4), and then we have:
+	 *   1/z1 = u*z2*z3*z4
+	 *   1/z2 = u*z1*z3*z4
+	 *   1/z3 = u*z1*z2*z4
+	 *   1/z4 = u*z1*z2*z3
+	 *
+	 * The partial products are computed recursively:
+	 *
+	 *  - on input (z_1,z_2), return (z_2,z_1) and z_1*z_2
+	 *  - on input (z_1,z_2,... z_n):
+	 *       recurse on (z_1,z_2,... z_(n/2)) -> r1 and m1
+	 *       recurse on (z_(n/2+1),z_(n/2+2)... z_n) -> r2 and m2
+	 *       multiply elements of r1 by m2 -> s1
+	 *       multiply elements of r2 by m1 -> s2
+	 *       return r1||r2 and m1*m2
+	 *
+	 * In the example below, we suppose that we have 14 elements.
+	 * Let z1, z2,... zE be the 14 values to invert (index noted in
+	 * hexadecimal, starting at 1).
+	 *
+	 *  - Depth 1:
+	 *      swap(z1, z2); z12 = z1*z2
+	 *      swap(z3, z4); z34 = z3*z4
+	 *      swap(z5, z6); z56 = z5*z6
+	 *      swap(z7, z8); z78 = z7*z8
+	 *      swap(z9, zA); z9A = z9*zA
+	 *      swap(zB, zC); zBC = zB*zC
+	 *      swap(zD, zE); zDE = zD*zE
+	 *
+	 *  - Depth 2:
+	 *      z1 <- z1*z34, z2 <- z2*z34, z3 <- z3*z12, z4 <- z4*z12
+	 *      z1234 = z12*z34
+	 *      z5 <- z5*z78, z6 <- z6*z78, z7 <- z7*z56, z8 <- z8*z56
+	 *      z5678 = z56*z78
+	 *      z9 <- z9*zBC, zA <- zA*zBC, zB <- zB*z9A, zC <- zC*z9A
+	 *      z9ABC = z9A*zBC
+	 *
+	 *  - Depth 3:
+	 *      z1 <- z1*z5678, z2 <- z2*z5678, z3 <- z3*z5678, z4 <- z4*z5678
+	 *      z5 <- z5*z1234, z6 <- z6*z1234, z7 <- z7*z1234, z8 <- z8*z1234
+	 *      z12345678 = z1234*z5678
+	 *      z9 <- z9*zDE, zA <- zA*zDE, zB <- zB*zDE, zC <- zC*zDE
+	 *      zD <- zD*z9ABC, zE <- zE*z9ABC
+	 *      z9ABCDE = z9ABC*zDE
+	 *
+	 *  - Depth 4:
+	 *      multiply z1..z8 by z9ABCDE
+	 *      multiply z9..zE by z12345678
+	 *      final z = z12345678*z9ABCDE
+	 */
+
+	uint64_t z[16][5];	/* partial products; z[13..15] are also used as scratch */
+	int i, k, s;
+#define zt   (z[15])
+#define zu   (z[14])
+#define zv   (z[13])
+
+	/*
+	 * First recursion step (pairwise swapping and multiplication).
+	 * If there is an odd number of elements, then we "invent" an
+	 * extra one with coordinate Z = 1 (in Montgomery representation).
+	 */
+	for (i = 0; (i + 1) < num; i += 2) {
+		memcpy(zt, jac[i].z, sizeof zt);
+		memcpy(jac[i].z, jac[i + 1].z, sizeof zt);
+		memcpy(jac[i + 1].z, zt, sizeof zt);
+		f256_montymul(z[i >> 1], jac[i].z, jac[i + 1].z);
+	}
+	if ((num & 1) != 0) {
+		memcpy(z[num >> 1], jac[num - 1].z, sizeof zt);
+		memcpy(jac[num - 1].z, F256_R, sizeof F256_R);
+	}
+
+	/*
+	 * Perform further recursion steps. At the entry of each step,
+	 * the process has been done for groups of 's' points. The
+	 * integer k is the log2 of s.
+	 */
+	for (k = 1, s = 2; s < num; k ++, s <<= 1) {
+		int n;
+
+		for (i = 0; i < num; i ++) {
+			f256_montymul(jac[i].z, jac[i].z, z[(i >> k) ^ 1]);
+		}
+		n = (num + s - 1) >> k;	/* number of groups at this depth */
+		for (i = 0; i < (n >> 1); i ++) {
+			f256_montymul(z[i], z[i << 1], z[(i << 1) + 1]);
+		}
+		if ((n & 1) != 0) {
+			memmove(z[n >> 1], z[n], sizeof zt);
+		}
+	}
+
+	/*
+	 * Invert the final result, and convert all points: for each
+	 * point, zv receives 1/z, then zu = 1/z^2 and zv = 1/z^3.
+	 */
+	f256_invert(zt, z[0]);
+	for (i = 0; i < num; i ++) {
+		f256_montymul(zv, jac[i].z, zt);
+		f256_montysquare(zu, zv);
+		f256_montymul(zv, zv, zu);
+		f256_montymul(aff[i].x, jac[i].x, zu);
+		f256_montymul(aff[i].y, jac[i].y, zv);
+	}
+}
+
+/*
+ * Multiply the provided point by an integer; the result replaces *P.
+ * Assumptions:
+ *  - Source point is a valid curve point.
+ *  - Source point is not the point-at-infinity.
+ *  - Integer is not 0, and is lower than the curve order.
+ * If these conditions are not met, then the result is indeterminate
+ * (but the process is still constant-time).
+ */
+static void
+p256_mul(p256_jacobian *P, const unsigned char *k, size_t klen)
+{
+	union {
+		p256_affine aff[15];
+		p256_jacobian jac[15];
+	} window;	/* both views share storage; converted in place below */
+	int i;
+
+	/*
+	 * Compute window, in Jacobian coordinates: jac[i - 1] = i*P.
+	 */
+	window.jac[0] = *P;
+	for (i = 2; i < 16; i ++) {
+		window.jac[i - 1] = window.jac[(i >> 1) - 1];
+		if ((i & 1) == 0) {
+			p256_double(&window.jac[i - 1]);	/* i*P = 2*((i/2)*P) */
+		} else {
+			p256_add(&window.jac[i - 1], &window.jac[i >> 1]);	/* i*P = (i>>1)*P + ((i>>1)+1)*P */
+		}
+	}
+
+	/*
+	 * Convert the window points to affine coordinates. Point
+	 * window[0] is the source point, already in affine coordinates.
+	 */
+	window_to_affine(window.aff, window.jac, 15);
+
+	/*
+	 * Perform point multiplication.
+	 */
+	point_mul_inner(P, window.aff, k, klen);
+}
+
+/*
+ * Precomputed window for the conventional generator: P256_Gwin[n]
+ * contains (n+1)*G (affine coordinates, in Montgomery representation).
+ */
+static const p256_affine P256_Gwin[] = {
+	{	/* [0]: 1*G */
+		{ 0x30D418A9143C1, 0xC4FEDB60179E7, 0x62251075BA95F,
+		  0x5C669FB732B77, 0x08905F76B5375 },
+		{ 0x5357CE95560A8, 0x43A19E45CDDF2, 0x21F3258B4AB8E,
+		  0xD8552E88688DD, 0x0571FF18A5885 }
+	},
+	{	/* [1]: 2*G */
+		{ 0x46D410DDD64DF, 0x0B433827D8500, 0x1490D9AA6AE3C,
+		  0xA3A832205038D, 0x06BB32E52DCF3 },
+		{ 0x48D361BEE1A57, 0xB7B236FF82F36, 0x042DBE152CD7C,
+		  0xA3AA9A8FB0E92, 0x08C577517A5B8 }
+	},
+	{	/* [2]: 3*G */
+		{ 0x3F904EEBC1272, 0x9E87D81FBFFAC, 0xCBBC98B027F84,
+		  0x47E46AD77DD87, 0x06936A3FD6FF7 },
+		{ 0x5C1FC983A7EBD, 0xC3861FE1AB04C, 0x2EE98E583E47A,
+		  0xC06A88208311A, 0x05F06A2AB587C }
+	},
+	{	/* [3]: 4*G */
+		{ 0xB50D46918DCC5, 0xD7623C17374B0, 0x100AF24650A6E,
+		  0x76ABCDAACACE8, 0x077362F591B01 },
+		{ 0xF24CE4CBABA68, 0x17AD6F4472D96, 0xDDD22E1762847,
+		  0x862EB6C36DEE5, 0x04B14C39CC5AB }
+	},
+	{	/* [4]: 5*G */
+		{ 0x8AAEC45C61F5C, 0x9D4B9537DBE1B, 0x76C20C90EC649,
+		  0x3C7D41CB5AAD0, 0x0907960649052 },
+		{ 0x9B4AE7BA4F107, 0xF75EB882BEB30, 0x7A1F6873C568E,
+		  0x915C540A9877E, 0x03A076BB9DD1E }
+	},
+	{	/* [5]: 6*G */
+		{ 0x47373E77664A1, 0xF246CEE3E4039, 0x17A3AD55AE744,
+		  0x673C50A961A5B, 0x03074B5964213 },
+		{ 0x6220D377E44BA, 0x30DFF14B593D3, 0x639F11299C2B5,
+		  0x75F5424D44CEF, 0x04C9916DEA07F }
+	},
+	{	/* [6]: 7*G */
+		{ 0x354EA0173B4F1, 0x3C23C00F70746, 0x23BB082BD2021,
+		  0xE03E43EAAB50C, 0x03BA5119D3123 },
+		{ 0xD0303F5B9D4DE, 0x17DA67BDD2847, 0xC941956742F2F,
+		  0x8670F933BDC77, 0x0AEDD9164E240 }
+	},
+	{	/* [7]: 8*G */
+		{ 0x4CD19499A78FB, 0x4BF9B345527F1, 0x2CFC6B462AB5C,
+		  0x30CDF90F02AF0, 0x0763891F62652 },
+		{ 0xA3A9532D49775, 0xD7F9EBA15F59D, 0x60BBF021E3327,
+		  0xF75C23C7B84BE, 0x06EC12F2C706D }
+	},
+	{	/* [8]: 9*G */
+		{ 0x6E8F264E20E8E, 0xC79A7A84175C9, 0xC8EB00ABE6BFE,
+		  0x16A4CC09C0444, 0x005B3081D0C4E },
+		{ 0x777AA45F33140, 0xDCE5D45E31EB7, 0xB12F1A56AF7BE,
+		  0xF9B2B6E019A88, 0x086659CDFD835 }
+	},
+	{	/* [9]: 10*G */
+		{ 0xDBD19DC21EC8C, 0x94FCF81392C18, 0x250B4998F9868,
+		  0x28EB37D2CD648, 0x0C61C947E4B34 },
+		{ 0x407880DD9E767, 0x0C83FBE080C2B, 0x9BE5D2C43A899,
+		  0xAB4EF7D2D6577, 0x08719A555B3B4 }
+	},
+	{	/* [10]: 11*G */
+		{ 0x260A6245E4043, 0x53E7FDFE0EA7D, 0xAC1AB59DE4079,
+		  0x072EFF3A4158D, 0x0E7090F1949C9 },
+		{ 0x85612B944E886, 0xE857F61C81A76, 0xAD643D250F939,
+		  0x88DAC0DAA891E, 0x089300244125B }
+	},
+	{	/* [11]: 12*G */
+		{ 0x1AA7D26977684, 0x58A345A3304B7, 0x37385EABDEDEF,
+		  0x155E409D29DEE, 0x0EE1DF780B83E },
+		{ 0x12D91CBB5B437, 0x65A8956370CAC, 0xDE6D66170ED2F,
+		  0xAC9B8228CFA8A, 0x0FF57C95C3238 }
+	},
+	{	/* [12]: 13*G */
+		{ 0x25634B2ED7097, 0x9156FD30DCCC4, 0x9E98110E35676,
+		  0x7594CBCD43F55, 0x038477ACC395B },
+		{ 0x2B90C00EE17FF, 0xF842ED2E33575, 0x1F5BC16874838,
+		  0x7968CD06422BD, 0x0BC0876AB9E7B }
+	},
+	{	/* [13]: 14*G */
+		{ 0xA35BB0CF664AF, 0x68F9707E3A242, 0x832660126E48F,
+		  0x72D2717BF54C6, 0x0AAE7333ED12C },
+		{ 0x2DB7995D586B1, 0xE732237C227B5, 0x65E7DBBE29569,
+		  0xBBBD8E4193E2A, 0x052706DC3EAA1 }
+	},
+	{	/* [14]: 15*G */
+		{ 0xD8B7BC60055BE, 0xD76E27E4B72BC, 0x81937003CC23E,
+		  0xA090E337424E4, 0x02AA0E43EAD3D },
+		{ 0x524F6383C45D2, 0x422A41B2540B8, 0x8A4797D766355,
+		  0xDF444EFA6DE77, 0x0042170A9079A }
+	},
+};
+
+/*
+ * Multiply the conventional generator of the curve by the provided
+ * integer. The result is written in *P.
+ *
+ * Assumptions:
+ *  - Integer is not 0, and is lower than the curve order.
+ * If this condition is not met, then the result is indeterminate
+ * (but the process is still constant-time).
+ */
+static void
+p256_mulgen(p256_jacobian *P, const unsigned char *k, size_t klen)
+{
+	point_mul_inner(P, P256_Gwin, k, klen);	/* same ladder as p256_mul(), with the precomputed window */
+}
+
+/*
+ * Return 1 if all of the following hold:
+ *  - klen <= 32
+ *  - k != 0
+ *  - k is lower than the curve order
+ * Otherwise, return 0.
+ *
+ * Constant-time behaviour: only klen may be observable.
+ */
+static uint32_t
+check_scalar(const unsigned char *k, size_t klen)
+{
+	uint32_t z;
+	int32_t c;
+	size_t u;
+
+	if (klen > 32) {
+		return 0;
+	}
+	z = 0;	/* OR of all the bytes of k: zero only when k == 0 */
+	for (u = 0; u < klen; u ++) {
+		z |= k[u];
+	}
+	if (klen == 32) {
+		c = 0;	/* presumably ends up holding the first non-zero byte comparison (EQ0/CMP come from inner.h) */
+		for (u = 0; u < klen; u ++) {
+			c |= -(int32_t)EQ0(c) & CMP(k[u], P256_N[u]);
+		}
+	} else {
+		c = -1;	/* fewer than 32 bytes: handled as "lower than the order" */
+	}
+	return NEQ(z, 0) & LT0(c);
+}
+
+static uint32_t	/* 0 on a bad length or when any of the checks below fails */
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *k, size_t klen, int curve)
+{
+	uint32_t r;	/* AND of the statuses of the individual steps */
+	p256_jacobian P;
+
+	(void)curve;	/* parameter is not used */
+	if (Glen != 65) {
+		return 0;
+	}
+	r = check_scalar(k, klen);
+	r &= point_decode(&P, G);
+	p256_mul(&P, k, klen);	/* performed even if a check failed; r keeps track of the failure */
+	r &= point_encode(G, &P);	/* the result overwrites the input buffer G */
+	return r;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *k, size_t klen, int curve)
+{
+	p256_jacobian P;
+
+	(void)curve;	/* parameter is not used */
+	p256_mulgen(&P, k, klen);	/* the scalar is not validated here; see the assumptions of p256_mulgen() */
+	point_encode(R, &P);	/* the status returned by point_encode() is ignored */
+	return 65;	/* presumably the number of bytes written in R -- confirm against point_encode() */
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * We might want to use Shamir's trick here: make a composite
+	 * window of u*P+v*Q points, to merge the two doubling-ladders
+	 * into one. This, however, has some complications:
+	 *
+	 *  - During the computation, we may hit the point-at-infinity.
+	 *    Thus, we would need p256_add_complete_mixed() (complete
+	 *    formulas for point addition), with a higher cost (17 muls
+	 *    instead of 11).
+	 *
+	 *  - A 4-bit window would be too large, since it would involve
+	 *    16*16-1 = 255 points. For the same window size as in the
+	 *    p256_mul() case, we would need to reduce the window size
+	 *    to 2 bits, and thus perform twice as many non-doubling
+	 *    point additions.
+	 *
+	 *  - The window may itself contain the point-at-infinity, and
+	 *    thus cannot, in all generality, be made of affine points.
+	 *    Instead, we would need to make it a window of points in
+	 *    Jacobian coordinates. Even p256_add_complete_mixed() would
+	 *    be inappropriate.
+	 *
+	 * For these reasons, the code below performs two separate
+	 * point multiplications, then computes the final point addition
+	 * (which is both a "normal" addition, and a doubling, to handle
+	 * all cases).
+	 */
+
+	p256_jacobian P, Q;
+	uint32_t r, t, s;
+	uint64_t z;
+
+	(void)curve;	/* parameter is not used */
+	if (len != 65) {
+		return 0;
+	}
+	r = point_decode(&P, A);
+	p256_mul(&P, x, xlen);
+	if (B == NULL) {
+		p256_mulgen(&Q, y, ylen);	/* no B provided: the generator is used instead */
+	} else {
+		r &= point_decode(&Q, B);
+		p256_mul(&Q, y, ylen);
+	}
+
+	/*
+	 * The final addition may fail in case both points are equal.
+	 */
+	t = p256_add(&P, &Q);
+	f256_final_reduce(P.z);
+	z = P.z[0] | P.z[1] | P.z[2] | P.z[3] | P.z[4];
+	s = EQ((uint32_t)(z | (z >> 32)), 0);	/* s = 1 if the sum has z = 0 */
+	p256_double(&Q);	/* always computed; only kept in the P = Q case */
+
+	/*
+	 * If s is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+	 * have the following:
+	 *
+	 *   s = 0, t = 0   return P (normal addition)
+	 *   s = 0, t = 1   return P (normal addition)
+	 *   s = 1, t = 0   return Q (a 'double' case)
+	 *   s = 1, t = 1   report an error (P+Q = 0)
+	 */
+	CCOPY(s & ~t, &P, &Q, sizeof Q);
+	point_encode(A, &P);	/* the result overwrites A */
+	r &= ~(s & t);
+	return r;
+}
+
+/* see bearssl_ec.h */
+const br_ec_impl br_ec_p256_m62 = {
+	(uint32_t)0x00800000,	/* only bit 23 set; presumably the supported-curves mask -- confirm in bearssl_ec.h */
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd
+};
+
+/* see bearssl_ec.h -- accessor returning the address of the br_ec_p256_m62 implementation */
+const br_ec_impl *
+br_ec_p256_m62_get(void)
+{
+	return &br_ec_p256_m62;
+}
+
+#else
+
+/* see bearssl_ec.h -- variant used when the implementation above is compiled out */
+const br_ec_impl *
+br_ec_p256_m62_get(void)
+{
+	return 0;	/* null pointer: no implementation to hand out */
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/ec/ec_p256_m64.c b/test/monniaux/BearSSL/src/ec/ec_p256_m64.c
new file mode 100644
index 00000000..5a7ea177
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_p256_m64.c
@@ -0,0 +1,1730 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_UMUL128
+#include <intrin.h>
+#endif
+
+static const unsigned char P256_G[] = {	/* generator, as returned by api_generator(): 65 bytes, 0x04 then X then Y */
+	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
+	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
+	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
+	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
+	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
+	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
+	0x68, 0x37, 0xBF, 0x51, 0xF5
+};
+
+static const unsigned char P256_N[] = {	/* curve order, as returned by api_order(): 32 bytes, most significant first */
+	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
+	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
+	0x25, 0x51
+};
+
+static const unsigned char *	/* returns P256_G and stores its size in *len */
+api_generator(int curve, size_t *len)
+{
+	(void)curve;	/* parameter is not used */
+	*len = sizeof P256_G;
+	return P256_G;
+}
+
+static const unsigned char *	/* returns P256_N and stores its size in *len */
+api_order(int curve, size_t *len)
+{
+	(void)curve;	/* parameter is not used */
+	*len = sizeof P256_N;
+	return P256_N;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	(void)curve;	/* parameter is not used */
+	*len = 32;	/* presumably the byte length of the X coordinate -- confirm in bearssl_ec.h */
+	return 1;	/* presumably the offset of X in an encoded point (after the 0x04 byte) */
+}
+
+/*
+ * A field element is encoded as four 64-bit integers, in basis 2^64.
+ * Values may reach up to 2^256-1. Montgomery multiplication is used.
+ */
+
+/* R = 2^256 mod p = 2^224 - 2^192 - 2^96 + 1 (least significant limb first) */
+static const uint64_t F256_R[] = {
+	0x0000000000000001, 0xFFFFFFFF00000000,
+	0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFE
+};
+
+/* Curve equation is y^2 = x^3 - 3*x + B. This constant is B*R mod p
+   (Montgomery representation of B; presumably least significant limb first). */
+static const uint64_t P256_B_MONTY[] = {
+	0xD89CDF6229C4BDDF, 0xACF005CD78843090,
+	0xE5A220ABF7212ED6, 0xDC30061D04874834
+};
+
+/*
+ * Addition in the field: d <- a + b, with the carry out of bit 256 folded back.
+ */
+static inline void
+f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+	unsigned __int128 w;
+	uint64_t t;
+
+	w = (unsigned __int128)a[0] + b[0];
+	d[0] = (uint64_t)w;
+	w = (unsigned __int128)a[1] + b[1] + (w >> 64);
+	d[1] = (uint64_t)w;
+	w = (unsigned __int128)a[2] + b[2] + (w >> 64);
+	d[2] = (uint64_t)w;
+	w = (unsigned __int128)a[3] + b[3] + (w >> 64);
+	d[3] = (uint64_t)w;
+	t = (uint64_t)(w >> 64);	/* carry out of the 256-bit sum */
+
+	/*
+	 * 2^256 = 2^224 - 2^192 - 2^96 + 1 in the field.
+	 */
+	w = (unsigned __int128)d[0] + t;
+	d[0] = (uint64_t)w;
+	w = (unsigned __int128)d[1] + (w >> 64) - (t << 32);
+	d[1] = (uint64_t)w;
+	/* Here, carry "w >> 64" can only be 0 or -1 */
+	w = (unsigned __int128)d[2] - ((w >> 64) & 1);
+	d[2] = (uint64_t)w;
+	/* Again, carry is 0 or -1 */
+	d[3] += (uint64_t)(w >> 64) + (t << 32) - t;
+
+#elif BR_UMUL128
+
+	unsigned char cc;
+	uint64_t t;
+
+	cc = _addcarry_u64(0, a[0], b[0], &d[0]);
+	cc = _addcarry_u64(cc, a[1], b[1], &d[1]);
+	cc = _addcarry_u64(cc, a[2], b[2], &d[2]);
+	cc = _addcarry_u64(cc, a[3], b[3], &d[3]);
+
+	/*
+	 * If there is a carry, then we want to subtract p, which we
+	 * do by adding 2^256 - p.
+	 */
+	t = cc;
+	cc = _addcarry_u64(cc, d[0], 0, &d[0]);	/* low limb of 2^256 - p is 1: added through the carry-in */
+	cc = _addcarry_u64(cc, d[1], -(t << 32), &d[1]);
+	cc = _addcarry_u64(cc, d[2], -t, &d[2]);
+	(void)_addcarry_u64(cc, d[3], (t << 32) - (t << 1), &d[3]);
+
+#endif
+}
+
+/*
+ * Subtraction in the field: d <- a - b, with p added back when a borrow occurs.
+ */
+static inline void
+f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+	unsigned __int128 w;
+	uint64_t t;
+
+	w = (unsigned __int128)a[0] - b[0];
+	d[0] = (uint64_t)w;
+	w = (unsigned __int128)a[1] - b[1] - ((w >> 64) & 1);
+	d[1] = (uint64_t)w;
+	w = (unsigned __int128)a[2] - b[2] - ((w >> 64) & 1);
+	d[2] = (uint64_t)w;
+	w = (unsigned __int128)a[3] - b[3] - ((w >> 64) & 1);
+	d[3] = (uint64_t)w;
+	t = (uint64_t)(w >> 64) & 1;	/* borrow out of the 256-bit subtraction */
+
+	/*
+	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1.
+	 */
+	w = (unsigned __int128)d[0] - t;
+	d[0] = (uint64_t)w;
+	w = (unsigned __int128)d[1] + (t << 32) - ((w >> 64) & 1);
+	d[1] = (uint64_t)w;
+	/* Here, carry "w >> 64" can only be 0 or +1 */
+	w = (unsigned __int128)d[2] + (w >> 64);
+	d[2] = (uint64_t)w;
+	/* Again, carry is 0 or +1 */
+	d[3] += (uint64_t)(w >> 64) - (t << 32) + t;
+
+#elif BR_UMUL128
+
+	unsigned char cc;
+	uint64_t t;
+
+	cc = _subborrow_u64(0, a[0], b[0], &d[0]);
+	cc = _subborrow_u64(cc, a[1], b[1], &d[1]);
+	cc = _subborrow_u64(cc, a[2], b[2], &d[2]);
+	cc = _subborrow_u64(cc, a[3], b[3], &d[3]);
+
+	/*
+	 * If there is a borrow, then we need to add p.
+	 */
+	t = cc;
+	cc = _addcarry_u64(0, d[0], -t, &d[0]);
+	cc = _addcarry_u64(cc, d[1], (-t) >> 32, &d[1]);
+	cc = _addcarry_u64(cc, d[2], 0, &d[2]);
+	(void)_addcarry_u64(cc, d[3], t - (t << 32), &d[3]);
+
+#endif
+}
+
+/*
+ * Montgomery multiplication in the field: d <- a*b/2^256 mod p (fits on 256 bits).
+ */
+static void
+f256_montymul(uint64_t *d, const uint64_t *a, const uint64_t *b)
+{
+#if BR_INT128
+
+	uint64_t x, f, t0, t1, t2, t3, t4;
+	unsigned __int128 z, ff;
+	int i;
+
+	/*
+	 * When computing d <- d + a[u]*b, we also add f*p such
+	 * that d + a[u]*b + f*p is a multiple of 2^64. Since
+	 * p = -1 mod 2^64, we can compute f = d[0] + a[u]*b[0] mod 2^64.
+	 */
+
+	/*
+	 * Step 1: t <- (a[0]*b + f*p) / 2^64
+	 * We have f = a[0]*b[0] mod 2^64. Since p = -1 mod 2^64, this
+	 * ensures that (a[0]*b + f*p) is a multiple of 2^64.
+	 *
+	 * We also have: f*p = f*2^256 - f*2^224 + f*2^192 + f*2^96 - f.
+	 */
+	x = a[0];
+	z = (unsigned __int128)b[0] * x;
+	f = (uint64_t)z;
+	z = (unsigned __int128)b[1] * x + (z >> 64) + (uint64_t)(f << 32);
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)b[2] * x + (z >> 64) + (uint64_t)(f >> 32);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)b[3] * x + (z >> 64) + f;
+	t2 = (uint64_t)z;
+	t3 = (uint64_t)(z >> 64);
+	ff = ((unsigned __int128)f << 64) - ((unsigned __int128)f << 32);
+	z = (unsigned __int128)t2 + (uint64_t)ff;
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)t3 + (z >> 64) + (ff >> 64);
+	t3 = (uint64_t)z;
+	t4 = (uint64_t)(z >> 64);
+
+	/*
+	 * Steps 2 to 4: t <- (t + a[i]*b + f*p) / 2^64
+	 */
+	for (i = 1; i < 4; i ++) {
+		x = a[i];
+
+		/* t <- (t + x*b - f) / 2^64 */
+		z = (unsigned __int128)b[0] * x + t0;
+		f = (uint64_t)z;
+		z = (unsigned __int128)b[1] * x + t1 + (z >> 64);
+		t0 = (uint64_t)z;
+		z = (unsigned __int128)b[2] * x + t2 + (z >> 64);
+		t1 = (uint64_t)z;
+		z = (unsigned __int128)b[3] * x + t3 + (z >> 64);
+		t2 = (uint64_t)z;
+		z = t4 + (z >> 64);
+		t3 = (uint64_t)z;
+		t4 = (uint64_t)(z >> 64);
+
+		/* t <- t + f*2^32, carry in the upper half of z */
+		z = (unsigned __int128)t0 + (uint64_t)(f << 32);
+		t0 = (uint64_t)z;
+		z = (z >> 64) + (unsigned __int128)t1 + (uint64_t)(f >> 32);
+		t1 = (uint64_t)z;
+
+		/* t <- t + f*2^192 - f*2^160 + f*2^128 */
+		ff = ((unsigned __int128)f << 64) 
+			- ((unsigned __int128)f << 32) + f;
+		z = (z >> 64) + (unsigned __int128)t2 + (uint64_t)ff;
+		t2 = (uint64_t)z;
+		z = (unsigned __int128)t3 + (z >> 64) + (ff >> 64);
+		t3 = (uint64_t)z;
+		t4 += (uint64_t)(z >> 64);
+	}
+
+	/*
+	 * At that point, we have computed t = (a*b + F*p) / 2^256, where
+	 * F is a 256-bit integer whose limbs are the "f" coefficients
+	 * in the steps above. We have:
+	 *   a <= 2^256-1
+	 *   b <= 2^256-1
+	 *   F <= 2^256-1
+	 * Hence:
+	 *   a*b + F*p <= (2^256-1)*(2^256-1) + p*(2^256-1)
+	 *   a*b + F*p <= 2^256*(2^256 - 2 + p) + 1 - p
+	 * Therefore:
+	 *   t < 2^256 + p - 2
+	 * Since p < 2^256, it follows that:
+	 *   t4 can be only 0 or 1
+	 *   t - p < 2^256
+	 * We can therefore subtract p from t, conditionally on t4, to
+	 * get a nonnegative result that fits on 256 bits.
+	 */
+	z = (unsigned __int128)t0 + t4;	/* same "+1, -2^96, -2^192, +2^224" pattern as in f256_add() */
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)t1 - (t4 << 32) + (z >> 64);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)t2 - (z >> 127);	/* top bit of z is set when the previous step went negative */
+	t2 = (uint64_t)z;
+	t3 = t3 - (uint64_t)(z >> 127) - t4 + (t4 << 32);
+
+	d[0] = t0;
+	d[1] = t1;
+	d[2] = t2;
+	d[3] = t3;
+
+#elif BR_UMUL128
+
+	uint64_t x, f, t0, t1, t2, t3, t4;
+	uint64_t zl, zh, ffl, ffh;
+	unsigned char k, m;
+	int i;
+
+	/*
+	 * When computing d <- d + a[u]*b, we also add f*p such
+	 * that d + a[u]*b + f*p is a multiple of 2^64. Since
+	 * p = -1 mod 2^64, we can compute f = d[0] + a[u]*b[0] mod 2^64.
+	 */
+
+	/*
+	 * Step 1: t <- (a[0]*b + f*p) / 2^64
+	 * We have f = a[0]*b[0] mod 2^64. Since p = -1 mod 2^64, this
+	 * ensures that (a[0]*b + f*p) is a multiple of 2^64.
+	 *
+	 * We also have: f*p = f*2^256 - f*2^224 + f*2^192 + f*2^96 - f.
+	 */
+	x = a[0];
+
+	zl = _umul128(b[0], x, &zh);
+	f = zl;
+	t0 = zh;
+
+	zl = _umul128(b[1], x, &zh);
+	k = _addcarry_u64(0, zl, t0, &zl);
+	(void)_addcarry_u64(k, zh, 0, &zh);
+	k = _addcarry_u64(0, zl, f << 32, &zl);
+	(void)_addcarry_u64(k, zh, 0, &zh);
+	t0 = zl;
+	t1 = zh;
+
+	zl = _umul128(b[2], x, &zh);
+	k = _addcarry_u64(0, zl, t1, &zl);
+	(void)_addcarry_u64(k, zh, 0, &zh);
+	k = _addcarry_u64(0, zl, f >> 32, &zl);
+	(void)_addcarry_u64(k, zh, 0, &zh);
+	t1 = zl;
+	t2 = zh;
+
+	zl = _umul128(b[3], x, &zh);
+	k = _addcarry_u64(0, zl, t2, &zl);
+	(void)_addcarry_u64(k, zh, 0, &zh);
+	k = _addcarry_u64(0, zl, f, &zl);
+	(void)_addcarry_u64(k, zh, 0, &zh);
+	t2 = zl;
+	t3 = zh;
+
+	t4 = _addcarry_u64(0, t3, f, &t3);
+	k = _subborrow_u64(0, t2, f << 32, &t2);
+	k = _subborrow_u64(k, t3, f >> 32, &t3);
+	(void)_subborrow_u64(k, t4, 0, &t4);
+
+	/*
+	 * Steps 2 to 4: t <- (t + a[i]*b + f*p) / 2^64
+	 */
+	for (i = 1; i < 4; i ++) {
+		x = a[i];
+		/* f = t0 + x * b[0]; -- computed below */
+
+		/* t <- (t + x*b - f) / 2^64 */
+		zl = _umul128(b[0], x, &zh);
+		k = _addcarry_u64(0, zl, t0, &f);
+		(void)_addcarry_u64(k, zh, 0, &t0);
+
+		zl = _umul128(b[1], x, &zh);
+		k = _addcarry_u64(0, zl, t0, &zl);
+		(void)_addcarry_u64(k, zh, 0, &zh);
+		k = _addcarry_u64(0, zl, t1, &t0);
+		(void)_addcarry_u64(k, zh, 0, &t1);
+
+		zl = _umul128(b[2], x, &zh);
+		k = _addcarry_u64(0, zl, t1, &zl);
+		(void)_addcarry_u64(k, zh, 0, &zh);
+		k = _addcarry_u64(0, zl, t2, &t1);
+		(void)_addcarry_u64(k, zh, 0, &t2);
+
+		zl = _umul128(b[3], x, &zh);
+		k = _addcarry_u64(0, zl, t2, &zl);
+		(void)_addcarry_u64(k, zh, 0, &zh);
+		k = _addcarry_u64(0, zl, t3, &t2);
+		(void)_addcarry_u64(k, zh, 0, &t3);
+
+		t4 = _addcarry_u64(0, t3, t4, &t3);
+
+		/* t <- t + f*2^32, carry in k */
+		k = _addcarry_u64(0, t0, f << 32, &t0);
+		k = _addcarry_u64(k, t1, f >> 32, &t1);
+
+		/* t <- t + f*2^192 - f*2^160 + f*2^128 */
+		m = _subborrow_u64(0, f, f << 32, &ffl);
+		(void)_subborrow_u64(m, f, f >> 32, &ffh);
+		k = _addcarry_u64(k, t2, ffl, &t2);
+		k = _addcarry_u64(k, t3, ffh, &t3);
+		(void)_addcarry_u64(k, t4, 0, &t4);
+	}
+
+	/*
+	 * At that point, we have computed t = (a*b + F*p) / 2^256, where
+	 * F is a 256-bit integer whose limbs are the "f" coefficients
+	 * in the steps above. We have:
+	 *   a <= 2^256-1
+	 *   b <= 2^256-1
+	 *   F <= 2^256-1
+	 * Hence:
+	 *   a*b + F*p <= (2^256-1)*(2^256-1) + p*(2^256-1)
+	 *   a*b + F*p <= 2^256*(2^256 - 2 + p) + 1 - p
+	 * Therefore:
+	 *   t < 2^256 + p - 2
+	 * Since p < 2^256, it follows that:
+	 *   t4 can be only 0 or 1
+	 *   t - p < 2^256
+	 * We can therefore subtract p from t, conditionally on t4, to
+	 * get a nonnegative result that fits on 256 bits.
+	 */
+	k = _addcarry_u64(0, t0, t4, &t0);	/* subtracting p is done by adding 2^256 - p, limb by limb */
+	k = _addcarry_u64(k, t1, -(t4 << 32), &t1);
+	k = _addcarry_u64(k, t2, -t4, &t2);
+	(void)_addcarry_u64(k, t3, (t4 << 32) - (t4 << 1), &t3);
+
+	d[0] = t0;
+	d[1] = t1;
+	d[2] = t2;
+	d[3] = t3;
+
+#endif
+}
+
+/*
+ * Montgomery squaring in the field: d <- a*a/2^256 mod p. This is
+ * currently a thin inline wrapper around f256_montymul(d, a, a).
+ * TODO: see if some extra speed can be gained here.
+ */
+static inline void
+f256_montysquare(uint64_t *d, const uint64_t *a)
+{
+	f256_montymul(d, a, a);	/* callers also use this in place (d == a) */
+}
+
+/*
+ * Convert a to Montgomery representation: d <- a*R mod p (R = 2^256).
+ */
+static void
+f256_tomonty(uint64_t *d, const uint64_t *a)
+{
+	/*
+	 * R2 = 2^512 mod p.
+	 * If R = 2^256 mod p, then R2 = R^2 mod p; and the Montgomery
+	 * multiplication of a by R2 is: a*R2/R = a*R mod p, i.e. the
+	 * conversion to Montgomery representation.
+	 */
+	static const uint64_t R2[] = {
+		0x0000000000000003,	/* least significant limb */
+		0xFFFFFFFBFFFFFFFF,
+		0xFFFFFFFFFFFFFFFE,
+		0x00000004FFFFFFFD	/* most significant limb */
+	};
+
+	f256_montymul(d, a, R2);
+}
+
+/*
+ * Convert a from Montgomery representation: d <- a/2^256 mod p.
+ */
+static void
+f256_frommonty(uint64_t *d, const uint64_t *a)
+{
+	/*
+	 * Montgomery multiplication by 1 is division by 2^256 modulo p.
+	 */
+	static const uint64_t one[] = { 1, 0, 0, 0 };	/* plain integer 1 */
+
+	f256_montymul(d, a, one);
+}
+
+/*
+ * Inversion in the field: d <- 1/a mod p. If the source value is 0
+ * modulo p, then this returns 0 or p. Input and output both use
+ * Montgomery representation; d is written only once, at the very end.
+ */
+static void
+f256_invert(uint64_t *d, const uint64_t *a)
+{
+	/*
+	 * We compute a^(p-2) mod p. The exponent pattern (from high to
+	 * low) is:
+	 *  - 32 bits of value 1
+	 *  - 31 bits of value 0
+	 *  - 1 bit of value 1
+	 *  - 96 bits of value 0
+	 *  - 94 bits of value 1
+	 *  - 1 bit of value 0
+	 *  - 1 bit of value 1
+	 * To speed up the square-and-multiply algorithm, we precompute
+	 * a^(2^31-1).
+	 */
+
+	uint64_t r[4], t[4];
+	int i;
+
+	memcpy(t, a, sizeof t);
+	for (i = 0; i < 30; i ++) {	/* afterwards, t = a^(2^31-1) */
+		f256_montysquare(t, t);
+		f256_montymul(t, t, a);
+	}
+
+	memcpy(r, t, sizeof t);	/* r starts with 31 exponent bits set */
+	for (i = 224; i >= 0; i --) {
+		f256_montysquare(r, r);	/* shift the exponent left by one */
+		switch (i) {
+		case 0:
+		case 2:
+		case 192:
+		case 224:
+			f256_montymul(r, r, a);	/* set the low exponent bit */
+			break;
+		case 3:
+		case 34:
+		case 65:
+			f256_montymul(r, r, t);	/* set the 31 low exponent bits */
+			break;
+		}
+	}
+	memcpy(d, r, sizeof r);
+}
+
+/*
+ * Finalize reduction, in place.
+ * Input value fits on 256 bits. This function subtracts p if and only
+ * if the input is greater than or equal to p (branchless, via a mask).
+ */
+static inline void
+f256_final_reduce(uint64_t *a)
+{
+#if BR_INT128
+
+	uint64_t t0, t1, t2, t3, cc;
+	unsigned __int128 z;
+
+	/*
+	 * We add 2^256 - p = 2^224 - 2^192 - 2^96 + 1 to a. If there is
+	 * no carry out of bit 255, then a < p and a is kept; otherwise,
+	 * the (truncated) sum is a - p, the value we must return.
+	 */
+	z = (unsigned __int128)a[0] + 1;
+	t0 = (uint64_t)z;
+	z = (unsigned __int128)a[1] + (z >> 64) - ((uint64_t)1 << 32);
+	t1 = (uint64_t)z;
+	z = (unsigned __int128)a[2] - (z >> 127);
+	t2 = (uint64_t)z;
+	z = (unsigned __int128)a[3] - (z >> 127) + 0xFFFFFFFF;
+	t3 = (uint64_t)z;
+	cc = -(uint64_t)(z >> 64);	/* all-ones on carry out, else 0 */
+
+	a[0] ^= cc & (a[0] ^ t0);	/* a <- t if cc is all-ones */
+	a[1] ^= cc & (a[1] ^ t1);
+	a[2] ^= cc & (a[2] ^ t2);
+	a[3] ^= cc & (a[3] ^ t3);
+
+#elif BR_UMUL128
+
+	uint64_t t0, t1, t2, t3, m;
+	unsigned char k;
+
+	k = _addcarry_u64(0, a[0], (uint64_t)1, &t0);	/* t <- a + 2^256 - p */
+	k = _addcarry_u64(k, a[1], -((uint64_t)1 << 32), &t1);
+	k = _addcarry_u64(k, a[2], -(uint64_t)1, &t2);
+	k = _addcarry_u64(k, a[3], ((uint64_t)1 << 32) - 2, &t3);
+	m = -(uint64_t)k;	/* all-ones on carry out (a >= p), else 0 */
+
+	a[0] ^= m & (a[0] ^ t0);	/* a <- t if m is all-ones */
+	a[1] ^= m & (a[1] ^ t1);
+	a[2] ^= m & (a[2] ^ t2);
+	a[3] ^= m & (a[3] ^ t3);
+
+#endif
+}
+
+/*
+ * Points in affine and Jacobian coordinates. Each coordinate is stored
+ * as four 64-bit limbs, least significant limb first.
+ *  - In affine coordinates, the point-at-infinity cannot be encoded.
+ *  - Jacobian coordinates (X,Y,Z) correspond to affine (X/Z^2,Y/Z^3);
+ *    if Z = 0 then this is the point-at-infinity.
+ */
+typedef struct {
+	uint64_t x[4];
+	uint64_t y[4];
+} p256_affine;
+
+typedef struct {
+	uint64_t x[4];
+	uint64_t y[4];
+	uint64_t z[4];	/* all-zero limbs: point-at-infinity */
+} p256_jacobian;
+
+/*
+ * Decode a point. The returned point is in Jacobian coordinates, but
+ * with z = 1. If the encoding is invalid, or encodes a point which is
+ * not on the curve, or encodes the point at infinity, then this function
+ * returns 0. Otherwise, 1 is returned.
+ *
+ * The buffer is assumed to have length exactly 65 bytes.
+ */
+static uint32_t
+point_decode(p256_jacobian *P, const unsigned char *buf)
+{
+	uint64_t x[4], y[4], t[4], x3[4], tt;
+	uint32_t r;
+
+	/*
+	 * Header byte shall be 0x04.
+	 */
+	r = EQ(buf[0], 0x04);
+
+	/*
+	 * Decode X and Y coordinates (big-endian in the buffer, so the
+	 * first 8 bytes go to limb 3), and convert them into Montgomery representation.
+	 */
+	x[3] = br_dec64be(buf +  1);
+	x[2] = br_dec64be(buf +  9);
+	x[1] = br_dec64be(buf + 17);
+	x[0] = br_dec64be(buf + 25);
+	y[3] = br_dec64be(buf + 33);
+	y[2] = br_dec64be(buf + 41);
+	y[1] = br_dec64be(buf + 49);
+	y[0] = br_dec64be(buf + 57);
+	f256_tomonty(x, x);
+	f256_tomonty(y, y);
+
+	/*
+	 * Verify y^2 = x^3 + A*x + B. In curve P-256, A = -3, so we
+	 * compute t = y^2 - x^3 + 3*x - B and check that it is zero.
+	 * The Montgomery representation of 0 is 0, but t may be encoded
+	 * as p instead of 0, hence the final reduction before the test.
+	 */
+	f256_montysquare(t, y);
+	f256_montysquare(x3, x);
+	f256_montymul(x3, x3, x);
+	f256_sub(t, t, x3);
+	f256_add(t, t, x);
+	f256_add(t, t, x);
+	f256_add(t, t, x);
+	f256_sub(t, t, P256_B_MONTY);
+	f256_final_reduce(t);
+	tt = t[0] | t[1] | t[2] | t[3];
+	r &= EQ((uint32_t)(tt | (tt >> 32)), 0);	/* fold 64 bits onto 32 */
+
+	/*
+	 * Return the point in Jacobian coordinates (and Montgomery
+	 * representation). P is written even when 0 is returned.
+	 */
+	memcpy(P->x, x, sizeof x);
+	memcpy(P->y, y, sizeof y);
+	memcpy(P->z, F256_R, sizeof F256_R);
+	return r;
+}
+
+/*
+ * Final conversion for a point:
+ *  - The point is converted back to affine coordinates.
+ *  - Final reduction is performed.
+ *  - The point is encoded (uncompressed, 65 bytes) into the buffer.
+ *
+ * If the point is the point-at-infinity, all operations are performed,
+ * but the buffer contents are indeterminate, and 0 is returned. Otherwise,
+ * the encoded point is written in the buffer, and 1 is returned.
+ */
+static uint32_t
+point_encode(unsigned char *buf, const p256_jacobian *P)
+{
+	uint64_t t1[4], t2[4], z;
+
+	/* Set t1 = 1/z^2 and t2 = 1/z^3. */
+	f256_invert(t2, P->z);
+	f256_montysquare(t1, t2);
+	f256_montymul(t2, t2, t1);
+
+	/* Compute affine coordinates x (in t1) and y (in t2). */
+	f256_montymul(t1, P->x, t1);
+	f256_montymul(t2, P->y, t2);
+
+	/* Convert back from Montgomery representation, and finalize
+	   reductions, so that each coordinate is in the 0..p-1 range. */
+	f256_frommonty(t1, t1);
+	f256_frommonty(t2, t2);
+	f256_final_reduce(t1);
+	f256_final_reduce(t2);
+
+	/* Encode: header byte 0x04, then x and y, each in big-endian. */
+	buf[0] = 0x04;
+	br_enc64be(buf +  1, t1[3]);
+	br_enc64be(buf +  9, t1[2]);
+	br_enc64be(buf + 17, t1[1]);
+	br_enc64be(buf + 25, t1[0]);
+	br_enc64be(buf + 33, t2[3]);
+	br_enc64be(buf + 41, t2[2]);
+	br_enc64be(buf + 49, t2[1]);
+	br_enc64be(buf + 57, t2[0]);
+
+	/* Return success if and only if P->z != 0. */
+	z = P->z[0] | P->z[1] | P->z[2] | P->z[3];
+	return NEQ((uint32_t)(z | z >> 32), 0);
+}
+
+/*
+ * Point doubling in Jacobian coordinates: point P is doubled.
+ * Note: if the source point is the point-at-infinity, then the result is
+ * still the point-at-infinity, which is correct. Moreover, if the three
+ * coordinates were zero, then they still are zero in the returned value.
+ *
+ * (Note: this is true even without the final reduction: if the three
+ * coordinates are encoded as four words of value zero each, then the
+ * result will also have all-zero coordinate encodings, not the alternate
+ * encoding as the integer p.)
+ */
+static void
+p256_double(p256_jacobian *P)
+{
+	/*
+	 * Doubling formulas are:
+	 *
+	 *   s = 4*x*y^2
+	 *   m = 3*(x + z^2)*(x - z^2)
+	 *   x' = m^2 - 2*s
+	 *   y' = m*(s - x') - 8*y^4
+	 *   z' = 2*y*z
+	 *
+	 * These formulas work for all points, including points of order 2
+	 * and points at infinity:
+	 *   - If y = 0 then z' = 0. But there is no such point in P-256
+	 *     anyway.
+	 *   - If z = 0 then z' = 0.
+	 */
+	uint64_t t1[4], t2[4], t3[4], t4[4];
+
+	/*
+	 * Compute z^2 in t1.
+	 */
+	f256_montysquare(t1, P->z);
+
+	/*
+	 * Compute x+z^2 in t2 and x-z^2 in t1.
+	 */
+	f256_add(t2, P->x, t1);
+	f256_sub(t1, P->x, t1);
+
+	/*
+	 * Compute m = 3*(x+z^2)*(x-z^2) in t1.
+	 */
+	f256_montymul(t3, t1, t2);
+	f256_add(t1, t3, t3);
+	f256_add(t1, t3, t1);
+
+	/*
+	 * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3).
+	 */
+	f256_montysquare(t3, P->y);
+	f256_add(t3, t3, t3);
+	f256_montymul(t2, P->x, t3);
+	f256_add(t2, t2, t2);
+
+	/*
+	 * Compute x' = m^2 - 2*s.
+	 */
+	f256_montysquare(P->x, t1);
+	f256_sub(P->x, P->x, t2);
+	f256_sub(P->x, P->x, t2);
+
+	/*
+	 * Compute z' = 2*y*z (P->y still holds the source y here).
+	 */
+	f256_montymul(t4, P->y, P->z);
+	f256_add(P->z, t4, t4);
+
+	/*
+	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+	 * 2*y^2 in t3, so 8*y^4 = 2*(2*y^2)^2.
+	 */
+	f256_sub(t2, t2, P->x);
+	f256_montymul(P->y, t1, t2);
+	f256_montysquare(t4, t3);
+	f256_add(t4, t4, t4);
+	f256_sub(P->y, P->y, t4);
+}
+
+/*
+ * Point addition (Jacobian coordinates): P1 is replaced with P1+P2.
+ * This function computes the wrong result in the following cases:
+ *
+ *   - If P1 == 0 but P2 != 0
+ *   - If P1 != 0 but P2 == 0
+ *   - If P1 == P2
+ *
+ * In all three cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ *   - P1 and P2 have the same Y coordinate.
+ *   - P1 == 0 and P2 == 0.
+ *   - The Y coordinate of one of the points is 0 and the other point is
+ *     the point at infinity.
+ *
+ * The third case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
+ * can apply the following:
+ *
+ *   - If the result is not the point at infinity, then it is correct.
+ *   - Otherwise, if the returned value is 1, then this is a case of
+ *     P1+P2 == 0, so the result is indeed the point at infinity.
+ *   - Otherwise, P1 == P2, so a "double" operation should have been
+ *     performed.
+ *
+ * Note that you can get a returned value of 0 with a correct result,
+ * e.g. if P1 and P2 have the same Y coordinate, but distinct X coordinates.
+ */
+static uint32_t
+p256_add(p256_jacobian *P1, const p256_jacobian *P2)
+{
+	/*
+	 * Addition formulas are:
+	 *
+	 *   u1 = x1 * z2^2
+	 *   u2 = x2 * z1^2
+	 *   s1 = y1 * z2^3
+	 *   s2 = y2 * z1^3
+	 *   h = u2 - u1
+	 *   r = s2 - s1
+	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
+	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+	 *   z3 = h * z1 * z2
+	 */
+	uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt;
+	uint32_t ret;
+
+	/*
+	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+	 */
+	f256_montysquare(t3, P2->z);
+	f256_montymul(t1, P1->x, t3);
+	f256_montymul(t4, P2->z, t3);
+	f256_montymul(t3, P1->y, t4);
+
+	/*
+	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	 */
+	f256_montysquare(t4, P1->z);
+	f256_montymul(t2, P2->x, t4);
+	f256_montymul(t5, P1->z, t4);
+	f256_montymul(t4, P2->y, t5);
+
+	/*
+	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+	 * We need to test whether r is zero, so we apply an extra
+	 * final reduction on r (ret is 1 if r != 0, 0 otherwise).
+	 */
+	f256_sub(t2, t2, t1);
+	f256_sub(t4, t4, t3);
+	f256_final_reduce(t4);
+	tt = t4[0] | t4[1] | t4[2] | t4[3];
+	ret = (uint32_t)(tt | (tt >> 32));
+	ret = (ret | -ret) >> 31;
+
+	/*
+	 * Compute u1*h^2 (in t6) and h^3 (in t5);
+	 */
+	f256_montysquare(t7, t2);
+	f256_montymul(t6, t1, t7);
+	f256_montymul(t5, t7, t2);
+
+	/*
+	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	 */
+	f256_montysquare(P1->x, t4);
+	f256_sub(P1->x, P1->x, t5);
+	f256_sub(P1->x, P1->x, t6);
+	f256_sub(P1->x, P1->x, t6);
+
+	/*
+	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	 */
+	f256_sub(t6, t6, P1->x);
+	f256_montymul(P1->y, t4, t6);
+	f256_montymul(t1, t5, t3);
+	f256_sub(P1->y, P1->y, t1);
+
+	/*
+	 * Compute z3 = h*z1*z2.
+	 */
+	f256_montymul(t1, P1->z, P2->z);
+	f256_montymul(P1->z, t1, t2);
+
+	return ret;
+}
+
+/*
+ * Point addition (mixed coordinates): P1 is replaced with P1+P2.
+ * This is a specialised function for the case when P2 is a non-zero point
+ * in affine coordinates.
+ *
+ * This function computes the wrong result in the following cases:
+ *
+ *   - If P1 == 0
+ *   - If P1 == P2
+ *
+ * In both cases, P1 is set to the point at infinity.
+ *
+ * Returned value is 0 if one of the following occurs:
+ *
+ *   - P1 and P2 have the same Y (affine) coordinate.
+ *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
+ *
+ * The second case cannot actually happen with valid points, since a point
+ * with Y == 0 is a point of order 2, and there is no point of order 2 on
+ * curve P-256.
+ *
+ * Therefore, assuming that P1 != 0 on input, then the caller
+ * can apply the following:
+ *
+ *   - If the result is not the point at infinity, then it is correct.
+ *   - Otherwise, if the returned value is 1, then this is a case of
+ *     P1+P2 == 0, so the result is indeed the point at infinity.
+ *   - Otherwise, P1 == P2, so a "double" operation should have been
+ *     performed.
+ *
+ * Again, a value of 0 may be returned in some cases where the addition
+ * result is correct.
+ */
+static uint32_t
+p256_add_mixed(p256_jacobian *P1, const p256_affine *P2)
+{
+	/*
+	 * Addition formulas are (with z2 = 1, implicitly):
+	 *
+	 *   u1 = x1
+	 *   u2 = x2 * z1^2
+	 *   s1 = y1
+	 *   s2 = y2 * z1^3
+	 *   h = u2 - u1
+	 *   r = s2 - s1
+	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
+	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+	 *   z3 = h * z1
+	 */
+	uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt;
+	uint32_t ret;
+
+	/*
+	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+	 */
+	memcpy(t1, P1->x, sizeof t1);
+	memcpy(t3, P1->y, sizeof t3);
+
+	/*
+	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	 */
+	f256_montysquare(t4, P1->z);
+	f256_montymul(t2, P2->x, t4);
+	f256_montymul(t5, P1->z, t4);
+	f256_montymul(t4, P2->y, t5);
+
+	/*
+	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+	 * We need to test whether r is zero, so we apply an extra
+	 * final reduction on r (ret is 1 if r != 0, 0 otherwise).
+	 */
+	f256_sub(t2, t2, t1);
+	f256_sub(t4, t4, t3);
+	f256_final_reduce(t4);
+	tt = t4[0] | t4[1] | t4[2] | t4[3];
+	ret = (uint32_t)(tt | (tt >> 32));
+	ret = (ret | -ret) >> 31;
+
+	/*
+	 * Compute u1*h^2 (in t6) and h^3 (in t5);
+	 */
+	f256_montysquare(t7, t2);
+	f256_montymul(t6, t1, t7);
+	f256_montymul(t5, t7, t2);
+
+	/*
+	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	 */
+	f256_montysquare(P1->x, t4);
+	f256_sub(P1->x, P1->x, t5);
+	f256_sub(P1->x, P1->x, t6);
+	f256_sub(P1->x, P1->x, t6);
+
+	/*
+	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	 */
+	f256_sub(t6, t6, P1->x);
+	f256_montymul(P1->y, t4, t6);
+	f256_montymul(t1, t5, t3);
+	f256_sub(P1->y, P1->y, t1);
+
+	/*
+	 * Compute z3 = h*z1 (z2 is implicitly 1).
+	 */
+	f256_montymul(P1->z, P1->z, t2);
+
+	return ret;
+}
+
+#if 0
+/* unused */
+/*
+ * Point addition (mixed coordinates, complete): P1 is replaced with P1+P2.
+ * This is a specialised function for the case when P2 is a non-zero point
+ * in affine coordinates. The result is correct in all cases.
+ *
+ * NOTE: disabled code; it has no return statement despite its uint32_t type.
+ */
+static uint32_t
+p256_add_complete_mixed(p256_jacobian *P1, const p256_affine *P2)
+{
+	/*
+	 * Addition formulas, in the general case, are:
+	 *
+	 *   u1 = x1
+	 *   u2 = x2 * z1^2
+	 *   s1 = y1
+	 *   s2 = y2 * z1^3
+	 *   h = u2 - u1
+	 *   r = s2 - s1
+	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
+	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+	 *   z3 = h * z1
+	 *
+	 * These formulas mishandle the two following cases:
+	 *
+	 *  - If P1 is the point-at-infinity (z1 = 0), then z3 is
+	 *    incorrectly set to 0.
+	 *
+	 *  - If P1 = P2, then u1 = u2 and s1 = s2, and x3, y3 and z3
+	 *    are all set to 0.
+	 *
+	 * However, if P1 + P2 = 0, then u1 = u2 but s1 != s2, and then
+	 * we correctly get z3 = 0 (the point-at-infinity).
+	 *
+	 * To fix the case P1 = 0, we perform at the end a copy of P2
+	 * over P1, conditional to z1 = 0.
+	 *
+	 * For P1 = P2: in that case, both h and r are set to 0, and
+	 * we get x3, y3 and z3 equal to 0. We can test for that
+	 * occurrence to make a mask which will be all-one if P1 = P2,
+	 * or all-zero otherwise; then we can compute the double of P2
+	 * and add it, combined with the mask, to (x3,y3,z3).
+	 *
+	 * Using the doubling formulas in p256_double() on (x2,y2),
+	 * simplifying since P2 is affine (i.e. z2 = 1, implicitly),
+	 * we get:
+	 *   s = 4*x2*y2^2
+	 *   m = 3*(x2 + 1)*(x2 - 1)
+	 *   x' = m^2 - 2*s
+	 *   y' = m*(s - x') - 8*y2^4
+	 *   z' = 2*y2
+	 * which requires only 6 multiplications. Added to the 11
+	 * multiplications of the normal mixed addition in Jacobian
+	 * coordinates, we get a cost of 17 multiplications in total.
+	 */
+	uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt, zz;
+	int i;
+
+	/*
+	 * Set zz to -1 if P1 is the point at infinity, 0 otherwise.
+	 */
+	zz = P1->z[0] | P1->z[1] | P1->z[2] | P1->z[3];
+	zz = ((zz | -zz) >> 63) - (uint64_t)1;
+
+	/*
+	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+	 */
+	memcpy(t1, P1->x, sizeof t1);
+	memcpy(t3, P1->y, sizeof t3);
+
+	/*
+	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	 */
+	f256_montysquare(t4, P1->z);
+	f256_montymul(t2, P2->x, t4);
+	f256_montymul(t5, P1->z, t4);
+	f256_montymul(t4, P2->y, t5);
+
+	/*
+	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4); both are
+	 * fully reduced below, before being tested against zero.
+	 */
+	f256_sub(t2, t2, t1);
+	f256_sub(t4, t4, t3);
+
+	/*
+	 * If both h = 0 and r = 0, then P1 = P2, and we want to set
+	 * the mask tt to -1; otherwise, the mask will be 0.
+	 */
+	f256_final_reduce(t2);
+	f256_final_reduce(t4);
+	tt = t2[0] | t2[1] | t2[2] | t2[3] | t4[0] | t4[1] | t4[2] | t4[3];
+	tt = ((tt | -tt) >> 63) - (uint64_t)1;
+
+	/*
+	 * Compute u1*h^2 (in t6) and h^3 (in t5);
+	 */
+	f256_montysquare(t7, t2);
+	f256_montymul(t6, t1, t7);
+	f256_montymul(t5, t7, t2);
+
+	/*
+	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	 */
+	f256_montysquare(P1->x, t4);
+	f256_sub(P1->x, P1->x, t5);
+	f256_sub(P1->x, P1->x, t6);
+	f256_sub(P1->x, P1->x, t6);
+
+	/*
+	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	 */
+	f256_sub(t6, t6, P1->x);
+	f256_montymul(P1->y, t4, t6);
+	f256_montymul(t1, t5, t3);
+	f256_sub(P1->y, P1->y, t1);
+
+	/*
+	 * Compute z3 = h*z1.
+	 */
+	f256_montymul(P1->z, P1->z, t2);
+
+	/*
+	 * The "double" result, in case P1 = P2.
+	 */
+
+	/*
+	 * Compute z' = 2*y2 (in t1).
+	 */
+	f256_add(t1, P2->y, P2->y);
+
+	/*
+	 * Compute 2*(y2^2) (in t2) and s = 4*x2*(y2^2) (in t3).
+	 */
+	f256_montysquare(t2, P2->y);
+	f256_add(t2, t2, t2);
+	f256_add(t3, t2, t2);
+	f256_montymul(t3, P2->x, t3);
+
+	/*
+	 * Compute m = 3*(x2^2 - 1) (in t4).
+	 */
+	f256_montysquare(t4, P2->x);
+	f256_sub(t4, t4, F256_R);
+	f256_add(t5, t4, t4);
+	f256_add(t4, t4, t5);
+
+	/*
+	 * Compute x' = m^2 - 2*s (in t5).
+	 */
+	f256_montysquare(t5, t4);
+	f256_sub(t5, t5, t3);
+	f256_sub(t5, t5, t3);
+
+	/*
+	 * Compute y' = m*(s - x') - 8*y2^4 (in t6).
+	 */
+	f256_sub(t6, t3, t5);
+	f256_montymul(t6, t6, t4);
+	f256_montysquare(t7, t2);
+	f256_sub(t6, t6, t7);
+	f256_sub(t6, t6, t7);
+
+	/*
+	 * We now have the alternate (doubling) coordinates in (t5,t6,t1).
+	 * We combine them with (x3,y3,z3).
+	 */
+	for (i = 0; i < 4; i ++) {
+		P1->x[i] |= tt & t5[i];
+		P1->y[i] |= tt & t6[i];
+		P1->z[i] |= tt & t1[i];
+	}
+
+	/*
+	 * If P1 = 0, then we get z3 = 0 (which is invalid); if z1 is 0,
+	 * then we want to replace the result with a copy of P2. The
+	 * test on z1 was done at the start, in the zz mask.
+	 */
+	for (i = 0; i < 4; i ++) {
+		P1->x[i] ^= zz & (P1->x[i] ^ P2->x[i]);
+		P1->y[i] ^= zz & (P1->y[i] ^ P2->y[i]);
+		P1->z[i] ^= zz & (P1->z[i] ^ F256_R[i]);
+	}
+}
+#endif
+
+/*
+ * Inner function for computing a point multiplication. A window is
+ * provided, with points 1*P to 15*P in affine coordinates.
+ *
+ * Assumptions:
+ *  - All provided points are valid points on the curve.
+ *  - Multiplier is non-zero, and smaller than the curve order.
+ *  - Everything is in Montgomery representation.
+ */
+static void
+point_mul_inner(p256_jacobian *R, const p256_affine *W,
+	const unsigned char *k, size_t klen)
+{
+	p256_jacobian Q;
+	uint32_t qz;
+
+	memset(&Q, 0, sizeof Q);
+	qz = 1;	/* 1 as long as Q is still all-zeros */
+	while (klen -- > 0) {
+		int i;
+		unsigned bk;
+
+		bk = *k ++;
+		for (i = 0; i < 2; i ++) {	/* two 4-bit digits per byte */
+			uint32_t bits;
+			uint32_t bnz;
+			p256_affine T;
+			p256_jacobian U;
+			uint32_t n;
+			int j;
+			uint64_t m;
+
+			p256_double(&Q);
+			p256_double(&Q);
+			p256_double(&Q);
+			p256_double(&Q);
+			bits = (bk >> 4) & 0x0F;	/* high nibble first */
+			bnz = NEQ(bits, 0);
+
+			/*
+			 * Lookup point in window. If the bits are 0,
+			 * we get something invalid, which is not a
+			 * problem because we will use it only if the
+			 * bits are non-zero. All 15 entries are read.
+			 */
+			memset(&T, 0, sizeof T);
+			for (n = 0; n < 15; n ++) {
+				m = -(uint64_t)EQ(bits, n + 1);
+				T.x[0] |= m & W[n].x[0];
+				T.x[1] |= m & W[n].x[1];
+				T.x[2] |= m & W[n].x[2];
+				T.x[3] |= m & W[n].x[3];
+				T.y[0] |= m & W[n].y[0];
+				T.y[1] |= m & W[n].y[1];
+				T.y[2] |= m & W[n].y[2];
+				T.y[3] |= m & W[n].y[3];
+			}
+
+			U = Q;
+			p256_add_mixed(&U, &T);	/* returned status is not used */
+
+			/*
+			 * If qz is still 1, then Q was all-zeros, and this
+			 * is conserved through p256_double().
+			 */
+			m = -(uint64_t)(bnz & qz);
+			for (j = 0; j < 4; j ++) {
+				Q.x[j] |= m & T.x[j];
+				Q.y[j] |= m & T.y[j];
+				Q.z[j] |= m & F256_R[j];
+			}
+			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
+			qz &= ~bnz;
+			bk <<= 4;	/* move the low nibble up for the next round */
+		}
+	}
+	*R = Q;
+}
+
+/*
+ * Convert a window from Jacobian to affine coordinates. A single
+ * field inversion is used. This function works for windows up to
+ * 32 elements.
+ *
+ * The destination array (aff[]) and the source array (jac[]) may
+ * overlap, provided that the start of aff[] is not after the start of
+ * jac[]. Even if the arrays do _not_ overlap, the source array is
+ * modified.
+ */
+static void
+window_to_affine(p256_affine *aff, p256_jacobian *jac, int num)
+{
+	/*
+	 * Convert the window points to affine coordinates. We use the
+	 * following trick to mutualize the inversion computation: if
+	 * we have z1, z2, z3, and z4, and want to invert all of them,
+	 * we compute u = 1/(z1*z2*z3*z4), and then we have:
+	 *   1/z1 = u*z2*z3*z4
+	 *   1/z2 = u*z1*z3*z4
+	 *   1/z3 = u*z1*z2*z4
+	 *   1/z4 = u*z1*z2*z3
+	 *
+	 * The partial products are computed recursively:
+	 *
+	 *  - on input (z_1,z_2), return (z_2,z_1) and z_1*z_2
+	 *  - on input (z_1,z_2,... z_n):
+	 *       recurse on (z_1,z_2,... z_(n/2)) -> r1 and m1
+	 *       recurse on (z_(n/2+1),z_(n/2+2)... z_n) -> r2 and m2
+	 *       multiply elements of r1 by m2 -> s1
+	 *       multiply elements of r2 by m1 -> s2
+	 *       return s1||s2 and m1*m2
+	 *
+	 * In the example below, we suppose that we have 14 elements.
+	 * Let z1, z2,... zE be the 14 values to invert (index noted in
+	 * hexadecimal, starting at 1).
+	 *
+	 *  - Depth 1:
+	 *      swap(z1, z2); z12 = z1*z2
+	 *      swap(z3, z4); z34 = z3*z4
+	 *      swap(z5, z6); z56 = z5*z6
+	 *      swap(z7, z8); z78 = z7*z8
+	 *      swap(z9, zA); z9A = z9*zA
+	 *      swap(zB, zC); zBC = zB*zC
+	 *      swap(zD, zE); zDE = zD*zE
+	 *
+	 *  - Depth 2:
+	 *      z1 <- z1*z34, z2 <- z2*z34, z3 <- z3*z12, z4 <- z4*z12
+	 *      z1234 = z12*z34
+	 *      z5 <- z5*z78, z6 <- z6*z78, z7 <- z7*z56, z8 <- z8*z56
+	 *      z5678 = z56*z78
+	 *      z9 <- z9*zBC, zA <- zA*zBC, zB <- zB*z9A, zC <- zC*z9A
+	 *      z9ABC = z9A*zBC
+	 *
+	 *  - Depth 3:
+	 *      z1 <- z1*z5678, z2 <- z2*z5678, z3 <- z3*z5678, z4 <- z4*z5678
+	 *      z5 <- z5*z1234, z6 <- z6*z1234, z7 <- z7*z1234, z8 <- z8*z1234
+	 *      z12345678 = z1234*z5678
+	 *      z9 <- z9*zDE, zA <- zA*zDE, zB <- zB*zDE, zC <- zC*zDE
+	 *      zD <- zD*z9ABC, zE <- zE*z9ABC
+	 *      z9ABCDE = z9ABC*zDE
+	 *
+	 *  - Depth 4:
+	 *      multiply z1..z8 by z9ABCDE
+	 *      multiply z9..zE by z12345678
+	 *      final z = z12345678*z9ABCDE
+	 */
+
+	uint64_t z[16][4];
+	int i, k, s;
+#define zt   (z[15])
+#define zu   (z[14])
+#define zv   (z[13])
+
+	/*
+	 * First recursion step (pairwise swapping and multiplication).
+	 * If there is an odd number of elements, then we "invent" an
+	 * extra one with coordinate Z = 1 (in Montgomery representation).
+	 */
+	for (i = 0; (i + 1) < num; i += 2) {
+		memcpy(zt, jac[i].z, sizeof zt);	/* zt is the swap temporary */
+		memcpy(jac[i].z, jac[i + 1].z, sizeof zt);
+		memcpy(jac[i + 1].z, zt, sizeof zt);
+		f256_montymul(z[i >> 1], jac[i].z, jac[i + 1].z);
+	}
+	if ((num & 1) != 0) {
+		memcpy(z[num >> 1], jac[num - 1].z, sizeof zt);
+		memcpy(jac[num - 1].z, F256_R, sizeof F256_R);
+	}
+
+	/*
+	 * Perform further recursion steps. At the entry of each step,
+	 * the process has been done for groups of 's' points. The
+	 * integer k is the log2 of s.
+	 */
+	for (k = 1, s = 2; s < num; k ++, s <<= 1) {
+		int n;
+
+		for (i = 0; i < num; i ++) {
+			f256_montymul(jac[i].z, jac[i].z, z[(i >> k) ^ 1]);
+		}
+		n = (num + s - 1) >> k;	/* number of group products in z[] */
+		for (i = 0; i < (n >> 1); i ++) {
+			f256_montymul(z[i], z[i << 1], z[(i << 1) + 1]);
+		}
+		if ((n & 1) != 0) {
+			memmove(z[n >> 1], z[n], sizeof zt);
+		}
+	}
+
+	/*
+	 * Invert the final result, and convert all points: for each
+	 * point, zv = 1/z, then zu = 1/z^2 and zv = 1/z^3.
+	 */
+	f256_invert(zt, z[0]);
+	for (i = 0; i < num; i ++) {
+		f256_montymul(zv, jac[i].z, zt);
+		f256_montysquare(zu, zv);
+		f256_montymul(zv, zv, zu);
+		f256_montymul(aff[i].x, jac[i].x, zu);
+		f256_montymul(aff[i].y, jac[i].y, zv);
+	}
+}
+
+/*
+ * Multiply the provided point by an integer.
+ * Assumptions:
+ *  - Source point is a valid curve point.
+ *  - Source point is not the point-at-infinity.
+ *  - Integer is not 0, and is lower than the curve order.
+ * If these conditions are not met, then the result is indeterminate
+ * (but the process is still constant-time).
+ */
+static void
+p256_mul(p256_jacobian *P, const unsigned char *k, size_t klen)
+{
+	union {
+		p256_affine aff[15];
+		p256_jacobian jac[15];
+	} window;	/* converted in place from jac[] to aff[] below */
+	int i;
+
+	/*
+	 * Compute window, in Jacobian coordinates: jac[i - 1] = i*P.
+	 */
+	window.jac[0] = *P;
+	for (i = 2; i < 16; i ++) {
+		window.jac[i - 1] = window.jac[(i >> 1) - 1];
+		if ((i & 1) == 0) {
+			p256_double(&window.jac[i - 1]);	/* 2*(i/2)*P */
+		} else {
+			p256_add(&window.jac[i - 1], &window.jac[i >> 1]);
+		}
+	}
+
+	/*
+	 * Convert the window points to affine coordinates. Point
+	 * window[0] is the source point, already in affine coordinates.
+	 */
+	window_to_affine(window.aff, window.jac, 15);
+
+	/*
+	 * Perform point multiplication; the result overwrites *P.
+	 */
+	point_mul_inner(P, window.aff, k, klen);
+}
+
+/*
+ * Precomputed window for the conventional generator: P256_Gwin[n]
+ * contains (n+1)*G, for n = 0 to 14 (affine, Montgomery representation).
+ */
+static const p256_affine P256_Gwin[] = {
+	{
+		{ 0x79E730D418A9143C, 0x75BA95FC5FEDB601,
+		  0x79FB732B77622510, 0x18905F76A53755C6 },
+		{ 0xDDF25357CE95560A, 0x8B4AB8E4BA19E45C,
+		  0xD2E88688DD21F325, 0x8571FF1825885D85 }
+	},
+	{
+		{ 0x850046D410DDD64D, 0xAA6AE3C1A433827D,
+		  0x732205038D1490D9, 0xF6BB32E43DCF3A3B },
+		{ 0x2F3648D361BEE1A5, 0x152CD7CBEB236FF8,
+		  0x19A8FB0E92042DBE, 0x78C577510A5B8A3B }
+	},
+	{
+		{ 0xFFAC3F904EEBC127, 0xB027F84A087D81FB,
+		  0x66AD77DD87CBBC98, 0x26936A3FB6FF747E },
+		{ 0xB04C5C1FC983A7EB, 0x583E47AD0861FE1A,
+		  0x788208311A2EE98E, 0xD5F06A29E587CC07 }
+	},
+	{
+		{ 0x74B0B50D46918DCC, 0x4650A6EDC623C173,
+		  0x0CDAACACE8100AF2, 0x577362F541B0176B },
+		{ 0x2D96F24CE4CBABA6, 0x17628471FAD6F447,
+		  0x6B6C36DEE5DDD22E, 0x84B14C394C5AB863 }
+	},
+	{
+		{ 0xBE1B8AAEC45C61F5, 0x90EC649A94B9537D,
+		  0x941CB5AAD076C20C, 0xC9079605890523C8 },
+		{ 0xEB309B4AE7BA4F10, 0x73C568EFE5EB882B,
+		  0x3540A9877E7A1F68, 0x73A076BB2DD1E916 }
+	},
+	{
+		{ 0x403947373E77664A, 0x55AE744F346CEE3E,
+		  0xD50A961A5B17A3AD, 0x13074B5954213673 },
+		{ 0x93D36220D377E44B, 0x299C2B53ADFF14B5,
+		  0xF424D44CEF639F11, 0xA4C9916D4A07F75F }
+	},
+	{
+		{ 0x0746354EA0173B4F, 0x2BD20213D23C00F7,
+		  0xF43EAAB50C23BB08, 0x13BA5119C3123E03 },
+		{ 0x2847D0303F5B9D4D, 0x6742F2F25DA67BDD,
+		  0xEF933BDC77C94195, 0xEAEDD9156E240867 }
+	},
+	{
+		{ 0x27F14CD19499A78F, 0x462AB5C56F9B3455,
+		  0x8F90F02AF02CFC6B, 0xB763891EB265230D },
+		{ 0xF59DA3A9532D4977, 0x21E3327DCF9EBA15,
+		  0x123C7B84BE60BBF0, 0x56EC12F27706DF76 }
+	},
+	{
+		{ 0x75C96E8F264E20E8, 0xABE6BFED59A7A841,
+		  0x2CC09C0444C8EB00, 0xE05B3080F0C4E16B },
+		{ 0x1EB7777AA45F3314, 0x56AF7BEDCE5D45E3,
+		  0x2B6E019A88B12F1A, 0x086659CDFD835F9B }
+	},
+	{
+		{ 0x2C18DBD19DC21EC8, 0x98F9868A0FCF8139,
+		  0x737D2CD648250B49, 0xCC61C94724B3428F },
+		{ 0x0C2B407880DD9E76, 0xC43A8991383FBE08,
+		  0x5F7D2D65779BE5D2, 0x78719A54EB3B4AB5 }
+	},
+	{
+		{ 0xEA7D260A6245E404, 0x9DE407956E7FDFE0,
+		  0x1FF3A4158DAC1AB5, 0x3E7090F1649C9073 },
+		{ 0x1A7685612B944E88, 0x250F939EE57F61C8,
+		  0x0C0DAA891EAD643D, 0x68930023E125B88E }
+	},
+	{
+		{ 0x04B71AA7D2697768, 0xABDEDEF5CA345A33,
+		  0x2409D29DEE37385E, 0x4EE1DF77CB83E156 },
+		{ 0x0CAC12D91CBB5B43, 0x170ED2F6CA895637,
+		  0x28228CFA8ADE6D66, 0x7FF57C9553238ACA }
+	},
+	{
+		{ 0xCCC425634B2ED709, 0x0E356769856FD30D,
+		  0xBCBCD43F559E9811, 0x738477AC5395B759 },
+		{ 0x35752B90C00EE17F, 0x68748390742ED2E3,
+		  0x7CD06422BD1F5BC1, 0xFBC08769C9E7B797 }
+	},
+	{
+		{ 0xA242A35BB0CF664A, 0x126E48F77F9707E3,
+		  0x1717BF54C6832660, 0xFAAE7332FD12C72E },
+		{ 0x27B52DB7995D586B, 0xBE29569E832237C2,
+		  0xE8E4193E2A65E7DB, 0x152706DC2EAA1BBB }
+	},
+	{
+		{ 0x72BCD8B7BC60055B, 0x03CC23EE56E27E4B,
+		  0xEE337424E4819370, 0xE2AA0E430AD3DA09 },
+		{ 0x40B8524F6383C45D, 0xD766355442A41B25,
+		  0x64EFA6DE778A4797, 0x2042170A7079ADF4 }
+	}
+};
+
+/*
+ * Multiply the conventional generator of the curve by the provided
+ * integer. Result is written in *P.
+ *
+ * Assumptions:
+ *  - Integer is not 0, and is lower than the curve order.
+ * If this condition is not met, then the result is indeterminate
+ * (but the process is still constant-time).
+ */
+static void
+p256_mulgen(p256_jacobian *P, const unsigned char *k, size_t klen)
+{
+	point_mul_inner(P, P256_Gwin, k, klen);	/* precomputed 1*G..15*G */
+}
+
+/*
+ * Return 1 if all of the following hold:
+ *  - klen <= 32
+ *  - k != 0
+ *  - k is lower than the curve order
+ * Otherwise, return 0.
+ *
+ * Constant-time behaviour: only klen may be observable.
+ */
+static uint32_t
+check_scalar(const unsigned char *k, size_t klen)
+{
+	uint32_t z;
+	int32_t c;
+	size_t u;
+
+	if (klen > 32) {
+		return 0;
+	}
+	z = 0;
+	for (u = 0; u < klen; u ++) {
+		z |= k[u];	/* z != 0 iff some byte of k is non-zero */
+	}
+	if (klen == 32) {
+		c = 0;
+		for (u = 0; u < klen; u ++) {
+			c |= -(int32_t)EQ0(c) & CMP(k[u], P256_N[u]);	/* first difference wins */
+		}
+	} else {
+		c = -1;	/* shorter than 32 bytes: treated as below the order */
+	}
+	return NEQ(z, 0) & LT0(c);
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *k, size_t klen, int curve)
+{
+	uint32_t r;	/* accumulated status: 1 on success, 0 on error */
+	p256_jacobian P;
+
+	(void)curve;	/* not used by this implementation */
+	if (Glen != 65) {	/* point_decode() expects exactly 65 bytes */
+		return 0;
+	}
+	r = check_scalar(k, klen);
+	r &= point_decode(&P, G);
+	p256_mul(&P, k, klen);	/* always run; errors are reported through r */
+	r &= point_encode(G, &P);	/* G is overwritten with the result */
+	return r;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *k, size_t klen, int curve)
+{
+	p256_jacobian P;
+
+	(void)curve;	/* not used by this implementation */
+	p256_mulgen(&P, k, klen);	/* the scalar is not validated here */
+	point_encode(R, &P);	/* writes 65 bytes; status is ignored */
+	return 65;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	/*
+	 * We might want to use Shamir's trick here: make a composite
+	 * window of u*P+v*Q points, to merge the two doubling-ladders
+	 * into one. This, however, has some complications:
+	 *
+	 *  - During the computation, we may hit the point-at-infinity.
+	 *    Thus, we would need p256_add_complete_mixed() (complete
+	 *    formulas for point addition), with a higher cost (17 muls
+	 *    instead of 11).
+	 *
+	 *  - A 4-bit window would be too large, since it would involve
+	 *    16*16-1 = 255 points. For the same window size as in the
+	 *    p256_mul() case, we would need to reduce the window size
+	 *    to 2 bits, and thus perform twice as many non-doubling
+	 *    point additions.
+	 *
+	 *  - The window may itself contain the point-at-infinity, and
+	 *    thus cannot, in all generality, be made of affine points.
+	 *    Instead, we would need to make it a window of points in
+	 *    Jacobian coordinates. Even p256_add_complete_mixed() would
+	 *    be inappropriate.
+	 *
+	 * For these reasons, the code below performs two separate
+	 * point multiplications, then computes the final point addition
+	 * (which is both a "normal" addition, and a doubling, to handle
+	 * all cases).
+	 */
+
+	p256_jacobian P, Q;
+	uint32_t r, t, s;
+	uint64_t z;
+
+	(void)curve;	/* not used by this implementation */
+	if (len != 65) {
+		return 0;
+	}
+	r = point_decode(&P, A);
+	p256_mul(&P, x, xlen);
+	if (B == NULL) {	/* no second point: use the generator */
+		p256_mulgen(&Q, y, ylen);
+	} else {
+		r &= point_decode(&Q, B);
+		p256_mul(&Q, y, ylen);
+	}
+
+	/*
+	 * The final addition may fail in case both points are equal.
+	 */
+	t = p256_add(&P, &Q);
+	f256_final_reduce(P.z);
+	z = P.z[0] | P.z[1] | P.z[2] | P.z[3];
+	s = EQ((uint32_t)(z | (z >> 32)), 0);	/* 1 if the sum has z = 0 */
+	p256_double(&Q);
+
+	/*
+	 * If s is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+	 * have the following:
+	 *
+	 *   s = 0, t = 0   return P (normal addition)
+	 *   s = 0, t = 1   return P (normal addition)
+	 *   s = 1, t = 0   return Q (a 'double' case)
+	 *   s = 1, t = 1   report an error (P+Q = 0)
+	 */
+	CCOPY(s & ~t, &P, &Q, sizeof Q);
+	point_encode(A, &P);
+	r &= ~(s & t);
+	return r;
+}
+
+/* see bearssl_ec.h */
+const br_ec_impl br_ec_p256_m64 = {
+	(uint32_t)0x00800000,	/* only bit 23 set; presumably the supported-curves mask -- confirm in bearssl_ec.h */
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd
+};
+
+/* see bearssl_ec.h; this variant returns the implementation above */
+const br_ec_impl *
+br_ec_p256_m64_get(void)
+{
+	return &br_ec_p256_m64;
+}
+
+#else
+
+/* see bearssl_ec.h; fallback variant: no implementation, returns 0 */
+const br_ec_impl *
+br_ec_p256_m64_get(void)
+{
+	return 0;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/ec/ec_prime_i15.c b/test/monniaux/BearSSL/src/ec/ec_prime_i15.c
new file mode 100644
index 00000000..0f210f24
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_prime_i15.c
@@ -0,0 +1,820 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Parameters for supported curves:
+ *   - field modulus p
+ *   - R^2 mod p (R = 2^(15k) for the smallest k such that R >= p)
+ *   - b*R mod p (b is the second curve equation parameter)
+ */
+
+static const uint16_t P256_P[] = {
+	0x0111,
+	0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x003F, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x1000, 0x0000, 0x4000, 0x7FFF,
+	0x7FFF, 0x0001
+};
+
+static const uint16_t P256_R2[] = {
+	0x0111,
+	0x0000, 0x6000, 0x0000, 0x0000, 0x0000, 0x0000, 0x7FFC, 0x7FFF,
+	0x7FBF, 0x7FFF, 0x7FBF, 0x7FFF, 0x7FFF, 0x7FFF, 0x77FF, 0x7FFF,
+	0x4FFF, 0x0000
+};
+
+static const uint16_t P256_B[] = {
+	0x0111,
+	0x770C, 0x5EEF, 0x29C4, 0x3EC4, 0x6273, 0x0486, 0x4543, 0x3993,
+	0x3C01, 0x6B56, 0x212E, 0x57EE, 0x4882, 0x204B, 0x7483, 0x3C16,
+	0x0187, 0x0000
+};
+
+static const uint16_t P384_P[] = {
+	0x0199,
+	0x7FFF, 0x7FFF, 0x0003, 0x0000, 0x0000, 0x0000, 0x7FC0, 0x7FFF,
+	0x7EFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+	0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+	0x7FFF, 0x01FF
+};
+
+static const uint16_t P384_R2[] = {
+	0x0199,
+	0x1000, 0x0000, 0x0000, 0x7FFF, 0x7FFF, 0x0001, 0x0000, 0x0010,
+	0x0000, 0x0000, 0x0000, 0x7F00, 0x7FFF, 0x01FF, 0x0000, 0x1000,
+	0x0000, 0x2000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000
+};
+
+static const uint16_t P384_B[] = {
+	0x0199,
+	0x7333, 0x2096, 0x70D1, 0x2310, 0x3020, 0x6197, 0x1464, 0x35BB,
+	0x70CA, 0x0117, 0x1920, 0x4136, 0x5FC8, 0x5713, 0x4938, 0x7DD2,
+	0x4DD2, 0x4A71, 0x0220, 0x683E, 0x2C87, 0x4DB1, 0x7BFF, 0x6C09,
+	0x0452, 0x0084
+};
+
+static const uint16_t P521_P[] = {
+	0x022B,
+	0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+	0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+	0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+	0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
+	0x7FFF, 0x7FFF, 0x07FF
+};
+
+static const uint16_t P521_R2[] = {
+	0x022B,
+	0x0100, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000
+};
+
+static const uint16_t P521_B[] = {
+	0x022B,
+	0x7002, 0x6A07, 0x751A, 0x228F, 0x71EF, 0x5869, 0x20F4, 0x1EFC,
+	0x7357, 0x37E0, 0x4EEC, 0x605E, 0x1652, 0x26F6, 0x31FA, 0x4A8F,
+	0x6193, 0x3C2A, 0x3C42, 0x48C7, 0x3489, 0x6771, 0x4C57, 0x5CCD,
+	0x2725, 0x545B, 0x503B, 0x5B42, 0x21A0, 0x2534, 0x687E, 0x70E4,
+	0x1618, 0x27D7, 0x0465
+};
+
+typedef struct {
+	const uint16_t *p;
+	const uint16_t *b;
+	const uint16_t *R2;
+	uint16_t p0i;
+	size_t point_len;
+} curve_params;
+
+static inline const curve_params *
+id_to_curve(int curve)
+{
+	static const curve_params pp[] = {
+		{ P256_P, P256_B, P256_R2, 0x0001,  65 },
+		{ P384_P, P384_B, P384_R2, 0x0001,  97 },
+		{ P521_P, P521_B, P521_R2, 0x0001, 133 }
+	};
+
+	return &pp[curve - BR_EC_secp256r1];
+}
+
+#define I15_LEN   ((BR_MAX_EC_SIZE + 29) / 15)
+
+/*
+ * Type for a point in Jacobian coordinates:
+ * -- three values, x, y and z, in Montgomery representation
+ * -- affine coordinates are X = x / z^2 and Y = y / z^3
+ * -- for the point at infinity, z = 0
+ */
+typedef struct {
+	uint16_t c[3][I15_LEN];
+} jacobian;
+
+/*
+ * We use a custom interpreter that uses a dozen registers, and
+ * only six operations:
+ *    MSET(d, a)       copy a into d
+ *    MADD(d, a)       d = d+a (modular)
+ *    MSUB(d, a)       d = d-a (modular)
+ *    MMUL(d, a, b)    d = a*b (Montgomery multiplication)
+ *    MINV(d, a, b)    invert d modulo p; a and b are used as scratch registers
+ *    MTZ(d)           clear return value if d = 0
+ * Destination of MMUL (d) must be distinct from operands (a and b).
+ * There is no such constraint for MSUB and MADD.
+ *
+ * Registers include the operand coordinates, and temporaries.
+ */
+#define MSET(d, a)      (0x0000 + ((d) << 8) + ((a) << 4))
+#define MADD(d, a)      (0x1000 + ((d) << 8) + ((a) << 4))
+#define MSUB(d, a)      (0x2000 + ((d) << 8) + ((a) << 4))
+#define MMUL(d, a, b)   (0x3000 + ((d) << 8) + ((a) << 4) + (b))
+#define MINV(d, a, b)   (0x4000 + ((d) << 8) + ((a) << 4) + (b))
+#define MTZ(d)          (0x5000 + ((d) << 8))
+#define ENDCODE         0
+
+/*
+ * Registers for the input operands.
+ */
+#define P1x    0
+#define P1y    1
+#define P1z    2
+#define P2x    3
+#define P2y    4
+#define P2z    5
+
+/*
+ * Alternate names for the first input operand.
+ */
+#define Px     0
+#define Py     1
+#define Pz     2
+
+/*
+ * Temporaries.
+ */
+#define t1     6
+#define t2     7
+#define t3     8
+#define t4     9
+#define t5    10
+#define t6    11
+#define t7    12
+
+/*
+ * Extra scratch registers available when there is no second operand (e.g.
+ * for "double" and "affine").
+ */
+#define t8     3
+#define t9     4
+#define t10    5
+
+/*
+ * Doubling formulas are:
+ *
+ *   s = 4*x*y^2
+ *   m = 3*(x + z^2)*(x - z^2)
+ *   x' = m^2 - 2*s
+ *   y' = m*(s - x') - 8*y^4
+ *   z' = 2*y*z
+ *
+ * If y = 0 (P has order 2) then this yields infinity (z' = 0), as it
+ * should. This case should not happen anyway, because our curves have
+ * prime order, and thus do not contain any point of order 2.
+ *
+ * If P is infinity (z = 0), then again the formulas yield infinity,
+ * which is correct. Thus, this code works for all points.
+ *
+ * Cost: 8 multiplications
+ */
+static const uint16_t code_double[] = {
+	/*
+	 * Compute z^2 (in t1).
+	 */
+	MMUL(t1, Pz, Pz),
+
+	/*
+	 * Compute x-z^2 (in t2) and then x+z^2 (in t1).
+	 */
+	MSET(t2, Px),
+	MSUB(t2, t1),
+	MADD(t1, Px),
+
+	/*
+	 * Compute m = 3*(x+z^2)*(x-z^2) (in t1).
+	 */
+	MMUL(t3, t1, t2),
+	MSET(t1, t3),
+	MADD(t1, t3),
+	MADD(t1, t3),
+
+	/*
+	 * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3).
+	 */
+	MMUL(t3, Py, Py),
+	MADD(t3, t3),
+	MMUL(t2, Px, t3),
+	MADD(t2, t2),
+
+	/*
+	 * Compute x' = m^2 - 2*s.
+	 */
+	MMUL(Px, t1, t1),
+	MSUB(Px, t2),
+	MSUB(Px, t2),
+
+	/*
+	 * Compute z' = 2*y*z.
+	 */
+	MMUL(t4, Py, Pz),
+	MSET(Pz, t4),
+	MADD(Pz, t4),
+
+	/*
+	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+	 * 2*y^2 in t3.
+	 */
+	MSUB(t2, Px),
+	MMUL(Py, t1, t2),
+	MMUL(t4, t3, t3),
+	MSUB(Py, t4),
+	MSUB(Py, t4),
+
+	ENDCODE
+};
+
+/*
+ * Addition formulas are:
+ *
+ *   u1 = x1 * z2^2
+ *   u2 = x2 * z1^2
+ *   s1 = y1 * z2^3
+ *   s2 = y2 * z1^3
+ *   h = u2 - u1
+ *   r = s2 - s1
+ *   x3 = r^2 - h^3 - 2 * u1 * h^2
+ *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ *   z3 = h * z1 * z2
+ *
+ * If both P1 and P2 are infinity, then z1 == 0 and z2 == 0, implying that
+ * z3 == 0, so the result is correct.
+ * If either of P1 or P2 is infinity, but not both, then z3 == 0, which is
+ * not correct.
+ * h == 0 only if u1 == u2; this happens in two cases:
+ * -- if s1 == s2 then P1 and/or P2 is infinity, or P1 == P2
+ * -- if s1 != s2 then P1 + P2 == infinity (but neither P1 nor P2 is infinity)
+ *
+ * Thus, the following situations are not handled correctly:
+ * -- P1 = 0 and P2 != 0
+ * -- P1 != 0 and P2 = 0
+ * -- P1 = P2
+ * All other cases are properly computed. However, even in "incorrect"
+ * situations, the three coordinates still are properly formed field
+ * elements.
+ *
+ * The returned flag is cleared if r == 0. This happens in the following
+ * cases:
+ * -- Both points are on the same horizontal line (same Y coordinate).
+ * -- Both points are infinity.
+ * -- One point is infinity and the other is on line Y = 0.
+ * The third case cannot happen with our curves (there is no valid point
+ * on line Y = 0 since that would be a point of order 2). If the two
+ * source points are non-infinity, then remains only the case where the
+ * two points are on the same horizontal line.
+ *
+ * This allows us to detect the "P1 == P2" case, assuming that P1 != 0 and
+ * P2 != 0:
+ * -- If the returned value is not the point at infinity, then it was properly
+ * computed.
+ * -- Otherwise, if the returned flag is 1, then P1+P2 = 0, and the result
+ * is indeed the point at infinity.
+ * -- Otherwise (result is infinity, flag is 0), then P1 = P2 and we should
+ * use the 'double' code.
+ *
+ * Cost: 16 multiplications
+ */
+static const uint16_t code_add[] = {
+	/*
+	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+	 */
+	MMUL(t3, P2z, P2z),
+	MMUL(t1, P1x, t3),
+	MMUL(t4, P2z, t3),
+	MMUL(t3, P1y, t4),
+
+	/*
+	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	 */
+	MMUL(t4, P1z, P1z),
+	MMUL(t2, P2x, t4),
+	MMUL(t5, P1z, t4),
+	MMUL(t4, P2y, t5),
+
+	/*
+	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+	 */
+	MSUB(t2, t1),
+	MSUB(t4, t3),
+
+	/*
+	 * Report cases where r = 0 through the returned flag.
+	 */
+	MTZ(t4),
+
+	/*
+	 * Compute u1*h^2 (in t6) and h^3 (in t5).
+	 */
+	MMUL(t7, t2, t2),
+	MMUL(t6, t1, t7),
+	MMUL(t5, t7, t2),
+
+	/*
+	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	 * t1 and t7 can be used as scratch registers.
+	 */
+	MMUL(P1x, t4, t4),
+	MSUB(P1x, t5),
+	MSUB(P1x, t6),
+	MSUB(P1x, t6),
+
+	/*
+	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	 */
+	MSUB(t6, P1x),
+	MMUL(P1y, t4, t6),
+	MMUL(t1, t5, t3),
+	MSUB(P1y, t1),
+
+	/*
+	 * Compute z3 = h*z1*z2.
+	 */
+	MMUL(t1, P1z, P2z),
+	MMUL(P1z, t1, t2),
+
+	ENDCODE
+};
+
+/*
+ * Check that the point is on the curve. This code snippet assumes the
+ * following conventions:
+ * -- Coordinates x and y have been freshly decoded in P1 (but not
+ * converted to Montgomery coordinates yet).
+ * -- P2x, P2y and P2z are set to, respectively, R^2, b*R and 1.
+ */
+static const uint16_t code_check[] = {
+
+	/* Convert x and y to Montgomery representation. */
+	MMUL(t1, P1x, P2x),
+	MMUL(t2, P1y, P2x),
+	MSET(P1x, t1),
+	MSET(P1y, t2),
+
+	/* Compute x^3 in t1. */
+	MMUL(t2, P1x, P1x),
+	MMUL(t1, P1x, t2),
+
+	/* Subtract 3*x from t1. */
+	MSUB(t1, P1x),
+	MSUB(t1, P1x),
+	MSUB(t1, P1x),
+
+	/* Add b. */
+	MADD(t1, P2y),
+
+	/* Compute y^2 in t2. */
+	MMUL(t2, P1y, P1y),
+
+	/* Compare y^2 with x^3 - 3*x + b; they must match. */
+	MSUB(t1, t2),
+	MTZ(t1),
+
+	/* Set z to 1 (in Montgomery representation). */
+	MMUL(P1z, P2x, P2z),
+
+	ENDCODE
+};
+
+/*
+ * Conversion back to affine coordinates. This code snippet assumes that
+ * the z coordinate of P2 is set to 1 (not in Montgomery representation).
+ */
+static const uint16_t code_affine[] = {
+
+	/* Save z*R in t1. */
+	MSET(t1, P1z),
+
+	/* Compute z^3 in t2. */
+	MMUL(t2, P1z, P1z),
+	MMUL(t3, P1z, t2),
+	MMUL(t2, t3, P2z),
+
+	/* Invert to (1/z^3) in t2. */
+	MINV(t2, t3, t4),
+
+	/* Compute y. */
+	MSET(t3, P1y),
+	MMUL(P1y, t2, t3),
+
+	/* Compute (1/z^2) in t3. */
+	MMUL(t3, t2, t1),
+
+	/* Compute x. */
+	MSET(t2, P1x),
+	MMUL(P1x, t2, t3),
+
+	ENDCODE
+};
+
+static uint32_t
+run_code(jacobian *P1, const jacobian *P2,
+	const curve_params *cc, const uint16_t *code)
+{
+	uint32_t r;	/* returned flag: 1 unless an MTZ opcode saw zero */
+	uint16_t t[13][I15_LEN];	/* register file, indices 0 to 12 */
+	size_t u;
+
+	r = 1;
+
+	/*
+	 * Load P1 into registers 0..2 and P2 into registers 3..5.
+	 */
+	memcpy(t[P1x], P1->c, 3 * I15_LEN * sizeof(uint16_t));
+	memcpy(t[P2x], P2->c, 3 * I15_LEN * sizeof(uint16_t));
+
+	/*
+	 * Run the program: one 16-bit instruction per step, 0 ends it.
+	 */
+	for (u = 0;; u ++) {
+		unsigned op, d, a, b;
+
+		op = code[u];
+		if (op == 0) {
+			break;
+		}
+		d = (op >> 8) & 0x0F;	/* destination register */
+		a = (op >> 4) & 0x0F;	/* first operand register */
+		b = op & 0x0F;		/* second operand register */
+		op >>= 12;		/* keep only the opcode nibble */
+		switch (op) {
+			uint32_t ctl;
+			size_t plen;
+			unsigned char tp[(BR_MAX_EC_SIZE + 7) >> 3];
+
+		case 0:	/* MSET: copy register a into d */
+			memcpy(t[d], t[a], I15_LEN * sizeof(uint16_t));
+			break;
+		case 1:	/* MADD: d += a, then a conditional subtraction of p */
+			ctl = br_i15_add(t[d], t[a], 1);
+			ctl |= NOT(br_i15_sub(t[d], cc->p, 0));
+			br_i15_sub(t[d], cc->p, ctl);
+			break;
+		case 2:	/* MSUB: d -= a, then a conditional addition of p */
+			br_i15_add(t[d], cc->p, br_i15_sub(t[d], t[a], 1));
+			break;
+		case 3:	/* MMUL: Montgomery product of a and b into d */
+			br_i15_montymul(t[d], t[a], t[b], cc->p, cc->p0i);
+			break;
+		case 4:	/* MINV: modular power of d; a and b are scratch */
+			plen = (cc->p[0] - (cc->p[0] >> 4) + 7) >> 3;
+			br_i15_encode(tp, plen, cc->p);
+			tp[plen - 1] -= 2;	/* p - 2 if tp is big-endian (TODO confirm) */
+			br_i15_modpow(t[d], tp, plen,
+				cc->p, cc->p0i, t[a], t[b]);
+			break;
+		default:	/* MTZ (opcode 5; any other value lands here too) */
+			r &= ~br_i15_iszero(t[d]);
+			break;
+		}
+	}
+
+	/*
+	 * Store registers 0..2 back into P1 (P2 is never written to).
+	 */
+	memcpy(P1->c, t[P1x], 3 * I15_LEN * sizeof(uint16_t));
+	return r;
+}
+
+static void
+set_one(uint16_t *x, const uint16_t *p)
+{
+	size_t plen;
+
+	plen = (p[0] + 31) >> 4;
+	memset(x, 0, plen * sizeof *x);
+	x[0] = p[0];
+	x[1] = 0x0001;
+}
+
+static void
+point_zero(jacobian *P, const curve_params *cc)
+{
+	memset(P, 0, sizeof *P);
+	P->c[0][0] = P->c[1][0] = P->c[2][0] = cc->p[0];
+}
+
+static inline void
+point_double(jacobian *P, const curve_params *cc)
+{
+	run_code(P, P, cc, code_double);
+}
+
+static inline uint32_t
+point_add(jacobian *P1, const jacobian *P2, const curve_params *cc)
+{
+	return run_code(P1, P2, cc, code_add);
+}
+
+static void
+point_mul(jacobian *P, const unsigned char *x, size_t xlen,
+	const curve_params *cc)
+{
+	/*
+	 * We do a simple double-and-add ladder with a 2-bit window
+	 * to make only one add every two doublings. We thus first
+	 * precompute 2P and 3P in some local buffers.
+	 *
+	 * We always perform two doublings and one addition; the
+	 * addition is with P, 2P and 3P and is done in a temporary
+	 * array.
+	 *
+	 * The addition code cannot handle cases where one of the
+	 * operands is infinity, which is the case at the start of the
+	 * ladder. We therefore need to maintain a flag that controls
+	 * this situation.
+	 */
+	uint32_t qz;
+	jacobian P2, P3, Q, T, U;
+
+	memcpy(&P2, P, sizeof P2);
+	point_double(&P2, cc);
+	memcpy(&P3, P, sizeof P3);
+	point_add(&P3, &P2, cc);
+
+	point_zero(&Q, cc);
+	qz = 1;
+	while (xlen -- > 0) {
+		int k;
+
+		for (k = 6; k >= 0; k -= 2) {
+			uint32_t bits;
+			uint32_t bnz;
+
+			point_double(&Q, cc);
+			point_double(&Q, cc);
+			memcpy(&T, P, sizeof T);
+			memcpy(&U, &Q, sizeof U);
+			bits = (*x >> k) & (uint32_t)3;
+			bnz = NEQ(bits, 0);
+			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
+			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
+			point_add(&U, &T, cc);
+			CCOPY(bnz & qz, &Q, &T, sizeof Q);
+			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
+			qz &= ~bnz;
+		}
+		x ++;
+	}
+	memcpy(P, &Q, sizeof Q);
+}
+
+/*
+ * Decode point into Jacobian coordinates. This function does not support
+ * the point at infinity. Returns 1 on success; on an invalid point it returns
+ * 0, but the coordinates are still set to properly formed field elements.
+ */
+static uint32_t
+point_decode(jacobian *P, const void *src, size_t len, const curve_params *cc)
+{
+	/*
+	 * Points must use uncompressed format:
+	 * -- first byte is 0x04;
+	 * -- coordinates X and Y use unsigned big-endian, with the same
+	 *    length as the field modulus.
+	 *
+	 * We don't support hybrid format (uncompressed, but first byte
+	 * has value 0x06 or 0x07, depending on the least significant bit
+	 * of Y) because it is rather useless, and explicitly forbidden
+	 * by PKIX (RFC 5480, section 2.2).
+	 *
+	 * We don't support compressed format either, because it is not
+	 * much used in practice (there are or were patent-related
+	 * concerns about point compression, which explains the lack of
+	 * generalised support). Also, point compression support would
+	 * need a bit more code.
+	 */
+	const unsigned char *buf;
+	size_t plen, zlen;
+	uint32_t r;
+	jacobian Q;
+
+	buf = src;
+	point_zero(P, cc);	/* keeps P well-formed on the early return */
+	plen = (cc->p[0] - (cc->p[0] >> 4) + 7) >> 3;
+	if (len != 1 + (plen << 1)) {
+		return 0;
+	}
+	r = br_i15_decode_mod(P->c[0], buf + 1, plen, cc->p);
+	r &= br_i15_decode_mod(P->c[1], buf + 1 + plen, plen, cc->p);
+
+	/*
+	 * The first byte must be 0x04 (uncompressed format).
+	 */
+	r &= EQ(buf[0], 0x04);
+	/* obsolete: former check that also accepted hybrid (0x06/0x07) points
+	r &= EQ(buf[0], 0x04) | (EQ(buf[0] & 0xFE, 0x06)
+		& ~(uint32_t)(buf[0] ^ buf[plen << 1]));
+	*/
+
+	/*
+	 * Check the curve equation: code_check yields 0 iff it holds.
+	 */
+	zlen = ((cc->p[0] + 31) >> 4) * sizeof(uint16_t);
+	memcpy(Q.c[0], cc->R2, zlen);
+	memcpy(Q.c[1], cc->b, zlen);
+	set_one(Q.c[2], cc->p);
+	r &= ~run_code(P, &Q, cc, code_check);	/* ~0 keeps r, ~1 clears it */
+	return r;
+}
+
+/*
+ * Encode a point. This method assumes that the point is correct and is
+ * not the point at infinity. Encoded size is always 1+2*plen, where
+ * plen is the field modulus length, in bytes.
+ */
+static void
+point_encode(void *dst, const jacobian *P, const curve_params *cc)
+{
+	unsigned char *buf;
+	size_t plen;
+	jacobian Q, T;
+
+	buf = dst;
+	plen = (cc->p[0] - (cc->p[0] >> 4) + 7) >> 3;
+	buf[0] = 0x04;
+	memcpy(&Q, P, sizeof *P);
+	set_one(T.c[2], cc->p);
+	run_code(&Q, &T, cc, code_affine);
+	br_i15_encode(buf + 1, plen, Q.c[0]);
+	br_i15_encode(buf + 1 + plen, plen, Q.c[1]);
+}
+
+static const br_ec_curve_def *
+id_to_curve_def(int curve)
+{
+	switch (curve) {
+	case BR_EC_secp256r1:
+		return &br_secp256r1;
+	case BR_EC_secp384r1:
+		return &br_secp384r1;
+	case BR_EC_secp521r1:
+		return &br_secp521r1;
+	}
+	return NULL;
+}
+
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	const br_ec_curve_def *cd;
+
+	cd = id_to_curve_def(curve);
+	*len = cd->generator_len;
+	return cd->generator;
+}
+
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	const br_ec_curve_def *cd;
+
+	cd = id_to_curve_def(curve);
+	*len = cd->order_len;
+	return cd->order;
+}
+
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	api_generator(curve, len);
+	*len >>= 1;
+	return 1;
+}
+
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	const curve_params *cc = id_to_curve(curve);
+	jacobian P;
+	uint32_t r;	/* 1 if G decoded to a valid curve point, else 0 */
+
+	if (Glen != cc->point_len) {
+		return 0;	/* wrong size: G left untouched, no ladder run */
+	}
+	r = point_decode(&P, G, Glen, cc);
+	point_mul(&P, x, xlen, cc);
+	point_encode(G, &P, cc);	/* overwrite G with the encoded product */
+	return r;
+}
+
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	const unsigned char *G;
+	size_t Glen;
+
+	G = api_generator(curve, &Glen);
+	memcpy(R, G, Glen);
+	api_mul(R, Glen, x, xlen, curve);
+	return Glen;
+}
+
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	uint32_t r, t, z;
+	const curve_params *cc;
+	jacobian P, Q;
+
+	/*
+	 * Computes A <- x*A + y*B (B == NULL selects the curve generator).
+	 * A wrong length is rejected up front: point_encode() below always
+	 * writes a full encoded point into A, whatever 'len' says.
+	 */
+	cc = id_to_curve(curve);
+	if (len != cc->point_len) {
+		return 0;
+	}
+	r = point_decode(&P, A, len, cc);
+	if (B == NULL) {
+		size_t Glen;
+		B = api_generator(curve, &Glen);
+	}
+	r &= point_decode(&Q, B, len, cc);
+	point_mul(&P, x, xlen, cc);	/* TODO: merge the two ladders */
+	point_mul(&Q, y, ylen, cc);
+
+	/*
+	 * We want to compute P+Q. Since the base points A and B are distinct
+	 * from infinity, and the multipliers are non-zero and lower than the
+	 * curve order, then we know that P and Q are non-infinity. This
+	 * leaves two special situations to test for:
+	 * -- If P = Q then we must use point_double().
+	 * -- If P+Q = 0 then we must report an error.
+	 */
+	t = point_add(&P, &Q, cc);
+	point_double(&Q, cc);
+	z = br_i15_iszero(P.c[2]);
+
+	/*
+	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+	 * have the following:
+	 *
+	 *   z = 0, t = 0   return P (normal addition)
+	 *   z = 0, t = 1   return P (normal addition)
+	 *   z = 1, t = 0   return Q (a 'double' case)
+	 *   z = 1, t = 1   report an error (P+Q = 0)
+	 */
+	CCOPY(z & ~t, &P, &Q, sizeof Q);
+	point_encode(A, &P, cc);
+	r &= ~(z & t);
+	return r;
+}
+
+/* see bearssl_ec.h */
+const br_ec_impl br_ec_prime_i15 = {
+	(uint32_t)0x03800000,
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_prime_i31.c b/test/monniaux/BearSSL/src/ec/ec_prime_i31.c
new file mode 100644
index 00000000..0586a3b5
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_prime_i31.c
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Parameters for supported curves (field modulus p, R^2 mod p, and 'b'
+ * equation parameter; all values use the 'i31' format, and 'b' is in
+ * Montgomery representation).
+ */
+
+static const uint32_t P256_P[] = {
+	0x00000108,
+	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x00000007,
+	0x00000000, 0x00000000, 0x00000040, 0x7FFFFF80,
+	0x000000FF
+};
+
+static const uint32_t P256_R2[] = {
+	0x00000108,
+	0x00014000, 0x00018000, 0x00000000, 0x7FF40000,
+	0x7FEFFFFF, 0x7FF7FFFF, 0x7FAFFFFF, 0x005FFFFF,
+	0x00000000
+};
+
+static const uint32_t P256_B[] = {
+	0x00000108,
+	0x6FEE1803, 0x6229C4BD, 0x21B139BE, 0x327150AA,
+	0x3567802E, 0x3F7212ED, 0x012E4355, 0x782DD38D,
+	0x0000000E
+};
+
+static const uint32_t P384_P[] = {
+	0x0000018C,
+	0x7FFFFFFF, 0x00000001, 0x00000000, 0x7FFFFFF8,
+	0x7FFFFFEF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+	0x00000FFF
+};
+
+static const uint32_t P384_R2[] = {
+	0x0000018C,
+	0x00000000, 0x00000080, 0x7FFFFE00, 0x000001FF,
+	0x00000800, 0x00000000, 0x7FFFE000, 0x00001FFF,
+	0x00008000, 0x00008000, 0x00000000, 0x00000000,
+	0x00000000
+};
+
+static const uint32_t P384_B[] = {
+	0x0000018C,
+	0x6E666840, 0x070D0392, 0x5D810231, 0x7651D50C,
+	0x17E218D6, 0x1B192002, 0x44EFE441, 0x3A524E2B,
+	0x2719BA5F, 0x41F02209, 0x36C5643E, 0x5813EFFE,
+	0x000008A5
+};
+
+static const uint32_t P521_P[] = {
+	0x00000219,
+	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+	0x01FFFFFF
+};
+
+static const uint32_t P521_R2[] = {
+	0x00000219,
+	0x00001000, 0x00000000, 0x00000000, 0x00000000,
+	0x00000000, 0x00000000, 0x00000000, 0x00000000,
+	0x00000000, 0x00000000, 0x00000000, 0x00000000,
+	0x00000000, 0x00000000, 0x00000000, 0x00000000,
+	0x00000000
+};
+
+static const uint32_t P521_B[] = {
+	0x00000219,
+	0x540FC00A, 0x228FEA35, 0x2C34F1EF, 0x67BF107A,
+	0x46FC1CD5, 0x1605E9DD, 0x6937B165, 0x272A3D8F,
+	0x42785586, 0x44C8C778, 0x15F3B8B4, 0x64B73366,
+	0x03BA8B69, 0x0D05B42A, 0x21F929A2, 0x2C31C393,
+	0x00654FAE
+};
+
+typedef struct {
+	const uint32_t *p;
+	const uint32_t *b;
+	const uint32_t *R2;
+	uint32_t p0i;
+} curve_params;
+
+static inline const curve_params *
+id_to_curve(int curve)
+{
+	static const curve_params pp[] = {
+		{ P256_P, P256_B, P256_R2, 0x00000001 },
+		{ P384_P, P384_B, P384_R2, 0x00000001 },
+		{ P521_P, P521_B, P521_R2, 0x00000001 }
+	};
+
+	return &pp[curve - BR_EC_secp256r1];
+}
+
+#define I31_LEN   ((BR_MAX_EC_SIZE + 61) / 31)
+
+/*
+ * Type for a point in Jacobian coordinates:
+ * -- three values, x, y and z, in Montgomery representation
+ * -- affine coordinates are X = x / z^2 and Y = y / z^3
+ * -- for the point at infinity, z = 0
+ */
+typedef struct {
+	uint32_t c[3][I31_LEN];
+} jacobian;
+
+/*
+ * We use a custom interpreter that uses a dozen registers, and
+ * only six operations:
+ *    MSET(d, a)       copy a into d
+ *    MADD(d, a)       d = d+a (modular)
+ *    MSUB(d, a)       d = d-a (modular)
+ *    MMUL(d, a, b)    d = a*b (Montgomery multiplication)
+ *    MINV(d, a, b)    invert d modulo p; a and b are used as scratch registers
+ *    MTZ(d)           clear return value if d = 0
+ * Destination of MMUL (d) must be distinct from operands (a and b).
+ * There is no such constraint for MSUB and MADD.
+ *
+ * Registers include the operand coordinates, and temporaries.
+ */
+#define MSET(d, a)      (0x0000 + ((d) << 8) + ((a) << 4))
+#define MADD(d, a)      (0x1000 + ((d) << 8) + ((a) << 4))
+#define MSUB(d, a)      (0x2000 + ((d) << 8) + ((a) << 4))
+#define MMUL(d, a, b)   (0x3000 + ((d) << 8) + ((a) << 4) + (b))
+#define MINV(d, a, b)   (0x4000 + ((d) << 8) + ((a) << 4) + (b))
+#define MTZ(d)          (0x5000 + ((d) << 8))
+#define ENDCODE         0
+
+/*
+ * Registers for the input operands.
+ */
+#define P1x    0
+#define P1y    1
+#define P1z    2
+#define P2x    3
+#define P2y    4
+#define P2z    5
+
+/*
+ * Alternate names for the first input operand.
+ */
+#define Px     0
+#define Py     1
+#define Pz     2
+
+/*
+ * Temporaries.
+ */
+#define t1     6
+#define t2     7
+#define t3     8
+#define t4     9
+#define t5    10
+#define t6    11
+#define t7    12
+
+/*
+ * Extra scratch registers available when there is no second operand (e.g.
+ * for "double" and "affine").
+ */
+#define t8     3
+#define t9     4
+#define t10    5
+
+/*
+ * Doubling formulas are:
+ *
+ *   s = 4*x*y^2
+ *   m = 3*(x + z^2)*(x - z^2)
+ *   x' = m^2 - 2*s
+ *   y' = m*(s - x') - 8*y^4
+ *   z' = 2*y*z
+ *
+ * If y = 0 (P has order 2) then this yields infinity (z' = 0), as it
+ * should. This case should not happen anyway, because our curves have
+ * prime order, and thus do not contain any point of order 2.
+ *
+ * If P is infinity (z = 0), then again the formulas yield infinity,
+ * which is correct. Thus, this code works for all points.
+ *
+ * Cost: 8 multiplications
+ */
+static const uint16_t code_double[] = {
+	/*
+	 * Compute z^2 (in t1).
+	 */
+	MMUL(t1, Pz, Pz),
+
+	/*
+	 * Compute x-z^2 (in t2) and then x+z^2 (in t1).
+	 */
+	MSET(t2, Px),
+	MSUB(t2, t1),
+	MADD(t1, Px),
+
+	/*
+	 * Compute m = 3*(x+z^2)*(x-z^2) (in t1).
+	 */
+	MMUL(t3, t1, t2),
+	MSET(t1, t3),
+	MADD(t1, t3),
+	MADD(t1, t3),
+
+	/*
+	 * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3).
+	 */
+	MMUL(t3, Py, Py),
+	MADD(t3, t3),
+	MMUL(t2, Px, t3),
+	MADD(t2, t2),
+
+	/*
+	 * Compute x' = m^2 - 2*s.
+	 */
+	MMUL(Px, t1, t1),
+	MSUB(Px, t2),
+	MSUB(Px, t2),
+
+	/*
+	 * Compute z' = 2*y*z.
+	 */
+	MMUL(t4, Py, Pz),
+	MSET(Pz, t4),
+	MADD(Pz, t4),
+
+	/*
+	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
+	 * 2*y^2 in t3.
+	 */
+	MSUB(t2, Px),
+	MMUL(Py, t1, t2),
+	MMUL(t4, t3, t3),
+	MSUB(Py, t4),
+	MSUB(Py, t4),
+
+	ENDCODE
+};
+
+/*
+ * Addition formulas are:
+ *
+ *   u1 = x1 * z2^2
+ *   u2 = x2 * z1^2
+ *   s1 = y1 * z2^3
+ *   s2 = y2 * z1^3
+ *   h = u2 - u1
+ *   r = s2 - s1
+ *   x3 = r^2 - h^3 - 2 * u1 * h^2
+ *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+ *   z3 = h * z1 * z2
+ *
+ * If both P1 and P2 are infinity, then z1 == 0 and z2 == 0, implying that
+ * z3 == 0, so the result is correct.
+ * If either of P1 or P2 is infinity, but not both, then z3 == 0, which is
+ * not correct.
+ * h == 0 only if u1 == u2; this happens in two cases:
+ * -- if s1 == s2 then P1 and/or P2 is infinity, or P1 == P2
+ * -- if s1 != s2 then P1 + P2 == infinity (but neither P1 nor P2 is infinity)
+ *
+ * Thus, the following situations are not handled correctly:
+ * -- P1 = 0 and P2 != 0
+ * -- P1 != 0 and P2 = 0
+ * -- P1 = P2
+ * All other cases are properly computed. However, even in "incorrect"
+ * situations, the three coordinates still are properly formed field
+ * elements.
+ *
+ * The returned flag is cleared if r == 0. This happens in the following
+ * cases:
+ * -- Both points are on the same horizontal line (same Y coordinate).
+ * -- Both points are infinity.
+ * -- One point is infinity and the other is on line Y = 0.
+ * The third case cannot happen with our curves (there is no valid point
+ * on line Y = 0 since that would be a point of order 2). If the two
+ * source points are non-infinity, then remains only the case where the
+ * two points are on the same horizontal line.
+ *
+ * This allows us to detect the "P1 == P2" case, assuming that P1 != 0 and
+ * P2 != 0:
+ * -- If the returned value is not the point at infinity, then it was properly
+ * computed.
+ * -- Otherwise, if the returned flag is 1, then P1+P2 = 0, and the result
+ * is indeed the point at infinity.
+ * -- Otherwise (result is infinity, flag is 0), then P1 = P2 and we should
+ * use the 'double' code.
+ *
+ * Cost: 16 multiplications
+ */
+static const uint16_t code_add[] = {
+	/*
+	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+	 */
+	MMUL(t3, P2z, P2z),
+	MMUL(t1, P1x, t3),
+	MMUL(t4, P2z, t3),
+	MMUL(t3, P1y, t4),
+
+	/*
+	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	 */
+	MMUL(t4, P1z, P1z),
+	MMUL(t2, P2x, t4),
+	MMUL(t5, P1z, t4),
+	MMUL(t4, P2y, t5),
+
+	/*
+	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
+	 */
+	MSUB(t2, t1),
+	MSUB(t4, t3),
+
+	/*
+	 * Report cases where r = 0 through the returned flag.
+	 */
+	MTZ(t4),
+
+	/*
+	 * Compute u1*h^2 (in t6) and h^3 (in t5).
+	 */
+	MMUL(t7, t2, t2),
+	MMUL(t6, t1, t7),
+	MMUL(t5, t7, t2),
+
+	/*
+	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	 * t1 and t7 can be used as scratch registers.
+	 */
+	MMUL(P1x, t4, t4),
+	MSUB(P1x, t5),
+	MSUB(P1x, t6),
+	MSUB(P1x, t6),
+
+	/*
+	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	 */
+	MSUB(t6, P1x),
+	MMUL(P1y, t4, t6),
+	MMUL(t1, t5, t3),
+	MSUB(P1y, t1),
+
+	/*
+	 * Compute z3 = h*z1*z2.
+	 */
+	MMUL(t1, P1z, P2z),
+	MMUL(P1z, t1, t2),
+
+	ENDCODE
+};
+
+/*
+ * Check that the point is on the curve. This code snippet assumes the
+ * following conventions:
+ * -- Coordinates x and y have been freshly decoded in P1 (but not
+ * converted to Montgomery coordinates yet).
+ * -- P2x, P2y and P2z are set to, respectively, R^2, b*R and 1.
+ */
+static const uint16_t code_check[] = {
+
+	/* Convert x and y to Montgomery representation. */
+	MMUL(t1, P1x, P2x),
+	MMUL(t2, P1y, P2x),
+	MSET(P1x, t1),
+	MSET(P1y, t2),
+
+	/* Compute x^3 in t1. */
+	MMUL(t2, P1x, P1x),
+	MMUL(t1, P1x, t2),
+
+	/* Subtract 3*x from t1. */
+	MSUB(t1, P1x),
+	MSUB(t1, P1x),
+	MSUB(t1, P1x),
+
+	/* Add b. */
+	MADD(t1, P2y),
+
+	/* Compute y^2 in t2. */
+	MMUL(t2, P1y, P1y),
+
+	/* Compare y^2 with x^3 - 3*x + b; they must match. */
+	MSUB(t1, t2),
+	MTZ(t1),
+
+	/* Set z to 1 (in Montgomery representation). */
+	MMUL(P1z, P2x, P2z),
+
+	ENDCODE
+};
+
+/*
+ * Conversion back to affine coordinates. This code snippet assumes that
+ * the z coordinate of P2 is set to 1 (not in Montgomery representation).
+ *
+ * On exit, P1x and P1y hold the affine x = X/z^2 and y = Y/z^3 in
+ * normal (non-Montgomery) representation; P1z is left untouched. The
+ * snippet contains no zero test, so z = 0 (point at infinity) is not
+ * detected here.
+ */
+static const uint16_t code_affine[] = {
+
+	/* Save z*R in t1. */
+	MSET(t1, P1z),
+
+	/*
+	 * Compute z^3 in t2. The last product, with P2z = 1, removes
+	 * the Montgomery factor R so that t2 is in normal representation.
+	 */
+	MMUL(t2, P1z, P1z),
+	MMUL(t3, P1z, t2),
+	MMUL(t2, t3, P2z),
+
+	/* Invert to (1/z^3) in t2. t3 and t4 are scratch registers. */
+	MINV(t2, t3, t4),
+
+	/* Compute y. */
+	MSET(t3, P1y),
+	MMUL(P1y, t2, t3),
+
+	/* Compute (1/z^2) in t3: (1/z^3) times the saved z*R. */
+	MMUL(t3, t2, t1),
+
+	/* Compute x. */
+	MSET(t2, P1x),
+	MMUL(P1x, t2, t3),
+
+	ENDCODE
+};
+
+/*
+ * Interpreter for the small bytecode programs defined in this file
+ * (code_add, code_check, code_affine, ...).
+ *
+ * The three coordinates of P1 and P2 are copied into the register file
+ * t[], starting at registers P1x and P2x respectively; the program is
+ * run; then the three registers starting at P1x are copied back into
+ * P1. P2 is never modified.
+ *
+ * Each instruction is one 16-bit word: bits 12..15 are the opcode,
+ * bits 8..11 the destination register d, bits 4..7 the operand a and
+ * bits 0..3 the operand b. A word equal to 0 terminates the program.
+ *
+ * Returned value is 1, unless a zero-test instruction found its
+ * register equal to zero, in which case 0 is returned.
+ */
+static uint32_t
+run_code(jacobian *P1, const jacobian *P2,
+	const curve_params *cc, const uint16_t *code)
+{
+	uint32_t r;
+	uint32_t t[13][I31_LEN];
+	size_t u;
+
+	r = 1;
+
+	/*
+	 * Copy the two operands in the dedicated registers.
+	 * Each copy moves three consecutive registers at once.
+	 */
+	memcpy(t[P1x], P1->c, 3 * I31_LEN * sizeof(uint32_t));
+	memcpy(t[P2x], P2->c, 3 * I31_LEN * sizeof(uint32_t));
+
+	/*
+	 * Run formulas.
+	 */
+	for (u = 0;; u ++) {
+		unsigned op, d, a, b;
+
+		op = code[u];
+		if (op == 0) {
+			break;
+		}
+		d = (op >> 8) & 0x0F;
+		a = (op >> 4) & 0x0F;
+		b = op & 0x0F;
+		op >>= 12;
+		switch (op) {
+			/*
+			 * Scratch variables shared by the cases below; being
+			 * declared at switch scope they carry no initializer.
+			 */
+			uint32_t ctl;
+			size_t plen;
+			unsigned char tp[(BR_MAX_EC_SIZE + 7) >> 3];
+
+		case 0:
+			/* Copy: t[d] <- t[a]. */
+			memcpy(t[d], t[a], I31_LEN * sizeof(uint32_t));
+			break;
+		case 1:
+			/*
+			 * Modular addition: t[d] <- t[d] + t[a] mod p.
+			 * p is subtracted when the addition produced a
+			 * carry or when the sum is not lower than p.
+			 */
+			ctl = br_i31_add(t[d], t[a], 1);
+			ctl |= NOT(br_i31_sub(t[d], cc->p, 0));
+			br_i31_sub(t[d], cc->p, ctl);
+			break;
+		case 2:
+			/*
+			 * Modular subtraction: t[d] <- t[d] - t[a] mod p.
+			 * p is added back when the subtraction borrowed.
+			 */
+			br_i31_add(t[d], cc->p, br_i31_sub(t[d], t[a], 1));
+			break;
+		case 3:
+			/* Montgomery multiplication: t[d] <- t[a]*t[b]/R. */
+			br_i31_montymul(t[d], t[a], t[b], cc->p, cc->p0i);
+			break;
+		case 4:
+			/*
+			 * Modular inversion: t[d] is raised to the power
+			 * p-2, with t[a] and t[b] as scratch space. The
+			 * exponent is built by encoding p and subtracting 2
+			 * from its last byte only, which relies on that
+			 * byte being at least 2 (no borrow is propagated).
+			 */
+			plen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3;
+			br_i31_encode(tp, plen, cc->p);
+			tp[plen - 1] -= 2;
+			br_i31_modpow(t[d], tp, plen,
+				cc->p, cc->p0i, t[a], t[b]);
+			break;
+		default:
+			/* Zero test: clear the flag if t[d] is zero. */
+			r &= ~br_i31_iszero(t[d]);
+			break;
+		}
+	}
+
+	/*
+	 * Copy back result.
+	 */
+	memcpy(P1->c, t[P1x], 3 * I31_LEN * sizeof(uint32_t));
+	return r;
+}
+
+/*
+ * Set x to the integer value 1, using the same header word (announced
+ * bit length) as the modulus p. Every word covered by that header is
+ * cleared before the value is written.
+ */
+static void
+set_one(uint32_t *x, const uint32_t *p)
+{
+	size_t words, k;
+
+	/* One header word plus ceil(p[0] / 32) value words. */
+	words = (p[0] + 63) >> 5;
+	for (k = 0; k < words; k ++) {
+		x[k] = 0;
+	}
+	x[0] = p[0];
+	x[1] = 0x00000001;
+}
+
+/*
+ * Clear a point: all three coordinates become zero, each one carrying
+ * the header word of the field modulus.
+ */
+static void
+point_zero(jacobian *P, const curve_params *cc)
+{
+	size_t i;
+
+	memset(P, 0, sizeof *P);
+	for (i = 0; i < 3; i ++) {
+		P->c[i][0] = cc->p[0];
+	}
+}
+
+/*
+ * Double point P in place by running the code_double program, with P
+ * supplied as both operands. The flag returned by run_code() is
+ * ignored.
+ */
+static inline void
+point_double(jacobian *P, const curve_params *cc)
+{
+	run_code(P, P, cc, code_double);
+}
+
+/*
+ * Add P2 to P1 (result in P1) by running the code_add program.
+ *
+ * Returned value is the run_code() flag: 0 if the "r" value of the
+ * addition formulas was zero, 1 otherwise. The formulas do not handle
+ * the point at infinity, nor P1 = P2; see the callers for how these
+ * cases are dealt with.
+ */
+static inline uint32_t
+point_add(jacobian *P1, const jacobian *P2, const curve_params *cc)
+{
+	return run_code(P1, P2, cc, code_add);
+}
+
+/*
+ * Multiply point P by the scalar x[] (xlen bytes, processed from the
+ * first byte to the last, most significant bits of each byte first).
+ * The result replaces P. If no non-zero 2-bit digit is met, the result
+ * is the all-zero point produced by point_zero().
+ */
+static void
+point_mul(jacobian *P, const unsigned char *x, size_t xlen,
+	const curve_params *cc)
+{
+	/*
+	 * We do a simple double-and-add ladder with a 2-bit window
+	 * to make only one add every two doublings. We thus first
+	 * precompute 2P and 3P in some local buffers.
+	 *
+	 * We always perform two doublings and one addition; the
+	 * addition is with P, 2P and 3P and is done in a temporary
+	 * array.
+	 *
+	 * The addition code cannot handle cases where one of the
+	 * operands is infinity, which is the case at the start of the
+	 * ladder. We therefore need to maintain a flag that controls
+	 * this situation.
+	 */
+	uint32_t qz;
+	jacobian P2, P3, Q, T, U;
+
+	/* P2 = 2*P, P3 = P + 2*P. */
+	memcpy(&P2, P, sizeof P2);
+	point_double(&P2, cc);
+	memcpy(&P3, P, sizeof P3);
+	point_add(&P3, &P2, cc);
+
+	/* Q is the accumulator; qz is 1 as long as Q is still infinity. */
+	point_zero(&Q, cc);
+	qz = 1;
+	while (xlen -- > 0) {
+		int k;
+
+		for (k = 6; k >= 0; k -= 2) {
+			uint32_t bits;
+			uint32_t bnz;
+
+			point_double(&Q, cc);
+			point_double(&Q, cc);
+
+			/*
+			 * T is the point to add for this digit: P by
+			 * default, replaced with 2P or 3P depending on
+			 * the digit. U receives the tentative sum Q + T.
+			 */
+			memcpy(&T, P, sizeof T);
+			memcpy(&U, &Q, sizeof U);
+			bits = (*x >> k) & (uint32_t)3;
+			bnz = NEQ(bits, 0);
+			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
+			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
+			point_add(&U, &T, cc);
+
+			/*
+			 * Non-zero digit while Q is infinity: Q becomes T.
+			 * Non-zero digit otherwise: Q becomes the sum U.
+			 * Zero digit: Q is left unchanged.
+			 */
+			CCOPY(bnz & qz, &Q, &T, sizeof Q);
+			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
+			qz &= ~bnz;
+		}
+		x ++;
+	}
+	memcpy(P, &Q, sizeof Q);
+}
+
+/*
+ * Decode point into Jacobian coordinates. This function does not support
+ * the point at infinity. If the point is invalid then this returns 0, but
+ * the coordinates are still set to properly formed field elements.
+ *
+ * Returned value is 1 when the source has the expected length, starts
+ * with byte 0x04, both coordinates decode as values lower than the
+ * field modulus, and the curve equation is fulfilled. If the length is
+ * wrong, 0 is returned right away and P is left as set by point_zero().
+ */
+static uint32_t
+point_decode(jacobian *P, const void *src, size_t len, const curve_params *cc)
+{
+	/*
+	 * Points must use uncompressed format:
+	 * -- first byte is 0x04;
+	 * -- coordinates X and Y use unsigned big-endian, with the same
+	 *    length as the field modulus.
+	 *
+	 * We don't support hybrid format (uncompressed, but first byte
+	 * has value 0x06 or 0x07, depending on the least significant bit
+	 * of Y) because it is rather useless, and explicitly forbidden
+	 * by PKIX (RFC 5480, section 2.2).
+	 *
+	 * We don't support compressed format either, because it is not
+	 * much used in practice (there are or were patent-related
+	 * concerns about point compression, which explains the lack of
+	 * generalised support). Also, point compression support would
+	 * need a bit more code.
+	 */
+	const unsigned char *buf;
+	size_t plen, zlen;
+	uint32_t r;
+	jacobian Q;
+
+	buf = src;
+	point_zero(P, cc);
+
+	/* plen is the length of the field modulus, in bytes. */
+	plen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3;
+	if (len != 1 + (plen << 1)) {
+		return 0;
+	}
+	r = br_i31_decode_mod(P->c[0], buf + 1, plen, cc->p);
+	r &= br_i31_decode_mod(P->c[1], buf + 1 + plen, plen, cc->p);
+
+	/*
+	 * Check first byte. Only 0x04 (uncompressed format) is
+	 * accepted; the hybrid format is rejected.
+	 */
+	r &= EQ(buf[0], 0x04);
+
+	/*
+	 * Convert coordinates and check that the point is valid.
+	 * Q carries the constants expected by code_check: R^2, b*R
+	 * and 1. run_code() returns 0 when the curve equation holds,
+	 * hence the inversion.
+	 */
+	zlen = ((cc->p[0] + 63) >> 5) * sizeof(uint32_t);
+	memcpy(Q.c[0], cc->R2, zlen);
+	memcpy(Q.c[1], cc->b, zlen);
+	set_one(Q.c[2], cc->p);
+	r &= ~run_code(P, &Q, cc, code_check);
+	return r;
+}
+
+/*
+ * Write the uncompressed encoding of point P into dst: one 0x04 byte
+ * followed by the affine X and Y coordinates, each over plen bytes
+ * (plen being the byte length of the field modulus). Exactly 1+2*plen
+ * bytes are written. The point is assumed to be valid and distinct
+ * from the point at infinity.
+ */
+static void
+point_encode(void *dst, const jacobian *P, const curve_params *cc)
+{
+	unsigned char *out;
+	size_t flen;
+	jacobian aff, one;
+
+	/* Byte length of the field modulus. */
+	flen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3;
+
+	/*
+	 * Convert a copy of P to affine coordinates; code_affine only
+	 * needs the z coordinate of its second operand, set to 1.
+	 */
+	memcpy(&aff, P, sizeof *P);
+	set_one(one.c[2], cc->p);
+	run_code(&aff, &one, cc, code_affine);
+
+	/* Format marker, then X, then Y. */
+	out = dst;
+	out[0] = 0x04;
+	br_i31_encode(out + 1, flen, aff.c[0]);
+	br_i31_encode(out + 1 + flen, flen, aff.c[1]);
+}
+
+/*
+ * Map a curve identifier to its public definition (order and
+ * generator). Returns NULL for any curve other than secp256r1,
+ * secp384r1 and secp521r1.
+ */
+static const br_ec_curve_def *
+id_to_curve_def(int curve)
+{
+	if (curve == BR_EC_secp256r1) {
+		return &br_secp256r1;
+	}
+	if (curve == BR_EC_secp384r1) {
+		return &br_secp384r1;
+	}
+	if (curve == BR_EC_secp521r1) {
+		return &br_secp521r1;
+	}
+	return NULL;
+}
+
+/*
+ * Return the encoded conventional generator of the curve and store its
+ * length (in bytes) in *len. The curve must be one of those known to
+ * id_to_curve_def(): no check is made on the looked-up definition.
+ */
+static const unsigned char *
+api_generator(int curve, size_t *len)
+{
+	const br_ec_curve_def *def = id_to_curve_def(curve);
+
+	*len = def->generator_len;
+	return def->generator;
+}
+
+/*
+ * Return the encoded subgroup order of the curve and store its length
+ * (in bytes) in *len. The curve must be one of those known to
+ * id_to_curve_def(): no check is made on the looked-up definition.
+ */
+static const unsigned char *
+api_order(int curve, size_t *len)
+{
+	const br_ec_curve_def *def = id_to_curve_def(curve);
+
+	*len = def->order_len;
+	return def->order;
+}
+
+/*
+ * Locate the X coordinate within an encoded point: the returned offset
+ * is always 1 (the X coordinate follows the leading format byte), and
+ * *len receives half of the encoded generator length, rounded down.
+ */
+static size_t
+api_xoff(int curve, size_t *len)
+{
+	size_t glen;
+
+	(void)api_generator(curve, &glen);
+	*len = glen >> 1;
+	return 1;
+}
+
+/*
+ * Multiply the curve point encoded in G[] (uncompressed format, Glen
+ * bytes) by the scalar x[] (xlen bytes, unsigned big-endian), and
+ * write the encoded result back over G[].
+ *
+ * Returned value is 1 on success, 0 on error. If Glen is not the
+ * encoded point length for that curve, 0 is returned and G[] is left
+ * untouched. If the length is right but the point fails decoding, 0 is
+ * returned and G[] is still overwritten, with unspecified contents.
+ */
+static uint32_t
+api_mul(unsigned char *G, size_t Glen,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	uint32_t r;
+	size_t plen;
+	const curve_params *cc;
+	jacobian P;
+
+	cc = id_to_curve(curve);
+
+	/*
+	 * point_encode() always writes 1+2*plen bytes into G[], so a
+	 * buffer of any other length must be rejected before anything
+	 * is written. The length is public data, so returning early
+	 * here does not leak anything about the scalar.
+	 */
+	plen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3;
+	if (Glen != 1 + (plen << 1)) {
+		return 0;
+	}
+	r = point_decode(&P, G, Glen, cc);
+	point_mul(&P, x, xlen, cc);
+	point_encode(G, &P, cc);
+	return r;
+}
+
+/*
+ * Multiply the conventional generator of the curve by the scalar x[]
+ * (xlen bytes) and write the encoded result into R[], which must be
+ * large enough for an encoded point. Returned value is the length of
+ * that encoding, in bytes. The status returned by api_mul() is not
+ * checked.
+ */
+static size_t
+api_mulgen(unsigned char *R,
+	const unsigned char *x, size_t xlen, int curve)
+{
+	const unsigned char *G;
+	size_t Glen;
+
+	G = api_generator(curve, &Glen);
+	memcpy(R, G, Glen);
+	api_mul(R, Glen, x, xlen, curve);
+	return Glen;
+}
+
+/*
+ * Compute x*A + y*B and write the encoded result over A[]. Both A[]
+ * and B[] are encoded points of len bytes; if B is NULL then the
+ * conventional generator of the curve is used instead. Scalars x[]
+ * and y[] use unsigned big-endian encoding.
+ *
+ * Returned value is 1 on success, 0 on error (invalid operand, or
+ * result equal to the point at infinity). If len is not the encoded
+ * point length for that curve, 0 is returned and A[] is left
+ * untouched; on other errors A[] is overwritten with unspecified
+ * contents.
+ */
+static uint32_t
+api_muladd(unsigned char *A, const unsigned char *B, size_t len,
+	const unsigned char *x, size_t xlen,
+	const unsigned char *y, size_t ylen, int curve)
+{
+	uint32_t r, t, z;
+	size_t plen;
+	const curve_params *cc;
+	jacobian P, Q;
+
+	/*
+	 * TODO: see about merging the two ladders. Right now, we do
+	 * two independent point multiplications, which is a bit
+	 * wasteful of CPU resources (but yields short code).
+	 */
+
+	cc = id_to_curve(curve);
+
+	/*
+	 * point_encode() always writes 1+2*plen bytes into A[], so a
+	 * buffer of any other length must be rejected before anything
+	 * is written. The length is public data, so returning early
+	 * here does not leak anything about the scalars.
+	 */
+	plen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3;
+	if (len != 1 + (plen << 1)) {
+		return 0;
+	}
+	r = point_decode(&P, A, len, cc);
+	if (B == NULL) {
+		size_t Glen;
+
+		B = api_generator(curve, &Glen);
+	}
+	r &= point_decode(&Q, B, len, cc);
+	point_mul(&P, x, xlen, cc);
+	point_mul(&Q, y, ylen, cc);
+
+	/*
+	 * We want to compute P+Q. Since the base points A and B are distinct
+	 * from infinity, and the multipliers are non-zero and lower than the
+	 * curve order, then we know that P and Q are non-infinity. This
+	 * leaves two special situations to test for:
+	 * -- If P = Q then we must use point_double().
+	 * -- If P+Q = 0 then we must report an error.
+	 *
+	 * Both the sum (in P) and the double (in Q) are always computed;
+	 * the right one is selected afterwards.
+	 */
+	t = point_add(&P, &Q, cc);
+	point_double(&Q, cc);
+	z = br_i31_iszero(P.c[2]);
+
+	/*
+	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+	 * have the following:
+	 *
+	 *   z = 0, t = 0   return P (normal addition)
+	 *   z = 0, t = 1   return P (normal addition)
+	 *   z = 1, t = 0   return Q (a 'double' case)
+	 *   z = 1, t = 1   report an error (P+Q = 0)
+	 */
+	CCOPY(z & ~t, &P, &Q, sizeof Q);
+	point_encode(A, &P, cc);
+	r &= ~(z & t);
+
+	return r;
+}
+
+/*
+ * see bearssl_ec.h
+ *
+ * The first field is the bit mask of supported curves: bits 23, 24
+ * and 25 are set, matching the three curves handled by
+ * id_to_curve_def() (secp256r1, secp384r1 and secp521r1). The other
+ * fields are the implementation functions defined above.
+ */
+const br_ec_impl br_ec_prime_i31 = {
+	(uint32_t)0x03800000,
+	&api_generator,
+	&api_order,
+	&api_xoff,
+	&api_mul,
+	&api_mulgen,
+	&api_muladd
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_pubkey.c b/test/monniaux/BearSSL/src/ec/ec_pubkey.c
new file mode 100644
index 00000000..383ff286
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_pubkey.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Length (in bytes) of an encoded public key point, indexed by curve
+ * identifier. Entry 0 is a placeholder since 0 is not a valid curve
+ * identifier. This is the value reported by br_ec_compute_pub() when
+ * no output buffer is provided.
+ */
+static const unsigned char POINT_LEN[] = {
+	  0,   /* 0: not a valid curve ID */
+	 43,   /* sect163k1 */
+	 43,   /* sect163r1 */
+	 43,   /* sect163r2 */
+	 51,   /* sect193r1 */
+	 51,   /* sect193r2 */
+	 61,   /* sect233k1 */
+	 61,   /* sect233r1 */
+	 61,   /* sect239k1 */
+	 73,   /* sect283k1 */
+	 73,   /* sect283r1 */
+	105,   /* sect409k1 */
+	105,   /* sect409r1 */
+	145,   /* sect571k1 */
+	145,   /* sect571r1 */
+	 41,   /* secp160k1 */
+	 41,   /* secp160r1 */
+	 41,   /* secp160r2 */
+	 49,   /* secp192k1 */
+	 49,   /* secp192r1 */
+	 57,   /* secp224k1 */
+	 57,   /* secp224r1 */
+	 65,   /* secp256k1 */
+	 65,   /* secp256r1 */
+	 97,   /* secp384r1 */
+	133,   /* secp521r1 */
+	 65,   /* brainpoolP256r1 */
+	 97,   /* brainpoolP384r1 */
+	129,   /* brainpoolP512r1 */
+	 32,   /* curve25519 */
+	 56,   /* curve448 */
+};
+
+/*
+ * see bearssl_ec.h
+ *
+ * Compute the public key matching private key sk, using the provided
+ * EC implementation. If kbuf is NULL, only the public key length is
+ * returned. Otherwise the encoded point is written into kbuf and, if
+ * pk is not NULL, pk is filled to reference it. Returned value is the
+ * public key length, or 0 if the curve is not supported.
+ */
+size_t
+br_ec_compute_pub(const br_ec_impl *impl, br_ec_public_key *pk,
+	void *kbuf, const br_ec_private_key *sk)
+{
+	int id;
+	size_t qlen;
+
+	id = sk->curve;
+
+	/* The identifier must index both the bit mask and the table. */
+	if (id < 0 || id >= 32 || id >= (int)(sizeof POINT_LEN)) {
+		return 0;
+	}
+	if (((impl->supported_curves >> id) & 1) == 0) {
+		return 0;
+	}
+
+	/* Length query only. */
+	if (kbuf == NULL) {
+		return POINT_LEN[id];
+	}
+
+	qlen = impl->mulgen(kbuf, sk->x, sk->xlen, id);
+	if (pk == NULL) {
+		return qlen;
+	}
+	pk->curve = id;
+	pk->q = kbuf;
+	pk->qlen = qlen;
+	return qlen;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ec_secp256r1.c b/test/monniaux/BearSSL/src/ec/ec_secp256r1.c
new file mode 100644
index 00000000..a9d6c456
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_secp256r1.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Subgroup order for curve secp256r1 (unsigned big-endian, 32 bytes).
+ */
+static const unsigned char P256_N[] = {
+	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xBC, 0xE6, 0xFA, 0xAD, 0xA7, 0x17, 0x9E, 0x84,
+	0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63, 0x25, 0x51
+};
+
+/*
+ * Conventional generator for curve secp256r1, in uncompressed format:
+ * a 0x04 byte followed by the X and Y coordinates, 32 bytes each
+ * (65 bytes in total).
+ */
+static const unsigned char P256_G[] = {
+	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42,
+	0x47, 0xF8, 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40,
+	0xF2, 0x77, 0x03, 0x7D, 0x81, 0x2D, 0xEB, 0x33,
+	0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8, 0x98, 0xC2,
+	0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
+	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E,
+	0x16, 0x2B, 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E,
+	0xCE, 0xCB, 0xB6, 0x40, 0x68, 0x37, 0xBF, 0x51,
+	0xF5
+};
+
+/*
+ * see inner.h
+ *
+ * Public definition of curve secp256r1: curve identifier, then the
+ * subgroup order and the generator, each followed by its length.
+ */
+const br_ec_curve_def br_secp256r1 = {
+	BR_EC_secp256r1,
+	P256_N, sizeof P256_N,
+	P256_G, sizeof P256_G
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_secp384r1.c b/test/monniaux/BearSSL/src/ec/ec_secp384r1.c
new file mode 100644
index 00000000..693d93e4
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_secp384r1.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Subgroup order for curve secp384r1 (unsigned big-endian, 48 bytes).
+ */
+static const unsigned char P384_N[] = {
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 
+	0xC7, 0x63, 0x4D, 0x81, 0xF4, 0x37, 0x2D, 0xDF, 
+	0x58, 0x1A, 0x0D, 0xB2, 0x48, 0xB0, 0xA7, 0x7A, 
+	0xEC, 0xEC, 0x19, 0x6A, 0xCC, 0xC5, 0x29, 0x73
+};
+
+/*
+ * Conventional generator for curve secp384r1, in uncompressed format:
+ * a 0x04 byte followed by the X and Y coordinates, 48 bytes each
+ * (97 bytes in total).
+ */
+static const unsigned char P384_G[] = {
+	0x04, 0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05,
+	0x37, 0x8E, 0xB1, 0xC7, 0x1E, 0xF3, 0x20, 0xAD,
+	0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B,
+	0x98, 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A,
+	0x38, 0x55, 0x02, 0xF2, 0x5D, 0xBF, 0x55, 0x29,
+	0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A,
+	0xB7, 0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C,
+	0x6F, 0x5D, 0x9E, 0x98, 0xBF, 0x92, 0x92, 0xDC,
+	0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14,
+	0x7C, 0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8,
+	0xC0, 0x0A, 0x60, 0xB1, 0xCE, 0x1D, 0x7E, 0x81,
+	0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E,
+	0x5F
+};
+
+/*
+ * see inner.h
+ *
+ * Public definition of curve secp384r1: curve identifier, then the
+ * subgroup order and the generator, each followed by its length.
+ */
+const br_ec_curve_def br_secp384r1 = {
+	BR_EC_secp384r1,
+	P384_N, sizeof P384_N,
+	P384_G, sizeof P384_G
+};
diff --git a/test/monniaux/BearSSL/src/ec/ec_secp521r1.c b/test/monniaux/BearSSL/src/ec/ec_secp521r1.c
new file mode 100644
index 00000000..161acd0e
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_secp521r1.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Subgroup order for curve secp521r1 (unsigned big-endian, 66 bytes).
+ */
+static const unsigned char P521_N[] = {
+	0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	0xFF, 0xFA, 0x51, 0x86, 0x87, 0x83, 0xBF, 0x2F,
+	0x96, 0x6B, 0x7F, 0xCC, 0x01, 0x48, 0xF7, 0x09,
+	0xA5, 0xD0, 0x3B, 0xB5, 0xC9, 0xB8, 0x89, 0x9C,
+	0x47, 0xAE, 0xBB, 0x6F, 0xB7, 0x1E, 0x91, 0x38,
+	0x64, 0x09
+};
+
+/*
+ * Conventional generator for curve secp521r1, in uncompressed format:
+ * a 0x04 byte followed by the X and Y coordinates, 66 bytes each
+ * (133 bytes in total).
+ */
+static const unsigned char P521_G[] = {
+	0x04, 0x00, 0xC6, 0x85, 0x8E, 0x06, 0xB7, 0x04,
+	0x04, 0xE9, 0xCD, 0x9E, 0x3E, 0xCB, 0x66, 0x23,
+	0x95, 0xB4, 0x42, 0x9C, 0x64, 0x81, 0x39, 0x05,
+	0x3F, 0xB5, 0x21, 0xF8, 0x28, 0xAF, 0x60, 0x6B,
+	0x4D, 0x3D, 0xBA, 0xA1, 0x4B, 0x5E, 0x77, 0xEF,
+	0xE7, 0x59, 0x28, 0xFE, 0x1D, 0xC1, 0x27, 0xA2,
+	0xFF, 0xA8, 0xDE, 0x33, 0x48, 0xB3, 0xC1, 0x85,
+	0x6A, 0x42, 0x9B, 0xF9, 0x7E, 0x7E, 0x31, 0xC2,
+	0xE5, 0xBD, 0x66, 0x01, 0x18, 0x39, 0x29, 0x6A,
+	0x78, 0x9A, 0x3B, 0xC0, 0x04, 0x5C, 0x8A, 0x5F,
+	0xB4, 0x2C, 0x7D, 0x1B, 0xD9, 0x98, 0xF5, 0x44,
+	0x49, 0x57, 0x9B, 0x44, 0x68, 0x17, 0xAF, 0xBD,
+	0x17, 0x27, 0x3E, 0x66, 0x2C, 0x97, 0xEE, 0x72,
+	0x99, 0x5E, 0xF4, 0x26, 0x40, 0xC5, 0x50, 0xB9,
+	0x01, 0x3F, 0xAD, 0x07, 0x61, 0x35, 0x3C, 0x70,
+	0x86, 0xA2, 0x72, 0xC2, 0x40, 0x88, 0xBE, 0x94,
+	0x76, 0x9F, 0xD1, 0x66, 0x50
+};
+
+/*
+ * see inner.h
+ *
+ * Public definition of curve secp521r1: curve identifier, then the
+ * subgroup order and the generator, each followed by its length.
+ */
+const br_ec_curve_def br_secp521r1 = {
+	BR_EC_secp521r1,
+	P521_N, sizeof P521_N,
+	P521_G, sizeof P521_G
+};
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_atr.c b/test/monniaux/BearSSL/src/ec/ecdsa_atr.c
new file mode 100644
index 00000000..3a11226e
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_atr.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * see bearssl_ec.h
+ *
+ * Convert, in place, an ECDSA signature from its ASN.1 encoding
+ * (SEQUENCE of two INTEGER values r and s) to "raw" format: r and s
+ * as unsigned big-endian integers of the same length, concatenated.
+ *
+ * Returned value is the raw signature length (at most 254 bytes), or
+ * 0 if the input could not be decoded; on error, the buffer is left
+ * unmodified.
+ *
+ * The raw signature may be LONGER than the ASN.1 input, when r and s
+ * have very different lengths: the shorter one is padded to the length
+ * of the longer one. The output is always shorter than twice the
+ * input length, so the sig[] buffer must have room for that.
+ */
+size_t
+br_ecdsa_asn1_to_raw(void *sig, size_t sig_len)
+{
+	/*
+	 * Note: this code is a bit lenient in that it accepts a few
+	 * deviations to DER with regards to minimality of encoding of
+	 * lengths and integer values. These deviations are still
+	 * unambiguous.
+	 *
+	 * Signature format is a SEQUENCE of two INTEGER values. We
+	 * support only integers of less than 127 bytes each (signed
+	 * encoding) so the resulting raw signature will have length
+	 * at most 254 bytes.
+	 */
+
+	unsigned char *buf, *r, *s;
+	size_t zlen, rlen, slen, off;
+	unsigned char tmp[254];
+
+	buf = sig;
+	if (sig_len < 8) {
+		return 0;
+	}
+
+	/*
+	 * First byte is SEQUENCE tag.
+	 */
+	if (buf[0] != 0x30) {
+		return 0;
+	}
+
+	/*
+	 * The SEQUENCE length will be encoded over one or two bytes. We
+	 * limit the total SEQUENCE contents to 255 bytes, because it
+	 * makes things simpler; this is enough for subgroup orders up
+	 * to 999 bits.
+	 *
+	 * A first length byte of 0x80 is the "indefinite length" marker
+	 * of BER; it does not encode a length of 128 and is not allowed
+	 * in DER, so it is rejected along with all multi-byte forms
+	 * other than 0x81.
+	 */
+	zlen = buf[1];
+	if (zlen >= 0x80) {
+		if (zlen != 0x81) {
+			return 0;
+		}
+		zlen = buf[2];
+		if (zlen != sig_len - 3) {
+			return 0;
+		}
+		off = 3;
+	} else {
+		if (zlen != sig_len - 2) {
+			return 0;
+		}
+		off = 2;
+	}
+
+	/*
+	 * First INTEGER (r).
+	 */
+	if (buf[off ++] != 0x02) {
+		return 0;
+	}
+	rlen = buf[off ++];
+	if (rlen >= 0x80) {
+		return 0;
+	}
+	r = buf + off;
+	off += rlen;
+
+	/*
+	 * Second INTEGER (s). Its tag and length bytes must still be
+	 * within the input, and its value must end exactly where the
+	 * input ends.
+	 */
+	if (off + 2 > sig_len) {
+		return 0;
+	}
+	if (buf[off ++] != 0x02) {
+		return 0;
+	}
+	slen = buf[off ++];
+	if (slen >= 0x80 || slen != sig_len - off) {
+		return 0;
+	}
+	s = buf + off;
+
+	/*
+	 * Removing leading zeros from r and s.
+	 */
+	while (rlen > 0 && *r == 0) {
+		rlen --;
+		r ++;
+	}
+	while (slen > 0 && *s == 0) {
+		slen --;
+		s ++;
+	}
+
+	/*
+	 * Compute common length for the two integers, then copy integers
+	 * into the temporary buffer, and finally copy it back over the
+	 * signature buffer. The temporary buffer is needed because r
+	 * and s point into the signature buffer itself.
+	 */
+	zlen = rlen > slen ? rlen : slen;
+	sig_len = zlen << 1;
+	memset(tmp, 0, sig_len);
+	memcpy(tmp + zlen - rlen, r, rlen);
+	memcpy(tmp + sig_len - slen, s, slen);
+	memcpy(sig, tmp, sig_len);
+	return sig_len;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_asn1.c
new file mode 100644
index 00000000..afbf8acb
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_asn1.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * see bearssl_ec.h
+ *
+ * Return the default ECDSA signature generator with ASN.1 output: the
+ * "i15" implementation when BR_LOMUL is set at compile time, the
+ * "i31" implementation otherwise.
+ */
+br_ecdsa_sign
+br_ecdsa_sign_asn1_get_default(void)
+{
+#if BR_LOMUL
+	return &br_ecdsa_i15_sign_asn1;
+#else
+	return &br_ecdsa_i31_sign_asn1;
+#endif
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_raw.c
new file mode 100644
index 00000000..287c9704
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_raw.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * see bearssl_ec.h
+ *
+ * Return the default ECDSA signature generator with raw output: the
+ * "i15" implementation when BR_LOMUL is set at compile time, the
+ * "i31" implementation otherwise.
+ */
+br_ecdsa_sign
+br_ecdsa_sign_raw_get_default(void)
+{
+#if BR_LOMUL
+	return &br_ecdsa_i15_sign_raw;
+#else
+	return &br_ecdsa_i31_sign_raw;
+#endif
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_asn1.c
new file mode 100644
index 00000000..fe0996e8
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_asn1.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * see bearssl_ec.h
+ *
+ * Return the default ECDSA signature verifier for ASN.1 signatures:
+ * the "i15" implementation when BR_LOMUL is set at compile time, the
+ * "i31" implementation otherwise.
+ */
+br_ecdsa_vrfy
+br_ecdsa_vrfy_asn1_get_default(void)
+{
+#if BR_LOMUL
+	return &br_ecdsa_i15_vrfy_asn1;
+#else
+	return &br_ecdsa_i31_vrfy_asn1;
+#endif
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_raw.c
new file mode 100644
index 00000000..e564a105
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_raw.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * see bearssl_ec.h
+ *
+ * Return the default ECDSA signature verifier for raw signatures:
+ * the "i15" implementation when BR_LOMUL is set at compile time, the
+ * "i31" implementation otherwise.
+ */
+br_ecdsa_vrfy
+br_ecdsa_vrfy_raw_get_default(void)
+{
+#if BR_LOMUL
+	return &br_ecdsa_i15_vrfy_raw;
+#else
+	return &br_ecdsa_i31_vrfy_raw;
+#endif
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_bits.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_bits.c
new file mode 100644
index 00000000..402d14a6
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_bits.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * see inner.h
+ *
+ * Decode the bit string src[] (len bytes, typically a hash value) into
+ * the integer x, whose header word is set to ebitlen. If the source
+ * has more bits than the length derived from ebitlen, only its leading
+ * bytes are used, and the value is shifted right so that the excess
+ * low-order bits are dropped.
+ */
+void
+br_ecdsa_i15_bits2int(uint16_t *x,
+	const void *src, size_t len, uint32_t ebitlen)
+{
+	uint32_t order_bits, hash_bits;
+	int drop;
+
+	/* Bit length derived from the encoded header value. */
+	order_bits = ebitlen - (ebitlen >> 4);
+	hash_bits = (uint32_t)len << 3;
+
+	/* By default the whole input is used, with no shift. */
+	drop = 0;
+	if (hash_bits > order_bits) {
+		drop = (int)((hash_bits - order_bits) & 7);
+		len = (order_bits + 7) >> 3;
+	}
+
+	br_i15_zero(x, ebitlen);
+	br_i15_decode(x, src, len);
+	br_i15_rshift(x, drop);
+	x[0] = ebitlen;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_asn1.c
new file mode 100644
index 00000000..ab4a283c
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_asn1.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define ORDER_LEN   ((BR_MAX_EC_SIZE + 7) >> 3)
+
+/* see bearssl_ec.h */
+size_t
+br_ecdsa_i15_sign_asn1(const br_ec_impl *impl,
+	const br_hash_class *hf, const void *hash_value,
+	const br_ec_private_key *sk, void *sig)
+{
+	/* Receives the raw signature, then is converted in place to ASN.1. */
+	unsigned char tmp[(ORDER_LEN << 1) + 12];
+	size_t len;
+
+	len = br_ecdsa_i15_sign_raw(impl, hf, hash_value, sk, tmp);
+	if (len != 0) {
+		len = br_ecdsa_raw_to_asn1(tmp, len);
+		memcpy(sig, tmp, len);
+	}
+	return len;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_raw.c
new file mode 100644
index 00000000..39b2e1d7
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_raw.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define I15_LEN     ((BR_MAX_EC_SIZE + 29) / 15)
+#define POINT_LEN   (1 + (((BR_MAX_EC_SIZE + 7) >> 3) << 1))
+#define ORDER_LEN   ((BR_MAX_EC_SIZE + 7) >> 3)
+
+/* see bearssl_ec.h -- writes r || s (nlen bytes each) into sig; 0 on error */
+size_t
+br_ecdsa_i15_sign_raw(const br_ec_impl *impl,
+	const br_hash_class *hf, const void *hash_value,
+	const br_ec_private_key *sk, void *sig)
+{
+	/*
+	 * IMPORTANT: this code is fit only for curves with a prime
+	 * order. This is needed so that modular reduction of the X
+	 * coordinate of a point can be done with a simple subtraction.
+	 * We also rely on the last byte of the curve order to be distinct
+	 * from 0 and 1.
+	 */
+	const br_ec_curve_def *cd;
+	uint16_t n[I15_LEN], r[I15_LEN], s[I15_LEN], x[I15_LEN];
+	uint16_t m[I15_LEN], k[I15_LEN], t1[I15_LEN], t2[I15_LEN];
+	unsigned char tt[ORDER_LEN << 1];	/* byte scratch: seed, k, exponent */
+	unsigned char eU[POINT_LEN];	/* encoded point k*G */
+	size_t hash_len, nlen, ulen;
+	uint16_t n0i;
+	uint32_t ctl;
+	br_hmac_drbg_context drbg;
+
+	/*
+	 * If the implementation does not support the key's curve, return 0.
+	 */
+	if (((impl->supported_curves >> sk->curve) & 1) == 0) {
+		return 0;
+	}
+
+	/*
+	 * Get the curve parameters; only the curves listed here are handled.
+	 */
+	switch (sk->curve) {
+	case BR_EC_secp256r1:
+		cd = &br_secp256r1;
+		break;
+	case BR_EC_secp384r1:
+		cd = &br_secp384r1;
+		break;
+	case BR_EC_secp521r1:
+		cd = &br_secp521r1;
+		break;
+	default:
+		return 0;
+	}
+
+	/*
+	 * Get modulus (the curve order n), and n0i for the Montgomery calls.
+	 */
+	nlen = cd->order_len;
+	br_i15_decode(n, cd->order, nlen);
+	n0i = br_i15_ninv15(n[1]);
+
+	/*
+	 * Get private key as an i15 integer. This also checks that the
+	 * private key is well-defined (not zero, and less than the
+	 * curve order).
+	 */
+	if (!br_i15_decode_mod(x, sk->x, sk->xlen, n)) {
+		return 0;
+	}
+	if (br_i15_iszero(x)) {
+		return 0;
+	}
+
+	/*
+	 * Get hash length (in bytes), from the hash function descriptor.
+	 */
+	hash_len = (hf->desc >> BR_HASHDESC_OUT_OFF) & BR_HASHDESC_OUT_MASK;
+
+	/*
+	 * Truncate and reduce the hash value modulo the curve order.
+	 */
+	br_ecdsa_i15_bits2int(m, hash_value, hash_len, n[0]);
+	br_i15_sub(m, n, br_i15_sub(m, n, 0) ^ 1);	/* m -= n iff m >= n */
+
+	/*
+	 * RFC 6979 generation of the "k" value.
+	 *
+	 * The process uses HMAC_DRBG (with the hash function used to
+	 * process the message that is to be signed). The seed is the
+	 * concatenation of the encodings of the private key and
+	 * the hash value (after truncation and modular reduction).
+	 */
+	br_i15_encode(tt, nlen, x);
+	br_i15_encode(tt + nlen, nlen, m);
+	br_hmac_drbg_init(&drbg, hf, tt, nlen << 1);
+	for (;;) {
+		br_hmac_drbg_generate(&drbg, tt, nlen);
+		br_ecdsa_i15_bits2int(k, tt, nlen, n[0]);
+		if (br_i15_iszero(k)) {
+			continue;	/* k = 0 is not usable; draw again */
+		}
+		if (br_i15_sub(k, n, 0)) {
+			break;	/* borrow reported: k < n, accept */
+		}
+	}
+
+	/*
+	 * Compute k*G and extract the X coordinate, then reduce it
+	 * modulo the curve order. Since we support only curves with
+	 * prime order, that reduction is only a matter of computing
+	 * a subtraction.
+	 */
+	br_i15_encode(tt, nlen, k);
+	ulen = impl->mulgen(eU, tt, nlen, sk->curve);
+	br_i15_zero(r, n[0]);
+	br_i15_decode(r, &eU[1], ulen >> 1);	/* skip first byte; X is next */
+	r[0] = n[0];	/* header word: same bit length as n */
+	br_i15_sub(r, n, br_i15_sub(r, n, 0) ^ 1);	/* r -= n iff r >= n */
+
+	/*
+	 * Compute 1/k in double-Montgomery representation. We do so by
+	 * first converting _from_ Montgomery representation (twice),
+	 * then using a modular exponentiation.
+	 */
+	br_i15_from_monty(k, n, n0i);
+	br_i15_from_monty(k, n, n0i);
+	memcpy(tt, cd->order, nlen);
+	tt[nlen - 1] -= 2;	/* exponent is n - 2 (no borrow, see top) */
+	br_i15_modpow(k, tt, nlen, n, n0i, t1, t2);
+
+	/*
+	 * Compute s = (m+xr)/k (mod n).
+	 * The k[] array contains R^2/k (double-Montgomery representation);
+	 * we thus can use direct Montgomery multiplications and conversions
+	 * from Montgomery, avoiding any call to br_i15_to_monty() (which
+	 * is slower).
+	 */
+	br_i15_from_monty(m, n, n0i);	/* m <- m/R */
+	br_i15_montymul(t1, x, r, n, n0i);	/* t1 <- x*r/R */
+	ctl = br_i15_add(t1, m, 1);	/* t1 <- (m + x*r)/R; carry in ctl */
+	ctl |= br_i15_sub(t1, n, 0) ^ 1;	/* ...or t1 >= n */
+	br_i15_sub(t1, n, ctl);	/* subtract n if either holds */
+	br_i15_montymul(s, t1, k, n, n0i);	/* s <- t1*(R^2/k)/R */
+
+	/*
+	 * Encode r and s in the signature, each over exactly nlen bytes.
+	 */
+	br_i15_encode(sig, nlen, r);
+	br_i15_encode((unsigned char *)sig + nlen, nlen, s);
+	return nlen << 1;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_asn1.c
new file mode 100644
index 00000000..f4bef997
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_asn1.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define FIELD_LEN   ((BR_MAX_EC_SIZE + 7) >> 3)
+
+/* see bearssl_ec.h */
+uint32_t
+br_ecdsa_i15_vrfy_asn1(const br_ec_impl *impl,
+	const void *hash, size_t hash_len,
+	const br_ec_public_key *pk,
+	const void *sig, size_t sig_len)
+{
+	/*
+	 * Twice the input limit: malformed ASN.1 may expand once made "raw".
+	 */
+	unsigned char tmp[(FIELD_LEN << 2) + 24];
+	size_t raw_len;
+
+	if (sig_len > (sizeof tmp) / 2) {
+		return 0;
+	}
+	memcpy(tmp, sig, sig_len);
+	raw_len = br_ecdsa_asn1_to_raw(tmp, sig_len);
+	return br_ecdsa_i15_vrfy_raw(impl, hash, hash_len, pk, tmp, raw_len);
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_raw.c
new file mode 100644
index 00000000..14dd5e46
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_raw.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define I15_LEN     ((BR_MAX_EC_SIZE + 29) / 15)
+#define POINT_LEN   (1 + (((BR_MAX_EC_SIZE + 7) >> 3) << 1))
+
+/* see bearssl_ec.h -- sig is r || s; returns 1 if it verifies, 0 otherwise */
+uint32_t
+br_ecdsa_i15_vrfy_raw(const br_ec_impl *impl,
+	const void *hash, size_t hash_len,
+	const br_ec_public_key *pk,
+	const void *sig, size_t sig_len)
+{
+	/*
+	 * IMPORTANT: this code is fit only for curves with a prime
+	 * order. This is needed so that modular reduction of the X
+	 * coordinate of a point can be done with a simple subtraction.
+	 */
+	const br_ec_curve_def *cd;
+	uint16_t n[I15_LEN], r[I15_LEN], s[I15_LEN], t1[I15_LEN], t2[I15_LEN];
+	unsigned char tx[(BR_MAX_EC_SIZE + 7) >> 3];
+	unsigned char ty[(BR_MAX_EC_SIZE + 7) >> 3];
+	unsigned char eU[POINT_LEN];	/* public point in, result point out */
+	size_t nlen, rlen, ulen;
+	uint16_t n0i;
+	uint32_t res;
+
+	/*
+	 * If the implementation does not support the key's curve, return 0.
+	 */
+	if (((impl->supported_curves >> pk->curve) & 1) == 0) {
+		return 0;
+	}
+
+	/*
+	 * Get the curve parameters; only the curves listed here are handled.
+	 */
+	switch (pk->curve) {
+	case BR_EC_secp256r1:
+		cd = &br_secp256r1;
+		break;
+	case BR_EC_secp384r1:
+		cd = &br_secp384r1;
+		break;
+	case BR_EC_secp521r1:
+		cd = &br_secp521r1;
+		break;
+	default:
+		return 0;
+	}
+
+	/*
+	 * Signature length must be even.
+	 */
+	if (sig_len & 1) {
+		return 0;
+	}
+	rlen = sig_len >> 1;	/* r and s each take one half */
+
+	/*
+	 * Public key point must have the proper size for this curve.
+	 */
+	if (pk->qlen != cd->generator_len) {
+		return 0;
+	}
+
+	/*
+	 * Get modulus; then decode the r and s values. They must be
+	 * lower than the modulus, and s must not be null.
+	 */
+	nlen = cd->order_len;
+	br_i15_decode(n, cd->order, nlen);
+	n0i = br_i15_ninv15(n[1]);
+	if (!br_i15_decode_mod(r, sig, rlen, n)) {
+		return 0;
+	}
+	if (!br_i15_decode_mod(s, (const unsigned char *)sig + rlen, rlen, n)) {
+		return 0;
+	}
+	if (br_i15_iszero(s)) {
+		return 0;
+	}
+
+	/*
+	 * Invert s. We do that with a modular exponentiation; we use
+	 * the fact that for all the curves we support, the least
+	 * significant byte of the order is not 0 or 1, so we can
+	 * subtract 2 without any carry to process.
+	 * We also want 1/s in Montgomery representation, which can be
+	 * done by converting _from_ Montgomery representation before
+	 * the inversion (because (1/s)*R = 1/(s/R)).
+	 */
+	br_i15_from_monty(s, n, n0i);
+	memcpy(tx, cd->order, nlen);
+	tx[nlen - 1] -= 2;	/* exponent is n - 2 */
+	br_i15_modpow(s, tx, nlen, n, n0i, t1, t2);
+
+	/*
+	 * Truncate the hash to the modulus length (in bits) and reduce
+	 * it modulo the curve order. The modular reduction can be done
+	 * with a subtraction since the truncation already reduced the
+	 * value to the modulus bit length.
+	 */
+	br_ecdsa_i15_bits2int(t1, hash, hash_len, n[0]);
+	br_i15_sub(t1, n, br_i15_sub(t1, n, 0) ^ 1);	/* t1 -= n iff t1 >= n */
+
+	/*
+	 * Multiply the (truncated, reduced) hash value with 1/s, result in
+	 * t2, encoded in ty.
+	 */
+	br_i15_montymul(t2, t1, s, n, n0i);	/* t2 <- hash*(R/s)/R */
+	br_i15_encode(ty, nlen, t2);
+
+	/*
+	 * Multiply r with 1/s, result in t1, encoded in tx.
+	 */
+	br_i15_montymul(t1, r, s, n, n0i);	/* t1 <- r*(R/s)/R */
+	br_i15_encode(tx, nlen, t1);
+
+	/*
+	 * Compute the point x*Q + y*G, with x = tx (r/s) and y = ty (hash/s).
+	 */
+	ulen = cd->generator_len;
+	memcpy(eU, pk->q, ulen);
+	res = impl->muladd(eU, NULL, ulen,
+		tx, nlen, ty, nlen, cd->curve);
+
+	/*
+	 * Get the X coordinate, reduce modulo the curve order, and
+	 * compare with the 'r' value.
+	 *
+	 * The modular reduction can be done with subtractions because
+	 * we work with curves of prime order, so the curve order is
+	 * close to the field order (Hasse's theorem).
+	 */
+	br_i15_zero(t1, n[0]);
+	br_i15_decode(t1, &eU[1], ulen >> 1);	/* skip first byte; X is next */
+	t1[0] = n[0];	/* header word: same bit length as n */
+	br_i15_sub(t1, n, br_i15_sub(t1, n, 0) ^ 1);
+	res &= ~br_i15_sub(t1, r, 1);	/* t1 -= r; a borrow clears bit 0 */
+	res &= br_i15_iszero(t1);	/* accept only a zero difference */
+	return res;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_bits.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_bits.c
new file mode 100644
index 00000000..9a8d6730
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_bits.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+br_ecdsa_i31_bits2int(uint32_t *x,
+	const void *src, size_t len, uint32_t ebitlen)
+{
+	uint32_t nbits, srcbits;
+	int excess = 0;
+
+	/* Turn the encoded bit length into a plain bit count. */
+	nbits = ebitlen - (ebitlen >> 5);
+	srcbits = (uint32_t)len << 3;
+	if (srcbits > nbits) {
+		/* Keep only the leading bytes; extra low bits get shifted out. */
+		len = (nbits + 7) >> 3;
+		excess = (int)((srcbits - nbits) & 7);
+	}
+	br_i31_zero(x, ebitlen);
+	br_i31_decode(x, src, len);
+	br_i31_rshift(x, excess);
+	x[0] = ebitlen;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_asn1.c
new file mode 100644
index 00000000..cf0d351d
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_asn1.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define ORDER_LEN   ((BR_MAX_EC_SIZE + 7) >> 3)
+
+/* see bearssl_ec.h */
+size_t
+br_ecdsa_i31_sign_asn1(const br_ec_impl *impl,
+	const br_hash_class *hf, const void *hash_value,
+	const br_ec_private_key *sk, void *sig)
+{
+	/* Receives the raw signature, then is converted in place to ASN.1. */
+	unsigned char tmp[(ORDER_LEN << 1) + 12];
+	size_t len;
+
+	len = br_ecdsa_i31_sign_raw(impl, hf, hash_value, sk, tmp);
+	if (len != 0) {
+		len = br_ecdsa_raw_to_asn1(tmp, len);
+		memcpy(sig, tmp, len);
+	}
+	return len;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_raw.c
new file mode 100644
index 00000000..1df98fed
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_raw.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define I31_LEN     ((BR_MAX_EC_SIZE + 61) / 31)
+#define POINT_LEN   (1 + (((BR_MAX_EC_SIZE + 7) >> 3) << 1))
+#define ORDER_LEN   ((BR_MAX_EC_SIZE + 7) >> 3)
+
+/* see bearssl_ec.h -- writes r || s (nlen bytes each) into sig; 0 on error */
+size_t
+br_ecdsa_i31_sign_raw(const br_ec_impl *impl,
+	const br_hash_class *hf, const void *hash_value,
+	const br_ec_private_key *sk, void *sig)
+{
+	/*
+	 * IMPORTANT: this code is fit only for curves with a prime
+	 * order. This is needed so that modular reduction of the X
+	 * coordinate of a point can be done with a simple subtraction.
+	 * We also rely on the last byte of the curve order to be distinct
+	 * from 0 and 1.
+	 */
+	const br_ec_curve_def *cd;
+	uint32_t n[I31_LEN], r[I31_LEN], s[I31_LEN], x[I31_LEN];
+	uint32_t m[I31_LEN], k[I31_LEN], t1[I31_LEN], t2[I31_LEN];
+	unsigned char tt[ORDER_LEN << 1];	/* byte scratch: seed, k, exponent */
+	unsigned char eU[POINT_LEN];	/* encoded point k*G */
+	size_t hash_len, nlen, ulen;
+	uint32_t n0i, ctl;
+	br_hmac_drbg_context drbg;
+
+	/*
+	 * If the implementation does not support the key's curve, return 0.
+	 */
+	if (((impl->supported_curves >> sk->curve) & 1) == 0) {
+		return 0;
+	}
+
+	/*
+	 * Get the curve parameters; only the curves listed here are handled.
+	 */
+	switch (sk->curve) {
+	case BR_EC_secp256r1:
+		cd = &br_secp256r1;
+		break;
+	case BR_EC_secp384r1:
+		cd = &br_secp384r1;
+		break;
+	case BR_EC_secp521r1:
+		cd = &br_secp521r1;
+		break;
+	default:
+		return 0;
+	}
+
+	/*
+	 * Get modulus (the curve order n), and n0i for the Montgomery calls.
+	 */
+	nlen = cd->order_len;
+	br_i31_decode(n, cd->order, nlen);
+	n0i = br_i31_ninv31(n[1]);
+
+	/*
+	 * Get private key as an i31 integer. This also checks that the
+	 * private key is well-defined (not zero, and less than the
+	 * curve order).
+	 */
+	if (!br_i31_decode_mod(x, sk->x, sk->xlen, n)) {
+		return 0;
+	}
+	if (br_i31_iszero(x)) {
+		return 0;
+	}
+
+	/*
+	 * Get hash length (in bytes), from the hash function descriptor.
+	 */
+	hash_len = (hf->desc >> BR_HASHDESC_OUT_OFF) & BR_HASHDESC_OUT_MASK;
+
+	/*
+	 * Truncate and reduce the hash value modulo the curve order.
+	 */
+	br_ecdsa_i31_bits2int(m, hash_value, hash_len, n[0]);
+	br_i31_sub(m, n, br_i31_sub(m, n, 0) ^ 1);	/* m -= n iff m >= n */
+
+	/*
+	 * RFC 6979 generation of the "k" value.
+	 *
+	 * The process uses HMAC_DRBG (with the hash function used to
+	 * process the message that is to be signed). The seed is the
+	 * concatenation of the encodings of the private key and
+	 * the hash value (after truncation and modular reduction).
+	 */
+	br_i31_encode(tt, nlen, x);
+	br_i31_encode(tt + nlen, nlen, m);
+	br_hmac_drbg_init(&drbg, hf, tt, nlen << 1);
+	for (;;) {
+		br_hmac_drbg_generate(&drbg, tt, nlen);
+		br_ecdsa_i31_bits2int(k, tt, nlen, n[0]);
+		if (br_i31_iszero(k)) {
+			continue;	/* k = 0 is not usable; draw again */
+		}
+		if (br_i31_sub(k, n, 0)) {
+			break;	/* borrow reported: k < n, accept */
+		}
+	}
+
+	/*
+	 * Compute k*G and extract the X coordinate, then reduce it
+	 * modulo the curve order. Since we support only curves with
+	 * prime order, that reduction is only a matter of computing
+	 * a subtraction.
+	 */
+	br_i31_encode(tt, nlen, k);
+	ulen = impl->mulgen(eU, tt, nlen, sk->curve);
+	br_i31_zero(r, n[0]);
+	br_i31_decode(r, &eU[1], ulen >> 1);	/* skip first byte; X is next */
+	r[0] = n[0];	/* header word: same bit length as n */
+	br_i31_sub(r, n, br_i31_sub(r, n, 0) ^ 1);	/* r -= n iff r >= n */
+
+	/*
+	 * Compute 1/k in double-Montgomery representation. We do so by
+	 * first converting _from_ Montgomery representation (twice),
+	 * then using a modular exponentiation.
+	 */
+	br_i31_from_monty(k, n, n0i);
+	br_i31_from_monty(k, n, n0i);
+	memcpy(tt, cd->order, nlen);
+	tt[nlen - 1] -= 2;	/* exponent is n - 2 (no borrow, see top) */
+	br_i31_modpow(k, tt, nlen, n, n0i, t1, t2);
+
+	/*
+	 * Compute s = (m+xr)/k (mod n).
+	 * The k[] array contains R^2/k (double-Montgomery representation);
+	 * we thus can use direct Montgomery multiplications and conversions
+	 * from Montgomery, avoiding any call to br_i31_to_monty() (which
+	 * is slower).
+	 */
+	br_i31_from_monty(m, n, n0i);	/* m <- m/R */
+	br_i31_montymul(t1, x, r, n, n0i);	/* t1 <- x*r/R */
+	ctl = br_i31_add(t1, m, 1);	/* t1 <- (m + x*r)/R; carry in ctl */
+	ctl |= br_i31_sub(t1, n, 0) ^ 1;	/* ...or t1 >= n */
+	br_i31_sub(t1, n, ctl);	/* subtract n if either holds */
+	br_i31_montymul(s, t1, k, n, n0i);	/* s <- t1*(R^2/k)/R */
+
+	/*
+	 * Encode r and s in the signature, each over exactly nlen bytes.
+	 */
+	br_i31_encode(sig, nlen, r);
+	br_i31_encode((unsigned char *)sig + nlen, nlen, s);
+	return nlen << 1;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_asn1.c
new file mode 100644
index 00000000..4161aaaa
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_asn1.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define FIELD_LEN   ((BR_MAX_EC_SIZE + 7) >> 3)
+
+/* see bearssl_ec.h */
+uint32_t
+br_ecdsa_i31_vrfy_asn1(const br_ec_impl *impl,
+	const void *hash, size_t hash_len,
+	const br_ec_public_key *pk,
+	const void *sig, size_t sig_len)
+{
+	/*
+	 * Twice the input limit: malformed ASN.1 may expand once made "raw".
+	 */
+	unsigned char tmp[(FIELD_LEN << 2) + 24];
+	size_t raw_len;
+
+	if (sig_len > (sizeof tmp) / 2) {
+		return 0;
+	}
+	memcpy(tmp, sig, sig_len);
+	raw_len = br_ecdsa_asn1_to_raw(tmp, sig_len);
+	return br_ecdsa_i31_vrfy_raw(impl, hash, hash_len, pk, tmp, raw_len);
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_raw.c
new file mode 100644
index 00000000..259477fd
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_raw.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define I31_LEN     ((BR_MAX_EC_SIZE + 61) / 31)
+#define POINT_LEN   (1 + (((BR_MAX_EC_SIZE + 7) >> 3) << 1))
+
+/* see bearssl_ec.h -- sig is r || s; returns 1 if it verifies, 0 otherwise */
+uint32_t
+br_ecdsa_i31_vrfy_raw(const br_ec_impl *impl,
+	const void *hash, size_t hash_len,
+	const br_ec_public_key *pk,
+	const void *sig, size_t sig_len)
+{
+	/*
+	 * IMPORTANT: this code is fit only for curves with a prime
+	 * order. This is needed so that modular reduction of the X
+	 * coordinate of a point can be done with a simple subtraction.
+	 */
+	const br_ec_curve_def *cd;
+	uint32_t n[I31_LEN], r[I31_LEN], s[I31_LEN], t1[I31_LEN], t2[I31_LEN];
+	unsigned char tx[(BR_MAX_EC_SIZE + 7) >> 3];
+	unsigned char ty[(BR_MAX_EC_SIZE + 7) >> 3];
+	unsigned char eU[POINT_LEN];	/* public point in, result point out */
+	size_t nlen, rlen, ulen;
+	uint32_t n0i, res;
+
+	/*
+	 * If the implementation does not support the key's curve, return 0.
+	 */
+	if (((impl->supported_curves >> pk->curve) & 1) == 0) {
+		return 0;
+	}
+
+	/*
+	 * Get the curve parameters; only the curves listed here are handled.
+	 */
+	switch (pk->curve) {
+	case BR_EC_secp256r1:
+		cd = &br_secp256r1;
+		break;
+	case BR_EC_secp384r1:
+		cd = &br_secp384r1;
+		break;
+	case BR_EC_secp521r1:
+		cd = &br_secp521r1;
+		break;
+	default:
+		return 0;
+	}
+
+	/*
+	 * Signature length must be even.
+	 */
+	if (sig_len & 1) {
+		return 0;
+	}
+	rlen = sig_len >> 1;	/* r and s each take one half */
+
+	/*
+	 * Public key point must have the proper size for this curve.
+	 */
+	if (pk->qlen != cd->generator_len) {
+		return 0;
+	}
+
+	/*
+	 * Get modulus; then decode the r and s values. They must be
+	 * lower than the modulus, and s must not be null.
+	 */
+	nlen = cd->order_len;
+	br_i31_decode(n, cd->order, nlen);
+	n0i = br_i31_ninv31(n[1]);
+	if (!br_i31_decode_mod(r, sig, rlen, n)) {
+		return 0;
+	}
+	if (!br_i31_decode_mod(s, (const unsigned char *)sig + rlen, rlen, n)) {
+		return 0;
+	}
+	if (br_i31_iszero(s)) {
+		return 0;
+	}
+
+	/*
+	 * Invert s. We do that with a modular exponentiation; we use
+	 * the fact that for all the curves we support, the least
+	 * significant byte of the order is not 0 or 1, so we can
+	 * subtract 2 without any carry to process.
+	 * We also want 1/s in Montgomery representation, which can be
+	 * done by converting _from_ Montgomery representation before
+	 * the inversion (because (1/s)*R = 1/(s/R)).
+	 */
+	br_i31_from_monty(s, n, n0i);
+	memcpy(tx, cd->order, nlen);
+	tx[nlen - 1] -= 2;	/* exponent is n - 2 */
+	br_i31_modpow(s, tx, nlen, n, n0i, t1, t2);
+
+	/*
+	 * Truncate the hash to the modulus length (in bits) and reduce
+	 * it modulo the curve order. The modular reduction can be done
+	 * with a subtraction since the truncation already reduced the
+	 * value to the modulus bit length.
+	 */
+	br_ecdsa_i31_bits2int(t1, hash, hash_len, n[0]);
+	br_i31_sub(t1, n, br_i31_sub(t1, n, 0) ^ 1);	/* t1 -= n iff t1 >= n */
+
+	/*
+	 * Multiply the (truncated, reduced) hash value with 1/s, result in
+	 * t2, encoded in ty.
+	 */
+	br_i31_montymul(t2, t1, s, n, n0i);	/* t2 <- hash*(R/s)/R */
+	br_i31_encode(ty, nlen, t2);
+
+	/*
+	 * Multiply r with 1/s, result in t1, encoded in tx.
+	 */
+	br_i31_montymul(t1, r, s, n, n0i);	/* t1 <- r*(R/s)/R */
+	br_i31_encode(tx, nlen, t1);
+
+	/*
+	 * Compute the point x*Q + y*G, with x = tx (r/s) and y = ty (hash/s).
+	 */
+	ulen = cd->generator_len;
+	memcpy(eU, pk->q, ulen);
+	res = impl->muladd(eU, NULL, ulen,
+		tx, nlen, ty, nlen, cd->curve);
+
+	/*
+	 * Get the X coordinate, reduce modulo the curve order, and
+	 * compare with the 'r' value.
+	 *
+	 * The modular reduction can be done with subtractions because
+	 * we work with curves of prime order, so the curve order is
+	 * close to the field order (Hasse's theorem).
+	 */
+	br_i31_zero(t1, n[0]);
+	br_i31_decode(t1, &eU[1], ulen >> 1);	/* skip first byte; X is next */
+	t1[0] = n[0];	/* header word: same bit length as n */
+	br_i31_sub(t1, n, br_i31_sub(t1, n, 0) ^ 1);
+	res &= ~br_i31_sub(t1, r, 1);	/* t1 -= r; a borrow clears bit 0 */
+	res &= br_i31_iszero(t1);	/* accept only a zero difference */
+	return res;
+}
diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_rta.c b/test/monniaux/BearSSL/src/ec/ecdsa_rta.c
new file mode 100644
index 00000000..005c62c2
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ecdsa_rta.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Return the number of content bytes needed to encode the unsigned
+ * big-endian integer x[0..xlen-1] as a minimal ASN.1 INTEGER. Leading
+ * zero bytes are dropped; one byte is added back when nothing remains
+ * (value zero) or when the first kept byte has its top bit set, since
+ * the ASN.1 encoding is signed and the value must stay non-negative.
+ */
+static size_t
+asn1_int_length(const unsigned char *x, size_t xlen)
+{
+	size_t skip, rem;
+
+	for (skip = 0; skip < xlen && x[skip] == 0; skip ++) {
+		/* leading zero bytes carry no value */
+	}
+	rem = xlen - skip;
+	/* One more byte for a zero value or a set top bit. */
+	return (rem == 0 || x[skip] >= 0x80) ? rem + 1 : rem;
+}
+
+/* see bearssl_ec.h */
+size_t
+br_ecdsa_raw_to_asn1(void *sig, size_t sig_len)
+{
+	/*
+	 * Output is built in a local buffer, then copied over the input.
+	 * The buffer holds 3 header bytes plus two INTEGERs of 2 + 125
+	 * bytes: r and s of at most 125 bytes each in signed encoding
+	 * (curve order of up to 999 bits), which keeps lengths "simple".
+	 */
+	unsigned char tmp[257];
+	unsigned char *out;
+	const unsigned char *raw;
+	size_t half, ilen[2], total, out_len, i;
+
+	/*
+	 * A raw signature is r followed by s, both of the same length,
+	 * so an odd total length is rejected.
+	 */
+	if ((sig_len & 1) != 0) {
+		return 0;
+	}
+	raw = sig;
+	half = sig_len >> 1;
+
+	/*
+	 * Minimal signed length of each integer; index 0 is r, index 1
+	 * is s. Anything above 125 bytes would not fit the buffer.
+	 */
+	ilen[0] = asn1_int_length(raw, half);
+	ilen[1] = asn1_int_length(raw + half, half);
+	if (ilen[0] > 125 || ilen[1] > 125) {
+		return 0;
+	}
+
+	/*
+	 * SEQUENCE header: tag 0x30, then the content length, which is
+	 * both integers plus their two-byte (tag, length) headers. A
+	 * length of 0x80 or more needs the 0x81 prefix byte.
+	 */
+	out = tmp;
+	total = ilen[0] + ilen[1] + 4;
+	*out ++ = 0x30;
+	if (total >= 0x80) {
+		*out ++ = 0x81;
+	}
+	*out ++ = total;
+
+	/*
+	 * The two INTEGER elements, r then s.
+	 */
+	for (i = 0; i < 2; i ++) {
+		const unsigned char *src = raw + i * half;
+		size_t n = ilen[i];
+
+		*out ++ = 0x02;
+		*out ++ = n;
+		if (n > half) {
+			/* Sign byte needed in front of all raw bytes. */
+			*out = 0x00;
+			memcpy(out + 1, src, half);
+		} else {
+			/* Keep only the n trailing bytes. */
+			memcpy(out, src + half - n, n);
+		}
+		out += n;
+	}
+
+	/*
+	 * Overwrite the raw signature with its ASN.1 form.
+	 */
+	out_len = (size_t)(out - tmp);
+	memcpy(sig, tmp, out_len);
+	return out_len;
+}