diff options
Diffstat (limited to 'test/monniaux/BearSSL/src/ec')
37 files changed, 15319 insertions, 0 deletions
diff --git a/test/monniaux/BearSSL/src/ec/ec_all_m15.c b/test/monniaux/BearSSL/src/ec/ec_all_m15.c new file mode 100644 index 00000000..bb550e18 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_all_m15.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + switch (curve) { + case BR_EC_secp256r1: + return br_ec_p256_m15.generator(curve, len); + case BR_EC_curve25519: + return br_ec_c25519_m15.generator(curve, len); + default: + return br_ec_prime_i15.generator(curve, len); + } +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + switch (curve) { + case BR_EC_secp256r1: + return br_ec_p256_m15.order(curve, len); + case BR_EC_curve25519: + return br_ec_c25519_m15.order(curve, len); + default: + return br_ec_prime_i15.order(curve, len); + } +} + +static size_t +api_xoff(int curve, size_t *len) +{ + switch (curve) { + case BR_EC_secp256r1: + return br_ec_p256_m15.xoff(curve, len); + case BR_EC_curve25519: + return br_ec_c25519_m15.xoff(curve, len); + default: + return br_ec_prime_i15.xoff(curve, len); + } +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *kb, size_t kblen, int curve) +{ + switch (curve) { + case BR_EC_secp256r1: + return br_ec_p256_m15.mul(G, Glen, kb, kblen, curve); + case BR_EC_curve25519: + return br_ec_c25519_m15.mul(G, Glen, kb, kblen, curve); + default: + return br_ec_prime_i15.mul(G, Glen, kb, kblen, curve); + } +} + +static size_t +api_mulgen(unsigned char *R, + const unsigned char *x, size_t xlen, int curve) +{ + switch (curve) { + case BR_EC_secp256r1: + return br_ec_p256_m15.mulgen(R, x, xlen, curve); + case BR_EC_curve25519: + return br_ec_c25519_m15.mulgen(R, x, xlen, curve); + default: + return br_ec_prime_i15.mulgen(R, x, xlen, curve); + } +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + switch (curve) { + case BR_EC_secp256r1: + return br_ec_p256_m15.muladd(A, B, len, + x, xlen, y, ylen, curve); + case BR_EC_curve25519: + return br_ec_c25519_m15.muladd(A, B, len, + x, xlen, y, ylen, curve); + default: 
+ return br_ec_prime_i15.muladd(A, B, len, + x, xlen, y, ylen, curve); + } +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_all_m15 = { + (uint32_t)0x23800000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; diff --git a/test/monniaux/BearSSL/src/ec/ec_all_m31.c b/test/monniaux/BearSSL/src/ec/ec_all_m31.c new file mode 100644 index 00000000..8fd8c3c0 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_all_m31.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + switch (curve) { + case BR_EC_secp256r1: +#if BR_INT128 || BR_UMUL128 + return br_ec_p256_m64.generator(curve, len); +#else + return br_ec_p256_m31.generator(curve, len); +#endif + case BR_EC_curve25519: +#if BR_INT128 || BR_UMUL128 + return br_ec_c25519_m64.generator(curve, len); +#else + return br_ec_c25519_m31.generator(curve, len); +#endif + default: + return br_ec_prime_i31.generator(curve, len); + } +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + switch (curve) { + case BR_EC_secp256r1: +#if BR_INT128 || BR_UMUL128 + return br_ec_p256_m64.order(curve, len); +#else + return br_ec_p256_m31.order(curve, len); +#endif + case BR_EC_curve25519: +#if BR_INT128 || BR_UMUL128 + return br_ec_c25519_m64.order(curve, len); +#else + return br_ec_c25519_m31.order(curve, len); +#endif + default: + return br_ec_prime_i31.order(curve, len); + } +} + +static size_t +api_xoff(int curve, size_t *len) +{ + switch (curve) { + case BR_EC_secp256r1: +#if BR_INT128 || BR_UMUL128 + return br_ec_p256_m64.xoff(curve, len); +#else + return br_ec_p256_m31.xoff(curve, len); +#endif + case BR_EC_curve25519: +#if BR_INT128 || BR_UMUL128 + return br_ec_c25519_m64.xoff(curve, len); +#else + return br_ec_c25519_m31.xoff(curve, len); +#endif + default: + return br_ec_prime_i31.xoff(curve, len); + } +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *kb, size_t kblen, int curve) +{ + switch (curve) { + case BR_EC_secp256r1: +#if BR_INT128 || BR_UMUL128 + return br_ec_p256_m64.mul(G, Glen, kb, kblen, curve); +#else + return br_ec_p256_m31.mul(G, Glen, kb, kblen, curve); +#endif + case BR_EC_curve25519: +#if BR_INT128 || BR_UMUL128 + return br_ec_c25519_m64.mul(G, Glen, kb, kblen, curve); +#else + return br_ec_c25519_m31.mul(G, Glen, kb, kblen, curve); +#endif + default: + return br_ec_prime_i31.mul(G, Glen, kb, kblen, curve); + } +} + 
+static size_t +api_mulgen(unsigned char *R, + const unsigned char *x, size_t xlen, int curve) +{ + switch (curve) { + case BR_EC_secp256r1: +#if BR_INT128 || BR_UMUL128 + return br_ec_p256_m64.mulgen(R, x, xlen, curve); +#else + return br_ec_p256_m31.mulgen(R, x, xlen, curve); +#endif + case BR_EC_curve25519: +#if BR_INT128 || BR_UMUL128 + return br_ec_c25519_m64.mulgen(R, x, xlen, curve); +#else + return br_ec_c25519_m31.mulgen(R, x, xlen, curve); +#endif + default: + return br_ec_prime_i31.mulgen(R, x, xlen, curve); + } +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + switch (curve) { + case BR_EC_secp256r1: +#if BR_INT128 || BR_UMUL128 + return br_ec_p256_m64.muladd(A, B, len, + x, xlen, y, ylen, curve); +#else + return br_ec_p256_m31.muladd(A, B, len, + x, xlen, y, ylen, curve); +#endif + case BR_EC_curve25519: +#if BR_INT128 || BR_UMUL128 + return br_ec_c25519_m64.muladd(A, B, len, + x, xlen, y, ylen, curve); +#else + return br_ec_c25519_m31.muladd(A, B, len, + x, xlen, y, ylen, curve); +#endif + default: + return br_ec_prime_i31.muladd(A, B, len, + x, xlen, y, ylen, curve); + } +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_all_m31 = { + (uint32_t)0x23800000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_i15.c b/test/monniaux/BearSSL/src/ec/ec_c25519_i15.c new file mode 100644 index 00000000..8fadcf48 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_c25519_i15.c @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + 
* distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * Parameters for the field: + * - field modulus p = 2^255-19 + * - R^2 mod p (R = 2^(15k) for the smallest k such that R >= p) + */ + +static const uint16_t C255_P[] = { + 0x0110, + 0x7FED, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, + 0x7FFF +}; + +#define P0I 0x4A1B + +static const uint16_t C255_R2[] = { + 0x0110, + 0x0169, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000 +}; + +/* obsolete +#include <stdio.h> +#include <stdlib.h> +static void +print_int_mont(const char *name, const uint16_t *x) +{ + uint16_t y[18]; + unsigned char tmp[32]; + size_t u; + + printf("%s = ", name); + memcpy(y, x, sizeof y); + br_i15_from_monty(y, C255_P, P0I); + br_i15_encode(tmp, sizeof tmp, y); + for (u = 0; u < sizeof tmp; u ++) { + printf("%02X", tmp[u]); + } + printf("\n"); +} +*/ + +static const uint16_t C255_A24[] = { + 0x0110, + 0x45D3, 0x0046, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000 +}; + +static const 
unsigned char GEN[] = { + 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static const unsigned char ORDER[] = { + 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF +}; + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return GEN; +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return ORDER; +} + +static size_t +api_xoff(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return 0; +} + +static void +cswap(uint16_t *a, uint16_t *b, uint32_t ctl) +{ + int i; + + ctl = -ctl; + for (i = 0; i < 18; i ++) { + uint32_t aw, bw, tw; + + aw = a[i]; + bw = b[i]; + tw = ctl & (aw ^ bw); + a[i] = aw ^ tw; + b[i] = bw ^ tw; + } +} + +static void +c255_add(uint16_t *d, const uint16_t *a, const uint16_t *b) +{ + uint32_t ctl; + uint16_t t[18]; + + memcpy(t, a, sizeof t); + ctl = br_i15_add(t, b, 1); + ctl |= NOT(br_i15_sub(t, C255_P, 0)); + br_i15_sub(t, C255_P, ctl); + memcpy(d, t, sizeof t); +} + +static void +c255_sub(uint16_t *d, const uint16_t *a, const uint16_t *b) +{ + uint16_t t[18]; + + memcpy(t, a, sizeof t); + br_i15_add(t, C255_P, br_i15_sub(t, b, 1)); + memcpy(d, t, sizeof t); +} + +static void +c255_mul(uint16_t *d, const uint16_t *a, const uint16_t *b) +{ + uint16_t t[18]; + + br_i15_montymul(t, a, b, C255_P, P0I); + memcpy(d, t, sizeof t); +} + +static void +byteswap(unsigned char *G) +{ + int i; + + for (i = 0; i < 16; i ++) { + unsigned char t; + + t = G[i]; + G[i] = G[31 - i]; + G[31 - i] = t; + } +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *kb, size_t kblen, int curve) +{ +#define ILEN (18 * sizeof(uint16_t)) + 
+ /* + * The a[] and b[] arrays have an extra word to allow for + * decoding without using br_i15_decode_reduce(). + */ + uint16_t x1[18], x2[18], x3[18], z2[18], z3[18]; + uint16_t a[19], aa[18], b[19], bb[18]; + uint16_t c[18], d[18], e[18], da[18], cb[18]; + unsigned char k[32]; + uint32_t swap; + int i; + + (void)curve; + + /* + * Points are encoded over exactly 32 bytes. Multipliers must fit + * in 32 bytes as well. + * RFC 7748 mandates that the high bit of the last point byte must + * be ignored/cleared. + */ + if (Glen != 32 || kblen > 32) { + return 0; + } + G[31] &= 0x7F; + + /* + * Byteswap the point encoding, because it uses little-endian, and + * the generic decoding routine uses big-endian. + */ + byteswap(G); + + /* + * Decode the point ('u' coordinate). This should be reduced + * modulo p, but we prefer to avoid the dependency on + * br_i15_decode_reduce(). Instead, we use br_i15_decode_mod() + * with a synthetic modulus of value 2^255 (this must work + * since G was truncated to 255 bits), then use a conditional + * subtraction. We use br_i15_decode_mod() and not + * br_i15_decode(), because the ec_prime_i15 implementation uses + * the former but not the latter. + * br_i15_decode_reduce(a, G, 32, C255_P); + */ + br_i15_zero(b, 0x111); + b[18] = 1; + br_i15_decode_mod(a, G, 32, b); + a[0] = 0x110; + br_i15_sub(a, C255_P, NOT(br_i15_sub(a, C255_P, 0))); + + /* + * Initialise variables x1, x2, z2, x3 and z3. We set all of them + * into Montgomery representation. 
+ */ + br_i15_montymul(x1, a, C255_R2, C255_P, P0I); + memcpy(x3, x1, ILEN); + br_i15_zero(z2, C255_P[0]); + memcpy(x2, z2, ILEN); + x2[1] = 19; + memcpy(z3, x2, ILEN); + + memset(k, 0, (sizeof k) - kblen); + memcpy(k + (sizeof k) - kblen, kb, kblen); + k[31] &= 0xF8; + k[0] &= 0x7F; + k[0] |= 0x40; + + /* obsolete + print_int_mont("x1", x1); + */ + + swap = 0; + for (i = 254; i >= 0; i --) { + uint32_t kt; + + kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; + swap ^= kt; + cswap(x2, x3, swap); + cswap(z2, z3, swap); + swap = kt; + + /* obsolete + print_int_mont("x2", x2); + print_int_mont("z2", z2); + print_int_mont("x3", x3); + print_int_mont("z3", z3); + */ + + c255_add(a, x2, z2); + c255_mul(aa, a, a); + c255_sub(b, x2, z2); + c255_mul(bb, b, b); + c255_sub(e, aa, bb); + c255_add(c, x3, z3); + c255_sub(d, x3, z3); + c255_mul(da, d, a); + c255_mul(cb, c, b); + + /* obsolete + print_int_mont("a ", a); + print_int_mont("aa", aa); + print_int_mont("b ", b); + print_int_mont("bb", bb); + print_int_mont("e ", e); + print_int_mont("c ", c); + print_int_mont("d ", d); + print_int_mont("da", da); + print_int_mont("cb", cb); + */ + + c255_add(x3, da, cb); + c255_mul(x3, x3, x3); + c255_sub(z3, da, cb); + c255_mul(z3, z3, z3); + c255_mul(z3, z3, x1); + c255_mul(x2, aa, bb); + c255_mul(z2, C255_A24, e); + c255_add(z2, z2, aa); + c255_mul(z2, e, z2); + + /* obsolete + print_int_mont("x2", x2); + print_int_mont("z2", z2); + print_int_mont("x3", x3); + print_int_mont("z3", z3); + */ + } + cswap(x2, x3, swap); + cswap(z2, z3, swap); + + /* + * Inverse z2 with a modular exponentiation. This is a simple + * square-and-multiply algorithm; we mutualise most non-squarings + * since the exponent contains almost only ones. 
+ */ + memcpy(a, z2, ILEN); + for (i = 0; i < 15; i ++) { + c255_mul(a, a, a); + c255_mul(a, a, z2); + } + memcpy(b, a, ILEN); + for (i = 0; i < 14; i ++) { + int j; + + for (j = 0; j < 16; j ++) { + c255_mul(b, b, b); + } + c255_mul(b, b, a); + } + for (i = 14; i >= 0; i --) { + c255_mul(b, b, b); + if ((0xFFEB >> i) & 1) { + c255_mul(b, z2, b); + } + } + c255_mul(b, x2, b); + + /* + * To avoid a dependency on br_i15_from_monty(), we use a + * Montgomery multiplication with 1. + * memcpy(x2, b, ILEN); + * br_i15_from_monty(x2, C255_P, P0I); + */ + br_i15_zero(a, C255_P[0]); + a[1] = 1; + br_i15_montymul(x2, a, b, C255_P, P0I); + + br_i15_encode(G, 32, x2); + byteswap(G); + return 1; + +#undef ILEN +} + +static size_t +api_mulgen(unsigned char *R, + const unsigned char *x, size_t xlen, int curve) +{ + const unsigned char *G; + size_t Glen; + + G = api_generator(curve, &Glen); + memcpy(R, G, Glen); + api_mul(R, Glen, x, xlen, curve); + return Glen; +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + /* + * We don't implement this method, since it is used for ECDSA + * only, and there is no ECDSA over Curve25519 (which instead + * uses EdDSA). 
+ */ + (void)A; + (void)B; + (void)len; + (void)x; + (void)xlen; + (void)y; + (void)ylen; + (void)curve; + return 0; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_c25519_i15 = { + (uint32_t)0x20000000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_i31.c b/test/monniaux/BearSSL/src/ec/ec_c25519_i31.c new file mode 100644 index 00000000..f8ffc2c2 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_c25519_i31.c @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +/* + * Parameters for the field: + * - field modulus p = 2^255-19 + * - R^2 mod p (R = 2^(31k) for the smallest k such that R >= p) + */ + +static const uint32_t C255_P[] = { + 0x00000107, + 0x7FFFFFED, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, + 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0000007F +}; + +#define P0I 0x286BCA1B + +static const uint32_t C255_R2[] = { + 0x00000107, + 0x00000000, 0x02D20000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 +}; + +static const uint32_t C255_A24[] = { + 0x00000107, + 0x53000000, 0x0000468B, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 +}; + +/* obsolete +#include <stdio.h> +#include <stdlib.h> +static void +print_int_mont(const char *name, const uint32_t *x) +{ + uint32_t y[10]; + unsigned char tmp[32]; + size_t u; + + printf("%s = ", name); + memcpy(y, x, sizeof y); + br_i31_from_monty(y, C255_P, P0I); + br_i31_encode(tmp, sizeof tmp, y); + for (u = 0; u < sizeof tmp; u ++) { + printf("%02X", tmp[u]); + } + printf("\n"); +} +*/ + +static const unsigned char GEN[] = { + 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static const unsigned char ORDER[] = { + 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF +}; + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return GEN; +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return ORDER; +} + +static size_t +api_xoff(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return 0; +} + +static void +cswap(uint32_t *a, uint32_t *b, uint32_t ctl) +{ + int i; + + ctl = 
-ctl; + for (i = 0; i < 10; i ++) { + uint32_t aw, bw, tw; + + aw = a[i]; + bw = b[i]; + tw = ctl & (aw ^ bw); + a[i] = aw ^ tw; + b[i] = bw ^ tw; + } +} + +static void +c255_add(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + uint32_t ctl; + uint32_t t[10]; + + memcpy(t, a, sizeof t); + ctl = br_i31_add(t, b, 1); + ctl |= NOT(br_i31_sub(t, C255_P, 0)); + br_i31_sub(t, C255_P, ctl); + memcpy(d, t, sizeof t); +} + +static void +c255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + uint32_t t[10]; + + memcpy(t, a, sizeof t); + br_i31_add(t, C255_P, br_i31_sub(t, b, 1)); + memcpy(d, t, sizeof t); +} + +static void +c255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + uint32_t t[10]; + + br_i31_montymul(t, a, b, C255_P, P0I); + memcpy(d, t, sizeof t); +} + +static void +byteswap(unsigned char *G) +{ + int i; + + for (i = 0; i < 16; i ++) { + unsigned char t; + + t = G[i]; + G[i] = G[31 - i]; + G[31 - i] = t; + } +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *kb, size_t kblen, int curve) +{ + uint32_t x1[10], x2[10], x3[10], z2[10], z3[10]; + uint32_t a[10], aa[10], b[10], bb[10]; + uint32_t c[10], d[10], e[10], da[10], cb[10]; + unsigned char k[32]; + uint32_t swap; + int i; + + (void)curve; + + /* + * Points are encoded over exactly 32 bytes. Multipliers must fit + * in 32 bytes as well. + * RFC 7748 mandates that the high bit of the last point byte must + * be ignored/cleared. + */ + if (Glen != 32 || kblen > 32) { + return 0; + } + G[31] &= 0x7F; + + /* + * Byteswap the point encoding, because it uses little-endian, and + * the generic decoding routine uses big-endian. + */ + byteswap(G); + + /* + * Decode the point ('u' coordinate). This should be reduced + * modulo p, but we prefer to avoid the dependency on + * br_i31_decode_reduce(). 
Instead, we use br_i31_decode_mod() + * with a synthetic modulus of value 2^255 (this must work + * since G was truncated to 255 bits), then use a conditional + * subtraction. We use br_i31_decode_mod() and not + * br_i31_decode(), because the ec_prime_i31 implementation uses + * the former but not the latter. + * br_i31_decode_reduce(a, G, 32, C255_P); + */ + br_i31_zero(b, 0x108); + b[9] = 0x0080; + br_i31_decode_mod(a, G, 32, b); + a[0] = 0x107; + br_i31_sub(a, C255_P, NOT(br_i31_sub(a, C255_P, 0))); + + /* + * Initialise variables x1, x2, z2, x3 and z3. We set all of them + * into Montgomery representation. + */ + br_i31_montymul(x1, a, C255_R2, C255_P, P0I); + memcpy(x3, x1, sizeof x1); + br_i31_zero(z2, C255_P[0]); + memcpy(x2, z2, sizeof z2); + x2[1] = 0x13000000; + memcpy(z3, x2, sizeof x2); + + /* + * kb[] is in big-endian notation, but possibly shorter than k[]. + */ + memset(k, 0, (sizeof k) - kblen); + memcpy(k + (sizeof k) - kblen, kb, kblen); + k[31] &= 0xF8; + k[0] &= 0x7F; + k[0] |= 0x40; + + /* obsolete + print_int_mont("x1", x1); + */ + + swap = 0; + for (i = 254; i >= 0; i --) { + uint32_t kt; + + kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; + swap ^= kt; + cswap(x2, x3, swap); + cswap(z2, z3, swap); + swap = kt; + + /* obsolete + print_int_mont("x2", x2); + print_int_mont("z2", z2); + print_int_mont("x3", x3); + print_int_mont("z3", z3); + */ + + c255_add(a, x2, z2); + c255_mul(aa, a, a); + c255_sub(b, x2, z2); + c255_mul(bb, b, b); + c255_sub(e, aa, bb); + c255_add(c, x3, z3); + c255_sub(d, x3, z3); + c255_mul(da, d, a); + c255_mul(cb, c, b); + + /* obsolete + print_int_mont("a ", a); + print_int_mont("aa", aa); + print_int_mont("b ", b); + print_int_mont("bb", bb); + print_int_mont("e ", e); + print_int_mont("c ", c); + print_int_mont("d ", d); + print_int_mont("da", da); + print_int_mont("cb", cb); + */ + + c255_add(x3, da, cb); + c255_mul(x3, x3, x3); + c255_sub(z3, da, cb); + c255_mul(z3, z3, z3); + c255_mul(z3, z3, x1); + c255_mul(x2, aa, bb); 
+ c255_mul(z2, C255_A24, e); + c255_add(z2, z2, aa); + c255_mul(z2, e, z2); + + /* obsolete + print_int_mont("x2", x2); + print_int_mont("z2", z2); + print_int_mont("x3", x3); + print_int_mont("z3", z3); + */ + } + cswap(x2, x3, swap); + cswap(z2, z3, swap); + + /* + * Inverse z2 with a modular exponentiation. This is a simple + * square-and-multiply algorithm; we mutualise most non-squarings + * since the exponent contains almost only ones. + */ + memcpy(a, z2, sizeof z2); + for (i = 0; i < 15; i ++) { + c255_mul(a, a, a); + c255_mul(a, a, z2); + } + memcpy(b, a, sizeof a); + for (i = 0; i < 14; i ++) { + int j; + + for (j = 0; j < 16; j ++) { + c255_mul(b, b, b); + } + c255_mul(b, b, a); + } + for (i = 14; i >= 0; i --) { + c255_mul(b, b, b); + if ((0xFFEB >> i) & 1) { + c255_mul(b, z2, b); + } + } + c255_mul(b, x2, b); + + /* + * To avoid a dependency on br_i31_from_monty(), we use + * a Montgomery multiplication with 1. + * memcpy(x2, b, sizeof b); + * br_i31_from_monty(x2, C255_P, P0I); + */ + br_i31_zero(a, C255_P[0]); + a[1] = 1; + br_i31_montymul(x2, a, b, C255_P, P0I); + + br_i31_encode(G, 32, x2); + byteswap(G); + return 1; +} + +static size_t +api_mulgen(unsigned char *R, + const unsigned char *x, size_t xlen, int curve) +{ + const unsigned char *G; + size_t Glen; + + G = api_generator(curve, &Glen); + memcpy(R, G, Glen); + api_mul(R, Glen, x, xlen, curve); + return Glen; +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + /* + * We don't implement this method, since it is used for ECDSA + * only, and there is no ECDSA over Curve25519 (which instead + * uses EdDSA). 
+ */ + (void)A; + (void)B; + (void)len; + (void)x; + (void)xlen; + (void)y; + (void)ylen; + (void)curve; + return 0; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_c25519_i31 = { + (uint32_t)0x20000000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_m15.c b/test/monniaux/BearSSL/src/ec/ec_c25519_m15.c new file mode 100644 index 00000000..deff55b3 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_c25519_m15.c @@ -0,0 +1,1478 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +/* obsolete +#include <stdio.h> +#include <stdlib.h> +static void +print_int(const char *name, const uint32_t *x) +{ + size_t u; + unsigned char tmp[36]; + + printf("%s = ", name); + for (u = 0; u < 20; u ++) { + if (x[u] > 0x1FFF) { + printf("INVALID:"); + for (u = 0; u < 20; u ++) { + printf(" %04X", x[u]); + } + printf("\n"); + return; + } + } + memset(tmp, 0, sizeof tmp); + for (u = 0; u < 20; u ++) { + uint32_t w; + int j, k; + + w = x[u]; + j = 13 * (int)u; + k = j & 7; + if (k != 0) { + w <<= k; + j -= k; + } + k = j >> 3; + tmp[35 - k] |= (unsigned char)w; + tmp[34 - k] |= (unsigned char)(w >> 8); + tmp[33 - k] |= (unsigned char)(w >> 16); + tmp[32 - k] |= (unsigned char)(w >> 24); + } + for (u = 4; u < 36; u ++) { + printf("%02X", tmp[u]); + } + printf("\n"); +} +*/ + +/* + * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_ + * that right-shifting a signed negative integer copies the sign bit + * (arithmetic right-shift). This is "implementation-defined behaviour", + * i.e. it is not undefined, but it may differ between compilers. Each + * compiler is supposed to document its behaviour in that respect. GCC + * explicitly defines that an arithmetic right shift is used. We expect + * all other compilers to do the same, because underlying CPU offer an + * arithmetic right shift opcode that could not be used otherwise. + */ +#if BR_NO_ARITH_SHIFT +#define ARSH(x, n) (((uint32_t)(x) >> (n)) \ + | ((-((uint32_t)(x) >> 31)) << (32 - (n)))) +#else +#define ARSH(x, n) ((*(int32_t *)&(x)) >> (n)) +#endif + +/* + * Convert an integer from unsigned little-endian encoding to a sequence of + * 13-bit words in little-endian order. The final "partial" word is + * returned. 
+ */ +static uint32_t +le8_to_le13(uint32_t *dst, const unsigned char *src, size_t len) +{ + uint32_t acc; + int acc_len; + + acc = 0; + acc_len = 0; + while (len -- > 0) { + acc |= (uint32_t)(*src ++) << acc_len; + acc_len += 8; + if (acc_len >= 13) { + *dst ++ = acc & 0x1FFF; + acc >>= 13; + acc_len -= 13; + } + } + return acc; +} + +/* + * Convert an integer (13-bit words, little-endian) to unsigned + * little-endian encoding. The total encoding length is provided; all + * the destination bytes will be filled. + */ +static void +le13_to_le8(unsigned char *dst, size_t len, const uint32_t *src) +{ + uint32_t acc; + int acc_len; + + acc = 0; + acc_len = 0; + while (len -- > 0) { + if (acc_len < 8) { + acc |= (*src ++) << acc_len; + acc_len += 13; + } + *dst ++ = (unsigned char)acc; + acc >>= 8; + acc_len -= 8; + } +} + +/* + * Normalise an array of words to a strict 13 bits per word. Returned + * value is the resulting carry. The source (w) and destination (d) + * arrays may be identical, but shall not overlap partially. + */ +static inline uint32_t +norm13(uint32_t *d, const uint32_t *w, size_t len) +{ + size_t u; + uint32_t cc; + + cc = 0; + for (u = 0; u < len; u ++) { + int32_t z; + + z = w[u] + cc; + d[u] = z & 0x1FFF; + cc = ARSH(z, 13); + } + return cc; +} + +/* + * mul20() multiplies two 260-bit integers together. Each word must fit + * on 13 bits; source operands use 20 words, destination operand + * receives 40 words. All overlaps allowed. + * + * square20() computes the square of a 260-bit integer. Each word must + * fit on 13 bits; source operand uses 20 words, destination operand + * receives 40 words. All overlaps allowed. + */ + +#if BR_SLOW_MUL15 + +static void +mul20(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + /* + * Two-level Karatsuba: turns a 20x20 multiplication into + * nine 5x5 multiplications. 
We use 13-bit words but do not + * propagate carries immediately, so words may expand: + * + * - First Karatsuba decomposition turns the 20x20 mul on + * 13-bit words into three 10x10 muls, two on 13-bit words + * and one on 14-bit words. + * + * - Second Karatsuba decomposition further splits these into: + * + * * four 5x5 muls on 13-bit words + * * four 5x5 muls on 14-bit words + * * one 5x5 mul on 15-bit words + * + * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit + * or 15-bit words, respectively. + */ + uint32_t u[45], v[45], w[90]; + uint32_t cc; + int i; + +#define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off) do { \ + (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \ + + (s2w)[5 * (s2_off) + 0]; \ + (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \ + + (s2w)[5 * (s2_off) + 1]; \ + (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \ + + (s2w)[5 * (s2_off) + 2]; \ + (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \ + + (s2w)[5 * (s2_off) + 3]; \ + (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \ + + (s2w)[5 * (s2_off) + 4]; \ + } while (0) + +#define ZADDT(dw, d_off, sw, s_off) do { \ + (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \ + (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \ + (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \ + (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \ + (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \ + } while (0) + +#define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off) do { \ + (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \ + + (s2w)[5 * (s2_off) + 0]; \ + (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \ + + (s2w)[5 * (s2_off) + 1]; \ + (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \ + + (s2w)[5 * (s2_off) + 2]; \ + (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \ + + (s2w)[5 * (s2_off) + 3]; \ + (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \ + + (s2w)[5 * (s2_off) + 4]; \ + } while (0) + +#define CPR1(w, cprcc) do { \ + uint32_t cprz = (w) + cprcc; \ + (w) = cprz & 0x1FFF; \ + cprcc = cprz >> 
13; \ + } while (0) + +#define CPR(dw, d_off) do { \ + uint32_t cprcc; \ + cprcc = 0; \ + CPR1((dw)[(d_off) + 0], cprcc); \ + CPR1((dw)[(d_off) + 1], cprcc); \ + CPR1((dw)[(d_off) + 2], cprcc); \ + CPR1((dw)[(d_off) + 3], cprcc); \ + CPR1((dw)[(d_off) + 4], cprcc); \ + CPR1((dw)[(d_off) + 5], cprcc); \ + CPR1((dw)[(d_off) + 6], cprcc); \ + CPR1((dw)[(d_off) + 7], cprcc); \ + CPR1((dw)[(d_off) + 8], cprcc); \ + (dw)[(d_off) + 9] = cprcc; \ + } while (0) + + /* + * Operand layout: slots 0..3 of u[] (5 words each) are the four + * quarters a0..a3 of a; slots 4..7 receive the sums a0+a1, a2+a3, + * a0+a2 and a1+a3, and slot 8 receives the sum of slots 6 and 7. + * v[] is built from b in exactly the same way. + */ + memcpy(u, a, 20 * sizeof *a); + ZADD(u, 4, a, 0, a, 1); + ZADD(u, 5, a, 2, a, 3); + ZADD(u, 6, a, 0, a, 2); + ZADD(u, 7, a, 1, a, 3); + ZADD(u, 8, u, 6, u, 7); + + memcpy(v, b, 20 * sizeof *b); + ZADD(v, 4, b, 0, b, 1); + ZADD(v, 5, b, 2, b, 3); + ZADD(v, 6, b, 0, b, 2); + ZADD(v, 7, b, 1, b, 3); + ZADD(v, 8, v, 6, v, 7); + + /* + * Do the first eight 5x5 muls (slots 0 to 7). Source words are at + * most 16382 each, so we can add product results together "as is" + * in 32-bit words. Each product fills nine words of w[]; the tenth + * word of each 10-word output slot is set to zero. + */ + for (i = 0; i < 40; i += 5) { + w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]); + w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1]) + + MUL15(u[i + 1], v[i + 0]); + w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2]) + + MUL15(u[i + 1], v[i + 1]) + + MUL15(u[i + 2], v[i + 0]); + w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3]) + + MUL15(u[i + 1], v[i + 2]) + + MUL15(u[i + 2], v[i + 1]) + + MUL15(u[i + 3], v[i + 0]); + w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4]) + + MUL15(u[i + 1], v[i + 3]) + + MUL15(u[i + 2], v[i + 2]) + + MUL15(u[i + 3], v[i + 1]) + + MUL15(u[i + 4], v[i + 0]); + w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4]) + + MUL15(u[i + 2], v[i + 3]) + + MUL15(u[i + 3], v[i + 2]) + + MUL15(u[i + 4], v[i + 1]); + w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4]) + + MUL15(u[i + 3], v[i + 3]) + + MUL15(u[i + 4], v[i + 2]); + w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4]) + + MUL15(u[i + 4], v[i + 3]); + w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]); + w[(i << 1) + 9] = 0; + } + + /* + * For the 9th multiplication, source words are up to 32764, + * so we must do some carry
propagation. If we add up to + * 4 products and the carry is no more than 524224, then the + * result fits in 32 bits, and the next carry will be no more + * than 524224 (because 4*(32764^2)+524224 < 8192*524225). + * + * We thus just skip one of the products in the middle word, + * then do a carry propagation (this reduces words to 13 bits + * each, except possibly the last, which may use up to 17 bits + * or so), then add the missing product. + */ + w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]); + w[80 + 1] = MUL15(u[40 + 0], v[40 + 1]) + + MUL15(u[40 + 1], v[40 + 0]); + w[80 + 2] = MUL15(u[40 + 0], v[40 + 2]) + + MUL15(u[40 + 1], v[40 + 1]) + + MUL15(u[40 + 2], v[40 + 0]); + w[80 + 3] = MUL15(u[40 + 0], v[40 + 3]) + + MUL15(u[40 + 1], v[40 + 2]) + + MUL15(u[40 + 2], v[40 + 1]) + + MUL15(u[40 + 3], v[40 + 0]); + w[80 + 4] = MUL15(u[40 + 0], v[40 + 4]) + + MUL15(u[40 + 1], v[40 + 3]) + + MUL15(u[40 + 2], v[40 + 2]) + + MUL15(u[40 + 3], v[40 + 1]); + /* + MUL15(u[40 + 4], v[40 + 0]) */ + w[80 + 5] = MUL15(u[40 + 1], v[40 + 4]) + + MUL15(u[40 + 2], v[40 + 3]) + + MUL15(u[40 + 3], v[40 + 2]) + + MUL15(u[40 + 4], v[40 + 1]); + w[80 + 6] = MUL15(u[40 + 2], v[40 + 4]) + + MUL15(u[40 + 3], v[40 + 3]) + + MUL15(u[40 + 4], v[40 + 2]); + w[80 + 7] = MUL15(u[40 + 3], v[40 + 4]) + + MUL15(u[40 + 4], v[40 + 3]); + w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]); + + CPR(w, 80); + + w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]); + + /* + * The products on 14-bit words in slots 6 and 7 yield values + * up to 5*(16382^2) each, and we need to subtract two such + * values from the higher word. We need the subtraction to fit + * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit. + * However, 10*(16382^2) does not fit. So we must perform a + * bit of reduction here. + */ + CPR(w, 60); + CPR(w, 70); + + /* + * Recompose results. 
+ */ + + /* 0..1*0..1 into 0..3 */ + ZSUB2F(w, 8, w, 0, w, 2); + ZSUB2F(w, 9, w, 1, w, 3); + ZADDT(w, 1, w, 8); + ZADDT(w, 2, w, 9); + + /* 2..3*2..3 into 4..7 */ + ZSUB2F(w, 10, w, 4, w, 6); + ZSUB2F(w, 11, w, 5, w, 7); + ZADDT(w, 5, w, 10); + ZADDT(w, 6, w, 11); + + /* (0..1+2..3)*(0..1+2..3) into 12..15 */ + ZSUB2F(w, 16, w, 12, w, 14); + ZSUB2F(w, 17, w, 13, w, 15); + ZADDT(w, 13, w, 16); + ZADDT(w, 14, w, 17); + + /* first-level recomposition */ + ZSUB2F(w, 12, w, 0, w, 4); + ZSUB2F(w, 13, w, 1, w, 5); + ZSUB2F(w, 14, w, 2, w, 6); + ZSUB2F(w, 15, w, 3, w, 7); + ZADDT(w, 2, w, 12); + ZADDT(w, 3, w, 13); + ZADDT(w, 4, w, 14); + ZADDT(w, 5, w, 15); + + /* + * Perform carry propagation to bring all words down to 13 bits. + */ + cc = norm13(d, w, 40); + d[39] += (cc << 13); + +#undef ZADD +#undef ZADDT +#undef ZSUB2F +#undef CPR1 +#undef CPR +} + +static inline void +square20(uint32_t *d, const uint32_t *a) +{ + mul20(d, a, a); +} + +#else + +static void +mul20(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + uint32_t t[39]; + + t[ 0] = MUL15(a[ 0], b[ 0]); + t[ 1] = MUL15(a[ 0], b[ 1]) + + MUL15(a[ 1], b[ 0]); + t[ 2] = MUL15(a[ 0], b[ 2]) + + MUL15(a[ 1], b[ 1]) + + MUL15(a[ 2], b[ 0]); + t[ 3] = MUL15(a[ 0], b[ 3]) + + MUL15(a[ 1], b[ 2]) + + MUL15(a[ 2], b[ 1]) + + MUL15(a[ 3], b[ 0]); + t[ 4] = MUL15(a[ 0], b[ 4]) + + MUL15(a[ 1], b[ 3]) + + MUL15(a[ 2], b[ 2]) + + MUL15(a[ 3], b[ 1]) + + MUL15(a[ 4], b[ 0]); + t[ 5] = MUL15(a[ 0], b[ 5]) + + MUL15(a[ 1], b[ 4]) + + MUL15(a[ 2], b[ 3]) + + MUL15(a[ 3], b[ 2]) + + MUL15(a[ 4], b[ 1]) + + MUL15(a[ 5], b[ 0]); + t[ 6] = MUL15(a[ 0], b[ 6]) + + MUL15(a[ 1], b[ 5]) + + MUL15(a[ 2], b[ 4]) + + MUL15(a[ 3], b[ 3]) + + MUL15(a[ 4], b[ 2]) + + MUL15(a[ 5], b[ 1]) + + MUL15(a[ 6], b[ 0]); + t[ 7] = MUL15(a[ 0], b[ 7]) + + MUL15(a[ 1], b[ 6]) + + MUL15(a[ 2], b[ 5]) + + MUL15(a[ 3], b[ 4]) + + MUL15(a[ 4], b[ 3]) + + MUL15(a[ 5], b[ 2]) + + MUL15(a[ 6], b[ 1]) + + MUL15(a[ 7], b[ 0]); + t[ 8] = MUL15(a[ 0], b[ 8]) 
+ + MUL15(a[ 1], b[ 7]) + + MUL15(a[ 2], b[ 6]) + + MUL15(a[ 3], b[ 5]) + + MUL15(a[ 4], b[ 4]) + + MUL15(a[ 5], b[ 3]) + + MUL15(a[ 6], b[ 2]) + + MUL15(a[ 7], b[ 1]) + + MUL15(a[ 8], b[ 0]); + t[ 9] = MUL15(a[ 0], b[ 9]) + + MUL15(a[ 1], b[ 8]) + + MUL15(a[ 2], b[ 7]) + + MUL15(a[ 3], b[ 6]) + + MUL15(a[ 4], b[ 5]) + + MUL15(a[ 5], b[ 4]) + + MUL15(a[ 6], b[ 3]) + + MUL15(a[ 7], b[ 2]) + + MUL15(a[ 8], b[ 1]) + + MUL15(a[ 9], b[ 0]); + t[10] = MUL15(a[ 0], b[10]) + + MUL15(a[ 1], b[ 9]) + + MUL15(a[ 2], b[ 8]) + + MUL15(a[ 3], b[ 7]) + + MUL15(a[ 4], b[ 6]) + + MUL15(a[ 5], b[ 5]) + + MUL15(a[ 6], b[ 4]) + + MUL15(a[ 7], b[ 3]) + + MUL15(a[ 8], b[ 2]) + + MUL15(a[ 9], b[ 1]) + + MUL15(a[10], b[ 0]); + t[11] = MUL15(a[ 0], b[11]) + + MUL15(a[ 1], b[10]) + + MUL15(a[ 2], b[ 9]) + + MUL15(a[ 3], b[ 8]) + + MUL15(a[ 4], b[ 7]) + + MUL15(a[ 5], b[ 6]) + + MUL15(a[ 6], b[ 5]) + + MUL15(a[ 7], b[ 4]) + + MUL15(a[ 8], b[ 3]) + + MUL15(a[ 9], b[ 2]) + + MUL15(a[10], b[ 1]) + + MUL15(a[11], b[ 0]); + t[12] = MUL15(a[ 0], b[12]) + + MUL15(a[ 1], b[11]) + + MUL15(a[ 2], b[10]) + + MUL15(a[ 3], b[ 9]) + + MUL15(a[ 4], b[ 8]) + + MUL15(a[ 5], b[ 7]) + + MUL15(a[ 6], b[ 6]) + + MUL15(a[ 7], b[ 5]) + + MUL15(a[ 8], b[ 4]) + + MUL15(a[ 9], b[ 3]) + + MUL15(a[10], b[ 2]) + + MUL15(a[11], b[ 1]) + + MUL15(a[12], b[ 0]); + t[13] = MUL15(a[ 0], b[13]) + + MUL15(a[ 1], b[12]) + + MUL15(a[ 2], b[11]) + + MUL15(a[ 3], b[10]) + + MUL15(a[ 4], b[ 9]) + + MUL15(a[ 5], b[ 8]) + + MUL15(a[ 6], b[ 7]) + + MUL15(a[ 7], b[ 6]) + + MUL15(a[ 8], b[ 5]) + + MUL15(a[ 9], b[ 4]) + + MUL15(a[10], b[ 3]) + + MUL15(a[11], b[ 2]) + + MUL15(a[12], b[ 1]) + + MUL15(a[13], b[ 0]); + t[14] = MUL15(a[ 0], b[14]) + + MUL15(a[ 1], b[13]) + + MUL15(a[ 2], b[12]) + + MUL15(a[ 3], b[11]) + + MUL15(a[ 4], b[10]) + + MUL15(a[ 5], b[ 9]) + + MUL15(a[ 6], b[ 8]) + + MUL15(a[ 7], b[ 7]) + + MUL15(a[ 8], b[ 6]) + + MUL15(a[ 9], b[ 5]) + + MUL15(a[10], b[ 4]) + + MUL15(a[11], b[ 3]) + + MUL15(a[12], b[ 2]) + + 
MUL15(a[13], b[ 1]) + + MUL15(a[14], b[ 0]); + t[15] = MUL15(a[ 0], b[15]) + + MUL15(a[ 1], b[14]) + + MUL15(a[ 2], b[13]) + + MUL15(a[ 3], b[12]) + + MUL15(a[ 4], b[11]) + + MUL15(a[ 5], b[10]) + + MUL15(a[ 6], b[ 9]) + + MUL15(a[ 7], b[ 8]) + + MUL15(a[ 8], b[ 7]) + + MUL15(a[ 9], b[ 6]) + + MUL15(a[10], b[ 5]) + + MUL15(a[11], b[ 4]) + + MUL15(a[12], b[ 3]) + + MUL15(a[13], b[ 2]) + + MUL15(a[14], b[ 1]) + + MUL15(a[15], b[ 0]); + t[16] = MUL15(a[ 0], b[16]) + + MUL15(a[ 1], b[15]) + + MUL15(a[ 2], b[14]) + + MUL15(a[ 3], b[13]) + + MUL15(a[ 4], b[12]) + + MUL15(a[ 5], b[11]) + + MUL15(a[ 6], b[10]) + + MUL15(a[ 7], b[ 9]) + + MUL15(a[ 8], b[ 8]) + + MUL15(a[ 9], b[ 7]) + + MUL15(a[10], b[ 6]) + + MUL15(a[11], b[ 5]) + + MUL15(a[12], b[ 4]) + + MUL15(a[13], b[ 3]) + + MUL15(a[14], b[ 2]) + + MUL15(a[15], b[ 1]) + + MUL15(a[16], b[ 0]); + t[17] = MUL15(a[ 0], b[17]) + + MUL15(a[ 1], b[16]) + + MUL15(a[ 2], b[15]) + + MUL15(a[ 3], b[14]) + + MUL15(a[ 4], b[13]) + + MUL15(a[ 5], b[12]) + + MUL15(a[ 6], b[11]) + + MUL15(a[ 7], b[10]) + + MUL15(a[ 8], b[ 9]) + + MUL15(a[ 9], b[ 8]) + + MUL15(a[10], b[ 7]) + + MUL15(a[11], b[ 6]) + + MUL15(a[12], b[ 5]) + + MUL15(a[13], b[ 4]) + + MUL15(a[14], b[ 3]) + + MUL15(a[15], b[ 2]) + + MUL15(a[16], b[ 1]) + + MUL15(a[17], b[ 0]); + t[18] = MUL15(a[ 0], b[18]) + + MUL15(a[ 1], b[17]) + + MUL15(a[ 2], b[16]) + + MUL15(a[ 3], b[15]) + + MUL15(a[ 4], b[14]) + + MUL15(a[ 5], b[13]) + + MUL15(a[ 6], b[12]) + + MUL15(a[ 7], b[11]) + + MUL15(a[ 8], b[10]) + + MUL15(a[ 9], b[ 9]) + + MUL15(a[10], b[ 8]) + + MUL15(a[11], b[ 7]) + + MUL15(a[12], b[ 6]) + + MUL15(a[13], b[ 5]) + + MUL15(a[14], b[ 4]) + + MUL15(a[15], b[ 3]) + + MUL15(a[16], b[ 2]) + + MUL15(a[17], b[ 1]) + + MUL15(a[18], b[ 0]); + t[19] = MUL15(a[ 0], b[19]) + + MUL15(a[ 1], b[18]) + + MUL15(a[ 2], b[17]) + + MUL15(a[ 3], b[16]) + + MUL15(a[ 4], b[15]) + + MUL15(a[ 5], b[14]) + + MUL15(a[ 6], b[13]) + + MUL15(a[ 7], b[12]) + + MUL15(a[ 8], b[11]) + + MUL15(a[ 9], b[10]) 
+ + MUL15(a[10], b[ 9]) + + MUL15(a[11], b[ 8]) + + MUL15(a[12], b[ 7]) + + MUL15(a[13], b[ 6]) + + MUL15(a[14], b[ 5]) + + MUL15(a[15], b[ 4]) + + MUL15(a[16], b[ 3]) + + MUL15(a[17], b[ 2]) + + MUL15(a[18], b[ 1]) + + MUL15(a[19], b[ 0]); + t[20] = MUL15(a[ 1], b[19]) + + MUL15(a[ 2], b[18]) + + MUL15(a[ 3], b[17]) + + MUL15(a[ 4], b[16]) + + MUL15(a[ 5], b[15]) + + MUL15(a[ 6], b[14]) + + MUL15(a[ 7], b[13]) + + MUL15(a[ 8], b[12]) + + MUL15(a[ 9], b[11]) + + MUL15(a[10], b[10]) + + MUL15(a[11], b[ 9]) + + MUL15(a[12], b[ 8]) + + MUL15(a[13], b[ 7]) + + MUL15(a[14], b[ 6]) + + MUL15(a[15], b[ 5]) + + MUL15(a[16], b[ 4]) + + MUL15(a[17], b[ 3]) + + MUL15(a[18], b[ 2]) + + MUL15(a[19], b[ 1]); + t[21] = MUL15(a[ 2], b[19]) + + MUL15(a[ 3], b[18]) + + MUL15(a[ 4], b[17]) + + MUL15(a[ 5], b[16]) + + MUL15(a[ 6], b[15]) + + MUL15(a[ 7], b[14]) + + MUL15(a[ 8], b[13]) + + MUL15(a[ 9], b[12]) + + MUL15(a[10], b[11]) + + MUL15(a[11], b[10]) + + MUL15(a[12], b[ 9]) + + MUL15(a[13], b[ 8]) + + MUL15(a[14], b[ 7]) + + MUL15(a[15], b[ 6]) + + MUL15(a[16], b[ 5]) + + MUL15(a[17], b[ 4]) + + MUL15(a[18], b[ 3]) + + MUL15(a[19], b[ 2]); + t[22] = MUL15(a[ 3], b[19]) + + MUL15(a[ 4], b[18]) + + MUL15(a[ 5], b[17]) + + MUL15(a[ 6], b[16]) + + MUL15(a[ 7], b[15]) + + MUL15(a[ 8], b[14]) + + MUL15(a[ 9], b[13]) + + MUL15(a[10], b[12]) + + MUL15(a[11], b[11]) + + MUL15(a[12], b[10]) + + MUL15(a[13], b[ 9]) + + MUL15(a[14], b[ 8]) + + MUL15(a[15], b[ 7]) + + MUL15(a[16], b[ 6]) + + MUL15(a[17], b[ 5]) + + MUL15(a[18], b[ 4]) + + MUL15(a[19], b[ 3]); + t[23] = MUL15(a[ 4], b[19]) + + MUL15(a[ 5], b[18]) + + MUL15(a[ 6], b[17]) + + MUL15(a[ 7], b[16]) + + MUL15(a[ 8], b[15]) + + MUL15(a[ 9], b[14]) + + MUL15(a[10], b[13]) + + MUL15(a[11], b[12]) + + MUL15(a[12], b[11]) + + MUL15(a[13], b[10]) + + MUL15(a[14], b[ 9]) + + MUL15(a[15], b[ 8]) + + MUL15(a[16], b[ 7]) + + MUL15(a[17], b[ 6]) + + MUL15(a[18], b[ 5]) + + MUL15(a[19], b[ 4]); + t[24] = MUL15(a[ 5], b[19]) + + MUL15(a[ 6], 
b[18]) + + MUL15(a[ 7], b[17]) + + MUL15(a[ 8], b[16]) + + MUL15(a[ 9], b[15]) + + MUL15(a[10], b[14]) + + MUL15(a[11], b[13]) + + MUL15(a[12], b[12]) + + MUL15(a[13], b[11]) + + MUL15(a[14], b[10]) + + MUL15(a[15], b[ 9]) + + MUL15(a[16], b[ 8]) + + MUL15(a[17], b[ 7]) + + MUL15(a[18], b[ 6]) + + MUL15(a[19], b[ 5]); + t[25] = MUL15(a[ 6], b[19]) + + MUL15(a[ 7], b[18]) + + MUL15(a[ 8], b[17]) + + MUL15(a[ 9], b[16]) + + MUL15(a[10], b[15]) + + MUL15(a[11], b[14]) + + MUL15(a[12], b[13]) + + MUL15(a[13], b[12]) + + MUL15(a[14], b[11]) + + MUL15(a[15], b[10]) + + MUL15(a[16], b[ 9]) + + MUL15(a[17], b[ 8]) + + MUL15(a[18], b[ 7]) + + MUL15(a[19], b[ 6]); + t[26] = MUL15(a[ 7], b[19]) + + MUL15(a[ 8], b[18]) + + MUL15(a[ 9], b[17]) + + MUL15(a[10], b[16]) + + MUL15(a[11], b[15]) + + MUL15(a[12], b[14]) + + MUL15(a[13], b[13]) + + MUL15(a[14], b[12]) + + MUL15(a[15], b[11]) + + MUL15(a[16], b[10]) + + MUL15(a[17], b[ 9]) + + MUL15(a[18], b[ 8]) + + MUL15(a[19], b[ 7]); + t[27] = MUL15(a[ 8], b[19]) + + MUL15(a[ 9], b[18]) + + MUL15(a[10], b[17]) + + MUL15(a[11], b[16]) + + MUL15(a[12], b[15]) + + MUL15(a[13], b[14]) + + MUL15(a[14], b[13]) + + MUL15(a[15], b[12]) + + MUL15(a[16], b[11]) + + MUL15(a[17], b[10]) + + MUL15(a[18], b[ 9]) + + MUL15(a[19], b[ 8]); + t[28] = MUL15(a[ 9], b[19]) + + MUL15(a[10], b[18]) + + MUL15(a[11], b[17]) + + MUL15(a[12], b[16]) + + MUL15(a[13], b[15]) + + MUL15(a[14], b[14]) + + MUL15(a[15], b[13]) + + MUL15(a[16], b[12]) + + MUL15(a[17], b[11]) + + MUL15(a[18], b[10]) + + MUL15(a[19], b[ 9]); + t[29] = MUL15(a[10], b[19]) + + MUL15(a[11], b[18]) + + MUL15(a[12], b[17]) + + MUL15(a[13], b[16]) + + MUL15(a[14], b[15]) + + MUL15(a[15], b[14]) + + MUL15(a[16], b[13]) + + MUL15(a[17], b[12]) + + MUL15(a[18], b[11]) + + MUL15(a[19], b[10]); + t[30] = MUL15(a[11], b[19]) + + MUL15(a[12], b[18]) + + MUL15(a[13], b[17]) + + MUL15(a[14], b[16]) + + MUL15(a[15], b[15]) + + MUL15(a[16], b[14]) + + MUL15(a[17], b[13]) + + MUL15(a[18], b[12]) + + 
MUL15(a[19], b[11]); + t[31] = MUL15(a[12], b[19]) + + MUL15(a[13], b[18]) + + MUL15(a[14], b[17]) + + MUL15(a[15], b[16]) + + MUL15(a[16], b[15]) + + MUL15(a[17], b[14]) + + MUL15(a[18], b[13]) + + MUL15(a[19], b[12]); + t[32] = MUL15(a[13], b[19]) + + MUL15(a[14], b[18]) + + MUL15(a[15], b[17]) + + MUL15(a[16], b[16]) + + MUL15(a[17], b[15]) + + MUL15(a[18], b[14]) + + MUL15(a[19], b[13]); + t[33] = MUL15(a[14], b[19]) + + MUL15(a[15], b[18]) + + MUL15(a[16], b[17]) + + MUL15(a[17], b[16]) + + MUL15(a[18], b[15]) + + MUL15(a[19], b[14]); + t[34] = MUL15(a[15], b[19]) + + MUL15(a[16], b[18]) + + MUL15(a[17], b[17]) + + MUL15(a[18], b[16]) + + MUL15(a[19], b[15]); + t[35] = MUL15(a[16], b[19]) + + MUL15(a[17], b[18]) + + MUL15(a[18], b[17]) + + MUL15(a[19], b[16]); + t[36] = MUL15(a[17], b[19]) + + MUL15(a[18], b[18]) + + MUL15(a[19], b[17]); + t[37] = MUL15(a[18], b[19]) + + MUL15(a[19], b[18]); + t[38] = MUL15(a[19], b[19]); + + d[39] = norm13(d, t, 39); +} + +static void +square20(uint32_t *d, const uint32_t *a) +{ + uint32_t t[39]; + + t[ 0] = MUL15(a[ 0], a[ 0]); + t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1); + t[ 2] = MUL15(a[ 1], a[ 1]) + + ((MUL15(a[ 0], a[ 2])) << 1); + t[ 3] = ((MUL15(a[ 0], a[ 3]) + + MUL15(a[ 1], a[ 2])) << 1); + t[ 4] = MUL15(a[ 2], a[ 2]) + + ((MUL15(a[ 0], a[ 4]) + + MUL15(a[ 1], a[ 3])) << 1); + t[ 5] = ((MUL15(a[ 0], a[ 5]) + + MUL15(a[ 1], a[ 4]) + + MUL15(a[ 2], a[ 3])) << 1); + t[ 6] = MUL15(a[ 3], a[ 3]) + + ((MUL15(a[ 0], a[ 6]) + + MUL15(a[ 1], a[ 5]) + + MUL15(a[ 2], a[ 4])) << 1); + t[ 7] = ((MUL15(a[ 0], a[ 7]) + + MUL15(a[ 1], a[ 6]) + + MUL15(a[ 2], a[ 5]) + + MUL15(a[ 3], a[ 4])) << 1); + t[ 8] = MUL15(a[ 4], a[ 4]) + + ((MUL15(a[ 0], a[ 8]) + + MUL15(a[ 1], a[ 7]) + + MUL15(a[ 2], a[ 6]) + + MUL15(a[ 3], a[ 5])) << 1); + t[ 9] = ((MUL15(a[ 0], a[ 9]) + + MUL15(a[ 1], a[ 8]) + + MUL15(a[ 2], a[ 7]) + + MUL15(a[ 3], a[ 6]) + + MUL15(a[ 4], a[ 5])) << 1); + t[10] = MUL15(a[ 5], a[ 5]) + + ((MUL15(a[ 0], a[10]) + + MUL15(a[ 1], 
a[ 9]) + + MUL15(a[ 2], a[ 8]) + + MUL15(a[ 3], a[ 7]) + + MUL15(a[ 4], a[ 6])) << 1); + t[11] = ((MUL15(a[ 0], a[11]) + + MUL15(a[ 1], a[10]) + + MUL15(a[ 2], a[ 9]) + + MUL15(a[ 3], a[ 8]) + + MUL15(a[ 4], a[ 7]) + + MUL15(a[ 5], a[ 6])) << 1); + t[12] = MUL15(a[ 6], a[ 6]) + + ((MUL15(a[ 0], a[12]) + + MUL15(a[ 1], a[11]) + + MUL15(a[ 2], a[10]) + + MUL15(a[ 3], a[ 9]) + + MUL15(a[ 4], a[ 8]) + + MUL15(a[ 5], a[ 7])) << 1); + t[13] = ((MUL15(a[ 0], a[13]) + + MUL15(a[ 1], a[12]) + + MUL15(a[ 2], a[11]) + + MUL15(a[ 3], a[10]) + + MUL15(a[ 4], a[ 9]) + + MUL15(a[ 5], a[ 8]) + + MUL15(a[ 6], a[ 7])) << 1); + t[14] = MUL15(a[ 7], a[ 7]) + + ((MUL15(a[ 0], a[14]) + + MUL15(a[ 1], a[13]) + + MUL15(a[ 2], a[12]) + + MUL15(a[ 3], a[11]) + + MUL15(a[ 4], a[10]) + + MUL15(a[ 5], a[ 9]) + + MUL15(a[ 6], a[ 8])) << 1); + t[15] = ((MUL15(a[ 0], a[15]) + + MUL15(a[ 1], a[14]) + + MUL15(a[ 2], a[13]) + + MUL15(a[ 3], a[12]) + + MUL15(a[ 4], a[11]) + + MUL15(a[ 5], a[10]) + + MUL15(a[ 6], a[ 9]) + + MUL15(a[ 7], a[ 8])) << 1); + t[16] = MUL15(a[ 8], a[ 8]) + + ((MUL15(a[ 0], a[16]) + + MUL15(a[ 1], a[15]) + + MUL15(a[ 2], a[14]) + + MUL15(a[ 3], a[13]) + + MUL15(a[ 4], a[12]) + + MUL15(a[ 5], a[11]) + + MUL15(a[ 6], a[10]) + + MUL15(a[ 7], a[ 9])) << 1); + t[17] = ((MUL15(a[ 0], a[17]) + + MUL15(a[ 1], a[16]) + + MUL15(a[ 2], a[15]) + + MUL15(a[ 3], a[14]) + + MUL15(a[ 4], a[13]) + + MUL15(a[ 5], a[12]) + + MUL15(a[ 6], a[11]) + + MUL15(a[ 7], a[10]) + + MUL15(a[ 8], a[ 9])) << 1); + t[18] = MUL15(a[ 9], a[ 9]) + + ((MUL15(a[ 0], a[18]) + + MUL15(a[ 1], a[17]) + + MUL15(a[ 2], a[16]) + + MUL15(a[ 3], a[15]) + + MUL15(a[ 4], a[14]) + + MUL15(a[ 5], a[13]) + + MUL15(a[ 6], a[12]) + + MUL15(a[ 7], a[11]) + + MUL15(a[ 8], a[10])) << 1); + t[19] = ((MUL15(a[ 0], a[19]) + + MUL15(a[ 1], a[18]) + + MUL15(a[ 2], a[17]) + + MUL15(a[ 3], a[16]) + + MUL15(a[ 4], a[15]) + + MUL15(a[ 5], a[14]) + + MUL15(a[ 6], a[13]) + + MUL15(a[ 7], a[12]) + + MUL15(a[ 8], a[11]) + + MUL15(a[ 9], a[10])) 
<< 1); + t[20] = MUL15(a[10], a[10]) + + ((MUL15(a[ 1], a[19]) + + MUL15(a[ 2], a[18]) + + MUL15(a[ 3], a[17]) + + MUL15(a[ 4], a[16]) + + MUL15(a[ 5], a[15]) + + MUL15(a[ 6], a[14]) + + MUL15(a[ 7], a[13]) + + MUL15(a[ 8], a[12]) + + MUL15(a[ 9], a[11])) << 1); + t[21] = ((MUL15(a[ 2], a[19]) + + MUL15(a[ 3], a[18]) + + MUL15(a[ 4], a[17]) + + MUL15(a[ 5], a[16]) + + MUL15(a[ 6], a[15]) + + MUL15(a[ 7], a[14]) + + MUL15(a[ 8], a[13]) + + MUL15(a[ 9], a[12]) + + MUL15(a[10], a[11])) << 1); + t[22] = MUL15(a[11], a[11]) + + ((MUL15(a[ 3], a[19]) + + MUL15(a[ 4], a[18]) + + MUL15(a[ 5], a[17]) + + MUL15(a[ 6], a[16]) + + MUL15(a[ 7], a[15]) + + MUL15(a[ 8], a[14]) + + MUL15(a[ 9], a[13]) + + MUL15(a[10], a[12])) << 1); + t[23] = ((MUL15(a[ 4], a[19]) + + MUL15(a[ 5], a[18]) + + MUL15(a[ 6], a[17]) + + MUL15(a[ 7], a[16]) + + MUL15(a[ 8], a[15]) + + MUL15(a[ 9], a[14]) + + MUL15(a[10], a[13]) + + MUL15(a[11], a[12])) << 1); + t[24] = MUL15(a[12], a[12]) + + ((MUL15(a[ 5], a[19]) + + MUL15(a[ 6], a[18]) + + MUL15(a[ 7], a[17]) + + MUL15(a[ 8], a[16]) + + MUL15(a[ 9], a[15]) + + MUL15(a[10], a[14]) + + MUL15(a[11], a[13])) << 1); + t[25] = ((MUL15(a[ 6], a[19]) + + MUL15(a[ 7], a[18]) + + MUL15(a[ 8], a[17]) + + MUL15(a[ 9], a[16]) + + MUL15(a[10], a[15]) + + MUL15(a[11], a[14]) + + MUL15(a[12], a[13])) << 1); + t[26] = MUL15(a[13], a[13]) + + ((MUL15(a[ 7], a[19]) + + MUL15(a[ 8], a[18]) + + MUL15(a[ 9], a[17]) + + MUL15(a[10], a[16]) + + MUL15(a[11], a[15]) + + MUL15(a[12], a[14])) << 1); + t[27] = ((MUL15(a[ 8], a[19]) + + MUL15(a[ 9], a[18]) + + MUL15(a[10], a[17]) + + MUL15(a[11], a[16]) + + MUL15(a[12], a[15]) + + MUL15(a[13], a[14])) << 1); + t[28] = MUL15(a[14], a[14]) + + ((MUL15(a[ 9], a[19]) + + MUL15(a[10], a[18]) + + MUL15(a[11], a[17]) + + MUL15(a[12], a[16]) + + MUL15(a[13], a[15])) << 1); + t[29] = ((MUL15(a[10], a[19]) + + MUL15(a[11], a[18]) + + MUL15(a[12], a[17]) + + MUL15(a[13], a[16]) + + MUL15(a[14], a[15])) << 1); + t[30] = MUL15(a[15], a[15]) + 
+ ((MUL15(a[11], a[19]) + + MUL15(a[12], a[18]) + + MUL15(a[13], a[17]) + + MUL15(a[14], a[16])) << 1); + t[31] = ((MUL15(a[12], a[19]) + + MUL15(a[13], a[18]) + + MUL15(a[14], a[17]) + + MUL15(a[15], a[16])) << 1); + t[32] = MUL15(a[16], a[16]) + + ((MUL15(a[13], a[19]) + + MUL15(a[14], a[18]) + + MUL15(a[15], a[17])) << 1); + t[33] = ((MUL15(a[14], a[19]) + + MUL15(a[15], a[18]) + + MUL15(a[16], a[17])) << 1); + t[34] = MUL15(a[17], a[17]) + + ((MUL15(a[15], a[19]) + + MUL15(a[16], a[18])) << 1); + t[35] = ((MUL15(a[16], a[19]) + + MUL15(a[17], a[18])) << 1); + t[36] = MUL15(a[18], a[18]) + + ((MUL15(a[17], a[19])) << 1); + t[37] = ((MUL15(a[18], a[19])) << 1); + t[38] = MUL15(a[19], a[19]); + + d[39] = norm13(d, t, 39); +} + +#endif + +/* + * Perform a "final reduction" in field F255 (field for Curve25519) + * The source value must be less than twice the modulus. If the value + * is not lower than the modulus, then the modulus is subtracted and + * this function returns 1; otherwise, it leaves it untouched and it + * returns 0. + */ +static uint32_t +reduce_final_f255(uint32_t *d) +{ + uint32_t t[20]; + uint32_t cc; + int i; + + memcpy(t, d, sizeof t); + cc = 19; + for (i = 0; i < 20; i ++) { + uint32_t w; + + w = t[i] + cc; + cc = w >> 13; + t[i] = w & 0x1FFF; + } + cc = t[19] >> 8; + t[19] &= 0xFF; + CCOPY(cc, d, t, sizeof t); + return cc; +} + +static void +f255_mulgen(uint32_t *d, const uint32_t *a, const uint32_t *b, int square) +{ + uint32_t t[40], cc, w; + + /* + * Compute raw multiplication. All result words fit in 13 bits + * each; upper word (t[39]) must fit on 5 bits, since the product + * of two 256-bit integers must fit on 512 bits. + */ + if (square) { + square20(t, a); + } else { + mul20(t, a, b); + } + + /* + * Modular reduction: each high word is added where necessary. + * Since the modulus is 2^255-19 and word 20 corresponds to + * offset 20*13 = 260, word 20+k must be added to word k with + * a factor of 19*2^5 = 608. 
The extra bits in word 19 are also + * added that way. + */ + cc = MUL15(t[19] >> 8, 19); + t[19] &= 0xFF; + +#define MM1(x) do { \ + w = t[x] + cc + MUL15(t[(x) + 20], 608); \ + t[x] = w & 0x1FFF; \ + cc = w >> 13; \ + } while (0) + + MM1( 0); + MM1( 1); + MM1( 2); + MM1( 3); + MM1( 4); + MM1( 5); + MM1( 6); + MM1( 7); + MM1( 8); + MM1( 9); + MM1(10); + MM1(11); + MM1(12); + MM1(13); + MM1(14); + MM1(15); + MM1(16); + MM1(17); + MM1(18); + MM1(19); + +#undef MM1 + + cc = MUL15(w >> 8, 19); + t[19] &= 0xFF; + +#define MM2(x) do { \ + w = t[x] + cc; \ + d[x] = w & 0x1FFF; \ + cc = w >> 13; \ + } while (0) + + MM2( 0); + MM2( 1); + MM2( 2); + MM2( 3); + MM2( 4); + MM2( 5); + MM2( 6); + MM2( 7); + MM2( 8); + MM2( 9); + MM2(10); + MM2(11); + MM2(12); + MM2(13); + MM2(14); + MM2(15); + MM2(16); + MM2(17); + MM2(18); + MM2(19); + +#undef MM2 +} + +/* + * Perform a multiplication of two integers modulo 2^255-19. + * Operands are arrays of 20 words, each containing 13 bits of data, in + * little-endian order. Input value may be up to 2^256-1; on output, value + * fits on 256 bits and is lower than twice the modulus. + * + * f255_mul() is the general multiplication, f255_square() is specialised + * for squarings. + */ +#define f255_mul(d, a, b) f255_mulgen(d, a, b, 0) +#define f255_square(d, a) f255_mulgen(d, a, a, 1) + +/* + * Add two values in F255. Partial reduction is performed (down to less + * than twice the modulus). + */ +static void +f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + int i; + uint32_t cc, w; + + cc = 0; + for (i = 0; i < 20; i ++) { + w = a[i] + b[i] + cc; + d[i] = w & 0x1FFF; + cc = w >> 13; + } + cc = MUL15(w >> 8, 19); + d[19] &= 0xFF; + for (i = 0; i < 20; i ++) { + w = d[i] + cc; + d[i] = w & 0x1FFF; + cc = w >> 13; + } +} + +/* + * Subtract one value from another in F255. Partial reduction is + * performed (down to less than twice the modulus). 
+ */ +static void +f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + /* + * We actually compute a - b + 2*p, so that the final value is + * necessarily positive. + */ + int i; + uint32_t cc, w; + + cc = (uint32_t)-38; + for (i = 0; i < 20; i ++) { + w = a[i] - b[i] + cc; + d[i] = w & 0x1FFF; + cc = ARSH(w, 13); + } + cc = MUL15((w + 0x200) >> 8, 19); + d[19] &= 0xFF; + for (i = 0; i < 20; i ++) { + w = d[i] + cc; + d[i] = w & 0x1FFF; + cc = w >> 13; + } +} + +/* + * Multiply an integer by the 'A24' constant (121665). Partial reduction + * is performed (down to less than twice the modulus). + */ +static void +f255_mul_a24(uint32_t *d, const uint32_t *a) +{ + int i; + uint32_t cc, w; + + cc = 0; + for (i = 0; i < 20; i ++) { + w = MUL15(a[i], 121665) + cc; + d[i] = w & 0x1FFF; + cc = w >> 13; + } + cc = MUL15(w >> 8, 19); + d[19] &= 0xFF; + for (i = 0; i < 20; i ++) { + w = d[i] + cc; + d[i] = w & 0x1FFF; + cc = w >> 13; + } +} + +static const unsigned char GEN[] = { + 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static const unsigned char ORDER[] = { + 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF +}; + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return GEN; +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return ORDER; +} + +static size_t +api_xoff(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return 0; +} + +static void +cswap(uint32_t *a, uint32_t *b, uint32_t ctl) +{ + int i; + + ctl = -ctl; + for (i = 0; i < 20; i ++) { + uint32_t aw, bw, tw; + + aw = a[i]; + bw = b[i]; + tw = ctl & (aw ^ bw); + a[i] = aw ^ tw; + b[i] = 
bw ^ tw; + } +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *kb, size_t kblen, int curve) +{ + uint32_t x1[20], x2[20], x3[20], z2[20], z3[20]; + uint32_t a[20], aa[20], b[20], bb[20]; + uint32_t c[20], d[20], e[20], da[20], cb[20]; + unsigned char k[32]; + uint32_t swap; + int i; + + (void)curve; + + /* + * Points are encoded over exactly 32 bytes. Multipliers must fit + * in 32 bytes as well. + * RFC 7748 mandates that the high bit of the last point byte must + * be ignored/cleared. + */ + if (Glen != 32 || kblen > 32) { + return 0; + } + G[31] &= 0x7F; + + /* + * Initialise variables x1, x2, z2, x3 and z3: x1 and x3 receive + * the decoded input point, x2 and z3 are set to 1, and z2 to 0. + * Values are plain integers over 20 words of 13 bits each (the + * f255_* functions do not use a Montgomery representation). + */ + x1[19] = le8_to_le13(x1, G, 32); + memcpy(x3, x1, sizeof x1); + memset(z2, 0, sizeof z2); + memset(x2, 0, sizeof x2); + x2[0] = 1; + memset(z3, 0, sizeof z3); + z3[0] = 1; + + /* + * Copy the multiplier into k[], left-padded with zeros up to 32 + * bytes, then force some of its bits: the three low bits of k[31] + * are cleared, the top bit of k[0] is cleared and the next bit of + * k[0] is set. + */ + memset(k, 0, (sizeof k) - kblen); + memcpy(k + (sizeof k) - kblen, kb, kblen); + k[31] &= 0xF8; + k[0] &= 0x7F; + k[0] |= 0x40; + + /* obsolete + print_int("x1", x1); + */ + + /* + * Ladder over bits 254 down to 0 of the multiplier; bit i is read + * from byte k[31 - (i >> 3)], i.e. k[31] is the least significant + * byte. The conditional swaps are driven by the XOR of the current + * bit with the previous one. + */ + swap = 0; + for (i = 254; i >= 0; i --) { + uint32_t kt; + + kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; + swap ^= kt; + cswap(x2, x3, swap); + cswap(z2, z3, swap); + swap = kt; + + /* obsolete + print_int("x2", x2); + print_int("z2", z2); + print_int("x3", x3); + print_int("z3", z3); + */ + + f255_add(a, x2, z2); + f255_square(aa, a); + f255_sub(b, x2, z2); + f255_square(bb, b); + f255_sub(e, aa, bb); + f255_add(c, x3, z3); + f255_sub(d, x3, z3); + f255_mul(da, d, a); + f255_mul(cb, c, b); + + /* obsolete + print_int("a ", a); + print_int("aa", aa); + print_int("b ", b); + print_int("bb", bb); + print_int("e ", e); + print_int("c ", c); + print_int("d ", d); + print_int("da", da); + print_int("cb", cb); + */ + + f255_add(x3, da, cb); + f255_square(x3, x3); + f255_sub(z3, da, cb); + f255_square(z3, z3); + f255_mul(z3, z3, x1); + f255_mul(x2, aa, bb); + f255_mul_a24(z2, e); + f255_add(z2, z2, aa); + f255_mul(z2, e, z2); +
+ /* obsolete + print_int("x2", x2); + print_int("z2", z2); + print_int("x3", x3); + print_int("z3", z3); + */ + } + cswap(x2, x3, swap); + cswap(z2, z3, swap); + + /* + * Invert z2 with a modular exponentiation. This is a simple + * square-and-multiply algorithm; since the exponent contains + * almost only ones, most of the non-squaring multiplications are + * shared: + * - first loop: a = z2^(2^16-1); + * - second loop: b = z2^(2^240-1), using a for each 16-bit chunk; + * - third loop: the 15 low exponent bits (bits 14..0 of 0xFFEB) + * are processed one at a time. + * The complete exponent is 2^255-21, i.e. the modulus minus 2. + */ + memcpy(a, z2, sizeof z2); + for (i = 0; i < 15; i ++) { + f255_square(a, a); + f255_mul(a, a, z2); + } + memcpy(b, a, sizeof a); + for (i = 0; i < 14; i ++) { + int j; + + for (j = 0; j < 16; j ++) { + f255_square(b, b); + } + f255_mul(b, b, a); + } + for (i = 14; i >= 0; i --) { + f255_square(b, b); + if ((0xFFEB >> i) & 1) { + f255_mul(b, z2, b); + } + } + f255_mul(x2, x2, b); + reduce_final_f255(x2); + le13_to_le8(G, 32, x2); + return 1; +} + +/* + * Multiply the conventional generator by x: the generator returned by + * api_generator() is copied into R, then multiplied in place with + * api_mul(). Returned value is the generator length; the status + * returned by api_mul() is not checked here. + */ +static size_t +api_mulgen(unsigned char *R, + const unsigned char *x, size_t xlen, int curve) +{ + const unsigned char *G; + size_t Glen; + + G = api_generator(curve, &Glen); + memcpy(R, G, Glen); + api_mul(R, Glen, x, xlen, curve); + return Glen; +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + /* + * We don't implement this method, since it is used for ECDSA + * only, and there is no ECDSA over Curve25519 (which instead + * uses EdDSA).
+ */ + (void)A; + (void)B; + (void)len; + (void)x; + (void)xlen; + (void)y; + (void)ylen; + (void)curve; + return 0; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_c25519_m15 = { + (uint32_t)0x20000000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_m31.c b/test/monniaux/BearSSL/src/ec/ec_c25519_m31.c new file mode 100644 index 00000000..1dd6d514 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_c25519_m31.c @@ -0,0 +1,800 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +/* obsolete +#include <stdio.h> +#include <stdlib.h> +static void +print_int(const char *name, const uint32_t *x) +{ + size_t u; + unsigned char tmp[40]; + + printf("%s = ", name); + for (u = 0; u < 9; u ++) { + if (x[u] > 0x3FFFFFFF) { + printf("INVALID:"); + for (u = 0; u < 9; u ++) { + printf(" %08X", x[u]); + } + printf("\n"); + return; + } + } + memset(tmp, 0, sizeof tmp); + for (u = 0; u < 9; u ++) { + uint64_t w; + int j, k; + + w = x[u]; + j = 30 * (int)u; + k = j & 7; + if (k != 0) { + w <<= k; + j -= k; + } + k = j >> 3; + for (j = 0; j < 8; j ++) { + tmp[39 - k - j] |= (unsigned char)w; + w >>= 8; + } + } + for (u = 8; u < 40; u ++) { + printf("%02X", tmp[u]); + } + printf("\n"); +} +*/ + +/* + * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_ + * that right-shifting a signed negative integer copies the sign bit + * (arithmetic right-shift). This is "implementation-defined behaviour", + * i.e. it is not undefined, but it may differ between compilers. Each + * compiler is supposed to document its behaviour in that respect. GCC + * explicitly defines that an arithmetic right shift is used. We expect + * all other compilers to do the same, because underlying CPU offer an + * arithmetic right shift opcode that could not be used otherwise. + */ +#if BR_NO_ARITH_SHIFT +#define ARSH(x, n) (((uint32_t)(x) >> (n)) \ + | ((-((uint32_t)(x) >> 31)) << (32 - (n)))) +#else +#define ARSH(x, n) ((*(int32_t *)&(x)) >> (n)) +#endif + +/* + * Convert an integer from unsigned little-endian encoding to a sequence of + * 30-bit words in little-endian order. The final "partial" word is + * returned. 
+ */ +static uint32_t +le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len) +{ + uint32_t acc; + int acc_len; + + acc = 0; + acc_len = 0; + while (len -- > 0) { + uint32_t b; + + b = *src ++; + if (acc_len < 22) { + acc |= b << acc_len; + acc_len += 8; + } else { + *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF; + acc = b >> (30 - acc_len); + acc_len -= 22; + } + } + return acc; +} + +/* + * Convert an integer (30-bit words, little-endian) to unsigned + * little-endian encoding. The total encoding length is provided; all + * the destination bytes will be filled. + */ +static void +le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src) +{ + uint32_t acc; + int acc_len; + + acc = 0; + acc_len = 0; + while (len -- > 0) { + if (acc_len < 8) { + uint32_t w; + + w = *src ++; + *dst ++ = (unsigned char)(acc | (w << acc_len)); + acc = w >> (8 - acc_len); + acc_len += 22; + } else { + *dst ++ = (unsigned char)acc; + acc >>= 8; + acc_len -= 8; + } + } +} + +/* + * Multiply two integers. Source integers are represented as arrays of + * nine 30-bit words, for values up to 2^270-1. Result is encoded over + * 18 words of 30 bits each. + */ +static void +mul9(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + /* + * Maximum intermediate result is no more than + * 10376293531797946367, which fits in 64 bits. Reason: + * + * 10376293531797946367 = 9 * (2^30-1)^2 + 9663676406 + * 10376293531797946367 < 9663676407 * 2^30 + * + * Thus, adding together 9 products of 30-bit integers, with + * a carry of at most 9663676406, yields an integer that fits + * on 64 bits and generates a carry of at most 9663676406. 
+ */ + uint64_t t[17]; + uint64_t cc; + int i; + + t[ 0] = MUL31(a[0], b[0]); + t[ 1] = MUL31(a[0], b[1]) + + MUL31(a[1], b[0]); + t[ 2] = MUL31(a[0], b[2]) + + MUL31(a[1], b[1]) + + MUL31(a[2], b[0]); + t[ 3] = MUL31(a[0], b[3]) + + MUL31(a[1], b[2]) + + MUL31(a[2], b[1]) + + MUL31(a[3], b[0]); + t[ 4] = MUL31(a[0], b[4]) + + MUL31(a[1], b[3]) + + MUL31(a[2], b[2]) + + MUL31(a[3], b[1]) + + MUL31(a[4], b[0]); + t[ 5] = MUL31(a[0], b[5]) + + MUL31(a[1], b[4]) + + MUL31(a[2], b[3]) + + MUL31(a[3], b[2]) + + MUL31(a[4], b[1]) + + MUL31(a[5], b[0]); + t[ 6] = MUL31(a[0], b[6]) + + MUL31(a[1], b[5]) + + MUL31(a[2], b[4]) + + MUL31(a[3], b[3]) + + MUL31(a[4], b[2]) + + MUL31(a[5], b[1]) + + MUL31(a[6], b[0]); + t[ 7] = MUL31(a[0], b[7]) + + MUL31(a[1], b[6]) + + MUL31(a[2], b[5]) + + MUL31(a[3], b[4]) + + MUL31(a[4], b[3]) + + MUL31(a[5], b[2]) + + MUL31(a[6], b[1]) + + MUL31(a[7], b[0]); + t[ 8] = MUL31(a[0], b[8]) + + MUL31(a[1], b[7]) + + MUL31(a[2], b[6]) + + MUL31(a[3], b[5]) + + MUL31(a[4], b[4]) + + MUL31(a[5], b[3]) + + MUL31(a[6], b[2]) + + MUL31(a[7], b[1]) + + MUL31(a[8], b[0]); + t[ 9] = MUL31(a[1], b[8]) + + MUL31(a[2], b[7]) + + MUL31(a[3], b[6]) + + MUL31(a[4], b[5]) + + MUL31(a[5], b[4]) + + MUL31(a[6], b[3]) + + MUL31(a[7], b[2]) + + MUL31(a[8], b[1]); + t[10] = MUL31(a[2], b[8]) + + MUL31(a[3], b[7]) + + MUL31(a[4], b[6]) + + MUL31(a[5], b[5]) + + MUL31(a[6], b[4]) + + MUL31(a[7], b[3]) + + MUL31(a[8], b[2]); + t[11] = MUL31(a[3], b[8]) + + MUL31(a[4], b[7]) + + MUL31(a[5], b[6]) + + MUL31(a[6], b[5]) + + MUL31(a[7], b[4]) + + MUL31(a[8], b[3]); + t[12] = MUL31(a[4], b[8]) + + MUL31(a[5], b[7]) + + MUL31(a[6], b[6]) + + MUL31(a[7], b[5]) + + MUL31(a[8], b[4]); + t[13] = MUL31(a[5], b[8]) + + MUL31(a[6], b[7]) + + MUL31(a[7], b[6]) + + MUL31(a[8], b[5]); + t[14] = MUL31(a[6], b[8]) + + MUL31(a[7], b[7]) + + MUL31(a[8], b[6]); + t[15] = MUL31(a[7], b[8]) + + MUL31(a[8], b[7]); + t[16] = MUL31(a[8], b[8]); + + /* + * Propagate carries. 
+ */ + cc = 0; + for (i = 0; i < 17; i ++) { + uint64_t w; + + w = t[i] + cc; + d[i] = (uint32_t)w & 0x3FFFFFFF; + cc = w >> 30; + } + d[17] = (uint32_t)cc; +} + +/* + * Square a 270-bit integer, represented as an array of nine 30-bit words. + * Result uses 18 words of 30 bits each. + */ +static void +square9(uint32_t *d, const uint32_t *a) +{ + uint64_t t[17]; + uint64_t cc; + int i; + + t[ 0] = MUL31(a[0], a[0]); + t[ 1] = ((MUL31(a[0], a[1])) << 1); + t[ 2] = MUL31(a[1], a[1]) + + ((MUL31(a[0], a[2])) << 1); + t[ 3] = ((MUL31(a[0], a[3]) + + MUL31(a[1], a[2])) << 1); + t[ 4] = MUL31(a[2], a[2]) + + ((MUL31(a[0], a[4]) + + MUL31(a[1], a[3])) << 1); + t[ 5] = ((MUL31(a[0], a[5]) + + MUL31(a[1], a[4]) + + MUL31(a[2], a[3])) << 1); + t[ 6] = MUL31(a[3], a[3]) + + ((MUL31(a[0], a[6]) + + MUL31(a[1], a[5]) + + MUL31(a[2], a[4])) << 1); + t[ 7] = ((MUL31(a[0], a[7]) + + MUL31(a[1], a[6]) + + MUL31(a[2], a[5]) + + MUL31(a[3], a[4])) << 1); + t[ 8] = MUL31(a[4], a[4]) + + ((MUL31(a[0], a[8]) + + MUL31(a[1], a[7]) + + MUL31(a[2], a[6]) + + MUL31(a[3], a[5])) << 1); + t[ 9] = ((MUL31(a[1], a[8]) + + MUL31(a[2], a[7]) + + MUL31(a[3], a[6]) + + MUL31(a[4], a[5])) << 1); + t[10] = MUL31(a[5], a[5]) + + ((MUL31(a[2], a[8]) + + MUL31(a[3], a[7]) + + MUL31(a[4], a[6])) << 1); + t[11] = ((MUL31(a[3], a[8]) + + MUL31(a[4], a[7]) + + MUL31(a[5], a[6])) << 1); + t[12] = MUL31(a[6], a[6]) + + ((MUL31(a[4], a[8]) + + MUL31(a[5], a[7])) << 1); + t[13] = ((MUL31(a[5], a[8]) + + MUL31(a[6], a[7])) << 1); + t[14] = MUL31(a[7], a[7]) + + ((MUL31(a[6], a[8])) << 1); + t[15] = ((MUL31(a[7], a[8])) << 1); + t[16] = MUL31(a[8], a[8]); + + /* + * Propagate carries. + */ + cc = 0; + for (i = 0; i < 17; i ++) { + uint64_t w; + + w = t[i] + cc; + d[i] = (uint32_t)w & 0x3FFFFFFF; + cc = w >> 30; + } + d[17] = (uint32_t)cc; +} + +/* + * Perform a "final reduction" in field F255 (field for Curve25519) + * The source value must be less than twice the modulus. 
If the value + * is not lower than the modulus, then the modulus is subtracted and + * this function returns 1; otherwise, it leaves it untouched and it + * returns 0. + */ +static uint32_t +reduce_final_f255(uint32_t *d) +{ + uint32_t t[9]; + uint32_t cc; + int i; + + memcpy(t, d, sizeof t); + cc = 19; + for (i = 0; i < 9; i ++) { + uint32_t w; + + w = t[i] + cc; + cc = w >> 30; + t[i] = w & 0x3FFFFFFF; + } + cc = t[8] >> 15; + t[8] &= 0x7FFF; + CCOPY(cc, d, t, sizeof t); + return cc; +} + +/* + * Perform a multiplication of two integers modulo 2^255-19. + * Operands are arrays of 9 words, each containing 30 bits of data, in + * little-endian order. Input value may be up to 2^256-1; on output, value + * fits on 256 bits and is lower than twice the modulus. + */ +static void +f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + uint32_t t[18], cc; + int i; + + /* + * Compute raw multiplication. All result words fit in 30 bits + * each; upper word (t[17]) must fit on 2 bits, since the product + * of two 256-bit integers must fit on 512 bits. + */ + mul9(t, a, b); + + /* + * Modular reduction: each high word is added where necessary. + * Since the modulus is 2^255-19 and word 9 corresponds to + * offset 9*30 = 270, word 9+k must be added to word k with + * a factor of 19*2^15 = 622592. The extra bits in word 8 are also + * added that way. + * + * Keeping the carry on 32 bits helps with 32-bit architectures, + * and does not noticeably impact performance on 64-bit systems. + */ + cc = MUL15(t[8] >> 15, 19); /* at most 19*(2^15-1) = 622573 */ + t[8] &= 0x7FFF; + for (i = 0; i < 9; i ++) { + uint64_t w; + + w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592); + t[i] = (uint32_t)w & 0x3FFFFFFF; + cc = (uint32_t)(w >> 30); /* at most 622592 */ + } + + /* + * Original product was up to (2^256-1)^2, i.e. a 512-bit integer. 
+ * This was split into two parts (upper of 257 bits, lower of 255 + * bits), and the upper was added to the lower with a factor 19, + * which means that the intermediate value is less than 77*2^255 + * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are + * less than 77, and the initial carry cc is at most 76*19 = 1444. + */ + cc = MUL15(t[8] >> 15, 19); + t[8] &= 0x7FFF; + for (i = 0; i < 9; i ++) { + uint32_t z; + + z = t[i] + cc; + d[i] = z & 0x3FFFFFFF; + cc = z >> 30; + } + + /* + * Final result is at most 2^255 + 1443. In particular, the last + * carry is necessarily 0, since t[8] was truncated to 15 bits. + */ +} + +/* + * Perform a squaring of an integer modulo 2^255-19. + * Operands are arrays of 9 words, each containing 30 bits of data, in + * little-endian order. Input value may be up to 2^256-1; on output, value + * fits on 256 bits and is lower than twice the modulus. + */ +static void +f255_square(uint32_t *d, const uint32_t *a) +{ + uint32_t t[18], cc; + int i; + + /* + * Compute raw squaring. All result words fit in 30 bits + * each; upper word (t[17]) must fit on 2 bits, since the square + * of a 256-bit integers must fit on 512 bits. + */ + square9(t, a); + + /* + * Modular reduction: each high word is added where necessary. + * See f255_mul() for details on the reduction and carry limits. + */ + cc = MUL15(t[8] >> 15, 19); + t[8] &= 0x7FFF; + for (i = 0; i < 9; i ++) { + uint64_t w; + + w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592); + t[i] = (uint32_t)w & 0x3FFFFFFF; + cc = (uint32_t)(w >> 30); + } + cc = MUL15(t[8] >> 15, 19); + t[8] &= 0x7FFF; + for (i = 0; i < 9; i ++) { + uint32_t z; + + z = t[i] + cc; + d[i] = z & 0x3FFFFFFF; + cc = z >> 30; + } +} + +/* + * Add two values in F255. Partial reduction is performed (down to less + * than twice the modulus). 
+ */ +static void +f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + /* + * Since operand words fit on 30 bits, we can use 32-bit + * variables throughout. + */ + int i; + uint32_t cc, w; + + cc = 0; + for (i = 0; i < 9; i ++) { + w = a[i] + b[i] + cc; + d[i] = w & 0x3FFFFFFF; + cc = w >> 30; + } + cc = MUL15(w >> 15, 19); + d[8] &= 0x7FFF; + for (i = 0; i < 9; i ++) { + w = d[i] + cc; + d[i] = w & 0x3FFFFFFF; + cc = w >> 30; + } +} + +/* + * Subtract one value from another in F255. Partial reduction is + * performed (down to less than twice the modulus). + */ +static void +f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + /* + * We actually compute a - b + 2*p, so that the final value is + * necessarily positive. + */ + int i; + uint32_t cc, w; + + cc = (uint32_t)-38; + for (i = 0; i < 9; i ++) { + w = a[i] - b[i] + cc; + d[i] = w & 0x3FFFFFFF; + cc = ARSH(w, 30); + } + cc = MUL15((w + 0x10000) >> 15, 19); + d[8] &= 0x7FFF; + for (i = 0; i < 9; i ++) { + w = d[i] + cc; + d[i] = w & 0x3FFFFFFF; + cc = w >> 30; + } +} + +/* + * Multiply an integer by the 'A24' constant (121665). Partial reduction + * is performed (down to less than twice the modulus). + */ +static void +f255_mul_a24(uint32_t *d, const uint32_t *a) +{ + int i; + uint64_t w; + uint32_t cc; + + /* + * a[] is over 256 bits, thus a[8] has length at most 16 bits. + * We single out the processing of the last word: intermediate + * value w is up to 121665*2^16, yielding a carry for the next + * loop of at most 19*(121665*2^16/2^15) = 4623289. 
+ */ + cc = 0; + for (i = 0; i < 8; i ++) { + w = MUL31(a[i], 121665) + (uint64_t)cc; + d[i] = (uint32_t)w & 0x3FFFFFFF; + cc = (uint32_t)(w >> 30); + } + w = MUL31(a[8], 121665) + (uint64_t)cc; + d[8] = (uint32_t)w & 0x7FFF; + cc = MUL15((uint32_t)(w >> 15), 19); + + for (i = 0; i < 9; i ++) { + uint32_t z; + + z = d[i] + cc; + d[i] = z & 0x3FFFFFFF; + cc = z >> 30; + } +} + +static const unsigned char GEN[] = { + 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static const unsigned char ORDER[] = { + 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF +}; + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return GEN; +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return ORDER; +} + +static size_t +api_xoff(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return 0; +} + +static void +cswap(uint32_t *a, uint32_t *b, uint32_t ctl) +{ + int i; + + ctl = -ctl; + for (i = 0; i < 9; i ++) { + uint32_t aw, bw, tw; + + aw = a[i]; + bw = b[i]; + tw = ctl & (aw ^ bw); + a[i] = aw ^ tw; + b[i] = bw ^ tw; + } +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *kb, size_t kblen, int curve) +{ + uint32_t x1[9], x2[9], x3[9], z2[9], z3[9]; + uint32_t a[9], aa[9], b[9], bb[9]; + uint32_t c[9], d[9], e[9], da[9], cb[9]; + unsigned char k[32]; + uint32_t swap; + int i; + + (void)curve; + + /* + * Points are encoded over exactly 32 bytes. Multipliers must fit + * in 32 bytes as well. + * RFC 7748 mandates that the high bit of the last point byte must + * be ignored/cleared. 
+ */ + if (Glen != 32 || kblen > 32) { + return 0; + } + G[31] &= 0x7F; + + /* + * Initialise variables x1, x2, z2, x3 and z3. We set all of them + * into Montgomery representation. + */ + x1[8] = le8_to_le30(x1, G, 32); + memcpy(x3, x1, sizeof x1); + memset(z2, 0, sizeof z2); + memset(x2, 0, sizeof x2); + x2[0] = 1; + memset(z3, 0, sizeof z3); + z3[0] = 1; + + memset(k, 0, (sizeof k) - kblen); + memcpy(k + (sizeof k) - kblen, kb, kblen); + k[31] &= 0xF8; + k[0] &= 0x7F; + k[0] |= 0x40; + + /* obsolete + print_int("x1", x1); + */ + + swap = 0; + for (i = 254; i >= 0; i --) { + uint32_t kt; + + kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; + swap ^= kt; + cswap(x2, x3, swap); + cswap(z2, z3, swap); + swap = kt; + + /* obsolete + print_int("x2", x2); + print_int("z2", z2); + print_int("x3", x3); + print_int("z3", z3); + */ + + f255_add(a, x2, z2); + f255_square(aa, a); + f255_sub(b, x2, z2); + f255_square(bb, b); + f255_sub(e, aa, bb); + f255_add(c, x3, z3); + f255_sub(d, x3, z3); + f255_mul(da, d, a); + f255_mul(cb, c, b); + + /* obsolete + print_int("a ", a); + print_int("aa", aa); + print_int("b ", b); + print_int("bb", bb); + print_int("e ", e); + print_int("c ", c); + print_int("d ", d); + print_int("da", da); + print_int("cb", cb); + */ + + f255_add(x3, da, cb); + f255_square(x3, x3); + f255_sub(z3, da, cb); + f255_square(z3, z3); + f255_mul(z3, z3, x1); + f255_mul(x2, aa, bb); + f255_mul_a24(z2, e); + f255_add(z2, z2, aa); + f255_mul(z2, e, z2); + + /* obsolete + print_int("x2", x2); + print_int("z2", z2); + print_int("x3", x3); + print_int("z3", z3); + */ + } + cswap(x2, x3, swap); + cswap(z2, z3, swap); + + /* + * Inverse z2 with a modular exponentiation. This is a simple + * square-and-multiply algorithm; we mutualise most non-squarings + * since the exponent contains almost only ones. 
+ */ + memcpy(a, z2, sizeof z2); + for (i = 0; i < 15; i ++) { + f255_square(a, a); + f255_mul(a, a, z2); + } + memcpy(b, a, sizeof a); + for (i = 0; i < 14; i ++) { + int j; + + for (j = 0; j < 16; j ++) { + f255_square(b, b); + } + f255_mul(b, b, a); + } + for (i = 14; i >= 0; i --) { + f255_square(b, b); + if ((0xFFEB >> i) & 1) { + f255_mul(b, z2, b); + } + } + f255_mul(x2, x2, b); + reduce_final_f255(x2); + le30_to_le8(G, 32, x2); + return 1; +} + +static size_t +api_mulgen(unsigned char *R, + const unsigned char *x, size_t xlen, int curve) +{ + const unsigned char *G; + size_t Glen; + + G = api_generator(curve, &Glen); + memcpy(R, G, Glen); + api_mul(R, Glen, x, xlen, curve); + return Glen; +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + /* + * We don't implement this method, since it is used for ECDSA + * only, and there is no ECDSA over Curve25519 (which instead + * uses EdDSA). 
+ */ + (void)A; + (void)B; + (void)len; + (void)x; + (void)xlen; + (void)y; + (void)ylen; + (void)curve; + return 0; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_c25519_m31 = { + (uint32_t)0x20000000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_m62.c b/test/monniaux/BearSSL/src/ec/ec_c25519_m62.c new file mode 100644 index 00000000..6b058eb1 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_c25519_m62.c @@ -0,0 +1,605 @@ +/* + * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +#if BR_INT128 || BR_UMUL128 + +#if BR_UMUL128 +#include <intrin.h> +#endif + +static const unsigned char GEN[] = { + 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static const unsigned char ORDER[] = { + 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF +}; + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return GEN; +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return ORDER; +} + +static size_t +api_xoff(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return 0; +} + +/* + * A field element is encoded as five 64-bit integers, in basis 2^51. + * Limbs may be occasionally larger than 2^51, to save on carry + * propagation costs. + */ + +#define MASK51 (((uint64_t)1 << 51) - (uint64_t)1) + +/* + * Swap two field elements, conditionally on a flag. + */ +static inline void +f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl) +{ + uint64_t m, w; + + m = -(uint64_t)ctl; + w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w; + w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w; + w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w; + w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w; + w = m & (a[4] ^ b[4]); a[4] ^= w; b[4] ^= w; +} + +/* + * Addition with no carry propagation. Limbs double in size. + */ +static inline void +f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b) +{ + d[0] = a[0] + b[0]; + d[1] = a[1] + b[1]; + d[2] = a[2] + b[2]; + d[3] = a[3] + b[3]; + d[4] = a[4] + b[4]; +} + +/* + * Subtraction. + * On input, limbs must fit on 60 bits each. 
On output, result is + * partially reduced, with max value 2^255+19456; moreover, all + * limbs will fit on 51 bits, except the low limb, which may have + * value up to 2^51+19455. + */ +static inline void +f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b) +{ + uint64_t cc, w; + + /* + * We compute d = (2^255-19)*1024 + a - b. Since the limbs + * fit on 60 bits, the maximum value of operands are slightly + * more than 2^264, but much less than 2^265-19456. This + * ensures that the result is positive. + */ + + /* + * Initial carry is 19456, since we add 2^265-19456. Each + * individual subtraction may yield a carry up to 513. + */ + w = a[0] - b[0] - 19456; + d[0] = w & MASK51; + cc = -(w >> 51) & 0x3FF; + w = a[1] - b[1] - cc; + d[1] = w & MASK51; + cc = -(w >> 51) & 0x3FF; + w = a[2] - b[2] - cc; + d[2] = w & MASK51; + cc = -(w >> 51) & 0x3FF; + w = a[3] - b[3] - cc; + d[3] = w & MASK51; + cc = -(w >> 51) & 0x3FF; + d[4] = ((uint64_t)1 << 61) + a[4] - b[4] - cc; + + /* + * Partial reduction. The intermediate result may be up to + * slightly above 2^265, but less than 2^265+2^255. When we + * truncate to 255 bits, the upper bits will be at most 1024. + */ + d[0] += 19 * (d[4] >> 51); + d[4] &= MASK51; +} + +/* + * UMUL51(hi, lo, x, y) computes: + * + * hi = floor((x * y) / (2^51)) + * lo = x * y mod 2^51 + * + * Note that lo < 2^51, but "hi" may be larger, if the input operands are + * larger. + */ +#if BR_INT128 + +#define UMUL51(hi, lo, x, y) do { \ + unsigned __int128 umul_tmp; \ + umul_tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \ + (hi) = (uint64_t)(umul_tmp >> 51); \ + (lo) = (uint64_t)umul_tmp & MASK51; \ + } while (0) + +#elif BR_UMUL128 + +#define UMUL51(hi, lo, x, y) do { \ + uint64_t umul_hi, umul_lo; \ + umul_lo = _umul128((x), (y), &umul_hi); \ + (hi) = (umul_hi << 13) | (umul_lo >> 51); \ + (lo) = umul_lo & MASK51; \ + } while (0) + +#endif + +/* + * Multiplication. + * On input, limbs must fit on 54 bits each. 
+ * On output, limb 0 is at most 2^51 + 155647, and other limbs fit + * on 51 bits each. + */ +static inline void +f255_mul(uint64_t *d, uint64_t *a, uint64_t *b) +{ + uint64_t t[10], hi, lo, w, cc; + + /* + * Perform cross products, accumulating values without carry + * propagation. + * + * Since input limbs fit on 54 bits each, each individual + * UMUL51 will produce a "hi" of less than 2^57. The maximum + * sum will be at most 5*(2^57-1) + 4*(2^51-1) (for t[5]), + * i.e. less than 324*2^51. + */ + + UMUL51(t[1], t[0], a[0], b[0]); + + UMUL51(t[2], lo, a[1], b[0]); t[1] += lo; + UMUL51(hi, lo, a[0], b[1]); t[1] += lo; t[2] += hi; + + UMUL51(t[3], lo, a[2], b[0]); t[2] += lo; + UMUL51(hi, lo, a[1], b[1]); t[2] += lo; t[3] += hi; + UMUL51(hi, lo, a[0], b[2]); t[2] += lo; t[3] += hi; + + UMUL51(t[4], lo, a[3], b[0]); t[3] += lo; + UMUL51(hi, lo, a[2], b[1]); t[3] += lo; t[4] += hi; + UMUL51(hi, lo, a[1], b[2]); t[3] += lo; t[4] += hi; + UMUL51(hi, lo, a[0], b[3]); t[3] += lo; t[4] += hi; + + UMUL51(t[5], lo, a[4], b[0]); t[4] += lo; + UMUL51(hi, lo, a[3], b[1]); t[4] += lo; t[5] += hi; + UMUL51(hi, lo, a[2], b[2]); t[4] += lo; t[5] += hi; + UMUL51(hi, lo, a[1], b[3]); t[4] += lo; t[5] += hi; + UMUL51(hi, lo, a[0], b[4]); t[4] += lo; t[5] += hi; + + UMUL51(t[6], lo, a[4], b[1]); t[5] += lo; + UMUL51(hi, lo, a[3], b[2]); t[5] += lo; t[6] += hi; + UMUL51(hi, lo, a[2], b[3]); t[5] += lo; t[6] += hi; + UMUL51(hi, lo, a[1], b[4]); t[5] += lo; t[6] += hi; + + UMUL51(t[7], lo, a[4], b[2]); t[6] += lo; + UMUL51(hi, lo, a[3], b[3]); t[6] += lo; t[7] += hi; + UMUL51(hi, lo, a[2], b[4]); t[6] += lo; t[7] += hi; + + UMUL51(t[8], lo, a[4], b[3]); t[7] += lo; + UMUL51(hi, lo, a[3], b[4]); t[7] += lo; t[8] += hi; + + UMUL51(t[9], lo, a[4], b[4]); t[8] += lo; + + /* + * The upper words t[5]..t[9] are folded back into the lower + * words, using the rule that 2^255 = 19 in the field. 
+ * + * Since each t[i] is less than 324*2^51, the additions below + * will yield less than 6480*2^51 in each limb; this fits in + * 64 bits (6480*2^51 < 8192*2^51 = 2^64), hence there is + * no overflow. + */ + t[0] += 19 * t[5]; + t[1] += 19 * t[6]; + t[2] += 19 * t[7]; + t[3] += 19 * t[8]; + t[4] += 19 * t[9]; + + /* + * Propagate carries. + */ + w = t[0]; + d[0] = w & MASK51; + cc = w >> 51; + w = t[1] + cc; + d[1] = w & MASK51; + cc = w >> 51; + w = t[2] + cc; + d[2] = w & MASK51; + cc = w >> 51; + w = t[3] + cc; + d[3] = w & MASK51; + cc = w >> 51; + w = t[4] + cc; + d[4] = w & MASK51; + cc = w >> 51; + + /* + * Since the limbs were 64-bit values, the top carry is at + * most 8192 (in practice, that cannot be reached). We simply + * performed a partial reduction. + */ + d[0] += 19 * cc; +} + +/* + * Multiplication by A24 = 121665. + * Input must have limbs of 60 bits at most. + */ +static inline void +f255_mul_a24(uint64_t *d, const uint64_t *a) +{ + uint64_t t[5], cc, w; + + /* + * 121665 = 15 * 8111. We first multiply by 15, with carry + * propagation and partial reduction. + */ + w = a[0] * 15; + t[0] = w & MASK51; + cc = w >> 51; + w = a[1] * 15 + cc; + t[1] = w & MASK51; + cc = w >> 51; + w = a[2] * 15 + cc; + t[2] = w & MASK51; + cc = w >> 51; + w = a[3] * 15 + cc; + t[3] = w & MASK51; + cc = w >> 51; + w = a[4] * 15 + cc; + t[4] = w & MASK51; + t[0] += 19 * (w >> 51); + + /* + * Then multiplication by 8111. At that point, we known that + * t[0] is less than 2^51 + 19*8192, and other limbs are less + * than 2^51; thus, there will be no overflow. + */ + w = t[0] * 8111; + d[0] = w & MASK51; + cc = w >> 51; + w = t[1] * 8111 + cc; + d[1] = w & MASK51; + cc = w >> 51; + w = t[2] * 8111 + cc; + d[2] = w & MASK51; + cc = w >> 51; + w = t[3] * 8111 + cc; + d[3] = w & MASK51; + cc = w >> 51; + w = t[4] * 8111 + cc; + d[4] = w & MASK51; + d[0] += 19 * (w >> 51); +} + +/* + * Finalize reduction. 
+ * On input, limbs must fit on 51 bits, except possibly the low limb, + * which may be slightly above 2^51. + */ +static inline void +f255_final_reduce(uint64_t *a) +{ + uint64_t t[5], cc, w; + + /* + * We add 19. If the result (in t[]) is below 2^255, then a[] + * is already less than 2^255-19, thus already reduced. + * Otherwise, we subtract 2^255 from t[], in which case we + * have t = a - (2^255-19), and that's our result. + */ + w = a[0] + 19; + t[0] = w & MASK51; + cc = w >> 51; + w = a[1] + cc; + t[1] = w & MASK51; + cc = w >> 51; + w = a[2] + cc; + t[2] = w & MASK51; + cc = w >> 51; + w = a[3] + cc; + t[3] = w & MASK51; + cc = w >> 51; + w = a[4] + cc; + t[4] = w & MASK51; + cc = w >> 51; + + /* + * The bit 255 of t is in cc. If that bit is 0, when a[] must + * be unchanged; otherwise, it must be replaced with t[]. + */ + cc = -cc; + a[0] ^= cc & (a[0] ^ t[0]); + a[1] ^= cc & (a[1] ^ t[1]); + a[2] ^= cc & (a[2] ^ t[2]); + a[3] ^= cc & (a[3] ^ t[3]); + a[4] ^= cc & (a[4] ^ t[4]); +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *kb, size_t kblen, int curve) +{ + unsigned char k[32]; + uint64_t x1[5], x2[5], z2[5], x3[5], z3[5]; + uint32_t swap; + int i; + + (void)curve; + + /* + * Points are encoded over exactly 32 bytes. Multipliers must fit + * in 32 bytes as well. + */ + if (Glen != 32 || kblen > 32) { + return 0; + } + + /* + * RFC 7748 mandates that the high bit of the last point byte must + * be ignored/cleared; the "& MASK51" in the initialization for + * x1[4] clears that bit. + */ + x1[0] = br_dec64le(&G[0]) & MASK51; + x1[1] = (br_dec64le(&G[6]) >> 3) & MASK51; + x1[2] = (br_dec64le(&G[12]) >> 6) & MASK51; + x1[3] = (br_dec64le(&G[19]) >> 1) & MASK51; + x1[4] = (br_dec64le(&G[24]) >> 12) & MASK51; + + /* + * We can use memset() to clear values, because exact-width types + * like uint64_t are guaranteed to have no padding bits or + * trap representations. 
+ */ + memset(x2, 0, sizeof x2); + x2[0] = 1; + memset(z2, 0, sizeof z2); + memcpy(x3, x1, sizeof x1); + memcpy(z3, x2, sizeof x2); + + /* + * The multiplier is provided in big-endian notation, and + * possibly shorter than 32 bytes. + */ + memset(k, 0, (sizeof k) - kblen); + memcpy(k + (sizeof k) - kblen, kb, kblen); + k[31] &= 0xF8; + k[0] &= 0x7F; + k[0] |= 0x40; + + swap = 0; + + for (i = 254; i >= 0; i --) { + uint64_t a[5], aa[5], b[5], bb[5], e[5]; + uint64_t c[5], d[5], da[5], cb[5]; + uint32_t kt; + + kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; + swap ^= kt; + f255_cswap(x2, x3, swap); + f255_cswap(z2, z3, swap); + swap = kt; + + /* + * At that point, limbs of x_2 and z_2 are assumed to fit + * on at most 52 bits each. + * + * Each f255_add() adds one bit to the maximum range of + * the values, but f255_sub() and f255_mul() bring back + * the limbs into 52 bits. All f255_add() outputs are + * used only as inputs for f255_mul(), which ensures + * that limbs remain in the proper range. + */ + + /* A = x_2 + z_2 -- limbs fit on 53 bits each */ + f255_add(a, x2, z2); + + /* AA = A^2 */ + f255_mul(aa, a, a); + + /* B = x_2 - z_2 */ + f255_sub(b, x2, z2); + + /* BB = B^2 */ + f255_mul(bb, b, b); + + /* E = AA - BB */ + f255_sub(e, aa, bb); + + /* C = x_3 + z_3 -- limbs fit on 53 bits each */ + f255_add(c, x3, z3); + + /* D = x_3 - z_3 */ + f255_sub(d, x3, z3); + + /* DA = D * A */ + f255_mul(da, d, a); + + /* CB = C * B */ + f255_mul(cb, c, b); + + /* x_3 = (DA + CB)^2 */ + f255_add(x3, da, cb); + f255_mul(x3, x3, x3); + + /* z_3 = x_1 * (DA - CB)^2 */ + f255_sub(z3, da, cb); + f255_mul(z3, z3, z3); + f255_mul(z3, x1, z3); + + /* x_2 = AA * BB */ + f255_mul(x2, aa, bb); + + /* z_2 = E * (AA + a24 * E) */ + f255_mul_a24(z2, e); + f255_add(z2, aa, z2); + f255_mul(z2, e, z2); + } + + f255_cswap(x2, x3, swap); + f255_cswap(z2, z3, swap); + + /* + * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize + * most non-squarings. 
We use x1 and x3, now useless, as temporaries. + */ + memcpy(x1, z2, sizeof z2); + for (i = 0; i < 15; i ++) { + f255_mul(x1, x1, x1); + f255_mul(x1, x1, z2); + } + memcpy(x3, x1, sizeof x1); + for (i = 0; i < 14; i ++) { + int j; + + for (j = 0; j < 16; j ++) { + f255_mul(x3, x3, x3); + } + f255_mul(x3, x3, x1); + } + for (i = 14; i >= 0; i --) { + f255_mul(x3, x3, x3); + if ((0xFFEB >> i) & 1) { + f255_mul(x3, z2, x3); + } + } + + /* + * Compute x2/z2. We have 1/z2 in x3. + */ + f255_mul(x2, x2, x3); + f255_final_reduce(x2); + + /* + * Encode the final x2 value in little-endian. We first assemble + * the limbs into 64-bit values. + */ + x2[0] |= x2[1] << 51; + x2[1] = (x2[1] >> 13) | (x2[2] << 38); + x2[2] = (x2[2] >> 26) | (x2[3] << 25); + x2[3] = (x2[3] >> 39) | (x2[4] << 12); + br_enc64le(G, x2[0]); + br_enc64le(G + 8, x2[1]); + br_enc64le(G + 16, x2[2]); + br_enc64le(G + 24, x2[3]); + return 1; +} + +static size_t +api_mulgen(unsigned char *R, + const unsigned char *x, size_t xlen, int curve) +{ + const unsigned char *G; + size_t Glen; + + G = api_generator(curve, &Glen); + memcpy(R, G, Glen); + api_mul(R, Glen, x, xlen, curve); + return Glen; +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + /* + * We don't implement this method, since it is used for ECDSA + * only, and there is no ECDSA over Curve25519 (which instead + * uses EdDSA). 
+ */ + (void)A; + (void)B; + (void)len; + (void)x; + (void)xlen; + (void)y; + (void)ylen; + (void)curve; + return 0; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_c25519_m62 = { + (uint32_t)0x20000000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; + +/* see bearssl_ec.h */ +const br_ec_impl * +br_ec_c25519_m62_get(void) +{ + return &br_ec_c25519_m62; +} + +#else + +/* see bearssl_ec.h */ +const br_ec_impl * +br_ec_c25519_m62_get(void) +{ + return 0; +} + +#endif diff --git a/test/monniaux/BearSSL/src/ec/ec_c25519_m64.c b/test/monniaux/BearSSL/src/ec/ec_c25519_m64.c new file mode 100644 index 00000000..7e7f12f7 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_c25519_m64.c @@ -0,0 +1,835 @@ +/* + * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */

#include "inner.h"

/*
 * Everything below is compiled only when one of BR_INT128 / BR_UMUL128
 * is set; the matching #else / #endif is further down in the file.
 */
#if BR_INT128 || BR_UMUL128

#if BR_UMUL128
#include <intrin.h>
#endif

/*
 * Value returned by api_generator(): 32 bytes, first byte 0x09, all
 * others zero.
 */
static const unsigned char GEN[] = {
	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

/*
 * Value returned by api_order(): 32 bytes, 0x7F followed by 31 bytes
 * of 0xFF.
 */
static const unsigned char ORDER[] = {
	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};

/*
 * Return the generator encoding; *len is set to 32. The curve
 * identifier is ignored.
 */
static const unsigned char *
api_generator(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return GEN;
}

/*
 * Return the ORDER array; *len is set to 32. The curve identifier is
 * ignored.
 */
static const unsigned char *
api_order(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return ORDER;
}

/*
 * Report offset 0 and length 32 (through *len) for the X coordinate
 * within an encoded point. The curve identifier is ignored.
 */
static size_t
api_xoff(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return 0;
}

/*
 * A field element is held in four 64-bit words, least significant
 * word first (basis 2^64): carries are propagated with 64-bit shifts
 * and encoded points are read as four 64-bit little-endian words.
 * Operations return partially reduced values, which may range up to
 * 2^255+37, so bit 63 of the top word is normally clear.
 */

/* Mask selecting the low 63 bits of a 64-bit word. */
#define MASK63   (((uint64_t)1 << 63) - (uint64_t)1)

/*
 * Swap two field elements, conditionally on a flag.
 * ctl must be 0 (no swap) or 1 (swap); it is expanded into an all-zeros
 * or all-ones mask so that the same instructions run in both cases.
 */
static inline void
f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
{
	uint64_t m, w;

	m = -(uint64_t)ctl;
	w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
	w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
	w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
	w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
}

/*
 * Addition in the field.
 * d = a + b; d may be the same array as a or b (all inputs are read
 * into locals before d is written).
 */
static inline void
f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, cc;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] + (unsigned __int128)b[0];
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);
	t3 = (uint64_t)z & MASK63;
	cc = (uint64_t)(z >> 63);

	/*
	 * Since operands are at most 2^255+37, the sum is at most
	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
	 *
	 * We use: 2^255 = 19 mod p.
	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
	 * the result is at most 2^255+37.
	 */
	z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, cc;
	unsigned char k;

	k = _addcarry_u64(0, a[0], b[0], &t0);
	k = _addcarry_u64(k, a[1], b[1], &t1);
	k = _addcarry_u64(k, a[2], b[2], &t2);
	k = _addcarry_u64(k, a[3], b[3], &t3);
	/* cc gathers bits 255 and 256 of the sum. */
	cc = (k << 1) + (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Since operands are at most 2^255+37, the sum is at most
	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
	 *
	 * We use: 2^255 = 19 mod p.
	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
	 * the result is at most 2^255+37.
	 */
	k = _addcarry_u64(0, t0, 19 * cc, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}

/*
 * Subtraction in the field: d = a - b.
 * Operands are assumed to be partially reduced (at most 2^255+37, the
 * bound used in the body below); the result is partially reduced as
 * well, at most 2^255+37. d may be the same array as a or b.
 */
static inline void
f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	/*
	 * We compute t = 2^256 - 38 + a - b, which is necessarily
	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
	 * on the two upper bits of t (bits 255 and 256).
	 */

	uint64_t t0, t1, t2, t3, t4, cc;
	unsigned __int128 z;

	/*
	 * After each step, the upper 64 bits of z are all-ones if a
	 * borrow occurred and zero otherwise; negating them yields a
	 * borrow of 1 or 0 for the next word.
	 */
	z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;
	t0 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[1] - (unsigned __int128)b[1]
		- (unsigned __int128)cc;
	t1 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[2] - (unsigned __int128)b[2]
		- (unsigned __int128)cc;
	t2 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[3] - (unsigned __int128)b[3]
		- (unsigned __int128)cc;
	t3 = (uint64_t)z;
	/* t4 is bit 256 of t: the added 2^256 minus the final borrow. */
	t4 = 1 + (uint64_t)(z >> 64);

	/*
	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
	 * This guarantees that the result is at most 2^255+37.
	 */
	cc = (38 & -t4) + (19 & -(t3 >> 63));
	t3 &= MASK63;
	z = (unsigned __int128)t0 + (unsigned __int128)cc;
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	/*
	 * We compute t = 2^256 - 38 + a - b, which is necessarily
	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
	 * on the two upper bits of t (bits 255 and 256).
	 */

	uint64_t t0, t1, t2, t3, t4;
	unsigned char k;

	/* First pass: 2^256 + a - b, with t4 holding bit 256. */
	k = _subborrow_u64(0, a[0], b[0], &t0);
	k = _subborrow_u64(k, a[1], b[1], &t1);
	k = _subborrow_u64(k, a[2], b[2], &t2);
	k = _subborrow_u64(k, a[3], b[3], &t3);
	(void)_subborrow_u64(k, 1, 0, &t4);

	/* Second pass: subtract 38. */
	k = _subborrow_u64(0, t0, 38, &t0);
	k = _subborrow_u64(k, t1, 0, &t1);
	k = _subborrow_u64(k, t2, 0, &t2);
	k = _subborrow_u64(k, t3, 0, &t3);
	(void)_subborrow_u64(k, t4, 0, &t4);

	/*
	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
	 * This guarantees that the result is at most 2^255+37.
	 */
	t4 = (38 & -t4) + (19 & -(t3 >> 63));
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, t4, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}

/*
 * Multiplication in the field: d = a * b.
 * a and b are only read; d is written after all reads, so d may be
 * the same array as a and/or b (squaring in place is used by callers).
 * Operands are assumed to be at most 2^255+37; so is the result.
 */
static inline void
f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
{
#if BR_INT128

	unsigned __int128 z;
	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;

	/*
	 * Compute the product a*b over plain integers.
	 * Each group of statements adds one row a[i]*b[0..3] into the
	 * running 512-bit accumulator t0..t7.
	 */
	z = (unsigned __int128)a[0] * (unsigned __int128)b[0];
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);
	t3 = (uint64_t)z;
	t4 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[1] * (unsigned __int128)b[0]
		+ (unsigned __int128)t1;
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[1]
		+ (unsigned __int128)t2 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[2]
		+ (unsigned __int128)t3 + (z >> 64);
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[3]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	t5 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[2] * (unsigned __int128)b[0]
		+ (unsigned __int128)t2;
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[1]
		+ (unsigned __int128)t3 + (z >> 64);
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[2]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[3]
		+ (unsigned __int128)t5 + (z >> 64);
	t5 = (uint64_t)z;
	t6 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[3] * (unsigned __int128)b[0]
		+ (unsigned __int128)t3;
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[1]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[2]
		+ (unsigned __int128)t5 + (z >> 64);
	t5 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[3]
		+ (unsigned __int128)t6 + (z >> 64);
	t6 = (uint64_t)z;
	t7 = (uint64_t)(z >> 64);

	/*
	 * Modulo p, we have:
	 *
	 *   2^255 = 19
	 *   2^510 = 19*19 = 361
	 *
	 * We split the intermediate t into three parts, in basis
	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
	 * The upper one can only be a single bit (th), since the
	 * multiplication operands are at most 2^255+37 each.
	 */
	th = t7 >> 62;
	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
	t6 = (t6 << 1) | (t5 >> 63);
	t5 = (t5 << 1) | (t4 >> 63);
	t4 = (t4 << 1) | (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Multiply the middle part (t4..t7) by 19. We truncate it to
	 * 255 bits; the extra bits will go along with th.
	 */
	z = (unsigned __int128)t4 * 19;
	t4 = (uint64_t)z;
	z = (unsigned __int128)t5 * 19 + (z >> 64);
	t5 = (uint64_t)z;
	z = (unsigned __int128)t6 * 19 + (z >> 64);
	t6 = (uint64_t)z;
	z = (unsigned __int128)t7 * 19 + (z >> 64);
	t7 = (uint64_t)z & MASK63;

	th = (361 & -th) + (19 * (uint64_t)(z >> 63));

	/*
	 * Add elements together.
	 * At this point:
	 *   t0..t3 fits on 255 bits.
	 *   t4..t7 fits on 255 bits.
	 *   th <= 361 + 342 = 703.
	 */
	z = (unsigned __int128)t0 + (unsigned __int128)t4
		+ (unsigned __int128)th;
	t0 = (uint64_t)z;
	z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);
	t3 = (uint64_t)z & MASK63;
	th = (uint64_t)(z >> 63);

	/*
	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
	 * can only have value 0, 1 or 2. We just add th*19, which
	 * guarantees a result of at most 2^255+37.
	 */
	z = (unsigned __int128)t0 + (19 * th);
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
	uint64_t h0, h1, h2, h3;
	unsigned char k;

	/*
	 * Compute the product a*b over plain integers.
	 * For each row, the low halves of the four products are added
	 * first, then the high halves (h0..h3) one word higher.
	 */
	t0 = _umul128(a[0], b[0], &h0);
	t1 = _umul128(a[0], b[1], &h1);
	k = _addcarry_u64(0, t1, h0, &t1);
	t2 = _umul128(a[0], b[2], &h2);
	k = _addcarry_u64(k, t2, h1, &t2);
	t3 = _umul128(a[0], b[3], &h3);
	k = _addcarry_u64(k, t3, h2, &t3);
	(void)_addcarry_u64(k, h3, 0, &t4);

	k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);
	k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);
	k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);
	t5 = k;
	k = _addcarry_u64(0, t2, h0, &t2);
	k = _addcarry_u64(k, t3, h1, &t3);
	k = _addcarry_u64(k, t4, h2, &t4);
	(void)_addcarry_u64(k, t5, h3, &t5);

	k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);
	k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);
	k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);
	t6 = k;
	k = _addcarry_u64(0, t3, h0, &t3);
	k = _addcarry_u64(k, t4, h1, &t4);
	k = _addcarry_u64(k, t5, h2, &t5);
	(void)_addcarry_u64(k, t6, h3, &t6);

	k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);
	k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);
	k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);
	t7 = k;
	k = _addcarry_u64(0, t4, h0, &t4);
	k = _addcarry_u64(k, t5, h1, &t5);
	k = _addcarry_u64(k, t6, h2, &t6);
	(void)_addcarry_u64(k, t7, h3, &t7);

	/*
	 * Modulo p, we have:
	 *
	 *   2^255 = 19
	 *   2^510 = 19*19 = 361
	 *
	 * We split the intermediate t into three parts, in basis
	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
	 * The upper one can only be a single bit (th), since the
	 * multiplication operands are at most 2^255+37 each.
	 */
	th = t7 >> 62;
	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
	t6 = (t6 << 1) | (t5 >> 63);
	t5 = (t5 << 1) | (t4 >> 63);
	t4 = (t4 << 1) | (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Multiply the middle part (t4..t7) by 19. We truncate it to
	 * 255 bits; the extra bits will go along with th.
	 */
	t4 = _umul128(t4, 19, &h0);
	t5 = _umul128(t5, 19, &h1);
	t6 = _umul128(t6, 19, &h2);
	t7 = _umul128(t7, 19, &h3);
	k = _addcarry_u64(0, t5, h0, &t5);
	k = _addcarry_u64(k, t6, h1, &t6);
	k = _addcarry_u64(k, t7, h2, &t7);
	(void)_addcarry_u64(k, h3, 0, &h3);
	th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));
	t7 &= MASK63;

	/*
	 * Add elements together.
	 * At this point:
	 *   t0..t3 fits on 255 bits.
	 *   t4..t7 fits on 255 bits.
	 *   th <= 361 + 342 = 703.
	 */
	k = _addcarry_u64(0, t0, t4, &t0);
	k = _addcarry_u64(k, t1, t5, &t1);
	k = _addcarry_u64(k, t2, t6, &t2);
	k = _addcarry_u64(k, t3, t7, &t3);
	t4 = k;
	k = _addcarry_u64(0, t0, th, &t0);
	k = _addcarry_u64(k, t1, 0, &t1);
	k = _addcarry_u64(k, t2, 0, &t2);
	k = _addcarry_u64(k, t3, 0, &t3);
	(void)_addcarry_u64(k, t4, 0, &t4);

	th = (t4 << 1) + (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
	 * can only have value 0, 1 or 2. We just add th*19, which
	 * guarantees a result of at most 2^255+37.
	 */
	k = _addcarry_u64(0, t0, 19 * th, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}

/*
 * Multiplication by A24 = 121665.
+ */ +static inline void +f255_mul_a24(uint64_t *d, const uint64_t *a) +{ +#if BR_INT128 + + uint64_t t0, t1, t2, t3; + unsigned __int128 z; + + z = (unsigned __int128)a[0] * 121665; + t0 = (uint64_t)z; + z = (unsigned __int128)a[1] * 121665 + (z >> 64); + t1 = (uint64_t)z; + z = (unsigned __int128)a[2] * 121665 + (z >> 64); + t2 = (uint64_t)z; + z = (unsigned __int128)a[3] * 121665 + (z >> 64); + t3 = (uint64_t)z & MASK63; + + z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63)); + t0 = (uint64_t)z; + z = (unsigned __int128)t1 + (z >> 64); + t1 = (uint64_t)z; + z = (unsigned __int128)t2 + (z >> 64); + t2 = (uint64_t)z; + t3 = t3 + (uint64_t)(z >> 64); + + z = (unsigned __int128)t0 + (19 & -(t3 >> 63)); + d[0] = (uint64_t)z; + z = (unsigned __int128)t1 + (z >> 64); + d[1] = (uint64_t)z; + z = (unsigned __int128)t2 + (z >> 64); + d[2] = (uint64_t)z; + d[3] = (t3 & MASK63) + (uint64_t)(z >> 64); + +#elif BR_UMUL128 + + uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3; + unsigned char k; + + t0 = _umul128(a[0], 121665, &h0); + t1 = _umul128(a[1], 121665, &h1); + k = _addcarry_u64(0, t1, h0, &t1); + t2 = _umul128(a[2], 121665, &h2); + k = _addcarry_u64(k, t2, h1, &t2); + t3 = _umul128(a[3], 121665, &h3); + k = _addcarry_u64(k, t3, h2, &t3); + (void)_addcarry_u64(k, h3, 0, &t4); + + t4 = (t4 << 1) + (t3 >> 63); + t3 &= MASK63; + k = _addcarry_u64(0, t0, 19 * t4, &t0); + k = _addcarry_u64(k, t1, 0, &t1); + k = _addcarry_u64(k, t2, 0, &t2); + (void)_addcarry_u64(k, t3, 0, &t3); + + t4 = 19 & -(t3 >> 63); + t3 &= MASK63; + k = _addcarry_u64(0, t0, t4, &d[0]); + k = _addcarry_u64(k, t1, 0, &d[1]); + k = _addcarry_u64(k, t2, 0, &d[2]); + (void)_addcarry_u64(k, t3, 0, &d[3]); + +#endif +} + +/* + * Finalize reduction. + */ +static inline void +f255_final_reduce(uint64_t *a) +{ +#if BR_INT128 + + uint64_t t0, t1, t2, t3, m; + unsigned __int128 z; + + /* + * We add 19. If the result (in t) is below 2^255, then a[] + * is already less than 2^255-19, thus already reduced. 
+ * Otherwise, we subtract 2^255 from t[], in which case we + * have t = a - (2^255-19), and that's our result. + */ + z = (unsigned __int128)a[0] + 19; + t0 = (uint64_t)z; + z = (unsigned __int128)a[1] + (z >> 64); + t1 = (uint64_t)z; + z = (unsigned __int128)a[2] + (z >> 64); + t2 = (uint64_t)z; + t3 = a[3] + (uint64_t)(z >> 64); + + m = -(t3 >> 63); + t3 &= MASK63; + a[0] ^= m & (a[0] ^ t0); + a[1] ^= m & (a[1] ^ t1); + a[2] ^= m & (a[2] ^ t2); + a[3] ^= m & (a[3] ^ t3); + +#elif BR_UMUL128 + + uint64_t t0, t1, t2, t3, m; + unsigned char k; + + /* + * We add 19. If the result (in t) is below 2^255, then a[] + * is already less than 2^255-19, thus already reduced. + * Otherwise, we subtract 2^255 from t[], in which case we + * have t = a - (2^255-19), and that's our result. + */ + k = _addcarry_u64(0, a[0], 19, &t0); + k = _addcarry_u64(k, a[1], 0, &t1); + k = _addcarry_u64(k, a[2], 0, &t2); + (void)_addcarry_u64(k, a[3], 0, &t3); + + m = -(t3 >> 63); + t3 &= MASK63; + a[0] ^= m & (a[0] ^ t0); + a[1] ^= m & (a[1] ^ t1); + a[2] ^= m & (a[2] ^ t2); + a[3] ^= m & (a[3] ^ t3); + +#endif +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *kb, size_t kblen, int curve) +{ + unsigned char k[32]; + uint64_t x1[4], x2[4], z2[4], x3[4], z3[4]; + uint32_t swap; + int i; + + (void)curve; + + /* + * Points are encoded over exactly 32 bytes. Multipliers must fit + * in 32 bytes as well. + */ + if (Glen != 32 || kblen > 32) { + return 0; + } + + /* + * RFC 7748 mandates that the high bit of the last point byte must + * be ignored/cleared. + */ + x1[0] = br_dec64le(&G[ 0]); + x1[1] = br_dec64le(&G[ 8]); + x1[2] = br_dec64le(&G[16]); + x1[3] = br_dec64le(&G[24]) & MASK63; + + /* + * We can use memset() to clear values, because exact-width types + * like uint64_t are guaranteed to have no padding bits or + * trap representations. 
+ */ + memset(x2, 0, sizeof x2); + x2[0] = 1; + memset(z2, 0, sizeof z2); + memcpy(x3, x1, sizeof x1); + memcpy(z3, x2, sizeof x2); + + /* + * The multiplier is provided in big-endian notation, and + * possibly shorter than 32 bytes. + */ + memset(k, 0, (sizeof k) - kblen); + memcpy(k + (sizeof k) - kblen, kb, kblen); + k[31] &= 0xF8; + k[0] &= 0x7F; + k[0] |= 0x40; + + swap = 0; + + for (i = 254; i >= 0; i --) { + uint64_t a[4], aa[4], b[4], bb[4], e[4]; + uint64_t c[4], d[4], da[4], cb[4]; + uint32_t kt; + + kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; + swap ^= kt; + f255_cswap(x2, x3, swap); + f255_cswap(z2, z3, swap); + swap = kt; + + /* A = x_2 + z_2 */ + f255_add(a, x2, z2); + + /* AA = A^2 */ + f255_mul(aa, a, a); + + /* B = x_2 - z_2 */ + f255_sub(b, x2, z2); + + /* BB = B^2 */ + f255_mul(bb, b, b); + + /* E = AA - BB */ + f255_sub(e, aa, bb); + + /* C = x_3 + z_3 */ + f255_add(c, x3, z3); + + /* D = x_3 - z_3 */ + f255_sub(d, x3, z3); + + /* DA = D * A */ + f255_mul(da, d, a); + + /* CB = C * B */ + f255_mul(cb, c, b); + + /* x_3 = (DA + CB)^2 */ + f255_add(x3, da, cb); + f255_mul(x3, x3, x3); + + /* z_3 = x_1 * (DA - CB)^2 */ + f255_sub(z3, da, cb); + f255_mul(z3, z3, z3); + f255_mul(z3, x1, z3); + + /* x_2 = AA * BB */ + f255_mul(x2, aa, bb); + + /* z_2 = E * (AA + a24 * E) */ + f255_mul_a24(z2, e); + f255_add(z2, aa, z2); + f255_mul(z2, e, z2); + } + + f255_cswap(x2, x3, swap); + f255_cswap(z2, z3, swap); + + /* + * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize + * most non-squarings. We use x1 and x3, now useless, as temporaries. + */ + memcpy(x1, z2, sizeof z2); + for (i = 0; i < 15; i ++) { + f255_mul(x1, x1, x1); + f255_mul(x1, x1, z2); + } + memcpy(x3, x1, sizeof x1); + for (i = 0; i < 14; i ++) { + int j; + + for (j = 0; j < 16; j ++) { + f255_mul(x3, x3, x3); + } + f255_mul(x3, x3, x1); + } + for (i = 14; i >= 0; i --) { + f255_mul(x3, x3, x3); + if ((0xFFEB >> i) & 1) { + f255_mul(x3, z2, x3); + } + } + + /* + * Compute x2/z2. 
We have 1/z2 in x3. + */ + f255_mul(x2, x2, x3); + f255_final_reduce(x2); + + /* + * Encode the final x2 value in little-endian. + */ + br_enc64le(G, x2[0]); + br_enc64le(G + 8, x2[1]); + br_enc64le(G + 16, x2[2]); + br_enc64le(G + 24, x2[3]); + return 1; +} + +static size_t +api_mulgen(unsigned char *R, + const unsigned char *x, size_t xlen, int curve) +{ + const unsigned char *G; + size_t Glen; + + G = api_generator(curve, &Glen); + memcpy(R, G, Glen); + api_mul(R, Glen, x, xlen, curve); + return Glen; +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + /* + * We don't implement this method, since it is used for ECDSA + * only, and there is no ECDSA over Curve25519 (which instead + * uses EdDSA). + */ + (void)A; + (void)B; + (void)len; + (void)x; + (void)xlen; + (void)y; + (void)ylen; + (void)curve; + return 0; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_c25519_m64 = { + (uint32_t)0x20000000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; + +/* see bearssl_ec.h */ +const br_ec_impl * +br_ec_c25519_m64_get(void) +{ + return &br_ec_c25519_m64; +} + +#else + +/* see bearssl_ec.h */ +const br_ec_impl * +br_ec_c25519_m64_get(void) +{ + return 0; +} + +#endif diff --git a/test/monniaux/BearSSL/src/ec/ec_curve25519.c b/test/monniaux/BearSSL/src/ec/ec_curve25519.c new file mode 100644 index 00000000..a47d215e --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_curve25519.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the 
Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

/*
 * Generator entry of the curve definition below: 32 bytes, first byte
 * 0x09, all others zero.
 */
static const unsigned char GEN[] = {
	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

/*
 * Order entry of the curve definition below: 0x7F followed by 31 bytes
 * of 0xFF.
 * NOTE(review): read as a big-endian integer this is 2^255-1, which
 * looks like an upper bound on scalars rather than an exact group
 * order -- confirm against the users of br_ec_curve_def.
 */
static const unsigned char ORDER[] = {
	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};

/* see inner.h */
/* Fields, in order: curve identifier, order and its length, generator and its length. */
const br_ec_curve_def br_curve25519 = {
	BR_EC_curve25519,
	ORDER, sizeof ORDER,
	GEN, sizeof GEN
};
diff --git a/test/monniaux/BearSSL/src/ec/ec_default.c b/test/monniaux/BearSSL/src/ec/ec_default.c
new file mode 100644
index 00000000..7bb6e0c7
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_default.c
@@ -0,0 +1,36 @@
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute,
sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

/* see bearssl_ec.h */
/*
 * Return the default EC implementation, selected at compile time:
 * br_ec_all_m15 when BR_LOMUL is set, br_ec_all_m31 otherwise.
 */
const br_ec_impl *
br_ec_get_default(void)
{
#if BR_LOMUL
	return &br_ec_all_m15;
#else
	return &br_ec_all_m31;
#endif
}
diff --git a/test/monniaux/BearSSL/src/ec/ec_keygen.c b/test/monniaux/BearSSL/src/ec/ec_keygen.c
new file mode 100644
index 00000000..02a30962
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_keygen.c
@@ -0,0 +1,86 @@
/*
 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

/* see bearssl_ec.h */
/*
 * Generate a private key for the given curve.
 *
 *   rng_ctx   random source; its generate() method is called until an
 *             acceptable value is obtained
 *   impl      EC implementation; supplies the supported-curve mask and
 *             the curve order
 *   sk        if not NULL, filled with the curve and a pointer to the
 *             key bytes (which live in kbuf, not in a copy)
 *   kbuf      destination for the key bytes; if NULL, nothing is
 *             generated and only the key length is returned
 *   curve     curve identifier, must be in 0..31 and supported by impl
 *
 * Returns the key length in bytes (the order length without leading
 * zero bytes), or 0 if the curve is out of range or unsupported.
 */
size_t
br_ec_keygen(const br_prng_class **rng_ctx,
	const br_ec_impl *impl, br_ec_private_key *sk,
	void *kbuf, int curve)
{
	const unsigned char *order;
	unsigned char *buf;
	size_t len;
	unsigned mask;

	if (curve < 0 || curve >= 32
		|| ((impl->supported_curves >> curve) & 1) == 0)
	{
		return 0;
	}
	/* Skip leading zero bytes of the order. */
	order = impl->order(curve, &len);
	while (len > 0 && *order == 0) {
		order ++;
		len --;
	}
	if (kbuf == NULL || len == 0) {
		return len;
	}
	/*
	 * Smear the top set bit of the first order byte downwards, to
	 * get a mask covering exactly the bits that byte may use.
	 */
	mask = order[0];
	mask |= (mask >> 1);
	mask |= (mask >> 2);
	mask |= (mask >> 4);

	/*
	 * We generate sequences of random bits of the right size, until
	 * the value is strictly lower than the curve order (we also
	 * check for all-zero values, which are invalid).
	 */
	buf = kbuf;
	for (;;) {
		size_t u;
		unsigned cc, zz;

		(*rng_ctx)->generate(rng_ctx, buf, len);
		buf[0] &= mask;
		/*
		 * Subtract the order from the candidate, from the last
		 * byte to the first; cc ends up as the final borrow
		 * (1 iff candidate < order) and zz is the OR of all
		 * candidate bytes (0 iff the candidate is zero).
		 */
		cc = 0;
		u = len;
		zz = 0;
		while (u -- > 0) {
			cc = ((unsigned)(buf[u] - order[u] - cc) >> 8) & 1;
			zz |= buf[u];
		}
		if (cc != 0 && zz != 0) {
			break;
		}
	}

	if (sk != NULL) {
		sk->curve = curve;
		sk->x = buf;
		sk->xlen = len;
	}
	return len;
}
diff --git a/test/monniaux/BearSSL/src/ec/ec_p256_m15.c b/test/monniaux/BearSSL/src/ec/ec_p256_m15.c
new file mode 100644
index 00000000..8d68d1d2
--- /dev/null
+++ b/test/monniaux/BearSSL/src/ec/ec_p256_m15.c
@@ -0,0 +1,2130 @@
/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

/*
 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
 * that right-shifting a signed negative integer copies the sign bit
 * (arithmetic right-shift).
/*
 * This is "implementation-defined behaviour",
 * i.e. it is not undefined, but it may differ between compilers. Each
 * compiler is supposed to document its behaviour in that respect. GCC
 * explicitly defines that an arithmetic right shift is used. We expect
 * all other compilers to do the same, because underlying CPU offer an
 * arithmetic right shift opcode that could not be used otherwise.
 *
 * ARSH(x, n) shifts the 32-bit value x right by n bits with sign
 * extension. The default variant needs x to be an lvalue.
 */
#if BR_NO_ARITH_SHIFT
#define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
                    | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
#else
#define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
#endif

/*
 * Decode a big-endian unsigned integer (len bytes at src) into 13-bit
 * words, least significant word first, written to dst. Only complete
 * 13-bit words are stored; the leftover top bits (fewer than 13) are
 * returned.
 */
static uint32_t
be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
{
	uint32_t pending = 0;	/* bits read but not yet emitted */
	int pending_bits = 0;	/* how many of them */
	size_t pos;

	/* Walk the source from its last (least significant) byte. */
	for (pos = len; pos > 0; pos --) {
		pending |= (uint32_t)src[pos - 1] << pending_bits;
		pending_bits += 8;
		if (pending_bits < 13) {
			continue;
		}
		*dst = pending & 0x1FFF;
		dst ++;
		pending >>= 13;
		pending_bits -= 13;
	}
	return pending;
}

/*
 * Encode an integer held as 13-bit words (least significant first, at
 * src) into exactly len big-endian bytes at dst. Every destination
 * byte is written; source words are consumed only as needed.
 */
static void
le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
{
	uint32_t pending = 0;	/* bits fetched but not yet written */
	int pending_bits = 0;	/* how many of them */
	size_t pos;

	/* Fill the destination from its last (least significant) byte. */
	for (pos = len; pos > 0; pos --) {
		if (pending_bits < 8) {
			pending |= *src << pending_bits;
			src ++;
			pending_bits += 13;
		}
		dst[pos - 1] = (unsigned char)pending;
		pending >>= 8;
		pending_bits -= 8;
	}
}

/*
 * Normalise an array of words to a strict 13 bits per word.
 */
/*
 * norm13(): bring len words back to strict 13-bit values, propagating
 * carries from the lowest word upwards, and return the carry out of
 * the last word. Each intermediate sum is handled as a signed 32-bit
 * value (through ARSH), so a word holding a two's-complement negative
 * value produces a negative carry. d and w may be the very same array,
 * but must not overlap partially.
 */
static inline uint32_t
norm13(uint32_t *d, const uint32_t *w, size_t len)
{
	uint32_t carry = 0;
	size_t k;

	for (k = 0; k < len; k ++) {
		int32_t sum;

		sum = w[k] + carry;
		carry = ARSH(sum, 13);
		d[k] = sum & 0x1FFF;
	}
	return carry;
}

/*
 * mul20() multiplies two 260-bit integers. Source operands use 20 words
 * each, and each word must fit on 13 bits; the destination receives
 * 40 words. All overlaps between operands are allowed.
 *
 * square20() squares a 260-bit integer, with the same conventions
 * (20-word source, 13 bits per word, 40-word destination, overlaps
 * allowed).
 */

#if BR_SLOW_MUL15

/*
 * Helpers for the Karatsuba variant of mul20(). Arrays are handled as
 * sequences of 5-word "limbs": limb number i of array x covers words
 * x[5*i] to x[5*i+4]. Words are combined without carry propagation.
 */

/* Limb d_off of dw <- (limb s1_off of s1w) + (limb s2_off of s2w). */
static inline void
kara_add(uint32_t *dw, int d_off,
	const uint32_t *s1w, int s1_off, const uint32_t *s2w, int s2_off)
{
	int j;

	for (j = 0; j < 5; j ++) {
		dw[5 * d_off + j] = s1w[5 * s1_off + j]
			+ s2w[5 * s2_off + j];
	}
}

/* Limb d_off of dw += limb s_off of sw. */
static inline void
kara_addto(uint32_t *dw, int d_off, const uint32_t *sw, int s_off)
{
	int j;

	for (j = 0; j < 5; j ++) {
		dw[5 * d_off + j] += sw[5 * s_off + j];
	}
}

/* Limb d_off of dw -= (limb s1_off of s1w) + (limb s2_off of s2w). */
static inline void
kara_sub2(uint32_t *dw, int d_off,
	const uint32_t *s1w, int s1_off, const uint32_t *s2w, int s2_off)
{
	int j;

	for (j = 0; j < 5; j ++) {
		dw[5 * d_off + j] -= s1w[5 * s1_off + j]
			+ s2w[5 * s2_off + j];
	}
}

/*
 * Unsigned carry propagation over x[0..8]: these nine words are reduced
 * to 13 bits each, and the final carry is stored (whole) in x[9].
 */
static inline void
kara_carry10(uint32_t *x)
{
	uint32_t carry = 0;
	int j;

	for (j = 0; j < 9; j ++) {
		uint32_t z;

		z = x[j] + carry;
		x[j] = z & 0x1FFF;
		carry = z >> 13;
	}
	x[9] = carry;
}

/*
 * Schoolbook product of the 5-word operands x and y into r[0..8]
 * (r[9] is not touched); partial products are summed without carry
 * propagation. If defer_mid is non-zero, then the x[4]*y[0] term is
 * left out of r[4]; the caller is then responsible for adding it.
 */
static inline void
kara_mul5(uint32_t *r, const uint32_t *x, const uint32_t *y, int defer_mid)
{
	int i, k;

	for (k = 0; k < 9; k ++) {
		int lo, hi;
		uint32_t acc;

		lo = (k > 4) ? k - 4 : 0;
		hi = (k > 4) ? 4 : k;
		if (defer_mid && k == 4) {
			hi = 3;
		}
		acc = 0;
		for (i = lo; i <= hi; i ++) {
			acc += MUL15(x[i], y[k - i]);
		}
		r[k] = acc;
	}
}

static void
mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
{
	/*
	 * Two-level Karatsuba: the 20x20 multiplication becomes nine
	 * 5x5 multiplications. Words are 13-bit, but carries are not
	 * propagated right away, so words may grow:
	 *
	 *  - The first decomposition yields three 10x10 products, two
	 *    on 13-bit words and one on 14-bit words.
	 *
	 *  - The second decomposition splits these into four 5x5
	 *    products on 13-bit words, four on 14-bit words, and one
	 *    on 15-bit words.
	 *
	 * Highest word value is 8191, 16382 or 32764, for 13-bit,
	 * 14-bit or 15-bit words, respectively.
	 *
	 * Layout: u[] and v[] hold nine limbs each (the four quarters
	 * of the operand, then the five limb sums); w[] holds the nine
	 * 10-word products, product j being at w[10*j].
	 */
	uint32_t u[45], v[45], w[90];
	uint32_t cc;
	int j;

	memcpy(u, a, 20 * sizeof *a);
	kara_add(u, 4, a, 0, a, 1);
	kara_add(u, 5, a, 2, a, 3);
	kara_add(u, 6, a, 0, a, 2);
	kara_add(u, 7, a, 1, a, 3);
	kara_add(u, 8, u, 6, u, 7);

	memcpy(v, b, 20 * sizeof *b);
	kara_add(v, 4, b, 0, b, 1);
	kara_add(v, 5, b, 2, b, 3);
	kara_add(v, 6, b, 0, b, 2);
	kara_add(v, 7, b, 1, b, 3);
	kara_add(v, 8, v, 6, v, 7);

	/*
	 * The first eight 5x5 products. Source words are at most 16382
	 * each, so the partial products can be added together "as is"
	 * in 32-bit words.
	 */
	for (j = 0; j < 8; j ++) {
		kara_mul5(w + 10 * j, u + 5 * j, v + 5 * j, 0);
		w[10 * j + 9] = 0;
	}

	/*
	 * For the ninth product, source words are up to 32764, so some
	 * carry propagation is needed. If we add up to 4 products and
	 * the carry is no more than 524224, then the result fits in
	 * 32 bits, and the next carry is no more than 524224 (because
	 * 4*(32764^2)+524224 < 8192*524225).
	 *
	 * We thus leave one product out of the middle word, propagate
	 * carries (words are then 13-bit each, except possibly the
	 * last one, which may use up to 17 bits or so), and only then
	 * add the missing product.
	 */
	kara_mul5(w + 80, u + 40, v + 40, 1);
	kara_carry10(w + 80);
	w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);

	/*
	 * The products on 14-bit words in slots 6 and 7 yield values
	 * up to 5*(16382^2) each, and two such values must be
	 * subtracted from the higher word. The subtraction has to fit
	 * in a _signed_ 32-bit integer (31 bits + sign), and
	 * 10*(16382^2) does not fit; hence some reduction here.
	 */
	kara_carry10(w + 60);
	kara_carry10(w + 70);

	/*
	 * Recompose results (limb numbers below index w[] by groups of
	 * five words).
	 */

	/* 0..1*0..1 into 0..3 */
	kara_sub2(w, 8, w, 0, w, 2);
	kara_sub2(w, 9, w, 1, w, 3);
	kara_addto(w, 1, w, 8);
	kara_addto(w, 2, w, 9);

	/* 2..3*2..3 into 4..7 */
	kara_sub2(w, 10, w, 4, w, 6);
	kara_sub2(w, 11, w, 5, w, 7);
	kara_addto(w, 5, w, 10);
	kara_addto(w, 6, w, 11);

	/* (0..1+2..3)*(0..1+2..3) into 12..15 */
	kara_sub2(w, 16, w, 12, w, 14);
	kara_sub2(w, 17, w, 13, w, 15);
	kara_addto(w, 13, w, 16);
	kara_addto(w, 14, w, 17);

	/* first-level recomposition */
	kara_sub2(w, 12, w, 0, w, 4);
	kara_sub2(w, 13, w, 1, w, 5);
	kara_sub2(w, 14, w, 2, w, 6);
	kara_sub2(w, 15, w, 3, w, 7);
	kara_addto(w, 2, w, 12);
	kara_addto(w, 3, w, 13);
	kara_addto(w, 4, w, 14);
	kara_addto(w, 5, w, 15);

	/*
	 * Final carry propagation, bringing all words down to 13 bits;
	 * the carry out is folded back into the top word.
	 */
	cc = norm13(d, w, 40);
	d[39] += (cc << 13);
}

/*
 * In this variant, squaring is simply a multiplication of the operand
 * by itself.
 */
static inline void
square20(uint32_t *d, const uint32_t *a)
{
	mul20(d, a, a);
}

#else

/*
 * Schoolbook multiplication: t[k] receives the sum of all products
 * a[i]*b[j] with i+j = k (at most 20 products of 13-bit words per
 * column), then a single carry propagation produces the 40 output
 * words. Loop bounds depend on the column index only. The output is
 * written only once all columns have been computed.
 */
static void
mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
{
	uint32_t t[39];
	int i, k;

	for (k = 0; k < 39; k ++) {
		int lo, hi;
		uint32_t acc;

		lo = (k < 20) ? 0 : k - 19;
		hi = (k < 20) ? k : 19;
		acc = 0;
		for (i = lo; i <= hi; i ++) {
			acc += MUL15(a[i], b[k - i]);
		}
		t[k] = acc;
	}
	d[39] = norm13(d, t, 39);
}

/*
 * Schoolbook squaring: in each column, every cross product a[i]*a[j]
 * (i < j, i+j = k) is computed once and the sum of these is doubled;
 * even columns also receive the square of the middle word. A single
 * carry propagation then produces the 40 output words.
 */
static void
square20(uint32_t *d, const uint32_t *a)
{
	uint32_t t[39];
	int i, k;

	for (k = 0; k < 39; k ++) {
		uint32_t cross, acc;

		cross = 0;
		for (i = (k < 20) ? 0 : k - 19; i < k - i; i ++) {
			cross += MUL15(a[i], a[k - i]);
		}
		acc = cross << 1;
		if ((k & 1) == 0) {
			acc += MUL15(a[k >> 1], a[k >> 1]);
		}
		t[k] = acc;
	}
	d[39] = norm13(d, t, 39);
}

#endif

/*
 * Modulus for field F256 (field for point coordinates in curve P-256),
 * i.e. p = 2^256 - 2^224 + 2^192 + 2^96 - 1, as twenty 13-bit words in
 * little-endian order.
 */
static const uint32_t F256[] = {
	0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF,
	0x1FFF, 0x1FFF, 0x001F, 0x0000, 0x0000,
	0x0000, 0x0000, 0x0000, 0x0000, 0x0400,
	0x0000, 0x0000, 0x1FF8, 0x1FFF, 0x01FF
};

/*
 * The 'b' curve equation coefficient for P-256 (same word format as
 * F256).
 */
static const uint32_t P256_B[] = {
	0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC,
	0x187B, 0x114E, 0x1619, 0x1D06, 0x0328,
	0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF,
	0x127C, 0x0A3A, 0x0EC5, 0x118D, 0x00B5
};

/*
 * "Short reduction" in field F256 (field for curve P-256). The source
 * value should be less than 262 bits; on output, it is at most 257 bits
 * long, and less than twice the modulus.
 *
 * The bits beyond bit 255 are cut off the top word and injected back
 * using 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p: bit 224 is bit 3 of
 * word 17, bit 192 is bit 10 of word 14, and bit 96 is bit 5 of word 7.
 */
static void
reduce_f256(uint32_t *d)
{
	uint32_t hi;

	hi = d[19] >> 9;
	d[19] &= 0x01FF;
	d[0] += hi;
	d[7] -= hi << 5;
	d[14] -= hi << 10;
	d[17] += hi << 3;
	norm13(d, d, 20);
}

/*
 * Perform a "final reduction" in field F256 (field for curve P-256).
 * The source value must be less than twice the modulus. If the value
 * is not lower than the modulus, then the modulus is subtracted and
 * this function returns 1; otherwise, it leaves it untouched and it
 * returns 0.
 */
+ */ +static uint32_t +reduce_final_f256(uint32_t *d) +{ + uint32_t t[20]; + uint32_t cc; + int i; + + memcpy(t, d, sizeof t); + cc = 0; + for (i = 0; i < 20; i ++) { + uint32_t w; + + w = t[i] - F256[i] - cc; + cc = w >> 31; + t[i] = w & 0x1FFF; + } + cc ^= 1; + CCOPY(cc, d, t, sizeof t); + return cc; +} + +/* + * Perform a multiplication of two integers modulo + * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays + * of 20 words, each containing 13 bits of data, in little-endian order. + * On input, upper word may be up to 13 bits (hence value up to 2^260-1); + * on output, value fits on 257 bits and is lower than twice the modulus. + */ +static void +mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + uint32_t t[40], cc; + int i; + + /* + * Compute raw multiplication. All result words fit in 13 bits + * each. + */ + mul20(t, a, b); + + /* + * Modular reduction: each high word in added/subtracted where + * necessary. + * + * The modulus is: + * p = 2^256 - 2^224 + 2^192 + 2^96 - 1 + * Therefore: + * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p + * + * For a word x at bit offset n (n >= 256), we have: + * x*2^n = x*2^(n-32) - x*2^(n-64) + * - x*2^(n - 160) + x*2^(n-256) mod p + * + * Thus, we can nullify the high word if we reinject it at some + * proper emplacements. + */ + for (i = 39; i >= 20; i --) { + uint32_t x; + + x = t[i]; + t[i - 2] += ARSH(x, 6); + t[i - 3] += (x << 7) & 0x1FFF; + t[i - 4] -= ARSH(x, 12); + t[i - 5] -= (x << 1) & 0x1FFF; + t[i - 12] -= ARSH(x, 4); + t[i - 13] -= (x << 9) & 0x1FFF; + t[i - 19] += ARSH(x, 9); + t[i - 20] += (x << 4) & 0x1FFF; + } + + /* + * Propagate carries. This is a signed propagation, and the + * result may be negative. The loop above may enlarge values, + * but not two much: worst case is the chain involving t[i - 3], + * in which a value may be added to itself up to 7 times. Since + * starting values are 13-bit each, all words fit on 20 bits + * (21 to account for the sign bit). 
+ */ + cc = norm13(t, t, 20); + + /* + * Perform modular reduction again for the bits beyond 256 (the carry + * and the bits 256..259). Since the largest shift below is by 10 + * bits, and the values fit on 21 bits, values fit in 32-bit words, + * thereby allowing injecting full word values. + */ + cc = (cc << 4) | (t[19] >> 9); + t[19] &= 0x01FF; + t[17] += cc << 3; + t[14] -= cc << 10; + t[7] -= cc << 5; + t[0] += cc; + + /* + * If the carry is negative, then after carry propagation, we may + * end up with a value which is negative, and we don't want that. + * Thus, in that case, we add the modulus. Note that the subtraction + * result, when the carry is negative, is always smaller than the + * modulus, so the extra addition will not make the value exceed + * twice the modulus. + */ + cc >>= 31; + t[0] -= cc; + t[7] += cc << 5; + t[14] += cc << 10; + t[17] -= cc << 3; + t[19] += cc << 9; + + norm13(d, t, 20); +} + +/* + * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve + * P-256). Operand is an array of 20 words, each containing 13 bits of + * data, in little-endian order. On input, upper word may be up to 13 + * bits (hence value up to 2^260-1); on output, value fits on 257 bits + * and is lower than twice the modulus. + */ +static void +square_f256(uint32_t *d, const uint32_t *a) +{ + uint32_t t[40], cc; + int i; + + /* + * Compute raw square. All result words fit in 13 bits each. + */ + square20(t, a); + + /* + * Modular reduction: each high word in added/subtracted where + * necessary. + * + * The modulus is: + * p = 2^256 - 2^224 + 2^192 + 2^96 - 1 + * Therefore: + * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p + * + * For a word x at bit offset n (n >= 256), we have: + * x*2^n = x*2^(n-32) - x*2^(n-64) + * - x*2^(n - 160) + x*2^(n-256) mod p + * + * Thus, we can nullify the high word if we reinject it at some + * proper emplacements. 
+ */ + for (i = 39; i >= 20; i --) { + uint32_t x; + + x = t[i]; + t[i - 2] += ARSH(x, 6); + t[i - 3] += (x << 7) & 0x1FFF; + t[i - 4] -= ARSH(x, 12); + t[i - 5] -= (x << 1) & 0x1FFF; + t[i - 12] -= ARSH(x, 4); + t[i - 13] -= (x << 9) & 0x1FFF; + t[i - 19] += ARSH(x, 9); + t[i - 20] += (x << 4) & 0x1FFF; + } + + /* + * Propagate carries. This is a signed propagation, and the + * result may be negative. The loop above may enlarge values, + * but not two much: worst case is the chain involving t[i - 3], + * in which a value may be added to itself up to 7 times. Since + * starting values are 13-bit each, all words fit on 20 bits + * (21 to account for the sign bit). + */ + cc = norm13(t, t, 20); + + /* + * Perform modular reduction again for the bits beyond 256 (the carry + * and the bits 256..259). Since the largest shift below is by 10 + * bits, and the values fit on 21 bits, values fit in 32-bit words, + * thereby allowing injecting full word values. + */ + cc = (cc << 4) | (t[19] >> 9); + t[19] &= 0x01FF; + t[17] += cc << 3; + t[14] -= cc << 10; + t[7] -= cc << 5; + t[0] += cc; + + /* + * If the carry is negative, then after carry propagation, we may + * end up with a value which is negative, and we don't want that. + * Thus, in that case, we add the modulus. Note that the subtraction + * result, when the carry is negative, is always smaller than the + * modulus, so the extra addition will not make the value exceed + * twice the modulus. + */ + cc >>= 31; + t[0] -= cc; + t[7] += cc << 5; + t[14] += cc << 10; + t[17] -= cc << 3; + t[19] += cc << 9; + + norm13(d, t, 20); +} + +/* + * Jacobian coordinates for a point in P-256: affine coordinates (X,Y) + * are such that: + * X = x / z^2 + * Y = y / z^3 + * For the point at infinity, z = 0. + * Each point thus admits many possible representations. + * + * Coordinates are represented in arrays of 32-bit integers, each holding + * 13 bits of data. 
Values may also be slightly greater than the modulus, + * but they will always be lower than twice the modulus. + */ +typedef struct { + uint32_t x[20]; + uint32_t y[20]; + uint32_t z[20]; +} p256_jacobian; + +/* + * Convert a point to affine coordinates: + * - If the point is the point at infinity, then all three coordinates + * are set to 0. + * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y' + * coordinates are the 'X' and 'Y' affine coordinates. + * The coordinates are guaranteed to be lower than the modulus. + */ +static void +p256_to_affine(p256_jacobian *P) +{ + uint32_t t1[20], t2[20]; + int i; + + /* + * Invert z with a modular exponentiation: the modulus is + * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is + * p-2. Exponent bit pattern (from high to low) is: + * - 32 bits of value 1 + * - 31 bits of value 0 + * - 1 bit of value 1 + * - 96 bits of value 0 + * - 94 bits of value 1 + * - 1 bit of value 0 + * - 1 bit of value 1 + * Thus, we precompute z^(2^31-1) to speed things up. + * + * If z = 0 (point at infinity) then the modular exponentiation + * will yield 0, which leads to the expected result (all three + * coordinates set to 0). + */ + + /* + * A simple square-and-multiply for z^(2^31-1). We could save about + * two dozen multiplications here with an addition chain, but + * this would require a bit more code, and extra stack buffers. + */ + memcpy(t1, P->z, sizeof P->z); + for (i = 0; i < 30; i ++) { + square_f256(t1, t1); + mul_f256(t1, t1, P->z); + } + + /* + * Square-and-multiply. Apart from the squarings, we have a few + * multiplications to set bits to 1; we multiply by the original z + * for setting 1 bit, and by t1 for setting 31 bits. 
+ */ + memcpy(t2, P->z, sizeof P->z); + for (i = 1; i < 256; i ++) { + square_f256(t2, t2); + switch (i) { + case 31: + case 190: + case 221: + case 252: + mul_f256(t2, t2, t1); + break; + case 63: + case 253: + case 255: + mul_f256(t2, t2, P->z); + break; + } + } + + /* + * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3. + */ + mul_f256(t1, t2, t2); + mul_f256(P->x, t1, P->x); + mul_f256(t1, t1, t2); + mul_f256(P->y, t1, P->y); + reduce_final_f256(P->x); + reduce_final_f256(P->y); + + /* + * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise + * this will set z to 1. + */ + mul_f256(P->z, P->z, t2); + reduce_final_f256(P->z); +} + +/* + * Double a point in P-256. This function works for all valid points, + * including the point at infinity. + */ +static void +p256_double(p256_jacobian *Q) +{ + /* + * Doubling formulas are: + * + * s = 4*x*y^2 + * m = 3*(x + z^2)*(x - z^2) + * x' = m^2 - 2*s + * y' = m*(s - x') - 8*y^4 + * z' = 2*y*z + * + * These formulas work for all points, including points of order 2 + * and points at infinity: + * - If y = 0 then z' = 0. But there is no such point in P-256 + * anyway. + * - If z = 0 then z' = 0. + */ + uint32_t t1[20], t2[20], t3[20], t4[20]; + int i; + + /* + * Compute z^2 in t1. + */ + square_f256(t1, Q->z); + + /* + * Compute x-z^2 in t2 and x+z^2 in t1. + */ + for (i = 0; i < 20; i ++) { + t2[i] = (F256[i] << 1) + Q->x[i] - t1[i]; + t1[i] += Q->x[i]; + } + norm13(t1, t1, 20); + norm13(t2, t2, 20); + + /* + * Compute 3*(x+z^2)*(x-z^2) in t1. + */ + mul_f256(t3, t1, t2); + for (i = 0; i < 20; i ++) { + t1[i] = MUL15(3, t3[i]); + } + norm13(t1, t1, 20); + + /* + * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3). + */ + square_f256(t3, Q->y); + for (i = 0; i < 20; i ++) { + t3[i] <<= 1; + } + norm13(t3, t3, 20); + mul_f256(t2, Q->x, t3); + for (i = 0; i < 20; i ++) { + t2[i] <<= 1; + } + norm13(t2, t2, 20); + reduce_f256(t2); + + /* + * Compute x' = m^2 - 2*s. 
+ */ + square_f256(Q->x, t1); + for (i = 0; i < 20; i ++) { + Q->x[i] += (F256[i] << 2) - (t2[i] << 1); + } + norm13(Q->x, Q->x, 20); + reduce_f256(Q->x); + + /* + * Compute z' = 2*y*z. + */ + mul_f256(t4, Q->y, Q->z); + for (i = 0; i < 20; i ++) { + Q->z[i] = t4[i] << 1; + } + norm13(Q->z, Q->z, 20); + reduce_f256(Q->z); + + /* + * Compute y' = m*(s - x') - 8*y^4. Note that we already have + * 2*y^2 in t3. + */ + for (i = 0; i < 20; i ++) { + t2[i] += (F256[i] << 1) - Q->x[i]; + } + norm13(t2, t2, 20); + mul_f256(Q->y, t1, t2); + square_f256(t4, t3); + for (i = 0; i < 20; i ++) { + Q->y[i] += (F256[i] << 2) - (t4[i] << 1); + } + norm13(Q->y, Q->y, 20); + reduce_f256(Q->y); +} + +/* + * Add point P2 to point P1. + * + * This function computes the wrong result in the following cases: + * + * - If P1 == 0 but P2 != 0 + * - If P1 != 0 but P2 == 0 + * - If P1 == P2 + * + * In all three cases, P1 is set to the point at infinity. + * + * Returned value is 0 if one of the following occurs: + * + * - P1 and P2 have the same Y coordinate + * - P1 == 0 and P2 == 0 + * - The Y coordinate of one of the points is 0 and the other point is + * the point at infinity. + * + * The third case cannot actually happen with valid points, since a point + * with Y == 0 is a point of order 2, and there is no point of order 2 on + * curve P-256. + * + * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller + * can apply the following: + * + * - If the result is not the point at infinity, then it is correct. + * - Otherwise, if the returned value is 1, then this is a case of + * P1+P2 == 0, so the result is indeed the point at infinity. + * - Otherwise, P1 == P2, so a "double" operation should have been + * performed. 
+ */ +static uint32_t +p256_add(p256_jacobian *P1, const p256_jacobian *P2) +{ + /* + * Addition formulas are: + * + * u1 = x1 * z2^2 + * u2 = x2 * z1^2 + * s1 = y1 * z2^3 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 * z2 + */ + uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20]; + uint32_t ret; + int i; + + /* + * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3). + */ + square_f256(t3, P2->z); + mul_f256(t1, P1->x, t3); + mul_f256(t4, P2->z, t3); + mul_f256(t3, P1->y, t4); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + square_f256(t4, P1->z); + mul_f256(t2, P2->x, t4); + mul_f256(t5, P1->z, t4); + mul_f256(t4, P2->y, t5); + + /* + * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4). + * We need to test whether r is zero, so we will do some extra + * reduce. + */ + for (i = 0; i < 20; i ++) { + t2[i] += (F256[i] << 1) - t1[i]; + t4[i] += (F256[i] << 1) - t3[i]; + } + norm13(t2, t2, 20); + norm13(t4, t4, 20); + reduce_f256(t4); + reduce_final_f256(t4); + ret = 0; + for (i = 0; i < 20; i ++) { + ret |= t4[i]; + } + ret = (ret | -ret) >> 31; + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5); + */ + square_f256(t7, t2); + mul_f256(t6, t1, t7); + mul_f256(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + */ + square_f256(P1->x, t4); + for (i = 0; i < 20; i ++) { + P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1); + } + norm13(P1->x, P1->x, 20); + reduce_f256(P1->x); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + for (i = 0; i < 20; i ++) { + t6[i] += (F256[i] << 1) - P1->x[i]; + } + norm13(t6, t6, 20); + mul_f256(P1->y, t4, t6); + mul_f256(t1, t5, t3); + for (i = 0; i < 20; i ++) { + P1->y[i] += (F256[i] << 1) - t1[i]; + } + norm13(P1->y, P1->y, 20); + reduce_f256(P1->y); + + /* + * Compute z3 = h*z1*z2.
+ */ + mul_f256(t1, P1->z, P2->z); + mul_f256(P1->z, t1, t2); + + return ret; +} + +/* + * Add point P2 to point P1. This is a specialised function for the + * case when P2 is a non-zero point in affine coordinates. + * + * This function computes the wrong result in the following cases: + * + * - If P1 == 0 + * - If P1 == P2 + * + * In both cases, P1 is set to the point at infinity. + * + * Returned value is 0 if one of the following occurs: + * + * - P1 and P2 have the same Y coordinate + * - The Y coordinate of P2 is 0 and P1 is the point at infinity. + * + * The second case cannot actually happen with valid points, since a point + * with Y == 0 is a point of order 2, and there is no point of order 2 on + * curve P-256. + * + * Therefore, assuming that P1 != 0 on input, then the caller + * can apply the following: + * + * - If the result is not the point at infinity, then it is correct. + * - Otherwise, if the returned value is 1, then this is a case of + * P1+P2 == 0, so the result is indeed the point at infinity. + * - Otherwise, P1 == P2, so a "double" operation should have been + * performed. + */ +static uint32_t +p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2) +{ + /* + * Addition formulas are: + * + * u1 = x1 + * u2 = x2 * z1^2 + * s1 = y1 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 + */ + uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20]; + uint32_t ret; + int i; + + /* + * Compute u1 = x1 (in t1) and s1 = y1 (in t3). + */ + memcpy(t1, P1->x, sizeof t1); + memcpy(t3, P1->y, sizeof t3); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + square_f256(t4, P1->z); + mul_f256(t2, P2->x, t4); + mul_f256(t5, P1->z, t4); + mul_f256(t4, P2->y, t5); + + /* + * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4). + * We need to test whether r is zero, so we will do some extra + * reduce.
+ */ + for (i = 0; i < 20; i ++) { + t2[i] += (F256[i] << 1) - t1[i]; + t4[i] += (F256[i] << 1) - t3[i]; + } + norm13(t2, t2, 20); + norm13(t4, t4, 20); + reduce_f256(t4); + reduce_final_f256(t4); + ret = 0; + for (i = 0; i < 20; i ++) { + ret |= t4[i]; + } + ret = (ret | -ret) >> 31; + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5); + */ + square_f256(t7, t2); + mul_f256(t6, t1, t7); + mul_f256(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + */ + square_f256(P1->x, t4); + for (i = 0; i < 20; i ++) { + P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1); + } + norm13(P1->x, P1->x, 20); + reduce_f256(P1->x); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + for (i = 0; i < 20; i ++) { + t6[i] += (F256[i] << 1) - P1->x[i]; + } + norm13(t6, t6, 20); + mul_f256(P1->y, t4, t6); + mul_f256(t1, t5, t3); + for (i = 0; i < 20; i ++) { + P1->y[i] += (F256[i] << 1) - t1[i]; + } + norm13(P1->y, P1->y, 20); + reduce_f256(P1->y); + + /* + * Compute z3 = h*z1*z2. + */ + mul_f256(P1->z, P1->z, t2); + + return ret; +} + +/* + * Decode a P-256 point. This function does not support the point at + * infinity. Returned value is 0 if the point is invalid, 1 otherwise. + */ +static uint32_t +p256_decode(p256_jacobian *P, const void *src, size_t len) +{ + const unsigned char *buf; + uint32_t tx[20], ty[20], t1[20], t2[20]; + uint32_t bad; + int i; + + if (len != 65) { + return 0; + } + buf = src; + + /* + * First byte must be 0x04 (uncompressed format). We could support + * "hybrid format" (first byte is 0x06 or 0x07, and encodes the + * least significant bit of the Y coordinate), but it is explicitly + * forbidden by RFC 5480 (section 2.2). + */ + bad = NEQ(buf[0], 0x04); + + /* + * Decode the coordinates, and check that they are both lower + * than the modulus. + */ + tx[19] = be8_to_le13(tx, buf + 1, 32); + ty[19] = be8_to_le13(ty, buf + 33, 32); + bad |= reduce_final_f256(tx); + bad |= reduce_final_f256(ty); + + /* + * Check curve equation. 
+ */ + square_f256(t1, tx); + mul_f256(t1, tx, t1); + square_f256(t2, ty); + for (i = 0; i < 20; i ++) { + t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i]; + } + norm13(t1, t1, 20); + reduce_f256(t1); + reduce_final_f256(t1); + for (i = 0; i < 20; i ++) { + bad |= t1[i]; + } + + /* + * Copy coordinates to the point structure. + */ + memcpy(P->x, tx, sizeof tx); + memcpy(P->y, ty, sizeof ty); + memset(P->z, 0, sizeof P->z); + P->z[0] = 1; + return EQ(bad, 0); +} + +/* + * Encode a point into a buffer. This function assumes that the point is + * valid, in affine coordinates, and not the point at infinity. + */ +static void +p256_encode(void *dst, const p256_jacobian *P) +{ + unsigned char *buf; + + buf = dst; + buf[0] = 0x04; + le13_to_be8(buf + 1, 32, P->x); + le13_to_be8(buf + 33, 32, P->y); +} + +/* + * Multiply a curve point by an integer. The integer is assumed to be + * lower than the curve order, and the base point must not be the point + * at infinity. + */ +static void +p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen) +{ + /* + * qz is a flag that is initially 1, and remains equal to 1 + * as long as the point is the point at infinity. + * + * We use a 2-bit window to handle multiplier bits by pairs. + * The precomputed window really is the points P2 and P3. + */ + uint32_t qz; + p256_jacobian P2, P3, Q, T, U; + + /* + * Compute window values. + */ + P2 = *P; + p256_double(&P2); + P3 = *P; + p256_add(&P3, &P2); + + /* + * We start with Q = 0. We process multiplier bits 2 by 2. 
+ */ + memset(&Q, 0, sizeof Q); + qz = 1; + while (xlen -- > 0) { + int k; + + for (k = 6; k >= 0; k -= 2) { + uint32_t bits; + uint32_t bnz; + + p256_double(&Q); + p256_double(&Q); + T = *P; + U = Q; + bits = (*x >> k) & (uint32_t)3; + bnz = NEQ(bits, 0); + CCOPY(EQ(bits, 2), &T, &P2, sizeof T); + CCOPY(EQ(bits, 3), &T, &P3, sizeof T); + p256_add(&U, &T); + CCOPY(bnz & qz, &Q, &T, sizeof Q); + CCOPY(bnz & ~qz, &Q, &U, sizeof Q); + qz &= ~bnz; + } + x ++; + } + *P = Q; +} + +/* + * Precomputed window: k*G points, where G is the curve generator, and k + * is an integer from 1 to 15 (inclusive). The X and Y coordinates of + * the point are encoded as 20 words of 13 bits each (little-endian + * order); 13-bit words are then grouped 2-by-2 into 32-bit words + * (little-endian order within each word). + */ +static const uint32_t Gwin[15][20] = { + + { 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC, + 0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2, + 0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D, + 0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785, + 0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 }, + + { 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8, + 0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29, + 0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748, + 0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36, + 0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 }, + + { 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6, + 0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323, + 0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8, + 0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB, + 0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 }, + + { 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156, + 0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80, + 0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6, + 0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D, + 0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 }, + + { 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F, + 0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E, + 0x1BD21151, 
0x00A21642, 0x0D0B0DA4, 0x0176113F, + 0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49, + 0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F }, + + { 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8, + 0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B, + 0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3, + 0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE, + 0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 }, + + { 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F, + 0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896, + 0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0, + 0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400, + 0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 }, + + { 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A, + 0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01, + 0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83, + 0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8, + 0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 }, + + { 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7, + 0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E, + 0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293, + 0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832, + 0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 }, + + { 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120, + 0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964, + 0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91, + 0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304, + 0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 }, + + { 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541, + 0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418, + 0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A, + 0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2, + 0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 }, + + { 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0, + 0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918, + 0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3, + 0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55, + 0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D }, + + { 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB, + 0x03C90E28, 
0x10BD18BF, 0x002E1F2D, 0x092B0986, + 0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB, + 0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C, + 0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 }, + + { 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A, + 0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9, + 0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8, + 0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0, + 0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C }, + + { 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8, + 0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7, + 0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783, + 0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9, + 0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F } +}; + +/* + * Lookup one of the Gwin[] values, by index. This is constant-time. + */ +static void +lookup_Gwin(p256_jacobian *T, uint32_t idx) +{ + uint32_t xy[20]; + uint32_t k; + size_t u; + + memset(xy, 0, sizeof xy); + for (k = 0; k < 15; k ++) { + uint32_t m; + + m = -EQ(idx, k + 1); + for (u = 0; u < 20; u ++) { + xy[u] |= m & Gwin[k][u]; + } + } + for (u = 0; u < 10; u ++) { + T->x[(u << 1) + 0] = xy[u] & 0xFFFF; + T->x[(u << 1) + 1] = xy[u] >> 16; + T->y[(u << 1) + 0] = xy[u + 10] & 0xFFFF; + T->y[(u << 1) + 1] = xy[u + 10] >> 16; + } + memset(T->z, 0, sizeof T->z); + T->z[0] = 1; +} + +/* + * Multiply the generator by an integer. The integer is assumed non-zero + * and lower than the curve order. + */ +static void +p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen) +{ + /* + * qz is a flag that is initially 1, and remains equal to 1 + * as long as the point is the point at infinity. + * + * We use a 4-bit window to handle multiplier bits by groups + * of 4. The precomputed window is constant static data, with + * points in affine coordinates; we use a constant-time lookup. 
+ */ + p256_jacobian Q; + uint32_t qz; + + memset(&Q, 0, sizeof Q); + qz = 1; + while (xlen -- > 0) { + int k; + unsigned bx; + + bx = *x ++; + for (k = 0; k < 2; k ++) { + uint32_t bits; + uint32_t bnz; + p256_jacobian T, U; + + p256_double(&Q); + p256_double(&Q); + p256_double(&Q); + p256_double(&Q); + bits = (bx >> 4) & 0x0F; + bnz = NEQ(bits, 0); + lookup_Gwin(&T, bits); + U = Q; + p256_add_mixed(&U, &T); + CCOPY(bnz & qz, &Q, &T, sizeof Q); + CCOPY(bnz & ~qz, &Q, &U, sizeof Q); + qz &= ~bnz; + bx <<= 4; + } + } + *P = Q; +} + +static const unsigned char P256_G[] = { + 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8, + 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D, + 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8, + 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F, + 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B, + 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40, + 0x68, 0x37, 0xBF, 0x51, 0xF5 +}; + +static const unsigned char P256_N[] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD, + 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63, + 0x25, 0x51 +}; + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + (void)curve; + *len = sizeof P256_G; + return P256_G; +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + (void)curve; + *len = sizeof P256_N; + return P256_N; +} + +static size_t +api_xoff(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return 1; +} + +/* + * Multiply the encoded point G (Glen bytes, uncompressed format) by + * the scalar x, and write the encoded result back into G. Only + * Glen == 65 is accepted: for any other length, p256_decode() returns + * before initialising P, so we report the failure right away (returned + * value 0) and leave G untouched instead of computing on P. Otherwise, + * the returned value is that of p256_decode(): 1 if the point is valid, + * 0 if it is not. + */ +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *x, size_t xlen, int curve) +{ + uint32_t r; + p256_jacobian P; + + (void)curve; + if (Glen != 65) { + return 0; + } + r = p256_decode(&P, G, Glen); + p256_mul(&P, x, xlen); + p256_to_affine(&P); + p256_encode(G, &P); + return r; +} + +static size_t +api_mulgen(unsigned char *R, + const unsigned char *x, size_t xlen, int curve) +{ +
p256_jacobian P; + + (void)curve; + p256_mulgen(&P, x, xlen); + p256_to_affine(&P); + p256_encode(R, &P); + return 65; + + /* + const unsigned char *G; + size_t Glen; + + G = api_generator(curve, &Glen); + memcpy(R, G, Glen); + api_mul(R, Glen, x, xlen, curve); + return Glen; + */ +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + p256_jacobian P, Q; + uint32_t r, t, z; + int i; + + (void)curve; + r = p256_decode(&P, A, len); + p256_mul(&P, x, xlen); + if (B == NULL) { + p256_mulgen(&Q, y, ylen); + } else { + r &= p256_decode(&Q, B, len); + p256_mul(&Q, y, ylen); + } + + /* + * The final addition may fail in case both points are equal. + */ + t = p256_add(&P, &Q); + reduce_final_f256(P.z); + z = 0; + for (i = 0; i < 20; i ++) { + z |= P.z[i]; + } + z = EQ(z, 0); + p256_double(&Q); + + /* + * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we + * have the following: + * + * z = 0, t = 0 return P (normal addition) + * z = 0, t = 1 return P (normal addition) + * z = 1, t = 0 return Q (a 'double' case) + * z = 1, t = 1 report an error (P+Q = 0) + */ + CCOPY(z & ~t, &P, &Q, sizeof Q); + p256_to_affine(&P); + p256_encode(A, &P); + r &= ~(z & t); + return r; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_p256_m15 = { + (uint32_t)0x00800000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; diff --git a/test/monniaux/BearSSL/src/ec/ec_p256_m31.c b/test/monniaux/BearSSL/src/ec/ec_p256_m31.c new file mode 100644 index 00000000..d57ef7b0 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_p256_m31.c @@ -0,0 +1,1475 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including 
+ * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_ + * that right-shifting a signed negative integer copies the sign bit + * (arithmetic right-shift). This is "implementation-defined behaviour", + * i.e. it is not undefined, but it may differ between compilers. Each + * compiler is supposed to document its behaviour in that respect. GCC + * explicitly defines that an arithmetic right shift is used. We expect + * all other compilers to do the same, because underlying CPU offer an + * arithmetic right shift opcode that could not be used otherwise. + */ +#if BR_NO_ARITH_SHIFT +#define ARSH(x, n) (((uint32_t)(x) >> (n)) \ + | ((-((uint32_t)(x) >> 31)) << (32 - (n)))) +#define ARSHW(x, n) (((uint64_t)(x) >> (n)) \ + | ((-((uint64_t)(x) >> 63)) << (64 - (n)))) +#else +#define ARSH(x, n) ((*(int32_t *)&(x)) >> (n)) +#define ARSHW(x, n) ((*(int64_t *)&(x)) >> (n)) +#endif + +/* + * Convert an integer from unsigned big-endian encoding to a sequence of + * 30-bit words in little-endian order. The final "partial" word is + * returned. 
+ */ +static uint32_t +be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len) +{ + uint32_t acc; + int acc_len; + + acc = 0; + acc_len = 0; + while (len -- > 0) { + uint32_t b; + + b = src[len]; + if (acc_len < 22) { + acc |= b << acc_len; + acc_len += 8; + } else { + *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF; + acc = b >> (30 - acc_len); + acc_len -= 22; + } + } + return acc; +} + +/* + * Convert an integer (30-bit words, little-endian) to unsigned + * big-endian encoding. The total encoding length is provided; all + * the destination bytes will be filled. + */ +static void +le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src) +{ + uint32_t acc; + int acc_len; + + acc = 0; + acc_len = 0; + while (len -- > 0) { + if (acc_len < 8) { + uint32_t w; + + w = *src ++; + dst[len] = (unsigned char)(acc | (w << acc_len)); + acc = w >> (8 - acc_len); + acc_len += 22; + } else { + dst[len] = (unsigned char)acc; + acc >>= 8; + acc_len -= 8; + } + } +} + +/* + * Multiply two integers. Source integers are represented as arrays of + * nine 30-bit words, for values up to 2^270-1. Result is encoded over + * 18 words of 30 bits each. + */ +static void +mul9(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + /* + * Maximum intermediate result is no more than + * 10376293531797946367, which fits in 64 bits. Reason: + * + * 10376293531797946367 = 9 * (2^30-1)^2 + 9663676406 + * 10376293531797946367 < 9663676407 * 2^30 + * + * Thus, adding together 9 products of 30-bit integers, with + * a carry of at most 9663676406, yields an integer that fits + * on 64 bits and generates a carry of at most 9663676406. 
+ */ + uint64_t t[17]; + uint64_t cc; + int i; + + t[ 0] = MUL31(a[0], b[0]); + t[ 1] = MUL31(a[0], b[1]) + + MUL31(a[1], b[0]); + t[ 2] = MUL31(a[0], b[2]) + + MUL31(a[1], b[1]) + + MUL31(a[2], b[0]); + t[ 3] = MUL31(a[0], b[3]) + + MUL31(a[1], b[2]) + + MUL31(a[2], b[1]) + + MUL31(a[3], b[0]); + t[ 4] = MUL31(a[0], b[4]) + + MUL31(a[1], b[3]) + + MUL31(a[2], b[2]) + + MUL31(a[3], b[1]) + + MUL31(a[4], b[0]); + t[ 5] = MUL31(a[0], b[5]) + + MUL31(a[1], b[4]) + + MUL31(a[2], b[3]) + + MUL31(a[3], b[2]) + + MUL31(a[4], b[1]) + + MUL31(a[5], b[0]); + t[ 6] = MUL31(a[0], b[6]) + + MUL31(a[1], b[5]) + + MUL31(a[2], b[4]) + + MUL31(a[3], b[3]) + + MUL31(a[4], b[2]) + + MUL31(a[5], b[1]) + + MUL31(a[6], b[0]); + t[ 7] = MUL31(a[0], b[7]) + + MUL31(a[1], b[6]) + + MUL31(a[2], b[5]) + + MUL31(a[3], b[4]) + + MUL31(a[4], b[3]) + + MUL31(a[5], b[2]) + + MUL31(a[6], b[1]) + + MUL31(a[7], b[0]); + t[ 8] = MUL31(a[0], b[8]) + + MUL31(a[1], b[7]) + + MUL31(a[2], b[6]) + + MUL31(a[3], b[5]) + + MUL31(a[4], b[4]) + + MUL31(a[5], b[3]) + + MUL31(a[6], b[2]) + + MUL31(a[7], b[1]) + + MUL31(a[8], b[0]); + t[ 9] = MUL31(a[1], b[8]) + + MUL31(a[2], b[7]) + + MUL31(a[3], b[6]) + + MUL31(a[4], b[5]) + + MUL31(a[5], b[4]) + + MUL31(a[6], b[3]) + + MUL31(a[7], b[2]) + + MUL31(a[8], b[1]); + t[10] = MUL31(a[2], b[8]) + + MUL31(a[3], b[7]) + + MUL31(a[4], b[6]) + + MUL31(a[5], b[5]) + + MUL31(a[6], b[4]) + + MUL31(a[7], b[3]) + + MUL31(a[8], b[2]); + t[11] = MUL31(a[3], b[8]) + + MUL31(a[4], b[7]) + + MUL31(a[5], b[6]) + + MUL31(a[6], b[5]) + + MUL31(a[7], b[4]) + + MUL31(a[8], b[3]); + t[12] = MUL31(a[4], b[8]) + + MUL31(a[5], b[7]) + + MUL31(a[6], b[6]) + + MUL31(a[7], b[5]) + + MUL31(a[8], b[4]); + t[13] = MUL31(a[5], b[8]) + + MUL31(a[6], b[7]) + + MUL31(a[7], b[6]) + + MUL31(a[8], b[5]); + t[14] = MUL31(a[6], b[8]) + + MUL31(a[7], b[7]) + + MUL31(a[8], b[6]); + t[15] = MUL31(a[7], b[8]) + + MUL31(a[8], b[7]); + t[16] = MUL31(a[8], b[8]); + + /* + * Propagate carries. 
+ */ + cc = 0; + for (i = 0; i < 17; i ++) { + uint64_t w; + + w = t[i] + cc; + d[i] = (uint32_t)w & 0x3FFFFFFF; + cc = w >> 30; + } + d[17] = (uint32_t)cc; +} + +/* + * Square a 270-bit integer, represented as an array of nine 30-bit words. + * Result uses 18 words of 30 bits each. + */ +static void +square9(uint32_t *d, const uint32_t *a) +{ + uint64_t t[17]; + uint64_t cc; + int i; + + t[ 0] = MUL31(a[0], a[0]); + t[ 1] = ((MUL31(a[0], a[1])) << 1); + t[ 2] = MUL31(a[1], a[1]) + + ((MUL31(a[0], a[2])) << 1); + t[ 3] = ((MUL31(a[0], a[3]) + + MUL31(a[1], a[2])) << 1); + t[ 4] = MUL31(a[2], a[2]) + + ((MUL31(a[0], a[4]) + + MUL31(a[1], a[3])) << 1); + t[ 5] = ((MUL31(a[0], a[5]) + + MUL31(a[1], a[4]) + + MUL31(a[2], a[3])) << 1); + t[ 6] = MUL31(a[3], a[3]) + + ((MUL31(a[0], a[6]) + + MUL31(a[1], a[5]) + + MUL31(a[2], a[4])) << 1); + t[ 7] = ((MUL31(a[0], a[7]) + + MUL31(a[1], a[6]) + + MUL31(a[2], a[5]) + + MUL31(a[3], a[4])) << 1); + t[ 8] = MUL31(a[4], a[4]) + + ((MUL31(a[0], a[8]) + + MUL31(a[1], a[7]) + + MUL31(a[2], a[6]) + + MUL31(a[3], a[5])) << 1); + t[ 9] = ((MUL31(a[1], a[8]) + + MUL31(a[2], a[7]) + + MUL31(a[3], a[6]) + + MUL31(a[4], a[5])) << 1); + t[10] = MUL31(a[5], a[5]) + + ((MUL31(a[2], a[8]) + + MUL31(a[3], a[7]) + + MUL31(a[4], a[6])) << 1); + t[11] = ((MUL31(a[3], a[8]) + + MUL31(a[4], a[7]) + + MUL31(a[5], a[6])) << 1); + t[12] = MUL31(a[6], a[6]) + + ((MUL31(a[4], a[8]) + + MUL31(a[5], a[7])) << 1); + t[13] = ((MUL31(a[5], a[8]) + + MUL31(a[6], a[7])) << 1); + t[14] = MUL31(a[7], a[7]) + + ((MUL31(a[6], a[8])) << 1); + t[15] = ((MUL31(a[7], a[8])) << 1); + t[16] = MUL31(a[8], a[8]); + + /* + * Propagate carries. + */ + cc = 0; + for (i = 0; i < 17; i ++) { + uint64_t w; + + w = t[i] + cc; + d[i] = (uint32_t)w & 0x3FFFFFFF; + cc = w >> 30; + } + d[17] = (uint32_t)cc; +} + +/* + * Base field modulus for P-256. 
+ */ +static const uint32_t F256[] = { + + 0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000, + 0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF +}; + +/* + * The 'b' curve equation coefficient for P-256. + */ +static const uint32_t P256_B[] = { + + 0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65, + 0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6 +}; + +/* + * Addition in the field. Source operands shall fit on 257 bits; output + * will be lower than twice the modulus. + */ +static void +add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + uint32_t w, cc; + int i; + + cc = 0; + for (i = 0; i < 9; i ++) { + w = a[i] + b[i] + cc; + d[i] = w & 0x3FFFFFFF; + cc = w >> 30; + } + w >>= 16; + d[8] &= 0xFFFF; + d[3] -= w << 6; + d[6] -= w << 12; + d[7] += w << 14; + cc = w; + for (i = 0; i < 9; i ++) { + w = d[i] + cc; + d[i] = w & 0x3FFFFFFF; + cc = ARSH(w, 30); + } +} + +/* + * Subtraction in the field. Source operands shall be smaller than twice + * the modulus; the result will fulfil the same property. + */ +static void +sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + uint32_t w, cc; + int i; + + /* + * We really compute a - b + 2*p to make sure that the result is + * positive. 
+ */ + w = a[0] - b[0] - 0x00002; + d[0] = w & 0x3FFFFFFF; + w = a[1] - b[1] + ARSH(w, 30); + d[1] = w & 0x3FFFFFFF; + w = a[2] - b[2] + ARSH(w, 30); + d[2] = w & 0x3FFFFFFF; + w = a[3] - b[3] + ARSH(w, 30) + 0x00080; + d[3] = w & 0x3FFFFFFF; + w = a[4] - b[4] + ARSH(w, 30); + d[4] = w & 0x3FFFFFFF; + w = a[5] - b[5] + ARSH(w, 30); + d[5] = w & 0x3FFFFFFF; + w = a[6] - b[6] + ARSH(w, 30) + 0x02000; + d[6] = w & 0x3FFFFFFF; + w = a[7] - b[7] + ARSH(w, 30) - 0x08000; + d[7] = w & 0x3FFFFFFF; + w = a[8] - b[8] + ARSH(w, 30) + 0x20000; + d[8] = w & 0xFFFF; + w >>= 16; + d[8] &= 0xFFFF; + d[3] -= w << 6; + d[6] -= w << 12; + d[7] += w << 14; + cc = w; + for (i = 0; i < 9; i ++) { + w = d[i] + cc; + d[i] = w & 0x3FFFFFFF; + cc = ARSH(w, 30); + } +} + +/* + * Compute a multiplication in F256. Source operands shall be less than + * twice the modulus. + */ +static void +mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b) +{ + uint32_t t[18]; + uint64_t s[18]; + uint64_t cc, x; + uint32_t z, c; + int i; + + mul9(t, a, b); + + /* + * Modular reduction: each high word is added/subtracted where + * necessary. + * + * The modulus is: + * p = 2^256 - 2^224 + 2^192 + 2^96 - 1 + * Therefore: + * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p + * + * For a word x at bit offset n (n >= 256), we have: + * x*2^n = x*2^(n-32) - x*2^(n-64) + * - x*2^(n - 160) + x*2^(n-256) mod p + * + * Thus, we can nullify the high word if we reinject it at the + * proper positions. + * + * We use 64-bit intermediate words to allow for carries to + * accumulate easily, before performing the final propagation. 
+ */ + for (i = 0; i < 18; i ++) { + s[i] = t[i]; + } + + for (i = 17; i >= 9; i --) { + uint64_t y; + + y = s[i]; + s[i - 1] += ARSHW(y, 2); + s[i - 2] += (y << 28) & 0x3FFFFFFF; + s[i - 2] -= ARSHW(y, 4); + s[i - 3] -= (y << 26) & 0x3FFFFFFF; + s[i - 5] -= ARSHW(y, 10); + s[i - 6] -= (y << 20) & 0x3FFFFFFF; + s[i - 8] += ARSHW(y, 16); + s[i - 9] += (y << 14) & 0x3FFFFFFF; + } + + /* + * Carry propagation must be signed. Moreover, we may have overdone + * it a bit, and obtain a negative result. + * + * The loop above ran 9 times; each time, each word was augmented + * by at most one extra word (in absolute value). Thus, the top + * word must ultimately fit in 39 bits, so the carry below will fit + * on 9 bits. + */ + cc = 0; + for (i = 0; i < 9; i ++) { + x = s[i] + cc; + d[i] = (uint32_t)x & 0x3FFFFFFF; + cc = ARSHW(x, 30); + } + + /* + * All nine words fit on 30 bits, but there may be an extra + * carry for a few bits (at most 9), and that carry may be + * negative. Moreover, we want the result to fit on 257 bits. + * The two lines below ensure that the word in d[] has length + * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The + * significant length of cc is less than 24 bits, so we will be + * able to switch to 32-bit operations. + */ + cc = ARSHW(x, 16); + d[8] &= 0xFFFF; + + /* + * One extra round of reduction, for cc*2^256, which means + * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative) + * value. If cc is negative, then it may happen (rarely, but + * not negligibly so) that the result would be negative. In + * order to avoid that, if cc is negative, then we add the + * modulus once. Note that if cc is negative, then propagating + * that carry must yield a value lower than the modulus, so + * adding the modulus once will keep the final result under + * twice the modulus. 
+ */ + z = (uint32_t)cc; + d[3] -= z << 6; + d[6] -= (z << 12) & 0x3FFFFFFF; + d[7] -= ARSH(z, 18); + d[7] += (z << 14) & 0x3FFFFFFF; + d[8] += ARSH(z, 16); + c = z >> 31; + d[0] -= c; + d[3] += c << 6; + d[6] += c << 12; + d[7] -= c << 14; + d[8] += c << 16; + for (i = 0; i < 9; i ++) { + uint32_t w; + + w = d[i] + z; + d[i] = w & 0x3FFFFFFF; + z = ARSH(w, 30); + } +} + +/* + * Compute a square in F256. Source operand shall be less than + * twice the modulus. + */ +static void +square_f256(uint32_t *d, const uint32_t *a) +{ + uint32_t t[18]; + uint64_t s[18]; + uint64_t cc, x; + uint32_t z, c; + int i; + + square9(t, a); + + /* + * Modular reduction: each high word in added/subtracted where + * necessary. + * + * The modulus is: + * p = 2^256 - 2^224 + 2^192 + 2^96 - 1 + * Therefore: + * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p + * + * For a word x at bit offset n (n >= 256), we have: + * x*2^n = x*2^(n-32) - x*2^(n-64) + * - x*2^(n - 160) + x*2^(n-256) mod p + * + * Thus, we can nullify the high word if we reinject it at some + * proper emplacements. + * + * We use 64-bit intermediate words to allow for carries to + * accumulate easily, before performing the final propagation. + */ + for (i = 0; i < 18; i ++) { + s[i] = t[i]; + } + + for (i = 17; i >= 9; i --) { + uint64_t y; + + y = s[i]; + s[i - 1] += ARSHW(y, 2); + s[i - 2] += (y << 28) & 0x3FFFFFFF; + s[i - 2] -= ARSHW(y, 4); + s[i - 3] -= (y << 26) & 0x3FFFFFFF; + s[i - 5] -= ARSHW(y, 10); + s[i - 6] -= (y << 20) & 0x3FFFFFFF; + s[i - 8] += ARSHW(y, 16); + s[i - 9] += (y << 14) & 0x3FFFFFFF; + } + + /* + * Carry propagation must be signed. Moreover, we may have overdone + * it a bit, and obtain a negative result. + * + * The loop above ran 9 times; each time, each word was augmented + * by at most one extra word (in absolute value). Thus, the top + * word must in fine fit in 39 bits, so the carry below will fit + * on 9 bits. 
+ */ + cc = 0; + for (i = 0; i < 9; i ++) { + x = s[i] + cc; + d[i] = (uint32_t)x & 0x3FFFFFFF; + cc = ARSHW(x, 30); + } + + /* + * All nine words fit on 30 bits, but there may be an extra + * carry for a few bits (at most 9), and that carry may be + * negative. Moreover, we want the result to fit on 257 bits. + * The two lines below ensure that the word in d[] has length + * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The + * significant length of cc is less than 24 bits, so we will be + * able to switch to 32-bit operations. + */ + cc = ARSHW(x, 16); + d[8] &= 0xFFFF; + + /* + * One extra round of reduction, for cc*2^256, which means + * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative) + * value. If cc is negative, then it may happen (rarely, but + * not neglectibly so) that the result would be negative. In + * order to avoid that, if cc is negative, then we add the + * modulus once. Note that if cc is negative, then propagating + * that carry must yield a value lower than the modulus, so + * adding the modulus once will keep the final result under + * twice the modulus. + */ + z = (uint32_t)cc; + d[3] -= z << 6; + d[6] -= (z << 12) & 0x3FFFFFFF; + d[7] -= ARSH(z, 18); + d[7] += (z << 14) & 0x3FFFFFFF; + d[8] += ARSH(z, 16); + c = z >> 31; + d[0] -= c; + d[3] += c << 6; + d[6] += c << 12; + d[7] -= c << 14; + d[8] += c << 16; + for (i = 0; i < 9; i ++) { + uint32_t w; + + w = d[i] + z; + d[i] = w & 0x3FFFFFFF; + z = ARSH(w, 30); + } +} + +/* + * Perform a "final reduction" in field F256 (field for curve P-256). + * The source value must be less than twice the modulus. If the value + * is not lower than the modulus, then the modulus is subtracted and + * this function returns 1; otherwise, it leaves it untouched and it + * returns 0. 
+ */ +static uint32_t +reduce_final_f256(uint32_t *d) +{ + uint32_t t[9]; + uint32_t cc; + int i; + + cc = 0; + for (i = 0; i < 9; i ++) { + uint32_t w; + + w = d[i] - F256[i] - cc; + cc = w >> 31; + t[i] = w & 0x3FFFFFFF; + } + cc ^= 1; + CCOPY(cc, d, t, sizeof t); + return cc; +} + +/* + * Jacobian coordinates for a point in P-256: affine coordinates (X,Y) + * are such that: + * X = x / z^2 + * Y = y / z^3 + * For the point at infinity, z = 0. + * Each point thus admits many possible representations. + * + * Coordinates are represented in arrays of 32-bit integers, each holding + * 30 bits of data. Values may also be slightly greater than the modulus, + * but they will always be lower than twice the modulus. + */ +typedef struct { + uint32_t x[9]; + uint32_t y[9]; + uint32_t z[9]; +} p256_jacobian; + +/* + * Convert a point to affine coordinates: + * - If the point is the point at infinity, then all three coordinates + * are set to 0. + * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y' + * coordinates are the 'X' and 'Y' affine coordinates. + * The coordinates are guaranteed to be lower than the modulus. + */ +static void +p256_to_affine(p256_jacobian *P) +{ + uint32_t t1[9], t2[9]; + int i; + + /* + * Invert z with a modular exponentiation: the modulus is + * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is + * p-2. Exponent bit pattern (from high to low) is: + * - 32 bits of value 1 + * - 31 bits of value 0 + * - 1 bit of value 1 + * - 96 bits of value 0 + * - 94 bits of value 1 + * - 1 bit of value 0 + * - 1 bit of value 1 + * Thus, we precompute z^(2^31-1) to speed things up. + * + * If z = 0 (point at infinity) then the modular exponentiation + * will yield 0, which leads to the expected result (all three + * coordinates set to 0). + */ + + /* + * A simple square-and-multiply for z^(2^31-1). 
We could save about + * two dozen multiplications here with an addition chain, but + * this would require a bit more code, and extra stack buffers. + */ + memcpy(t1, P->z, sizeof P->z); + for (i = 0; i < 30; i ++) { + square_f256(t1, t1); + mul_f256(t1, t1, P->z); + } + + /* + * Square-and-multiply. Apart from the squarings, we have a few + * multiplications to set bits to 1; we multiply by the original z + * for setting 1 bit, and by t1 for setting 31 bits. + */ + memcpy(t2, P->z, sizeof P->z); + for (i = 1; i < 256; i ++) { + square_f256(t2, t2); + switch (i) { + case 31: + case 190: + case 221: + case 252: + mul_f256(t2, t2, t1); + break; + case 63: + case 253: + case 255: + mul_f256(t2, t2, P->z); + break; + } + } + + /* + * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3. + */ + mul_f256(t1, t2, t2); + mul_f256(P->x, t1, P->x); + mul_f256(t1, t1, t2); + mul_f256(P->y, t1, P->y); + reduce_final_f256(P->x); + reduce_final_f256(P->y); + + /* + * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise + * this will set z to 1. + */ + mul_f256(P->z, P->z, t2); + reduce_final_f256(P->z); +} + +/* + * Double a point in P-256. This function works for all valid points, + * including the point at infinity. + */ +static void +p256_double(p256_jacobian *Q) +{ + /* + * Doubling formulas are: + * + * s = 4*x*y^2 + * m = 3*(x + z^2)*(x - z^2) + * x' = m^2 - 2*s + * y' = m*(s - x') - 8*y^4 + * z' = 2*y*z + * + * These formulas work for all points, including points of order 2 + * and points at infinity: + * - If y = 0 then z' = 0. But there is no such point in P-256 + * anyway. + * - If z = 0 then z' = 0. + */ + uint32_t t1[9], t2[9], t3[9], t4[9]; + + /* + * Compute z^2 in t1. + */ + square_f256(t1, Q->z); + + /* + * Compute x-z^2 in t2 and x+z^2 in t1. + */ + add_f256(t2, Q->x, t1); + sub_f256(t1, Q->x, t1); + + /* + * Compute 3*(x+z^2)*(x-z^2) in t1. 
+ */ + mul_f256(t3, t1, t2); + add_f256(t1, t3, t3); + add_f256(t1, t3, t1); + + /* + * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3). + */ + square_f256(t3, Q->y); + add_f256(t3, t3, t3); + mul_f256(t2, Q->x, t3); + add_f256(t2, t2, t2); + + /* + * Compute x' = m^2 - 2*s. + */ + square_f256(Q->x, t1); + sub_f256(Q->x, Q->x, t2); + sub_f256(Q->x, Q->x, t2); + + /* + * Compute z' = 2*y*z. + */ + mul_f256(t4, Q->y, Q->z); + add_f256(Q->z, t4, t4); + + /* + * Compute y' = m*(s - x') - 8*y^4. Note that we already have + * 2*y^2 in t3. + */ + sub_f256(t2, t2, Q->x); + mul_f256(Q->y, t1, t2); + square_f256(t4, t3); + add_f256(t4, t4, t4); + sub_f256(Q->y, Q->y, t4); +} + +/* + * Add point P2 to point P1. + * + * This function computes the wrong result in the following cases: + * + * - If P1 == 0 but P2 != 0 + * - If P1 != 0 but P2 == 0 + * - If P1 == P2 + * + * In all three cases, P1 is set to the point at infinity. + * + * Returned value is 0 if one of the following occurs: + * + * - P1 and P2 have the same Y coordinate + * - P1 == 0 and P2 == 0 + * - The Y coordinate of one of the points is 0 and the other point is + * the point at infinity. + * + * The third case cannot actually happen with valid points, since a point + * with Y == 0 is a point of order 2, and there is no point of order 2 on + * curve P-256. + * + * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller + * can apply the following: + * + * - If the result is not the point at infinity, then it is correct. + * - Otherwise, if the returned value is 1, then this is a case of + * P1+P2 == 0, so the result is indeed the point at infinity. + * - Otherwise, P1 == P2, so a "double" operation should have been + * performed. 
+ */ +static uint32_t +p256_add(p256_jacobian *P1, const p256_jacobian *P2) +{ + /* + * Addtions formulas are: + * + * u1 = x1 * z2^2 + * u2 = x2 * z1^2 + * s1 = y1 * z2^3 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 * z2 + */ + uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9]; + uint32_t ret; + int i; + + /* + * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3). + */ + square_f256(t3, P2->z); + mul_f256(t1, P1->x, t3); + mul_f256(t4, P2->z, t3); + mul_f256(t3, P1->y, t4); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + square_f256(t4, P1->z); + mul_f256(t2, P2->x, t4); + mul_f256(t5, P1->z, t4); + mul_f256(t4, P2->y, t5); + + /* + * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4). + * We need to test whether r is zero, so we will do some extra + * reduce. + */ + sub_f256(t2, t2, t1); + sub_f256(t4, t4, t3); + reduce_final_f256(t4); + ret = 0; + for (i = 0; i < 9; i ++) { + ret |= t4[i]; + } + ret = (ret | -ret) >> 31; + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5); + */ + square_f256(t7, t2); + mul_f256(t6, t1, t7); + mul_f256(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + */ + square_f256(P1->x, t4); + sub_f256(P1->x, P1->x, t5); + sub_f256(P1->x, P1->x, t6); + sub_f256(P1->x, P1->x, t6); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + sub_f256(t6, t6, P1->x); + mul_f256(P1->y, t4, t6); + mul_f256(t1, t5, t3); + sub_f256(P1->y, P1->y, t1); + + /* + * Compute z3 = h*z1*z2. + */ + mul_f256(t1, P1->z, P2->z); + mul_f256(P1->z, t1, t2); + + return ret; +} + +/* + * Add point P2 to point P1. This is a specialised function for the + * case when P2 is a non-zero point in affine coordinate. + * + * This function computes the wrong result in the following cases: + * + * - If P1 == 0 + * - If P1 == P2 + * + * In both cases, P1 is set to the point at infinity. 
+ * + * Returned value is 0 if one of the following occurs: + * + * - P1 and P2 have the same Y coordinate + * - The Y coordinate of P2 is 0 and P1 is the point at infinity. + * + * The second case cannot actually happen with valid points, since a point + * with Y == 0 is a point of order 2, and there is no point of order 2 on + * curve P-256. + * + * Therefore, assuming that P1 != 0 on input, then the caller + * can apply the following: + * + * - If the result is not the point at infinity, then it is correct. + * - Otherwise, if the returned value is 1, then this is a case of + * P1+P2 == 0, so the result is indeed the point at infinity. + * - Otherwise, P1 == P2, so a "double" operation should have been + * performed. + */ +static uint32_t +p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2) +{ + /* + * Addtions formulas are: + * + * u1 = x1 + * u2 = x2 * z1^2 + * s1 = y1 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 + */ + uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9]; + uint32_t ret; + int i; + + /* + * Compute u1 = x1 (in t1) and s1 = y1 (in t3). + */ + memcpy(t1, P1->x, sizeof t1); + memcpy(t3, P1->y, sizeof t3); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + square_f256(t4, P1->z); + mul_f256(t2, P2->x, t4); + mul_f256(t5, P1->z, t4); + mul_f256(t4, P2->y, t5); + + /* + * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4). + * We need to test whether r is zero, so we will do some extra + * reduce. + */ + sub_f256(t2, t2, t1); + sub_f256(t4, t4, t3); + reduce_final_f256(t4); + ret = 0; + for (i = 0; i < 9; i ++) { + ret |= t4[i]; + } + ret = (ret | -ret) >> 31; + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5); + */ + square_f256(t7, t2); + mul_f256(t6, t1, t7); + mul_f256(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. 
+ */ + square_f256(P1->x, t4); + sub_f256(P1->x, P1->x, t5); + sub_f256(P1->x, P1->x, t6); + sub_f256(P1->x, P1->x, t6); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + sub_f256(t6, t6, P1->x); + mul_f256(P1->y, t4, t6); + mul_f256(t1, t5, t3); + sub_f256(P1->y, P1->y, t1); + + /* + * Compute z3 = h*z1*z2. + */ + mul_f256(P1->z, P1->z, t2); + + return ret; +} + +/* + * Decode a P-256 point. This function does not support the point at + * infinity. Returned value is 0 if the point is invalid, 1 otherwise. + */ +static uint32_t +p256_decode(p256_jacobian *P, const void *src, size_t len) +{ + const unsigned char *buf; + uint32_t tx[9], ty[9], t1[9], t2[9]; + uint32_t bad; + int i; + + if (len != 65) { + return 0; + } + buf = src; + + /* + * First byte must be 0x04 (uncompressed format). We could support + * "hybrid format" (first byte is 0x06 or 0x07, and encodes the + * least significant bit of the Y coordinate), but it is explicitly + * forbidden by RFC 5480 (section 2.2). + */ + bad = NEQ(buf[0], 0x04); + + /* + * Decode the coordinates, and check that they are both lower + * than the modulus. + */ + tx[8] = be8_to_le30(tx, buf + 1, 32); + ty[8] = be8_to_le30(ty, buf + 33, 32); + bad |= reduce_final_f256(tx); + bad |= reduce_final_f256(ty); + + /* + * Check curve equation. + */ + square_f256(t1, tx); + mul_f256(t1, tx, t1); + square_f256(t2, ty); + sub_f256(t1, t1, tx); + sub_f256(t1, t1, tx); + sub_f256(t1, t1, tx); + add_f256(t1, t1, P256_B); + sub_f256(t1, t1, t2); + reduce_final_f256(t1); + for (i = 0; i < 9; i ++) { + bad |= t1[i]; + } + + /* + * Copy coordinates to the point structure. + */ + memcpy(P->x, tx, sizeof tx); + memcpy(P->y, ty, sizeof ty); + memset(P->z, 0, sizeof P->z); + P->z[0] = 1; + return EQ(bad, 0); +} + +/* + * Encode a point into a buffer. This function assumes that the point is + * valid, in affine coordinates, and not the point at infinity. 
+ */ +static void +p256_encode(void *dst, const p256_jacobian *P) +{ + unsigned char *buf; + + buf = dst; + buf[0] = 0x04; + le30_to_be8(buf + 1, 32, P->x); + le30_to_be8(buf + 33, 32, P->y); +} + +/* + * Multiply a curve point by an integer. The integer is assumed to be + * lower than the curve order, and the base point must not be the point + * at infinity. + */ +static void +p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen) +{ + /* + * qz is a flag that is initially 1, and remains equal to 1 + * as long as the point is the point at infinity. + * + * We use a 2-bit window to handle multiplier bits by pairs. + * The precomputed window really is the points P2 and P3. + */ + uint32_t qz; + p256_jacobian P2, P3, Q, T, U; + + /* + * Compute window values. + */ + P2 = *P; + p256_double(&P2); + P3 = *P; + p256_add(&P3, &P2); + + /* + * We start with Q = 0. We process multiplier bits 2 by 2. + */ + memset(&Q, 0, sizeof Q); + qz = 1; + while (xlen -- > 0) { + int k; + + for (k = 6; k >= 0; k -= 2) { + uint32_t bits; + uint32_t bnz; + + p256_double(&Q); + p256_double(&Q); + T = *P; + U = Q; + bits = (*x >> k) & (uint32_t)3; + bnz = NEQ(bits, 0); + CCOPY(EQ(bits, 2), &T, &P2, sizeof T); + CCOPY(EQ(bits, 3), &T, &P3, sizeof T); + p256_add(&U, &T); + CCOPY(bnz & qz, &Q, &T, sizeof Q); + CCOPY(bnz & ~qz, &Q, &U, sizeof Q); + qz &= ~bnz; + } + x ++; + } + *P = Q; +} + +/* + * Precomputed window: k*G points, where G is the curve generator, and k + * is an integer from 1 to 15 (inclusive). The X and Y coordinates of + * the point are encoded as 9 words of 30 bits each (little-endian + * order). 
+ */ +static const uint32_t Gwin[15][18] = { + + { 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B, + 0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B, + 0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC, + 0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE, + 0x10B8BF86, 0x00004FE3 }, + + { 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D, + 0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340, + 0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299, + 0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293, + 0x154436E3, 0x00000777 }, + + { 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B, + 0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C, + 0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9, + 0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374, + 0x19031266, 0x00008734 }, + + { 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE, + 0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4, + 0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055, + 0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D, + 0x15D69318, 0x0000E0F1 }, + + { 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47, + 0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454, + 0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D, + 0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE, + 0x1F6A2412, 0x0000E0C1 }, + + { 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A, + 0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9, + 0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF, + 0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE, + 0x041D0C8D, 0x0000E85C }, + + { 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A, + 0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F, + 0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C, + 0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0, + 0x076F780C, 0x000073EB }, + + { 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603, + 0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA, + 0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D, + 0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF, + 0x332F647A, 0x0000AD5A }, + + { 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B, + 0x31F8729E, 0x24E35475, 0x30B71878, 
0x35EDBFB7, + 0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE, + 0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870, + 0x11325CB2, 0x00002A27 }, + + { 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7, + 0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E, + 0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC, + 0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1, + 0x18A88A6A, 0x00008786 }, + + { 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409, + 0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E, + 0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE, + 0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C, + 0x0826B331, 0x00009099 }, + + { 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C, + 0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05, + 0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71, + 0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567, + 0x2D1AA70E, 0x00000770 }, + + { 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9, + 0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B, + 0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39, + 0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24, + 0x163353AF, 0x000063BB }, + + { 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E, + 0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E, + 0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081, + 0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421, + 0x3C6ECA7D, 0x0000F599 }, + + { 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7, + 0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6, + 0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4, + 0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6, + 0x0FB8D64B, 0x0000B5B9 } +}; + +/* + * Lookup one of the Gwin[] values, by index. This is constant-time. 
+ */ +static void +lookup_Gwin(p256_jacobian *T, uint32_t idx) +{ + uint32_t xy[18]; + uint32_t k; + size_t u; + + memset(xy, 0, sizeof xy); + for (k = 0; k < 15; k ++) { + uint32_t m; + + m = -EQ(idx, k + 1); + for (u = 0; u < 18; u ++) { + xy[u] |= m & Gwin[k][u]; + } + } + memcpy(T->x, &xy[0], sizeof T->x); + memcpy(T->y, &xy[9], sizeof T->y); + memset(T->z, 0, sizeof T->z); + T->z[0] = 1; +} + +/* + * Multiply the generator by an integer. The integer is assumed non-zero + * and lower than the curve order. + */ +static void +p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen) +{ + /* + * qz is a flag that is initially 1, and remains equal to 1 + * as long as the point is the point at infinity. + * + * We use a 4-bit window to handle multiplier bits by groups + * of 4. The precomputed window is constant static data, with + * points in affine coordinates; we use a constant-time lookup. + */ + p256_jacobian Q; + uint32_t qz; + + memset(&Q, 0, sizeof Q); + qz = 1; + while (xlen -- > 0) { + int k; + unsigned bx; + + bx = *x ++; + for (k = 0; k < 2; k ++) { + uint32_t bits; + uint32_t bnz; + p256_jacobian T, U; + + p256_double(&Q); + p256_double(&Q); + p256_double(&Q); + p256_double(&Q); + bits = (bx >> 4) & 0x0F; + bnz = NEQ(bits, 0); + lookup_Gwin(&T, bits); + U = Q; + p256_add_mixed(&U, &T); + CCOPY(bnz & qz, &Q, &T, sizeof Q); + CCOPY(bnz & ~qz, &Q, &U, sizeof Q); + qz &= ~bnz; + bx <<= 4; + } + } + *P = Q; +} + +static const unsigned char P256_G[] = { + 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8, + 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D, + 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8, + 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F, + 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B, + 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40, + 0x68, 0x37, 0xBF, 0x51, 0xF5 +}; + +static const unsigned char P256_N[] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 
0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD, + 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63, + 0x25, 0x51 +}; + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + (void)curve; + *len = sizeof P256_G; + return P256_G; +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + (void)curve; + *len = sizeof P256_N; + return P256_N; +} + +static size_t +api_xoff(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return 1; +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *x, size_t xlen, int curve) +{ + uint32_t r; + p256_jacobian P; + + (void)curve; + r = p256_decode(&P, G, Glen); + p256_mul(&P, x, xlen); + if (Glen >= 65) { + p256_to_affine(&P); + p256_encode(G, &P); + } + return r; +} + +static size_t +api_mulgen(unsigned char *R, + const unsigned char *x, size_t xlen, int curve) +{ + p256_jacobian P; + + (void)curve; + p256_mulgen(&P, x, xlen); + p256_to_affine(&P); + p256_encode(R, &P); + return 65; + + /* + const unsigned char *G; + size_t Glen; + + G = api_generator(curve, &Glen); + memcpy(R, G, Glen); + api_mul(R, Glen, x, xlen, curve); + return Glen; + */ +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + p256_jacobian P, Q; + uint32_t r, t, z; + int i; + + (void)curve; + r = p256_decode(&P, A, len); + p256_mul(&P, x, xlen); + if (B == NULL) { + p256_mulgen(&Q, y, ylen); + } else { + r &= p256_decode(&Q, B, len); + p256_mul(&Q, y, ylen); + } + + /* + * The final addition may fail in case both points are equal. + */ + t = p256_add(&P, &Q); + reduce_final_f256(P.z); + z = 0; + for (i = 0; i < 9; i ++) { + z |= P.z[i]; + } + z = EQ(z, 0); + p256_double(&Q); + + /* + * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). 
So we + * have the following: + * + * z = 0, t = 0 return P (normal addition) + * z = 0, t = 1 return P (normal addition) + * z = 1, t = 0 return Q (a 'double' case) + * z = 1, t = 1 report an error (P+Q = 0) + */ + CCOPY(z & ~t, &P, &Q, sizeof Q); + p256_to_affine(&P); + p256_encode(A, &P); + r &= ~(z & t); + return r; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_p256_m31 = { + (uint32_t)0x00800000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; diff --git a/test/monniaux/BearSSL/src/ec/ec_p256_m62.c b/test/monniaux/BearSSL/src/ec/ec_p256_m62.c new file mode 100644 index 00000000..3bcb95b5 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_p256_m62.c @@ -0,0 +1,1765 @@ +/* + * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +#if BR_INT128 || BR_UMUL128 + +#if BR_UMUL128 +#include <intrin.h> +#endif + +static const unsigned char P256_G[] = { + 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8, + 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D, + 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8, + 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F, + 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B, + 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40, + 0x68, 0x37, 0xBF, 0x51, 0xF5 +}; + +static const unsigned char P256_N[] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD, + 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63, + 0x25, 0x51 +}; + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + (void)curve; + *len = sizeof P256_G; + return P256_G; +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + (void)curve; + *len = sizeof P256_N; + return P256_N; +} + +static size_t +api_xoff(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return 1; +} + +/* + * A field element is encoded as five 64-bit integers, in basis 2^52. + * Limbs may occasionally exceed 2^52. + * + * A _partially reduced_ value is such that the following hold: + * - top limb is less than 2^48 + 2^30 + * - the other limbs fit on 53 bits each + * In particular, such a value is less than twice the modulus p. + */ + +#define BIT(n) ((uint64_t)1 << (n)) +#define MASK48 (BIT(48) - BIT(0)) +#define MASK52 (BIT(52) - BIT(0)) + +/* R = 2^260 mod p */ +static const uint64_t F256_R[] = { + 0x0000000000010, 0xF000000000000, 0xFFFFFFFFFFFFF, + 0xFFEFFFFFFFFFF, 0x00000000FFFFF +}; + +/* Curve equation is y^2 = x^3 - 3*x + B. This constant is B*R mod p + (Montgomery representation of B). 
/*
 * Curve equation is y^2 = x^3 - 3*x + B. This constant is B*R mod p
 * (Montgomery representation of B), as five limbs in basis 2^52,
 * least significant limb first.
 */
static const uint64_t P256_B_MONTY[] = {
	0xDF6229C4BDDFD, 0xCA8843090D89C, 0x212ED6ACF005C,
	0x83415A220ABF7, 0x0C30061DD4874
};

/*
 * Addition in the field: limbs are added pairwise and carries are NOT
 * propagated. With input limbs of up to 63 bits each, output limbs are
 * at most one bit longer than the input ones.
 */
static inline void
f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
	int i;

	for (i = 0; i < 5; i ++) {
		d[i] = a[i] + b[i];
	}
}

/*
 * Partially reduce the provided value, in place.
 * Input: limbs can go up to 61 bits each.
 * Output: partially reduced.
 */
static inline void
f256_partial_reduce(uint64_t *a)
{
	uint64_t acc, carry, top;
	int i;

	/*
	 * Carry propagation over the four low limbs; the last carry
	 * is accumulated into the top limb.
	 */
	carry = 0;
	for (i = 0; i < 4; i ++) {
		acc = a[i] + carry;
		a[i] = acc & MASK52;
		carry = acc >> 52;
	}
	a[4] += carry;

	/*
	 * Bits of the top limb beyond 2^48 have weight 2^256 and more;
	 * they are reinjected with the rule:
	 *   2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
	 */
	top = a[4] >> 48;                  /* top < 2^14 */
	a[0] += top;                       /* a[0] < 2^52 + 2^14 */
	acc = a[1] - (top << 44);
	a[1] = acc & MASK52;               /* a[1] < 2^52 */
	carry = -(acc >> 52) & 0xFFF;      /* borrow; carry < 16 */
	acc = a[2] - carry;
	a[2] = acc & MASK52;               /* a[2] < 2^52 */
	carry = acc >> 63;                 /* borrow: 0 or 1 */
	acc = a[3] - carry - (top << 36);
	a[3] = acc & MASK52;               /* a[3] < 2^52 */
	carry = acc >> 63;                 /* borrow: 0 or 1 */
	acc = a[4] & MASK48;
	a[4] = acc + (top << 16) - carry;  /* a[4] < 2^48 + 2^30 */
}

/*
 * Sign-extend a 12-bit signed carry (the upper 12 bits of a 64-bit
 * word, after a right shift by 52) to 64 bits. Only unsigned types
 * are used, so that no implementation-defined signed shift or
 * conversion is involved.
 */
static inline uint64_t
f256_sext12(uint64_t v)
{
	return v | -(v & BIT(11));
}

/*
 * Subtraction in the field.
 * Input: limbs must fit on 60 bits each; in particular, the complete
 * integer will be less than 2^268 + 2^217.
 * Output: partially reduced.
 */
static inline void
f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
	uint64_t lim[5], acc, carry, top;
	int i;

	/*
	 * The computed value is 2^13*p + a - b, which keeps the
	 * intermediate value positive. The limbs of 2^13*p are
	 * injected as follows: -2^13 as initial carry, +2^5 in limb 2,
	 * +2^49 in limb 3, and 2^61 - 2^29 in limb 4.
	 *
	 * Each limb difference may be negative, hence the carry is
	 * signed and must be sign-extended.
	 */
	carry = (uint64_t)0 - BIT(13);
	for (i = 0; i < 4; i ++) {
		acc = a[i] - b[i] + carry;
		lim[i] = acc & MASK52;
		carry = f256_sext12(acc >> 52);
	}
	lim[2] += BIT(5);
	lim[3] += BIT(49);
	lim[4] = (BIT(61) - BIT(29)) + a[4] - b[4] + carry;

	/*
	 * Partial reduction, with the rule:
	 *   2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
	 *
	 * At that point:
	 *    0 <= lim[0] <= 2^52 - 1
	 *    0 <= lim[1] <= 2^52 - 1
	 *    2^5 <= lim[2] <= 2^52 + 2^5 - 1
	 *    2^49 <= lim[3] <= 2^52 + 2^49 - 1
	 *    2^59 < lim[4] <= 2^61 + 2^60 - 2^29
	 *
	 * Thus, 'top' (lim[4] / 2^48) is necessarily greater than 2048,
	 * and less than 12288.
	 */
	top = lim[4] >> 48;

	d[0] = lim[0] + top;               /* d[0] <= 2^52 + 12287 */
	acc = lim[1] - (top << 44);
	d[1] = acc & MASK52;               /* d[1] <= 2^52 - 1 */
	carry = -(acc >> 52) & 0xFFF;      /* borrow; carry <= 48 */
	acc = lim[2] - carry;
	carry = acc >> 63;                 /* borrow: 0 or 1 */
	d[2] = acc + (carry << 52);        /* d[2] <= 2^52 + 31 */
	acc = lim[3] - carry - (top << 36);
	carry = acc >> 63;                 /* borrow: 0 or 1 */
	d[3] = acc + (carry << 52);        /* d[3] <= 2^52 + 2^49 - 1 */

	/*
	 * Since top is not zero, (top << 16) is greater than the final
	 * borrow, and the top limb cannot underflow.
	 */
	d[4] = (lim[4] & MASK48) + (top << 16) - carry;
	                                   /* d[4] < 2^48 + 2^30 */
}
+ * Input: limbs must fit on 56 bits each. + * Output: partially reduced. + */ +static void +f256_montymul(uint64_t *d, const uint64_t *a, const uint64_t *b) +{ +#if BR_INT128 + + int i; + uint64_t t[5]; + + t[0] = 0; + t[1] = 0; + t[2] = 0; + t[3] = 0; + t[4] = 0; + for (i = 0; i < 5; i ++) { + uint64_t x, f, cc, w, s; + unsigned __int128 z; + + /* + * Since limbs of a[] and b[] fit on 56 bits each, + * each individual product fits on 112 bits. Also, + * the factor f fits on 52 bits, so f<<48 fits on + * 112 bits too. This guarantees that carries (cc) + * will fit on 62 bits, thus no overflow. + * + * The operations below compute: + * t <- (t + x*b + f*p) / 2^52 + * (one 52-bit limb is dropped per iteration, hence a + * division by 2^260 over the five iterations) + */ + x = a[i]; + z = (unsigned __int128)b[0] * (unsigned __int128)x + + (unsigned __int128)t[0]; + f = (uint64_t)z & MASK52; + cc = (uint64_t)(z >> 52); + z = (unsigned __int128)b[1] * (unsigned __int128)x + + (unsigned __int128)t[1] + cc + + ((unsigned __int128)f << 44); + t[0] = (uint64_t)z & MASK52; + cc = (uint64_t)(z >> 52); + z = (unsigned __int128)b[2] * (unsigned __int128)x + + (unsigned __int128)t[2] + cc; + t[1] = (uint64_t)z & MASK52; + cc = (uint64_t)(z >> 52); + z = (unsigned __int128)b[3] * (unsigned __int128)x + + (unsigned __int128)t[3] + cc + + ((unsigned __int128)f << 36); + t[2] = (uint64_t)z & MASK52; + cc = (uint64_t)(z >> 52); + z = (unsigned __int128)b[4] * (unsigned __int128)x + + (unsigned __int128)t[4] + cc + + ((unsigned __int128)f << 48) + - ((unsigned __int128)f << 16); + t[3] = (uint64_t)z & MASK52; + t[4] = (uint64_t)(z >> 52); + + /* + * t[4] may be up to 62 bits here; we need to do a + * partial reduction. Note that limbs t[0] to t[3] + * fit on 52 bits each.
+ */ + s = t[4] >> 48; /* s < 2^14 */ + t[0] += s; /* t[0] < 2^52 + 2^14 */ + w = t[1] - (s << 44); + t[1] = w & MASK52; /* t[1] < 2^52 */ + cc = -(w >> 52) & 0xFFF; /* cc < 16 */ + w = t[2] - cc; + t[2] = w & MASK52; /* t[2] < 2^52 */ + cc = w >> 63; /* cc = 0 or 1 */ + w = t[3] - cc - (s << 36); + t[3] = w & MASK52; /* t[3] < 2^52 */ + cc = w >> 63; /* cc = 0 or 1 */ + w = t[4] & MASK48; + t[4] = w + (s << 16) - cc; /* t[4] < 2^48 + 2^30 */ + + /* + * The final t[4] cannot overflow because cc is 0 or 1, + * and cc can be 1 only if s != 0. + */ + } + + d[0] = t[0]; + d[1] = t[1]; + d[2] = t[2]; + d[3] = t[3]; + d[4] = t[4]; + +#elif BR_UMUL128 + + int i; + uint64_t t[5]; + + t[0] = 0; + t[1] = 0; + t[2] = 0; + t[3] = 0; + t[4] = 0; + for (i = 0; i < 5; i ++) { + uint64_t x, f, cc, w, s, zh, zl; + unsigned char k; + + /* + * Since limbs of a[] and b[] fit on 56 bits each, + * each individual product fits on 112 bits. Also, + * the factor f fits on 52 bits, so f<<48 fits on + * 112 bits too. This guarantees that carries (cc) + * will fit on 62 bits, thus no overflow.
+ * + * The operations below compute: + * t <- (t + x*b + f*p) / 2^52 + * (one 52-bit limb is dropped per iteration, hence a + * division by 2^260 over the five iterations) + */ + x = a[i]; + zl = _umul128(b[0], x, &zh); + k = _addcarry_u64(0, t[0], zl, &zl); + (void)_addcarry_u64(k, 0, zh, &zh); + f = zl & MASK52; + cc = (zl >> 52) | (zh << 12); + + zl = _umul128(b[1], x, &zh); + k = _addcarry_u64(0, t[1], zl, &zl); + (void)_addcarry_u64(k, 0, zh, &zh); + k = _addcarry_u64(0, cc, zl, &zl); + (void)_addcarry_u64(k, 0, zh, &zh); + k = _addcarry_u64(0, f << 44, zl, &zl); + (void)_addcarry_u64(k, f >> 20, zh, &zh); + t[0] = zl & MASK52; + cc = (zl >> 52) | (zh << 12); + + zl = _umul128(b[2], x, &zh); + k = _addcarry_u64(0, t[2], zl, &zl); + (void)_addcarry_u64(k, 0, zh, &zh); + k = _addcarry_u64(0, cc, zl, &zl); + (void)_addcarry_u64(k, 0, zh, &zh); + t[1] = zl & MASK52; + cc = (zl >> 52) | (zh << 12); + + zl = _umul128(b[3], x, &zh); + k = _addcarry_u64(0, t[3], zl, &zl); + (void)_addcarry_u64(k, 0, zh, &zh); + k = _addcarry_u64(0, cc, zl, &zl); + (void)_addcarry_u64(k, 0, zh, &zh); + k = _addcarry_u64(0, f << 36, zl, &zl); + (void)_addcarry_u64(k, f >> 28, zh, &zh); + t[2] = zl & MASK52; + cc = (zl >> 52) | (zh << 12); + + zl = _umul128(b[4], x, &zh); + k = _addcarry_u64(0, t[4], zl, &zl); + (void)_addcarry_u64(k, 0, zh, &zh); + k = _addcarry_u64(0, cc, zl, &zl); + (void)_addcarry_u64(k, 0, zh, &zh); + k = _addcarry_u64(0, f << 48, zl, &zl); + (void)_addcarry_u64(k, f >> 16, zh, &zh); + k = _subborrow_u64(0, zl, f << 16, &zl); + (void)_subborrow_u64(k, zh, f >> 48, &zh); + t[3] = zl & MASK52; + t[4] = (zl >> 52) | (zh << 12); + + /* + * t[4] may be up to 62 bits here; we need to do a + * partial reduction. Note that limbs t[0] to t[3] + * fit on 52 bits each.
+ */ + s = t[4] >> 48; /* s < 2^14 */ + t[0] += s; /* t[0] < 2^52 + 2^14 */ + w = t[1] - (s << 44); + t[1] = w & MASK52; /* t[1] < 2^52 */ + cc = -(w >> 52) & 0xFFF; /* cc < 16 */ + w = t[2] - cc; + t[2] = w & MASK52; /* t[2] < 2^52 */ + cc = w >> 63; /* cc = 0 or 1 */ + w = t[3] - cc - (s << 36); + t[3] = w & MASK52; /* t[3] < 2^52 */ + cc = w >> 63; /* cc = 0 or 1 */ + w = t[4] & MASK48; + t[4] = w + (s << 16) - cc; /* t[4] < 2^48 + 2^30 */ + + /* + * The final t[4] cannot overflow because cc is 0 or 1, + * and cc can be 1 only if s != 0. + */ + } + + d[0] = t[0]; + d[1] = t[1]; + d[2] = t[2]; + d[3] = t[3]; + d[4] = t[4]; + +#endif +} + +/* + * Montgomery squaring in the field; currently a basic wrapper around + * multiplication (inline, should be optimized away). + * TODO: see if some extra speed can be gained here. + */ +static inline void +f256_montysquare(uint64_t *d, const uint64_t *a) +{ + f256_montymul(d, a, a); +} + +/* + * Convert to Montgomery representation. + */ +static void +f256_tomonty(uint64_t *d, const uint64_t *a) +{ + /* + * R2 = 2^520 mod p. + * If R = 2^260 mod p, then R2 = R^2 mod p; and the Montgomery + * multiplication of a by R2 is: a*R2/R = a*R mod p, i.e. the + * conversion to Montgomery representation. + */ + static const uint64_t R2[] = { + 0x0000000000300, 0xFFFFFFFF00000, 0xFFFFEFFFFFFFB, + 0xFDFFFFFFFFFFF, 0x0000004FFFFFF + }; + + f256_montymul(d, a, R2); +} + +/* + * Convert from Montgomery representation. + */ +static void +f256_frommonty(uint64_t *d, const uint64_t *a) +{ + /* + * Montgomery multiplication by 1 is division by 2^260 modulo p. + */ + static const uint64_t one[] = { 1, 0, 0, 0, 0 }; + + f256_montymul(d, a, one); +} + +/* + * Inversion in the field. If the source value is 0 modulo p, then this + * returns 0 or p. This function uses Montgomery representation. + */ +static void +f256_invert(uint64_t *d, const uint64_t *a) +{ + /* + * We compute a^(p-2) mod p. 
The exponent pattern (from high to + * low) is: + * - 32 bits of value 1 + * - 31 bits of value 0 + * - 1 bit of value 1 + * - 96 bits of value 0 + * - 94 bits of value 1 + * - 1 bit of value 0 + * - 1 bit of value 1 + * To speed up the square-and-multiply algorithm, we precompute + * a^(2^31-1). + */ + + uint64_t r[5], t[5]; + int i; + + memcpy(t, a, sizeof t); + for (i = 0; i < 30; i ++) { + f256_montysquare(t, t); + f256_montymul(t, t, a); + } + + memcpy(r, t, sizeof t); + for (i = 224; i >= 0; i --) { + f256_montysquare(r, r); + switch (i) { + case 0: + case 2: + case 192: + case 224: + f256_montymul(r, r, a); + break; + case 3: + case 34: + case 65: + f256_montymul(r, r, t); + break; + } + } + memcpy(d, r, sizeof r); +} + +/* + * Finalize reduction. + * Input value should be partially reduced. + * On output, limbs a[0] to a[3] fit on 52 bits each, limb a[4] fits + * on 48 bits, and the integer is less than p. + */ +static inline void +f256_final_reduce(uint64_t *a) +{ + uint64_t r[5], t[5], w, cc; + int i; + + /* + * Propagate carries to ensure that limbs 0 to 3 fit on 52 bits. + */ + cc = 0; + for (i = 0; i < 5; i ++) { + w = a[i] + cc; + r[i] = w & MASK52; + cc = w >> 52; + } + + /* + * We compute t = r + (2^256 - p) = r + 2^224 - 2^192 - 2^96 + 1. + * If t < 2^256, then r < p, and we return r. Otherwise, we + * want to return r - p = t - 2^256. + */ + + /* + * Add 2^224 + 1, and propagate carries to ensure that limbs + * t[0] to t[3] fit in 52 bits each. + */ + w = r[0] + 1; + t[0] = w & MASK52; + cc = w >> 52; + w = r[1] + cc; + t[1] = w & MASK52; + cc = w >> 52; + w = r[2] + cc; + t[2] = w & MASK52; + cc = w >> 52; + w = r[3] + cc; + t[3] = w & MASK52; + cc = w >> 52; + t[4] = r[4] + cc + BIT(16); + + /* + * Subtract 2^192 + 2^96. Since we just added 2^224 + 1, the + * result cannot be negative. 
+ */ + w = t[1] - BIT(44); + t[1] = w & MASK52; + cc = w >> 63; + w = t[2] - cc; + t[2] = w & MASK52; + cc = w >> 63; + w = t[3] - BIT(36); + t[3] = w & MASK52; + cc = w >> 63; + t[4] -= cc; + + /* + * If the top limb t[4] fits on 48 bits, then r[] is already + * in the proper range. Otherwise, t[] is the value to return + * (truncated to 256 bits). + */ + cc = -(t[4] >> 48); + t[4] &= MASK48; + for (i = 0; i < 5; i ++) { + a[i] = r[i] ^ (cc & (r[i] ^ t[i])); + } +} + +/* + * Points in affine and Jacobian coordinates. + * + * - In affine coordinates, the point-at-infinity cannot be encoded. + * - Jacobian coordinates (X,Y,Z) correspond to affine (X/Z^2,Y/Z^3); + * if Z = 0 then this is the point-at-infinity. + */ +typedef struct { + uint64_t x[5]; + uint64_t y[5]; +} p256_affine; + +typedef struct { + uint64_t x[5]; + uint64_t y[5]; + uint64_t z[5]; +} p256_jacobian; + +/* + * Decode a field element (unsigned big endian notation). + */ +static void +f256_decode(uint64_t *a, const unsigned char *buf) +{ + uint64_t w0, w1, w2, w3; + + w3 = br_dec64be(buf + 0); + w2 = br_dec64be(buf + 8); + w1 = br_dec64be(buf + 16); + w0 = br_dec64be(buf + 24); + a[0] = w0 & MASK52; + a[1] = ((w0 >> 52) | (w1 << 12)) & MASK52; + a[2] = ((w1 >> 40) | (w2 << 24)) & MASK52; + a[3] = ((w2 >> 28) | (w3 << 36)) & MASK52; + a[4] = w3 >> 16; +} + +/* + * Encode a field element (unsigned big endian notation). The field + * element MUST be fully reduced. + */ +static void +f256_encode(unsigned char *buf, const uint64_t *a) +{ + uint64_t w0, w1, w2, w3; + + w0 = a[0] | (a[1] << 52); + w1 = (a[1] >> 12) | (a[2] << 40); + w2 = (a[2] >> 24) | (a[3] << 28); + w3 = (a[3] >> 36) | (a[4] << 16); + br_enc64be(buf + 0, w3); + br_enc64be(buf + 8, w2); + br_enc64be(buf + 16, w1); + br_enc64be(buf + 24, w0); +} + +/* + * Decode a point. The returned point is in Jacobian coordinates, but + * with z = 1. 
If the encoding is invalid, or encodes a point which is + * not on the curve, or encodes the point at infinity, then this function + * returns 0. Otherwise, 1 is returned. + * + * The buffer is assumed to have length exactly 65 bytes. + */ +static uint32_t +point_decode(p256_jacobian *P, const unsigned char *buf) +{ + uint64_t x[5], y[5], t[5], x3[5], tt; + uint32_t r; + + /* + * Header byte shall be 0x04. + */ + r = EQ(buf[0], 0x04); + + /* + * Decode X and Y coordinates, and convert them into + * Montgomery representation. + */ + f256_decode(x, buf + 1); + f256_decode(y, buf + 33); + f256_tomonty(x, x); + f256_tomonty(y, y); + + /* + * Verify y^2 = x^3 + A*x + B. In curve P-256, A = -3. + * Note that the Montgomery representation of 0 is 0. We must + * take care to apply the final reduction to make sure we have + * 0 and not p. + */ + f256_montysquare(t, y); + f256_montysquare(x3, x); + f256_montymul(x3, x3, x); + f256_sub(t, t, x3); + f256_add(t, t, x); + f256_add(t, t, x); + f256_add(t, t, x); + f256_sub(t, t, P256_B_MONTY); + f256_final_reduce(t); + tt = t[0] | t[1] | t[2] | t[3] | t[4]; + r &= EQ((uint32_t)(tt | (tt >> 32)), 0); + + /* + * Return the point in Jacobian coordinates (and Montgomery + * representation). + */ + memcpy(P->x, x, sizeof x); + memcpy(P->y, y, sizeof y); + memcpy(P->z, F256_R, sizeof F256_R); + return r; +} + +/* + * Final conversion for a point: + * - The point is converted back to affine coordinates. + * - Final reduction is performed. + * - The point is encoded into the provided buffer. + * + * If the point is the point-at-infinity, all operations are performed, + * but the buffer contents are indeterminate, and 0 is returned. Otherwise, + * the encoded point is written in the buffer, and 1 is returned. + */ +static uint32_t +point_encode(unsigned char *buf, const p256_jacobian *P) +{ + uint64_t t1[5], t2[5], z; + + /* Set t1 = 1/z^2 and t2 = 1/z^3. 
*/ + f256_invert(t2, P->z); + f256_montysquare(t1, t2); + f256_montymul(t2, t2, t1); + + /* Compute affine coordinates x (in t1) and y (in t2). */ + f256_montymul(t1, P->x, t1); + f256_montymul(t2, P->y, t2); + + /* Convert back from Montgomery representation, and finalize + reductions. */ + f256_frommonty(t1, t1); + f256_frommonty(t2, t2); + f256_final_reduce(t1); + f256_final_reduce(t2); + + /* Encode. */ + buf[0] = 0x04; + f256_encode(buf + 1, t1); + f256_encode(buf + 33, t2); + + /* Return success if and only if P->z != 0. */ + z = P->z[0] | P->z[1] | P->z[2] | P->z[3] | P->z[4]; + return NEQ((uint32_t)(z | z >> 32), 0); +} + +/* + * Point doubling in Jacobian coordinates: point P is doubled. + * Note: if the source point is the point-at-infinity, then the result is + * still the point-at-infinity, which is correct. Moreover, if the three + * coordinates were zero, then they still are zero in the returned value. + */ +static void +p256_double(p256_jacobian *P) +{ + /* + * Doubling formulas are: + * + * s = 4*x*y^2 + * m = 3*(x + z^2)*(x - z^2) + * x' = m^2 - 2*s + * y' = m*(s - x') - 8*y^4 + * z' = 2*y*z + * + * These formulas work for all points, including points of order 2 + * and points at infinity: + * - If y = 0 then z' = 0. But there is no such point in P-256 + * anyway. + * - If z = 0 then z' = 0. + */ + uint64_t t1[5], t2[5], t3[5], t4[5]; + + /* + * Compute z^2 in t1. + */ + f256_montysquare(t1, P->z); + + /* + * Compute x+z^2 in t2 and x-z^2 in t1. + */ + f256_add(t2, P->x, t1); + f256_sub(t1, P->x, t1); + + /* + * Compute m = 3*(x+z^2)*(x-z^2) in t1. + */ + f256_montymul(t3, t1, t2); + f256_add(t1, t3, t3); + f256_add(t1, t3, t1); + + /* + * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3). + */ + f256_montysquare(t3, P->y); + f256_add(t3, t3, t3); + f256_montymul(t2, P->x, t3); + f256_add(t2, t2, t2); + + /* + * Compute x' = m^2 - 2*s. + */ + f256_montysquare(P->x, t1); + f256_sub(P->x, P->x, t2); + f256_sub(P->x, P->x, t2); + + /* + * Compute z' = 2*y*z.
+ */ + f256_montymul(t4, P->y, P->z); + f256_add(P->z, t4, t4); + f256_partial_reduce(P->z); + + /* + * Compute y' = m*(s - x') - 8*y^4. Note that we already have + * 2*y^2 in t3. + */ + f256_sub(t2, t2, P->x); + f256_montymul(P->y, t1, t2); + f256_montysquare(t4, t3); + f256_add(t4, t4, t4); + f256_sub(P->y, P->y, t4); +} + +/* + * Point addition (Jacobian coordinates): P1 is replaced with P1+P2. + * This function computes the wrong result in the following cases: + * + * - If P1 == 0 but P2 != 0 + * - If P1 != 0 but P2 == 0 + * - If P1 == P2 + * + * In all three cases, P1 is set to the point at infinity. + * + * Returned value is 0 if one of the following occurs: + * + * - P1 and P2 have the same Y coordinate. + * - P1 == 0 and P2 == 0. + * - The Y coordinate of one of the points is 0 and the other point is + * the point at infinity. + * + * The third case cannot actually happen with valid points, since a point + * with Y == 0 is a point of order 2, and there is no point of order 2 on + * curve P-256. + * + * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller + * can apply the following: + * + * - If the result is not the point at infinity, then it is correct. + * - Otherwise, if the returned value is 1, then this is a case of + * P1+P2 == 0, so the result is indeed the point at infinity. + * - Otherwise, P1 == P2, so a "double" operation should have been + * performed. + * + * Note that you can get a returned value of 0 with a correct result, + * e.g. if P1 and P2 have the same Y coordinate, but distinct X coordinates. 
+ */ +static uint32_t +p256_add(p256_jacobian *P1, const p256_jacobian *P2) +{ + /* + * Addition formulas are: + * + * u1 = x1 * z2^2 + * u2 = x2 * z1^2 + * s1 = y1 * z2^3 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 * z2 + */ + uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt; + uint32_t ret; + + /* + * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3). + */ + f256_montysquare(t3, P2->z); + f256_montymul(t1, P1->x, t3); + f256_montymul(t4, P2->z, t3); + f256_montymul(t3, P1->y, t4); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + f256_montysquare(t4, P1->z); + f256_montymul(t2, P2->x, t4); + f256_montymul(t5, P1->z, t4); + f256_montymul(t4, P2->y, t5); + + /* + * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4). + * We need to test whether r is zero, so we fully reduce r + * before testing its limbs. + */ + f256_sub(t2, t2, t1); + f256_sub(t4, t4, t3); + f256_final_reduce(t4); + tt = t4[0] | t4[1] | t4[2] | t4[3] | t4[4]; + ret = (uint32_t)(tt | (tt >> 32)); + ret = (ret | -ret) >> 31; /* ret = 1 if r != 0, 0 if r == 0 */ + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5); + */ + f256_montysquare(t7, t2); + f256_montymul(t6, t1, t7); + f256_montymul(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + */ + f256_montysquare(P1->x, t4); + f256_sub(P1->x, P1->x, t5); + f256_sub(P1->x, P1->x, t6); + f256_sub(P1->x, P1->x, t6); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + f256_sub(t6, t6, P1->x); + f256_montymul(P1->y, t4, t6); + f256_montymul(t1, t5, t3); + f256_sub(P1->y, P1->y, t1); + + /* + * Compute z3 = h*z1*z2. + */ + f256_montymul(t1, P1->z, P2->z); + f256_montymul(P1->z, t1, t2); + + return ret; +} + +/* + * Point addition (mixed coordinates): P1 is replaced with P1+P2. + * This is a specialised function for the case when P2 is a non-zero point + * in affine coordinates.
+ * + * This function computes the wrong result in the following cases: + * + * - If P1 == 0 + * - If P1 == P2 + * + * In both cases, P1 is set to the point at infinity. + * + * Returned value is 0 if one of the following occurs: + * + * - P1 and P2 have the same Y (affine) coordinate. + * - The Y coordinate of P2 is 0 and P1 is the point at infinity. + * + * The second case cannot actually happen with valid points, since a point + * with Y == 0 is a point of order 2, and there is no point of order 2 on + * curve P-256. + * + * Therefore, assuming that P1 != 0 on input, then the caller + * can apply the following: + * + * - If the result is not the point at infinity, then it is correct. + * - Otherwise, if the returned value is 1, then this is a case of + * P1+P2 == 0, so the result is indeed the point at infinity. + * - Otherwise, P1 == P2, so a "double" operation should have been + * performed. + * + * Again, a value of 0 may be returned in some cases where the addition + * result is correct. + */ +static uint32_t +p256_add_mixed(p256_jacobian *P1, const p256_affine *P2) +{ + /* + * Addition formulas are: + * + * u1 = x1 + * u2 = x2 * z1^2 + * s1 = y1 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 + */ + uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt; + uint32_t ret; + + /* + * Compute u1 = x1 (in t1) and s1 = y1 (in t3). + */ + memcpy(t1, P1->x, sizeof t1); + memcpy(t3, P1->y, sizeof t3); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + f256_montysquare(t4, P1->z); + f256_montymul(t2, P2->x, t4); + f256_montymul(t5, P1->z, t4); + f256_montymul(t4, P2->y, t5); + + /* + * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4). + * We need to test whether r is zero, so we fully reduce r + * before testing its limbs.
+ */ + f256_sub(t2, t2, t1); + f256_sub(t4, t4, t3); + f256_final_reduce(t4); + tt = t4[0] | t4[1] | t4[2] | t4[3] | t4[4]; + ret = (uint32_t)(tt | (tt >> 32)); + ret = (ret | -ret) >> 31; /* ret = 1 if r != 0, 0 if r == 0 */ + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5); + */ + f256_montysquare(t7, t2); + f256_montymul(t6, t1, t7); + f256_montymul(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + */ + f256_montysquare(P1->x, t4); + f256_sub(P1->x, P1->x, t5); + f256_sub(P1->x, P1->x, t6); + f256_sub(P1->x, P1->x, t6); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + f256_sub(t6, t6, P1->x); + f256_montymul(P1->y, t4, t6); + f256_montymul(t1, t5, t3); + f256_sub(P1->y, P1->y, t1); + + /* + * Compute z3 = h*z1 (P2 is affine, so z2 is implicitly 1). + */ + f256_montymul(P1->z, P1->z, t2); + + return ret; +} + +#if 0 +/* unused */ +/* + * Point addition (mixed coordinates, complete): P1 is replaced with P1+P2. + * This is a specialised function for the case when P2 is a non-zero point + * in affine coordinates. + * + * This function returns the correct result in all cases. + */ +static uint32_t +p256_add_complete_mixed(p256_jacobian *P1, const p256_affine *P2) +{ + /* + * Addtions formulas, in the general case, are: + * + * u1 = x1 + * u2 = x2 * z1^2 + * s1 = y1 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 + * + * These formulas mishandle the two following cases: + * + * - If P1 is the point-at-infinity (z1 = 0), then z3 is + * incorrectly set to 0. + * + * - If P1 = P2, then u1 = u2 and s1 = s2, and x3, y3 and z3 + * are all set to 0. + * + * However, if P1 + P2 = 0, then u1 = u2 but s1 != s2, and then + * we correctly get z3 = 0 (the point-at-infinity). + * + * To fix the case P1 = 0, we perform at the end a copy of P2 + * over P1, conditional to z1 = 0. + * + * For P1 = P2: in that case, both h and r are set to 0, and + * we get x3, y3 and z3 equal to 0.
We can test for that + * occurrence to make a mask which will be all-one if P1 = P2, + * or all-zero otherwise; then we can compute the double of P2 + * and add it, combined with the mask, to (x3,y3,z3). + * + * Using the doubling formulas in p256_double() on (x2,y2), + * simplifying since P2 is affine (i.e. z2 = 1, implicitly), + * we get: + * s = 4*x2*y2^2 + * m = 3*(x2 + 1)*(x2 - 1) + * x' = m^2 - 2*s + * y' = m*(s - x') - 8*y2^4 + * z' = 2*y2 + * which requires only 6 multiplications. Added to the 11 + * multiplications of the normal mixed addition in Jacobian + * coordinates, we get a cost of 17 multiplications in total. + */ + uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt, zz; + int i; + + /* + * Set zz to -1 if P1 is the point at infinity, 0 otherwise. + */ + zz = P1->z[0] | P1->z[1] | P1->z[2] | P1->z[3] | P1->z[4]; + zz = ((zz | -zz) >> 63) - (uint64_t)1; + + /* + * Compute u1 = x1 (in t1) and s1 = y1 (in t3). + */ + memcpy(t1, P1->x, sizeof t1); + memcpy(t3, P1->y, sizeof t3); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + f256_montysquare(t4, P1->z); + f256_montymul(t2, P2->x, t4); + f256_montymul(t5, P1->z, t4); + f256_montymul(t4, P2->y, t5); + + /* + * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4). + * reduce. + */ + f256_sub(t2, t2, t1); + f256_sub(t4, t4, t3); + + /* + * If both h = 0 and r = 0, then P1 = P2, and we want to set + * the mask tt to -1; otherwise, the mask will be 0. + */ + f256_final_reduce(t2); + f256_final_reduce(t4); + tt = t2[0] | t2[1] | t2[2] | t2[3] | t2[4] + | t4[0] | t4[1] | t4[2] | t4[3] | t4[4]; + tt = ((tt | -tt) >> 63) - (uint64_t)1; + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5); + */ + f256_montysquare(t7, t2); + f256_montymul(t6, t1, t7); + f256_montymul(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. 
+ */ + f256_montysquare(P1->x, t4); + f256_sub(P1->x, P1->x, t5); + f256_sub(P1->x, P1->x, t6); + f256_sub(P1->x, P1->x, t6); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + f256_sub(t6, t6, P1->x); + f256_montymul(P1->y, t4, t6); + f256_montymul(t1, t5, t3); + f256_sub(P1->y, P1->y, t1); + + /* + * Compute z3 = h*z1. + */ + f256_montymul(P1->z, P1->z, t2); + + /* + * The "double" result, in case P1 = P2. + */ + + /* + * Compute z' = 2*y2 (in t1). + */ + f256_add(t1, P2->y, P2->y); + f256_partial_reduce(t1); + + /* + * Compute 2*(y2^2) (in t2) and s = 4*x2*(y2^2) (in t3). + */ + f256_montysquare(t2, P2->y); + f256_add(t2, t2, t2); + f256_add(t3, t2, t2); + f256_montymul(t3, P2->x, t3); + + /* + * Compute m = 3*(x2^2 - 1) (in t4). + */ + f256_montysquare(t4, P2->x); + f256_sub(t4, t4, F256_R); + f256_add(t5, t4, t4); + f256_add(t4, t4, t5); + + /* + * Compute x' = m^2 - 2*s (in t5). + */ + f256_montysquare(t5, t4); + f256_sub(t5, t3); + f256_sub(t5, t3); + + /* + * Compute y' = m*(s - x') - 8*y2^4 (in t6). + */ + f256_sub(t6, t3, t5); + f256_montymul(t6, t6, t4); + f256_montysquare(t7, t2); + f256_sub(t6, t6, t7); + f256_sub(t6, t6, t7); + + /* + * We now have the alternate (doubling) coordinates in (t5,t6,t1). + * We combine them with (x3,y3,z3). + */ + for (i = 0; i < 5; i ++) { + P1->x[i] |= tt & t5[i]; + P1->y[i] |= tt & t6[i]; + P1->z[i] |= tt & t1[i]; + } + + /* + * If P1 = 0, then we get z3 = 0 (which is invalid); if z1 is 0, + * then we want to replace the result with a copy of P2. The + * test on z1 was done at the start, in the zz mask. + */ + for (i = 0; i < 5; i ++) { + P1->x[i] ^= zz & (P1->x[i] ^ P2->x[i]); + P1->y[i] ^= zz & (P1->y[i] ^ P2->y[i]); + P1->z[i] ^= zz & (P1->z[i] ^ F256_R[i]); + } +} +#endif + +/* + * Inner function for computing a point multiplication. A window is + * provided, with points 1*P to 15*P in affine coordinates. + * + * Assumptions: + * - All provided points are valid points on the curve. 
+ * - Multiplier is non-zero, and smaller than the curve order. + * - Everything is in Montgomery representation. + */ +static void +point_mul_inner(p256_jacobian *R, const p256_affine *W, + const unsigned char *k, size_t klen) +{ + p256_jacobian Q; + uint32_t qz; + + memset(&Q, 0, sizeof Q); + qz = 1; + while (klen -- > 0) { + int i; + unsigned bk; + + bk = *k ++; + for (i = 0; i < 2; i ++) { + uint32_t bits; + uint32_t bnz; + p256_affine T; + p256_jacobian U; + uint32_t n; + int j; + uint64_t m; + + p256_double(&Q); + p256_double(&Q); + p256_double(&Q); + p256_double(&Q); + bits = (bk >> 4) & 0x0F; + bnz = NEQ(bits, 0); + + /* + * Lookup point in window. If the bits are 0, + * we get something invalid, which is not a + * problem because we will use it only if the + * bits are non-zero. + */ + memset(&T, 0, sizeof T); + for (n = 0; n < 15; n ++) { + m = -(uint64_t)EQ(bits, n + 1); + T.x[0] |= m & W[n].x[0]; + T.x[1] |= m & W[n].x[1]; + T.x[2] |= m & W[n].x[2]; + T.x[3] |= m & W[n].x[3]; + T.x[4] |= m & W[n].x[4]; + T.y[0] |= m & W[n].y[0]; + T.y[1] |= m & W[n].y[1]; + T.y[2] |= m & W[n].y[2]; + T.y[3] |= m & W[n].y[3]; + T.y[4] |= m & W[n].y[4]; + } + + U = Q; + p256_add_mixed(&U, &T); + + /* + * If qz is still 1, then Q was all-zeros, and this + * is conserved through p256_double(). + */ + m = -(uint64_t)(bnz & qz); + for (j = 0; j < 5; j ++) { + Q.x[j] ^= m & (Q.x[j] ^ T.x[j]); + Q.y[j] ^= m & (Q.y[j] ^ T.y[j]); + Q.z[j] ^= m & (Q.z[j] ^ F256_R[j]); + } + CCOPY(bnz & ~qz, &Q, &U, sizeof Q); + qz &= ~bnz; + bk <<= 4; + } + } + *R = Q; +} + +/* + * Convert a window from Jacobian to affine coordinates. A single + * field inversion is used. This function works for windows up to + * 32 elements. + * + * The destination array (aff[]) and the source array (jac[]) may + * overlap, provided that the start of aff[] is not after the start of + * jac[]. Even if the arrays do _not_ overlap, the source array is + * modified. 
+ */ +static void +window_to_affine(p256_affine *aff, p256_jacobian *jac, int num) +{ + /* + * Convert the window points to affine coordinates. We use the + * following trick to mutualize the inversion computation: if + * we have z1, z2, z3, and z4, and want to invert all of them, + * we compute u = 1/(z1*z2*z3*z4), and then we have: + * 1/z1 = u*z2*z3*z4 + * 1/z2 = u*z1*z3*z4 + * 1/z3 = u*z1*z2*z4 + * 1/z4 = u*z1*z2*z3 + * + * The partial products are computed recursively: + * + * - on input (z_1,z_2), return (z_2,z_1) and z_1*z_2 + * - on input (z_1,z_2,... z_n): + * recurse on (z_1,z_2,... z_(n/2)) -> r1 and m1 + * recurse on (z_(n/2+1),z_(n/2+2)... z_n) -> r2 and m2 + * multiply elements of r1 by m2 -> s1 + * multiply elements of r2 by m1 -> s2 + * return s1||s2 and m1*m2 + * + * In the example below, we suppose that we have 14 elements. + * Let z1, z2,... zE be the 14 values to invert (index noted in + * hexadecimal, starting at 1). + * + * - Depth 1: + * swap(z1, z2); z12 = z1*z2 + * swap(z3, z4); z34 = z3*z4 + * swap(z5, z6); z56 = z5*z6 + * swap(z7, z8); z78 = z7*z8 + * swap(z9, zA); z9A = z9*zA + * swap(zB, zC); zBC = zB*zC + * swap(zD, zE); zDE = zD*zE + * + * - Depth 2: + * z1 <- z1*z34, z2 <- z2*z34, z3 <- z3*z12, z4 <- z4*z12 + * z1234 = z12*z34 + * z5 <- z5*z78, z6 <- z6*z78, z7 <- z7*z56, z8 <- z8*z56 + * z5678 = z56*z78 + * z9 <- z9*zBC, zA <- zA*zBC, zB <- zB*z9A, zC <- zC*z9A + * z9ABC = z9A*zBC + * + * - Depth 3: + * z1 <- z1*z5678, z2 <- z2*z5678, z3 <- z3*z5678, z4 <- z4*z5678 + * z5 <- z5*z1234, z6 <- z6*z1234, z7 <- z7*z1234, z8 <- z8*z1234 + * z12345678 = z1234*z5678 + * z9 <- z9*zDE, zA <- zA*zDE, zB <- zB*zDE, zC <- zC*zDE + * zD <- zD*z9ABC, zE <- zE*z9ABC + * z9ABCDE = z9ABC*zDE + * + * - Depth 4: + * multiply z1..z8 by z9ABCDE + * multiply z9..zE by z12345678 + * final z = z12345678*z9ABCDE + */ + + uint64_t z[16][5]; + int i, k, s; +#define zt (z[15]) +#define zu (z[14]) +#define zv (z[13]) + + /* + * First recursion step (pairwise swapping
and multiplication). + * If there is an odd number of elements, then we "invent" an + * extra one with coordinate Z = 1 (in Montgomery representation). + */ + for (i = 0; (i + 1) < num; i += 2) { + memcpy(zt, jac[i].z, sizeof zt); + memcpy(jac[i].z, jac[i + 1].z, sizeof zt); + memcpy(jac[i + 1].z, zt, sizeof zt); + f256_montymul(z[i >> 1], jac[i].z, jac[i + 1].z); + } + if ((num & 1) != 0) { + memcpy(z[num >> 1], jac[num - 1].z, sizeof zt); + memcpy(jac[num - 1].z, F256_R, sizeof F256_R); + } + + /* + * Perform further recursion steps. At the entry of each step, + * the process has been done for groups of 's' points. The + * integer k is the log2 of s; n is the number of groups (and of + * valid entries z[0] to z[n-1]) at the entry of the step. + * + * NOTE(review): when n is odd, the last group has no partner: + * its points are multiplied by z[n], and z[n] is also what gets + * moved to z[n >> 1], although the last entry written so far is + * z[n - 1]. This looks wrong for such sizes (e.g. num = 14, the + * size used in the example above). The call in p256_mul() passes + * num = 15, for which n is always even (8, 4, 2) -- confirm + * before calling with any other size. + */ + for (k = 1, s = 2; s < num; k ++, s <<= 1) { + int n; + + for (i = 0; i < num; i ++) { + f256_montymul(jac[i].z, jac[i].z, z[(i >> k) ^ 1]); + } + n = (num + s - 1) >> k; + for (i = 0; i < (n >> 1); i ++) { + f256_montymul(z[i], z[i << 1], z[(i << 1) + 1]); + } + if ((n & 1) != 0) { + memmove(z[n >> 1], z[n], sizeof zt); + } + } + + /* + * Invert the final result, and convert all points. + */ + f256_invert(zt, z[0]); + for (i = 0; i < num; i ++) { + f256_montymul(zv, jac[i].z, zt); + f256_montysquare(zu, zv); + f256_montymul(zv, zv, zu); + f256_montymul(aff[i].x, jac[i].x, zu); + f256_montymul(aff[i].y, jac[i].y, zv); + } +} + +/* + * Multiply the provided point by an integer. + * Assumptions: + * - Source point is a valid curve point. + * - Source point is not the point-at-infinity. + * - Integer is not 0, and is lower than the curve order. + * If these conditions are not met, then the result is indeterminate + * (but the process is still constant-time). + */ +static void +p256_mul(p256_jacobian *P, const unsigned char *k, size_t klen) +{ + union { + p256_affine aff[15]; + p256_jacobian jac[15]; + } window; + int i; + + /* + * Compute window, in Jacobian coordinates.
+ */ + window.jac[0] = *P; + for (i = 2; i < 16; i ++) { + window.jac[i - 1] = window.jac[(i >> 1) - 1]; + if ((i & 1) == 0) { + p256_double(&window.jac[i - 1]); + } else { + p256_add(&window.jac[i - 1], &window.jac[i >> 1]); + } + } + + /* + * Convert the window points to affine coordinates. Point + * window[0] is the source point, already in affine coordinates. + */ + window_to_affine(window.aff, window.jac, 15); + + /* + * Perform point multiplication. + */ + point_mul_inner(P, window.aff, k, klen); +} + +/* + * Precomputed window for the conventional generator: P256_Gwin[n] + * contains (n+1)*G (affine coordinates, in Montgomery representation). + */ +static const p256_affine P256_Gwin[] = { + { + { 0x30D418A9143C1, 0xC4FEDB60179E7, 0x62251075BA95F, + 0x5C669FB732B77, 0x08905F76B5375 }, + { 0x5357CE95560A8, 0x43A19E45CDDF2, 0x21F3258B4AB8E, + 0xD8552E88688DD, 0x0571FF18A5885 } + }, + { + { 0x46D410DDD64DF, 0x0B433827D8500, 0x1490D9AA6AE3C, + 0xA3A832205038D, 0x06BB32E52DCF3 }, + { 0x48D361BEE1A57, 0xB7B236FF82F36, 0x042DBE152CD7C, + 0xA3AA9A8FB0E92, 0x08C577517A5B8 } + }, + { + { 0x3F904EEBC1272, 0x9E87D81FBFFAC, 0xCBBC98B027F84, + 0x47E46AD77DD87, 0x06936A3FD6FF7 }, + { 0x5C1FC983A7EBD, 0xC3861FE1AB04C, 0x2EE98E583E47A, + 0xC06A88208311A, 0x05F06A2AB587C } + }, + { + { 0xB50D46918DCC5, 0xD7623C17374B0, 0x100AF24650A6E, + 0x76ABCDAACACE8, 0x077362F591B01 }, + { 0xF24CE4CBABA68, 0x17AD6F4472D96, 0xDDD22E1762847, + 0x862EB6C36DEE5, 0x04B14C39CC5AB } + }, + { + { 0x8AAEC45C61F5C, 0x9D4B9537DBE1B, 0x76C20C90EC649, + 0x3C7D41CB5AAD0, 0x0907960649052 }, + { 0x9B4AE7BA4F107, 0xF75EB882BEB30, 0x7A1F6873C568E, + 0x915C540A9877E, 0x03A076BB9DD1E } + }, + { + { 0x47373E77664A1, 0xF246CEE3E4039, 0x17A3AD55AE744, + 0x673C50A961A5B, 0x03074B5964213 }, + { 0x6220D377E44BA, 0x30DFF14B593D3, 0x639F11299C2B5, + 0x75F5424D44CEF, 0x04C9916DEA07F } + }, + { + { 0x354EA0173B4F1, 0x3C23C00F70746, 0x23BB082BD2021, + 0xE03E43EAAB50C, 0x03BA5119D3123 }, + { 0xD0303F5B9D4DE, 
0x17DA67BDD2847, 0xC941956742F2F, + 0x8670F933BDC77, 0x0AEDD9164E240 } + }, + { + { 0x4CD19499A78FB, 0x4BF9B345527F1, 0x2CFC6B462AB5C, + 0x30CDF90F02AF0, 0x0763891F62652 }, + { 0xA3A9532D49775, 0xD7F9EBA15F59D, 0x60BBF021E3327, + 0xF75C23C7B84BE, 0x06EC12F2C706D } + }, + { + { 0x6E8F264E20E8E, 0xC79A7A84175C9, 0xC8EB00ABE6BFE, + 0x16A4CC09C0444, 0x005B3081D0C4E }, + { 0x777AA45F33140, 0xDCE5D45E31EB7, 0xB12F1A56AF7BE, + 0xF9B2B6E019A88, 0x086659CDFD835 } + }, + { + { 0xDBD19DC21EC8C, 0x94FCF81392C18, 0x250B4998F9868, + 0x28EB37D2CD648, 0x0C61C947E4B34 }, + { 0x407880DD9E767, 0x0C83FBE080C2B, 0x9BE5D2C43A899, + 0xAB4EF7D2D6577, 0x08719A555B3B4 } + }, + { + { 0x260A6245E4043, 0x53E7FDFE0EA7D, 0xAC1AB59DE4079, + 0x072EFF3A4158D, 0x0E7090F1949C9 }, + { 0x85612B944E886, 0xE857F61C81A76, 0xAD643D250F939, + 0x88DAC0DAA891E, 0x089300244125B } + }, + { + { 0x1AA7D26977684, 0x58A345A3304B7, 0x37385EABDEDEF, + 0x155E409D29DEE, 0x0EE1DF780B83E }, + { 0x12D91CBB5B437, 0x65A8956370CAC, 0xDE6D66170ED2F, + 0xAC9B8228CFA8A, 0x0FF57C95C3238 } + }, + { + { 0x25634B2ED7097, 0x9156FD30DCCC4, 0x9E98110E35676, + 0x7594CBCD43F55, 0x038477ACC395B }, + { 0x2B90C00EE17FF, 0xF842ED2E33575, 0x1F5BC16874838, + 0x7968CD06422BD, 0x0BC0876AB9E7B } + }, + { + { 0xA35BB0CF664AF, 0x68F9707E3A242, 0x832660126E48F, + 0x72D2717BF54C6, 0x0AAE7333ED12C }, + { 0x2DB7995D586B1, 0xE732237C227B5, 0x65E7DBBE29569, + 0xBBBD8E4193E2A, 0x052706DC3EAA1 } + }, + { + { 0xD8B7BC60055BE, 0xD76E27E4B72BC, 0x81937003CC23E, + 0xA090E337424E4, 0x02AA0E43EAD3D }, + { 0x524F6383C45D2, 0x422A41B2540B8, 0x8A4797D766355, + 0xDF444EFA6DE77, 0x0042170A9079A } + }, +}; + +/* + * Multiply the conventional generator of the curve by the provided + * integer. Return is written in *P. + * + * Assumptions: + * - Integer is not 0, and is lower than the curve order. + * If this conditions is not met, then the result is indeterminate + * (but the process is still constant-time). 
+ */ +static void +p256_mulgen(p256_jacobian *P, const unsigned char *k, size_t klen) +{ + point_mul_inner(P, P256_Gwin, k, klen); +} + +/* + * Return 1 if all of the following hold: + * - klen <= 32 + * - k != 0 + * - k is lower than the curve order + * Otherwise, return 0. + * + * Constant-time behaviour: only klen may be observable. + */ +static uint32_t +check_scalar(const unsigned char *k, size_t klen) +{ + uint32_t z; + int32_t c; + size_t u; + + if (klen > 32) { + return 0; + } + /* z is the OR of all scalar bytes; it is zero only if k == 0. */ + z = 0; + for (u = 0; u < klen; u ++) { + z |= k[u]; + } + if (klen == 32) { + /* + * c is only updated while it is still zero, so it ends up + * holding the CMP() result for the first byte of k[] that + * differs from P256_N[] (assuming the usual EQ0()/CMP() + * helpers, which are defined outside this file section). + */ + c = 0; + for (u = 0; u < klen; u ++) { + c |= -(int32_t)EQ0(c) & CMP(k[u], P256_N[u]); + } + } else { + c = -1; + } + return NEQ(z, 0) & LT0(c); +} + +/* + * Multiply the point encoded in G[] by the scalar k[], and write the + * encoded result back into G[]. Returned value is 0 if Glen is not 65; + * otherwise it is the AND of the check_scalar(), point_decode() and + * point_encode() results. The 'curve' parameter is ignored. + */ +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *k, size_t klen, int curve) +{ + uint32_t r; + p256_jacobian P; + + (void)curve; + if (Glen != 65) { + return 0; + } + r = check_scalar(k, klen); + r &= point_decode(&P, G); + p256_mul(&P, k, klen); + r &= point_encode(G, &P); + return r; +} + +/* + * Multiply the conventional generator by the scalar k[], and write the + * encoded result into R[]. The scalar is not checked here, and the + * value returned by point_encode() is discarded: 65 is always returned. + * The 'curve' parameter is ignored. + */ +static size_t +api_mulgen(unsigned char *R, + const unsigned char *k, size_t klen, int curve) +{ + p256_jacobian P; + + (void)curve; + p256_mulgen(&P, k, klen); + point_encode(R, &P); + return 65; +} + +/* + * Compute x*A + y*B and write the encoded result into A[]; if B is NULL + * then the conventional generator is used in its place. Returned value + * is 0 if len is not 65, if a source point fails to decode, or if the + * sum is the point at infinity. The 'curve' parameter is ignored. + */ +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + /* + * We might want to use Shamir's trick here: make a composite + * window of u*P+v*Q points, to merge the two doubling-ladders + * into one. This, however, has some complications: + * + * - During the computation, we may hit the point-at-infinity. + * Thus, we would need p256_add_complete_mixed() (complete + * formulas for point addition), with a higher cost (17 muls + * instead of 11). + * + * - A 4-bit window would be too large, since it would involve + * 16*16-1 = 255 points. 
For the same window size as in the + * p256_mul() case, we would need to reduce the window size + * to 2 bits, and thus perform twice as many non-doubling + * point additions. + * + * - The window may itself contain the point-at-infinity, and + * thus cannot, in all generality, be made of affine points. + * Instead, we would need to make it a window of points in + * Jacobian coordinates. Even p256_add_complete_mixed() would + * be inappropriate. + * + * For these reasons, the code below performs two separate + * point multiplications, then computes the final point addition + * (which is both a "normal" addition, and a doubling, to handle + * all cases). + */ + + p256_jacobian P, Q; + uint32_t r, t, s; + uint64_t z; + + (void)curve; + if (len != 65) { + return 0; + } + r = point_decode(&P, A); + p256_mul(&P, x, xlen); + /* A NULL B selects the conventional generator as second point. */ + if (B == NULL) { + p256_mulgen(&Q, y, ylen); + } else { + r &= point_decode(&Q, B); + p256_mul(&Q, y, ylen); + } + + /* + * The final addition may fail in case both points are equal. + */ + t = p256_add(&P, &Q); + f256_final_reduce(P.z); + z = P.z[0] | P.z[1] | P.z[2] | P.z[3] | P.z[4]; + s = EQ((uint32_t)(z | (z >> 32)), 0); + p256_double(&Q); + + /* + * If s is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). 
So we + * have the following: + * + * s = 0, t = 0 return P (normal addition) + * s = 0, t = 1 return P (normal addition) + * s = 1, t = 0 return Q (a 'double' case) + * s = 1, t = 1 report an error (P+Q = 0) + */ + CCOPY(s & ~t, &P, &Q, sizeof Q); + point_encode(A, &P); + r &= ~(s & t); + return r; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_p256_m62 = { + (uint32_t)0x00800000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; + +/* see bearssl_ec.h */ +const br_ec_impl * +br_ec_p256_m62_get(void) +{ + return &br_ec_p256_m62; +} + +#else + +/* see bearssl_ec.h */ +const br_ec_impl * +br_ec_p256_m62_get(void) +{ + return 0; +} + +#endif diff --git a/test/monniaux/BearSSL/src/ec/ec_p256_m64.c b/test/monniaux/BearSSL/src/ec/ec_p256_m64.c new file mode 100644 index 00000000..5a7ea177 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_p256_m64.c @@ -0,0 +1,1730 @@ +/* + * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +#if BR_INT128 || BR_UMUL128 + +#if BR_UMUL128 +#include <intrin.h> +#endif + +static const unsigned char P256_G[] = { + 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8, + 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D, + 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8, + 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F, + 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B, + 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40, + 0x68, 0x37, 0xBF, 0x51, 0xF5 +}; + +static const unsigned char P256_N[] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD, + 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63, + 0x25, 0x51 +}; + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + (void)curve; + *len = sizeof P256_G; + return P256_G; +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + (void)curve; + *len = sizeof P256_N; + return P256_N; +} + +static size_t +api_xoff(int curve, size_t *len) +{ + (void)curve; + *len = 32; + return 1; +} + +/* + * A field element is encoded as four 64-bit integers, in basis 2^64. + * Values may reach up to 2^256-1. Montgomery multiplication is used. + */ + +/* R = 2^256 mod p */ +static const uint64_t F256_R[] = { + 0x0000000000000001, 0xFFFFFFFF00000000, + 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFE +}; + +/* Curve equation is y^2 = x^3 - 3*x + B. This constant is B*R mod p + (Montgomery representation of B). 
*/ +static const uint64_t P256_B_MONTY[] = { + 0xD89CDF6229C4BDDF, 0xACF005CD78843090, + 0xE5A220ABF7212ED6, 0xDC30061D04874834 +}; + +/* + * Addition in the field. + */ +static inline void +f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b) +{ +#if BR_INT128 + unsigned __int128 w; + uint64_t t; + + w = (unsigned __int128)a[0] + b[0]; + d[0] = (uint64_t)w; + w = (unsigned __int128)a[1] + b[1] + (w >> 64); + d[1] = (uint64_t)w; + w = (unsigned __int128)a[2] + b[2] + (w >> 64); + d[2] = (uint64_t)w; + w = (unsigned __int128)a[3] + b[3] + (w >> 64); + d[3] = (uint64_t)w; + t = (uint64_t)(w >> 64); + + /* + * 2^256 = 2^224 - 2^192 - 2^96 + 1 in the field. + */ + w = (unsigned __int128)d[0] + t; + d[0] = (uint64_t)w; + w = (unsigned __int128)d[1] + (w >> 64) - (t << 32); + d[1] = (uint64_t)w; + /* Here, carry "w >> 64" can only be 0 or -1 */ + w = (unsigned __int128)d[2] - ((w >> 64) & 1); + d[2] = (uint64_t)w; + /* Again, carry is 0 or -1 */ + d[3] += (uint64_t)(w >> 64) + (t << 32) - t; + +#elif BR_UMUL128 + + unsigned char cc; + uint64_t t; + + cc = _addcarry_u64(0, a[0], b[0], &d[0]); + cc = _addcarry_u64(cc, a[1], b[1], &d[1]); + cc = _addcarry_u64(cc, a[2], b[2], &d[2]); + cc = _addcarry_u64(cc, a[3], b[3], &d[3]); + + /* + * If there is a carry, then we want to subtract p, which we + * do by adding 2^256 - p. + */ + t = cc; + cc = _addcarry_u64(cc, d[0], 0, &d[0]); + cc = _addcarry_u64(cc, d[1], -(t << 32), &d[1]); + cc = _addcarry_u64(cc, d[2], -t, &d[2]); + (void)_addcarry_u64(cc, d[3], (t << 32) - (t << 1), &d[3]); + +#endif +} + +/* + * Subtraction in the field. 
+ */ +static inline void +f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b) +{ +#if BR_INT128 + + unsigned __int128 w; + uint64_t t; + + w = (unsigned __int128)a[0] - b[0]; + d[0] = (uint64_t)w; + w = (unsigned __int128)a[1] - b[1] - ((w >> 64) & 1); + d[1] = (uint64_t)w; + w = (unsigned __int128)a[2] - b[2] - ((w >> 64) & 1); + d[2] = (uint64_t)w; + w = (unsigned __int128)a[3] - b[3] - ((w >> 64) & 1); + d[3] = (uint64_t)w; + t = (uint64_t)(w >> 64) & 1; + + /* + * p = 2^256 - 2^224 + 2^192 + 2^96 - 1. + */ + w = (unsigned __int128)d[0] - t; + d[0] = (uint64_t)w; + w = (unsigned __int128)d[1] + (t << 32) - ((w >> 64) & 1); + d[1] = (uint64_t)w; + /* Here, carry "w >> 64" can only be 0 or +1 */ + w = (unsigned __int128)d[2] + (w >> 64); + d[2] = (uint64_t)w; + /* Again, carry is 0 or +1 */ + d[3] += (uint64_t)(w >> 64) - (t << 32) + t; + +#elif BR_UMUL128 + + unsigned char cc; + uint64_t t; + + cc = _subborrow_u64(0, a[0], b[0], &d[0]); + cc = _subborrow_u64(cc, a[1], b[1], &d[1]); + cc = _subborrow_u64(cc, a[2], b[2], &d[2]); + cc = _subborrow_u64(cc, a[3], b[3], &d[3]); + + /* + * If there is a carry, then we need to add p. + */ + t = cc; + cc = _addcarry_u64(0, d[0], -t, &d[0]); + cc = _addcarry_u64(cc, d[1], (-t) >> 32, &d[1]); + cc = _addcarry_u64(cc, d[2], 0, &d[2]); + (void)_addcarry_u64(cc, d[3], t - (t << 32), &d[3]); + +#endif +} + +/* + * Montgomery multiplication in the field. + */ +static void +f256_montymul(uint64_t *d, const uint64_t *a, const uint64_t *b) +{ +#if BR_INT128 + + uint64_t x, f, t0, t1, t2, t3, t4; + unsigned __int128 z, ff; + int i; + + /* + * When computing d <- d + a[u]*b, we also add f*p such + * that d + a[u]*b + f*p is a multiple of 2^64. Since + * p = -1 mod 2^64, we can compute f = d[0] + a[u]*b[0] mod 2^64. + */ + + /* + * Step 1: t <- (a[0]*b + f*p) / 2^64 + * We have f = a[0]*b[0] mod 2^64. Since p = -1 mod 2^64, this + * ensures that (a[0]*b + f*p) is a multiple of 2^64. 
+ * + * We also have: f*p = f*2^256 - f*2^224 + f*2^192 + f*2^96 - f. + */ + x = a[0]; + z = (unsigned __int128)b[0] * x; + f = (uint64_t)z; + z = (unsigned __int128)b[1] * x + (z >> 64) + (uint64_t)(f << 32); + t0 = (uint64_t)z; + z = (unsigned __int128)b[2] * x + (z >> 64) + (uint64_t)(f >> 32); + t1 = (uint64_t)z; + z = (unsigned __int128)b[3] * x + (z >> 64) + f; + t2 = (uint64_t)z; + t3 = (uint64_t)(z >> 64); + ff = ((unsigned __int128)f << 64) - ((unsigned __int128)f << 32); + z = (unsigned __int128)t2 + (uint64_t)ff; + t2 = (uint64_t)z; + z = (unsigned __int128)t3 + (z >> 64) + (ff >> 64); + t3 = (uint64_t)z; + t4 = (uint64_t)(z >> 64); + + /* + * Steps 2 to 4: t <- (t + a[i]*b + f*p) / 2^64 + */ + for (i = 1; i < 4; i ++) { + x = a[i]; + + /* t <- (t + x*b - f) / 2^64 */ + z = (unsigned __int128)b[0] * x + t0; + f = (uint64_t)z; + z = (unsigned __int128)b[1] * x + t1 + (z >> 64); + t0 = (uint64_t)z; + z = (unsigned __int128)b[2] * x + t2 + (z >> 64); + t1 = (uint64_t)z; + z = (unsigned __int128)b[3] * x + t3 + (z >> 64); + t2 = (uint64_t)z; + z = t4 + (z >> 64); + t3 = (uint64_t)z; + t4 = (uint64_t)(z >> 64); + + /* t <- t + f*2^32, carry in the upper half of z */ + z = (unsigned __int128)t0 + (uint64_t)(f << 32); + t0 = (uint64_t)z; + z = (z >> 64) + (unsigned __int128)t1 + (uint64_t)(f >> 32); + t1 = (uint64_t)z; + + /* t <- t + f*2^192 - f*2^160 + f*2^128 */ + ff = ((unsigned __int128)f << 64) + - ((unsigned __int128)f << 32) + f; + z = (z >> 64) + (unsigned __int128)t2 + (uint64_t)ff; + t2 = (uint64_t)z; + z = (unsigned __int128)t3 + (z >> 64) + (ff >> 64); + t3 = (uint64_t)z; + t4 += (uint64_t)(z >> 64); + } + + /* + * At that point, we have computed t = (a*b + F*p) / 2^256, where + * F is a 256-bit integer whose limbs are the "f" coefficients + * in the steps above. 
We have: + * a <= 2^256-1 + * b <= 2^256-1 + * F <= 2^256-1 + * Hence: + * a*b + F*p <= (2^256-1)*(2^256-1) + p*(2^256-1) + * a*b + F*p <= 2^256*(2^256 - 2 + p) + 1 - p + * Therefore: + * t < 2^256 + p - 2 + * Since p < 2^256, it follows that: + * t4 can be only 0 or 1 + * t - p < 2^256 + * We can therefore subtract p from t, conditionally on t4, to + * get a nonnegative result that fits on 256 bits. + */ + z = (unsigned __int128)t0 + t4; + t0 = (uint64_t)z; + z = (unsigned __int128)t1 - (t4 << 32) + (z >> 64); + t1 = (uint64_t)z; + z = (unsigned __int128)t2 - (z >> 127); + t2 = (uint64_t)z; + t3 = t3 - (uint64_t)(z >> 127) - t4 + (t4 << 32); + + d[0] = t0; + d[1] = t1; + d[2] = t2; + d[3] = t3; + +#elif BR_UMUL128 + + uint64_t x, f, t0, t1, t2, t3, t4; + uint64_t zl, zh, ffl, ffh; + unsigned char k, m; + int i; + + /* + * When computing d <- d + a[u]*b, we also add f*p such + * that d + a[u]*b + f*p is a multiple of 2^64. Since + * p = -1 mod 2^64, we can compute f = d[0] + a[u]*b[0] mod 2^64. + */ + + /* + * Step 1: t <- (a[0]*b + f*p) / 2^64 + * We have f = a[0]*b[0] mod 2^64. Since p = -1 mod 2^64, this + * ensures that (a[0]*b + f*p) is a multiple of 2^64. + * + * We also have: f*p = f*2^256 - f*2^224 + f*2^192 + f*2^96 - f. 
+ */ + x = a[0]; + + zl = _umul128(b[0], x, &zh); + f = zl; + t0 = zh; + + zl = _umul128(b[1], x, &zh); + k = _addcarry_u64(0, zl, t0, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + k = _addcarry_u64(0, zl, f << 32, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + t0 = zl; + t1 = zh; + + zl = _umul128(b[2], x, &zh); + k = _addcarry_u64(0, zl, t1, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + k = _addcarry_u64(0, zl, f >> 32, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + t1 = zl; + t2 = zh; + + zl = _umul128(b[3], x, &zh); + k = _addcarry_u64(0, zl, t2, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + k = _addcarry_u64(0, zl, f, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + t2 = zl; + t3 = zh; + + t4 = _addcarry_u64(0, t3, f, &t3); + k = _subborrow_u64(0, t2, f << 32, &t2); + k = _subborrow_u64(k, t3, f >> 32, &t3); + (void)_subborrow_u64(k, t4, 0, &t4); + + /* + * Steps 2 to 4: t <- (t + a[i]*b + f*p) / 2^64 + */ + for (i = 1; i < 4; i ++) { + x = a[i]; + /* f = t0 + x * b[0]; -- computed below */ + + /* t <- (t + x*b - f) / 2^64 */ + zl = _umul128(b[0], x, &zh); + k = _addcarry_u64(0, zl, t0, &f); + (void)_addcarry_u64(k, zh, 0, &t0); + + zl = _umul128(b[1], x, &zh); + k = _addcarry_u64(0, zl, t0, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + k = _addcarry_u64(0, zl, t1, &t0); + (void)_addcarry_u64(k, zh, 0, &t1); + + zl = _umul128(b[2], x, &zh); + k = _addcarry_u64(0, zl, t1, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + k = _addcarry_u64(0, zl, t2, &t1); + (void)_addcarry_u64(k, zh, 0, &t2); + + zl = _umul128(b[3], x, &zh); + k = _addcarry_u64(0, zl, t2, &zl); + (void)_addcarry_u64(k, zh, 0, &zh); + k = _addcarry_u64(0, zl, t3, &t2); + (void)_addcarry_u64(k, zh, 0, &t3); + + t4 = _addcarry_u64(0, t3, t4, &t3); + + /* t <- t + f*2^32, carry in k */ + k = _addcarry_u64(0, t0, f << 32, &t0); + k = _addcarry_u64(k, t1, f >> 32, &t1); + + /* t <- t + f*2^192 - f*2^160 + f*2^128 */ + m = _subborrow_u64(0, f, f << 32, &ffl); + (void)_subborrow_u64(m, f, f >> 32, &ffh); + k = 
_addcarry_u64(k, t2, ffl, &t2); + k = _addcarry_u64(k, t3, ffh, &t3); + (void)_addcarry_u64(k, t4, 0, &t4); + } + + /* + * At that point, we have computed t = (a*b + F*p) / 2^256, where + * F is a 256-bit integer whose limbs are the "f" coefficients + * in the steps above. We have: + * a <= 2^256-1 + * b <= 2^256-1 + * F <= 2^256-1 + * Hence: + * a*b + F*p <= (2^256-1)*(2^256-1) + p*(2^256-1) + * a*b + F*p <= 2^256*(2^256 - 2 + p) + 1 - p + * Therefore: + * t < 2^256 + p - 2 + * Since p < 2^256, it follows that: + * t4 can be only 0 or 1 + * t - p < 2^256 + * We can therefore subtract p from t, conditionally on t4, to + * get a nonnegative result that fits on 256 bits. + */ + k = _addcarry_u64(0, t0, t4, &t0); + k = _addcarry_u64(k, t1, -(t4 << 32), &t1); + k = _addcarry_u64(k, t2, -t4, &t2); + (void)_addcarry_u64(k, t3, (t4 << 32) - (t4 << 1), &t3); + + d[0] = t0; + d[1] = t1; + d[2] = t2; + d[3] = t3; + +#endif +} + +/* + * Montgomery squaring in the field; currently a basic wrapper around + * multiplication (inline, should be optimized away). + * TODO: see if some extra speed can be gained here. + */ +static inline void +f256_montysquare(uint64_t *d, const uint64_t *a) +{ + f256_montymul(d, a, a); +} + +/* + * Convert to Montgomery representation. + */ +static void +f256_tomonty(uint64_t *d, const uint64_t *a) +{ + /* + * R2 = 2^512 mod p. + * If R = 2^256 mod p, then R2 = R^2 mod p; and the Montgomery + * multiplication of a by R2 is: a*R2/R = a*R mod p, i.e. the + * conversion to Montgomery representation. + */ + static const uint64_t R2[] = { + 0x0000000000000003, + 0xFFFFFFFBFFFFFFFF, + 0xFFFFFFFFFFFFFFFE, + 0x00000004FFFFFFFD + }; + + f256_montymul(d, a, R2); +} + +/* + * Convert from Montgomery representation. + */ +static void +f256_frommonty(uint64_t *d, const uint64_t *a) +{ + /* + * Montgomery multiplication by 1 is division by 2^256 modulo p. 
+ */ + static const uint64_t one[] = { 1, 0, 0, 0 }; + + f256_montymul(d, a, one); +} + +/* + * Inversion in the field. If the source value is 0 modulo p, then this + * returns 0 or p. This function uses Montgomery representation. + */ +static void +f256_invert(uint64_t *d, const uint64_t *a) +{ + /* + * We compute a^(p-2) mod p. The exponent pattern (from high to + * low) is: + * - 32 bits of value 1 + * - 31 bits of value 0 + * - 1 bit of value 1 + * - 96 bits of value 0 + * - 94 bits of value 1 + * - 1 bit of value 0 + * - 1 bit of value 1 + * To speed up the square-and-multiply algorithm, we precompute + * a^(2^31-1). + */ + + uint64_t r[4], t[4]; + int i; + + /* + * t <- a^(2^31-1): starting from a, square then multiply by a, + * 30 times. + */ + memcpy(t, a, sizeof t); + for (i = 0; i < 30; i ++) { + f256_montysquare(t, t); + f256_montymul(t, t, a); + } + + /* + * r starts as t, which accounts for the 31 topmost exponent bits + * (bits 225 to 255). Each iteration squares r; at index i, an + * extra multiplication by a sets exponent bit i, and an extra + * multiplication by t sets the 31 exponent bits i to i+30: + * - i = 224 and i = 192 (by a): bit 224 completes the top run + * of 32 ones, bit 192 is the isolated 1 bit + * - i = 65, 34 and 3 (by t), then i = 2 (by a): the run of 94 + * ones (bits 2 to 95) + * - i = 0 (by a): the lowest bit + */ + memcpy(r, t, sizeof t); + for (i = 224; i >= 0; i --) { + f256_montysquare(r, r); + switch (i) { + case 0: + case 2: + case 192: + case 224: + f256_montymul(r, r, a); + break; + case 3: + case 34: + case 65: + f256_montymul(r, r, t); + break; + } + } + memcpy(d, r, sizeof r); +} + +/* + * Finalize reduction. + * Input value fits on 256 bits. This function subtracts p if and only + * if the input is greater than or equal to p. + */ +static inline void +f256_final_reduce(uint64_t *a) +{ +#if BR_INT128 + + uint64_t t0, t1, t2, t3, cc; + unsigned __int128 z; + + /* + * We add 2^224 - 2^192 - 2^96 + 1 to a. If there is no carry, + * then a < p; otherwise, the addition result we computed is + * the value we must return. 
+ */ + z = (unsigned __int128)a[0] + 1; + t0 = (uint64_t)z; + z = (unsigned __int128)a[1] + (z >> 64) - ((uint64_t)1 << 32); + t1 = (uint64_t)z; + z = (unsigned __int128)a[2] - (z >> 127); + t2 = (uint64_t)z; + z = (unsigned __int128)a[3] - (z >> 127) + 0xFFFFFFFF; + t3 = (uint64_t)z; + cc = -(uint64_t)(z >> 64); + + a[0] ^= cc & (a[0] ^ t0); + a[1] ^= cc & (a[1] ^ t1); + a[2] ^= cc & (a[2] ^ t2); + a[3] ^= cc & (a[3] ^ t3); + +#elif BR_UMUL128 + + uint64_t t0, t1, t2, t3, m; + unsigned char k; + + k = _addcarry_u64(0, a[0], (uint64_t)1, &t0); + k = _addcarry_u64(k, a[1], -((uint64_t)1 << 32), &t1); + k = _addcarry_u64(k, a[2], -(uint64_t)1, &t2); + k = _addcarry_u64(k, a[3], ((uint64_t)1 << 32) - 2, &t3); + m = -(uint64_t)k; + + a[0] ^= m & (a[0] ^ t0); + a[1] ^= m & (a[1] ^ t1); + a[2] ^= m & (a[2] ^ t2); + a[3] ^= m & (a[3] ^ t3); + +#endif +} + +/* + * Points in affine and Jacobian coordinates. + * + * - In affine coordinates, the point-at-infinity cannot be encoded. + * - Jacobian coordinates (X,Y,Z) correspond to affine (X/Z^2,Y/Z^3); + * if Z = 0 then this is the point-at-infinity. + */ +typedef struct { + uint64_t x[4]; + uint64_t y[4]; +} p256_affine; + +typedef struct { + uint64_t x[4]; + uint64_t y[4]; + uint64_t z[4]; +} p256_jacobian; + +/* + * Decode a point. The returned point is in Jacobian coordinates, but + * with z = 1. If the encoding is invalid, or encodes a point which is + * not on the curve, or encodes the point at infinity, then this function + * returns 0. Otherwise, 1 is returned. + * + * The buffer is assumed to have length exactly 65 bytes. + */ +static uint32_t +point_decode(p256_jacobian *P, const unsigned char *buf) +{ + uint64_t x[4], y[4], t[4], x3[4], tt; + uint32_t r; + + /* + * Header byte shall be 0x04. + */ + r = EQ(buf[0], 0x04); + + /* + * Decode X and Y coordinates, and convert them into + * Montgomery representation. 
+ */ + x[3] = br_dec64be(buf + 1); + x[2] = br_dec64be(buf + 9); + x[1] = br_dec64be(buf + 17); + x[0] = br_dec64be(buf + 25); + y[3] = br_dec64be(buf + 33); + y[2] = br_dec64be(buf + 41); + y[1] = br_dec64be(buf + 49); + y[0] = br_dec64be(buf + 57); + f256_tomonty(x, x); + f256_tomonty(y, y); + + /* + * Verify y^2 = x^3 + A*x + B. In curve P-256, A = -3. + * Note that the Montgomery representation of 0 is 0. We must + * take care to apply the final reduction to make sure we have + * 0 and not p. + */ + f256_montysquare(t, y); + f256_montysquare(x3, x); + f256_montymul(x3, x3, x); + f256_sub(t, t, x3); + f256_add(t, t, x); + f256_add(t, t, x); + f256_add(t, t, x); + f256_sub(t, t, P256_B_MONTY); + f256_final_reduce(t); + tt = t[0] | t[1] | t[2] | t[3]; + r &= EQ((uint32_t)(tt | (tt >> 32)), 0); + + /* + * Return the point in Jacobian coordinates (and Montgomery + * representation). + */ + memcpy(P->x, x, sizeof x); + memcpy(P->y, y, sizeof y); + memcpy(P->z, F256_R, sizeof F256_R); + return r; +} + +/* + * Final conversion for a point: + * - The point is converted back to affine coordinates. + * - Final reduction is performed. + * - The point is encoded into the provided buffer. + * + * If the point is the point-at-infinity, all operations are performed, + * but the buffer contents are indeterminate, and 0 is returned. Otherwise, + * the encoded point is written in the buffer, and 1 is returned. + */ +static uint32_t +point_encode(unsigned char *buf, const p256_jacobian *P) +{ + uint64_t t1[4], t2[4], z; + + /* Set t1 = 1/z^2 and t2 = 1/z^3. */ + f256_invert(t2, P->z); + f256_montysquare(t1, t2); + f256_montymul(t2, t2, t1); + + /* Compute affine coordinates x (in t1) and y (in t2). */ + f256_montymul(t1, P->x, t1); + f256_montymul(t2, P->y, t2); + + /* Convert back from Montgomery representation, and finalize + reductions. */ + f256_frommonty(t1, t1); + f256_frommonty(t2, t2); + f256_final_reduce(t1); + f256_final_reduce(t2); + + /* Encode. 
*/ + buf[0] = 0x04; + br_enc64be(buf + 1, t1[3]); + br_enc64be(buf + 9, t1[2]); + br_enc64be(buf + 17, t1[1]); + br_enc64be(buf + 25, t1[0]); + br_enc64be(buf + 33, t2[3]); + br_enc64be(buf + 41, t2[2]); + br_enc64be(buf + 49, t2[1]); + br_enc64be(buf + 57, t2[0]); + + /* Return success if and only if P->z != 0. */ + z = P->z[0] | P->z[1] | P->z[2] | P->z[3]; + return NEQ((uint32_t)(z | z >> 32), 0); +} + +/* + * Point doubling in Jacobian coordinates: point P is doubled. + * Note: if the source point is the point-at-infinity, then the result is + * still the point-at-infinity, which is correct. Moreover, if the three + * coordinates were zero, then they still are zero in the returned value. + * + * (Note: this is true even without the final reduction: if the three + * coordinates are encoded as four words of value zero each, then the + * result will also have all-zero coordinate encodings, not the alternate + * encoding as the integer p.) + */ +static void +p256_double(p256_jacobian *P) +{ + /* + * Doubling formulas are: + * + * s = 4*x*y^2 + * m = 3*(x + z^2)*(x - z^2) + * x' = m^2 - 2*s + * y' = m*(s - x') - 8*y^4 + * z' = 2*y*z + * + * These formulas work for all points, including points of order 2 + * and points at infinity: + * - If y = 0 then z' = 0. But there is no such point in P-256 + * anyway. + * - If z = 0 then z' = 0. + */ + uint64_t t1[4], t2[4], t3[4], t4[4]; + + /* + * Compute z^2 in t1. + */ + f256_montysquare(t1, P->z); + + /* + * Compute x+z^2 in t2 and x-z^2 in t1. + */ + f256_add(t2, P->x, t1); + f256_sub(t1, P->x, t1); + + /* + * Compute 3*(x+z^2)*(x-z^2) in t1. + */ + f256_montymul(t3, t1, t2); + f256_add(t1, t3, t3); + f256_add(t1, t3, t1); + + /* + * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3). + */ + f256_montysquare(t3, P->y); + f256_add(t3, t3, t3); + f256_montymul(t2, P->x, t3); + f256_add(t2, t2, t2); + + /* + * Compute x' = m^2 - 2*s. 
+ */ + f256_montysquare(P->x, t1); + f256_sub(P->x, P->x, t2); + f256_sub(P->x, P->x, t2); + + /* + * Compute z' = 2*y*z. + */ + f256_montymul(t4, P->y, P->z); + f256_add(P->z, t4, t4); + + /* + * Compute y' = m*(s - x') - 8*y^4. Note that we already have + * 2*y^2 in t3. + */ + f256_sub(t2, t2, P->x); + f256_montymul(P->y, t1, t2); + f256_montysquare(t4, t3); + f256_add(t4, t4, t4); + f256_sub(P->y, P->y, t4); +} + +/* + * Point addition (Jacobian coordinates): P1 is replaced with P1+P2. + * This function computes the wrong result in the following cases: + * + * - If P1 == 0 but P2 != 0 + * - If P1 != 0 but P2 == 0 + * - If P1 == P2 + * + * In all three cases, P1 is set to the point at infinity. + * + * Returned value is 0 if one of the following occurs: + * + * - P1 and P2 have the same Y coordinate. + * - P1 == 0 and P2 == 0. + * - The Y coordinate of one of the points is 0 and the other point is + * the point at infinity. + * + * The third case cannot actually happen with valid points, since a point + * with Y == 0 is a point of order 2, and there is no point of order 2 on + * curve P-256. + * + * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller + * can apply the following: + * + * - If the result is not the point at infinity, then it is correct. + * - Otherwise, if the returned value is 1, then this is a case of + * P1+P2 == 0, so the result is indeed the point at infinity. + * - Otherwise, P1 == P2, so a "double" operation should have been + * performed. + * + * Note that you can get a returned value of 0 with a correct result, + * e.g. if P1 and P2 have the same Y coordinate, but distinct X coordinates. 
+ */ +static uint32_t +p256_add(p256_jacobian *P1, const p256_jacobian *P2) +{ + /* + * Addition formulas are: + * + * u1 = x1 * z2^2 + * u2 = x2 * z1^2 + * s1 = y1 * z2^3 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 * z2 + */ + uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt; + uint32_t ret; + + /* + * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3). + */ + f256_montysquare(t3, P2->z); + f256_montymul(t1, P1->x, t3); + f256_montymul(t4, P2->z, t3); + f256_montymul(t3, P1->y, t4); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + f256_montysquare(t4, P1->z); + f256_montymul(t2, P2->x, t4); + f256_montymul(t5, P1->z, t4); + f256_montymul(t4, P2->y, t5); + + /* + * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4). + * We need to test whether r is zero, so r gets an extra final + * reduction. The returned flag is 1 if r != 0, and 0 if r == 0. + */ + f256_sub(t2, t2, t1); + f256_sub(t4, t4, t3); + f256_final_reduce(t4); + tt = t4[0] | t4[1] | t4[2] | t4[3]; + ret = (uint32_t)(tt | (tt >> 32)); + ret = (ret | -ret) >> 31; + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5). + */ + f256_montysquare(t7, t2); + f256_montymul(t6, t1, t7); + f256_montymul(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + */ + f256_montysquare(P1->x, t4); + f256_sub(P1->x, P1->x, t5); + f256_sub(P1->x, P1->x, t6); + f256_sub(P1->x, P1->x, t6); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + f256_sub(t6, t6, P1->x); + f256_montymul(P1->y, t4, t6); + f256_montymul(t1, t5, t3); + f256_sub(P1->y, P1->y, t1); + + /* + * Compute z3 = h*z1*z2. + */ + f256_montymul(t1, P1->z, P2->z); + f256_montymul(P1->z, t1, t2); + + return ret; +} + +/* + * Point addition (mixed coordinates): P1 is replaced with P1+P2. + * This is a specialised function for the case when P2 is a non-zero point + * in affine coordinates. 
+ * + * This function computes the wrong result in the following cases: + * + * - If P1 == 0 + * - If P1 == P2 + * + * In both cases, P1 is set to the point at infinity. + * + * Returned value is 0 if one of the following occurs: + * + * - P1 and P2 have the same Y (affine) coordinate. + * - The Y coordinate of P2 is 0 and P1 is the point at infinity. + * + * The second case cannot actually happen with valid points, since a point + * with Y == 0 is a point of order 2, and there is no point of order 2 on + * curve P-256. + * + * Therefore, assuming that P1 != 0 on input, then the caller + * can apply the following: + * + * - If the result is not the point at infinity, then it is correct. + * - Otherwise, if the returned value is 1, then this is a case of + * P1+P2 == 0, so the result is indeed the point at infinity. + * - Otherwise, P1 == P2, so a "double" operation should have been + * performed. + * + * Again, a value of 0 may be returned in some cases where the addition + * result is correct. + */ +static uint32_t +p256_add_mixed(p256_jacobian *P1, const p256_affine *P2) +{ + /* + * Addition formulas (with z2 = 1, since P2 is affine) are: + * + * u1 = x1 + * u2 = x2 * z1^2 + * s1 = y1 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 + */ + uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt; + uint32_t ret; + + /* + * Compute u1 = x1 (in t1) and s1 = y1 (in t3). + */ + memcpy(t1, P1->x, sizeof t1); + memcpy(t3, P1->y, sizeof t3); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + f256_montysquare(t4, P1->z); + f256_montymul(t2, P2->x, t4); + f256_montymul(t5, P1->z, t4); + f256_montymul(t4, P2->y, t5); + + /* + * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4). + * We need to test whether r is zero, so r gets an extra final + * reduction. The returned flag is 1 if r != 0, and 0 if r == 0. 
+ */ + f256_sub(t2, t2, t1); + f256_sub(t4, t4, t3); + f256_final_reduce(t4); + tt = t4[0] | t4[1] | t4[2] | t4[3]; + ret = (uint32_t)(tt | (tt >> 32)); + ret = (ret | -ret) >> 31; + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5). + */ + f256_montysquare(t7, t2); + f256_montymul(t6, t1, t7); + f256_montymul(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + */ + f256_montysquare(P1->x, t4); + f256_sub(P1->x, P1->x, t5); + f256_sub(P1->x, P1->x, t6); + f256_sub(P1->x, P1->x, t6); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + f256_sub(t6, t6, P1->x); + f256_montymul(P1->y, t4, t6); + f256_montymul(t1, t5, t3); + f256_sub(P1->y, P1->y, t1); + + /* + * Compute z3 = h*z1 (z2 = 1, since P2 is affine). + */ + f256_montymul(P1->z, P1->z, t2); + + return ret; +} + +#if 0 +/* unused */ +/* + * Point addition (mixed coordinates, complete): P1 is replaced with P1+P2. + * This is a specialised function for the case when P2 is a non-zero point + * in affine coordinates. + * + * This function returns the correct result in all cases. + */ +static uint32_t +p256_add_complete_mixed(p256_jacobian *P1, const p256_affine *P2) +{ + /* + * Addition formulas, in the general case, are: + * + * u1 = x1 + * u2 = x2 * z1^2 + * s1 = y1 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 + * + * These formulas mishandle the two following cases: + * + * - If P1 is the point-at-infinity (z1 = 0), then z3 is + * incorrectly set to 0. + * + * - If P1 = P2, then u1 = u2 and s1 = s2, and x3, y3 and z3 + * are all set to 0. + * + * However, if P1 + P2 = 0, then u1 = u2 but s1 != s2, and then + * we correctly get z3 = 0 (the point-at-infinity). + * + * To fix the case P1 = 0, we perform at the end a copy of P2 + * over P1, conditional to z1 = 0. + * + * For P1 = P2: in that case, both h and r are set to 0, and + * we get x3, y3 and z3 equal to 0. 
We can test for that + * occurrence to make a mask which will be all-one if P1 = P2, + * or all-zero otherwise; then we can compute the double of P2 + * and add it, combined with the mask, to (x3,y3,z3). + * + * Using the doubling formulas in p256_double() on (x2,y2), + * simplifying since P2 is affine (i.e. z2 = 1, implicitly), + * we get: + * s = 4*x2*y2^2 + * m = 3*(x2 + 1)*(x2 - 1) + * x' = m^2 - 2*s + * y' = m*(s - x') - 8*y2^4 + * z' = 2*y2 + * which requires only 6 multiplications. Added to the 11 + * multiplications of the normal mixed addition in Jacobian + * coordinates, we get a cost of 17 multiplications in total. + */ + uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt, zz; + int i; + + /* + * Set zz to -1 if P1 is the point at infinity, 0 otherwise. + */ + zz = P1->z[0] | P1->z[1] | P1->z[2] | P1->z[3]; + zz = ((zz | -zz) >> 63) - (uint64_t)1; + + /* + * Compute u1 = x1 (in t1) and s1 = y1 (in t3). + */ + memcpy(t1, P1->x, sizeof t1); + memcpy(t3, P1->y, sizeof t3); + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + f256_montysquare(t4, P1->z); + f256_montymul(t2, P2->x, t4); + f256_montymul(t5, P1->z, t4); + f256_montymul(t4, P2->y, t5); + + /* + * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4). + */ + f256_sub(t2, t2, t1); + f256_sub(t4, t4, t3); + + /* + * If both h = 0 and r = 0, then P1 = P2, and we want to set + * the mask tt to -1; otherwise, the mask will be 0. + */ + f256_final_reduce(t2); + f256_final_reduce(t4); + tt = t2[0] | t2[1] | t2[2] | t2[3] | t4[0] | t4[1] | t4[2] | t4[3]; + tt = ((tt | -tt) >> 63) - (uint64_t)1; + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5). + */ + f256_montysquare(t7, t2); + f256_montymul(t6, t1, t7); + f256_montymul(t5, t7, t2); + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + */ + f256_montysquare(P1->x, t4); + f256_sub(P1->x, P1->x, t5); + f256_sub(P1->x, P1->x, t6); + f256_sub(P1->x, P1->x, t6); + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. 
+ */ + f256_sub(t6, t6, P1->x); + f256_montymul(P1->y, t4, t6); + f256_montymul(t1, t5, t3); + f256_sub(P1->y, P1->y, t1); + + /* + * Compute z3 = h*z1. + */ + f256_montymul(P1->z, P1->z, t2); + + /* + * The "double" result, in case P1 = P2. + */ + + /* + * Compute z' = 2*y2 (in t1). + */ + f256_add(t1, P2->y, P2->y); + + /* + * Compute 2*(y2^2) (in t2) and s = 4*x2*(y2^2) (in t3). + */ + f256_montysquare(t2, P2->y); + f256_add(t2, t2, t2); + f256_add(t3, t2, t2); + f256_montymul(t3, P2->x, t3); + + /* + * Compute m = 3*(x2^2 - 1) (in t4). F256_R is the value 1 in + * Montgomery representation. + */ + f256_montysquare(t4, P2->x); + f256_sub(t4, t4, F256_R); + f256_add(t5, t4, t4); + f256_add(t4, t4, t5); + + /* + * Compute x' = m^2 - 2*s (in t5). + * + * NOTE(review): f256_sub() is defined in this file with three + * parameters (d, a, b), but the two calls below pass only two, so + * this disabled (#if 0) code would not compile as written; + * f256_sub(t5, t5, t3) is presumably what was meant -- confirm + * before enabling. + */ + f256_montysquare(t5, t4); + f256_sub(t5, t3); + f256_sub(t5, t3); + + /* + * Compute y' = m*(s - x') - 8*y2^4 (in t6). Since t2 holds + * 2*y2^2, its square (in t7) is 4*y2^4, subtracted twice. + */ + f256_sub(t6, t3, t5); + f256_montymul(t6, t6, t4); + f256_montysquare(t7, t2); + f256_sub(t6, t6, t7); + f256_sub(t6, t6, t7); + + /* + * We now have the alternate (doubling) coordinates in (t5,t6,t1). + * We combine them with (x3,y3,z3). + */ + for (i = 0; i < 4; i ++) { + P1->x[i] |= tt & t5[i]; + P1->y[i] |= tt & t6[i]; + P1->z[i] |= tt & t1[i]; + } + + /* + * If P1 = 0, then we get z3 = 0 (which is invalid); if z1 is 0, + * then we want to replace the result with a copy of P2. The + * test on z1 was done at the start, in the zz mask. 
+ */ + for (i = 0; i < 4; i ++) { + P1->x[i] ^= zz & (P1->x[i] ^ P2->x[i]); + P1->y[i] ^= zz & (P1->y[i] ^ P2->y[i]); + P1->z[i] ^= zz & (P1->z[i] ^ F256_R[i]); + } + /* + * NOTE(review): this function is declared with a uint32_t return + * type, but it ends here without a return statement. + */ +} +#endif + +/* + * Inner function for computing a point multiplication. A window is + * provided, with points 1*P to 15*P in affine coordinates. + * + * Assumptions: + * - All provided points are valid points on the curve. + * - Multiplier is non-zero, and smaller than the curve order. + * - Everything is in Montgomery representation. 
+ */ +static void +point_mul_inner(p256_jacobian *R, const p256_affine *W, + const unsigned char *k, size_t klen) +{ + p256_jacobian Q; + uint32_t qz; + + memset(&Q, 0, sizeof Q); + qz = 1; + while (klen -- > 0) { + int i; + unsigned bk; + + bk = *k ++; + for (i = 0; i < 2; i ++) { + uint32_t bits; + uint32_t bnz; + p256_affine T; + p256_jacobian U; + uint32_t n; + int j; + uint64_t m; + + p256_double(&Q); + p256_double(&Q); + p256_double(&Q); + p256_double(&Q); + bits = (bk >> 4) & 0x0F; + bnz = NEQ(bits, 0); + + /* + * Lookup point in window. If the bits are 0, + * we get something invalid, which is not a + * problem because we will use it only if the + * bits are non-zero. + */ + memset(&T, 0, sizeof T); + for (n = 0; n < 15; n ++) { + m = -(uint64_t)EQ(bits, n + 1); + T.x[0] |= m & W[n].x[0]; + T.x[1] |= m & W[n].x[1]; + T.x[2] |= m & W[n].x[2]; + T.x[3] |= m & W[n].x[3]; + T.y[0] |= m & W[n].y[0]; + T.y[1] |= m & W[n].y[1]; + T.y[2] |= m & W[n].y[2]; + T.y[3] |= m & W[n].y[3]; + } + + U = Q; + p256_add_mixed(&U, &T); + + /* + * If qz is still 1, then Q was all-zeros, and this + * is conserved through p256_double(). + */ + m = -(uint64_t)(bnz & qz); + for (j = 0; j < 4; j ++) { + Q.x[j] |= m & T.x[j]; + Q.y[j] |= m & T.y[j]; + Q.z[j] |= m & F256_R[j]; + } + CCOPY(bnz & ~qz, &Q, &U, sizeof Q); + qz &= ~bnz; + bk <<= 4; + } + } + *R = Q; +} + +/* + * Convert a window from Jacobian to affine coordinates. A single + * field inversion is used. This function works for windows up to + * 32 elements. + * + * The destination array (aff[]) and the source array (jac[]) may + * overlap, provided that the start of aff[] is not after the start of + * jac[]. Even if the arrays do _not_ overlap, the source array is + * modified. + */ +static void +window_to_affine(p256_affine *aff, p256_jacobian *jac, int num) +{ + /* + * Convert the window points to affine coordinates. 
We use the + * following trick to mutualize the inversion computation: if + * we have z1, z2, z3, and z4, and want to inverse all of them, + * we compute u = 1/(z1*z2*z3*z4), and then we have: + * 1/z1 = u*z2*z3*z4 + * 1/z2 = u*z1*z3*z4 + * 1/z3 = u*z1*z2*z4 + * 1/z4 = u*z1*z2*z3 + * + * The partial products are computed recursively: + * + * - on input (z_1,z_2), return (z_2,z_1) and z_1*z_2 + * - on input (z_1,z_2,... z_n): + * recurse on (z_1,z_2,... z_(n/2)) -> r1 and m1 + * recurse on (z_(n/2+1),z_(n/2+2)... z_n) -> r2 and m2 + * multiply elements of r1 by m2 -> s1 + * multiply elements of r2 by m1 -> s2 + * return r1||r2 and m1*m2 + * + * In the example below, we suppose that we have 14 elements. + * Let z1, z2,... zE be the 14 values to invert (index noted in + * hexadecimal, starting at 1). + * + * - Depth 1: + * swap(z1, z2); z12 = z1*z2 + * swap(z3, z4); z34 = z3*z4 + * swap(z5, z6); z56 = z5*z6 + * swap(z7, z8); z78 = z7*z8 + * swap(z9, zA); z9A = z9*zA + * swap(zB, zC); zBC = zB*zC + * swap(zD, zE); zDE = zD*zE + * + * - Depth 2: + * z1 <- z1*z34, z2 <- z2*z34, z3 <- z3*z12, z4 <- z4*z12 + * z1234 = z12*z34 + * z5 <- z5*z78, z6 <- z6*z78, z7 <- z7*z56, z8 <- z8*z56 + * z5678 = z56*z78 + * z9 <- z9*zBC, zA <- zA*zBC, zB <- zB*z9A, zC <- zC*z9A + * z9ABC = z9A*zBC + * + * - Depth 3: + * z1 <- z1*z5678, z2 <- z2*z5678, z3 <- z3*z5678, z4 <- z4*z5678 + * z5 <- z5*z1234, z6 <- z6*z1234, z7 <- z7*z1234, z8 <- z8*z1234 + * z12345678 = z1234*z5678 + * z9 <- z9*zDE, zA <- zA*zDE, zB <- zB*zDE, zC <- zC*zDE + * zD <- zD*z9ABC, zE*z9ABC + * z9ABCDE = z9ABC*zDE + * + * - Depth 4: + * multiply z1..z8 by z9ABCDE + * multiply z9..zE by z12345678 + * final z = z12345678*z9ABCDE + */ + + uint64_t z[16][4]; + int i, k, s; +#define zt (z[15]) +#define zu (z[14]) +#define zv (z[13]) + + /* + * First recursion step (pairwise swapping and multiplication). 
+ * If there is an odd number of elements, then we "invent" an + * extra one with coordinate Z = 1 (in Montgomery representation). + */ + for (i = 0; (i + 1) < num; i += 2) { + memcpy(zt, jac[i].z, sizeof zt); + memcpy(jac[i].z, jac[i + 1].z, sizeof zt); + memcpy(jac[i + 1].z, zt, sizeof zt); + f256_montymul(z[i >> 1], jac[i].z, jac[i + 1].z); + } + if ((num & 1) != 0) { + memcpy(z[num >> 1], jac[num - 1].z, sizeof zt); + memcpy(jac[num - 1].z, F256_R, sizeof F256_R); + } + + /* + * Perform further recursion steps. At the entry of each step, + * the process has been done for groups of 's' points. The + * integer k is the log2 of s. + */ + for (k = 1, s = 2; s < num; k ++, s <<= 1) { + int n; + + for (i = 0; i < num; i ++) { + f256_montymul(jac[i].z, jac[i].z, z[(i >> k) ^ 1]); + } + n = (num + s - 1) >> k; + for (i = 0; i < (n >> 1); i ++) { + f256_montymul(z[i], z[i << 1], z[(i << 1) + 1]); + } + if ((n & 1) != 0) { + memmove(z[n >> 1], z[n], sizeof zt); + } + } + + /* + * Invert the final result, and convert all points. + */ + f256_invert(zt, z[0]); + for (i = 0; i < num; i ++) { + f256_montymul(zv, jac[i].z, zt); + f256_montysquare(zu, zv); + f256_montymul(zv, zv, zu); + f256_montymul(aff[i].x, jac[i].x, zu); + f256_montymul(aff[i].y, jac[i].y, zv); + } +} + +/* + * Multiply the provided point by an integer. + * Assumptions: + * - Source point is a valid curve point. + * - Source point is not the point-at-infinity. + * - Integer is not 0, and is lower than the curve order. + * If these conditions are not met, then the result is indeterminate + * (but the process is still constant-time). + */ +static void +p256_mul(p256_jacobian *P, const unsigned char *k, size_t klen) +{ + union { + p256_affine aff[15]; + p256_jacobian jac[15]; + } window; + int i; + + /* + * Compute window, in Jacobian coordinates. 
+ */ + window.jac[0] = *P; + for (i = 2; i < 16; i ++) { + window.jac[i - 1] = window.jac[(i >> 1) - 1]; + if ((i & 1) == 0) { + p256_double(&window.jac[i - 1]); + } else { + p256_add(&window.jac[i - 1], &window.jac[i >> 1]); + } + } + + /* + * Convert the window points to affine coordinates. Point + * window[0] is the source point, already in affine coordinates. + */ + window_to_affine(window.aff, window.jac, 15); + + /* + * Perform point multiplication. + */ + point_mul_inner(P, window.aff, k, klen); +} + +/* + * Precomputed window for the conventional generator: P256_Gwin[n] + * contains (n+1)*G (affine coordinates, in Montgomery representation). + */ +static const p256_affine P256_Gwin[] = { + { + { 0x79E730D418A9143C, 0x75BA95FC5FEDB601, + 0x79FB732B77622510, 0x18905F76A53755C6 }, + { 0xDDF25357CE95560A, 0x8B4AB8E4BA19E45C, + 0xD2E88688DD21F325, 0x8571FF1825885D85 } + }, + { + { 0x850046D410DDD64D, 0xAA6AE3C1A433827D, + 0x732205038D1490D9, 0xF6BB32E43DCF3A3B }, + { 0x2F3648D361BEE1A5, 0x152CD7CBEB236FF8, + 0x19A8FB0E92042DBE, 0x78C577510A5B8A3B } + }, + { + { 0xFFAC3F904EEBC127, 0xB027F84A087D81FB, + 0x66AD77DD87CBBC98, 0x26936A3FB6FF747E }, + { 0xB04C5C1FC983A7EB, 0x583E47AD0861FE1A, + 0x788208311A2EE98E, 0xD5F06A29E587CC07 } + }, + { + { 0x74B0B50D46918DCC, 0x4650A6EDC623C173, + 0x0CDAACACE8100AF2, 0x577362F541B0176B }, + { 0x2D96F24CE4CBABA6, 0x17628471FAD6F447, + 0x6B6C36DEE5DDD22E, 0x84B14C394C5AB863 } + }, + { + { 0xBE1B8AAEC45C61F5, 0x90EC649A94B9537D, + 0x941CB5AAD076C20C, 0xC9079605890523C8 }, + { 0xEB309B4AE7BA4F10, 0x73C568EFE5EB882B, + 0x3540A9877E7A1F68, 0x73A076BB2DD1E916 } + }, + { + { 0x403947373E77664A, 0x55AE744F346CEE3E, + 0xD50A961A5B17A3AD, 0x13074B5954213673 }, + { 0x93D36220D377E44B, 0x299C2B53ADFF14B5, + 0xF424D44CEF639F11, 0xA4C9916D4A07F75F } + }, + { + { 0x0746354EA0173B4F, 0x2BD20213D23C00F7, + 0xF43EAAB50C23BB08, 0x13BA5119C3123E03 }, + { 0x2847D0303F5B9D4D, 0x6742F2F25DA67BDD, + 0xEF933BDC77C94195, 0xEAEDD9156E240867 } + }, + { + 
{ 0x27F14CD19499A78F, 0x462AB5C56F9B3455, + 0x8F90F02AF02CFC6B, 0xB763891EB265230D }, + { 0xF59DA3A9532D4977, 0x21E3327DCF9EBA15, + 0x123C7B84BE60BBF0, 0x56EC12F27706DF76 } + }, + { + { 0x75C96E8F264E20E8, 0xABE6BFED59A7A841, + 0x2CC09C0444C8EB00, 0xE05B3080F0C4E16B }, + { 0x1EB7777AA45F3314, 0x56AF7BEDCE5D45E3, + 0x2B6E019A88B12F1A, 0x086659CDFD835F9B } + }, + { + { 0x2C18DBD19DC21EC8, 0x98F9868A0FCF8139, + 0x737D2CD648250B49, 0xCC61C94724B3428F }, + { 0x0C2B407880DD9E76, 0xC43A8991383FBE08, + 0x5F7D2D65779BE5D2, 0x78719A54EB3B4AB5 } + }, + { + { 0xEA7D260A6245E404, 0x9DE407956E7FDFE0, + 0x1FF3A4158DAC1AB5, 0x3E7090F1649C9073 }, + { 0x1A7685612B944E88, 0x250F939EE57F61C8, + 0x0C0DAA891EAD643D, 0x68930023E125B88E } + }, + { + { 0x04B71AA7D2697768, 0xABDEDEF5CA345A33, + 0x2409D29DEE37385E, 0x4EE1DF77CB83E156 }, + { 0x0CAC12D91CBB5B43, 0x170ED2F6CA895637, + 0x28228CFA8ADE6D66, 0x7FF57C9553238ACA } + }, + { + { 0xCCC425634B2ED709, 0x0E356769856FD30D, + 0xBCBCD43F559E9811, 0x738477AC5395B759 }, + { 0x35752B90C00EE17F, 0x68748390742ED2E3, + 0x7CD06422BD1F5BC1, 0xFBC08769C9E7B797 } + }, + { + { 0xA242A35BB0CF664A, 0x126E48F77F9707E3, + 0x1717BF54C6832660, 0xFAAE7332FD12C72E }, + { 0x27B52DB7995D586B, 0xBE29569E832237C2, + 0xE8E4193E2A65E7DB, 0x152706DC2EAA1BBB } + }, + { + { 0x72BCD8B7BC60055B, 0x03CC23EE56E27E4B, + 0xEE337424E4819370, 0xE2AA0E430AD3DA09 }, + { 0x40B8524F6383C45D, 0xD766355442A41B25, + 0x64EFA6DE778A4797, 0x2042170A7079ADF4 } + } +}; + +/* + * Multiply the conventional generator of the curve by the provided + * integer. Return is written in *P. + * + * Assumptions: + * - Integer is not 0, and is lower than the curve order. + * If this conditions is not met, then the result is indeterminate + * (but the process is still constant-time). 
+ */ +static void +p256_mulgen(p256_jacobian *P, const unsigned char *k, size_t klen) +{ + point_mul_inner(P, P256_Gwin, k, klen); +} + +/* + * Return 1 if all of the following hold: + * - klen <= 32 + * - k != 0 + * - k is lower than the curve order + * Otherwise, return 0. + * + * Constant-time behaviour: only klen may be observable. + */ +static uint32_t +check_scalar(const unsigned char *k, size_t klen) +{ + uint32_t z; + int32_t c; + size_t u; + + if (klen > 32) { + return 0; + } + z = 0; + for (u = 0; u < klen; u ++) { + z |= k[u]; + } + if (klen == 32) { + c = 0; + for (u = 0; u < klen; u ++) { + c |= -(int32_t)EQ0(c) & CMP(k[u], P256_N[u]); + } + } else { + c = -1; + } + return NEQ(z, 0) & LT0(c); +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *k, size_t klen, int curve) +{ + uint32_t r; + p256_jacobian P; + + (void)curve; + if (Glen != 65) { + return 0; + } + r = check_scalar(k, klen); + r &= point_decode(&P, G); + p256_mul(&P, k, klen); + r &= point_encode(G, &P); + return r; +} + +static size_t +api_mulgen(unsigned char *R, + const unsigned char *k, size_t klen, int curve) +{ + p256_jacobian P; + + (void)curve; + p256_mulgen(&P, k, klen); + point_encode(R, &P); + return 65; +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + /* + * We might want to use Shamir's trick here: make a composite + * window of u*P+v*Q points, to merge the two doubling-ladders + * into one. This, however, has some complications: + * + * - During the computation, we may hit the point-at-infinity. + * Thus, we would need p256_add_complete_mixed() (complete + * formulas for point addition), with a higher cost (17 muls + * instead of 11). + * + * - A 4-bit window would be too large, since it would involve + * 16*16-1 = 255 points. 
For the same window size as in the + * p256_mul() case, we would need to reduce the window size + * to 2 bits, and thus perform twice as many non-doubling + * point additions. + * + * - The window may itself contain the point-at-infinity, and + * thus cannot be in all generality be made of affine points. + * Instead, we would need to make it a window of points in + * Jacobian coordinates. Even p256_add_complete_mixed() would + * be inappropriate. + * + * For these reasons, the code below performs two separate + * point multiplications, then computes the final point addition + * (which is both a "normal" addition, and a doubling, to handle + * all cases). + */ + + p256_jacobian P, Q; + uint32_t r, t, s; + uint64_t z; + + (void)curve; + if (len != 65) { + return 0; + } + r = point_decode(&P, A); + p256_mul(&P, x, xlen); + if (B == NULL) { + p256_mulgen(&Q, y, ylen); + } else { + r &= point_decode(&Q, B); + p256_mul(&Q, y, ylen); + } + + /* + * The final addition may fail in case both points are equal. + */ + t = p256_add(&P, &Q); + f256_final_reduce(P.z); + z = P.z[0] | P.z[1] | P.z[2] | P.z[3]; + s = EQ((uint32_t)(z | (z >> 32)), 0); + p256_double(&Q); + + /* + * If s is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). 
So we + * have the following: + * + * s = 0, t = 0 return P (normal addition) + * s = 0, t = 1 return P (normal addition) + * s = 1, t = 0 return Q (a 'double' case) + * s = 1, t = 1 report an error (P+Q = 0) + */ + CCOPY(s & ~t, &P, &Q, sizeof Q); + point_encode(A, &P); + r &= ~(s & t); + return r; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_p256_m64 = { + (uint32_t)0x00800000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; + +/* see bearssl_ec.h */ +const br_ec_impl * +br_ec_p256_m64_get(void) +{ + return &br_ec_p256_m64; +} + +#else + +/* see bearssl_ec.h */ +const br_ec_impl * +br_ec_p256_m64_get(void) +{ + return 0; +} + +#endif diff --git a/test/monniaux/BearSSL/src/ec/ec_prime_i15.c b/test/monniaux/BearSSL/src/ec/ec_prime_i15.c new file mode 100644 index 00000000..0f210f24 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_prime_i15.c @@ -0,0 +1,820 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * Parameters for supported curves: + * - field modulus p + * - R^2 mod p (R = 2^(15k) for the smallest k such that R >= p) + * - b*R mod p (b is the second curve equation parameter) + */ + +static const uint16_t P256_P[] = { + 0x0111, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x003F, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x1000, 0x0000, 0x4000, 0x7FFF, + 0x7FFF, 0x0001 +}; + +static const uint16_t P256_R2[] = { + 0x0111, + 0x0000, 0x6000, 0x0000, 0x0000, 0x0000, 0x0000, 0x7FFC, 0x7FFF, + 0x7FBF, 0x7FFF, 0x7FBF, 0x7FFF, 0x7FFF, 0x7FFF, 0x77FF, 0x7FFF, + 0x4FFF, 0x0000 +}; + +static const uint16_t P256_B[] = { + 0x0111, + 0x770C, 0x5EEF, 0x29C4, 0x3EC4, 0x6273, 0x0486, 0x4543, 0x3993, + 0x3C01, 0x6B56, 0x212E, 0x57EE, 0x4882, 0x204B, 0x7483, 0x3C16, + 0x0187, 0x0000 +}; + +static const uint16_t P384_P[] = { + 0x0199, + 0x7FFF, 0x7FFF, 0x0003, 0x0000, 0x0000, 0x0000, 0x7FC0, 0x7FFF, + 0x7EFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, + 0x7FFF, 0x01FF +}; + +static const uint16_t P384_R2[] = { + 0x0199, + 0x1000, 0x0000, 0x0000, 0x7FFF, 0x7FFF, 0x0001, 0x0000, 0x0010, + 0x0000, 0x0000, 0x0000, 0x7F00, 0x7FFF, 0x01FF, 0x0000, 0x1000, + 0x0000, 0x2000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000 +}; + +static const uint16_t P384_B[] = { + 0x0199, + 0x7333, 0x2096, 0x70D1, 0x2310, 0x3020, 0x6197, 0x1464, 0x35BB, + 0x70CA, 0x0117, 0x1920, 0x4136, 0x5FC8, 0x5713, 0x4938, 0x7DD2, + 0x4DD2, 0x4A71, 0x0220, 0x683E, 0x2C87, 0x4DB1, 0x7BFF, 0x6C09, + 0x0452, 0x0084 +}; + +static const uint16_t P521_P[] = { + 0x022B, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 
0x7FFF, 0x7FFF, 0x7FFF, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, + 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, + 0x7FFF, 0x7FFF, 0x07FF +}; + +static const uint16_t P521_R2[] = { + 0x022B, + 0x0100, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000 +}; + +static const uint16_t P521_B[] = { + 0x022B, + 0x7002, 0x6A07, 0x751A, 0x228F, 0x71EF, 0x5869, 0x20F4, 0x1EFC, + 0x7357, 0x37E0, 0x4EEC, 0x605E, 0x1652, 0x26F6, 0x31FA, 0x4A8F, + 0x6193, 0x3C2A, 0x3C42, 0x48C7, 0x3489, 0x6771, 0x4C57, 0x5CCD, + 0x2725, 0x545B, 0x503B, 0x5B42, 0x21A0, 0x2534, 0x687E, 0x70E4, + 0x1618, 0x27D7, 0x0465 +}; + +typedef struct { + const uint16_t *p; + const uint16_t *b; + const uint16_t *R2; + uint16_t p0i; + size_t point_len; +} curve_params; + +static inline const curve_params * +id_to_curve(int curve) +{ + static const curve_params pp[] = { + { P256_P, P256_B, P256_R2, 0x0001, 65 }, + { P384_P, P384_B, P384_R2, 0x0001, 97 }, + { P521_P, P521_B, P521_R2, 0x0001, 133 } + }; + + return &pp[curve - BR_EC_secp256r1]; +} + +#define I15_LEN ((BR_MAX_EC_SIZE + 29) / 15) + +/* + * Type for a point in Jacobian coordinates: + * -- three values, x, y and z, in Montgomery representation + * -- affine coordinates are X = x / z^2 and Y = y / z^3 + * -- for the point at infinity, z = 0 + */ +typedef struct { + uint16_t c[3][I15_LEN]; +} jacobian; + +/* + * We use a custom interpreter that uses a dozen registers, and + * only six operations: + * MSET(d, a) copy a into d + * MADD(d, a) d = d+a (modular) + * MSUB(d, a) d = d-a (modular) + * MMUL(d, a, b) d = a*b (Montgomery multiplication) + * MINV(d, a, b) invert d modulo p; a and b are used as scratch registers + * 
MTZ(d) clear return value if d = 0 + * Destination of MMUL (d) must be distinct from operands (a and b). + * There is no such constraint for MSUB and MADD. + * + * Registers include the operand coordinates, and temporaries. + */ +#define MSET(d, a) (0x0000 + ((d) << 8) + ((a) << 4)) +#define MADD(d, a) (0x1000 + ((d) << 8) + ((a) << 4)) +#define MSUB(d, a) (0x2000 + ((d) << 8) + ((a) << 4)) +#define MMUL(d, a, b) (0x3000 + ((d) << 8) + ((a) << 4) + (b)) +#define MINV(d, a, b) (0x4000 + ((d) << 8) + ((a) << 4) + (b)) +#define MTZ(d) (0x5000 + ((d) << 8)) +#define ENDCODE 0 + +/* + * Registers for the input operands. + */ +#define P1x 0 +#define P1y 1 +#define P1z 2 +#define P2x 3 +#define P2y 4 +#define P2z 5 + +/* + * Alternate names for the first input operand. + */ +#define Px 0 +#define Py 1 +#define Pz 2 + +/* + * Temporaries. + */ +#define t1 6 +#define t2 7 +#define t3 8 +#define t4 9 +#define t5 10 +#define t6 11 +#define t7 12 + +/* + * Extra scratch registers available when there is no second operand (e.g. + * for "double" and "affine"). + */ +#define t8 3 +#define t9 4 +#define t10 5 + +/* + * Doubling formulas are: + * + * s = 4*x*y^2 + * m = 3*(x + z^2)*(x - z^2) + * x' = m^2 - 2*s + * y' = m*(s - x') - 8*y^4 + * z' = 2*y*z + * + * If y = 0 (P has order 2) then this yields infinity (z' = 0), as it + * should. This case should not happen anyway, because our curves have + * prime order, and thus do not contain any point of order 2. + * + * If P is infinity (z = 0), then again the formulas yield infinity, + * which is correct. Thus, this code works for all points. + * + * Cost: 8 multiplications + */ +static const uint16_t code_double[] = { + /* + * Compute z^2 (in t1). + */ + MMUL(t1, Pz, Pz), + + /* + * Compute x-z^2 (in t2) and then x+z^2 (in t1). + */ + MSET(t2, Px), + MSUB(t2, t1), + MADD(t1, Px), + + /* + * Compute m = 3*(x+z^2)*(x-z^2) (in t1). 
+ */ + MMUL(t3, t1, t2), + MSET(t1, t3), + MADD(t1, t3), + MADD(t1, t3), + + /* + * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3). + */ + MMUL(t3, Py, Py), + MADD(t3, t3), + MMUL(t2, Px, t3), + MADD(t2, t2), + + /* + * Compute x' = m^2 - 2*s. + */ + MMUL(Px, t1, t1), + MSUB(Px, t2), + MSUB(Px, t2), + + /* + * Compute z' = 2*y*z. + */ + MMUL(t4, Py, Pz), + MSET(Pz, t4), + MADD(Pz, t4), + + /* + * Compute y' = m*(s - x') - 8*y^4. Note that we already have + * 2*y^2 in t3. + */ + MSUB(t2, Px), + MMUL(Py, t1, t2), + MMUL(t4, t3, t3), + MSUB(Py, t4), + MSUB(Py, t4), + + ENDCODE +}; + +/* + * Addtions formulas are: + * + * u1 = x1 * z2^2 + * u2 = x2 * z1^2 + * s1 = y1 * z2^3 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 * z2 + * + * If both P1 and P2 are infinity, then z1 == 0 and z2 == 0, implying that + * z3 == 0, so the result is correct. + * If either of P1 or P2 is infinity, but not both, then z3 == 0, which is + * not correct. + * h == 0 only if u1 == u2; this happens in two cases: + * -- if s1 == s2 then P1 and/or P2 is infinity, or P1 == P2 + * -- if s1 != s2 then P1 + P2 == infinity (but neither P1 or P2 is infinity) + * + * Thus, the following situations are not handled correctly: + * -- P1 = 0 and P2 != 0 + * -- P1 != 0 and P2 = 0 + * -- P1 = P2 + * All other cases are properly computed. However, even in "incorrect" + * situations, the three coordinates still are properly formed field + * elements. + * + * The returned flag is cleared if r == 0. This happens in the following + * cases: + * -- Both points are on the same horizontal line (same Y coordinate). + * -- Both points are infinity. + * -- One point is infinity and the other is on line Y = 0. + * The third case cannot happen with our curves (there is no valid point + * on line Y = 0 since that would be a point of order 2). 
If the two + * source points are non-infinity, then remains only the case where the + * two points are on the same horizontal line. + * + * This allows us to detect the "P1 == P2" case, assuming that P1 != 0 and + * P2 != 0: + * -- If the returned value is not the point at infinity, then it was properly + * computed. + * -- Otherwise, if the returned flag is 1, then P1+P2 = 0, and the result + * is indeed the point at infinity. + * -- Otherwise (result is infinity, flag is 0), then P1 = P2 and we should + * use the 'double' code. + * + * Cost: 16 multiplications + */ +static const uint16_t code_add[] = { + /* + * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3). + */ + MMUL(t3, P2z, P2z), + MMUL(t1, P1x, t3), + MMUL(t4, P2z, t3), + MMUL(t3, P1y, t4), + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + MMUL(t4, P1z, P1z), + MMUL(t2, P2x, t4), + MMUL(t5, P1z, t4), + MMUL(t4, P2y, t5), + + /* + * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4). + */ + MSUB(t2, t1), + MSUB(t4, t3), + + /* + * Report cases where r = 0 through the returned flag. + */ + MTZ(t4), + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5). + */ + MMUL(t7, t2, t2), + MMUL(t6, t1, t7), + MMUL(t5, t7, t2), + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + * t1 and t7 can be used as scratch registers. + */ + MMUL(P1x, t4, t4), + MSUB(P1x, t5), + MSUB(P1x, t6), + MSUB(P1x, t6), + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + MSUB(t6, P1x), + MMUL(P1y, t4, t6), + MMUL(t1, t5, t3), + MSUB(P1y, t1), + + /* + * Compute z3 = h*z1*z2. + */ + MMUL(t1, P1z, P2z), + MMUL(P1z, t1, t2), + + ENDCODE +}; + +/* + * Check that the point is on the curve. This code snippet assumes the + * following conventions: + * -- Coordinates x and y have been freshly decoded in P1 (but not + * converted to Montgomery coordinates yet). + * -- P2x, P2y and P2z are set to, respectively, R^2, b*R and 1. + */ +static const uint16_t code_check[] = { + + /* Convert x and y to Montgomery representation. 
*/ + MMUL(t1, P1x, P2x), + MMUL(t2, P1y, P2x), + MSET(P1x, t1), + MSET(P1y, t2), + + /* Compute x^3 in t1. */ + MMUL(t2, P1x, P1x), + MMUL(t1, P1x, t2), + + /* Subtract 3*x from t1. */ + MSUB(t1, P1x), + MSUB(t1, P1x), + MSUB(t1, P1x), + + /* Add b. */ + MADD(t1, P2y), + + /* Compute y^2 in t2. */ + MMUL(t2, P1y, P1y), + + /* Compare y^2 with x^3 - 3*x + b; they must match. */ + MSUB(t1, t2), + MTZ(t1), + + /* Set z to 1 (in Montgomery representation). */ + MMUL(P1z, P2x, P2z), + + ENDCODE +}; + +/* + * Conversion back to affine coordinates. This code snippet assumes that + * the z coordinate of P2 is set to 1 (not in Montgomery representation). + */ +static const uint16_t code_affine[] = { + + /* Save z*R in t1. */ + MSET(t1, P1z), + + /* Compute z^3 in t2. */ + MMUL(t2, P1z, P1z), + MMUL(t3, P1z, t2), + MMUL(t2, t3, P2z), + + /* Invert to (1/z^3) in t2. */ + MINV(t2, t3, t4), + + /* Compute y. */ + MSET(t3, P1y), + MMUL(P1y, t2, t3), + + /* Compute (1/z^2) in t3. */ + MMUL(t3, t2, t1), + + /* Compute x. */ + MSET(t2, P1x), + MMUL(P1x, t2, t3), + + ENDCODE +}; + +static uint32_t +run_code(jacobian *P1, const jacobian *P2, + const curve_params *cc, const uint16_t *code) +{ + uint32_t r; + uint16_t t[13][I15_LEN]; + size_t u; + + r = 1; + + /* + * Copy the two operands in the dedicated registers. + */ + memcpy(t[P1x], P1->c, 3 * I15_LEN * sizeof(uint16_t)); + memcpy(t[P2x], P2->c, 3 * I15_LEN * sizeof(uint16_t)); + + /* + * Run formulas. 
+ */ + for (u = 0;; u ++) { + unsigned op, d, a, b; + + op = code[u]; + if (op == 0) { + break; + } + d = (op >> 8) & 0x0F; + a = (op >> 4) & 0x0F; + b = op & 0x0F; + op >>= 12; + switch (op) { + uint32_t ctl; + size_t plen; + unsigned char tp[(BR_MAX_EC_SIZE + 7) >> 3]; + + case 0: + memcpy(t[d], t[a], I15_LEN * sizeof(uint16_t)); + break; + case 1: + ctl = br_i15_add(t[d], t[a], 1); + ctl |= NOT(br_i15_sub(t[d], cc->p, 0)); + br_i15_sub(t[d], cc->p, ctl); + break; + case 2: + br_i15_add(t[d], cc->p, br_i15_sub(t[d], t[a], 1)); + break; + case 3: + br_i15_montymul(t[d], t[a], t[b], cc->p, cc->p0i); + break; + case 4: + plen = (cc->p[0] - (cc->p[0] >> 4) + 7) >> 3; + br_i15_encode(tp, plen, cc->p); + tp[plen - 1] -= 2; + br_i15_modpow(t[d], tp, plen, + cc->p, cc->p0i, t[a], t[b]); + break; + default: + r &= ~br_i15_iszero(t[d]); + break; + } + } + + /* + * Copy back result. + */ + memcpy(P1->c, t[P1x], 3 * I15_LEN * sizeof(uint16_t)); + return r; +} + +static void +set_one(uint16_t *x, const uint16_t *p) +{ + size_t plen; + + plen = (p[0] + 31) >> 4; + memset(x, 0, plen * sizeof *x); + x[0] = p[0]; + x[1] = 0x0001; +} + +static void +point_zero(jacobian *P, const curve_params *cc) +{ + memset(P, 0, sizeof *P); + P->c[0][0] = P->c[1][0] = P->c[2][0] = cc->p[0]; +} + +static inline void +point_double(jacobian *P, const curve_params *cc) +{ + run_code(P, P, cc, code_double); +} + +static inline uint32_t +point_add(jacobian *P1, const jacobian *P2, const curve_params *cc) +{ + return run_code(P1, P2, cc, code_add); +} + +static void +point_mul(jacobian *P, const unsigned char *x, size_t xlen, + const curve_params *cc) +{ + /* + * We do a simple double-and-add ladder with a 2-bit window + * to make only one add every two doublings. We thus first + * precompute 2P and 3P in some local buffers. + * + * We always perform two doublings and one addition; the + * addition is with P, 2P and 3P and is done in a temporary + * array. 
+ * + * The addition code cannot handle cases where one of the + * operands is infinity, which is the case at the start of the + * ladder. We therefore need to maintain a flag that controls + * this situation. + */ + uint32_t qz; + jacobian P2, P3, Q, T, U; + + memcpy(&P2, P, sizeof P2); + point_double(&P2, cc); + memcpy(&P3, P, sizeof P3); + point_add(&P3, &P2, cc); + + point_zero(&Q, cc); + qz = 1; + while (xlen -- > 0) { + int k; + + for (k = 6; k >= 0; k -= 2) { + uint32_t bits; + uint32_t bnz; + + point_double(&Q, cc); + point_double(&Q, cc); + memcpy(&T, P, sizeof T); + memcpy(&U, &Q, sizeof U); + bits = (*x >> k) & (uint32_t)3; + bnz = NEQ(bits, 0); + CCOPY(EQ(bits, 2), &T, &P2, sizeof T); + CCOPY(EQ(bits, 3), &T, &P3, sizeof T); + point_add(&U, &T, cc); + CCOPY(bnz & qz, &Q, &T, sizeof Q); + CCOPY(bnz & ~qz, &Q, &U, sizeof Q); + qz &= ~bnz; + } + x ++; + } + memcpy(P, &Q, sizeof Q); +} + +/* + * Decode point into Jacobian coordinates. This function does not support + * the point at infinity. If the point is invalid then this returns 0, but + * the coordinates are still set to properly formed field elements. + */ +static uint32_t +point_decode(jacobian *P, const void *src, size_t len, const curve_params *cc) +{ + /* + * Points must use uncompressed format: + * -- first byte is 0x04; + * -- coordinates X and Y use unsigned big-endian, with the same + * length as the field modulus. + * + * We don't support hybrid format (uncompressed, but first byte + * has value 0x06 or 0x07, depending on the least significant bit + * of Y) because it is rather useless, and explicitly forbidden + * by PKIX (RFC 5480, section 2.2). + * + * We don't support compressed format either, because it is not + * much used in practice (there are or were patent-related + * concerns about point compression, which explains the lack of + * generalised support). Also, point compression support would + * need a bit more code. 
+ */ + const unsigned char *buf; + size_t plen, zlen; + uint32_t r; + jacobian Q; + + buf = src; + point_zero(P, cc); + plen = (cc->p[0] - (cc->p[0] >> 4) + 7) >> 3; + if (len != 1 + (plen << 1)) { + return 0; + } + r = br_i15_decode_mod(P->c[0], buf + 1, plen, cc->p); + r &= br_i15_decode_mod(P->c[1], buf + 1 + plen, plen, cc->p); + + /* + * Check first byte. + */ + r &= EQ(buf[0], 0x04); + /* obsolete + r &= EQ(buf[0], 0x04) | (EQ(buf[0] & 0xFE, 0x06) + & ~(uint32_t)(buf[0] ^ buf[plen << 1])); + */ + + /* + * Convert coordinates and check that the point is valid. + */ + zlen = ((cc->p[0] + 31) >> 4) * sizeof(uint16_t); + memcpy(Q.c[0], cc->R2, zlen); + memcpy(Q.c[1], cc->b, zlen); + set_one(Q.c[2], cc->p); + r &= ~run_code(P, &Q, cc, code_check); + return r; +} + +/* + * Encode a point. This method assumes that the point is correct and is + * not the point at infinity. Encoded size is always 1+2*plen, where + * plen is the field modulus length, in bytes. + */ +static void +point_encode(void *dst, const jacobian *P, const curve_params *cc) +{ + unsigned char *buf; + size_t plen; + jacobian Q, T; + + buf = dst; + plen = (cc->p[0] - (cc->p[0] >> 4) + 7) >> 3; + buf[0] = 0x04; + memcpy(&Q, P, sizeof *P); + set_one(T.c[2], cc->p); + run_code(&Q, &T, cc, code_affine); + br_i15_encode(buf + 1, plen, Q.c[0]); + br_i15_encode(buf + 1 + plen, plen, Q.c[1]); +} + +static const br_ec_curve_def * +id_to_curve_def(int curve) +{ + switch (curve) { + case BR_EC_secp256r1: + return &br_secp256r1; + case BR_EC_secp384r1: + return &br_secp384r1; + case BR_EC_secp521r1: + return &br_secp521r1; + } + return NULL; +} + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + const br_ec_curve_def *cd; + + cd = id_to_curve_def(curve); + *len = cd->generator_len; + return cd->generator; +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + const br_ec_curve_def *cd; + + cd = id_to_curve_def(curve); + *len = cd->order_len; + return cd->order; +} + 
+static size_t +api_xoff(int curve, size_t *len) +{ + api_generator(curve, len); + *len >>= 1; + return 1; +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *x, size_t xlen, int curve) +{ + uint32_t r; + const curve_params *cc; + jacobian P; + + cc = id_to_curve(curve); + r = point_decode(&P, G, Glen, cc); + point_mul(&P, x, xlen, cc); + if (Glen == cc->point_len) { + point_encode(G, &P, cc); + } + return r; +} + +static size_t +api_mulgen(unsigned char *R, + const unsigned char *x, size_t xlen, int curve) +{ + const unsigned char *G; + size_t Glen; + + G = api_generator(curve, &Glen); + memcpy(R, G, Glen); + api_mul(R, Glen, x, xlen, curve); + return Glen; +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + uint32_t r, t, z; + const curve_params *cc; + jacobian P, Q; + + /* + * TODO: see about merging the two ladders. Right now, we do + * two independent point multiplications, which is a bit + * wasteful of CPU resources (but yields short code). + */ + + cc = id_to_curve(curve); + r = point_decode(&P, A, len, cc); + if (B == NULL) { + size_t Glen; + + B = api_generator(curve, &Glen); + } + r &= point_decode(&Q, B, len, cc); + point_mul(&P, x, xlen, cc); + point_mul(&Q, y, ylen, cc); + + /* + * We want to compute P+Q. Since the base points A and B are distinct + * from infinity, and the multipliers are non-zero and lower than the + * curve order, then we know that P and Q are non-infinity. This + * leaves two special situations to test for: + * -- If P = Q then we must use point_double(). + * -- If P+Q = 0 then we must report an error. + */ + t = point_add(&P, &Q, cc); + point_double(&Q, cc); + z = br_i15_iszero(P.c[2]); + + /* + * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). 
So we + * have the following: + * + * z = 0, t = 0 return P (normal addition) + * z = 0, t = 1 return P (normal addition) + * z = 1, t = 0 return Q (a 'double' case) + * z = 1, t = 1 report an error (P+Q = 0) + */ + CCOPY(z & ~t, &P, &Q, sizeof Q); + point_encode(A, &P, cc); + r &= ~(z & t); + + return r; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_prime_i15 = { + (uint32_t)0x03800000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; diff --git a/test/monniaux/BearSSL/src/ec/ec_prime_i31.c b/test/monniaux/BearSSL/src/ec/ec_prime_i31.c new file mode 100644 index 00000000..0586a3b5 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_prime_i31.c @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +/* + * Parameters for supported curves (field modulus, and 'b' equation + * parameter; both values use the 'i31' format, and 'b' is in Montgomery + * representation). + */ + +static const uint32_t P256_P[] = { + 0x00000108, + 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x00000007, + 0x00000000, 0x00000000, 0x00000040, 0x7FFFFF80, + 0x000000FF +}; + +static const uint32_t P256_R2[] = { + 0x00000108, + 0x00014000, 0x00018000, 0x00000000, 0x7FF40000, + 0x7FEFFFFF, 0x7FF7FFFF, 0x7FAFFFFF, 0x005FFFFF, + 0x00000000 +}; + +static const uint32_t P256_B[] = { + 0x00000108, + 0x6FEE1803, 0x6229C4BD, 0x21B139BE, 0x327150AA, + 0x3567802E, 0x3F7212ED, 0x012E4355, 0x782DD38D, + 0x0000000E +}; + +static const uint32_t P384_P[] = { + 0x0000018C, + 0x7FFFFFFF, 0x00000001, 0x00000000, 0x7FFFFFF8, + 0x7FFFFFEF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, + 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, + 0x00000FFF +}; + +static const uint32_t P384_R2[] = { + 0x0000018C, + 0x00000000, 0x00000080, 0x7FFFFE00, 0x000001FF, + 0x00000800, 0x00000000, 0x7FFFE000, 0x00001FFF, + 0x00008000, 0x00008000, 0x00000000, 0x00000000, + 0x00000000 +}; + +static const uint32_t P384_B[] = { + 0x0000018C, + 0x6E666840, 0x070D0392, 0x5D810231, 0x7651D50C, + 0x17E218D6, 0x1B192002, 0x44EFE441, 0x3A524E2B, + 0x2719BA5F, 0x41F02209, 0x36C5643E, 0x5813EFFE, + 0x000008A5 +}; + +static const uint32_t P521_P[] = { + 0x00000219, + 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, + 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, + 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, + 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, + 0x01FFFFFF +}; + +static const uint32_t P521_R2[] = { + 0x00000219, + 0x00001000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000 +}; + +static const uint32_t P521_B[] = { + 0x00000219, + 0x540FC00A, 0x228FEA35, 0x2C34F1EF, 
0x67BF107A, + 0x46FC1CD5, 0x1605E9DD, 0x6937B165, 0x272A3D8F, + 0x42785586, 0x44C8C778, 0x15F3B8B4, 0x64B73366, + 0x03BA8B69, 0x0D05B42A, 0x21F929A2, 0x2C31C393, + 0x00654FAE +}; + +typedef struct { + const uint32_t *p; + const uint32_t *b; + const uint32_t *R2; + uint32_t p0i; +} curve_params; + +static inline const curve_params * +id_to_curve(int curve) +{ + static const curve_params pp[] = { + { P256_P, P256_B, P256_R2, 0x00000001 }, + { P384_P, P384_B, P384_R2, 0x00000001 }, + { P521_P, P521_B, P521_R2, 0x00000001 } + }; + + return &pp[curve - BR_EC_secp256r1]; +} + +#define I31_LEN ((BR_MAX_EC_SIZE + 61) / 31) + +/* + * Type for a point in Jacobian coordinates: + * -- three values, x, y and z, in Montgomery representation + * -- affine coordinates are X = x / z^2 and Y = y / z^3 + * -- for the point at infinity, z = 0 + */ +typedef struct { + uint32_t c[3][I31_LEN]; +} jacobian; + +/* + * We use a custom interpreter that uses a dozen registers, and + * only six operations: + * MSET(d, a) copy a into d + * MADD(d, a) d = d+a (modular) + * MSUB(d, a) d = d-a (modular) + * MMUL(d, a, b) d = a*b (Montgomery multiplication) + * MINV(d, a, b) invert d modulo p; a and b are used as scratch registers + * MTZ(d) clear return value if d = 0 + * Destination of MMUL (d) must be distinct from operands (a and b). + * There is no such constraint for MSUB and MADD. + * + * Registers include the operand coordinates, and temporaries. + */ +#define MSET(d, a) (0x0000 + ((d) << 8) + ((a) << 4)) +#define MADD(d, a) (0x1000 + ((d) << 8) + ((a) << 4)) +#define MSUB(d, a) (0x2000 + ((d) << 8) + ((a) << 4)) +#define MMUL(d, a, b) (0x3000 + ((d) << 8) + ((a) << 4) + (b)) +#define MINV(d, a, b) (0x4000 + ((d) << 8) + ((a) << 4) + (b)) +#define MTZ(d) (0x5000 + ((d) << 8)) +#define ENDCODE 0 + +/* + * Registers for the input operands. + */ +#define P1x 0 +#define P1y 1 +#define P1z 2 +#define P2x 3 +#define P2y 4 +#define P2z 5 + +/* + * Alternate names for the first input operand. 
+ */ +#define Px 0 +#define Py 1 +#define Pz 2 + +/* + * Temporaries. + */ +#define t1 6 +#define t2 7 +#define t3 8 +#define t4 9 +#define t5 10 +#define t6 11 +#define t7 12 + +/* + * Extra scratch registers available when there is no second operand (e.g. + * for "double" and "affine"). + */ +#define t8 3 +#define t9 4 +#define t10 5 + +/* + * Doubling formulas are: + * + * s = 4*x*y^2 + * m = 3*(x + z^2)*(x - z^2) + * x' = m^2 - 2*s + * y' = m*(s - x') - 8*y^4 + * z' = 2*y*z + * + * If y = 0 (P has order 2) then this yields infinity (z' = 0), as it + * should. This case should not happen anyway, because our curves have + * prime order, and thus do not contain any point of order 2. + * + * If P is infinity (z = 0), then again the formulas yield infinity, + * which is correct. Thus, this code works for all points. + * + * Cost: 8 multiplications + */ +static const uint16_t code_double[] = { + /* + * Compute z^2 (in t1). + */ + MMUL(t1, Pz, Pz), + + /* + * Compute x-z^2 (in t2) and then x+z^2 (in t1). + */ + MSET(t2, Px), + MSUB(t2, t1), + MADD(t1, Px), + + /* + * Compute m = 3*(x+z^2)*(x-z^2) (in t1). + */ + MMUL(t3, t1, t2), + MSET(t1, t3), + MADD(t1, t3), + MADD(t1, t3), + + /* + * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3). + */ + MMUL(t3, Py, Py), + MADD(t3, t3), + MMUL(t2, Px, t3), + MADD(t2, t2), + + /* + * Compute x' = m^2 - 2*s. + */ + MMUL(Px, t1, t1), + MSUB(Px, t2), + MSUB(Px, t2), + + /* + * Compute z' = 2*y*z. + */ + MMUL(t4, Py, Pz), + MSET(Pz, t4), + MADD(Pz, t4), + + /* + * Compute y' = m*(s - x') - 8*y^4. Note that we already have + * 2*y^2 in t3. 
+ */ + MSUB(t2, Px), + MMUL(Py, t1, t2), + MMUL(t4, t3, t3), + MSUB(Py, t4), + MSUB(Py, t4), + + ENDCODE +}; + +/* + * Addition formulas are: + * + * u1 = x1 * z2^2 + * u2 = x2 * z1^2 + * s1 = y1 * z2^3 + * s2 = y2 * z1^3 + * h = u2 - u1 + * r = s2 - s1 + * x3 = r^2 - h^3 - 2 * u1 * h^2 + * y3 = r * (u1 * h^2 - x3) - s1 * h^3 + * z3 = h * z1 * z2 + * + * If both P1 and P2 are infinity, then z1 == 0 and z2 == 0, implying that + * z3 == 0, so the result is correct. + * If either of P1 or P2 is infinity, but not both, then z3 == 0, which is + * not correct. + * h == 0 only if u1 == u2; this happens in two cases: + * -- if s1 == s2 then P1 and/or P2 is infinity, or P1 == P2 + * -- if s1 != s2 then P1 + P2 == infinity (but neither P1 nor P2 is infinity) + * + * Thus, the following situations are not handled correctly: + * -- P1 = 0 and P2 != 0 + * -- P1 != 0 and P2 = 0 + * -- P1 = P2 + * All other cases are properly computed. However, even in "incorrect" + * situations, the three coordinates still are properly formed field + * elements. + * + * The returned flag is cleared if r == 0. This happens in the following + * cases: + * -- Both points are on the same horizontal line (same Y coordinate). + * -- Both points are infinity. + * -- One point is infinity and the other is on line Y = 0. + * The third case cannot happen with our curves (there is no valid point + * on line Y = 0 since that would be a point of order 2). If the two + * source points are non-infinity, then remains only the case where the + * two points are on the same horizontal line. + * + * This allows us to detect the "P1 == P2" case, assuming that P1 != 0 and + * P2 != 0: + * -- If the returned value is not the point at infinity, then it was properly + * computed. + * -- Otherwise, if the returned flag is 1, then P1+P2 = 0, and the result + * is indeed the point at infinity. + * -- Otherwise (result is infinity, flag is 0), then P1 = P2 and we should + * use the 'double' code. 
+ * + * Cost: 16 multiplications + */ +static const uint16_t code_add[] = { + /* + * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3). + */ + MMUL(t3, P2z, P2z), + MMUL(t1, P1x, t3), + MMUL(t4, P2z, t3), + MMUL(t3, P1y, t4), + + /* + * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). + */ + MMUL(t4, P1z, P1z), + MMUL(t2, P2x, t4), + MMUL(t5, P1z, t4), + MMUL(t4, P2y, t5), + + /* + * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4). + */ + MSUB(t2, t1), + MSUB(t4, t3), + + /* + * Report cases where r = 0 through the returned flag. + */ + MTZ(t4), + + /* + * Compute u1*h^2 (in t6) and h^3 (in t5). + */ + MMUL(t7, t2, t2), + MMUL(t6, t1, t7), + MMUL(t5, t7, t2), + + /* + * Compute x3 = r^2 - h^3 - 2*u1*h^2. + * t1 and t7 can be used as scratch registers. + */ + MMUL(P1x, t4, t4), + MSUB(P1x, t5), + MSUB(P1x, t6), + MSUB(P1x, t6), + + /* + * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. + */ + MSUB(t6, P1x), + MMUL(P1y, t4, t6), + MMUL(t1, t5, t3), + MSUB(P1y, t1), + + /* + * Compute z3 = h*z1*z2. + */ + MMUL(t1, P1z, P2z), + MMUL(P1z, t1, t2), + + ENDCODE +}; + +/* + * Check that the point is on the curve. This code snippet assumes the + * following conventions: + * -- Coordinates x and y have been freshly decoded in P1 (but not + * converted to Montgomery coordinates yet). + * -- P2x, P2y and P2z are set to, respectively, R^2, b*R and 1. + */ +static const uint16_t code_check[] = { + + /* Convert x and y to Montgomery representation. */ + MMUL(t1, P1x, P2x), + MMUL(t2, P1y, P2x), + MSET(P1x, t1), + MSET(P1y, t2), + + /* Compute x^3 in t1. */ + MMUL(t2, P1x, P1x), + MMUL(t1, P1x, t2), + + /* Subtract 3*x from t1. */ + MSUB(t1, P1x), + MSUB(t1, P1x), + MSUB(t1, P1x), + + /* Add b. */ + MADD(t1, P2y), + + /* Compute y^2 in t2. */ + MMUL(t2, P1y, P1y), + + /* Compare y^2 with x^3 - 3*x + b; they must match. */ + MSUB(t1, t2), + MTZ(t1), + + /* Set z to 1 (in Montgomery representation). 
*/ + MMUL(P1z, P2x, P2z), + + ENDCODE +}; + +/* + * Conversion back to affine coordinates. This code snippet assumes that + * the z coordinate of P2 is set to 1 (not in Montgomery representation). + */ +static const uint16_t code_affine[] = { + + /* Save z*R in t1. */ + MSET(t1, P1z), + + /* Compute z^3 in t2. */ + MMUL(t2, P1z, P1z), + MMUL(t3, P1z, t2), + MMUL(t2, t3, P2z), + + /* Invert to (1/z^3) in t2. */ + MINV(t2, t3, t4), + + /* Compute y. */ + MSET(t3, P1y), + MMUL(P1y, t2, t3), + + /* Compute (1/z^2) in t3. */ + MMUL(t3, t2, t1), + + /* Compute x. */ + MSET(t2, P1x), + MMUL(P1x, t2, t3), + + ENDCODE +}; + +static uint32_t +run_code(jacobian *P1, const jacobian *P2, + const curve_params *cc, const uint16_t *code) +{ + uint32_t r; + uint32_t t[13][I31_LEN]; + size_t u; + + r = 1; + + /* + * Copy the two operands in the dedicated registers. + */ + memcpy(t[P1x], P1->c, 3 * I31_LEN * sizeof(uint32_t)); + memcpy(t[P2x], P2->c, 3 * I31_LEN * sizeof(uint32_t)); + + /* + * Run formulas. + */ + for (u = 0;; u ++) { + unsigned op, d, a, b; + + op = code[u]; + if (op == 0) { + break; + } + d = (op >> 8) & 0x0F; + a = (op >> 4) & 0x0F; + b = op & 0x0F; + op >>= 12; + switch (op) { + uint32_t ctl; + size_t plen; + unsigned char tp[(BR_MAX_EC_SIZE + 7) >> 3]; + + case 0: + memcpy(t[d], t[a], I31_LEN * sizeof(uint32_t)); + break; + case 1: + ctl = br_i31_add(t[d], t[a], 1); + ctl |= NOT(br_i31_sub(t[d], cc->p, 0)); + br_i31_sub(t[d], cc->p, ctl); + break; + case 2: + br_i31_add(t[d], cc->p, br_i31_sub(t[d], t[a], 1)); + break; + case 3: + br_i31_montymul(t[d], t[a], t[b], cc->p, cc->p0i); + break; + case 4: + plen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3; + br_i31_encode(tp, plen, cc->p); + tp[plen - 1] -= 2; + br_i31_modpow(t[d], tp, plen, + cc->p, cc->p0i, t[a], t[b]); + break; + default: + r &= ~br_i31_iszero(t[d]); + break; + } + } + + /* + * Copy back result. 
+ */ + memcpy(P1->c, t[P1x], 3 * I31_LEN * sizeof(uint32_t)); + return r; +} + +static void +set_one(uint32_t *x, const uint32_t *p) +{ + size_t plen; + + plen = (p[0] + 63) >> 5; + memset(x, 0, plen * sizeof *x); + x[0] = p[0]; + x[1] = 0x00000001; +} + +static void +point_zero(jacobian *P, const curve_params *cc) +{ + memset(P, 0, sizeof *P); + P->c[0][0] = P->c[1][0] = P->c[2][0] = cc->p[0]; +} + +static inline void +point_double(jacobian *P, const curve_params *cc) +{ + run_code(P, P, cc, code_double); +} + +static inline uint32_t +point_add(jacobian *P1, const jacobian *P2, const curve_params *cc) +{ + return run_code(P1, P2, cc, code_add); +} + +static void +point_mul(jacobian *P, const unsigned char *x, size_t xlen, + const curve_params *cc) +{ + /* + * We do a simple double-and-add ladder with a 2-bit window + * to make only one add every two doublings. We thus first + * precompute 2P and 3P in some local buffers. + * + * We always perform two doublings and one addition; the + * addition is with P, 2P and 3P and is done in a temporary + * array. + * + * The addition code cannot handle cases where one of the + * operands is infinity, which is the case at the start of the + * ladder. We therefore need to maintain a flag that controls + * this situation. 
+ */ + uint32_t qz; + jacobian P2, P3, Q, T, U; + + memcpy(&P2, P, sizeof P2); + point_double(&P2, cc); + memcpy(&P3, P, sizeof P3); + point_add(&P3, &P2, cc); + + point_zero(&Q, cc); + qz = 1; + while (xlen -- > 0) { + int k; + + for (k = 6; k >= 0; k -= 2) { + uint32_t bits; + uint32_t bnz; + + point_double(&Q, cc); + point_double(&Q, cc); + memcpy(&T, P, sizeof T); + memcpy(&U, &Q, sizeof U); + bits = (*x >> k) & (uint32_t)3; + bnz = NEQ(bits, 0); + CCOPY(EQ(bits, 2), &T, &P2, sizeof T); + CCOPY(EQ(bits, 3), &T, &P3, sizeof T); + point_add(&U, &T, cc); + CCOPY(bnz & qz, &Q, &T, sizeof Q); + CCOPY(bnz & ~qz, &Q, &U, sizeof Q); + qz &= ~bnz; + } + x ++; + } + memcpy(P, &Q, sizeof Q); +} + +/* + * Decode point into Jacobian coordinates. This function does not support + * the point at infinity. If the point is invalid then this returns 0, but + * the coordinates are still set to properly formed field elements. + */ +static uint32_t +point_decode(jacobian *P, const void *src, size_t len, const curve_params *cc) +{ + /* + * Points must use uncompressed format: + * -- first byte is 0x04; + * -- coordinates X and Y use unsigned big-endian, with the same + * length as the field modulus. + * + * We don't support hybrid format (uncompressed, but first byte + * has value 0x06 or 0x07, depending on the least significant bit + * of Y) because it is rather useless, and explicitly forbidden + * by PKIX (RFC 5480, section 2.2). + * + * We don't support compressed format either, because it is not + * much used in practice (there are or were patent-related + * concerns about point compression, which explains the lack of + * generalised support). Also, point compression support would + * need a bit more code. 
+ */ + const unsigned char *buf; + size_t plen, zlen; + uint32_t r; + jacobian Q; + + buf = src; + point_zero(P, cc); + plen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3; + if (len != 1 + (plen << 1)) { + return 0; + } + r = br_i31_decode_mod(P->c[0], buf + 1, plen, cc->p); + r &= br_i31_decode_mod(P->c[1], buf + 1 + plen, plen, cc->p); + + /* + * Check first byte. + */ + r &= EQ(buf[0], 0x04); + /* obsolete + r &= EQ(buf[0], 0x04) | (EQ(buf[0] & 0xFE, 0x06) + & ~(uint32_t)(buf[0] ^ buf[plen << 1])); + */ + + /* + * Convert coordinates and check that the point is valid. + */ + zlen = ((cc->p[0] + 63) >> 5) * sizeof(uint32_t); + memcpy(Q.c[0], cc->R2, zlen); + memcpy(Q.c[1], cc->b, zlen); + set_one(Q.c[2], cc->p); + r &= ~run_code(P, &Q, cc, code_check); + return r; +} + +/* + * Encode a point. This method assumes that the point is correct and is + * not the point at infinity. Encoded size is always 1+2*plen, where + * plen is the field modulus length, in bytes. + */ +static void +point_encode(void *dst, const jacobian *P, const curve_params *cc) +{ + unsigned char *buf; + uint32_t xbl; + size_t plen; + jacobian Q, T; + + buf = dst; + xbl = cc->p[0]; + xbl -= (xbl >> 5); + plen = (xbl + 7) >> 3; + buf[0] = 0x04; + memcpy(&Q, P, sizeof *P); + set_one(T.c[2], cc->p); + run_code(&Q, &T, cc, code_affine); + br_i31_encode(buf + 1, plen, Q.c[0]); + br_i31_encode(buf + 1 + plen, plen, Q.c[1]); +} + +static const br_ec_curve_def * +id_to_curve_def(int curve) +{ + switch (curve) { + case BR_EC_secp256r1: + return &br_secp256r1; + case BR_EC_secp384r1: + return &br_secp384r1; + case BR_EC_secp521r1: + return &br_secp521r1; + } + return NULL; +} + +static const unsigned char * +api_generator(int curve, size_t *len) +{ + const br_ec_curve_def *cd; + + cd = id_to_curve_def(curve); + *len = cd->generator_len; + return cd->generator; +} + +static const unsigned char * +api_order(int curve, size_t *len) +{ + const br_ec_curve_def *cd; + + cd = id_to_curve_def(curve); + *len = 
cd->order_len; + return cd->order; +} + +static size_t +api_xoff(int curve, size_t *len) +{ + api_generator(curve, len); + *len >>= 1; + return 1; +} + +static uint32_t +api_mul(unsigned char *G, size_t Glen, + const unsigned char *x, size_t xlen, int curve) +{ + uint32_t r; + const curve_params *cc; + jacobian P; + + cc = id_to_curve(curve); + r = point_decode(&P, G, Glen, cc); + point_mul(&P, x, xlen, cc); + point_encode(G, &P, cc); + return r; +} + +static size_t +api_mulgen(unsigned char *R, + const unsigned char *x, size_t xlen, int curve) +{ + const unsigned char *G; + size_t Glen; + + G = api_generator(curve, &Glen); + memcpy(R, G, Glen); + api_mul(R, Glen, x, xlen, curve); + return Glen; +} + +static uint32_t +api_muladd(unsigned char *A, const unsigned char *B, size_t len, + const unsigned char *x, size_t xlen, + const unsigned char *y, size_t ylen, int curve) +{ + uint32_t r, t, z; + const curve_params *cc; + jacobian P, Q; + + /* + * TODO: see about merging the two ladders. Right now, we do + * two independent point multiplications, which is a bit + * wasteful of CPU resources (but yields short code). + */ + + cc = id_to_curve(curve); + r = point_decode(&P, A, len, cc); + if (B == NULL) { + size_t Glen; + + B = api_generator(curve, &Glen); + } + r &= point_decode(&Q, B, len, cc); + point_mul(&P, x, xlen, cc); + point_mul(&Q, y, ylen, cc); + + /* + * We want to compute P+Q. Since the base points A and B are distinct + * from infinity, and the multipliers are non-zero and lower than the + * curve order, then we know that P and Q are non-infinity. This + * leaves two special situations to test for: + * -- If P = Q then we must use point_double(). + * -- If P+Q = 0 then we must report an error. + */ + t = point_add(&P, &Q, cc); + point_double(&Q, cc); + z = br_i31_iszero(P.c[2]); + + /* + * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). 
So we + * have the following: + * + * z = 0, t = 0 return P (normal addition) + * z = 0, t = 1 return P (normal addition) + * z = 1, t = 0 return Q (a 'double' case) + * z = 1, t = 1 report an error (P+Q = 0) + */ + CCOPY(z & ~t, &P, &Q, sizeof Q); + point_encode(A, &P, cc); + r &= ~(z & t); + + return r; +} + +/* see bearssl_ec.h */ +const br_ec_impl br_ec_prime_i31 = { + (uint32_t)0x03800000, + &api_generator, + &api_order, + &api_xoff, + &api_mul, + &api_mulgen, + &api_muladd +}; diff --git a/test/monniaux/BearSSL/src/ec/ec_pubkey.c b/test/monniaux/BearSSL/src/ec/ec_pubkey.c new file mode 100644 index 00000000..383ff286 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_pubkey.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +static const unsigned char POINT_LEN[] = { + 0, /* 0: not a valid curve ID */ + 43, /* sect163k1 */ + 43, /* sect163r1 */ + 43, /* sect163r2 */ + 51, /* sect193r1 */ + 51, /* sect193r2 */ + 61, /* sect233k1 */ + 61, /* sect233r1 */ + 61, /* sect239k1 */ + 73, /* sect283k1 */ + 73, /* sect283r1 */ + 105, /* sect409k1 */ + 105, /* sect409r1 */ + 145, /* sect571k1 */ + 145, /* sect571r1 */ + 41, /* secp160k1 */ + 41, /* secp160r1 */ + 41, /* secp160r2 */ + 49, /* secp192k1 */ + 49, /* secp192r1 */ + 57, /* secp224k1 */ + 57, /* secp224r1 */ + 65, /* secp256k1 */ + 65, /* secp256r1 */ + 97, /* secp384r1 */ + 133, /* secp521r1 */ + 65, /* brainpoolP256r1 */ + 97, /* brainpoolP384r1 */ + 129, /* brainpoolP512r1 */ + 32, /* curve25519 */ + 56, /* curve448 */ +}; + +/* + * see bearssl_ec.h + * + * Compute the public key matching private key sk: impl->mulgen() writes + * the encoded point into kbuf, and its length (in bytes) is returned. + * + * Returned value is 0 if sk->curve is negative, is 32 or more, falls + * outside the POINT_LEN table, or is not flagged in the + * impl->supported_curves bit mask. + * + * If kbuf is NULL then nothing is computed, and the returned length is + * read from POINT_LEN (indexed by curve ID). If pk is not NULL then it + * is filled with the curve ID, the kbuf pointer and the returned length. + */ +size_t +br_ec_compute_pub(const br_ec_impl *impl, br_ec_public_key *pk, + void *kbuf, const br_ec_private_key *sk) +{ + int curve; + size_t len; + + curve = sk->curve; + if (curve < 0 || curve >= 32 || curve >= (int)(sizeof POINT_LEN) + || ((impl->supported_curves >> curve) & 1) == 0) + { + return 0; + } + if (kbuf == NULL) { + return POINT_LEN[curve]; + } + len = impl->mulgen(kbuf, sk->x, sk->xlen, curve); + if (pk != NULL) { + pk->curve = curve; + pk->q = kbuf; + pk->qlen = len; + } + return len; +} diff --git a/test/monniaux/BearSSL/src/ec/ec_secp256r1.c b/test/monniaux/BearSSL/src/ec/ec_secp256r1.c new file mode 100644 index 00000000..a9d6c456 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_secp256r1.c @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons 
to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +static const unsigned char P256_N[] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xBC, 0xE6, 0xFA, 0xAD, 0xA7, 0x17, 0x9E, 0x84, + 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63, 0x25, 0x51 +}; + +static const unsigned char P256_G[] = { + 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, + 0x47, 0xF8, 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, + 0xF2, 0x77, 0x03, 0x7D, 0x81, 0x2D, 0xEB, 0x33, + 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8, 0x98, 0xC2, + 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F, + 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, + 0x16, 0x2B, 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, + 0xCE, 0xCB, 0xB6, 0x40, 0x68, 0x37, 0xBF, 0x51, + 0xF5 +}; + +/* see inner.h */ +const br_ec_curve_def br_secp256r1 = { + BR_EC_secp256r1, + P256_N, sizeof P256_N, + P256_G, sizeof P256_G +}; diff --git a/test/monniaux/BearSSL/src/ec/ec_secp384r1.c b/test/monniaux/BearSSL/src/ec/ec_secp384r1.c new file mode 100644 index 00000000..693d93e4 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_secp384r1.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated 
documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +static const unsigned char P384_N[] = { + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xC7, 0x63, 0x4D, 0x81, 0xF4, 0x37, 0x2D, 0xDF, + 0x58, 0x1A, 0x0D, 0xB2, 0x48, 0xB0, 0xA7, 0x7A, + 0xEC, 0xEC, 0x19, 0x6A, 0xCC, 0xC5, 0x29, 0x73 +}; + +static const unsigned char P384_G[] = { + 0x04, 0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, + 0x37, 0x8E, 0xB1, 0xC7, 0x1E, 0xF3, 0x20, 0xAD, + 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, + 0x98, 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, + 0x38, 0x55, 0x02, 0xF2, 0x5D, 0xBF, 0x55, 0x29, + 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, + 0xB7, 0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, + 0x6F, 0x5D, 0x9E, 0x98, 0xBF, 0x92, 0x92, 0xDC, + 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, + 0x7C, 0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, + 0xC0, 0x0A, 0x60, 0xB1, 0xCE, 0x1D, 0x7E, 0x81, + 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, + 
0x5F +}; + +/* see inner.h */ +const br_ec_curve_def br_secp384r1 = { + BR_EC_secp384r1, + P384_N, sizeof P384_N, + P384_G, sizeof P384_G +}; diff --git a/test/monniaux/BearSSL/src/ec/ec_secp521r1.c b/test/monniaux/BearSSL/src/ec/ec_secp521r1.c new file mode 100644 index 00000000..161acd0e --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ec_secp521r1.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +static const unsigned char P521_N[] = { + 0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFA, 0x51, 0x86, 0x87, 0x83, 0xBF, 0x2F, + 0x96, 0x6B, 0x7F, 0xCC, 0x01, 0x48, 0xF7, 0x09, + 0xA5, 0xD0, 0x3B, 0xB5, 0xC9, 0xB8, 0x89, 0x9C, + 0x47, 0xAE, 0xBB, 0x6F, 0xB7, 0x1E, 0x91, 0x38, + 0x64, 0x09 +}; + +static const unsigned char P521_G[] = { + 0x04, 0x00, 0xC6, 0x85, 0x8E, 0x06, 0xB7, 0x04, + 0x04, 0xE9, 0xCD, 0x9E, 0x3E, 0xCB, 0x66, 0x23, + 0x95, 0xB4, 0x42, 0x9C, 0x64, 0x81, 0x39, 0x05, + 0x3F, 0xB5, 0x21, 0xF8, 0x28, 0xAF, 0x60, 0x6B, + 0x4D, 0x3D, 0xBA, 0xA1, 0x4B, 0x5E, 0x77, 0xEF, + 0xE7, 0x59, 0x28, 0xFE, 0x1D, 0xC1, 0x27, 0xA2, + 0xFF, 0xA8, 0xDE, 0x33, 0x48, 0xB3, 0xC1, 0x85, + 0x6A, 0x42, 0x9B, 0xF9, 0x7E, 0x7E, 0x31, 0xC2, + 0xE5, 0xBD, 0x66, 0x01, 0x18, 0x39, 0x29, 0x6A, + 0x78, 0x9A, 0x3B, 0xC0, 0x04, 0x5C, 0x8A, 0x5F, + 0xB4, 0x2C, 0x7D, 0x1B, 0xD9, 0x98, 0xF5, 0x44, + 0x49, 0x57, 0x9B, 0x44, 0x68, 0x17, 0xAF, 0xBD, + 0x17, 0x27, 0x3E, 0x66, 0x2C, 0x97, 0xEE, 0x72, + 0x99, 0x5E, 0xF4, 0x26, 0x40, 0xC5, 0x50, 0xB9, + 0x01, 0x3F, 0xAD, 0x07, 0x61, 0x35, 0x3C, 0x70, + 0x86, 0xA2, 0x72, 0xC2, 0x40, 0x88, 0xBE, 0x94, + 0x76, 0x9F, 0xD1, 0x66, 0x50 +}; + +/* see inner.h */ +const br_ec_curve_def br_secp521r1 = { + BR_EC_secp521r1, + P521_N, sizeof P521_N, + P521_G, sizeof P521_G +}; diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_atr.c b/test/monniaux/BearSSL/src/ec/ecdsa_atr.c new file mode 100644 index 00000000..3a11226e --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_atr.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + 
/*
 * Convert an ECDSA signature from ASN.1 to "raw" format, in place
 * (see bearssl_ec.h).
 *
 * sig[] initially holds sig_len bytes: a SEQUENCE of two INTEGER
 * values (r, then s). On success, sig[] is overwritten with r and s,
 * each stripped of its leading zero bytes and then left-padded with
 * zeros to their common length; the new total length (twice that
 * common length, at most 254) is returned. On any decoding error, 0 is
 * returned and sig[] is left untouched. 0 is also returned when both
 * integers are zero, since the raw encoding is then empty.
 *
 * The raw output can be LONGER than the ASN.1 input when r and s have
 * very different lengths; the caller's buffer must be able to hold
 * twice the longer of the two integers.
 *
 * Decoding is deliberately a bit lenient with regards to DER: a
 * non-minimal two-byte SEQUENCE length (0x81 followed by a value below
 * 0x80), extra leading zeros, and integers whose top bit is set are
 * all accepted. A length byte of exactly 0x80 is rejected, though: it
 * is the indefinite-length marker, not a one-byte encoding of 128.
 */
size_t
br_ecdsa_asn1_to_raw(void *sig, size_t sig_len)
{
	unsigned char *buf, *r, *s;
	size_t zlen, rlen, slen, off;
	unsigned char tmp[254];

	/*
	 * The shortest acceptable input (30 06 02 01 xx 02 01 xx) has
	 * length 8; this also guarantees that the header bytes read
	 * below are within the buffer.
	 */
	buf = sig;
	if (sig_len < 8) {
		return 0;
	}

	/*
	 * First byte is SEQUENCE tag.
	 */
	if (buf[0] != 0x30) {
		return 0;
	}

	/*
	 * The SEQUENCE length is encoded over one byte (short form,
	 * value 0x00 to 0x7F) or two bytes (0x81, then the length).
	 * Anything else, including the indefinite-length marker 0x80,
	 * is rejected. The declared length must match exactly what
	 * remains of the input.
	 */
	zlen = buf[1];
	if (zlen >= 0x80) {
		if (zlen != 0x81) {
			return 0;
		}
		zlen = buf[2];
		if (zlen != sig_len - 3) {
			return 0;
		}
		off = 3;
	} else {
		if (zlen != sig_len - 2) {
			return 0;
		}
		off = 2;
	}

	/*
	 * First INTEGER (r): tag, one-byte length (at most 127), value.
	 */
	if (buf[off ++] != 0x02) {
		return 0;
	}
	rlen = buf[off ++];
	if (rlen >= 0x80) {
		return 0;
	}
	r = buf + off;
	off += rlen;

	/*
	 * Second INTEGER (s). Its header must fit in the input, and
	 * its value must extend exactly to the end of the input.
	 */
	if (off + 2 > sig_len) {
		return 0;
	}
	if (buf[off ++] != 0x02) {
		return 0;
	}
	slen = buf[off ++];
	if (slen >= 0x80 || slen != sig_len - off) {
		return 0;
	}
	s = buf + off;

	/*
	 * Removing leading zeros from r and s.
	 */
	while (rlen > 0 && *r == 0) {
		rlen --;
		r ++;
	}
	while (slen > 0 && *s == 0) {
		slen --;
		s ++;
	}

	/*
	 * Compute common length for the two integers, then copy integers
	 * into the temporary buffer (right-aligned in their respective
	 * halves), and finally copy it back over the signature buffer.
	 * Both lengths are at most 127, so tmp[] is large enough.
	 */
	zlen = rlen > slen ? rlen : slen;
	sig_len = zlen << 1;
	memset(tmp, 0, sig_len);
	memcpy(tmp + zlen - rlen, r, rlen);
	memcpy(tmp + sig_len - slen, s, slen);
	memcpy(sig, tmp, sig_len);
	return sig_len;
}
permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_ec.h */ +br_ecdsa_sign +br_ecdsa_sign_asn1_get_default(void) +{ +#if BR_LOMUL + return &br_ecdsa_i15_sign_asn1; +#else + return &br_ecdsa_i31_sign_asn1; +#endif +} diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_raw.c new file mode 100644 index 00000000..287c9704 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_default_sign_raw.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_ec.h */ +br_ecdsa_sign +br_ecdsa_sign_raw_get_default(void) +{ +#if BR_LOMUL + return &br_ecdsa_i15_sign_raw; +#else + return &br_ecdsa_i31_sign_raw; +#endif +} diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_asn1.c new file mode 100644 index 00000000..fe0996e8 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_asn1.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +/* see bearssl_ec.h */ +br_ecdsa_vrfy +br_ecdsa_vrfy_asn1_get_default(void) +{ +#if BR_LOMUL + return &br_ecdsa_i15_vrfy_asn1; +#else + return &br_ecdsa_i31_vrfy_asn1; +#endif +} diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_raw.c new file mode 100644 index 00000000..e564a105 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_default_vrfy_raw.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +/* see bearssl_ec.h */ +br_ecdsa_vrfy +br_ecdsa_vrfy_raw_get_default(void) +{ +#if BR_LOMUL + return &br_ecdsa_i15_vrfy_raw; +#else + return &br_ecdsa_i31_vrfy_raw; +#endif +} diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_bits.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_bits.c new file mode 100644 index 00000000..402d14a6 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_bits.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
/*
 * Decode a byte string (typically a hash value) into the i15 integer
 * x[], keeping at most as many leading bits as the modulus has (see
 * inner.h).
 *
 *   x        destination integer
 *   src      source bytes
 *   len      source length (in bytes)
 *   ebitlen  "encoded" bit length of the modulus, i.e. the value of
 *            the header word of the modulus; it is also written as
 *            header word of x[]
 *
 * When the source is longer (in bits) than the modulus, only its first
 * bytes are decoded, and the result is shifted right so that exactly
 * the leading bits remain. No modular reduction is done here.
 */
void
br_ecdsa_i15_bits2int(uint16_t *x,
	const void *src, size_t len, uint32_t ebitlen)
{
	uint32_t mod_bits, src_bits;
	int shift;

	/* True bit length recovered from the encoded bit length. */
	mod_bits = ebitlen - (ebitlen >> 4);
	src_bits = (uint32_t)len << 3;

	shift = 0;
	if (src_bits > mod_bits) {
		/*
		 * Keep only the bytes that cover the leading bits; the
		 * excess bits of the last kept byte (at most 7) are
		 * removed with the final right shift.
		 */
		len = (mod_bits + 7) >> 3;
		shift = (int)((src_bits - mod_bits) & 7);
	}

	br_i15_zero(x, ebitlen);
	br_i15_decode(x, src, len);
	br_i15_rshift(x, shift);
	x[0] = ebitlen;
}
+ */ + +#include "inner.h" + +#define ORDER_LEN ((BR_MAX_EC_SIZE + 7) >> 3) + +/* see bearssl_ec.h */ +size_t +br_ecdsa_i15_sign_asn1(const br_ec_impl *impl, + const br_hash_class *hf, const void *hash_value, + const br_ec_private_key *sk, void *sig) +{ + unsigned char rsig[(ORDER_LEN << 1) + 12]; + size_t sig_len; + + sig_len = br_ecdsa_i15_sign_raw(impl, hf, hash_value, sk, rsig); + if (sig_len == 0) { + return 0; + } + sig_len = br_ecdsa_raw_to_asn1(rsig, sig_len); + memcpy(sig, rsig, sig_len); + return sig_len; +} diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_raw.c new file mode 100644 index 00000000..39b2e1d7 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_sign_raw.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +#define I15_LEN ((BR_MAX_EC_SIZE + 29) / 15) +#define POINT_LEN (1 + (((BR_MAX_EC_SIZE + 7) >> 3) << 1)) +#define ORDER_LEN ((BR_MAX_EC_SIZE + 7) >> 3) + +/* see bearssl_ec.h */ +size_t +br_ecdsa_i15_sign_raw(const br_ec_impl *impl, + const br_hash_class *hf, const void *hash_value, + const br_ec_private_key *sk, void *sig) +{ + /* + * IMPORTANT: this code is fit only for curves with a prime + * order. This is needed so that modular reduction of the X + * coordinate of a point can be done with a simple subtraction. + * We also rely on the last byte of the curve order to be distinct + * from 0 and 1. + */ + const br_ec_curve_def *cd; + uint16_t n[I15_LEN], r[I15_LEN], s[I15_LEN], x[I15_LEN]; + uint16_t m[I15_LEN], k[I15_LEN], t1[I15_LEN], t2[I15_LEN]; + unsigned char tt[ORDER_LEN << 1]; + unsigned char eU[POINT_LEN]; + size_t hash_len, nlen, ulen; + uint16_t n0i; + uint32_t ctl; + br_hmac_drbg_context drbg; + + /* + * If the curve is not supported, then exit with an error. + */ + if (((impl->supported_curves >> sk->curve) & 1) == 0) { + return 0; + } + + /* + * Get the curve parameters (generator and order). + */ + switch (sk->curve) { + case BR_EC_secp256r1: + cd = &br_secp256r1; + break; + case BR_EC_secp384r1: + cd = &br_secp384r1; + break; + case BR_EC_secp521r1: + cd = &br_secp521r1; + break; + default: + return 0; + } + + /* + * Get modulus. + */ + nlen = cd->order_len; + br_i15_decode(n, cd->order, nlen); + n0i = br_i15_ninv15(n[1]); + + /* + * Get private key as an i15 integer. This also checks that the + * private key is well-defined (not zero, and less than the + * curve order). + */ + if (!br_i15_decode_mod(x, sk->x, sk->xlen, n)) { + return 0; + } + if (br_i15_iszero(x)) { + return 0; + } + + /* + * Get hash length. + */ + hash_len = (hf->desc >> BR_HASHDESC_OUT_OFF) & BR_HASHDESC_OUT_MASK; + + /* + * Truncate and reduce the hash value modulo the curve order. 
+ */ + br_ecdsa_i15_bits2int(m, hash_value, hash_len, n[0]); + br_i15_sub(m, n, br_i15_sub(m, n, 0) ^ 1); + + /* + * RFC 6979 generation of the "k" value. + * + * The process uses HMAC_DRBG (with the hash function used to + * process the message that is to be signed). The seed is the + * concatenation of the encodings of the private key and + * the hash value (after truncation and modular reduction). + */ + br_i15_encode(tt, nlen, x); + br_i15_encode(tt + nlen, nlen, m); + br_hmac_drbg_init(&drbg, hf, tt, nlen << 1); + for (;;) { + br_hmac_drbg_generate(&drbg, tt, nlen); + br_ecdsa_i15_bits2int(k, tt, nlen, n[0]); + if (br_i15_iszero(k)) { + continue; + } + if (br_i15_sub(k, n, 0)) { + break; + } + } + + /* + * Compute k*G and extract the X coordinate, then reduce it + * modulo the curve order. Since we support only curves with + * prime order, that reduction is only a matter of computing + * a subtraction. + */ + br_i15_encode(tt, nlen, k); + ulen = impl->mulgen(eU, tt, nlen, sk->curve); + br_i15_zero(r, n[0]); + br_i15_decode(r, &eU[1], ulen >> 1); + r[0] = n[0]; + br_i15_sub(r, n, br_i15_sub(r, n, 0) ^ 1); + + /* + * Compute 1/k in double-Montgomery representation. We do so by + * first converting _from_ Montgomery representation (twice), + * then using a modular exponentiation. + */ + br_i15_from_monty(k, n, n0i); + br_i15_from_monty(k, n, n0i); + memcpy(tt, cd->order, nlen); + tt[nlen - 1] -= 2; + br_i15_modpow(k, tt, nlen, n, n0i, t1, t2); + + /* + * Compute s = (m+xr)/k (mod n). + * The k[] array contains R^2/k (double-Montgomery representation); + * we thus can use direct Montgomery multiplications and conversions + * from Montgomery, avoiding any call to br_i15_to_monty() (which + * is slower). + */ + br_i15_from_monty(m, n, n0i); + br_i15_montymul(t1, x, r, n, n0i); + ctl = br_i15_add(t1, m, 1); + ctl |= br_i15_sub(t1, n, 0) ^ 1; + br_i15_sub(t1, n, ctl); + br_i15_montymul(s, t1, k, n, n0i); + + /* + * Encode r and s in the signature. 
+ */ + br_i15_encode(sig, nlen, r); + br_i15_encode((unsigned char *)sig + nlen, nlen, s); + return nlen << 1; +} diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_asn1.c new file mode 100644 index 00000000..f4bef997 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_asn1.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +#define FIELD_LEN ((BR_MAX_EC_SIZE + 7) >> 3) + +/* see bearssl_ec.h */ +uint32_t +br_ecdsa_i15_vrfy_asn1(const br_ec_impl *impl, + const void *hash, size_t hash_len, + const br_ec_public_key *pk, + const void *sig, size_t sig_len) +{ + /* + * We use a double-sized buffer because a malformed ASN.1 signature + * may trigger a size expansion when converting to "raw" format. 
+ */ + unsigned char rsig[(FIELD_LEN << 2) + 24]; + + if (sig_len > ((sizeof rsig) >> 1)) { + return 0; + } + memcpy(rsig, sig, sig_len); + sig_len = br_ecdsa_asn1_to_raw(rsig, sig_len); + return br_ecdsa_i15_vrfy_raw(impl, hash, hash_len, pk, rsig, sig_len); +} diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_raw.c new file mode 100644 index 00000000..14dd5e46 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_i15_vrfy_raw.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +#define I15_LEN ((BR_MAX_EC_SIZE + 29) / 15) +#define POINT_LEN (1 + (((BR_MAX_EC_SIZE + 7) >> 3) << 1)) + +/* see bearssl_ec.h */ +uint32_t +br_ecdsa_i15_vrfy_raw(const br_ec_impl *impl, + const void *hash, size_t hash_len, + const br_ec_public_key *pk, + const void *sig, size_t sig_len) +{ + /* + * IMPORTANT: this code is fit only for curves with a prime + * order. This is needed so that modular reduction of the X + * coordinate of a point can be done with a simple subtraction. + */ + const br_ec_curve_def *cd; + uint16_t n[I15_LEN], r[I15_LEN], s[I15_LEN], t1[I15_LEN], t2[I15_LEN]; + unsigned char tx[(BR_MAX_EC_SIZE + 7) >> 3]; + unsigned char ty[(BR_MAX_EC_SIZE + 7) >> 3]; + unsigned char eU[POINT_LEN]; + size_t nlen, rlen, ulen; + uint16_t n0i; + uint32_t res; + + /* + * If the curve is not supported, then report an error. + */ + if (((impl->supported_curves >> pk->curve) & 1) == 0) { + return 0; + } + + /* + * Get the curve parameters (generator and order). + */ + switch (pk->curve) { + case BR_EC_secp256r1: + cd = &br_secp256r1; + break; + case BR_EC_secp384r1: + cd = &br_secp384r1; + break; + case BR_EC_secp521r1: + cd = &br_secp521r1; + break; + default: + return 0; + } + + /* + * Signature length must be even. + */ + if (sig_len & 1) { + return 0; + } + rlen = sig_len >> 1; + + /* + * Public key point must have the proper size for this curve. + */ + if (pk->qlen != cd->generator_len) { + return 0; + } + + /* + * Get modulus; then decode the r and s values. They must be + * lower than the modulus, and s must not be null. + */ + nlen = cd->order_len; + br_i15_decode(n, cd->order, nlen); + n0i = br_i15_ninv15(n[1]); + if (!br_i15_decode_mod(r, sig, rlen, n)) { + return 0; + } + if (!br_i15_decode_mod(s, (const unsigned char *)sig + rlen, rlen, n)) { + return 0; + } + if (br_i15_iszero(s)) { + return 0; + } + + /* + * Invert s. 
We do that with a modular exponentiation; we use + * the fact that for all the curves we support, the least + * significant byte is not 0 or 1, so we can subtract 2 without + * any carry to process. + * We also want 1/s in Montgomery representation, which can be + * done by converting _from_ Montgomery representation before + * the inversion (because (1/s)*R = 1/(s/R)). + */ + br_i15_from_monty(s, n, n0i); + memcpy(tx, cd->order, nlen); + tx[nlen - 1] -= 2; + br_i15_modpow(s, tx, nlen, n, n0i, t1, t2); + + /* + * Truncate the hash to the modulus length (in bits) and reduce + * it modulo the curve order. The modular reduction can be done + * with a subtraction since the truncation already reduced the + * value to the modulus bit length. + */ + br_ecdsa_i15_bits2int(t1, hash, hash_len, n[0]); + br_i15_sub(t1, n, br_i15_sub(t1, n, 0) ^ 1); + + /* + * Multiply the (truncated, reduced) hash value with 1/s, result in + * t2, encoded in ty. + */ + br_i15_montymul(t2, t1, s, n, n0i); + br_i15_encode(ty, nlen, t2); + + /* + * Multiply r with 1/s, result in t1, encoded in tx. + */ + br_i15_montymul(t1, r, s, n, n0i); + br_i15_encode(tx, nlen, t1); + + /* + * Compute the point x*Q + y*G. + */ + ulen = cd->generator_len; + memcpy(eU, pk->q, ulen); + res = impl->muladd(eU, NULL, ulen, + tx, nlen, ty, nlen, cd->curve); + + /* + * Get the X coordinate, reduce modulo the curve order, and + * compare with the 'r' value. + * + * The modular reduction can be done with subtractions because + * we work with curves of prime order, so the curve order is + * close to the field order (Hasse's theorem). 
+ */ + br_i15_zero(t1, n[0]); + br_i15_decode(t1, &eU[1], ulen >> 1); + t1[0] = n[0]; + br_i15_sub(t1, n, br_i15_sub(t1, n, 0) ^ 1); + res &= ~br_i15_sub(t1, r, 1); + res &= br_i15_iszero(t1); + return res; +} diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_bits.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_bits.c new file mode 100644 index 00000000..9a8d6730 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_bits.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
/*
 * Decode a byte string (typically a hash value) into the i31 integer
 * x[], keeping at most as many leading bits as the modulus has (see
 * inner.h).
 *
 *   x        destination integer
 *   src      source bytes
 *   len      source length (in bytes)
 *   ebitlen  "encoded" bit length of the modulus, i.e. the value of
 *            the header word of the modulus; it is also written as
 *            header word of x[]
 *
 * When the source is longer (in bits) than the modulus, only its first
 * bytes are decoded, and the result is shifted right so that exactly
 * the leading bits remain. No modular reduction is done here.
 */
void
br_ecdsa_i31_bits2int(uint32_t *x,
	const void *src, size_t len, uint32_t ebitlen)
{
	uint32_t mod_bits, src_bits;
	int shift;

	/* True bit length recovered from the encoded bit length. */
	mod_bits = ebitlen - (ebitlen >> 5);
	src_bits = (uint32_t)len << 3;

	shift = 0;
	if (src_bits > mod_bits) {
		/*
		 * Keep only the bytes that cover the leading bits; the
		 * excess bits of the last kept byte (at most 7) are
		 * removed with the final right shift.
		 */
		len = (mod_bits + 7) >> 3;
		shift = (int)((src_bits - mod_bits) & 7);
	}

	br_i31_zero(x, ebitlen);
	br_i31_decode(x, src, len);
	br_i31_rshift(x, shift);
	x[0] = ebitlen;
}
+ */ + +#include "inner.h" + +#define ORDER_LEN ((BR_MAX_EC_SIZE + 7) >> 3) + +/* see bearssl_ec.h */ +size_t +br_ecdsa_i31_sign_asn1(const br_ec_impl *impl, + const br_hash_class *hf, const void *hash_value, + const br_ec_private_key *sk, void *sig) +{ + unsigned char rsig[(ORDER_LEN << 1) + 12]; + size_t sig_len; + + sig_len = br_ecdsa_i31_sign_raw(impl, hf, hash_value, sk, rsig); + if (sig_len == 0) { + return 0; + } + sig_len = br_ecdsa_raw_to_asn1(rsig, sig_len); + memcpy(sig, rsig, sig_len); + return sig_len; +} diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_raw.c new file mode 100644 index 00000000..1df98fed --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_sign_raw.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +#define I31_LEN ((BR_MAX_EC_SIZE + 61) / 31) +#define POINT_LEN (1 + (((BR_MAX_EC_SIZE + 7) >> 3) << 1)) +#define ORDER_LEN ((BR_MAX_EC_SIZE + 7) >> 3) + +/* see bearssl_ec.h */ +size_t +br_ecdsa_i31_sign_raw(const br_ec_impl *impl, + const br_hash_class *hf, const void *hash_value, + const br_ec_private_key *sk, void *sig) +{ + /* + * IMPORTANT: this code is fit only for curves with a prime + * order. This is needed so that modular reduction of the X + * coordinate of a point can be done with a simple subtraction. + * We also rely on the last byte of the curve order to be distinct + * from 0 and 1. + */ + const br_ec_curve_def *cd; + uint32_t n[I31_LEN], r[I31_LEN], s[I31_LEN], x[I31_LEN]; + uint32_t m[I31_LEN], k[I31_LEN], t1[I31_LEN], t2[I31_LEN]; + unsigned char tt[ORDER_LEN << 1]; + unsigned char eU[POINT_LEN]; + size_t hash_len, nlen, ulen; + uint32_t n0i, ctl; + br_hmac_drbg_context drbg; + + /* + * If the curve is not supported, then exit with an error. + */ + if (((impl->supported_curves >> sk->curve) & 1) == 0) { + return 0; + } + + /* + * Get the curve parameters (generator and order). + */ + switch (sk->curve) { + case BR_EC_secp256r1: + cd = &br_secp256r1; + break; + case BR_EC_secp384r1: + cd = &br_secp384r1; + break; + case BR_EC_secp521r1: + cd = &br_secp521r1; + break; + default: + return 0; + } + + /* + * Get modulus. + */ + nlen = cd->order_len; + br_i31_decode(n, cd->order, nlen); + n0i = br_i31_ninv31(n[1]); + + /* + * Get private key as an i31 integer. This also checks that the + * private key is well-defined (not zero, and less than the + * curve order). + */ + if (!br_i31_decode_mod(x, sk->x, sk->xlen, n)) { + return 0; + } + if (br_i31_iszero(x)) { + return 0; + } + + /* + * Get hash length. + */ + hash_len = (hf->desc >> BR_HASHDESC_OUT_OFF) & BR_HASHDESC_OUT_MASK; + + /* + * Truncate and reduce the hash value modulo the curve order. 
+ */ + br_ecdsa_i31_bits2int(m, hash_value, hash_len, n[0]); + br_i31_sub(m, n, br_i31_sub(m, n, 0) ^ 1); + + /* + * RFC 6979 generation of the "k" value. + * + * The process uses HMAC_DRBG (with the hash function used to + * process the message that is to be signed). The seed is the + * concatenation of the encodings of the private key and + * the hash value (after truncation and modular reduction). + */ + br_i31_encode(tt, nlen, x); + br_i31_encode(tt + nlen, nlen, m); + br_hmac_drbg_init(&drbg, hf, tt, nlen << 1); + for (;;) { + br_hmac_drbg_generate(&drbg, tt, nlen); + br_ecdsa_i31_bits2int(k, tt, nlen, n[0]); + if (br_i31_iszero(k)) { + continue; + } + if (br_i31_sub(k, n, 0)) { + break; + } + } + + /* + * Compute k*G and extract the X coordinate, then reduce it + * modulo the curve order. Since we support only curves with + * prime order, that reduction is only a matter of computing + * a subtraction. + */ + br_i31_encode(tt, nlen, k); + ulen = impl->mulgen(eU, tt, nlen, sk->curve); + br_i31_zero(r, n[0]); + br_i31_decode(r, &eU[1], ulen >> 1); + r[0] = n[0]; + br_i31_sub(r, n, br_i31_sub(r, n, 0) ^ 1); + + /* + * Compute 1/k in double-Montgomery representation. We do so by + * first converting _from_ Montgomery representation (twice), + * then using a modular exponentiation. + */ + br_i31_from_monty(k, n, n0i); + br_i31_from_monty(k, n, n0i); + memcpy(tt, cd->order, nlen); + tt[nlen - 1] -= 2; + br_i31_modpow(k, tt, nlen, n, n0i, t1, t2); + + /* + * Compute s = (m+xr)/k (mod n). + * The k[] array contains R^2/k (double-Montgomery representation); + * we thus can use direct Montgomery multiplications and conversions + * from Montgomery, avoiding any call to br_i31_to_monty() (which + * is slower). + */ + br_i31_from_monty(m, n, n0i); + br_i31_montymul(t1, x, r, n, n0i); + ctl = br_i31_add(t1, m, 1); + ctl |= br_i31_sub(t1, n, 0) ^ 1; + br_i31_sub(t1, n, ctl); + br_i31_montymul(s, t1, k, n, n0i); + + /* + * Encode r and s in the signature. 
+ */ + br_i31_encode(sig, nlen, r); + br_i31_encode((unsigned char *)sig + nlen, nlen, s); + return nlen << 1; +} diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_asn1.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_asn1.c new file mode 100644 index 00000000..4161aaaa --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_asn1.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +#define FIELD_LEN ((BR_MAX_EC_SIZE + 7) >> 3) + +/* see bearssl_ec.h */ +uint32_t +br_ecdsa_i31_vrfy_asn1(const br_ec_impl *impl, + const void *hash, size_t hash_len, + const br_ec_public_key *pk, + const void *sig, size_t sig_len) +{ + /* + * We use a double-sized buffer because a malformed ASN.1 signature + * may trigger a size expansion when converting to "raw" format. 
+ */ + unsigned char rsig[(FIELD_LEN << 2) + 24]; + + if (sig_len > ((sizeof rsig) >> 1)) { + return 0; + } + memcpy(rsig, sig, sig_len); + sig_len = br_ecdsa_asn1_to_raw(rsig, sig_len); + return br_ecdsa_i31_vrfy_raw(impl, hash, hash_len, pk, rsig, sig_len); +} diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_raw.c b/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_raw.c new file mode 100644 index 00000000..259477fd --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_i31_vrfy_raw.c @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "inner.h" + +#define I31_LEN ((BR_MAX_EC_SIZE + 61) / 31) +#define POINT_LEN (1 + (((BR_MAX_EC_SIZE + 7) >> 3) << 1)) + +/* see bearssl_ec.h */ +uint32_t +br_ecdsa_i31_vrfy_raw(const br_ec_impl *impl, + const void *hash, size_t hash_len, + const br_ec_public_key *pk, + const void *sig, size_t sig_len) +{ + /* + * IMPORTANT: this code is fit only for curves with a prime + * order. This is needed so that modular reduction of the X + * coordinate of a point can be done with a simple subtraction. + */ + const br_ec_curve_def *cd; + uint32_t n[I31_LEN], r[I31_LEN], s[I31_LEN], t1[I31_LEN], t2[I31_LEN]; + unsigned char tx[(BR_MAX_EC_SIZE + 7) >> 3]; + unsigned char ty[(BR_MAX_EC_SIZE + 7) >> 3]; + unsigned char eU[POINT_LEN]; + size_t nlen, rlen, ulen; + uint32_t n0i, res; + + /* + * If the curve is not supported, then report an error. + */ + if (((impl->supported_curves >> pk->curve) & 1) == 0) { + return 0; + } + + /* + * Get the curve parameters (generator and order). + */ + switch (pk->curve) { + case BR_EC_secp256r1: + cd = &br_secp256r1; + break; + case BR_EC_secp384r1: + cd = &br_secp384r1; + break; + case BR_EC_secp521r1: + cd = &br_secp521r1; + break; + default: + return 0; + } + + /* + * Signature length must be even. + */ + if (sig_len & 1) { + return 0; + } + rlen = sig_len >> 1; + + /* + * Public key point must have the proper size for this curve. + */ + if (pk->qlen != cd->generator_len) { + return 0; + } + + /* + * Get modulus; then decode the r and s values. They must be + * lower than the modulus, and s must not be null. + */ + nlen = cd->order_len; + br_i31_decode(n, cd->order, nlen); + n0i = br_i31_ninv31(n[1]); + if (!br_i31_decode_mod(r, sig, rlen, n)) { + return 0; + } + if (!br_i31_decode_mod(s, (const unsigned char *)sig + rlen, rlen, n)) { + return 0; + } + if (br_i31_iszero(s)) { + return 0; + } + + /* + * Invert s. 
We do that with a modular exponentiation; we use + * the fact that for all the curves we support, the least + * significant byte is not 0 or 1, so we can subtract 2 without + * any carry to process. + * We also want 1/s in Montgomery representation, which can be + * done by converting _from_ Montgomery representation before + * the inversion (because (1/s)*R = 1/(s/R)). + */ + br_i31_from_monty(s, n, n0i); + memcpy(tx, cd->order, nlen); + tx[nlen - 1] -= 2; + br_i31_modpow(s, tx, nlen, n, n0i, t1, t2); + + /* + * Truncate the hash to the modulus length (in bits) and reduce + * it modulo the curve order. The modular reduction can be done + * with a subtraction since the truncation already reduced the + * value to the modulus bit length. + */ + br_ecdsa_i31_bits2int(t1, hash, hash_len, n[0]); + br_i31_sub(t1, n, br_i31_sub(t1, n, 0) ^ 1); + + /* + * Multiply the (truncated, reduced) hash value with 1/s, result in + * t2, encoded in ty. + */ + br_i31_montymul(t2, t1, s, n, n0i); + br_i31_encode(ty, nlen, t2); + + /* + * Multiply r with 1/s, result in t1, encoded in tx. + */ + br_i31_montymul(t1, r, s, n, n0i); + br_i31_encode(tx, nlen, t1); + + /* + * Compute the point x*Q + y*G. + */ + ulen = cd->generator_len; + memcpy(eU, pk->q, ulen); + res = impl->muladd(eU, NULL, ulen, + tx, nlen, ty, nlen, cd->curve); + + /* + * Get the X coordinate, reduce modulo the curve order, and + * compare with the 'r' value. + * + * The modular reduction can be done with subtractions because + * we work with curves of prime order, so the curve order is + * close to the field order (Hasse's theorem). 
+ */ + br_i31_zero(t1, n[0]); + br_i31_decode(t1, &eU[1], ulen >> 1); + t1[0] = n[0]; + br_i31_sub(t1, n, br_i31_sub(t1, n, 0) ^ 1); + res &= ~br_i31_sub(t1, r, 1); + res &= br_i31_iszero(t1); + return res; +} diff --git a/test/monniaux/BearSSL/src/ec/ecdsa_rta.c b/test/monniaux/BearSSL/src/ec/ecdsa_rta.c new file mode 100644 index 00000000..005c62c2 --- /dev/null +++ b/test/monniaux/BearSSL/src/ec/ecdsa_rta.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * Compute ASN.1 encoded length for the provided integer. The ASN.1 + * encoding is signed, so its leading bit must have value 0; it must + * also be of minimal length (so leading bytes of value 0 must be + * removed, except if that would contradict the rule about the sign + * bit). 
/*
 * asn1_int_length(): number of content bytes of the ASN.1 INTEGER that
 * holds the unsigned big-endian value x[0..xlen-1]. Leading zero bytes
 * are not counted; one byte is added when nothing remains (value zero,
 * or xlen equal to 0) or when the first remaining byte has its top bit
 * set (the encoding is signed, so a 0x00 byte must come first).
 */
static size_t
asn1_int_length(const unsigned char *x, size_t xlen)
{
	size_t lead, rest;

	lead = 0;
	while (lead < xlen && x[lead] == 0) {
		lead ++;
	}
	rest = xlen - lead;
	if (rest == 0) {
		return 1;
	}
	return rest + ((x[lead] & 0x80) != 0);
}

/*
 * Write one ASN.1 INTEGER (tag, one-byte length, contents) at out[].
 * val[0..val_len-1] is the unsigned big-endian value, and enc_len is
 * the content length computed by asn1_int_length() for that value.
 * Returned value is the number of bytes written (enc_len + 2).
 */
static size_t
asn1_put_int(unsigned char *out,
	const unsigned char *val, size_t val_len, size_t enc_len)
{
	out[0] = 0x02;
	out[1] = enc_len;
	if (enc_len > val_len) {
		/*
		 * Content is one byte longer than the source: an extra
		 * leading 0x00 keeps the value non-negative.
		 */
		out[2] = 0x00;
		memcpy(out + 3, val, val_len);
	} else {
		/*
		 * Keep only the enc_len least significant bytes.
		 */
		memcpy(out + 2, val + val_len - enc_len, enc_len);
	}
	return enc_len + 2;
}

/*
 * Convert, in place, an ECDSA signature from "raw" format (r and s,
 * same length, concatenated) to ASN.1: a SEQUENCE of two INTEGER
 * (see bearssl_ec.h).
 *
 * Returned value is the length of the ASN.1 signature, or 0 if sig_len
 * is odd or if r or s would need more than 125 content bytes. When 0 is
 * returned, sig[] has not been modified. The result is longer than the
 * source; room for it in sig[] is up to the caller.
 */
size_t
br_ecdsa_raw_to_asn1(void *sig, size_t sig_len)
{
	/*
	 * Largest output: 3 bytes of SEQUENCE header, then two INTEGER
	 * of 2+125 bytes each, i.e. 257 bytes. With both contents
	 * limited to 125 bytes, the SEQUENCE length is at most 254 and
	 * fits in the "0x81 + one byte" form; this corresponds to curve
	 * orders of up to 999 bits.
	 */
	unsigned char out[257];
	const unsigned char *raw;
	size_t half, r_enc, s_enc, seq_len, pos;

	if (sig_len & 1) {
		return 0;
	}
	raw = sig;
	half = sig_len >> 1;

	/*
	 * Content lengths of the two integers.
	 */
	r_enc = asn1_int_length(raw, half);
	s_enc = asn1_int_length(raw + half, half);
	if (r_enc > 125 || s_enc > 125) {
		return 0;
	}

	/*
	 * SEQUENCE header; the long form is used for lengths of 128
	 * bytes or more.
	 */
	seq_len = r_enc + s_enc + 4;
	pos = 0;
	out[pos ++] = 0x30;
	if (seq_len >= 0x80) {
		out[pos ++] = 0x81;
	}
	out[pos ++] = seq_len;

	/*
	 * The two INTEGER values: r, then s.
	 */
	pos += asn1_put_int(out + pos, raw, half, r_enc);
	pos += asn1_put_int(out + pos, raw + half, half, s_enc);

	/*
	 * Overwrite the raw signature with the ASN.1 signature.
	 */
	memcpy(sig, out, pos);
	return pos;
}