/* * Copyright (c) 2016 Thomas Pornin * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "inner.h" /* * Parameters for supported curves (field modulus, and 'b' equation * parameter; both values use the 'i31' format, and 'b' is in Montgomery * representation). */ static const uint32_t P256_P[] = { 0x00000108, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x00000007, 0x00000000, 0x00000000, 0x00000040, 0x7FFFFF80, 0x000000FF }; static const uint32_t P256_R2[] = { 0x00000108, 0x00014000, 0x00018000, 0x00000000, 0x7FF40000, 0x7FEFFFFF, 0x7FF7FFFF, 0x7FAFFFFF, 0x005FFFFF, 0x00000000 }; static const uint32_t P256_B[] = { 0x00000108, 0x6FEE1803, 0x6229C4BD, 0x21B139BE, 0x327150AA, 0x3567802E, 0x3F7212ED, 0x012E4355, 0x782DD38D, 0x0000000E }; static const uint32_t P384_P[] = { 0x0000018C, 0x7FFFFFFF, 0x00000001, 0x00000000, 0x7FFFFFF8, 0x7FFFFFEF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x00000FFF }; static const uint32_t P384_R2[] = { 0x0000018C, 0x00000000, 0x00000080, 0x7FFFFE00, 0x000001FF, 0x00000800, 0x00000000, 0x7FFFE000, 0x00001FFF, 0x00008000, 0x00008000, 0x00000000, 0x00000000, 0x00000000 }; static const uint32_t P384_B[] = { 0x0000018C, 0x6E666840, 0x070D0392, 0x5D810231, 0x7651D50C, 0x17E218D6, 0x1B192002, 0x44EFE441, 0x3A524E2B, 0x2719BA5F, 0x41F02209, 0x36C5643E, 0x5813EFFE, 0x000008A5 }; static const uint32_t P521_P[] = { 0x00000219, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x01FFFFFF }; static const uint32_t P521_R2[] = { 0x00000219, 0x00001000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; static const uint32_t P521_B[] = { 0x00000219, 0x540FC00A, 0x228FEA35, 0x2C34F1EF, 0x67BF107A, 0x46FC1CD5, 0x1605E9DD, 0x6937B165, 0x272A3D8F, 0x42785586, 0x44C8C778, 0x15F3B8B4, 0x64B73366, 0x03BA8B69, 0x0D05B42A, 0x21F929A2, 0x2C31C393, 0x00654FAE }; typedef struct { const uint32_t *p; const uint32_t *b; const uint32_t *R2; uint32_t p0i; } curve_params; static inline const curve_params * id_to_curve(int curve) { static const curve_params pp[] = { { P256_P, P256_B, P256_R2, 0x00000001 }, { P384_P, P384_B, P384_R2, 0x00000001 }, { P521_P, P521_B, P521_R2, 0x00000001 } }; return &pp[curve - BR_EC_secp256r1]; } #define I31_LEN ((BR_MAX_EC_SIZE + 61) / 31) /* * Type for a point in Jacobian coordinates: * -- three values, x, y and z, in Montgomery representation * -- affine coordinates are X = x / z^2 and Y = y / z^3 * -- for the point at infinity, z = 0 */ typedef struct { uint32_t c[3][I31_LEN]; } jacobian; /* * We use a custom interpreter that uses a dozen registers, and * only six operations: * MSET(d, a) copy a into d * MADD(d, a) d = d+a (modular) * MSUB(d, a) d = d-a (modular) * MMUL(d, a, b) d = a*b (Montgomery multiplication) * MINV(d, a, b) invert d modulo p; a and b are used as scratch registers * MTZ(d) clear return value if d = 0 * Destination of MMUL (d) must be distinct from operands (a and b). * There is no such constraint for MSUB and MADD. * * Registers include the operand coordinates, and temporaries. */ #define MSET(d, a) (0x0000 + ((d) << 8) + ((a) << 4)) #define MADD(d, a) (0x1000 + ((d) << 8) + ((a) << 4)) #define MSUB(d, a) (0x2000 + ((d) << 8) + ((a) << 4)) #define MMUL(d, a, b) (0x3000 + ((d) << 8) + ((a) << 4) + (b)) #define MINV(d, a, b) (0x4000 + ((d) << 8) + ((a) << 4) + (b)) #define MTZ(d) (0x5000 + ((d) << 8)) #define ENDCODE 0 /* * Registers for the input operands. */ #define P1x 0 #define P1y 1 #define P1z 2 #define P2x 3 #define P2y 4 #define P2z 5 /* * Alternate names for the first input operand. */ #define Px 0 #define Py 1 #define Pz 2 /* * Temporaries. */ #define t1 6 #define t2 7 #define t3 8 #define t4 9 #define t5 10 #define t6 11 #define t7 12 /* * Extra scratch registers available when there is no second operand (e.g. * for "double" and "affine"). */ #define t8 3 #define t9 4 #define t10 5 /* * Doubling formulas are: * * s = 4*x*y^2 * m = 3*(x + z^2)*(x - z^2) * x' = m^2 - 2*s * y' = m*(s - x') - 8*y^4 * z' = 2*y*z * * If y = 0 (P has order 2) then this yields infinity (z' = 0), as it * should. This case should not happen anyway, because our curves have * prime order, and thus do not contain any point of order 2. * * If P is infinity (z = 0), then again the formulas yield infinity, * which is correct. Thus, this code works for all points. * * Cost: 8 multiplications */ static const uint16_t code_double[] = { /* * Compute z^2 (in t1). */ MMUL(t1, Pz, Pz), /* * Compute x-z^2 (in t2) and then x+z^2 (in t1). */ MSET(t2, Px), MSUB(t2, t1), MADD(t1, Px), /* * Compute m = 3*(x+z^2)*(x-z^2) (in t1). */ MMUL(t3, t1, t2), MSET(t1, t3), MADD(t1, t3), MADD(t1, t3), /* * Compute s = 4*x*y^2 (in t2) and 2*y^2 (in t3). */ MMUL(t3, Py, Py), MADD(t3, t3), MMUL(t2, Px, t3), MADD(t2, t2), /* * Compute x' = m^2 - 2*s. */ MMUL(Px, t1, t1), MSUB(Px, t2), MSUB(Px, t2), /* * Compute z' = 2*y*z. */ MMUL(t4, Py, Pz), MSET(Pz, t4), MADD(Pz, t4), /* * Compute y' = m*(s - x') - 8*y^4. Note that we already have * 2*y^2 in t3. */ MSUB(t2, Px), MMUL(Py, t1, t2), MMUL(t4, t3, t3), MSUB(Py, t4), MSUB(Py, t4), ENDCODE }; /* * Addtions formulas are: * * u1 = x1 * z2^2 * u2 = x2 * z1^2 * s1 = y1 * z2^3 * s2 = y2 * z1^3 * h = u2 - u1 * r = s2 - s1 * x3 = r^2 - h^3 - 2 * u1 * h^2 * y3 = r * (u1 * h^2 - x3) - s1 * h^3 * z3 = h * z1 * z2 * * If both P1 and P2 are infinity, then z1 == 0 and z2 == 0, implying that * z3 == 0, so the result is correct. * If either of P1 or P2 is infinity, but not both, then z3 == 0, which is * not correct. * h == 0 only if u1 == u2; this happens in two cases: * -- if s1 == s2 then P1 and/or P2 is infinity, or P1 == P2 * -- if s1 != s2 then P1 + P2 == infinity (but neither P1 or P2 is infinity) * * Thus, the following situations are not handled correctly: * -- P1 = 0 and P2 != 0 * -- P1 != 0 and P2 = 0 * -- P1 = P2 * All other cases are properly computed. However, even in "incorrect" * situations, the three coordinates still are properly formed field * elements. * * The returned flag is cleared if r == 0. This happens in the following * cases: * -- Both points are on the same horizontal line (same Y coordinate). * -- Both points are infinity. * -- One point is infinity and the other is on line Y = 0. * The third case cannot happen with our curves (there is no valid point * on line Y = 0 since that would be a point of order 2). If the two * source points are non-infinity, then remains only the case where the * two points are on the same horizontal line. * * This allows us to detect the "P1 == P2" case, assuming that P1 != 0 and * P2 != 0: * -- If the returned value is not the point at infinity, then it was properly * computed. * -- Otherwise, if the returned flag is 1, then P1+P2 = 0, and the result * is indeed the point at infinity. * -- Otherwise (result is infinity, flag is 0), then P1 = P2 and we should * use the 'double' code. * * Cost: 16 multiplications */ static const uint16_t code_add[] = { /* * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3). */ MMUL(t3, P2z, P2z), MMUL(t1, P1x, t3), MMUL(t4, P2z, t3), MMUL(t3, P1y, t4), /* * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). */ MMUL(t4, P1z, P1z), MMUL(t2, P2x, t4), MMUL(t5, P1z, t4), MMUL(t4, P2y, t5), /* * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4). */ MSUB(t2, t1), MSUB(t4, t3), /* * Report cases where r = 0 through the returned flag. */ MTZ(t4), /* * Compute u1*h^2 (in t6) and h^3 (in t5). */ MMUL(t7, t2, t2), MMUL(t6, t1, t7), MMUL(t5, t7, t2), /* * Compute x3 = r^2 - h^3 - 2*u1*h^2. * t1 and t7 can be used as scratch registers. */ MMUL(P1x, t4, t4), MSUB(P1x, t5), MSUB(P1x, t6), MSUB(P1x, t6), /* * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. */ MSUB(t6, P1x), MMUL(P1y, t4, t6), MMUL(t1, t5, t3), MSUB(P1y, t1), /* * Compute z3 = h*z1*z2. */ MMUL(t1, P1z, P2z), MMUL(P1z, t1, t2), ENDCODE }; /* * Check that the point is on the curve. This code snippet assumes the * following conventions: * -- Coordinates x and y have been freshly decoded in P1 (but not * converted to Montgomery coordinates yet). * -- P2x, P2y and P2z are set to, respectively, R^2, b*R and 1. */ static const uint16_t code_check[] = { /* Convert x and y to Montgomery representation. */ MMUL(t1, P1x, P2x), MMUL(t2, P1y, P2x), MSET(P1x, t1), MSET(P1y, t2), /* Compute x^3 in t1. */ MMUL(t2, P1x, P1x), MMUL(t1, P1x, t2), /* Subtract 3*x from t1. */ MSUB(t1, P1x), MSUB(t1, P1x), MSUB(t1, P1x), /* Add b. */ MADD(t1, P2y), /* Compute y^2 in t2. */ MMUL(t2, P1y, P1y), /* Compare y^2 with x^3 - 3*x + b; they must match. */ MSUB(t1, t2), MTZ(t1), /* Set z to 1 (in Montgomery representation). */ MMUL(P1z, P2x, P2z), ENDCODE }; /* * Conversion back to affine coordinates. This code snippet assumes that * the z coordinate of P2 is set to 1 (not in Montgomery representation). */ static const uint16_t code_affine[] = { /* Save z*R in t1. */ MSET(t1, P1z), /* Compute z^3 in t2. */ MMUL(t2, P1z, P1z), MMUL(t3, P1z, t2), MMUL(t2, t3, P2z), /* Invert to (1/z^3) in t2. */ MINV(t2, t3, t4), /* Compute y. */ MSET(t3, P1y), MMUL(P1y, t2, t3), /* Compute (1/z^2) in t3. */ MMUL(t3, t2, t1), /* Compute x. */ MSET(t2, P1x), MMUL(P1x, t2, t3), ENDCODE }; static uint32_t run_code(jacobian *P1, const jacobian *P2, const curve_params *cc, const uint16_t *code) { uint32_t r; uint32_t t[13][I31_LEN]; size_t u; r = 1; /* * Copy the two operands in the dedicated registers. */ memcpy(t[P1x], P1->c, 3 * I31_LEN * sizeof(uint32_t)); memcpy(t[P2x], P2->c, 3 * I31_LEN * sizeof(uint32_t)); /* * Run formulas. */ for (u = 0;; u ++) { unsigned op, d, a, b; op = code[u]; if (op == 0) { break; } d = (op >> 8) & 0x0F; a = (op >> 4) & 0x0F; b = op & 0x0F; op >>= 12; switch (op) { uint32_t ctl; size_t plen; unsigned char tp[(BR_MAX_EC_SIZE + 7) >> 3]; case 0: memcpy(t[d], t[a], I31_LEN * sizeof(uint32_t)); break; case 1: ctl = br_i31_add(t[d], t[a], 1); ctl |= NOT(br_i31_sub(t[d], cc->p, 0)); br_i31_sub(t[d], cc->p, ctl); break; case 2: br_i31_add(t[d], cc->p, br_i31_sub(t[d], t[a], 1)); break; case 3: br_i31_montymul(t[d], t[a], t[b], cc->p, cc->p0i); break; case 4: plen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3; br_i31_encode(tp, plen, cc->p); tp[plen - 1] -= 2; br_i31_modpow(t[d], tp, plen, cc->p, cc->p0i, t[a], t[b]); break; default: r &= ~br_i31_iszero(t[d]); break; } } /* * Copy back result. */ memcpy(P1->c, t[P1x], 3 * I31_LEN * sizeof(uint32_t)); return r; } static void set_one(uint32_t *x, const uint32_t *p) { size_t plen; plen = (p[0] + 63) >> 5; memset(x, 0, plen * sizeof *x); x[0] = p[0]; x[1] = 0x00000001; } static void point_zero(jacobian *P, const curve_params *cc) { memset(P, 0, sizeof *P); P->c[0][0] = P->c[1][0] = P->c[2][0] = cc->p[0]; } static inline void point_double(jacobian *P, const curve_params *cc) { run_code(P, P, cc, code_double); } static inline uint32_t point_add(jacobian *P1, const jacobian *P2, const curve_params *cc) { return run_code(P1, P2, cc, code_add); } static void point_mul(jacobian *P, const unsigned char *x, size_t xlen, const curve_params *cc) { /* * We do a simple double-and-add ladder with a 2-bit window * to make only one add every two doublings. We thus first * precompute 2P and 3P in some local buffers. * * We always perform two doublings and one addition; the * addition is with P, 2P and 3P and is done in a temporary * array. * * The addition code cannot handle cases where one of the * operands is infinity, which is the case at the start of the * ladder. We therefore need to maintain a flag that controls * this situation. */ uint32_t qz; jacobian P2, P3, Q, T, U; memcpy(&P2, P, sizeof P2); point_double(&P2, cc); memcpy(&P3, P, sizeof P3); point_add(&P3, &P2, cc); point_zero(&Q, cc); qz = 1; while (xlen -- > 0) { int k; for (k = 6; k >= 0; k -= 2) { uint32_t bits; uint32_t bnz; point_double(&Q, cc); point_double(&Q, cc); memcpy(&T, P, sizeof T); memcpy(&U, &Q, sizeof U); bits = (*x >> k) & (uint32_t)3; bnz = NEQ(bits, 0); CCOPY(EQ(bits, 2), &T, &P2, sizeof T); CCOPY(EQ(bits, 3), &T, &P3, sizeof T); point_add(&U, &T, cc); CCOPY(bnz & qz, &Q, &T, sizeof Q); CCOPY(bnz & ~qz, &Q, &U, sizeof Q); qz &= ~bnz; } x ++; } memcpy(P, &Q, sizeof Q); } /* * Decode point into Jacobian coordinates. This function does not support * the point at infinity. If the point is invalid then this returns 0, but * the coordinates are still set to properly formed field elements. */ static uint32_t point_decode(jacobian *P, const void *src, size_t len, const curve_params *cc) { /* * Points must use uncompressed format: * -- first byte is 0x04; * -- coordinates X and Y use unsigned big-endian, with the same * length as the field modulus. * * We don't support hybrid format (uncompressed, but first byte * has value 0x06 or 0x07, depending on the least significant bit * of Y) because it is rather useless, and explicitly forbidden * by PKIX (RFC 5480, section 2.2). * * We don't support compressed format either, because it is not * much used in practice (there are or were patent-related * concerns about point compression, which explains the lack of * generalised support). Also, point compression support would * need a bit more code. */ const unsigned char *buf; size_t plen, zlen; uint32_t r; jacobian Q; buf = src; point_zero(P, cc); plen = (cc->p[0] - (cc->p[0] >> 5) + 7) >> 3; if (len != 1 + (plen << 1)) { return 0; } r = br_i31_decode_mod(P->c[0], buf + 1, plen, cc->p); r &= br_i31_decode_mod(P->c[1], buf + 1 + plen, plen, cc->p); /* * Check first byte. */ r &= EQ(buf[0], 0x04); /* obsolete r &= EQ(buf[0], 0x04) | (EQ(buf[0] & 0xFE, 0x06) & ~(uint32_t)(buf[0] ^ buf[plen << 1])); */ /* * Convert coordinates and check that the point is valid. */ zlen = ((cc->p[0] + 63) >> 5) * sizeof(uint32_t); memcpy(Q.c[0], cc->R2, zlen); memcpy(Q.c[1], cc->b, zlen); set_one(Q.c[2], cc->p); r &= ~run_code(P, &Q, cc, code_check); return r; } /* * Encode a point. This method assumes that the point is correct and is * not the point at infinity. Encoded size is always 1+2*plen, where * plen is the field modulus length, in bytes. */ static void point_encode(void *dst, const jacobian *P, const curve_params *cc) { unsigned char *buf; uint32_t xbl; size_t plen; jacobian Q, T; buf = dst; xbl = cc->p[0]; xbl -= (xbl >> 5); plen = (xbl + 7) >> 3; buf[0] = 0x04; memcpy(&Q, P, sizeof *P); set_one(T.c[2], cc->p); run_code(&Q, &T, cc, code_affine); br_i31_encode(buf + 1, plen, Q.c[0]); br_i31_encode(buf + 1 + plen, plen, Q.c[1]); } static const br_ec_curve_def * id_to_curve_def(int curve) { switch (curve) { case BR_EC_secp256r1: return &br_secp256r1; case BR_EC_secp384r1: return &br_secp384r1; case BR_EC_secp521r1: return &br_secp521r1; } return NULL; } static const unsigned char * api_generator(int curve, size_t *len) { const br_ec_curve_def *cd; cd = id_to_curve_def(curve); *len = cd->generator_len; return cd->generator; } static const unsigned char * api_order(int curve, size_t *len) { const br_ec_curve_def *cd; cd = id_to_curve_def(curve); *len = cd->order_len; return cd->order; } static size_t api_xoff(int curve, size_t *len) { api_generator(curve, len); *len >>= 1; return 1; } static uint32_t api_mul(unsigned char *G, size_t Glen, const unsigned char *x, size_t xlen, int curve) { uint32_t r; const curve_params *cc; jacobian P; cc = id_to_curve(curve); r = point_decode(&P, G, Glen, cc); point_mul(&P, x, xlen, cc); point_encode(G, &P, cc); return r; } static size_t api_mulgen(unsigned char *R, const unsigned char *x, size_t xlen, int curve) { const unsigned char *G; size_t Glen; G = api_generator(curve, &Glen); memcpy(R, G, Glen); api_mul(R, Glen, x, xlen, curve); return Glen; } static uint32_t api_muladd(unsigned char *A, const unsigned char *B, size_t len, const unsigned char *x, size_t xlen, const unsigned char *y, size_t ylen, int curve) { uint32_t r, t, z; const curve_params *cc; jacobian P, Q; /* * TODO: see about merging the two ladders. Right now, we do * two independent point multiplications, which is a bit * wasteful of CPU resources (but yields short code). */ cc = id_to_curve(curve); r = point_decode(&P, A, len, cc); if (B == NULL) { size_t Glen; B = api_generator(curve, &Glen); } r &= point_decode(&Q, B, len, cc); point_mul(&P, x, xlen, cc); point_mul(&Q, y, ylen, cc); /* * We want to compute P+Q. Since the base points A and B are distinct * from infinity, and the multipliers are non-zero and lower than the * curve order, then we know that P and Q are non-infinity. This * leaves two special situations to test for: * -- If P = Q then we must use point_double(). * -- If P+Q = 0 then we must report an error. */ t = point_add(&P, &Q, cc); point_double(&Q, cc); z = br_i31_iszero(P.c[2]); /* * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we * have the following: * * z = 0, t = 0 return P (normal addition) * z = 0, t = 1 return P (normal addition) * z = 1, t = 0 return Q (a 'double' case) * z = 1, t = 1 report an error (P+Q = 0) */ CCOPY(z & ~t, &P, &Q, sizeof Q); point_encode(A, &P, cc); r &= ~(z & t); return r; } /* see bearssl_ec.h */ const br_ec_impl br_ec_prime_i31 = { (uint32_t)0x03800000, &api_generator, &api_order, &api_xoff, &api_mul, &api_mulgen, &api_muladd };