/*
 * Copyright (c) 2017 Thomas Pornin
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_POWER_ASM_MACROS   1

#include "inner.h"

/*
 * This is the GHASH implementation that leverages the POWER8 opcodes.
 */

#if BR_POWER8

/*
 * Some symbolic names for registers.
 *   HB0 = 16 bytes of value 0
 *   HB1 = 16 bytes of value 1
 *   HB2 = 16 bytes of value 2
 *   HB6 = 16 bytes of value 6
 *   HB7 = 16 bytes of value 7
 *   TT0, TT1 and TT2 are temporaries
 *
 * BSW holds the pattern for byteswapping 32-bit words; this is set only
 * on little-endian systems. XBSW is the same register with the +32 offset
 * for access with the VSX opcodes.
 */
#define HB0      0
#define HB1      1
#define HB2      2
#define HB6      3
#define HB7      4
#define TT0      5
#define TT1      6
#define TT2      7

#define BSW      8
#define XBSW    40

/*
 * Macro to initialise the constants.
 */
#define INIT \
		vxor(HB0, HB0, HB0) \
		vspltisb(HB1, 1) \
		vspltisb(HB2, 2) \
		vspltisb(HB6, 6) \
		vspltisb(HB7, 7) \
		INIT_BSW

/*
 * Fix endianness of a value after reading it or before writing it, if
 * necessary.
 */
#if BR_POWER8_LE
#define INIT_BSW         lxvw4x(XBSW, 0, %[idx2be])
#define FIX_ENDIAN(xx)   vperm(xx, xx, xx, BSW)
#else
#define INIT_BSW
#define FIX_ENDIAN(xx)
#endif

/*
 * Shift x0:x1 left by one bit. This is a corrective action, needed
 * because GHASH is specified with a fully little-endian bit order,
 * while the opcodes use a fully big-endian convention, so the 255-bit
 * product ends up one bit to the right.
 */
#define SL_256(x0, x1) \
		vsldoi(TT0, HB0, x1, 1) \
		vsl(x0, x0, HB1) \
		vsr(TT0, TT0, HB7) \
		vsl(x1, x1, HB1) \
		vxor(x0, x0, TT0)
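/*
 * Reference sketch for SL_256 (illustrative only, not part of the
 * build; the name sl_256_ref is hypothetical): the same 256-bit left
 * shift written in portable C over four 64-bit limbs, with x[0] the
 * most significant limb, so that x0 above corresponds to x[0]:x[1]
 * and x1 to x[2]:x[3].
 *
 *	static void
 *	sl_256_ref(uint64_t x[4])
 *	{
 *		int i;
 *
 *		for (i = 0; i < 3; i ++) {
 *			x[i] = (x[i] << 1) | (x[i + 1] >> 63);
 *		}
 *		x[3] <<= 1;
 *	}
 */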
/*
 * Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same
 * as x0 or x1, or a different register). x0 and x1 are modified.
 */
#define REDUCE_F128(xd, x0, x1) \
		vxor(x0, x0, x1) \
		vsr(TT0, x1, HB1) \
		vsr(TT1, x1, HB2) \
		vsr(TT2, x1, HB7) \
		vxor(x0, x0, TT0) \
		vxor(TT1, TT1, TT2) \
		vxor(x0, x0, TT1) \
		vsldoi(x1, x1, HB0, 15) \
		vsl(TT1, x1, HB6) \
		vsl(TT2, x1, HB1) \
		vxor(x1, TT1, TT2) \
		vsr(TT0, x1, HB1) \
		vsr(TT1, x1, HB2) \
		vsr(TT2, x1, HB7) \
		vxor(x0, x0, x1) \
		vxor(x0, x0, TT0) \
		vxor(TT1, TT1, TT2) \
		vxor(xd, x0, TT1)

/* see bearssl_hash.h */
void
br_ghash_pwr8(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	size_t num4, num1;
	unsigned char tmp[64];
	long cc0, cc1, cc2, cc3;

#if BR_POWER8_LE
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif

	buf1 = data;

	/*
	 * The assembly code expects the data split into two chunks;
	 * the first chunk must contain a number of blocks which is a
	 * multiple of 4. Since the processing for the first chunk is
	 * faster, we want to make it as big as possible.
	 *
	 * For the remainder, there are two possibilities:
	 * -- if the remainder size is a multiple of 16, then use it
	 *    in place;
	 * -- otherwise, copy it to the tmp[] array and pad it with
	 *    zeros.
	 */
	num4 = len >> 6;
	buf2 = buf1 + (num4 << 6);
	len &= 63;
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}

	cc0 = 0;
	cc1 = 16;
	cc2 = 32;
	cc3 = 48;

	asm volatile (
		INIT

		/*
		 * Load current h (denoted hereafter h1) into v9.
		 */
		lxvw4x(41, 0, %[h])
		FIX_ENDIAN(9)

		/*
		 * Load current y into v28.
		 */
		lxvw4x(60, 0, %[y])
		FIX_ENDIAN(28)

		/*
		 * Split h1 into three registers:
		 *   v17 = h1_1:h1_0
		 *   v18 = 0:h1_0
		 *   v19 = h1_1:0
		 */
		xxpermdi(49, 41, 41, 2)
		vsldoi(18, HB0, 9, 8)
		vsldoi(19, 9, HB0, 8)

		/*
		 * If num4 is 0, skip directly to the second chunk.
		 */
		cmpldi(%[num4], 0)
		beq(chunk1)

		/*
		 * Compute h2 = h*h in v10.
		 */
		vpmsumd(10, 18, 18)
		vpmsumd(11, 19, 19)
		SL_256(10, 11)
		REDUCE_F128(10, 10, 11)

		/*
		 * Compute h3 = h*h*h in v11.
		 * We first split h2 into:
		 *   v10 = h2_0:h2_1
		 *   v11 = 0:h2_0
		 *   v12 = h2_1:0
		 * Then we do the product with h1, and reduce into v11.
		 */
		vsldoi(11, HB0, 10, 8)
		vsldoi(12, 10, HB0, 8)
		vpmsumd(13, 10, 17)
		vpmsumd(11, 11, 18)
		vpmsumd(12, 12, 19)
		vsldoi(14, HB0, 13, 8)
		vsldoi(15, 13, HB0, 8)
		vxor(11, 11, 14)
		vxor(12, 12, 15)
		SL_256(11, 12)
		REDUCE_F128(11, 11, 12)

		/*
		 * Compute h4 = h*h*h*h in v12. This is done by squaring h2.
		 */
		vsldoi(12, HB0, 10, 8)
		vsldoi(13, 10, HB0, 8)
		vpmsumd(12, 12, 12)
		vpmsumd(13, 13, 13)
		SL_256(12, 13)
		REDUCE_F128(12, 12, 13)

		/*
		 * Repack h1, h2, h3 and h4:
		 *   v13 = h4_0:h3_0
		 *   v14 = h4_1:h3_1
		 *   v15 = h2_0:h1_0
		 *   v16 = h2_1:h1_1
		 */
		xxpermdi(45, 44, 43, 0)
		xxpermdi(46, 44, 43, 3)
		xxpermdi(47, 42, 41, 0)
		xxpermdi(48, 42, 41, 3)

		/*
		 * Loop for each group of four blocks.
		 */
		mtctr(%[num4])
	label(loop4)
		/*
		 * Read the next four blocks.
		 *   v20 = y + a0 = b0
		 *   v21 = a1 = b1
		 *   v22 = a2 = b2
		 *   v23 = a3 = b3
		 */
		lxvw4x(52, %[cc0], %[buf1])
		lxvw4x(53, %[cc1], %[buf1])
		lxvw4x(54, %[cc2], %[buf1])
		lxvw4x(55, %[cc3], %[buf1])
		FIX_ENDIAN(20)
		FIX_ENDIAN(21)
		FIX_ENDIAN(22)
		FIX_ENDIAN(23)
		addi(%[buf1], %[buf1], 64)
		vxor(20, 20, 28)

		/*
		 * Repack the blocks into v9, v10, v11 and v12.
		 *   v9  = b0_0:b1_0
		 *   v10 = b0_1:b1_1
		 *   v11 = b2_0:b3_0
		 *   v12 = b2_1:b3_1
		 */
		xxpermdi(41, 52, 53, 0)
		xxpermdi(42, 52, 53, 3)
		xxpermdi(43, 54, 55, 0)
		xxpermdi(44, 54, 55, 3)
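		/*
		 * The eight products below rely on vpmsumd, which
		 * multiplies (carry-less) the two doubleword pairs of
		 * its operands and XORs the two 128-bit results. With
		 * b = b_0*X^64 + b_1 and h = h_0*X^64 + h_1 viewed as
		 * polynomials over GF(2), a 128x128 product expands to:
		 *
		 *   b*h = (b_0*h_0)*X^128
		 *       + (b_0*h_1 + b_1*h_0)*X^64
		 *       + b_1*h_1
		 *
		 * Thanks to the doubleword repacking above, each
		 * vpmsumd accumulates matching doubleword products for
		 * two block/key pairs at once.
		 */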
		/*
		 * Compute the products:
		 *   v20 = b0_0*h4_0 + b1_0*h3_0
		 *   v21 = b0_1*h4_0 + b1_1*h3_0
		 *   v22 = b0_0*h4_1 + b1_0*h3_1
		 *   v23 = b0_1*h4_1 + b1_1*h3_1
		 *   v24 = b2_0*h2_0 + b3_0*h1_0
		 *   v25 = b2_1*h2_0 + b3_1*h1_0
		 *   v26 = b2_0*h2_1 + b3_0*h1_1
		 *   v27 = b2_1*h2_1 + b3_1*h1_1
		 */
		vpmsumd(20, 13, 9)
		vpmsumd(21, 13, 10)
		vpmsumd(22, 14, 9)
		vpmsumd(23, 14, 10)
		vpmsumd(24, 15, 11)
		vpmsumd(25, 15, 12)
		vpmsumd(26, 16, 11)
		vpmsumd(27, 16, 12)

		/*
		 * Sum products into a single 256-bit result in v11:v12.
		 */
		vxor(11, 20, 24)
		vxor(12, 23, 27)
		vxor( 9, 21, 22)
		vxor(10, 25, 26)
		vxor(20,  9, 10)
		vsldoi( 9, HB0, 20, 8)
		vsldoi(10, 20, HB0, 8)
		vxor(11, 11, 9)
		vxor(12, 12, 10)

		/*
		 * Fix and reduce in GF(2^128); this is the new y (in v28).
		 */
		SL_256(11, 12)
		REDUCE_F128(28, 11, 12)

		/*
		 * Loop for the next group of four blocks.
		 */
		bdnz(loop4)

		/*
		 * Process second chunk, one block at a time.
		 */
	label(chunk1)
		cmpldi(%[num1], 0)
		beq(done)

		mtctr(%[num1])
	label(loop1)
		/*
		 * Load next data block and XOR it into y.
		 */
		lxvw4x(41, 0, %[buf2])
#if BR_POWER8_LE
		FIX_ENDIAN(9)
#endif
		addi(%[buf2], %[buf2], 16)
		vxor(9, 28, 9)

		/*
		 * Split y into doublewords:
		 *   v9  = y_0:y_1
		 *   v10 = 0:y_0
		 *   v11 = y_1:0
		 */
		vsldoi(10, HB0, 9, 8)
		vsldoi(11, 9, HB0, 8)

		/*
		 * Compute products with h:
		 *   v12 = y_0 * h_0
		 *   v13 = y_1 * h_1
		 *   v14 = y_1 * h_0 + y_0 * h_1
		 */
		vpmsumd(14, 9, 17)
		vpmsumd(12, 10, 18)
		vpmsumd(13, 11, 19)

		/*
		 * Propagate v14 into v12:v13 to finalise the product.
		 */
		vsldoi(10, HB0, 14, 8)
		vsldoi(11, 14, HB0, 8)
		vxor(12, 12, 10)
		vxor(13, 13, 11)

		/*
		 * Fix result and reduce into v28 (next value for y).
		 */
		SL_256(12, 13)
		REDUCE_F128(28, 12, 13)
		bdnz(loop1)

	label(done)
		/*
		 * Write back the new y.
		 */
		FIX_ENDIAN(28)
		stxvw4x(60, 0, %[y])

: [buf1] "+b" (buf1), [buf2] "+b" (buf2)
: [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1),
  [cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3)
#if BR_POWER8_LE
, [idx2be] "b" (idx2be)
#endif
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
  "ctr", "memory"
	);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pwr8_get(void)
{
	return &br_ghash_pwr8;
}

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pwr8_get(void)
{
	return 0;
}

#endif
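/*
 * Usage sketch (illustrative; the calling code below is an assumption,
 * not part of this file): callers are expected to go through
 * br_ghash_pwr8_get(), which returns 0 when this implementation was
 * not compiled in, so that a portable fallback such as br_ghash_ctmul
 * can be selected instead.
 *
 *	br_ghash gh;
 *	unsigned char y[16], h[16];
 *
 *	... set h to the hash key, y to the running state
 *	    (initially all-zero) ...
 *
 *	gh = br_ghash_pwr8_get();
 *	if (gh == 0) {
 *		gh = &br_ghash_ctmul;
 *	}
 *	gh(y, h, data, data_len);
 */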