/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining 
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be 
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (carry-less multiplication, introduced alongside the AES-NI
 * instructions).
 */

#if BR_AES_X86NI

/*
 * Test CPU support for PCLMULQDQ.
 */
static inline int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX (CPUID leaf 1):
	 *    bit 1   PCLMULQDQ support
	 */
	return br_cpuid(0, 0, 0x00000002, 0);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return pclmul_supported() ? &br_ghash_pclmul : 0;
}
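
/*
 * Illustrative sketch (not part of this file): how a caller would use
 * the getter, falling back to br_ghash_ctmul, the portable
 * constant-time GHASH declared in bearssl_hash.h.
 */
#if 0
static void
do_ghash(void *y, const void *h, const void *data, size_t len)
{
	br_ghash gh;

	gh = br_ghash_pclmul_get();
	if (gh == 0) {
		gh = &br_ghash_ctmul;
	}
	gh(y, h, data, len);
}
#endif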

BR_TARGETS_X86_UP

/*
 * GHASH is defined over elements of GF(2^128) with "full little-endian"
 * representation: leftmost byte is least significant, and, within each
 * byte, leftmost _bit_ is least significant. The natural ordering in
 * x86 is "mixed little-endian": bytes are ordered from least to most
 * significant, but bits within a byte are in most-to-least significant
 * order. Going to full little-endian representation would require
 * reversing bits within each byte, which is doable but expensive.
 *
 * Instead, we go to full big-endian representation, by swapping bytes
 * around, which is done with a single _mm_shuffle_epi8() opcode (it
 * comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
 * can use a full big-endian representation because in a carryless
 * multiplication, we have a nice bit reversal property:
 *
 *    rev_128(x) * rev_128(y) = rev_255(x * y)
 *
 * So by using full big-endian, we still get the right result, except
 * that it is right-shifted by 1 bit. The compensating left-shift is
 * relatively inexpensive, and it can be shared: several products can be
 * XORed together and fixed with a single shift before reduction.
 *
 *
 * Since SSE2 opcodes do not have facilities for shifting full 128-bit
 * values with bit precision, we have to break down values into 64-bit
 * chunks. We number chunks from 0 to 3 in left to right order.
 */
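
/*
 * Illustrative sketch (hypothetical helpers, not used by this file):
 * the reversal property can be checked at a small width. For 8-bit
 * carryless multiplication, rev_8(x) * rev_8(y) = rev_15(x * y); the
 * 128-bit case is the same identity.
 */
#if 0
static uint32_t
clmul8(uint32_t a, uint32_t b)
{
	uint32_t r;
	int i;

	/* Carryless (polynomial) product of two 8-bit values (15 bits). */
	r = 0;
	for (i = 0; i < 8; i ++) {
		r ^= ((a >> i) & 1) ? (b << i) : 0;
	}
	return r;
}

static uint32_t
rev_bits(uint32_t v, int n)
{
	uint32_t r;
	int i;

	/* Reverse the order of the n low bits of v. */
	r = 0;
	for (i = 0; i < n; i ++) {
		r |= ((v >> i) & 1) << (n - 1 - i);
	}
	return r;
}

/*
 * For all x, y < 256:
 *   clmul8(rev_bits(x, 8), rev_bits(y, 8)) == rev_bits(clmul8(x, y), 15)
 */
#endif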

/*
 * Byte-swap a complete 128-bit value. This normally uses
 * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
 * However, this crashes old Clang versions, so, for Clang before 3.8,
 * we use an alternate (and less efficient) version.
 */
#if BR_CLANG && !BR_CLANG_3_8
#define BYTESWAP_DECL
#define BYTESWAP_PREP   (void)0
#define BYTESWAP(x)   do { \
		__m128i byteswap1, byteswap2; \
		byteswap1 = (x); \
		byteswap2 = _mm_srli_epi16(byteswap1, 8); \
		byteswap1 = _mm_slli_epi16(byteswap1, 8); \
		byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
		byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
		byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
		(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
	} while (0)
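
/*
 * How the alternate version above works: the slli/srli/or step swaps
 * the two bytes within each 16-bit lane; the shufflelo/shufflehi steps
 * (0x1B selects words 3,2,1,0) reverse the word order within each
 * 64-bit half; and _mm_shuffle_epi32 with 0x4E swaps the two halves.
 * The combination is a full 16-byte reversal.
 */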
#else
#define BYTESWAP_DECL   __m128i byteswap_index;
#define BYTESWAP_PREP   do { \
		byteswap_index = _mm_set_epi8( \
			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
	} while (0)
#define BYTESWAP(x)   do { \
		(x) = _mm_shuffle_epi8((x), byteswap_index); \
	} while (0)
#endif

/*
 * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
 * for that compiler, we use inline assembly. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * We use a target of "sse2" only, so that Clang may still handle the
 * '__m128i' type and allocate SSE2 registers.
 */
#if BR_CLANG
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
#else
#define pclmulqdq00(x, y)   _mm_clmulepi64_si128(x, y, 0x00)
#define pclmulqdq11(x, y)   _mm_clmulepi64_si128(x, y, 0x11)
#endif
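
/*
 * Illustrative sketch (hypothetical helper, not used here) of what
 * pclmulqdq computes: the carryless (polynomial) product of two 64-bit
 * values, a 127-bit result. The 0x00 and 0x11 immediates select the
 * low or high 64-bit halves of both operands.
 */
#if 0
static void
clmul64_ref(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h, l;
	int i;

	h = 0;
	l = 0;
	for (i = 0; i < 64; i ++) {
		if ((a >> i) & 1) {
			l ^= b << i;
			if (i != 0) {
				h ^= b >> (64 - i);
			}
		}
	}
	*hi = h;
	*lo = l;
}
#endif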

/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the low half of kx; the high half is
 * unspecified).
 */
#define BK(kw, kx)   do { \
		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
	} while (0)

/*
 * Combine two 64-bit values k0 and k1 (in the low halves of their
 * operands) into a 128-bit value kw, with k0 as the high half, and
 * compute the XOR of the two values into kx.
 */
#define PBK(k0, k1, kw, kx)   do { \
		kw = _mm_unpacklo_epi64(k1, k0); \
		kx = _mm_xor_si128(k0, k1); \
	} while (0)
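
/*
 * The kx values implement one level of Karatsuba: for a = a1:a0 and
 * b = b1:b0 (64-bit halves), the middle 128-bit term of a*b is
 *
 *    a1*b0 ^ a0*b1 = (a0^a1)*(b0^b1) ^ a1*b1 ^ a0*b0
 *
 * so three carryless multiplications suffice: pclmulqdq11(a, b),
 * pclmulqdq00(a, b) and pclmulqdq00(ax, bx). This is the t1/t3/t2
 * pattern used in the multiplications below.
 */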

/*
 * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
 */
#define SL_256(x0, x1, x2, x3)   do { \
		x0 = _mm_or_si128( \
			_mm_slli_epi64(x0, 1), \
			_mm_srli_epi64(x1, 63)); \
		x1 = _mm_or_si128( \
			_mm_slli_epi64(x1, 1), \
			_mm_srli_epi64(x2, 63)); \
		x2 = _mm_or_si128( \
			_mm_slli_epi64(x2, 1), \
			_mm_srli_epi64(x3, 63)); \
		x3 = _mm_slli_epi64(x3, 1); \
	} while (0)

/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		x1 = _mm_xor_si128( \
			x1, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x3, \
					_mm_srli_epi64(x3, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x3, 2), \
					_mm_srli_epi64(x3, 7)))); \
		x2 = _mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_slli_epi64(x3, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x3, 62), \
				_mm_slli_epi64(x3, 57))); \
		x0 = _mm_xor_si128( \
			x0, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x2, \
					_mm_srli_epi64(x2, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x2, 2), \
					_mm_srli_epi64(x2, 7)))); \
		x1 = _mm_xor_si128( \
			_mm_xor_si128( \
				x1, \
				_mm_slli_epi64(x2, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x2, 62), \
				_mm_slli_epi64(x2, 57))); \
	} while (0)
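
/*
 * Illustrative sketch (hypothetical helper, not used here): the same
 * folding in scalar form, treating each chunk as one uint64_t. The
 * shift amounts 1, 2 and 7 (and their complements 63, 62 and 57) come
 * from the GHASH modulus x^128 + x^7 + x^2 + x + 1, applied in the
 * reversed representation described earlier.
 */
#if 0
static void
reduce_f128_ref(uint64_t x[4])
{
	/* Fold x[3] into x[1] and x[2], then x[2] into x[0] and x[1];
	   the result is x[0]:x[1]. */
	x[1] ^= x[3] ^ (x[3] >> 1) ^ (x[3] >> 2) ^ (x[3] >> 7);
	x[2] ^= (x[3] << 63) ^ (x[3] << 62) ^ (x[3] << 57);
	x[0] ^= x[2] ^ (x[2] >> 1) ^ (x[2] >> 2) ^ (x[2] >> 7);
	x[1] ^= (x[2] << 63) ^ (x[2] << 62) ^ (x[2] << 57);
}
#endif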

/*
 * Square value kw into (dw,dx).
 */
#define SQUARE_F128(kw, dw, dx)   do { \
		__m128i z0, z1, z2, z3; \
		z1 = pclmulqdq11(kw, kw); \
		z3 = pclmulqdq00(kw, kw); \
		z0 = _mm_shuffle_epi32(z1, 0x0E); \
		z2 = _mm_shuffle_epi32(z3, 0x0E); \
		SL_256(z0, z1, z2, z3); \
		REDUCE_F128(z0, z1, z2, z3); \
		PBK(z0, z1, dw, dx); \
	} while (0)
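
/*
 * Squaring needs no Karatsuba middle term: in characteristic 2,
 * (a1*X^64 + a0)^2 = a1^2*X^128 + a0^2, since the cross terms cancel.
 * Hence only the two carryless multiplications above.
 */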

/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	unsigned char tmp[64];
	size_t num4, num1;
	__m128i yw, h1w, h1x;
	BYTESWAP_DECL

	/*
	 * We split the data into two chunks. The first chunk starts at
	 * buf1 and contains num4 64-byte blocks; the second chunk starts
	 * at buf2 and contains num1 16-byte blocks. We want the first
	 * chunk to be as large as possible.
	 */
	buf1 = data;
	num4 = len >> 6;
	len &= 63;
	buf2 = buf1 + (num4 << 6);
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}
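	/*
	 * Worked example (illustrative): len = 100 yields num4 = 1 (one
	 * 64-byte chunk), leaving 36 bytes; then num1 = 3, and the 36
	 * bytes are copied into tmp and padded with 12 zero bytes to
	 * fill three 16-byte blocks.
	 */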

	/*
	 * Preparatory step for endian conversions.
	 */
	BYTESWAP_PREP;

	/*
	 * Load y and h.
	 */
	yw = _mm_loadu_si128(y);
	h1w = _mm_loadu_si128(h);
	BYTESWAP(yw);
	BYTESWAP(h1w);
	BK(h1w, h1x);

	if (num4 > 0) {
		__m128i h2w, h2x, h3w, h3x, h4w, h4x;
		__m128i t0, t1, t2, t3;

		/*
		 * Compute h2 = h^2.
		 */
		SQUARE_F128(h1w, h2w, h2x);

		/*
		 * Compute h3 = h^3 = h*(h^2).
		 */
		t1 = pclmulqdq11(h1w, h2w);
		t3 = pclmulqdq00(h1w, h2w);
		t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
			_mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		PBK(t0, t1, h3w, h3x);

		/*
		 * Compute h4 = h^4 = (h^2)^2.
		 */
		SQUARE_F128(h2w, h4w, h4x);

		while (num4 -- > 0) {
			__m128i aw0, aw1, aw2, aw3;
			__m128i ax0, ax1, ax2, ax3;

			aw0 = _mm_loadu_si128((void *)(buf1 +  0));
			aw1 = _mm_loadu_si128((void *)(buf1 + 16));
			aw2 = _mm_loadu_si128((void *)(buf1 + 32));
			aw3 = _mm_loadu_si128((void *)(buf1 + 48));
			BYTESWAP(aw0);
			BYTESWAP(aw1);
			BYTESWAP(aw2);
			BYTESWAP(aw3);
			buf1 += 64;

			aw0 = _mm_xor_si128(aw0, yw);
			BK(aw1, ax1);
			BK(aw2, ax2);
			BK(aw3, ax3);
			BK(aw0, ax0);

			t1 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq11(aw0, h4w),
					pclmulqdq11(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq11(aw2, h2w),
					pclmulqdq11(aw3, h1w)));
			t3 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(aw0, h4w),
					pclmulqdq00(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq00(aw2, h2w),
					pclmulqdq00(aw3, h1w)));
			t2 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(ax0, h4x),
					pclmulqdq00(ax1, h3x)),
				_mm_xor_si128(
					pclmulqdq00(ax2, h2x),
					pclmulqdq00(ax3, h1x)));
			t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
			t0 = _mm_shuffle_epi32(t1, 0x0E);
			t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
			t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
			SL_256(t0, t1, t2, t3);
			REDUCE_F128(t0, t1, t2, t3);
			yw = _mm_unpacklo_epi64(t1, t0);
		}
	}

	while (num1 -- > 0) {
		__m128i aw, ax;
		__m128i t0, t1, t2, t3;

		aw = _mm_loadu_si128((void *)buf2);
		BYTESWAP(aw);
		buf2 += 16;

		aw = _mm_xor_si128(aw, yw);
		BK(aw, ax);

		t1 = pclmulqdq11(aw, h1w);
		t3 = pclmulqdq00(aw, h1w);
		t2 = pclmulqdq00(ax, h1x);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		yw = _mm_unpacklo_epi64(t1, t0);
	}

	BYTESWAP(yw);
	_mm_storeu_si128(y, yw);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return 0;
}

#endif