50 files changed, 11684 insertions, 0 deletions
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_big_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/aes_big_cbcdec.c
new file mode 100644
index 00000000..d969a3bf
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_big_cbcdec.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_big_cbcdec_init(br_aes_big_cbcdec_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_big_cbcdec_vtable;
+	ctx->num_rounds = br_aes_big_keysched_inv(ctx->skey, key, len);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_big_cbcdec_run(const br_aes_big_cbcdec_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf, *ivbuf;
+
+	ivbuf = iv;
+	buf = data;
+	while (len > 0) {
+		unsigned char tmp[16];
+		int i;
+
+		memcpy(tmp, buf, 16);
+		br_aes_big_decrypt(ctx->num_rounds, ctx->skey, buf);
+		for (i = 0; i < 16; i ++) {
+			buf[i] ^= ivbuf[i];
+		}
+		memcpy(ivbuf, tmp, 16);
+		buf += 16;
+		len -= 16;
+	}
+}
+
+/* see bearssl_block.h */
+const br_block_cbcdec_class br_aes_big_cbcdec_vtable = {
+	sizeof(br_aes_big_cbcdec_keys),
+	16,
+	4,
+	(void (*)(const br_block_cbcdec_class **, const void *, size_t))
+		&br_aes_big_cbcdec_init,
+	(void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t))
+		&br_aes_big_cbcdec_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_big_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/aes_big_cbcenc.c
new file mode 100644
index 00000000..265e53b8
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_big_cbcenc.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_big_cbcenc_init(br_aes_big_cbcenc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_big_cbcenc_vtable;
+	ctx->num_rounds = br_aes_keysched(ctx->skey, key, len);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_big_cbcenc_run(const br_aes_big_cbcenc_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf, *ivbuf;
+
+	ivbuf = iv;
+	buf = data;
+	while (len > 0) {
+		int i;
+
+		for (i = 0; i < 16; i ++) {
+			buf[i] ^= ivbuf[i];
+		}
+		br_aes_big_encrypt(ctx->num_rounds, ctx->skey, buf);
+		memcpy(ivbuf, buf, 16);
+		buf += 16;
+		len -= 16;
+	}
+}
+
+/* see bearssl_block.h */
+const br_block_cbcenc_class br_aes_big_cbcenc_vtable = {
+	sizeof(br_aes_big_cbcenc_keys),
+	16,
+	4,
+	(void (*)(const br_block_cbcenc_class **, const void *, size_t))
+		&br_aes_big_cbcenc_init,
+	(void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t))
+		&br_aes_big_cbcenc_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_big_ctr.c b/test/monniaux/BearSSL/src/symcipher/aes_big_ctr.c
new file mode 100644
index 00000000..18fbb846
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_big_ctr.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_big_ctr_init(br_aes_big_ctr_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_big_ctr_vtable;
+	ctx->num_rounds = br_aes_keysched(ctx->skey, key, len);
+}
+
+static void
+xorbuf(void *dst, const void *src, size_t len)
+{
+	unsigned char *d;
+	const unsigned char *s;
+
+	d = dst;
+	s = src;
+	while (len -- > 0) {
+		*d ++ ^= *s ++;
+	}
+}
+
+/* see bearssl_block.h */
+uint32_t
+br_aes_big_ctr_run(const br_aes_big_ctr_keys *ctx,
+	const void *iv, uint32_t cc, void *data, size_t len)
+{
+	unsigned char *buf;
+
+	buf = data;
+	while (len > 0) {
+		unsigned char tmp[16];
+
+		memcpy(tmp, iv, 12);
+		br_enc32be(tmp + 12, cc ++);
+		br_aes_big_encrypt(ctx->num_rounds, ctx->skey, tmp);
+		if (len <= 16) {
+			xorbuf(buf, tmp, len);
+			break;
+		}
+		xorbuf(buf, tmp, 16);
+		buf += 16;
+		len -= 16;
+	}
+	return cc;
+}
+
+/* see bearssl_block.h */
+const br_block_ctr_class br_aes_big_ctr_vtable = {
+	sizeof(br_aes_big_ctr_keys),
+	16,
+	4,
+	(void (*)(const br_block_ctr_class **, const void *, size_t))
+		&br_aes_big_ctr_init,
+	(uint32_t (*)(const br_block_ctr_class *const *,
+		const void *, uint32_t, void *, size_t))
+		&br_aes_big_ctr_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_big_ctrcbc.c b/test/monniaux/BearSSL/src/symcipher/aes_big_ctrcbc.c
new file mode 100644
index 00000000..d45ca769
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_big_ctrcbc.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_big_ctrcbc_init(br_aes_big_ctrcbc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_big_ctrcbc_vtable;
+	ctx->num_rounds = br_aes_keysched(ctx->skey, key, len);
+}
+
+static void
+xorbuf(void *dst, const void *src, size_t len)
+{
+	unsigned char *d;
+	const unsigned char *s;
+
+	d = dst;
+	s = src;
+	while (len -- > 0) {
+		*d ++ ^= *s ++;
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_aes_big_ctrcbc_ctr(const br_aes_big_ctrcbc_keys *ctx,
+	void *ctr, void *data, size_t len)
+{
+	unsigned char *buf, *bctr;
+	uint32_t cc0, cc1, cc2, cc3;
+
+	buf = data;
+	bctr = ctr;
+	cc3 = br_dec32be(bctr +  0);
+	cc2 = br_dec32be(bctr +  4);
+	cc1 = br_dec32be(bctr +  8);
+	cc0 = br_dec32be(bctr + 12);
+	while (len > 0) {
+		unsigned char tmp[16];
+		uint32_t carry;
+
+		br_enc32be(tmp +  0, cc3);
+		br_enc32be(tmp +  4, cc2);
+		br_enc32be(tmp +  8, cc1);
+		br_enc32be(tmp + 12, cc0);
+		br_aes_big_encrypt(ctx->num_rounds, ctx->skey, tmp);
+		xorbuf(buf, tmp, 16);
+		buf += 16;
+		len -= 16;
+		cc0 ++;
+		carry = (~(cc0 | -cc0)) >> 31;
+		cc1 += carry;
+		carry &= (~(cc1 | -cc1)) >> 31;
+		cc2 += carry;
+		carry &= (~(cc2 | -cc2)) >> 31;
+		cc3 += carry;
+	}
+	br_enc32be(bctr +  0, cc3);
+	br_enc32be(bctr +  4, cc2);
+	br_enc32be(bctr +  8, cc1);
+	br_enc32be(bctr + 12, cc0);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_big_ctrcbc_mac(const br_aes_big_ctrcbc_keys *ctx,
+	void *cbcmac, const void *data, size_t len)
+{
+	const unsigned char *buf;
+
+	buf = data;
+	while (len > 0) {
+		xorbuf(cbcmac, buf, 16);
+		br_aes_big_encrypt(ctx->num_rounds, ctx->skey, cbcmac);
+		buf += 16;
+		len -= 16;
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_aes_big_ctrcbc_encrypt(const br_aes_big_ctrcbc_keys *ctx,
+	void *ctr, void *cbcmac, void *data, size_t len)
+{
+	br_aes_big_ctrcbc_ctr(ctx, ctr, data, len);
+	br_aes_big_ctrcbc_mac(ctx, cbcmac, data, len);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_big_ctrcbc_decrypt(const br_aes_big_ctrcbc_keys *ctx,
+	void *ctr, void *cbcmac, void *data, size_t len)
+{
+	br_aes_big_ctrcbc_mac(ctx, cbcmac, data, len);
+	br_aes_big_ctrcbc_ctr(ctx, ctr, data, len);
+}
+
+/* see bearssl_block.h */
+const br_block_ctrcbc_class br_aes_big_ctrcbc_vtable = {
+	sizeof(br_aes_big_ctrcbc_keys),
+	16,
+	4,
+	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
+		&br_aes_big_ctrcbc_init,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, void *, size_t))
+		&br_aes_big_ctrcbc_encrypt,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, void *, size_t))
+		&br_aes_big_ctrcbc_decrypt,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, size_t))
+		&br_aes_big_ctrcbc_ctr,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, const void *, size_t))
+		&br_aes_big_ctrcbc_mac
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_big_dec.c b/test/monniaux/BearSSL/src/symcipher/aes_big_dec.c
new file mode 100644
index 00000000..a5d0e3c6
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_big_dec.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Inverse S-box (used in key schedule for decryption).
+ */
+static const unsigned char iS[] = {
+	0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E,
+	0x81, 0xF3, 0xD7, 0xFB, 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87,
+	0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, 0x54, 0x7B, 0x94, 0x32,
+	0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E,
+	0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49,
+	0x6D, 0x8B, 0xD1, 0x25, 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16,
+	0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, 0x6C, 0x70, 0x48, 0x50,
+	0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84,
+	0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05,
+	0xB8, 0xB3, 0x45, 0x06, 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02,
+	0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, 0x3A, 0x91, 0x11, 0x41,
+	0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73,
+	0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8,
+	0x1C, 0x75, 0xDF, 0x6E, 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89,
+	0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, 0xFC, 0x56, 0x3E, 0x4B,
+	0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4,
+	0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59,
+	0x27, 0x80, 0xEC, 0x5F, 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D,
+	0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, 0xA0, 0xE0, 0x3B, 0x4D,
+	0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61,
+	0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63,
+	0x55, 0x21, 0x0C, 0x7D
+};
+
+static const uint32_t iSsm0[] = {
+	0x51F4A750, 0x7E416553, 0x1A17A4C3, 0x3A275E96, 0x3BAB6BCB, 0x1F9D45F1,
+	0xACFA58AB, 0x4BE30393, 0x2030FA55, 0xAD766DF6, 0x88CC7691, 0xF5024C25,
+	0x4FE5D7FC, 0xC52ACBD7, 0x26354480, 0xB562A38F, 0xDEB15A49, 0x25BA1B67,
+	0x45EA0E98, 0x5DFEC0E1, 0xC32F7502, 0x814CF012, 0x8D4697A3, 0x6BD3F9C6,
+	0x038F5FE7, 0x15929C95, 0xBF6D7AEB, 0x955259DA, 0xD4BE832D, 0x587421D3,
+	0x49E06929, 0x8EC9C844, 0x75C2896A, 0xF48E7978, 0x99583E6B, 0x27B971DD,
+	0xBEE14FB6, 0xF088AD17, 0xC920AC66, 0x7DCE3AB4, 0x63DF4A18, 0xE51A3182,
+	0x97513360, 0x62537F45, 0xB16477E0, 0xBB6BAE84, 0xFE81A01C, 0xF9082B94,
+	0x70486858, 0x8F45FD19, 0x94DE6C87, 0x527BF8B7, 0xAB73D323, 0x724B02E2,
+	0xE31F8F57, 0x6655AB2A, 0xB2EB2807, 0x2FB5C203, 0x86C57B9A, 0xD33708A5,
+	0x302887F2, 0x23BFA5B2, 0x02036ABA, 0xED16825C, 0x8ACF1C2B, 0xA779B492,
+	0xF307F2F0, 0x4E69E2A1, 0x65DAF4CD, 0x0605BED5, 0xD134621F, 0xC4A6FE8A,
+	0x342E539D, 0xA2F355A0, 0x058AE132, 0xA4F6EB75, 0x0B83EC39, 0x4060EFAA,
+	0x5E719F06, 0xBD6E1051, 0x3E218AF9, 0x96DD063D, 0xDD3E05AE, 0x4DE6BD46,
+	0x91548DB5, 0x71C45D05, 0x0406D46F, 0x605015FF, 0x1998FB24, 0xD6BDE997,
+	0x894043CC, 0x67D99E77, 0xB0E842BD, 0x07898B88, 0xE7195B38, 0x79C8EEDB,
+	0xA17C0A47, 0x7C420FE9, 0xF8841EC9, 0x00000000, 0x09808683, 0x322BED48,
+	0x1E1170AC, 0x6C5A724E, 0xFD0EFFFB, 0x0F853856, 0x3DAED51E, 0x362D3927,
+	0x0A0FD964, 0x685CA621, 0x9B5B54D1, 0x24362E3A, 0x0C0A67B1, 0x9357E70F,
+	0xB4EE96D2, 0x1B9B919E, 0x80C0C54F, 0x61DC20A2, 0x5A774B69, 0x1C121A16,
+	0xE293BA0A, 0xC0A02AE5, 0x3C22E043, 0x121B171D, 0x0E090D0B, 0xF28BC7AD,
+	0x2DB6A8B9, 0x141EA9C8, 0x57F11985, 0xAF75074C, 0xEE99DDBB, 0xA37F60FD,
+	0xF701269F, 0x5C72F5BC, 0x44663BC5, 0x5BFB7E34, 0x8B432976, 0xCB23C6DC,
+	0xB6EDFC68, 0xB8E4F163, 0xD731DCCA, 0x42638510, 0x13972240, 0x84C61120,
+	0x854A247D, 0xD2BB3DF8, 0xAEF93211, 0xC729A16D, 0x1D9E2F4B, 0xDCB230F3,
+	0x0D8652EC, 0x77C1E3D0, 0x2BB3166C, 0xA970B999, 0x119448FA, 0x47E96422,
+	0xA8FC8CC4, 0xA0F03F1A, 0x567D2CD8, 0x223390EF, 0x87494EC7, 0xD938D1C1,
+	0x8CCAA2FE, 0x98D40B36, 0xA6F581CF, 0xA57ADE28, 0xDAB78E26, 0x3FADBFA4,
+	0x2C3A9DE4, 0x5078920D, 0x6A5FCC9B, 0x547E4662, 0xF68D13C2, 0x90D8B8E8,
+	0x2E39F75E, 0x82C3AFF5, 0x9F5D80BE, 0x69D0937C, 0x6FD52DA9, 0xCF2512B3,
+	0xC8AC993B, 0x10187DA7, 0xE89C636E, 0xDB3BBB7B, 0xCD267809, 0x6E5918F4,
+	0xEC9AB701, 0x834F9AA8, 0xE6956E65, 0xAAFFE67E, 0x21BCCF08, 0xEF15E8E6,
+	0xBAE79BD9, 0x4A6F36CE, 0xEA9F09D4, 0x29B07CD6, 0x31A4B2AF, 0x2A3F2331,
+	0xC6A59430, 0x35A266C0, 0x744EBC37, 0xFC82CAA6, 0xE090D0B0, 0x33A7D815,
+	0xF104984A, 0x41ECDAF7, 0x7FCD500E, 0x1791F62F, 0x764DD68D, 0x43EFB04D,
+	0xCCAA4D54, 0xE49604DF, 0x9ED1B5E3, 0x4C6A881B, 0xC12C1FB8, 0x4665517F,
+	0x9D5EEA04, 0x018C355D, 0xFA877473, 0xFB0B412E, 0xB3671D5A, 0x92DBD252,
+	0xE9105633, 0x6DD64713, 0x9AD7618C, 0x37A10C7A, 0x59F8148E, 0xEB133C89,
+	0xCEA927EE, 0xB761C935, 0xE11CE5ED, 0x7A47B13C, 0x9CD2DF59, 0x55F2733F,
+	0x1814CE79, 0x73C737BF, 0x53F7CDEA, 0x5FFDAA5B, 0xDF3D6F14, 0x7844DB86,
+	0xCAAFF381, 0xB968C43E, 0x3824342C, 0xC2A3405F, 0x161DC372, 0xBCE2250C,
+	0x283C498B, 0xFF0D9541, 0x39A80171, 0x080CB3DE, 0xD8B4E49C, 0x6456C190,
+	0x7BCB8461, 0xD532B670, 0x486C5C74, 0xD0B85742
+};
+
+static unsigned
+mul2(unsigned x)
+{
+	x <<= 1;
+	return x ^ ((unsigned)(-(int)(x >> 8)) & 0x11B);
+}
+
+static unsigned
+mul9(unsigned x)
+{
+	return x ^ mul2(mul2(mul2(x)));
+}
+
+static unsigned
+mulb(unsigned x)
+{
+	unsigned x2;
+	
+	x2 = mul2(x);
+	return x ^ x2 ^ mul2(mul2(x2));
+}
+
+static unsigned
+muld(unsigned x)
+{
+	unsigned x4;
+
+	x4 = mul2(mul2(x));
+	return x ^ x4 ^ mul2(x4);
+}
+
+static unsigned
+mule(unsigned x)
+{
+	unsigned x2, x4;
+
+	x2 = mul2(x);
+	x4 = mul2(x2);
+	return x2 ^ x4 ^ mul2(x4);
+}
+
+/* see inner.h */
+unsigned
+br_aes_big_keysched_inv(uint32_t *skey, const void *key, size_t key_len)
+{
+	unsigned num_rounds;
+	int i, m;
+
+	/*
+	 * Sub-keys for decryption are distinct from encryption sub-keys
+	 * in that InvMixColumns() is already applied for the inner
+	 * rounds.
+	 */
+	num_rounds = br_aes_keysched(skey, key, key_len);
+	m = (int)(num_rounds << 2);
+	for (i = 4; i < m; i ++) {
+		uint32_t p;
+		unsigned p0, p1, p2, p3;
+		uint32_t q0, q1, q2, q3;
+
+		p = skey[i];
+		p0 = p >> 24;
+		p1 = (p >> 16) & 0xFF;
+		p2 = (p >> 8) & 0xFF;
+		p3 = p & 0xFF;
+		q0 = mule(p0) ^ mulb(p1) ^ muld(p2) ^ mul9(p3);
+		q1 = mul9(p0) ^ mule(p1) ^ mulb(p2) ^ muld(p3);
+		q2 = muld(p0) ^ mul9(p1) ^ mule(p2) ^ mulb(p3);
+		q3 = mulb(p0) ^ muld(p1) ^ mul9(p2) ^ mule(p3);
+		skey[i] = (q0 << 24) | (q1 << 16) | (q2 << 8) | q3;
+	}
+	return num_rounds;
+}
+
+static inline uint32_t
+rotr(uint32_t x, int n)
+{
+	return (x << (32 - n)) | (x >> n);
+}
+
+#define iSboxExt0(x)   (iSsm0[x])
+#define iSboxExt1(x)   (rotr(iSsm0[x], 8))
+#define iSboxExt2(x)   (rotr(iSsm0[x], 16))
+#define iSboxExt3(x)   (rotr(iSsm0[x], 24))
+
+/* see bearssl.h */
+void
+br_aes_big_decrypt(unsigned num_rounds, const uint32_t *skey, void *data)
+{
+	unsigned char *buf;
+	uint32_t s0, s1, s2, s3;
+	uint32_t t0, t1, t2, t3;
+	unsigned u;
+
+	buf = data;
+	s0 = br_dec32be(buf);
+	s1 = br_dec32be(buf + 4);
+	s2 = br_dec32be(buf + 8);
+	s3 = br_dec32be(buf + 12);
+	s0 ^= skey[(num_rounds << 2) + 0];
+	s1 ^= skey[(num_rounds << 2) + 1];
+	s2 ^= skey[(num_rounds << 2) + 2];
+	s3 ^= skey[(num_rounds << 2) + 3];
+	for (u = num_rounds - 1; u > 0; u --) {
+		uint32_t v0 = iSboxExt0(s0 >> 24)
+			^ iSboxExt1((s3 >> 16) & 0xFF)
+			^ iSboxExt2((s2 >> 8) & 0xFF)
+			^ iSboxExt3(s1 & 0xFF);
+		uint32_t v1 = iSboxExt0(s1 >> 24)
+			^ iSboxExt1((s0 >> 16) & 0xFF)
+			^ iSboxExt2((s3 >> 8) & 0xFF)
+			^ iSboxExt3(s2 & 0xFF);
+		uint32_t v2 = iSboxExt0(s2 >> 24)
+			^ iSboxExt1((s1 >> 16) & 0xFF)
+			^ iSboxExt2((s0 >> 8) & 0xFF)
+			^ iSboxExt3(s3 & 0xFF);
+		uint32_t v3 = iSboxExt0(s3 >> 24)
+			^ iSboxExt1((s2 >> 16) & 0xFF)
+			^ iSboxExt2((s1 >> 8) & 0xFF)
+			^ iSboxExt3(s0 & 0xFF);
+		s0 = v0;
+		s1 = v1;
+		s2 = v2;
+		s3 = v3;
+		s0 ^= skey[u << 2];
+		s1 ^= skey[(u << 2) + 1];
+		s2 ^= skey[(u << 2) + 2];
+		s3 ^= skey[(u << 2) + 3];
+	}
+	t0 = ((uint32_t)iS[s0 >> 24] << 24)
+		| ((uint32_t)iS[(s3 >> 16) & 0xFF] << 16)
+		| ((uint32_t)iS[(s2 >> 8) & 0xFF] << 8)
+		| (uint32_t)iS[s1 & 0xFF];
+	t1 = ((uint32_t)iS[s1 >> 24] << 24)
+		| ((uint32_t)iS[(s0 >> 16) & 0xFF] << 16)
+		| ((uint32_t)iS[(s3 >> 8) & 0xFF] << 8)
+		| (uint32_t)iS[s2 & 0xFF];
+	t2 = ((uint32_t)iS[s2 >> 24] << 24)
+		| ((uint32_t)iS[(s1 >> 16) & 0xFF] << 16)
+		| ((uint32_t)iS[(s0 >> 8) & 0xFF] << 8)
+		| (uint32_t)iS[s3 & 0xFF];
+	t3 = ((uint32_t)iS[s3 >> 24] << 24)
+		| ((uint32_t)iS[(s2 >> 16) & 0xFF] << 16)
+		| ((uint32_t)iS[(s1 >> 8) & 0xFF] << 8)
+		| (uint32_t)iS[s0 & 0xFF];
+	s0 = t0 ^ skey[0];
+	s1 = t1 ^ skey[1];
+	s2 = t2 ^ skey[2];
+	s3 = t3 ^ skey[3];
+	br_enc32be(buf, s0);
+	br_enc32be(buf + 4, s1);
+	br_enc32be(buf + 8, s2);
+	br_enc32be(buf + 12, s3);
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_big_enc.c b/test/monniaux/BearSSL/src/symcipher/aes_big_enc.c
new file mode 100644
index 00000000..bbabb9a6
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_big_enc.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define S   br_aes_S
+
+static const uint32_t Ssm0[] = {
+	0xC66363A5, 0xF87C7C84, 0xEE777799, 0xF67B7B8D, 0xFFF2F20D, 0xD66B6BBD,
+	0xDE6F6FB1, 0x91C5C554, 0x60303050, 0x02010103, 0xCE6767A9, 0x562B2B7D,
+	0xE7FEFE19, 0xB5D7D762, 0x4DABABE6, 0xEC76769A, 0x8FCACA45, 0x1F82829D,
+	0x89C9C940, 0xFA7D7D87, 0xEFFAFA15, 0xB25959EB, 0x8E4747C9, 0xFBF0F00B,
+	0x41ADADEC, 0xB3D4D467, 0x5FA2A2FD, 0x45AFAFEA, 0x239C9CBF, 0x53A4A4F7,
+	0xE4727296, 0x9BC0C05B, 0x75B7B7C2, 0xE1FDFD1C, 0x3D9393AE, 0x4C26266A,
+	0x6C36365A, 0x7E3F3F41, 0xF5F7F702, 0x83CCCC4F, 0x6834345C, 0x51A5A5F4,
+	0xD1E5E534, 0xF9F1F108, 0xE2717193, 0xABD8D873, 0x62313153, 0x2A15153F,
+	0x0804040C, 0x95C7C752, 0x46232365, 0x9DC3C35E, 0x30181828, 0x379696A1,
+	0x0A05050F, 0x2F9A9AB5, 0x0E070709, 0x24121236, 0x1B80809B, 0xDFE2E23D,
+	0xCDEBEB26, 0x4E272769, 0x7FB2B2CD, 0xEA75759F, 0x1209091B, 0x1D83839E,
+	0x582C2C74, 0x341A1A2E, 0x361B1B2D, 0xDC6E6EB2, 0xB45A5AEE, 0x5BA0A0FB,
+	0xA45252F6, 0x763B3B4D, 0xB7D6D661, 0x7DB3B3CE, 0x5229297B, 0xDDE3E33E,
+	0x5E2F2F71, 0x13848497, 0xA65353F5, 0xB9D1D168, 0x00000000, 0xC1EDED2C,
+	0x40202060, 0xE3FCFC1F, 0x79B1B1C8, 0xB65B5BED, 0xD46A6ABE, 0x8DCBCB46,
+	0x67BEBED9, 0x7239394B, 0x944A4ADE, 0x984C4CD4, 0xB05858E8, 0x85CFCF4A,
+	0xBBD0D06B, 0xC5EFEF2A, 0x4FAAAAE5, 0xEDFBFB16, 0x864343C5, 0x9A4D4DD7,
+	0x66333355, 0x11858594, 0x8A4545CF, 0xE9F9F910, 0x04020206, 0xFE7F7F81,
+	0xA05050F0, 0x783C3C44, 0x259F9FBA, 0x4BA8A8E3, 0xA25151F3, 0x5DA3A3FE,
+	0x804040C0, 0x058F8F8A, 0x3F9292AD, 0x219D9DBC, 0x70383848, 0xF1F5F504,
+	0x63BCBCDF, 0x77B6B6C1, 0xAFDADA75, 0x42212163, 0x20101030, 0xE5FFFF1A,
+	0xFDF3F30E, 0xBFD2D26D, 0x81CDCD4C, 0x180C0C14, 0x26131335, 0xC3ECEC2F,
+	0xBE5F5FE1, 0x359797A2, 0x884444CC, 0x2E171739, 0x93C4C457, 0x55A7A7F2,
+	0xFC7E7E82, 0x7A3D3D47, 0xC86464AC, 0xBA5D5DE7, 0x3219192B, 0xE6737395,
+	0xC06060A0, 0x19818198, 0x9E4F4FD1, 0xA3DCDC7F, 0x44222266, 0x542A2A7E,
+	0x3B9090AB, 0x0B888883, 0x8C4646CA, 0xC7EEEE29, 0x6BB8B8D3, 0x2814143C,
+	0xA7DEDE79, 0xBC5E5EE2, 0x160B0B1D, 0xADDBDB76, 0xDBE0E03B, 0x64323256,
+	0x743A3A4E, 0x140A0A1E, 0x924949DB, 0x0C06060A, 0x4824246C, 0xB85C5CE4,
+	0x9FC2C25D, 0xBDD3D36E, 0x43ACACEF, 0xC46262A6, 0x399191A8, 0x319595A4,
+	0xD3E4E437, 0xF279798B, 0xD5E7E732, 0x8BC8C843, 0x6E373759, 0xDA6D6DB7,
+	0x018D8D8C, 0xB1D5D564, 0x9C4E4ED2, 0x49A9A9E0, 0xD86C6CB4, 0xAC5656FA,
+	0xF3F4F407, 0xCFEAEA25, 0xCA6565AF, 0xF47A7A8E, 0x47AEAEE9, 0x10080818,
+	0x6FBABAD5, 0xF0787888, 0x4A25256F, 0x5C2E2E72, 0x381C1C24, 0x57A6A6F1,
+	0x73B4B4C7, 0x97C6C651, 0xCBE8E823, 0xA1DDDD7C, 0xE874749C, 0x3E1F1F21,
+	0x964B4BDD, 0x61BDBDDC, 0x0D8B8B86, 0x0F8A8A85, 0xE0707090, 0x7C3E3E42,
+	0x71B5B5C4, 0xCC6666AA, 0x904848D8, 0x06030305, 0xF7F6F601, 0x1C0E0E12,
+	0xC26161A3, 0x6A35355F, 0xAE5757F9, 0x69B9B9D0, 0x17868691, 0x99C1C158,
+	0x3A1D1D27, 0x279E9EB9, 0xD9E1E138, 0xEBF8F813, 0x2B9898B3, 0x22111133,
+	0xD26969BB, 0xA9D9D970, 0x078E8E89, 0x339494A7, 0x2D9B9BB6, 0x3C1E1E22,
+	0x15878792, 0xC9E9E920, 0x87CECE49, 0xAA5555FF, 0x50282878, 0xA5DFDF7A,
+	0x038C8C8F, 0x59A1A1F8, 0x09898980, 0x1A0D0D17, 0x65BFBFDA, 0xD7E6E631,
+	0x844242C6, 0xD06868B8, 0x824141C3, 0x299999B0, 0x5A2D2D77, 0x1E0F0F11,
+	0x7BB0B0CB, 0xA85454FC, 0x6DBBBBD6, 0x2C16163A
+};
+
+static inline uint32_t
+rotr(uint32_t x, int n)
+{
+	return (x << (32 - n)) | (x >> n);
+}
+
+#define SboxExt0(x)   (Ssm0[x])
+#define SboxExt1(x)   (rotr(Ssm0[x], 8))
+#define SboxExt2(x)   (rotr(Ssm0[x], 16))
+#define SboxExt3(x)   (rotr(Ssm0[x], 24))
+
+
+/* see bearssl.h */
+void
+br_aes_big_encrypt(unsigned num_rounds, const uint32_t *skey, void *data)
+{
+	unsigned char *buf;
+	uint32_t s0, s1, s2, s3;
+	uint32_t t0, t1, t2, t3;
+	unsigned u;
+
+	buf = data;
+	s0 = br_dec32be(buf);
+	s1 = br_dec32be(buf + 4);
+	s2 = br_dec32be(buf + 8);
+	s3 = br_dec32be(buf + 12);
+	s0 ^= skey[0];
+	s1 ^= skey[1];
+	s2 ^= skey[2];
+	s3 ^= skey[3];
+	for (u = 1; u < num_rounds; u ++) {
+		uint32_t v0, v1, v2, v3;
+
+		v0 = SboxExt0(s0 >> 24)
+			^ SboxExt1((s1 >> 16) & 0xFF)
+			^ SboxExt2((s2 >> 8) & 0xFF)
+			^ SboxExt3(s3 & 0xFF);
+		v1 = SboxExt0(s1 >> 24)
+			^ SboxExt1((s2 >> 16) & 0xFF)
+			^ SboxExt2((s3 >> 8) & 0xFF)
+			^ SboxExt3(s0 & 0xFF);
+		v2 = SboxExt0(s2 >> 24)
+			^ SboxExt1((s3 >> 16) & 0xFF)
+			^ SboxExt2((s0 >> 8) & 0xFF)
+			^ SboxExt3(s1 & 0xFF);
+		v3 = SboxExt0(s3 >> 24)
+			^ SboxExt1((s0 >> 16) & 0xFF)
+			^ SboxExt2((s1 >> 8) & 0xFF)
+			^ SboxExt3(s2 & 0xFF);
+		s0 = v0;
+		s1 = v1;
+		s2 = v2;
+		s3 = v3;
+		s0 ^= skey[u << 2];
+		s1 ^= skey[(u << 2) + 1];
+		s2 ^= skey[(u << 2) + 2];
+		s3 ^= skey[(u << 2) + 3];
+	}
+	t0 = ((uint32_t)S[s0 >> 24] << 24)
+		| ((uint32_t)S[(s1 >> 16) & 0xFF] << 16)
+		| ((uint32_t)S[(s2 >> 8) & 0xFF] << 8)
+		| (uint32_t)S[s3 & 0xFF];
+	t1 = ((uint32_t)S[s1 >> 24] << 24)
+		| ((uint32_t)S[(s2 >> 16) & 0xFF] << 16)
+		| ((uint32_t)S[(s3 >> 8) & 0xFF] << 8)
+		| (uint32_t)S[s0 & 0xFF];
+	t2 = ((uint32_t)S[s2 >> 24] << 24)
+		| ((uint32_t)S[(s3 >> 16) & 0xFF] << 16)
+		| ((uint32_t)S[(s0 >> 8) & 0xFF] << 8)
+		| (uint32_t)S[s1 & 0xFF];
+	t3 = ((uint32_t)S[s3 >> 24] << 24)
+		| ((uint32_t)S[(s0 >> 16) & 0xFF] << 16)
+		| ((uint32_t)S[(s1 >> 8) & 0xFF] << 8)
+		| (uint32_t)S[s2 & 0xFF];
+	s0 = t0 ^ skey[num_rounds << 2];
+	s1 = t1 ^ skey[(num_rounds << 2) + 1];
+	s2 = t2 ^ skey[(num_rounds << 2) + 2];
+	s3 = t3 ^ skey[(num_rounds << 2) + 3];
+	br_enc32be(buf, s0);
+	br_enc32be(buf + 4, s1);
+	br_enc32be(buf + 8, s2);
+	br_enc32be(buf + 12, s3);
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_common.c b/test/monniaux/BearSSL/src/symcipher/aes_common.c
new file mode 100644
index 00000000..72c64fb1
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_common.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+static const uint32_t Rcon[] = {
+	0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000,
+	0x40000000, 0x80000000, 0x1B000000, 0x36000000
+};
+
+#define S   br_aes_S
+
+/* see inner.h */
+const unsigned char br_aes_S[] = {
+	0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B,
+	0xFE, 0xD7, 0xAB, 0x76, 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
+	0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, 0xB7, 0xFD, 0x93, 0x26,
+	0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
+	0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2,
+	0xEB, 0x27, 0xB2, 0x75, 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
+	0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, 0x53, 0xD1, 0x00, 0xED,
+	0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
+	0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F,
+	0x50, 0x3C, 0x9F, 0xA8, 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
+	0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, 0xCD, 0x0C, 0x13, 0xEC,
+	0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
+	0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14,
+	0xDE, 0x5E, 0x0B, 0xDB, 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
+	0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, 0xE7, 0xC8, 0x37, 0x6D,
+	0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
+	0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F,
+	0x4B, 0xBD, 0x8B, 0x8A, 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
+	0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, 0xE1, 0xF8, 0x98, 0x11,
+	0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
+	0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F,
+	0xB0, 0x54, 0xBB, 0x16
+};
+
+static uint32_t
+SubWord(uint32_t x)
+{
+	return ((uint32_t)S[x >> 24] << 24)
+		| ((uint32_t)S[(x >> 16) & 0xFF] << 16)
+		| ((uint32_t)S[(x >> 8) & 0xFF] << 8)
+		| (uint32_t)S[x & 0xFF];
+}
+
+/* see inner.h */
+unsigned
+br_aes_keysched(uint32_t *skey, const void *key, size_t key_len)
+{
+	unsigned num_rounds;
+	int i, j, k, nk, nkf;
+
+	switch (key_len) {
+	case 16:
+		num_rounds = 10;
+		break;
+	case 24:
+		num_rounds = 12;
+		break;
+	case 32:
+		num_rounds = 14;
+		break;
+	default:
+		/* abort(); */
+		return 0;
+	}
+	nk = (int)(key_len >> 2);
+	nkf = (int)((num_rounds + 1) << 2);
+	for (i = 0; i < nk; i ++) {
+		skey[i] = br_dec32be((const unsigned char *)key + (i << 2));
+	}
+	for (i = nk, j = 0, k = 0; i < nkf; i ++) {
+		uint32_t tmp;
+
+		tmp = skey[i - 1];
+		if (j == 0) {
+			tmp = (tmp << 8) | (tmp >> 24);
+			tmp = SubWord(tmp) ^ Rcon[k];
+		} else if (nk > 6 && j == 4) {
+			tmp = SubWord(tmp);
+		}
+		skey[i] = skey[i - nk] ^ tmp;
+		if (++ j == nk) {
+			j = 0;
+			k ++;
+		}
+	}
+	return num_rounds;
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct.c b/test/monniaux/BearSSL/src/symcipher/aes_ct.c
new file mode 100644
index 00000000..66776d9e
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+br_aes_ct_bitslice_Sbox(uint32_t *q)
+{
+	/*
+	 * This S-box implementation is a straightforward translation of
+	 * the circuit described by Boyar and Peralta in "A new
+	 * combinational logic minimization technique with applications
+	 * to cryptology" (https://eprint.iacr.org/2009/191.pdf).
+	 *
+	 * Note that variables x* (input) and s* (output) are numbered
+	 * in "reverse" order (x0 is the high bit, x7 is the low bit).
+	 */
+
+	uint32_t x0, x1, x2, x3, x4, x5, x6, x7;
+	uint32_t y1, y2, y3, y4, y5, y6, y7, y8, y9;
+	uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
+	uint32_t y20, y21;
+	uint32_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9;
+	uint32_t z10, z11, z12, z13, z14, z15, z16, z17;
+	uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+	uint32_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19;
+	uint32_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
+	uint32_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39;
+	uint32_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49;
+	uint32_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59;
+	uint32_t t60, t61, t62, t63, t64, t65, t66, t67;
+	uint32_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+	x0 = q[7];
+	x1 = q[6];
+	x2 = q[5];
+	x3 = q[4];
+	x4 = q[3];
+	x5 = q[2];
+	x6 = q[1];
+	x7 = q[0];
+
+	/*
+	 * Top linear transformation.
+	 */
+	y14 = x3 ^ x5;
+	y13 = x0 ^ x6;
+	y9 = x0 ^ x3;
+	y8 = x0 ^ x5;
+	t0 = x1 ^ x2;
+	y1 = t0 ^ x7;
+	y4 = y1 ^ x3;
+	y12 = y13 ^ y14;
+	y2 = y1 ^ x0;
+	y5 = y1 ^ x6;
+	y3 = y5 ^ y8;
+	t1 = x4 ^ y12;
+	y15 = t1 ^ x5;
+	y20 = t1 ^ x1;
+	y6 = y15 ^ x7;
+	y10 = y15 ^ t0;
+	y11 = y20 ^ y9;
+	y7 = x7 ^ y11;
+	y17 = y10 ^ y11;
+	y19 = y10 ^ y8;
+	y16 = t0 ^ y11;
+	y21 = y13 ^ y16;
+	y18 = x0 ^ y16;
+
+	/*
+	 * Non-linear section.
+	 */
+	t2 = y12 & y15;
+	t3 = y3 & y6;
+	t4 = t3 ^ t2;
+	t5 = y4 & x7;
+	t6 = t5 ^ t2;
+	t7 = y13 & y16;
+	t8 = y5 & y1;
+	t9 = t8 ^ t7;
+	t10 = y2 & y7;
+	t11 = t10 ^ t7;
+	t12 = y9 & y11;
+	t13 = y14 & y17;
+	t14 = t13 ^ t12;
+	t15 = y8 & y10;
+	t16 = t15 ^ t12;
+	t17 = t4 ^ t14;
+	t18 = t6 ^ t16;
+	t19 = t9 ^ t14;
+	t20 = t11 ^ t16;
+	t21 = t17 ^ y20;
+	t22 = t18 ^ y19;
+	t23 = t19 ^ y21;
+	t24 = t20 ^ y18;
+
+	t25 = t21 ^ t22;
+	t26 = t21 & t23;
+	t27 = t24 ^ t26;
+	t28 = t25 & t27;
+	t29 = t28 ^ t22;
+	t30 = t23 ^ t24;
+	t31 = t22 ^ t26;
+	t32 = t31 & t30;
+	t33 = t32 ^ t24;
+	t34 = t23 ^ t33;
+	t35 = t27 ^ t33;
+	t36 = t24 & t35;
+	t37 = t36 ^ t34;
+	t38 = t27 ^ t36;
+	t39 = t29 & t38;
+	t40 = t25 ^ t39;
+
+	t41 = t40 ^ t37;
+	t42 = t29 ^ t33;
+	t43 = t29 ^ t40;
+	t44 = t33 ^ t37;
+	t45 = t42 ^ t41;
+	z0 = t44 & y15;
+	z1 = t37 & y6;
+	z2 = t33 & x7;
+	z3 = t43 & y16;
+	z4 = t40 & y1;
+	z5 = t29 & y7;
+	z6 = t42 & y11;
+	z7 = t45 & y17;
+	z8 = t41 & y10;
+	z9 = t44 & y12;
+	z10 = t37 & y3;
+	z11 = t33 & y4;
+	z12 = t43 & y13;
+	z13 = t40 & y5;
+	z14 = t29 & y2;
+	z15 = t42 & y9;
+	z16 = t45 & y14;
+	z17 = t41 & y8;
+
+	/*
+	 * Bottom linear transformation.
+	 */
+	t46 = z15 ^ z16;
+	t47 = z10 ^ z11;
+	t48 = z5 ^ z13;
+	t49 = z9 ^ z10;
+	t50 = z2 ^ z12;
+	t51 = z2 ^ z5;
+	t52 = z7 ^ z8;
+	t53 = z0 ^ z3;
+	t54 = z6 ^ z7;
+	t55 = z16 ^ z17;
+	t56 = z12 ^ t48;
+	t57 = t50 ^ t53;
+	t58 = z4 ^ t46;
+	t59 = z3 ^ t54;
+	t60 = t46 ^ t57;
+	t61 = z14 ^ t57;
+	t62 = t52 ^ t58;
+	t63 = t49 ^ t58;
+	t64 = z4 ^ t59;
+	t65 = t61 ^ t62;
+	t66 = z1 ^ t63;
+	s0 = t59 ^ t63;
+	s6 = t56 ^ ~t62;
+	s7 = t48 ^ ~t60;
+	t67 = t64 ^ t65;
+	s3 = t53 ^ t66;
+	s4 = t51 ^ t66;
+	s5 = t47 ^ t65;
+	s1 = t64 ^ ~s3;
+	s2 = t55 ^ ~t67;
+
+	q[7] = s0;
+	q[6] = s1;
+	q[5] = s2;
+	q[4] = s3;
+	q[3] = s4;
+	q[2] = s5;
+	q[1] = s6;
+	q[0] = s7;
+}
+
+/* see inner.h */
+void
+br_aes_ct_ortho(uint32_t *q)
+{
+#define SWAPN(cl, ch, s, x, y)   do { \
+		uint32_t a, b; \
+		a = (x); \
+		b = (y); \
+		(x) = (a & (uint32_t)cl) | ((b & (uint32_t)cl) << (s)); \
+		(y) = ((a & (uint32_t)ch) >> (s)) | (b & (uint32_t)ch); \
+	} while (0)
+
+#define SWAP2(x, y)   SWAPN(0x55555555, 0xAAAAAAAA, 1, x, y)
+#define SWAP4(x, y)   SWAPN(0x33333333, 0xCCCCCCCC, 2, x, y)
+#define SWAP8(x, y)   SWAPN(0x0F0F0F0F, 0xF0F0F0F0, 4, x, y)
+
+	SWAP2(q[0], q[1]);
+	SWAP2(q[2], q[3]);
+	SWAP2(q[4], q[5]);
+	SWAP2(q[6], q[7]);
+
+	SWAP4(q[0], q[2]);
+	SWAP4(q[1], q[3]);
+	SWAP4(q[4], q[6]);
+	SWAP4(q[5], q[7]);
+
+	SWAP8(q[0], q[4]);
+	SWAP8(q[1], q[5]);
+	SWAP8(q[2], q[6]);
+	SWAP8(q[3], q[7]);
+}
+
+static const unsigned char Rcon[] = {
+	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
+};
+
+static uint32_t
+sub_word(uint32_t x)
+{
+	uint32_t q[8];
+	int i;
+
+	for (i = 0; i < 8; i ++) {
+		q[i] = x;
+	}
+	br_aes_ct_ortho(q);
+	br_aes_ct_bitslice_Sbox(q);
+	br_aes_ct_ortho(q);
+	return q[0];
+}
+
+/* see inner.h */
+unsigned
+br_aes_ct_keysched(uint32_t *comp_skey, const void *key, size_t key_len)
+{
+	unsigned num_rounds;
+	int i, j, k, nk, nkf;
+	uint32_t tmp;
+	uint32_t skey[120];
+
+	switch (key_len) {
+	case 16:
+		num_rounds = 10;
+		break;
+	case 24:
+		num_rounds = 12;
+		break;
+	case 32:
+		num_rounds = 14;
+		break;
+	default:
+		/* abort(); */
+		return 0;
+	}
+	nk = (int)(key_len >> 2);
+	nkf = (int)((num_rounds + 1) << 2);
+	tmp = 0;
+	for (i = 0; i < nk; i ++) {
+		tmp = br_dec32le((const unsigned char *)key + (i << 2));
+		skey[(i << 1) + 0] = tmp;
+		skey[(i << 1) + 1] = tmp;
+	}
+	for (i = nk, j = 0, k = 0; i < nkf; i ++) {
+		if (j == 0) {
+			tmp = (tmp << 24) | (tmp >> 8);
+			tmp = sub_word(tmp) ^ Rcon[k];
+		} else if (nk > 6 && j == 4) {
+			tmp = sub_word(tmp);
+		}
+		tmp ^= skey[(i - nk) << 1];
+		skey[(i << 1) + 0] = tmp;
+		skey[(i << 1) + 1] = tmp;
+		if (++ j == nk) {
+			j = 0;
+			k ++;
+		}
+	}
+	for (i = 0; i < nkf; i += 4) {
+		br_aes_ct_ortho(skey + (i << 1));
+	}
+	for (i = 0, j = 0; i < nkf; i ++, j += 2) {
+		comp_skey[i] = (skey[j + 0] & 0x55555555)
+			| (skey[j + 1] & 0xAAAAAAAA);
+	}
+	return num_rounds;
+}
+
+/* see inner.h */
+void
+br_aes_ct_skey_expand(uint32_t *skey,
+	unsigned num_rounds, const uint32_t *comp_skey)
+{
+	unsigned u, v, n;
+
+	n = (num_rounds + 1) << 2;
+	for (u = 0, v = 0; u < n; u ++, v += 2) {
+		uint32_t x, y;
+
+		x = y = comp_skey[u];
+		x &= 0x55555555;
+		skey[v + 0] = x | (x << 1);
+		y &= 0xAAAAAAAA;
+		skey[v + 1] = y | (y >> 1);
+	}
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64.c
new file mode 100644
index 00000000..15238116
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+br_aes_ct64_bitslice_Sbox(uint64_t *q)
+{
+	/*
+	 * This S-box implementation is a straightforward translation of
+	 * the circuit described by Boyar and Peralta in "A new
+	 * combinational logic minimization technique with applications
+	 * to cryptology" (https://eprint.iacr.org/2009/191.pdf).
+	 *
+	 * Note that variables x* (input) and s* (output) are numbered
+	 * in "reverse" order (x0 is the high bit, x7 is the low bit).
+	 */
+
+	uint64_t x0, x1, x2, x3, x4, x5, x6, x7;
+	uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9;
+	uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
+	uint64_t y20, y21;
+	uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9;
+	uint64_t z10, z11, z12, z13, z14, z15, z16, z17;
+	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+	uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19;
+	uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
+	uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39;
+	uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49;
+	uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59;
+	uint64_t t60, t61, t62, t63, t64, t65, t66, t67;
+	uint64_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+	x0 = q[7];
+	x1 = q[6];
+	x2 = q[5];
+	x3 = q[4];
+	x4 = q[3];
+	x5 = q[2];
+	x6 = q[1];
+	x7 = q[0];
+
+	/*
+	 * Top linear transformation.
+	 */
+	y14 = x3 ^ x5;
+	y13 = x0 ^ x6;
+	y9 = x0 ^ x3;
+	y8 = x0 ^ x5;
+	t0 = x1 ^ x2;
+	y1 = t0 ^ x7;
+	y4 = y1 ^ x3;
+	y12 = y13 ^ y14;
+	y2 = y1 ^ x0;
+	y5 = y1 ^ x6;
+	y3 = y5 ^ y8;
+	t1 = x4 ^ y12;
+	y15 = t1 ^ x5;
+	y20 = t1 ^ x1;
+	y6 = y15 ^ x7;
+	y10 = y15 ^ t0;
+	y11 = y20 ^ y9;
+	y7 = x7 ^ y11;
+	y17 = y10 ^ y11;
+	y19 = y10 ^ y8;
+	y16 = t0 ^ y11;
+	y21 = y13 ^ y16;
+	y18 = x0 ^ y16;
+
+	/*
+	 * Non-linear section.
+	 */
+	t2 = y12 & y15;
+	t3 = y3 & y6;
+	t4 = t3 ^ t2;
+	t5 = y4 & x7;
+	t6 = t5 ^ t2;
+	t7 = y13 & y16;
+	t8 = y5 & y1;
+	t9 = t8 ^ t7;
+	t10 = y2 & y7;
+	t11 = t10 ^ t7;
+	t12 = y9 & y11;
+	t13 = y14 & y17;
+	t14 = t13 ^ t12;
+	t15 = y8 & y10;
+	t16 = t15 ^ t12;
+	t17 = t4 ^ t14;
+	t18 = t6 ^ t16;
+	t19 = t9 ^ t14;
+	t20 = t11 ^ t16;
+	t21 = t17 ^ y20;
+	t22 = t18 ^ y19;
+	t23 = t19 ^ y21;
+	t24 = t20 ^ y18;
+
+	t25 = t21 ^ t22;
+	t26 = t21 & t23;
+	t27 = t24 ^ t26;
+	t28 = t25 & t27;
+	t29 = t28 ^ t22;
+	t30 = t23 ^ t24;
+	t31 = t22 ^ t26;
+	t32 = t31 & t30;
+	t33 = t32 ^ t24;
+	t34 = t23 ^ t33;
+	t35 = t27 ^ t33;
+	t36 = t24 & t35;
+	t37 = t36 ^ t34;
+	t38 = t27 ^ t36;
+	t39 = t29 & t38;
+	t40 = t25 ^ t39;
+
+	t41 = t40 ^ t37;
+	t42 = t29 ^ t33;
+	t43 = t29 ^ t40;
+	t44 = t33 ^ t37;
+	t45 = t42 ^ t41;
+	z0 = t44 & y15;
+	z1 = t37 & y6;
+	z2 = t33 & x7;
+	z3 = t43 & y16;
+	z4 = t40 & y1;
+	z5 = t29 & y7;
+	z6 = t42 & y11;
+	z7 = t45 & y17;
+	z8 = t41 & y10;
+	z9 = t44 & y12;
+	z10 = t37 & y3;
+	z11 = t33 & y4;
+	z12 = t43 & y13;
+	z13 = t40 & y5;
+	z14 = t29 & y2;
+	z15 = t42 & y9;
+	z16 = t45 & y14;
+	z17 = t41 & y8;
+
+	/*
+	 * Bottom linear transformation.
+	 */
+	t46 = z15 ^ z16;
+	t47 = z10 ^ z11;
+	t48 = z5 ^ z13;
+	t49 = z9 ^ z10;
+	t50 = z2 ^ z12;
+	t51 = z2 ^ z5;
+	t52 = z7 ^ z8;
+	t53 = z0 ^ z3;
+	t54 = z6 ^ z7;
+	t55 = z16 ^ z17;
+	t56 = z12 ^ t48;
+	t57 = t50 ^ t53;
+	t58 = z4 ^ t46;
+	t59 = z3 ^ t54;
+	t60 = t46 ^ t57;
+	t61 = z14 ^ t57;
+	t62 = t52 ^ t58;
+	t63 = t49 ^ t58;
+	t64 = z4 ^ t59;
+	t65 = t61 ^ t62;
+	t66 = z1 ^ t63;
+	s0 = t59 ^ t63;
+	s6 = t56 ^ ~t62;
+	s7 = t48 ^ ~t60;
+	t67 = t64 ^ t65;
+	s3 = t53 ^ t66;
+	s4 = t51 ^ t66;
+	s5 = t47 ^ t65;
+	s1 = t64 ^ ~s3;
+	s2 = t55 ^ ~t67;
+
+	q[7] = s0;
+	q[6] = s1;
+	q[5] = s2;
+	q[4] = s3;
+	q[3] = s4;
+	q[2] = s5;
+	q[1] = s6;
+	q[0] = s7;
+}
+
+/* see inner.h */
+void
+br_aes_ct64_ortho(uint64_t *q)
+{
+#define SWAPN(cl, ch, s, x, y)   do { \
+		uint64_t a, b; \
+		a = (x); \
+		b = (y); \
+		(x) = (a & (uint64_t)cl) | ((b & (uint64_t)cl) << (s)); \
+		(y) = ((a & (uint64_t)ch) >> (s)) | (b & (uint64_t)ch); \
+	} while (0)
+
+#define SWAP2(x, y)    SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA,  1, x, y)
+#define SWAP4(x, y)    SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC,  2, x, y)
+#define SWAP8(x, y)    SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0,  4, x, y)
+
+	SWAP2(q[0], q[1]);
+	SWAP2(q[2], q[3]);
+	SWAP2(q[4], q[5]);
+	SWAP2(q[6], q[7]);
+
+	SWAP4(q[0], q[2]);
+	SWAP4(q[1], q[3]);
+	SWAP4(q[4], q[6]);
+	SWAP4(q[5], q[7]);
+
+	SWAP8(q[0], q[4]);
+	SWAP8(q[1], q[5]);
+	SWAP8(q[2], q[6]);
+	SWAP8(q[3], q[7]);
+}
+
+/* see inner.h */
+void
+br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w)
+{
+	uint64_t x0, x1, x2, x3;
+
+	x0 = w[0];
+	x1 = w[1];
+	x2 = w[2];
+	x3 = w[3];
+	x0 |= (x0 << 16);
+	x1 |= (x1 << 16);
+	x2 |= (x2 << 16);
+	x3 |= (x3 << 16);
+	x0 &= (uint64_t)0x0000FFFF0000FFFF;
+	x1 &= (uint64_t)0x0000FFFF0000FFFF;
+	x2 &= (uint64_t)0x0000FFFF0000FFFF;
+	x3 &= (uint64_t)0x0000FFFF0000FFFF;
+	x0 |= (x0 << 8);
+	x1 |= (x1 << 8);
+	x2 |= (x2 << 8);
+	x3 |= (x3 << 8);
+	x0 &= (uint64_t)0x00FF00FF00FF00FF;
+	x1 &= (uint64_t)0x00FF00FF00FF00FF;
+	x2 &= (uint64_t)0x00FF00FF00FF00FF;
+	x3 &= (uint64_t)0x00FF00FF00FF00FF;
+	*q0 = x0 | (x2 << 8);
+	*q1 = x1 | (x3 << 8);
+}
+
+/* see inner.h */
+void
+br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1)
+{
+	uint64_t x0, x1, x2, x3;
+
+	x0 = q0 & (uint64_t)0x00FF00FF00FF00FF;
+	x1 = q1 & (uint64_t)0x00FF00FF00FF00FF;
+	x2 = (q0 >> 8) & (uint64_t)0x00FF00FF00FF00FF;
+	x3 = (q1 >> 8) & (uint64_t)0x00FF00FF00FF00FF;
+	x0 |= (x0 >> 8);
+	x1 |= (x1 >> 8);
+	x2 |= (x2 >> 8);
+	x3 |= (x3 >> 8);
+	x0 &= (uint64_t)0x0000FFFF0000FFFF;
+	x1 &= (uint64_t)0x0000FFFF0000FFFF;
+	x2 &= (uint64_t)0x0000FFFF0000FFFF;
+	x3 &= (uint64_t)0x0000FFFF0000FFFF;
+	w[0] = (uint32_t)x0 | (uint32_t)(x0 >> 16);
+	w[1] = (uint32_t)x1 | (uint32_t)(x1 >> 16);
+	w[2] = (uint32_t)x2 | (uint32_t)(x2 >> 16);
+	w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16);
+}
+
+static const unsigned char Rcon[] = {
+	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
+};
+
+static uint32_t
+sub_word(uint32_t x)
+{
+	uint64_t q[8];
+
+	memset(q, 0, sizeof q);
+	q[0] = x;
+	br_aes_ct64_ortho(q);
+	br_aes_ct64_bitslice_Sbox(q);
+	br_aes_ct64_ortho(q);
+	return (uint32_t)q[0];
+}
+
+/* see inner.h */
+unsigned
+br_aes_ct64_keysched(uint64_t *comp_skey, const void *key, size_t key_len)
+{
+	unsigned num_rounds;
+	int i, j, k, nk, nkf;
+	uint32_t tmp;
+	uint32_t skey[60];
+
+	switch (key_len) {
+	case 16:
+		num_rounds = 10;
+		break;
+	case 24:
+		num_rounds = 12;
+		break;
+	case 32:
+		num_rounds = 14;
+		break;
+	default:
+		/* abort(); */
+		return 0;
+	}
+	nk = (int)(key_len >> 2);
+	nkf = (int)((num_rounds + 1) << 2);
+	br_range_dec32le(skey, (key_len >> 2), key);
+	tmp = skey[(key_len >> 2) - 1];
+	for (i = nk, j = 0, k = 0; i < nkf; i ++) {
+		if (j == 0) {
+			tmp = (tmp << 24) | (tmp >> 8);
+			tmp = sub_word(tmp) ^ Rcon[k];
+		} else if (nk > 6 && j == 4) {
+			tmp = sub_word(tmp);
+		}
+		tmp ^= skey[i - nk];
+		skey[i] = tmp;
+		if (++ j == nk) {
+			j = 0;
+			k ++;
+		}
+	}
+
+	for (i = 0, j = 0; i < nkf; i += 4, j += 2) {
+		uint64_t q[8];
+
+		br_aes_ct64_interleave_in(&q[0], &q[4], skey + i);
+		q[1] = q[0];
+		q[2] = q[0];
+		q[3] = q[0];
+		q[5] = q[4];
+		q[6] = q[4];
+		q[7] = q[4];
+		br_aes_ct64_ortho(q);
+		comp_skey[j + 0] =
+			  (q[0] & (uint64_t)0x1111111111111111)
+			| (q[1] & (uint64_t)0x2222222222222222)
+			| (q[2] & (uint64_t)0x4444444444444444)
+			| (q[3] & (uint64_t)0x8888888888888888);
+		comp_skey[j + 1] =
+			  (q[4] & (uint64_t)0x1111111111111111)
+			| (q[5] & (uint64_t)0x2222222222222222)
+			| (q[6] & (uint64_t)0x4444444444444444)
+			| (q[7] & (uint64_t)0x8888888888888888);
+	}
+	return num_rounds;
+}
+
+/* see inner.h */
+void
+br_aes_ct64_skey_expand(uint64_t *skey,
+	unsigned num_rounds, const uint64_t *comp_skey)
+{
+	unsigned u, v, n;
+
+	n = (num_rounds + 1) << 1;
+	for (u = 0, v = 0; u < n; u ++, v += 4) {
+		uint64_t x0, x1, x2, x3;
+
+		x0 = x1 = x2 = x3 = comp_skey[u];
+		x0 &= (uint64_t)0x1111111111111111;
+		x1 &= (uint64_t)0x2222222222222222;
+		x2 &= (uint64_t)0x4444444444444444;
+		x3 &= (uint64_t)0x8888888888888888;
+		x1 >>= 1;
+		x2 >>= 2;
+		x3 >>= 3;
+		skey[v + 0] = (x0 << 4) - x0;
+		skey[v + 1] = (x1 << 4) - x1;
+		skey[v + 2] = (x2 << 4) - x2;
+		skey[v + 3] = (x3 << 4) - x3;
+	}
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcdec.c
new file mode 100644
index 00000000..5a7360bc
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcdec.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_ct64_cbcdec_init(br_aes_ct64_cbcdec_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_ct64_cbcdec_vtable;
+	ctx->num_rounds = br_aes_ct64_keysched(ctx->skey, key, len);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_ct64_cbcdec_run(const br_aes_ct64_cbcdec_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf;
+	uint64_t sk_exp[120];
+	uint32_t ivw[4];
+
+	br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+	br_range_dec32le(ivw, 4, iv);
+	buf = data;
+	while (len > 0) {
+		uint64_t q[8];
+		uint32_t w1[16], w2[16];
+		int i;
+
+		if (len >= 64) {
+			br_range_dec32le(w1, 16, buf);
+		} else {
+			br_range_dec32le(w1, len >> 2, buf);
+		}
+		for (i = 0; i < 4; i ++) {
+			br_aes_ct64_interleave_in(
+				&q[i], &q[i + 4], w1 + (i << 2));
+		}
+		br_aes_ct64_ortho(q);
+		br_aes_ct64_bitslice_decrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct64_ortho(q);
+		for (i = 0; i < 4; i ++) {
+			br_aes_ct64_interleave_out(
+				w2 + (i << 2), q[i], q[i + 4]);
+		}
+		for (i = 0; i < 4; i ++) {
+			w2[i] ^= ivw[i];
+		}
+		if (len >= 64) {
+			for (i = 4; i < 16; i ++) {
+				w2[i] ^= w1[i - 4];
+			}
+			memcpy(ivw, w1 + 12, sizeof ivw);
+			br_range_enc32le(buf, w2, 16);
+		} else {
+			int j;
+
+			j = (int)(len >> 2);
+			for (i = 4; i < j; i ++) {
+				w2[i] ^= w1[i - 4];
+			}
+			memcpy(ivw, w1 + j - 4, sizeof ivw);
+			br_range_enc32le(buf, w2, j);
+			break;
+		}
+		buf += 64;
+		len -= 64;
+	}
+	br_range_enc32le(iv, ivw, 4);
+}
+
+/* see bearssl_block.h */
+const br_block_cbcdec_class br_aes_ct64_cbcdec_vtable = {
+	sizeof(br_aes_ct64_cbcdec_keys),
+	16,
+	4,
+	(void (*)(const br_block_cbcdec_class **, const void *, size_t))
+		&br_aes_ct64_cbcdec_init,
+	(void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t))
+		&br_aes_ct64_cbcdec_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcenc.c
new file mode 100644
index 00000000..6cb9dece
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcenc.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_ct64_cbcenc_init(br_aes_ct64_cbcenc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_ct64_cbcenc_vtable;
+	ctx->num_rounds = br_aes_ct64_keysched(ctx->skey, key, len);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_ct64_cbcenc_run(const br_aes_ct64_cbcenc_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf;
+	uint64_t sk_exp[120];
+	uint32_t ivw[4];
+
+	br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+	br_range_dec32le(ivw, 4, iv);
+	buf = data;
+	while (len > 0) {
+		uint32_t w[4];
+		uint64_t q[8];
+
+		w[0] = ivw[0] ^ br_dec32le(buf);
+		w[1] = ivw[1] ^ br_dec32le(buf + 4);
+		w[2] = ivw[2] ^ br_dec32le(buf + 8);
+		w[3] = ivw[3] ^ br_dec32le(buf + 12);
+		br_aes_ct64_interleave_in(&q[0], &q[4], w);
+		br_aes_ct64_ortho(q);
+		br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct64_ortho(q);
+		br_aes_ct64_interleave_out(w, q[0], q[4]);
+		memcpy(ivw, w, sizeof w);
+		br_enc32le(buf, w[0]);
+		br_enc32le(buf + 4, w[1]);
+		br_enc32le(buf + 8, w[2]);
+		br_enc32le(buf + 12, w[3]);
+		buf += 16;
+		len -= 16;
+	}
+	br_range_enc32le(iv, ivw, 4);
+}
+
+/* see bearssl_block.h */
+const br_block_cbcenc_class br_aes_ct64_cbcenc_vtable = {
+	sizeof(br_aes_ct64_cbcenc_keys),
+	16,
+	4,
+	(void (*)(const br_block_cbcenc_class **, const void *, size_t))
+		&br_aes_ct64_cbcenc_init,
+	(void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t))
+		&br_aes_ct64_cbcenc_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64_ctr.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64_ctr.c
new file mode 100644
index 00000000..1275873d
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64_ctr.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_ct64_ctr_init(br_aes_ct64_ctr_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_ct64_ctr_vtable;
+	ctx->num_rounds = br_aes_ct64_keysched(ctx->skey, key, len);
+}
+
+static void
+xorbuf(void *dst, const void *src, size_t len)
+{
+	unsigned char *d;
+	const unsigned char *s;
+
+	d = dst;
+	s = src;
+	while (len -- > 0) {
+		*d ++ ^= *s ++;
+	}
+}
+
+/* see bearssl_block.h */
+uint32_t
+br_aes_ct64_ctr_run(const br_aes_ct64_ctr_keys *ctx,
+	const void *iv, uint32_t cc, void *data, size_t len)
+{
+	unsigned char *buf;
+	uint32_t ivw[16];
+	uint64_t sk_exp[120];
+
+	br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+	br_range_dec32le(ivw, 3, iv);
+	memcpy(ivw + 4, ivw, 3 * sizeof(uint32_t));
+	memcpy(ivw + 8, ivw, 3 * sizeof(uint32_t));
+	memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t));
+	buf = data;
+	while (len > 0) {
+		uint64_t q[8];
+		uint32_t w[16];
+		unsigned char tmp[64];
+		int i;
+
+		/*
+		 * TODO: see if we can save on the first br_aes_ct64_ortho()
+		 * call, since iv0/iv1/iv2 are constant for the whole run.
+		 */
+		memcpy(w, ivw, sizeof ivw);
+		w[3] = br_swap32(cc);
+		w[7] = br_swap32(cc + 1);
+		w[11] = br_swap32(cc + 2);
+		w[15] = br_swap32(cc + 3);
+		for (i = 0; i < 4; i ++) {
+			br_aes_ct64_interleave_in(
+				&q[i], &q[i + 4], w + (i << 2));
+		}
+		br_aes_ct64_ortho(q);
+		br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct64_ortho(q);
+		for (i = 0; i < 4; i ++) {
+			br_aes_ct64_interleave_out(
+				w + (i << 2), q[i], q[i + 4]);
+		}
+		br_range_enc32le(tmp, w, 16);
+		if (len <= 64) {
+			xorbuf(buf, tmp, len);
+			cc += (uint32_t)len >> 4;
+			break;
+		}
+		xorbuf(buf, tmp, 64);
+		buf += 64;
+		len -= 64;
+		cc += 4;
+	}
+	return cc;
+}
+
+/* see bearssl_block.h */
+const br_block_ctr_class br_aes_ct64_ctr_vtable = {
+	sizeof(br_aes_ct64_ctr_keys),
+	16,
+	4,
+	(void (*)(const br_block_ctr_class **, const void *, size_t))
+		&br_aes_ct64_ctr_init,
+	(uint32_t (*)(const br_block_ctr_class *const *,
+		const void *, uint32_t, void *, size_t))
+		&br_aes_ct64_ctr_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64_ctrcbc.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64_ctrcbc.c
new file mode 100644
index 00000000..21bb8efa
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64_ctrcbc.c
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_ct64_ctrcbc_init(br_aes_ct64_ctrcbc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_ct64_ctrcbc_vtable;
+	ctx->num_rounds = br_aes_ct64_keysched(ctx->skey, key, len);
+}
+
+static void
+xorbuf(void *dst, const void *src, size_t len)
+{
+	unsigned char *d;
+	const unsigned char *s;
+
+	d = dst;
+	s = src;
+	while (len -- > 0) {
+		*d ++ ^= *s ++;
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_aes_ct64_ctrcbc_ctr(const br_aes_ct64_ctrcbc_keys *ctx,
+	void *ctr, void *data, size_t len)
+{
+	unsigned char *buf;
+	unsigned char *ivbuf;
+	uint32_t iv0, iv1, iv2, iv3;
+	uint64_t sk_exp[120];
+
+	br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+
+	/*
+	 * We keep the counter as four 32-bit values, with big-endian
+	 * convention, because that's what is expected for purposes of
+	 * incrementing the counter value.
+	 */
+	ivbuf = ctr;
+	iv0 = br_dec32be(ivbuf +  0);
+	iv1 = br_dec32be(ivbuf +  4);
+	iv2 = br_dec32be(ivbuf +  8);
+	iv3 = br_dec32be(ivbuf + 12);
+
+	buf = data;
+	while (len > 0) {
+		uint64_t q[8];
+		uint32_t w[16];
+		unsigned char tmp[64];
+		int i, j;
+
+		/*
+		 * The bitslice implementation expects values in
+		 * little-endian convention, so we have to byteswap them.
+		 */
+		j = (len >= 64) ? 16 : (int)(len >> 2);
+		for (i = 0; i < j; i += 4) {
+			uint32_t carry;
+
+			w[i + 0] = br_swap32(iv0);
+			w[i + 1] = br_swap32(iv1);
+			w[i + 2] = br_swap32(iv2);
+			w[i + 3] = br_swap32(iv3);
+			iv3 ++;
+			carry = ~(iv3 | -iv3) >> 31;
+			iv2 += carry;
+			carry &= -(~(iv2 | -iv2) >> 31);
+			iv1 += carry;
+			carry &= -(~(iv1 | -iv1) >> 31);
+			iv0 += carry;
+		}
+		memset(w + i, 0, (16 - i) * sizeof(uint32_t));
+
+		for (i = 0; i < 4; i ++) {
+			br_aes_ct64_interleave_in(
+				&q[i], &q[i + 4], w + (i << 2));
+		}
+		br_aes_ct64_ortho(q);
+		br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct64_ortho(q);
+		for (i = 0; i < 4; i ++) {
+			br_aes_ct64_interleave_out(
+				w + (i << 2), q[i], q[i + 4]);
+		}
+
+		br_range_enc32le(tmp, w, 16);
+		if (len <= 64) {
+			xorbuf(buf, tmp, len);
+			break;
+		}
+		xorbuf(buf, tmp, 64);
+		buf += 64;
+		len -= 64;
+	}
+	br_enc32be(ivbuf +  0, iv0);
+	br_enc32be(ivbuf +  4, iv1);
+	br_enc32be(ivbuf +  8, iv2);
+	br_enc32be(ivbuf + 12, iv3);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_ct64_ctrcbc_mac(const br_aes_ct64_ctrcbc_keys *ctx,
+	void *cbcmac, const void *data, size_t len)
+{
+	const unsigned char *buf;
+	uint32_t cm0, cm1, cm2, cm3;
+	uint64_t q[8];
+	uint64_t sk_exp[120];
+
+	br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+
+	cm0 = br_dec32le((unsigned char *)cbcmac +  0);
+	cm1 = br_dec32le((unsigned char *)cbcmac +  4);
+	cm2 = br_dec32le((unsigned char *)cbcmac +  8);
+	cm3 = br_dec32le((unsigned char *)cbcmac + 12);
+
+	buf = data;
+	memset(q, 0, sizeof q);
+	while (len > 0) {
+		uint32_t w[4];
+
+		w[0] = cm0 ^ br_dec32le(buf +  0);
+		w[1] = cm1 ^ br_dec32le(buf +  4);
+		w[2] = cm2 ^ br_dec32le(buf +  8);
+		w[3] = cm3 ^ br_dec32le(buf + 12);
+
+		br_aes_ct64_interleave_in(&q[0], &q[4], w);
+		br_aes_ct64_ortho(q);
+		br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct64_ortho(q);
+		br_aes_ct64_interleave_out(w, q[0], q[4]);
+
+		cm0 = w[0];
+		cm1 = w[1];
+		cm2 = w[2];
+		cm3 = w[3];
+		buf += 16;
+		len -= 16;
+	}
+
+	br_enc32le((unsigned char *)cbcmac +  0, cm0);
+	br_enc32le((unsigned char *)cbcmac +  4, cm1);
+	br_enc32le((unsigned char *)cbcmac +  8, cm2);
+	br_enc32le((unsigned char *)cbcmac + 12, cm3);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_ct64_ctrcbc_encrypt(const br_aes_ct64_ctrcbc_keys *ctx,
+	void *ctr, void *cbcmac, void *data, size_t len)
+{
+	/*
+	 * When encrypting, the CBC-MAC processing must be lagging by
+	 * one block, since it operates on the encrypted values, so
+	 * it must wait for that encryption to complete.
+	 */
+
+	unsigned char *buf;
+	unsigned char *ivbuf;
+	uint32_t iv0, iv1, iv2, iv3;
+	uint32_t cm0, cm1, cm2, cm3;
+	uint64_t sk_exp[120];
+	uint64_t q[8];
+	int first_iter;
+
+	br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+
+	/*
+	 * We keep the counter as four 32-bit values, with big-endian
+	 * convention, because that's what is expected for purposes of
+	 * incrementing the counter value.
+	 */
+	ivbuf = ctr;
+	iv0 = br_dec32be(ivbuf +  0);
+	iv1 = br_dec32be(ivbuf +  4);
+	iv2 = br_dec32be(ivbuf +  8);
+	iv3 = br_dec32be(ivbuf + 12);
+
+	/*
+	 * The current CBC-MAC value is kept in little-endian convention.
+	 */
+	cm0 = br_dec32le((unsigned char *)cbcmac +  0);
+	cm1 = br_dec32le((unsigned char *)cbcmac +  4);
+	cm2 = br_dec32le((unsigned char *)cbcmac +  8);
+	cm3 = br_dec32le((unsigned char *)cbcmac + 12);
+
+	buf = data;
+	first_iter = 1;
+	memset(q, 0, sizeof q);
+	while (len > 0) {
+		uint32_t w[8], carry;
+
+		/*
+		 * The bitslice implementation expects values in
+		 * little-endian convention, so we have to byteswap them.
+		 */
+		w[0] = br_swap32(iv0);
+		w[1] = br_swap32(iv1);
+		w[2] = br_swap32(iv2);
+		w[3] = br_swap32(iv3);
+		iv3 ++;
+		carry = ~(iv3 | -iv3) >> 31;
+		iv2 += carry;
+		carry &= -(~(iv2 | -iv2) >> 31);
+		iv1 += carry;
+		carry &= -(~(iv1 | -iv1) >> 31);
+		iv0 += carry;
+
+		/*
+		 * The block for CBC-MAC.
+		 */
+		w[4] = cm0;
+		w[5] = cm1;
+		w[6] = cm2;
+		w[7] = cm3;
+
+		br_aes_ct64_interleave_in(&q[0], &q[4], w);
+		br_aes_ct64_interleave_in(&q[1], &q[5], w + 4);
+		br_aes_ct64_ortho(q);
+		br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct64_ortho(q);
+		br_aes_ct64_interleave_out(w, q[0], q[4]);
+		br_aes_ct64_interleave_out(w + 4, q[1], q[5]);
+
+		/*
+		 * We do the XOR with the plaintext in 32-bit registers,
+		 * so that the value are available for CBC-MAC processing
+		 * as well.
+		 */
+		w[0] ^= br_dec32le(buf +  0);
+		w[1] ^= br_dec32le(buf +  4);
+		w[2] ^= br_dec32le(buf +  8);
+		w[3] ^= br_dec32le(buf + 12);
+		br_enc32le(buf +  0, w[0]);
+		br_enc32le(buf +  4, w[1]);
+		br_enc32le(buf +  8, w[2]);
+		br_enc32le(buf + 12, w[3]);
+
+		buf += 16;
+		len -= 16;
+
+		/*
+		 * We set the cm* values to the block to encrypt in the
+		 * next iteration.
+		 */
+		if (first_iter) {
+			first_iter = 0;
+			cm0 ^= w[0];
+			cm1 ^= w[1];
+			cm2 ^= w[2];
+			cm3 ^= w[3];
+		} else {
+			cm0 = w[0] ^ w[4];
+			cm1 = w[1] ^ w[5];
+			cm2 = w[2] ^ w[6];
+			cm3 = w[3] ^ w[7];
+		}
+
+		/*
+		 * If this was the last iteration, then compute the
+		 * extra block encryption to complete CBC-MAC.
+		 */
+		if (len == 0) {
+			w[0] = cm0;
+			w[1] = cm1;
+			w[2] = cm2;
+			w[3] = cm3;
+			br_aes_ct64_interleave_in(&q[0], &q[4], w);
+			br_aes_ct64_ortho(q);
+			br_aes_ct64_bitslice_encrypt(
+				ctx->num_rounds, sk_exp, q);
+			br_aes_ct64_ortho(q);
+			br_aes_ct64_interleave_out(w, q[0], q[4]);
+			cm0 = w[0];
+			cm1 = w[1];
+			cm2 = w[2];
+			cm3 = w[3];
+			break;
+		}
+	}
+
+	br_enc32be(ivbuf +  0, iv0);
+	br_enc32be(ivbuf +  4, iv1);
+	br_enc32be(ivbuf +  8, iv2);
+	br_enc32be(ivbuf + 12, iv3);
+	br_enc32le((unsigned char *)cbcmac +  0, cm0);
+	br_enc32le((unsigned char *)cbcmac +  4, cm1);
+	br_enc32le((unsigned char *)cbcmac +  8, cm2);
+	br_enc32le((unsigned char *)cbcmac + 12, cm3);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_ct64_ctrcbc_decrypt(const br_aes_ct64_ctrcbc_keys *ctx,
+	void *ctr, void *cbcmac, void *data, size_t len)
+{
+	unsigned char *buf;
+	unsigned char *ivbuf;
+	uint32_t iv0, iv1, iv2, iv3;
+	uint32_t cm0, cm1, cm2, cm3;
+	uint64_t sk_exp[120];
+	uint64_t q[8];
+
+	br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+
+	/*
+	 * We keep the counter as four 32-bit values, with big-endian
+	 * convention, because that's what is expected for purposes of
+	 * incrementing the counter value.
+	 */
+	ivbuf = ctr;
+	iv0 = br_dec32be(ivbuf +  0);
+	iv1 = br_dec32be(ivbuf +  4);
+	iv2 = br_dec32be(ivbuf +  8);
+	iv3 = br_dec32be(ivbuf + 12);
+
+	/*
+	 * The current CBC-MAC value is kept in little-endian convention.
+	 */
+	cm0 = br_dec32le((unsigned char *)cbcmac +  0);
+	cm1 = br_dec32le((unsigned char *)cbcmac +  4);
+	cm2 = br_dec32le((unsigned char *)cbcmac +  8);
+	cm3 = br_dec32le((unsigned char *)cbcmac + 12);
+
+	buf = data;
+	memset(q, 0, sizeof q);
+	while (len > 0) {
+		uint32_t w[8], carry;
+		unsigned char tmp[16];
+
+		/*
+		 * The bitslice implementation expects values in
+		 * little-endian convention, so we have to byteswap them.
+		 */
+		w[0] = br_swap32(iv0);
+		w[1] = br_swap32(iv1);
+		w[2] = br_swap32(iv2);
+		w[3] = br_swap32(iv3);
+		iv3 ++;
+		carry = ~(iv3 | -iv3) >> 31;
+		iv2 += carry;
+		carry &= -(~(iv2 | -iv2) >> 31);
+		iv1 += carry;
+		carry &= -(~(iv1 | -iv1) >> 31);
+		iv0 += carry;
+
+		/*
+		 * The block for CBC-MAC.
+		 */
+		w[4] = cm0 ^ br_dec32le(buf +  0);
+		w[5] = cm1 ^ br_dec32le(buf +  4);
+		w[6] = cm2 ^ br_dec32le(buf +  8);
+		w[7] = cm3 ^ br_dec32le(buf + 12);
+
+		br_aes_ct64_interleave_in(&q[0], &q[4], w);
+		br_aes_ct64_interleave_in(&q[1], &q[5], w + 4);
+		br_aes_ct64_ortho(q);
+		br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct64_ortho(q);
+		br_aes_ct64_interleave_out(w, q[0], q[4]);
+		br_aes_ct64_interleave_out(w + 4, q[1], q[5]);
+
+		br_enc32le(tmp +  0, w[0]);
+		br_enc32le(tmp +  4, w[1]);
+		br_enc32le(tmp +  8, w[2]);
+		br_enc32le(tmp + 12, w[3]);
+		xorbuf(buf, tmp, 16);
+		cm0 = w[4];
+		cm1 = w[5];
+		cm2 = w[6];
+		cm3 = w[7];
+		buf += 16;
+		len -= 16;
+	}
+
+	br_enc32be(ivbuf +  0, iv0);
+	br_enc32be(ivbuf +  4, iv1);
+	br_enc32be(ivbuf +  8, iv2);
+	br_enc32be(ivbuf + 12, iv3);
+	br_enc32le((unsigned char *)cbcmac +  0, cm0);
+	br_enc32le((unsigned char *)cbcmac +  4, cm1);
+	br_enc32le((unsigned char *)cbcmac +  8, cm2);
+	br_enc32le((unsigned char *)cbcmac + 12, cm3);
+}
+
+/* see bearssl_block.h */
+const br_block_ctrcbc_class br_aes_ct64_ctrcbc_vtable = {
+	sizeof(br_aes_ct64_ctrcbc_keys),
+	16,
+	4,
+	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
+		&br_aes_ct64_ctrcbc_init,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, void *, size_t))
+		&br_aes_ct64_ctrcbc_encrypt,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, void *, size_t))
+		&br_aes_ct64_ctrcbc_decrypt,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, size_t))
+		&br_aes_ct64_ctrcbc_ctr,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, const void *, size_t))
+		&br_aes_ct64_ctrcbc_mac
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64_dec.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64_dec.c
new file mode 100644
index 00000000..ab00e099
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64_dec.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+br_aes_ct64_bitslice_invSbox(uint64_t *q)
+{
+	/*
+	 * See br_aes_ct_bitslice_invSbox(). This is the natural extension
+	 * to 64-bit registers.
+	 */
+	uint64_t q0, q1, q2, q3, q4, q5, q6, q7;
+
+	q0 = ~q[0];
+	q1 = ~q[1];
+	q2 = q[2];
+	q3 = q[3];
+	q4 = q[4];
+	q5 = ~q[5];
+	q6 = ~q[6];
+	q7 = q[7];
+	q[7] = q1 ^ q4 ^ q6;
+	q[6] = q0 ^ q3 ^ q5;
+	q[5] = q7 ^ q2 ^ q4;
+	q[4] = q6 ^ q1 ^ q3;
+	q[3] = q5 ^ q0 ^ q2;
+	q[2] = q4 ^ q7 ^ q1;
+	q[1] = q3 ^ q6 ^ q0;
+	q[0] = q2 ^ q5 ^ q7;
+
+	br_aes_ct64_bitslice_Sbox(q);
+
+	q0 = ~q[0];
+	q1 = ~q[1];
+	q2 = q[2];
+	q3 = q[3];
+	q4 = q[4];
+	q5 = ~q[5];
+	q6 = ~q[6];
+	q7 = q[7];
+	q[7] = q1 ^ q4 ^ q6;
+	q[6] = q0 ^ q3 ^ q5;
+	q[5] = q7 ^ q2 ^ q4;
+	q[4] = q6 ^ q1 ^ q3;
+	q[3] = q5 ^ q0 ^ q2;
+	q[2] = q4 ^ q7 ^ q1;
+	q[1] = q3 ^ q6 ^ q0;
+	q[0] = q2 ^ q5 ^ q7;
+}
+
+static void
+add_round_key(uint64_t *q, const uint64_t *sk)
+{
+	int i;
+
+	for (i = 0; i < 8; i ++) {
+		q[i] ^= sk[i];
+	}
+}
+
+static void
+inv_shift_rows(uint64_t *q)
+{
+	int i;
+
+	for (i = 0; i < 8; i ++) {
+		uint64_t x;
+
+		x = q[i];
+		q[i] = (x & (uint64_t)0x000000000000FFFF)
+			| ((x & (uint64_t)0x000000000FFF0000) << 4)
+			| ((x & (uint64_t)0x00000000F0000000) >> 12)
+			| ((x & (uint64_t)0x000000FF00000000) << 8)
+			| ((x & (uint64_t)0x0000FF0000000000) >> 8)
+			| ((x & (uint64_t)0x000F000000000000) << 12)
+			| ((x & (uint64_t)0xFFF0000000000000) >> 4);
+	}
+}
+
+static inline uint64_t
+rotr32(uint64_t x)
+{
+	return (x << 32) | (x >> 32);
+}
+
+static void
+inv_mix_columns(uint64_t *q)
+{
+	uint64_t q0, q1, q2, q3, q4, q5, q6, q7;
+	uint64_t r0, r1, r2, r3, r4, r5, r6, r7;
+
+	q0 = q[0];
+	q1 = q[1];
+	q2 = q[2];
+	q3 = q[3];
+	q4 = q[4];
+	q5 = q[5];
+	q6 = q[6];
+	q7 = q[7];
+	r0 = (q0 >> 16) | (q0 << 48);
+	r1 = (q1 >> 16) | (q1 << 48);
+	r2 = (q2 >> 16) | (q2 << 48);
+	r3 = (q3 >> 16) | (q3 << 48);
+	r4 = (q4 >> 16) | (q4 << 48);
+	r5 = (q5 >> 16) | (q5 << 48);
+	r6 = (q6 >> 16) | (q6 << 48);
+	r7 = (q7 >> 16) | (q7 << 48);
+
+	q[0] = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr32(q0 ^ q5 ^ q6 ^ r0 ^ r5);
+	q[1] = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6);
+	q[2] = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr32(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7);
+	q[3] = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr32(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7);
+	q[4] = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6);
+	q[5] = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7);
+	q[6] = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr32(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7);
+	q[7] = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr32(q4 ^ q5 ^ q7 ^ r4 ^ r7);
+}
+
+/* see inner.h */
+void
+br_aes_ct64_bitslice_decrypt(unsigned num_rounds,
+	const uint64_t *skey, uint64_t *q)
+{
+	unsigned u;
+
+	add_round_key(q, skey + (num_rounds << 3));
+	for (u = num_rounds - 1; u > 0; u --) {
+		inv_shift_rows(q);
+		br_aes_ct64_bitslice_invSbox(q);
+		add_round_key(q, skey + (u << 3));
+		inv_mix_columns(q);
+	}
+	inv_shift_rows(q);
+	br_aes_ct64_bitslice_invSbox(q);
+	add_round_key(q, skey);
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64_enc.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64_enc.c
new file mode 100644
index 00000000..78631ced
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64_enc.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+static inline void
+add_round_key(uint64_t *q, const uint64_t *sk)
+{
+	q[0] ^= sk[0];
+	q[1] ^= sk[1];
+	q[2] ^= sk[2];
+	q[3] ^= sk[3];
+	q[4] ^= sk[4];
+	q[5] ^= sk[5];
+	q[6] ^= sk[6];
+	q[7] ^= sk[7];
+}
+
+static inline void
+shift_rows(uint64_t *q)
+{
+	int i;
+
+	for (i = 0; i < 8; i ++) {
+		uint64_t x;
+
+		x = q[i];
+		q[i] = (x & (uint64_t)0x000000000000FFFF)
+			| ((x & (uint64_t)0x00000000FFF00000) >> 4)
+			| ((x & (uint64_t)0x00000000000F0000) << 12)
+			| ((x & (uint64_t)0x0000FF0000000000) >> 8)
+			| ((x & (uint64_t)0x000000FF00000000) << 8)
+			| ((x & (uint64_t)0xF000000000000000) >> 12)
+			| ((x & (uint64_t)0x0FFF000000000000) << 4);
+	}
+}
+
+static inline uint64_t
+rotr32(uint64_t x)
+{
+	return (x << 32) | (x >> 32);
+}
+
+static inline void
+mix_columns(uint64_t *q)
+{
+	uint64_t q0, q1, q2, q3, q4, q5, q6, q7;
+	uint64_t r0, r1, r2, r3, r4, r5, r6, r7;
+
+	q0 = q[0];
+	q1 = q[1];
+	q2 = q[2];
+	q3 = q[3];
+	q4 = q[4];
+	q5 = q[5];
+	q6 = q[6];
+	q7 = q[7];
+	r0 = (q0 >> 16) | (q0 << 48);
+	r1 = (q1 >> 16) | (q1 << 48);
+	r2 = (q2 >> 16) | (q2 << 48);
+	r3 = (q3 >> 16) | (q3 << 48);
+	r4 = (q4 >> 16) | (q4 << 48);
+	r5 = (q5 >> 16) | (q5 << 48);
+	r6 = (q6 >> 16) | (q6 << 48);
+	r7 = (q7 >> 16) | (q7 << 48);
+
+	q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0);
+	q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1);
+	q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2);
+	q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3);
+	q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4);
+	q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5);
+	q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6);
+	q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7);
+}
+
+/* see inner.h */
+void
+br_aes_ct64_bitslice_encrypt(unsigned num_rounds,
+	const uint64_t *skey, uint64_t *q)
+{
+	unsigned u;
+
+	add_round_key(q, skey);
+	for (u = 1; u < num_rounds; u ++) {
+		br_aes_ct64_bitslice_Sbox(q);
+		shift_rows(q);
+		mix_columns(q);
+		add_round_key(q, skey + (u << 3));
+	}
+	br_aes_ct64_bitslice_Sbox(q);
+	shift_rows(q);
+	add_round_key(q, skey + (num_rounds << 3));
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/aes_ct_cbcdec.c
new file mode 100644
index 00000000..522645ad
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct_cbcdec.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_ct_cbcdec_init(br_aes_ct_cbcdec_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_ct_cbcdec_vtable;
+	ctx->num_rounds = br_aes_ct_keysched(ctx->skey, key, len);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_ct_cbcdec_run(const br_aes_ct_cbcdec_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf, *ivbuf;
+	uint32_t iv0, iv1, iv2, iv3;
+	uint32_t sk_exp[120];
+
+	br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+	ivbuf = iv;
+	iv0 = br_dec32le(ivbuf);
+	iv1 = br_dec32le(ivbuf + 4);
+	iv2 = br_dec32le(ivbuf + 8);
+	iv3 = br_dec32le(ivbuf + 12);
+	buf = data;
+	while (len > 0) {
+		uint32_t q[8], sq[8];
+
+		q[0] = br_dec32le(buf);
+		q[2] = br_dec32le(buf + 4);
+		q[4] = br_dec32le(buf + 8);
+		q[6] = br_dec32le(buf + 12);
+		if (len >= 32) {
+			q[1] = br_dec32le(buf + 16);
+			q[3] = br_dec32le(buf + 20);
+			q[5] = br_dec32le(buf + 24);
+			q[7] = br_dec32le(buf + 28);
+		} else {
+			q[1] = 0;
+			q[3] = 0;
+			q[5] = 0;
+			q[7] = 0;
+		}
+		memcpy(sq, q, sizeof q);
+		br_aes_ct_ortho(q);
+		br_aes_ct_bitslice_decrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct_ortho(q);
+		br_enc32le(buf, q[0] ^ iv0);
+		br_enc32le(buf + 4, q[2] ^ iv1);
+		br_enc32le(buf + 8, q[4] ^ iv2);
+		br_enc32le(buf + 12, q[6] ^ iv3);
+		if (len < 32) {
+			iv0 = sq[0];
+			iv1 = sq[2];
+			iv2 = sq[4];
+			iv3 = sq[6];
+			break;
+		}
+		br_enc32le(buf + 16, q[1] ^ sq[0]);
+		br_enc32le(buf + 20, q[3] ^ sq[2]);
+		br_enc32le(buf + 24, q[5] ^ sq[4]);
+		br_enc32le(buf + 28, q[7] ^ sq[6]);
+		iv0 = sq[1];
+		iv1 = sq[3];
+		iv2 = sq[5];
+		iv3 = sq[7];
+		buf += 32;
+		len -= 32;
+	}
+	br_enc32le(ivbuf, iv0);
+	br_enc32le(ivbuf + 4, iv1);
+	br_enc32le(ivbuf + 8, iv2);
+	br_enc32le(ivbuf + 12, iv3);
+}
+
+/* see bearssl_block.h */
+const br_block_cbcdec_class br_aes_ct_cbcdec_vtable = {
+	sizeof(br_aes_ct_cbcdec_keys),
+	16,
+	4,
+	(void (*)(const br_block_cbcdec_class **, const void *, size_t))
+		&br_aes_ct_cbcdec_init,
+	(void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t))
+		&br_aes_ct_cbcdec_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/aes_ct_cbcenc.c
new file mode 100644
index 00000000..cb85977b
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct_cbcenc.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_ct_cbcenc_init(br_aes_ct_cbcenc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_ct_cbcenc_vtable;
+	ctx->num_rounds = br_aes_ct_keysched(ctx->skey, key, len);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_ct_cbcenc_run(const br_aes_ct_cbcenc_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf, *ivbuf;
+	uint32_t q[8];
+	uint32_t iv0, iv1, iv2, iv3;
+	uint32_t sk_exp[120];
+
+	q[1] = 0;
+	q[3] = 0;
+	q[5] = 0;
+	q[7] = 0;
+	br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+	ivbuf = iv;
+	iv0 = br_dec32le(ivbuf);
+	iv1 = br_dec32le(ivbuf + 4);
+	iv2 = br_dec32le(ivbuf + 8);
+	iv3 = br_dec32le(ivbuf + 12);
+	buf = data;
+	while (len > 0) {
+		q[0] = iv0 ^ br_dec32le(buf);
+		q[2] = iv1 ^ br_dec32le(buf + 4);
+		q[4] = iv2 ^ br_dec32le(buf + 8);
+		q[6] = iv3 ^ br_dec32le(buf + 12);
+		br_aes_ct_ortho(q);
+		br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct_ortho(q);
+		iv0 = q[0];
+		iv1 = q[2];
+		iv2 = q[4];
+		iv3 = q[6];
+		br_enc32le(buf, iv0);
+		br_enc32le(buf + 4, iv1);
+		br_enc32le(buf + 8, iv2);
+		br_enc32le(buf + 12, iv3);
+		buf += 16;
+		len -= 16;
+	}
+	br_enc32le(ivbuf, iv0);
+	br_enc32le(ivbuf + 4, iv1);
+	br_enc32le(ivbuf + 8, iv2);
+	br_enc32le(ivbuf + 12, iv3);
+}
+
+/* see bearssl_block.h */
+const br_block_cbcenc_class br_aes_ct_cbcenc_vtable = {
+	sizeof(br_aes_ct_cbcenc_keys),
+	16,
+	4,
+	(void (*)(const br_block_cbcenc_class **, const void *, size_t))
+		&br_aes_ct_cbcenc_init,
+	(void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t))
+		&br_aes_ct_cbcenc_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct_ctr.c b/test/monniaux/BearSSL/src/symcipher/aes_ct_ctr.c
new file mode 100644
index 00000000..f407689e
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct_ctr.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_ct_ctr_init(br_aes_ct_ctr_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_ct_ctr_vtable;
+	ctx->num_rounds = br_aes_ct_keysched(ctx->skey, key, len);
+}
+
+static void
+xorbuf(void *dst, const void *src, size_t len)
+{
+	unsigned char *d;
+	const unsigned char *s;
+
+	d = dst;
+	s = src;
+	while (len -- > 0) {
+		*d ++ ^= *s ++;
+	}
+}
+
+/* see bearssl_block.h */
+uint32_t
+br_aes_ct_ctr_run(const br_aes_ct_ctr_keys *ctx,
+	const void *iv, uint32_t cc, void *data, size_t len)
+{
+	unsigned char *buf;
+	const unsigned char *ivbuf;
+	uint32_t iv0, iv1, iv2;
+	uint32_t sk_exp[120];
+
+	br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+	ivbuf = iv;
+	iv0 = br_dec32le(ivbuf);
+	iv1 = br_dec32le(ivbuf + 4);
+	iv2 = br_dec32le(ivbuf + 8);
+	buf = data;
+	while (len > 0) {
+		uint32_t q[8];
+		unsigned char tmp[32];
+
+		/*
+		 * TODO: see if we can save on the first br_aes_ct_ortho()
+		 * call, since iv0/iv1/iv2 are constant for the whole run.
+		 */
+		q[0] = q[1] = iv0;
+		q[2] = q[3] = iv1;
+		q[4] = q[5] = iv2;
+		q[6] = br_swap32(cc);
+		q[7] = br_swap32(cc + 1);
+		br_aes_ct_ortho(q);
+		br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct_ortho(q);
+		br_enc32le(tmp, q[0]);
+		br_enc32le(tmp + 4, q[2]);
+		br_enc32le(tmp + 8, q[4]);
+		br_enc32le(tmp + 12, q[6]);
+		br_enc32le(tmp + 16, q[1]);
+		br_enc32le(tmp + 20, q[3]);
+		br_enc32le(tmp + 24, q[5]);
+		br_enc32le(tmp + 28, q[7]);
+
+		if (len <= 32) {
+			xorbuf(buf, tmp, len);
+			cc ++;
+			if (len > 16) {
+				cc ++;
+			}
+			break;
+		}
+		xorbuf(buf, tmp, 32);
+		buf += 32;
+		len -= 32;
+		cc += 2;
+	}
+	return cc;
+}
+
+/* see bearssl_block.h */
+const br_block_ctr_class br_aes_ct_ctr_vtable = {
+	sizeof(br_aes_ct_ctr_keys),
+	16,
+	4,
+	(void (*)(const br_block_ctr_class **, const void *, size_t))
+		&br_aes_ct_ctr_init,
+	(uint32_t (*)(const br_block_ctr_class *const *,
+		const void *, uint32_t, void *, size_t))
+		&br_aes_ct_ctr_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct_ctrcbc.c b/test/monniaux/BearSSL/src/symcipher/aes_ct_ctrcbc.c
new file mode 100644
index 00000000..8ae9fc75
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct_ctrcbc.c
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_ct_ctrcbc_init(br_aes_ct_ctrcbc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_ct_ctrcbc_vtable;
+	ctx->num_rounds = br_aes_ct_keysched(ctx->skey, key, len);
+}
+
+static void
+xorbuf(void *dst, const void *src, size_t len)
+{
+	unsigned char *d;
+	const unsigned char *s;
+
+	d = dst;
+	s = src;
+	while (len -- > 0) {
+		*d ++ ^= *s ++;
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_aes_ct_ctrcbc_ctr(const br_aes_ct_ctrcbc_keys *ctx,
+	void *ctr, void *data, size_t len)
+{
+	unsigned char *buf;
+	unsigned char *ivbuf;
+	uint32_t iv0, iv1, iv2, iv3;
+	uint32_t sk_exp[120];
+
+	br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+
+	/*
+	 * We keep the counter as four 32-bit values, with big-endian
+	 * convention, because that's what is expected for purposes of
+	 * incrementing the counter value.
+	 */
+	ivbuf = ctr;
+	iv0 = br_dec32be(ivbuf +  0);
+	iv1 = br_dec32be(ivbuf +  4);
+	iv2 = br_dec32be(ivbuf +  8);
+	iv3 = br_dec32be(ivbuf + 12);
+
+	buf = data;
+	while (len > 0) {
+		uint32_t q[8], carry;
+		unsigned char tmp[32];
+
+		/*
+		 * The bitslice implementation expects values in
+		 * little-endian convention, so we have to byteswap them.
+		 */
+		q[0] = br_swap32(iv0);
+		q[2] = br_swap32(iv1);
+		q[4] = br_swap32(iv2);
+		q[6] = br_swap32(iv3);
+		iv3 ++;
+		carry = ~(iv3 | -iv3) >> 31;
+		iv2 += carry;
+		carry &= -(~(iv2 | -iv2) >> 31);
+		iv1 += carry;
+		carry &= -(~(iv1 | -iv1) >> 31);
+		iv0 += carry;
+		q[1] = br_swap32(iv0);
+		q[3] = br_swap32(iv1);
+		q[5] = br_swap32(iv2);
+		q[7] = br_swap32(iv3);
+		if (len > 16) {
+			iv3 ++;
+			carry = ~(iv3 | -iv3) >> 31;
+			iv2 += carry;
+			carry &= -(~(iv2 | -iv2) >> 31);
+			iv1 += carry;
+			carry &= -(~(iv1 | -iv1) >> 31);
+			iv0 += carry;
+		}
+
+		br_aes_ct_ortho(q);
+		br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct_ortho(q);
+
+		br_enc32le(tmp, q[0]);
+		br_enc32le(tmp + 4, q[2]);
+		br_enc32le(tmp + 8, q[4]);
+		br_enc32le(tmp + 12, q[6]);
+		br_enc32le(tmp + 16, q[1]);
+		br_enc32le(tmp + 20, q[3]);
+		br_enc32le(tmp + 24, q[5]);
+		br_enc32le(tmp + 28, q[7]);
+
+		if (len <= 32) {
+			xorbuf(buf, tmp, len);
+			break;
+		}
+		xorbuf(buf, tmp, 32);
+		buf += 32;
+		len -= 32;
+	}
+	br_enc32be(ivbuf +  0, iv0);
+	br_enc32be(ivbuf +  4, iv1);
+	br_enc32be(ivbuf +  8, iv2);
+	br_enc32be(ivbuf + 12, iv3);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_ct_ctrcbc_mac(const br_aes_ct_ctrcbc_keys *ctx,
+	void *cbcmac, const void *data, size_t len)
+{
+	const unsigned char *buf;
+	uint32_t cm0, cm1, cm2, cm3;
+	uint32_t q[8];
+	uint32_t sk_exp[120];
+
+	br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+
+	buf = data;
+	cm0 = br_dec32le((unsigned char *)cbcmac +  0);
+	cm1 = br_dec32le((unsigned char *)cbcmac +  4);
+	cm2 = br_dec32le((unsigned char *)cbcmac +  8);
+	cm3 = br_dec32le((unsigned char *)cbcmac + 12);
+	q[1] = 0;
+	q[3] = 0;
+	q[5] = 0;
+	q[7] = 0;
+
+	while (len > 0) {
+		q[0] = cm0 ^ br_dec32le(buf +  0);
+		q[2] = cm1 ^ br_dec32le(buf +  4);
+		q[4] = cm2 ^ br_dec32le(buf +  8);
+		q[6] = cm3 ^ br_dec32le(buf + 12);
+
+		br_aes_ct_ortho(q);
+		br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct_ortho(q);
+
+		cm0 = q[0];
+		cm1 = q[2];
+		cm2 = q[4];
+		cm3 = q[6];
+		buf += 16;
+		len -= 16;
+	}
+
+	br_enc32le((unsigned char *)cbcmac +  0, cm0);
+	br_enc32le((unsigned char *)cbcmac +  4, cm1);
+	br_enc32le((unsigned char *)cbcmac +  8, cm2);
+	br_enc32le((unsigned char *)cbcmac + 12, cm3);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_ct_ctrcbc_encrypt(const br_aes_ct_ctrcbc_keys *ctx,
+	void *ctr, void *cbcmac, void *data, size_t len)
+{
+	/*
+	 * When encrypting, the CBC-MAC processing must be lagging by
+	 * one block, since it operates on the encrypted values, so
+	 * it must wait for that encryption to complete.
+	 */
+
+	unsigned char *buf;
+	unsigned char *ivbuf;
+	uint32_t iv0, iv1, iv2, iv3;
+	uint32_t cm0, cm1, cm2, cm3;
+	uint32_t sk_exp[120];
+	int first_iter;
+
+	br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+
+	/*
+	 * We keep the counter as four 32-bit values, with big-endian
+	 * convention, because that's what is expected for purposes of
+	 * incrementing the counter value.
+	 */
+	ivbuf = ctr;
+	iv0 = br_dec32be(ivbuf +  0);
+	iv1 = br_dec32be(ivbuf +  4);
+	iv2 = br_dec32be(ivbuf +  8);
+	iv3 = br_dec32be(ivbuf + 12);
+
+	/*
+	 * The current CBC-MAC value is kept in little-endian convention.
+	 */
+	cm0 = br_dec32le((unsigned char *)cbcmac +  0);
+	cm1 = br_dec32le((unsigned char *)cbcmac +  4);
+	cm2 = br_dec32le((unsigned char *)cbcmac +  8);
+	cm3 = br_dec32le((unsigned char *)cbcmac + 12);
+
+	buf = data;
+	first_iter = 1;
+	while (len > 0) {
+		uint32_t q[8], carry;
+
+		/*
+		 * The bitslice implementation expects values in
+		 * little-endian convention, so we have to byteswap them.
+		 */
+		q[0] = br_swap32(iv0);
+		q[2] = br_swap32(iv1);
+		q[4] = br_swap32(iv2);
+		q[6] = br_swap32(iv3);
+		iv3 ++;
+		carry = ~(iv3 | -iv3) >> 31;
+		iv2 += carry;
+		carry &= -(~(iv2 | -iv2) >> 31);
+		iv1 += carry;
+		carry &= -(~(iv1 | -iv1) >> 31);
+		iv0 += carry;
+
+		/*
+		 * The odd values are used for CBC-MAC.
+		 */
+		q[1] = cm0;
+		q[3] = cm1;
+		q[5] = cm2;
+		q[7] = cm3;
+
+		br_aes_ct_ortho(q);
+		br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct_ortho(q);
+
+		/*
+		 * We do the XOR with the plaintext in 32-bit registers,
+		 * so that the value are available for CBC-MAC processing
+		 * as well.
+		 */
+		q[0] ^= br_dec32le(buf +  0);
+		q[2] ^= br_dec32le(buf +  4);
+		q[4] ^= br_dec32le(buf +  8);
+		q[6] ^= br_dec32le(buf + 12);
+		br_enc32le(buf +  0, q[0]);
+		br_enc32le(buf +  4, q[2]);
+		br_enc32le(buf +  8, q[4]);
+		br_enc32le(buf + 12, q[6]);
+
+		buf += 16;
+		len -= 16;
+
+		/*
+		 * We set the cm* values to the block to encrypt in the
+		 * next iteration.
+		 */
+		if (first_iter) {
+			first_iter = 0;
+			cm0 ^= q[0];
+			cm1 ^= q[2];
+			cm2 ^= q[4];
+			cm3 ^= q[6];
+		} else {
+			cm0 = q[0] ^ q[1];
+			cm1 = q[2] ^ q[3];
+			cm2 = q[4] ^ q[5];
+			cm3 = q[6] ^ q[7];
+		}
+
+		/*
+		 * If this was the last iteration, then compute the
+		 * extra block encryption to complete CBC-MAC.
+		 */
+		if (len == 0) {
+			q[0] = cm0;
+			q[2] = cm1;
+			q[4] = cm2;
+			q[6] = cm3;
+			br_aes_ct_ortho(q);
+			br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+			br_aes_ct_ortho(q);
+			cm0 = q[0];
+			cm1 = q[2];
+			cm2 = q[4];
+			cm3 = q[6];
+			break;
+		}
+	}
+
+	br_enc32be(ivbuf +  0, iv0);
+	br_enc32be(ivbuf +  4, iv1);
+	br_enc32be(ivbuf +  8, iv2);
+	br_enc32be(ivbuf + 12, iv3);
+	br_enc32le((unsigned char *)cbcmac +  0, cm0);
+	br_enc32le((unsigned char *)cbcmac +  4, cm1);
+	br_enc32le((unsigned char *)cbcmac +  8, cm2);
+	br_enc32le((unsigned char *)cbcmac + 12, cm3);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_ct_ctrcbc_decrypt(const br_aes_ct_ctrcbc_keys *ctx,
+	void *ctr, void *cbcmac, void *data, size_t len)
+{
+	unsigned char *buf;
+	unsigned char *ivbuf;
+	uint32_t iv0, iv1, iv2, iv3;
+	uint32_t cm0, cm1, cm2, cm3;
+	uint32_t sk_exp[120];
+
+	br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+
+	/*
+	 * We keep the counter as four 32-bit values, with big-endian
+	 * convention, because that's what is expected for purposes of
+	 * incrementing the counter value.
+	 */
+	ivbuf = ctr;
+	iv0 = br_dec32be(ivbuf +  0);
+	iv1 = br_dec32be(ivbuf +  4);
+	iv2 = br_dec32be(ivbuf +  8);
+	iv3 = br_dec32be(ivbuf + 12);
+
+	/*
+	 * The current CBC-MAC value is kept in little-endian convention.
+	 */
+	cm0 = br_dec32le((unsigned char *)cbcmac +  0);
+	cm1 = br_dec32le((unsigned char *)cbcmac +  4);
+	cm2 = br_dec32le((unsigned char *)cbcmac +  8);
+	cm3 = br_dec32le((unsigned char *)cbcmac + 12);
+
+	buf = data;
+	while (len > 0) {
+		uint32_t q[8], carry;
+		unsigned char tmp[16];
+
+		/*
+		 * The bitslice implementation expects values in
+		 * little-endian convention, so we have to byteswap them.
+		 */
+		q[0] = br_swap32(iv0);
+		q[2] = br_swap32(iv1);
+		q[4] = br_swap32(iv2);
+		q[6] = br_swap32(iv3);
+		iv3 ++;
+		carry = ~(iv3 | -iv3) >> 31;
+		iv2 += carry;
+		carry &= -(~(iv2 | -iv2) >> 31);
+		iv1 += carry;
+		carry &= -(~(iv1 | -iv1) >> 31);
+		iv0 += carry;
+
+		/*
+		 * The odd values are used for CBC-MAC.
+		 */
+		q[1] = cm0 ^ br_dec32le(buf +  0);
+		q[3] = cm1 ^ br_dec32le(buf +  4);
+		q[5] = cm2 ^ br_dec32le(buf +  8);
+		q[7] = cm3 ^ br_dec32le(buf + 12);
+
+		br_aes_ct_ortho(q);
+		br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q);
+		br_aes_ct_ortho(q);
+
+		br_enc32le(tmp +  0, q[0]);
+		br_enc32le(tmp +  4, q[2]);
+		br_enc32le(tmp +  8, q[4]);
+		br_enc32le(tmp + 12, q[6]);
+		xorbuf(buf, tmp, 16);
+		cm0 = q[1];
+		cm1 = q[3];
+		cm2 = q[5];
+		cm3 = q[7];
+		buf += 16;
+		len -= 16;
+	}
+
+	br_enc32be(ivbuf +  0, iv0);
+	br_enc32be(ivbuf +  4, iv1);
+	br_enc32be(ivbuf +  8, iv2);
+	br_enc32be(ivbuf + 12, iv3);
+	br_enc32le((unsigned char *)cbcmac +  0, cm0);
+	br_enc32le((unsigned char *)cbcmac +  4, cm1);
+	br_enc32le((unsigned char *)cbcmac +  8, cm2);
+	br_enc32le((unsigned char *)cbcmac + 12, cm3);
+}
+
+/* see bearssl_block.h */
+const br_block_ctrcbc_class br_aes_ct_ctrcbc_vtable = {
+	sizeof(br_aes_ct_ctrcbc_keys),
+	16,
+	4,
+	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
+		&br_aes_ct_ctrcbc_init,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, void *, size_t))
+		&br_aes_ct_ctrcbc_encrypt,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, void *, size_t))
+		&br_aes_ct_ctrcbc_decrypt,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, size_t))
+		&br_aes_ct_ctrcbc_ctr,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, const void *, size_t))
+		&br_aes_ct_ctrcbc_mac
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct_dec.c b/test/monniaux/BearSSL/src/symcipher/aes_ct_dec.c
new file mode 100644
index 00000000..7f32d2bd
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct_dec.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+br_aes_ct_bitslice_invSbox(uint32_t *q)
+{
+	/*
+	 * AES S-box is:
+	 *   S(x) = A(I(x)) ^ 0x63
+	 * where I() is inversion in GF(256), and A() is a linear
+	 * transform (0 is formally defined to be its own inverse).
+	 * Since inversion is an involution, the inverse S-box can be
+	 * computed from the S-box as:
+	 *   iS(x) = B(S(B(x ^ 0x63)) ^ 0x63)
+	 * where B() is the inverse of A(). Indeed, for any y in GF(256):
+	 *   iS(S(y)) = B(A(I(B(A(I(y)) ^ 0x63 ^ 0x63))) ^ 0x63 ^ 0x63) = y
+	 *
+	 * Note: we reuse the implementation of the forward S-box,
+	 * instead of duplicating it here, so that total code size is
+	 * lower. By merging the B() transforms into the S-box circuit
+	 * we could make faster CBC decryption, but CBC decryption is
+	 * already quite faster than CBC encryption because we can
+	 * process two blocks in parallel.
+	 */
+	uint32_t q0, q1, q2, q3, q4, q5, q6, q7;
+
+	q0 = ~q[0];
+	q1 = ~q[1];
+	q2 = q[2];
+	q3 = q[3];
+	q4 = q[4];
+	q5 = ~q[5];
+	q6 = ~q[6];
+	q7 = q[7];
+	q[7] = q1 ^ q4 ^ q6;
+	q[6] = q0 ^ q3 ^ q5;
+	q[5] = q7 ^ q2 ^ q4;
+	q[4] = q6 ^ q1 ^ q3;
+	q[3] = q5 ^ q0 ^ q2;
+	q[2] = q4 ^ q7 ^ q1;
+	q[1] = q3 ^ q6 ^ q0;
+	q[0] = q2 ^ q5 ^ q7;
+
+	br_aes_ct_bitslice_Sbox(q);
+
+	q0 = ~q[0];
+	q1 = ~q[1];
+	q2 = q[2];
+	q3 = q[3];
+	q4 = q[4];
+	q5 = ~q[5];
+	q6 = ~q[6];
+	q7 = q[7];
+	q[7] = q1 ^ q4 ^ q6;
+	q[6] = q0 ^ q3 ^ q5;
+	q[5] = q7 ^ q2 ^ q4;
+	q[4] = q6 ^ q1 ^ q3;
+	q[3] = q5 ^ q0 ^ q2;
+	q[2] = q4 ^ q7 ^ q1;
+	q[1] = q3 ^ q6 ^ q0;
+	q[0] = q2 ^ q5 ^ q7;
+}
+
+static void
+add_round_key(uint32_t *q, const uint32_t *sk)
+{
+	int i;
+
+	for (i = 0; i < 8; i ++) {
+		q[i] ^= sk[i];
+	}
+}
+
+static void
+inv_shift_rows(uint32_t *q)
+{
+	int i;
+
+	for (i = 0; i < 8; i ++) {
+		uint32_t x;
+
+		x = q[i];
+		q[i] = (x & 0x000000FF)
+			| ((x & 0x00003F00) << 2) | ((x & 0x0000C000) >> 6)
+			| ((x & 0x000F0000) << 4) | ((x & 0x00F00000) >> 4)
+			| ((x & 0x03000000) << 6) | ((x & 0xFC000000) >> 2);
+	}
+}
+
+static inline uint32_t
+rotr16(uint32_t x)
+{
+	return (x << 16) | (x >> 16);
+}
+
+static void
+inv_mix_columns(uint32_t *q)
+{
+	uint32_t q0, q1, q2, q3, q4, q5, q6, q7;
+	uint32_t r0, r1, r2, r3, r4, r5, r6, r7;
+
+	q0 = q[0];
+	q1 = q[1];
+	q2 = q[2];
+	q3 = q[3];
+	q4 = q[4];
+	q5 = q[5];
+	q6 = q[6];
+	q7 = q[7];
+	r0 = (q0 >> 8) | (q0 << 24);
+	r1 = (q1 >> 8) | (q1 << 24);
+	r2 = (q2 >> 8) | (q2 << 24);
+	r3 = (q3 >> 8) | (q3 << 24);
+	r4 = (q4 >> 8) | (q4 << 24);
+	r5 = (q5 >> 8) | (q5 << 24);
+	r6 = (q6 >> 8) | (q6 << 24);
+	r7 = (q7 >> 8) | (q7 << 24);
+
+	q[0] = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr16(q0 ^ q5 ^ q6 ^ r0 ^ r5);
+	q[1] = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr16(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6);
+	q[2] = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr16(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7);
+	q[3] = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr16(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7);
+	q[4] = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr16(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6);
+	q[5] = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr16(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7);
+	q[6] = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr16(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7);
+	q[7] = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr16(q4 ^ q5 ^ q7 ^ r4 ^ r7);
+}
+
+/* see inner.h */
+void
+br_aes_ct_bitslice_decrypt(unsigned num_rounds,
+	const uint32_t *skey, uint32_t *q)
+{
+	unsigned u;
+
+	add_round_key(q, skey + (num_rounds << 3));
+	for (u = num_rounds - 1; u > 0; u --) {
+		inv_shift_rows(q);
+		br_aes_ct_bitslice_invSbox(q);
+		add_round_key(q, skey + (u << 3));
+		inv_mix_columns(q);
+	}
+	inv_shift_rows(q);
+	br_aes_ct_bitslice_invSbox(q);
+	add_round_key(q, skey);
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct_enc.c b/test/monniaux/BearSSL/src/symcipher/aes_ct_enc.c
new file mode 100644
index 00000000..089bf356
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_ct_enc.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+static inline void
+add_round_key(uint32_t *q, const uint32_t *sk)
+{
+	q[0] ^= sk[0];
+	q[1] ^= sk[1];
+	q[2] ^= sk[2];
+	q[3] ^= sk[3];
+	q[4] ^= sk[4];
+	q[5] ^= sk[5];
+	q[6] ^= sk[6];
+	q[7] ^= sk[7];
+}
+
+static inline void
+shift_rows(uint32_t *q)
+{
+	int i;
+
+	for (i = 0; i < 8; i ++) {
+		uint32_t x;
+
+		x = q[i];
+		q[i] = (x & 0x000000FF)
+			| ((x & 0x0000FC00) >> 2) | ((x & 0x00000300) << 6)
+			| ((x & 0x00F00000) >> 4) | ((x & 0x000F0000) << 4)
+			| ((x & 0xC0000000) >> 6) | ((x & 0x3F000000) << 2);
+	}
+}
+
+static inline uint32_t
+rotr16(uint32_t x)
+{
+	return (x << 16) | (x >> 16);
+}
+
+static inline void
+mix_columns(uint32_t *q)
+{
+	uint32_t q0, q1, q2, q3, q4, q5, q6, q7;
+	uint32_t r0, r1, r2, r3, r4, r5, r6, r7;
+
+	q0 = q[0];
+	q1 = q[1];
+	q2 = q[2];
+	q3 = q[3];
+	q4 = q[4];
+	q5 = q[5];
+	q6 = q[6];
+	q7 = q[7];
+	r0 = (q0 >> 8) | (q0 << 24);
+	r1 = (q1 >> 8) | (q1 << 24);
+	r2 = (q2 >> 8) | (q2 << 24);
+	r3 = (q3 >> 8) | (q3 << 24);
+	r4 = (q4 >> 8) | (q4 << 24);
+	r5 = (q5 >> 8) | (q5 << 24);
+	r6 = (q6 >> 8) | (q6 << 24);
+	r7 = (q7 >> 8) | (q7 << 24);
+
+	q[0] = q7 ^ r7 ^ r0 ^ rotr16(q0 ^ r0);
+	q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr16(q1 ^ r1);
+	q[2] = q1 ^ r1 ^ r2 ^ rotr16(q2 ^ r2);
+	q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr16(q3 ^ r3);
+	q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr16(q4 ^ r4);
+	q[5] = q4 ^ r4 ^ r5 ^ rotr16(q5 ^ r5);
+	q[6] = q5 ^ r5 ^ r6 ^ rotr16(q6 ^ r6);
+	q[7] = q6 ^ r6 ^ r7 ^ rotr16(q7 ^ r7);
+}
+
+/* see inner.h */
+void
+br_aes_ct_bitslice_encrypt(unsigned num_rounds,
+	const uint32_t *skey, uint32_t *q)
+{
+	unsigned u;
+
+	add_round_key(q, skey);
+	for (u = 1; u < num_rounds; u ++) {
+		br_aes_ct_bitslice_Sbox(q);
+		shift_rows(q);
+		mix_columns(q);
+		add_round_key(q, skey + (u << 3));
+	}
+	br_aes_ct_bitslice_Sbox(q);
+	shift_rows(q);
+	add_round_key(q, skey + (num_rounds << 3));
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_pwr8.c b/test/monniaux/BearSSL/src/symcipher/aes_pwr8.c
new file mode 100644
index 00000000..b2c63c32
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_pwr8.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_POWER_ASM_MACROS   1
+#include "inner.h"
+
+/*
+ * This code contains the AES key schedule implementation using the
+ * POWER8 opcodes.
+ */
+
+#if BR_POWER8
+
+static void
+key_schedule_128(unsigned char *sk, const unsigned char *key)
+{
+	long cc;
+
+	static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B };
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+
+	/*
+	 * We use the VSX instructions for loading and storing the
+	 * key/subkeys, since they support unaligned accesses. The rest
+	 * of the computation is VMX only. VMX register 0 is VSX
+	 * register 32.
+	 */
+	asm volatile (
+
+		/*
+		 * v0 = all-zero word
+		 * v1 = constant -8 / +8, copied into four words
+		 * v2 = current subkey
+		 * v3 = Rcon (x4 words)
+		 * v6 = constant 8, copied into four words
+		 * v7 = constant 0x11B, copied into four words
+		 * v8 = constant for byteswapping words
+		 */
+		vspltisw(0, 0)
+#if BR_POWER8_LE
+		vspltisw(1, -8)
+#else
+		vspltisw(1, 8)
+#endif
+		lxvw4x(34, 0, %[key])
+		vspltisw(3, 1)
+		vspltisw(6, 8)
+		lxvw4x(39, 0, %[fmod])
+#if BR_POWER8_LE
+		lxvw4x(40, 0, %[idx2be])
+#endif
+
+		/*
+		 * First subkey is a copy of the key itself.
+		 */
+#if BR_POWER8_LE
+		vperm(4, 2, 2, 8)
+		stxvw4x(36, 0, %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+#endif
+
+		/*
+		 * Loop must run 10 times.
+		 */
+		li(%[cc], 10)
+		mtctr(%[cc])
+	label(loop)
+		/* Increment subkey address */
+		addi(%[sk], %[sk], 16)
+
+		/* Compute SubWord(RotWord(temp)) xor Rcon  (into v4, splat) */
+		vrlw(4, 2, 1)
+		vsbox(4, 4)
+#if BR_POWER8_LE
+		vxor(4, 4, 3)
+#else
+		vsldoi(5, 3, 0, 3)
+		vxor(4, 4, 5)
+#endif
+		vspltw(4, 4, 3)
+
+		/* XOR words for next subkey */
+		vsldoi(5, 0, 2, 12)
+		vxor(2, 2, 5)
+		vsldoi(5, 0, 2, 12)
+		vxor(2, 2, 5)
+		vsldoi(5, 0, 2, 12)
+		vxor(2, 2, 5)
+		vxor(2, 2, 4)
+
+		/* Store next subkey */
+#if BR_POWER8_LE
+		vperm(4, 2, 2, 8)
+		stxvw4x(36, 0, %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+#endif
+
+		/* Update Rcon */
+		vadduwm(3, 3, 3)
+		vsrw(4, 3, 6)
+		vsubuwm(4, 0, 4)
+		vand(4, 4, 7)
+		vxor(3, 3, 4)
+
+		bdnz(loop)
+
+: [sk] "+b" (sk), [cc] "+b" (cc)
+: [key] "b" (key), [fmod] "b" (fmod)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ctr", "memory"
+	);
+}
+
+static void
+key_schedule_192(unsigned char *sk, const unsigned char *key)
+{
+	long cc;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+
+	/*
+	 * We use the VSX instructions for loading and storing the
+	 * key/subkeys, since they support unaligned accesses. The rest
+	 * of the computation is VMX only. VMX register 0 is VSX
+	 * register 32.
+	 */
+	asm volatile (
+
+		/*
+		 * v0 = all-zero word
+		 * v1 = constant -8 / +8, copied into four words
+		 * v2, v3 = current subkey
+		 * v5 = Rcon (x4 words) (already shifted on big-endian)
+		 * v6 = constant 8, copied into four words
+		 * v8 = constant for byteswapping words
+		 *
+		 * The left two words of v3 are ignored.
+		 */
+		vspltisw(0, 0)
+#if BR_POWER8_LE
+		vspltisw(1, -8)
+#else
+		vspltisw(1, 8)
+#endif
+		li(%[cc], 8)
+		lxvw4x(34, 0, %[key])
+		lxvw4x(35, %[cc], %[key])
+		vsldoi(3, 3, 0, 8)
+		vspltisw(5, 1)
+#if !BR_POWER8_LE
+		vsldoi(5, 5, 0, 3)
+#endif
+		vspltisw(6, 8)
+#if BR_POWER8_LE
+		lxvw4x(40, 0, %[idx2be])
+#endif
+
+		/*
+		 * Loop must run 8 times. Each iteration produces 256
+		 * bits of subkeys, with a 64-bit overlap.
+		 */
+		li(%[cc], 8)
+		mtctr(%[cc])
+		li(%[cc], 16)
+	label(loop)
+
+		/*
+		 * Last 6 words in v2:v3l. Compute next 6 words into
+		 * v3r:v4.
+		 */
+		vrlw(10, 3, 1)
+		vsbox(10, 10)
+		vxor(10, 10, 5)
+		vspltw(10, 10, 1)
+		vsldoi(11, 0, 10, 8)
+
+		vsldoi(12, 0, 2, 12)
+		vxor(12, 2, 12)
+		vsldoi(13, 0, 12, 12)
+		vxor(12, 12, 13)
+		vsldoi(13, 0, 12, 12)
+		vxor(12, 12, 13)
+
+		vspltw(13, 12, 3)
+		vxor(13, 13, 3)
+		vsldoi(14, 0, 3, 12)
+		vxor(13, 13, 14)
+
+		vsldoi(4, 12, 13, 8)
+		vsldoi(14, 0, 3, 8)
+		vsldoi(3, 14, 12, 8)
+
+		vxor(3, 3, 11)
+		vxor(4, 4, 10)
+
+		/*
+		 * Update Rcon. Since for a 192-bit key, we use only 8
+		 * such constants, we will not hit the field modulus,
+		 * so a simple shift (addition) works well.
+		 */
+		vadduwm(5, 5, 5)
+
+		/*
+		 * Write out the two left 128-bit words
+		 */
+#if BR_POWER8_LE
+		vperm(10, 2, 2, 8)
+		vperm(11, 3, 3, 8)
+		stxvw4x(42, 0, %[sk])
+		stxvw4x(43, %[cc], %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+		stxvw4x(35, %[cc], %[sk])
+#endif
+		addi(%[sk], %[sk], 24)
+
+		/*
+		 * Shift words for next iteration.
+		 */
+		vsldoi(2, 3, 4, 8)
+		vsldoi(3, 4, 0, 8)
+
+		bdnz(loop)
+
+		/*
+		 * The loop wrote the first 50 subkey words, but we need
+		 * to produce 52, so we must do one last write.
+		 */
+#if BR_POWER8_LE
+		vperm(10, 2, 2, 8)
+		stxvw4x(42, 0, %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+#endif
+
+: [sk] "+b" (sk), [cc] "+b" (cc)
+: [key] "b" (key)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+  "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
+	);
+}
+
+static void
+key_schedule_256(unsigned char *sk, const unsigned char *key)
+{
+	long cc;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+
+	/*
+	 * We use the VSX instructions for loading and storing the
+	 * key/subkeys, since they support unaligned accesses. The rest
+	 * of the computation is VMX only. VMX register 0 is VSX
+	 * register 32.
+	 */
+	asm volatile (
+
+		/*
+		 * v0 = all-zero word
+		 * v1 = constant -8 / +8, copied into four words
+		 * v2, v3 = current subkey
+		 * v6 = Rcon (x4 words) (already shifted on big-endian)
+		 * v7 = constant 8, copied into four words
+		 * v8 = constant for byteswapping words
+		 *
+		 * The left two words of v3 are ignored.
+		 */
+		vspltisw(0, 0)
+#if BR_POWER8_LE
+		vspltisw(1, -8)
+#else
+		vspltisw(1, 8)
+#endif
+		li(%[cc], 16)
+		lxvw4x(34, 0, %[key])
+		lxvw4x(35, %[cc], %[key])
+		vspltisw(6, 1)
+#if !BR_POWER8_LE
+		vsldoi(6, 6, 0, 3)
+#endif
+		vspltisw(7, 8)
+#if BR_POWER8_LE
+		lxvw4x(40, 0, %[idx2be])
+#endif
+
+		/*
+		 * Loop must run 7 times. Each iteration produces two
+		 * subkeys.
+		 */
+		li(%[cc], 7)
+		mtctr(%[cc])
+		li(%[cc], 16)
+	label(loop)
+
+		/*
+		 * Current words are in v2:v3. Compute next word in v4.
+		 */
+		vrlw(10, 3, 1)
+		vsbox(10, 10)
+		vxor(10, 10, 6)
+		vspltw(10, 10, 3)
+
+		vsldoi(4, 0, 2, 12)
+		vxor(4, 2, 4)
+		vsldoi(5, 0, 4, 12)
+		vxor(4, 4, 5)
+		vsldoi(5, 0, 4, 12)
+		vxor(4, 4, 5)
+		vxor(4, 4, 10)
+
+		/*
+		 * Then other word in v5.
+		 */
+		vsbox(10, 4)
+		vspltw(10, 10, 3)
+
+		vsldoi(5, 0, 3, 12)
+		vxor(5, 3, 5)
+		vsldoi(11, 0, 5, 12)
+		vxor(5, 5, 11)
+		vsldoi(11, 0, 5, 12)
+		vxor(5, 5, 11)
+		vxor(5, 5, 10)
+
+		/*
+		 * Update Rcon. Since for a 256-bit key, we use only 7
+		 * such constants, we will not hit the field modulus,
+		 * so a simple shift (addition) works well.
+		 */
+		vadduwm(6, 6, 6)
+
+		/*
+		 * Write out the two left 128-bit words
+		 */
+#if BR_POWER8_LE
+		vperm(10, 2, 2, 8)
+		vperm(11, 3, 3, 8)
+		stxvw4x(42, 0, %[sk])
+		stxvw4x(43, %[cc], %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+		stxvw4x(35, %[cc], %[sk])
+#endif
+		addi(%[sk], %[sk], 32)
+
+		/*
+		 * Replace v2:v3 with v4:v5.
+		 */
+		vxor(2, 0, 4)
+		vxor(3, 0, 5)
+
+		bdnz(loop)
+
+		/*
+		 * The loop wrote the first 14 subkeys, but we need 15,
+		 * so we must do an extra write.
+		 */
+#if BR_POWER8_LE
+		vperm(10, 2, 2, 8)
+		stxvw4x(42, 0, %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+#endif
+
+: [sk] "+b" (sk), [cc] "+b" (cc)
+: [key] "b" (key)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+  "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
+	);
+}
+
+/* see inner.h */
+int
+br_aes_pwr8_supported(void)
+{
+	return 1;
+}
+
+/* see inner.h */
+unsigned
+br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len)
+{
+	switch (len) {
+	case 16:
+		key_schedule_128(sk, key);
+		return 10;
+	case 24:
+		key_schedule_192(sk, key);
+		return 12;
+	default:
+		key_schedule_256(sk, key);
+		return 14;
+	}
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcdec.c
new file mode 100644
index 00000000..e535ba6f
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcdec.c
@@ -0,0 +1,670 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_POWER_ASM_MACROS   1
+#include "inner.h"
+
+#if BR_POWER8
+
+/* see bearssl_block.h */
+void
+br_aes_pwr8_cbcdec_init(br_aes_pwr8_cbcdec_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_pwr8_cbcdec_vtable;
+	ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
+}
+
+static void
+cbcdec_128(const unsigned char *sk,
+	const unsigned char *iv, unsigned char *buf, size_t num_blocks)
+{
+	long cc0, cc1, cc2, cc3;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc0 = 0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+
+		/*
+		 * Load subkeys into v0..v10
+		 */
+		lxvw4x(32, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(33, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(34, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(35, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(36, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(37, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(38, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(39, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(40, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(41, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(42, %[cc0], %[sk])
+		li(%[cc0], 0)
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = constant for byteswapping words
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * Load IV into v24.
+		 */
+		lxvw4x(56, 0, %[iv])
+#if BR_POWER8_LE
+		vperm(24, 24, 24, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Load next ciphertext words in v16..v19. Also save them
+		 * in v20..v23.
+		 */
+		lxvw4x(48, %[cc0], %[buf])
+		lxvw4x(49, %[cc1], %[buf])
+		lxvw4x(50, %[cc2], %[buf])
+		lxvw4x(51, %[cc3], %[buf])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+		vand(20, 16, 16)
+		vand(21, 17, 17)
+		vand(22, 18, 18)
+		vand(23, 19, 19)
+
+		/*
+		 * Decrypt the blocks.
+		 */
+		vxor(16, 16, 10)
+		vxor(17, 17, 10)
+		vxor(18, 18, 10)
+		vxor(19, 19, 10)
+		vncipher(16, 16, 9)
+		vncipher(17, 17, 9)
+		vncipher(18, 18, 9)
+		vncipher(19, 19, 9)
+		vncipher(16, 16, 8)
+		vncipher(17, 17, 8)
+		vncipher(18, 18, 8)
+		vncipher(19, 19, 8)
+		vncipher(16, 16, 7)
+		vncipher(17, 17, 7)
+		vncipher(18, 18, 7)
+		vncipher(19, 19, 7)
+		vncipher(16, 16, 6)
+		vncipher(17, 17, 6)
+		vncipher(18, 18, 6)
+		vncipher(19, 19, 6)
+		vncipher(16, 16, 5)
+		vncipher(17, 17, 5)
+		vncipher(18, 18, 5)
+		vncipher(19, 19, 5)
+		vncipher(16, 16, 4)
+		vncipher(17, 17, 4)
+		vncipher(18, 18, 4)
+		vncipher(19, 19, 4)
+		vncipher(16, 16, 3)
+		vncipher(17, 17, 3)
+		vncipher(18, 18, 3)
+		vncipher(19, 19, 3)
+		vncipher(16, 16, 2)
+		vncipher(17, 17, 2)
+		vncipher(18, 18, 2)
+		vncipher(19, 19, 2)
+		vncipher(16, 16, 1)
+		vncipher(17, 17, 1)
+		vncipher(18, 18, 1)
+		vncipher(19, 19, 1)
+		vncipherlast(16, 16, 0)
+		vncipherlast(17, 17, 0)
+		vncipherlast(18, 18, 0)
+		vncipherlast(19, 19, 0)
+
+		/*
+		 * XOR decrypted blocks with IV / previous block.
+		 */
+		vxor(16, 16, 24)
+		vxor(17, 17, 20)
+		vxor(18, 18, 21)
+		vxor(19, 19, 22)
+
+		/*
+		 * Store back result (with byteswap)
+		 */
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+		stxvw4x(48, %[cc0], %[buf])
+		stxvw4x(49, %[cc1], %[buf])
+		stxvw4x(50, %[cc2], %[buf])
+		stxvw4x(51, %[cc3], %[buf])
+
+		/*
+		 * Fourth encrypted block is IV for next run.
+		 */
+		vand(24, 23, 23)
+
+		addi(%[buf], %[buf], 64)
+
+		bdnz(loop)
+
+: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
+  [buf] "+b" (buf)
+: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (num_blocks >> 2)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+static void
+cbcdec_192(const unsigned char *sk,
+	const unsigned char *iv, unsigned char *buf, size_t num_blocks)
+{
+	long cc0, cc1, cc2, cc3;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc0 = 0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+
+		/*
+		 * Load subkeys into v0..v12
+		 */
+		lxvw4x(32, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(33, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(34, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(35, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(36, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(37, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(38, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(39, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(40, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(41, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(42, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(43, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(44, %[cc0], %[sk])
+		li(%[cc0], 0)
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = constant for byteswapping words
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * Load IV into v24.
+		 */
+		lxvw4x(56, 0, %[iv])
+#if BR_POWER8_LE
+		vperm(24, 24, 24, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Load next ciphertext words in v16..v19. Also save them
+		 * in v20..v23.
+		 */
+		lxvw4x(48, %[cc0], %[buf])
+		lxvw4x(49, %[cc1], %[buf])
+		lxvw4x(50, %[cc2], %[buf])
+		lxvw4x(51, %[cc3], %[buf])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+		vand(20, 16, 16)
+		vand(21, 17, 17)
+		vand(22, 18, 18)
+		vand(23, 19, 19)
+
+		/*
+		 * Decrypt the blocks.
+		 */
+		vxor(16, 16, 12)
+		vxor(17, 17, 12)
+		vxor(18, 18, 12)
+		vxor(19, 19, 12)
+		vncipher(16, 16, 11)
+		vncipher(17, 17, 11)
+		vncipher(18, 18, 11)
+		vncipher(19, 19, 11)
+		vncipher(16, 16, 10)
+		vncipher(17, 17, 10)
+		vncipher(18, 18, 10)
+		vncipher(19, 19, 10)
+		vncipher(16, 16, 9)
+		vncipher(17, 17, 9)
+		vncipher(18, 18, 9)
+		vncipher(19, 19, 9)
+		vncipher(16, 16, 8)
+		vncipher(17, 17, 8)
+		vncipher(18, 18, 8)
+		vncipher(19, 19, 8)
+		vncipher(16, 16, 7)
+		vncipher(17, 17, 7)
+		vncipher(18, 18, 7)
+		vncipher(19, 19, 7)
+		vncipher(16, 16, 6)
+		vncipher(17, 17, 6)
+		vncipher(18, 18, 6)
+		vncipher(19, 19, 6)
+		vncipher(16, 16, 5)
+		vncipher(17, 17, 5)
+		vncipher(18, 18, 5)
+		vncipher(19, 19, 5)
+		vncipher(16, 16, 4)
+		vncipher(17, 17, 4)
+		vncipher(18, 18, 4)
+		vncipher(19, 19, 4)
+		vncipher(16, 16, 3)
+		vncipher(17, 17, 3)
+		vncipher(18, 18, 3)
+		vncipher(19, 19, 3)
+		vncipher(16, 16, 2)
+		vncipher(17, 17, 2)
+		vncipher(18, 18, 2)
+		vncipher(19, 19, 2)
+		vncipher(16, 16, 1)
+		vncipher(17, 17, 1)
+		vncipher(18, 18, 1)
+		vncipher(19, 19, 1)
+		vncipherlast(16, 16, 0)
+		vncipherlast(17, 17, 0)
+		vncipherlast(18, 18, 0)
+		vncipherlast(19, 19, 0)
+
+		/*
+		 * XOR decrypted blocks with IV / previous block.
+		 */
+		vxor(16, 16, 24)
+		vxor(17, 17, 20)
+		vxor(18, 18, 21)
+		vxor(19, 19, 22)
+
+		/*
+		 * Store back result (with byteswap)
+		 */
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+		stxvw4x(48, %[cc0], %[buf])
+		stxvw4x(49, %[cc1], %[buf])
+		stxvw4x(50, %[cc2], %[buf])
+		stxvw4x(51, %[cc3], %[buf])
+
+		/*
+		 * Fourth encrypted block is IV for next run.
+		 */
+		vand(24, 23, 23)
+
+		addi(%[buf], %[buf], 64)
+
+		bdnz(loop)
+
+: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
+  [buf] "+b" (buf)
+: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (num_blocks >> 2)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+static void
+cbcdec_256(const unsigned char *sk,
+	const unsigned char *iv, unsigned char *buf, size_t num_blocks)
+{
+	long cc0, cc1, cc2, cc3;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc0 = 0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+
+		/*
+		 * Load subkeys into v0..v14
+		 */
+		lxvw4x(32, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(33, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(34, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(35, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(36, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(37, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(38, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(39, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(40, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(41, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(42, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(43, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(44, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(45, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(46, %[cc0], %[sk])
+		li(%[cc0], 0)
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = constant for byteswapping words
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * Load IV into v24.
+		 */
+		lxvw4x(56, 0, %[iv])
+#if BR_POWER8_LE
+		vperm(24, 24, 24, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Load next ciphertext words in v16..v19. Also save them
+		 * in v20..v23.
+		 */
+		lxvw4x(48, %[cc0], %[buf])
+		lxvw4x(49, %[cc1], %[buf])
+		lxvw4x(50, %[cc2], %[buf])
+		lxvw4x(51, %[cc3], %[buf])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+		vand(20, 16, 16)
+		vand(21, 17, 17)
+		vand(22, 18, 18)
+		vand(23, 19, 19)
+
+		/*
+		 * Decrypt the blocks.
+		 */
+		vxor(16, 16, 14)
+		vxor(17, 17, 14)
+		vxor(18, 18, 14)
+		vxor(19, 19, 14)
+		vncipher(16, 16, 13)
+		vncipher(17, 17, 13)
+		vncipher(18, 18, 13)
+		vncipher(19, 19, 13)
+		vncipher(16, 16, 12)
+		vncipher(17, 17, 12)
+		vncipher(18, 18, 12)
+		vncipher(19, 19, 12)
+		vncipher(16, 16, 11)
+		vncipher(17, 17, 11)
+		vncipher(18, 18, 11)
+		vncipher(19, 19, 11)
+		vncipher(16, 16, 10)
+		vncipher(17, 17, 10)
+		vncipher(18, 18, 10)
+		vncipher(19, 19, 10)
+		vncipher(16, 16, 9)
+		vncipher(17, 17, 9)
+		vncipher(18, 18, 9)
+		vncipher(19, 19, 9)
+		vncipher(16, 16, 8)
+		vncipher(17, 17, 8)
+		vncipher(18, 18, 8)
+		vncipher(19, 19, 8)
+		vncipher(16, 16, 7)
+		vncipher(17, 17, 7)
+		vncipher(18, 18, 7)
+		vncipher(19, 19, 7)
+		vncipher(16, 16, 6)
+		vncipher(17, 17, 6)
+		vncipher(18, 18, 6)
+		vncipher(19, 19, 6)
+		vncipher(16, 16, 5)
+		vncipher(17, 17, 5)
+		vncipher(18, 18, 5)
+		vncipher(19, 19, 5)
+		vncipher(16, 16, 4)
+		vncipher(17, 17, 4)
+		vncipher(18, 18, 4)
+		vncipher(19, 19, 4)
+		vncipher(16, 16, 3)
+		vncipher(17, 17, 3)
+		vncipher(18, 18, 3)
+		vncipher(19, 19, 3)
+		vncipher(16, 16, 2)
+		vncipher(17, 17, 2)
+		vncipher(18, 18, 2)
+		vncipher(19, 19, 2)
+		vncipher(16, 16, 1)
+		vncipher(17, 17, 1)
+		vncipher(18, 18, 1)
+		vncipher(19, 19, 1)
+		vncipherlast(16, 16, 0)
+		vncipherlast(17, 17, 0)
+		vncipherlast(18, 18, 0)
+		vncipherlast(19, 19, 0)
+
+		/*
+		 * XOR decrypted blocks with IV / previous block.
+		 */
+		vxor(16, 16, 24)
+		vxor(17, 17, 20)
+		vxor(18, 18, 21)
+		vxor(19, 19, 22)
+
+		/*
+		 * Store back result (with byteswap)
+		 */
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+		stxvw4x(48, %[cc0], %[buf])
+		stxvw4x(49, %[cc1], %[buf])
+		stxvw4x(50, %[cc2], %[buf])
+		stxvw4x(51, %[cc3], %[buf])
+
+		/*
+		 * Fourth encrypted block is IV for next run.
+		 */
+		vand(24, 23, 23)
+
+		addi(%[buf], %[buf], 64)
+
+		bdnz(loop)
+
+: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
+  [buf] "+b" (buf)
+: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (num_blocks >> 2)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_pwr8_cbcdec_run(const br_aes_pwr8_cbcdec_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char nextiv[16];
+	unsigned char *buf;
+
+	if (len == 0) {
+		return;
+	}
+	buf = data;
+	memcpy(nextiv, buf + len - 16, 16);
+	if (len >= 64) {
+		size_t num_blocks;
+		unsigned char tmp[16];
+
+		num_blocks = (len >> 4) & ~(size_t)3;
+		memcpy(tmp, buf + (num_blocks << 4) - 16, 16);
+		switch (ctx->num_rounds) {
+		case 10:
+			cbcdec_128(ctx->skey.skni, iv, buf, num_blocks);
+			break;
+		case 12:
+			cbcdec_192(ctx->skey.skni, iv, buf, num_blocks);
+			break;
+		default:
+			cbcdec_256(ctx->skey.skni, iv, buf, num_blocks);
+			break;
+		}
+		buf += num_blocks << 4;
+		len &= 63;
+		memcpy(iv, tmp, 16);
+	}
+	if (len > 0) {
+		unsigned char tmp[64];
+
+		memcpy(tmp, buf, len);
+		memset(tmp + len, 0, (sizeof tmp) - len);
+		switch (ctx->num_rounds) {
+		case 10:
+			cbcdec_128(ctx->skey.skni, iv, tmp, 4);
+			break;
+		case 12:
+			cbcdec_192(ctx->skey.skni, iv, tmp, 4);
+			break;
+		default:
+			cbcdec_256(ctx->skey.skni, iv, tmp, 4);
+			break;
+		}
+		memcpy(buf, tmp, len);
+	}
+	memcpy(iv, nextiv, 16);
+}
+
+/* see bearssl_block.h */
+const br_block_cbcdec_class br_aes_pwr8_cbcdec_vtable = {
+	sizeof(br_aes_pwr8_cbcdec_keys),
+	16,
+	4,
+	(void (*)(const br_block_cbcdec_class **, const void *, size_t))
+		&br_aes_pwr8_cbcdec_init,
+	(void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t))
+		&br_aes_pwr8_cbcdec_run
+};
+
+/* see bearssl_block.h */
+const br_block_cbcdec_class *
+br_aes_pwr8_cbcdec_get_vtable(void)
+{
+	return br_aes_pwr8_supported() ? &br_aes_pwr8_cbcdec_vtable : NULL;
+}
+
+#else
+
+/* see bearssl_block.h */
+const br_block_cbcdec_class *
+br_aes_pwr8_cbcdec_get_vtable(void)
+{
+	return NULL;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcenc.c
new file mode 100644
index 00000000..00f8eca7
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcenc.c
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_POWER_ASM_MACROS   1
+#include "inner.h"
+
+#if BR_POWER8
+
+/* see bearssl_block.h */
+void
+br_aes_pwr8_cbcenc_init(br_aes_pwr8_cbcenc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_pwr8_cbcenc_vtable;
+	ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
+}
+
+static void
+cbcenc_128(const unsigned char *sk,
+	const unsigned char *iv, unsigned char *buf, size_t len)
+{
+	long cc;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+	asm volatile (
+
+		/*
+		 * Load subkeys into v0..v10
+		 */
+		lxvw4x(32, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(33, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(34, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(35, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(36, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(37, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(38, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(39, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(40, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(41, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(42, %[cc], %[sk])
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = constant for byteswapping words
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * Load IV into v16.
+		 */
+		lxvw4x(48, 0, %[iv])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Load next plaintext word and XOR with current IV.
+		 */
+		lxvw4x(49, 0, %[buf])
+#if BR_POWER8_LE
+		vperm(17, 17, 17, 15)
+#endif
+		vxor(16, 16, 17)
+
+		/*
+		 * Encrypt the block.
+		 */
+		vxor(16, 16, 0)
+		vcipher(16, 16, 1)
+		vcipher(16, 16, 2)
+		vcipher(16, 16, 3)
+		vcipher(16, 16, 4)
+		vcipher(16, 16, 5)
+		vcipher(16, 16, 6)
+		vcipher(16, 16, 7)
+		vcipher(16, 16, 8)
+		vcipher(16, 16, 9)
+		vcipherlast(16, 16, 10)
+
+		/*
+		 * Store back result (with byteswap)
+		 */
+#if BR_POWER8_LE
+		vperm(17, 16, 16, 15)
+		stxvw4x(49, 0, %[buf])
+#else
+		stxvw4x(48, 0, %[buf])
+#endif
+		addi(%[buf], %[buf], 16)
+
+		bdnz(loop)
+
+: [cc] "+b" (cc), [buf] "+b" (buf)
+: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (len >> 4)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "ctr", "memory"
+	);
+}
+
+static void
+cbcenc_192(const unsigned char *sk,
+	const unsigned char *iv, unsigned char *buf, size_t len)
+{
+	long cc;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+	asm volatile (
+
+		/*
+		 * Load subkeys into v0..v12
+		 */
+		lxvw4x(32, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(33, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(34, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(35, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(36, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(37, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(38, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(39, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(40, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(41, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(42, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(43, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(44, %[cc], %[sk])
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = constant for byteswapping words
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * Load IV into v16.
+		 */
+		lxvw4x(48, 0, %[iv])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Load next plaintext word and XOR with current IV.
+		 */
+		lxvw4x(49, 0, %[buf])
+#if BR_POWER8_LE
+		vperm(17, 17, 17, 15)
+#endif
+		vxor(16, 16, 17)
+
+		/*
+		 * Encrypt the block.
+		 */
+		vxor(16, 16, 0)
+		vcipher(16, 16, 1)
+		vcipher(16, 16, 2)
+		vcipher(16, 16, 3)
+		vcipher(16, 16, 4)
+		vcipher(16, 16, 5)
+		vcipher(16, 16, 6)
+		vcipher(16, 16, 7)
+		vcipher(16, 16, 8)
+		vcipher(16, 16, 9)
+		vcipher(16, 16, 10)
+		vcipher(16, 16, 11)
+		vcipherlast(16, 16, 12)
+
+		/*
+		 * Store back result (with byteswap)
+		 */
+#if BR_POWER8_LE
+		vperm(17, 16, 16, 15)
+		stxvw4x(49, 0, %[buf])
+#else
+		stxvw4x(48, 0, %[buf])
+#endif
+		addi(%[buf], %[buf], 16)
+
+		bdnz(loop)
+
+: [cc] "+b" (cc), [buf] "+b" (buf)
+: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (len >> 4)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "ctr", "memory"
+	);
+}
+
+static void
+cbcenc_256(const unsigned char *sk,
+	const unsigned char *iv, unsigned char *buf, size_t len)
+{
+	long cc;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+	asm volatile (
+
+		/*
+		 * Load subkeys into v0..v14
+		 */
+		lxvw4x(32, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(33, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(34, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(35, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(36, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(37, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(38, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(39, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(40, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(41, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(42, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(43, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(44, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(45, %[cc], %[sk])
+		addi(%[cc], %[cc], 16)
+		lxvw4x(46, %[cc], %[sk])
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = constant for byteswapping words
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * Load IV into v16.
+		 */
+		lxvw4x(48, 0, %[iv])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Load next plaintext word and XOR with current IV.
+		 */
+		lxvw4x(49, 0, %[buf])
+#if BR_POWER8_LE
+		vperm(17, 17, 17, 15)
+#endif
+		vxor(16, 16, 17)
+
+		/*
+		 * Encrypt the block.
+		 */
+		vxor(16, 16, 0)
+		vcipher(16, 16, 1)
+		vcipher(16, 16, 2)
+		vcipher(16, 16, 3)
+		vcipher(16, 16, 4)
+		vcipher(16, 16, 5)
+		vcipher(16, 16, 6)
+		vcipher(16, 16, 7)
+		vcipher(16, 16, 8)
+		vcipher(16, 16, 9)
+		vcipher(16, 16, 10)
+		vcipher(16, 16, 11)
+		vcipher(16, 16, 12)
+		vcipher(16, 16, 13)
+		vcipherlast(16, 16, 14)
+
+		/*
+		 * Store back result (with byteswap)
+		 */
+#if BR_POWER8_LE
+		vperm(17, 16, 16, 15)
+		stxvw4x(49, 0, %[buf])
+#else
+		stxvw4x(48, 0, %[buf])
+#endif
+		addi(%[buf], %[buf], 16)
+
+		bdnz(loop)
+
+: [cc] "+b" (cc), [buf] "+b" (buf)
+: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (len >> 4)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "ctr", "memory"
+	);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_pwr8_cbcenc_run(const br_aes_pwr8_cbcenc_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	if (len > 0) {
+		switch (ctx->num_rounds) {
+		case 10:
+			cbcenc_128(ctx->skey.skni, iv, data, len);
+			break;
+		case 12:
+			cbcenc_192(ctx->skey.skni, iv, data, len);
+			break;
+		default:
+			cbcenc_256(ctx->skey.skni, iv, data, len);
+			break;
+		}
+		memcpy(iv, (unsigned char *)data + (len - 16), 16);
+	}
+}
+
+/* see bearssl_block.h */
+const br_block_cbcenc_class br_aes_pwr8_cbcenc_vtable = {
+	sizeof(br_aes_pwr8_cbcenc_keys),
+	16,
+	4,
+	(void (*)(const br_block_cbcenc_class **, const void *, size_t))
+		&br_aes_pwr8_cbcenc_init,
+	(void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t))
+		&br_aes_pwr8_cbcenc_run
+};
+
+/* see bearssl_block.h */
+const br_block_cbcenc_class *
+br_aes_pwr8_cbcenc_get_vtable(void)
+{
+	return br_aes_pwr8_supported() ? &br_aes_pwr8_cbcenc_vtable : NULL;
+}
+
+#else
+
+/* see bearssl_block.h */
+const br_block_cbcenc_class *
+br_aes_pwr8_cbcenc_get_vtable(void)
+{
+	return NULL;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctr.c b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctr.c
new file mode 100644
index 00000000..f5d20c0b
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctr.c
@@ -0,0 +1,717 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_POWER_ASM_MACROS   1
+#include "inner.h"
+
+#if BR_POWER8
+
+/* see bearssl_block.h */
+void
+br_aes_pwr8_ctr_init(br_aes_pwr8_ctr_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_pwr8_ctr_vtable;
+	ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
+}
+
+static void
+ctr_128(const unsigned char *sk, const unsigned char *ivbuf,
+	unsigned char *buf, size_t num_blocks)
+{
+	long cc0, cc1, cc2, cc3;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+	static const uint32_t ctrinc[] = {
+		0, 0, 0, 4
+	};
+
+	cc0 = 0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+
+		/*
+		 * Load subkeys into v0..v10
+		 */
+		lxvw4x(32, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(33, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(34, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(35, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(36, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(37, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(38, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(39, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(40, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(41, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(42, %[cc0], %[sk])
+		li(%[cc0], 0)
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = constant for byteswapping words
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * v28 = increment for IV counter.
+		 */
+		lxvw4x(60, 0, %[ctrinc])
+
+		/*
+		 * Load IV into v16..v19
+		 */
+		lxvw4x(48, %[cc0], %[ivbuf])
+		lxvw4x(49, %[cc1], %[ivbuf])
+		lxvw4x(50, %[cc2], %[ivbuf])
+		lxvw4x(51, %[cc3], %[ivbuf])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Compute next IV into v24..v27
+		 */
+		vadduwm(24, 16, 28)
+		vadduwm(25, 17, 28)
+		vadduwm(26, 18, 28)
+		vadduwm(27, 19, 28)
+
+		/*
+		 * Load next data blocks. We do this early on but we
+		 * won't need them until IV encryption is done.
+		 */
+		lxvw4x(52, %[cc0], %[buf])
+		lxvw4x(53, %[cc1], %[buf])
+		lxvw4x(54, %[cc2], %[buf])
+		lxvw4x(55, %[cc3], %[buf])
+
+		/*
+		 * Encrypt the current IV.
+		 */
+		vxor(16, 16, 0)
+		vxor(17, 17, 0)
+		vxor(18, 18, 0)
+		vxor(19, 19, 0)
+		vcipher(16, 16, 1)
+		vcipher(17, 17, 1)
+		vcipher(18, 18, 1)
+		vcipher(19, 19, 1)
+		vcipher(16, 16, 2)
+		vcipher(17, 17, 2)
+		vcipher(18, 18, 2)
+		vcipher(19, 19, 2)
+		vcipher(16, 16, 3)
+		vcipher(17, 17, 3)
+		vcipher(18, 18, 3)
+		vcipher(19, 19, 3)
+		vcipher(16, 16, 4)
+		vcipher(17, 17, 4)
+		vcipher(18, 18, 4)
+		vcipher(19, 19, 4)
+		vcipher(16, 16, 5)
+		vcipher(17, 17, 5)
+		vcipher(18, 18, 5)
+		vcipher(19, 19, 5)
+		vcipher(16, 16, 6)
+		vcipher(17, 17, 6)
+		vcipher(18, 18, 6)
+		vcipher(19, 19, 6)
+		vcipher(16, 16, 7)
+		vcipher(17, 17, 7)
+		vcipher(18, 18, 7)
+		vcipher(19, 19, 7)
+		vcipher(16, 16, 8)
+		vcipher(17, 17, 8)
+		vcipher(18, 18, 8)
+		vcipher(19, 19, 8)
+		vcipher(16, 16, 9)
+		vcipher(17, 17, 9)
+		vcipher(18, 18, 9)
+		vcipher(19, 19, 9)
+		vcipherlast(16, 16, 10)
+		vcipherlast(17, 17, 10)
+		vcipherlast(18, 18, 10)
+		vcipherlast(19, 19, 10)
+
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+
+		/*
+		 * Load next plaintext word and XOR with encrypted IV.
+		 */
+		vxor(16, 20, 16)
+		vxor(17, 21, 17)
+		vxor(18, 22, 18)
+		vxor(19, 23, 19)
+		stxvw4x(48, %[cc0], %[buf])
+		stxvw4x(49, %[cc1], %[buf])
+		stxvw4x(50, %[cc2], %[buf])
+		stxvw4x(51, %[cc3], %[buf])
+
+		addi(%[buf], %[buf], 64)
+
+		/*
+		 * Update IV.
+		 */
+		vand(16, 24, 24)
+		vand(17, 25, 25)
+		vand(18, 26, 26)
+		vand(19, 27, 27)
+
+		bdnz(loop)
+
+: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
+  [buf] "+b" (buf)
+: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2),
+  [ctrinc] "b" (ctrinc)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+static void
+ctr_192(const unsigned char *sk, const unsigned char *ivbuf,
+	unsigned char *buf, size_t num_blocks)
+{
+	long cc0, cc1, cc2, cc3;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+	static const uint32_t ctrinc[] = {
+		0, 0, 0, 4
+	};
+
+	cc0 = 0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+
+		/*
+		 * Load subkeys into v0..v12
+		 */
+		lxvw4x(32, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(33, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(34, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(35, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(36, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(37, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(38, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(39, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(40, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(41, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(42, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(43, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(44, %[cc0], %[sk])
+		li(%[cc0], 0)
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = constant for byteswapping words
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * v28 = increment for IV counter.
+		 */
+		lxvw4x(60, 0, %[ctrinc])
+
+		/*
+		 * Load IV into v16..v19
+		 */
+		lxvw4x(48, %[cc0], %[ivbuf])
+		lxvw4x(49, %[cc1], %[ivbuf])
+		lxvw4x(50, %[cc2], %[ivbuf])
+		lxvw4x(51, %[cc3], %[ivbuf])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Compute next IV into v24..v27
+		 */
+		vadduwm(24, 16, 28)
+		vadduwm(25, 17, 28)
+		vadduwm(26, 18, 28)
+		vadduwm(27, 19, 28)
+
+		/*
+		 * Load next data blocks. We do this early on but we
+		 * won't need them until IV encryption is done.
+		 */
+		lxvw4x(52, %[cc0], %[buf])
+		lxvw4x(53, %[cc1], %[buf])
+		lxvw4x(54, %[cc2], %[buf])
+		lxvw4x(55, %[cc3], %[buf])
+
+		/*
+		 * Encrypt the current IV.
+		 */
+		vxor(16, 16, 0)
+		vxor(17, 17, 0)
+		vxor(18, 18, 0)
+		vxor(19, 19, 0)
+		vcipher(16, 16, 1)
+		vcipher(17, 17, 1)
+		vcipher(18, 18, 1)
+		vcipher(19, 19, 1)
+		vcipher(16, 16, 2)
+		vcipher(17, 17, 2)
+		vcipher(18, 18, 2)
+		vcipher(19, 19, 2)
+		vcipher(16, 16, 3)
+		vcipher(17, 17, 3)
+		vcipher(18, 18, 3)
+		vcipher(19, 19, 3)
+		vcipher(16, 16, 4)
+		vcipher(17, 17, 4)
+		vcipher(18, 18, 4)
+		vcipher(19, 19, 4)
+		vcipher(16, 16, 5)
+		vcipher(17, 17, 5)
+		vcipher(18, 18, 5)
+		vcipher(19, 19, 5)
+		vcipher(16, 16, 6)
+		vcipher(17, 17, 6)
+		vcipher(18, 18, 6)
+		vcipher(19, 19, 6)
+		vcipher(16, 16, 7)
+		vcipher(17, 17, 7)
+		vcipher(18, 18, 7)
+		vcipher(19, 19, 7)
+		vcipher(16, 16, 8)
+		vcipher(17, 17, 8)
+		vcipher(18, 18, 8)
+		vcipher(19, 19, 8)
+		vcipher(16, 16, 9)
+		vcipher(17, 17, 9)
+		vcipher(18, 18, 9)
+		vcipher(19, 19, 9)
+		vcipher(16, 16, 10)
+		vcipher(17, 17, 10)
+		vcipher(18, 18, 10)
+		vcipher(19, 19, 10)
+		vcipher(16, 16, 11)
+		vcipher(17, 17, 11)
+		vcipher(18, 18, 11)
+		vcipher(19, 19, 11)
+		vcipherlast(16, 16, 12)
+		vcipherlast(17, 17, 12)
+		vcipherlast(18, 18, 12)
+		vcipherlast(19, 19, 12)
+
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+
+		/*
+		 * Load next plaintext word and XOR with encrypted IV.
+		 */
+		vxor(16, 20, 16)
+		vxor(17, 21, 17)
+		vxor(18, 22, 18)
+		vxor(19, 23, 19)
+		stxvw4x(48, %[cc0], %[buf])
+		stxvw4x(49, %[cc1], %[buf])
+		stxvw4x(50, %[cc2], %[buf])
+		stxvw4x(51, %[cc3], %[buf])
+
+		addi(%[buf], %[buf], 64)
+
+		/*
+		 * Update IV.
+		 */
+		vand(16, 24, 24)
+		vand(17, 25, 25)
+		vand(18, 26, 26)
+		vand(19, 27, 27)
+
+		bdnz(loop)
+
+: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
+  [buf] "+b" (buf)
+: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2),
+  [ctrinc] "b" (ctrinc)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+static void
+ctr_256(const unsigned char *sk, const unsigned char *ivbuf,
+	unsigned char *buf, size_t num_blocks)
+{
+	long cc0, cc1, cc2, cc3;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+	static const uint32_t ctrinc[] = {
+		0, 0, 0, 4
+	};
+
+	cc0 = 0;
+	cc1 = 16;
+	cc2 = 32;
+	cc3 = 48;
+	asm volatile (
+
+		/*
+		 * Load subkeys into v0..v14
+		 */
+		lxvw4x(32, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(33, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(34, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(35, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(36, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(37, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(38, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(39, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(40, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(41, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(42, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(43, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(44, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(45, %[cc0], %[sk])
+		addi(%[cc0], %[cc0], 16)
+		lxvw4x(46, %[cc0], %[sk])
+		li(%[cc0], 0)
+
+#if BR_POWER8_LE
+		/*
+		 * v15 = constant for byteswapping words
+		 */
+		lxvw4x(47, 0, %[idx2be])
+#endif
+		/*
+		 * v28 = increment for IV counter.
+		 */
+		lxvw4x(60, 0, %[ctrinc])
+
+		/*
+		 * Load IV into v16..v19
+		 */
+		lxvw4x(48, %[cc0], %[ivbuf])
+		lxvw4x(49, %[cc1], %[ivbuf])
+		lxvw4x(50, %[cc2], %[ivbuf])
+		lxvw4x(51, %[cc3], %[ivbuf])
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+
+		mtctr(%[num_blocks])
+	label(loop)
+		/*
+		 * Compute next IV into v24..v27
+		 */
+		vadduwm(24, 16, 28)
+		vadduwm(25, 17, 28)
+		vadduwm(26, 18, 28)
+		vadduwm(27, 19, 28)
+
+		/*
+		 * Load next data blocks. We do this early on but we
+		 * won't need them until IV encryption is done.
+		 */
+		lxvw4x(52, %[cc0], %[buf])
+		lxvw4x(53, %[cc1], %[buf])
+		lxvw4x(54, %[cc2], %[buf])
+		lxvw4x(55, %[cc3], %[buf])
+
+		/*
+		 * Encrypt the current IV.
+		 */
+		vxor(16, 16, 0)
+		vxor(17, 17, 0)
+		vxor(18, 18, 0)
+		vxor(19, 19, 0)
+		vcipher(16, 16, 1)
+		vcipher(17, 17, 1)
+		vcipher(18, 18, 1)
+		vcipher(19, 19, 1)
+		vcipher(16, 16, 2)
+		vcipher(17, 17, 2)
+		vcipher(18, 18, 2)
+		vcipher(19, 19, 2)
+		vcipher(16, 16, 3)
+		vcipher(17, 17, 3)
+		vcipher(18, 18, 3)
+		vcipher(19, 19, 3)
+		vcipher(16, 16, 4)
+		vcipher(17, 17, 4)
+		vcipher(18, 18, 4)
+		vcipher(19, 19, 4)
+		vcipher(16, 16, 5)
+		vcipher(17, 17, 5)
+		vcipher(18, 18, 5)
+		vcipher(19, 19, 5)
+		vcipher(16, 16, 6)
+		vcipher(17, 17, 6)
+		vcipher(18, 18, 6)
+		vcipher(19, 19, 6)
+		vcipher(16, 16, 7)
+		vcipher(17, 17, 7)
+		vcipher(18, 18, 7)
+		vcipher(19, 19, 7)
+		vcipher(16, 16, 8)
+		vcipher(17, 17, 8)
+		vcipher(18, 18, 8)
+		vcipher(19, 19, 8)
+		vcipher(16, 16, 9)
+		vcipher(17, 17, 9)
+		vcipher(18, 18, 9)
+		vcipher(19, 19, 9)
+		vcipher(16, 16, 10)
+		vcipher(17, 17, 10)
+		vcipher(18, 18, 10)
+		vcipher(19, 19, 10)
+		vcipher(16, 16, 11)
+		vcipher(17, 17, 11)
+		vcipher(18, 18, 11)
+		vcipher(19, 19, 11)
+		vcipher(16, 16, 12)
+		vcipher(17, 17, 12)
+		vcipher(18, 18, 12)
+		vcipher(19, 19, 12)
+		vcipher(16, 16, 13)
+		vcipher(17, 17, 13)
+		vcipher(18, 18, 13)
+		vcipher(19, 19, 13)
+		vcipherlast(16, 16, 14)
+		vcipherlast(17, 17, 14)
+		vcipherlast(18, 18, 14)
+		vcipherlast(19, 19, 14)
+
+#if BR_POWER8_LE
+		vperm(16, 16, 16, 15)
+		vperm(17, 17, 17, 15)
+		vperm(18, 18, 18, 15)
+		vperm(19, 19, 19, 15)
+#endif
+
+		/*
+		 * Load next plaintext word and XOR with encrypted IV.
+		 */
+		vxor(16, 20, 16)
+		vxor(17, 21, 17)
+		vxor(18, 22, 18)
+		vxor(19, 23, 19)
+		stxvw4x(48, %[cc0], %[buf])
+		stxvw4x(49, %[cc1], %[buf])
+		stxvw4x(50, %[cc2], %[buf])
+		stxvw4x(51, %[cc3], %[buf])
+
+		addi(%[buf], %[buf], 64)
+
+		/*
+		 * Update IV.
+		 */
+		vand(16, 24, 24)
+		vand(17, 25, 25)
+		vand(18, 26, 26)
+		vand(19, 27, 27)
+
+		bdnz(loop)
+
+: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
+  [buf] "+b" (buf)
+: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2),
+  [ctrinc] "b" (ctrinc)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
+  "ctr", "memory"
+	);
+}
+
+/* see bearssl_block.h */
+uint32_t
+br_aes_pwr8_ctr_run(const br_aes_pwr8_ctr_keys *ctx,
+	const void *iv, uint32_t cc, void *data, size_t len)
+{
+	unsigned char *buf;
+	unsigned char ivbuf[64];
+
+	buf = data;
+	memcpy(ivbuf +  0, iv, 12);
+	memcpy(ivbuf + 16, iv, 12);
+	memcpy(ivbuf + 32, iv, 12);
+	memcpy(ivbuf + 48, iv, 12);
+	if (len >= 64) {
+		br_enc32be(ivbuf + 12, cc + 0);
+		br_enc32be(ivbuf + 28, cc + 1);
+		br_enc32be(ivbuf + 44, cc + 2);
+		br_enc32be(ivbuf + 60, cc + 3);
+		switch (ctx->num_rounds) {
+		case 10:
+			ctr_128(ctx->skey.skni, ivbuf, buf,
+				(len >> 4) & ~(size_t)3);
+			break;
+		case 12:
+			ctr_192(ctx->skey.skni, ivbuf, buf,
+				(len >> 4) & ~(size_t)3);
+			break;
+		default:
+			ctr_256(ctx->skey.skni, ivbuf, buf,
+				(len >> 4) & ~(size_t)3);
+			break;
+		}
+		cc += (len >> 4) & ~(size_t)3;
+		buf += len & ~(size_t)63;
+		len &= 63;
+	}
+	if (len > 0) {
+		unsigned char tmp[64];
+
+		memcpy(tmp, buf, len);
+		memset(tmp + len, 0, (sizeof tmp) - len);
+		br_enc32be(ivbuf + 12, cc + 0);
+		br_enc32be(ivbuf + 28, cc + 1);
+		br_enc32be(ivbuf + 44, cc + 2);
+		br_enc32be(ivbuf + 60, cc + 3);
+		switch (ctx->num_rounds) {
+		case 10:
+			ctr_128(ctx->skey.skni, ivbuf, tmp, 4);
+			break;
+		case 12:
+			ctr_192(ctx->skey.skni, ivbuf, tmp, 4);
+			break;
+		default:
+			ctr_256(ctx->skey.skni, ivbuf, tmp, 4);
+			break;
+		}
+		memcpy(buf, tmp, len);
+		cc += (len + 15) >> 4;
+	}
+	return cc;
+}
+
+/* see bearssl_block.h */
+const br_block_ctr_class br_aes_pwr8_ctr_vtable = {
+	sizeof(br_aes_pwr8_ctr_keys),
+	16,
+	4,
+	(void (*)(const br_block_ctr_class **, const void *, size_t))
+		&br_aes_pwr8_ctr_init,
+	(uint32_t (*)(const br_block_ctr_class *const *,
+		const void *, uint32_t, void *, size_t))
+		&br_aes_pwr8_ctr_run
+};
+
+/* see bearssl_block.h */
+const br_block_ctr_class *
+br_aes_pwr8_ctr_get_vtable(void)
+{
+	return br_aes_pwr8_supported() ? &br_aes_pwr8_ctr_vtable : NULL;
+}
+
+#else
+
+/* see bearssl_block.h */
+const br_block_ctr_class *
+br_aes_pwr8_ctr_get_vtable(void)
+{
+	return NULL;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctrcbc.c b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctrcbc.c
new file mode 100644
index 00000000..a67d30b6
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctrcbc.c
@@ -0,0 +1,946 @@
+/*
+ * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_POWER_ASM_MACROS   1
+#include "inner.h"
+
+#if BR_POWER8
+
+/* see bearssl_block.h */
+const br_block_ctrcbc_class *
+br_aes_pwr8_ctrcbc_get_vtable(void)
+{
+	return br_aes_pwr8_supported() ? &br_aes_pwr8_ctrcbc_vtable : NULL;
+}
+
+/* see bearssl_block.h */
+void
+br_aes_pwr8_ctrcbc_init(br_aes_pwr8_ctrcbc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_pwr8_ctrcbc_vtable;
+	ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
+}
+
+/*
+ * Register conventions for CTR + CBC-MAC:
+ *
+ *   AES subkeys are in registers 0 to 10/12/14 (depending on keys size)
+ *   Register v15 contains the byteswap index register (little-endian only)
+ *   Register v16 contains the CTR counter value
+ *   Register v17 contains the CBC-MAC current value
+ *   Registers v18 to v27 are scratch
+ *   Counter increment uses v28, v29 and v30
+ *
+ * For CTR alone:
+ *  
+ *   AES subkeys are in registers 0 to 10/12/14 (depending on keys size)
+ *   Register v15 contains the byteswap index register (little-endian only)
+ *   Registers v16 to v19 contain the CTR counter values (four blocks)
+ *   Registers v20 to v27 are scratch
+ *   Counter increment uses v28, v29 and v30
+ */
+
+#define LOAD_SUBKEYS_128 \
+		lxvw4x(32, %[cc], %[sk])   \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(33, %[cc], %[sk])   \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(34, %[cc], %[sk])   \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(35, %[cc], %[sk])   \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(36, %[cc], %[sk])   \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(37, %[cc], %[sk])   \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(38, %[cc], %[sk])   \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(39, %[cc], %[sk])   \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(40, %[cc], %[sk])   \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(41, %[cc], %[sk])   \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(42, %[cc], %[sk])
+
+#define LOAD_SUBKEYS_192 \
+		LOAD_SUBKEYS_128 \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(43, %[cc], %[sk])   \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(44, %[cc], %[sk])
+
+#define LOAD_SUBKEYS_256 \
+		LOAD_SUBKEYS_192 \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(45, %[cc], %[sk])   \
+		addi(%[cc], %[cc], 16)     \
+		lxvw4x(46, %[cc], %[sk])
+
+#define BLOCK_ENCRYPT_128(x) \
+		vxor(x, x, 0) \
+		vcipher(x, x, 1) \
+		vcipher(x, x, 2) \
+		vcipher(x, x, 3) \
+		vcipher(x, x, 4) \
+		vcipher(x, x, 5) \
+		vcipher(x, x, 6) \
+		vcipher(x, x, 7) \
+		vcipher(x, x, 8) \
+		vcipher(x, x, 9) \
+		vcipherlast(x, x, 10)
+
+#define BLOCK_ENCRYPT_192(x) \
+		vxor(x, x, 0) \
+		vcipher(x, x, 1) \
+		vcipher(x, x, 2) \
+		vcipher(x, x, 3) \
+		vcipher(x, x, 4) \
+		vcipher(x, x, 5) \
+		vcipher(x, x, 6) \
+		vcipher(x, x, 7) \
+		vcipher(x, x, 8) \
+		vcipher(x, x, 9) \
+		vcipher(x, x, 10) \
+		vcipher(x, x, 11) \
+		vcipherlast(x, x, 12)
+
+#define BLOCK_ENCRYPT_256(x) \
+		vxor(x, x, 0) \
+		vcipher(x, x, 1) \
+		vcipher(x, x, 2) \
+		vcipher(x, x, 3) \
+		vcipher(x, x, 4) \
+		vcipher(x, x, 5) \
+		vcipher(x, x, 6) \
+		vcipher(x, x, 7) \
+		vcipher(x, x, 8) \
+		vcipher(x, x, 9) \
+		vcipher(x, x, 10) \
+		vcipher(x, x, 11) \
+		vcipher(x, x, 12) \
+		vcipher(x, x, 13) \
+		vcipherlast(x, x, 14)
+
+#define BLOCK_ENCRYPT_X2_128(x, y) \
+		vxor(x, x, 0) \
+		vxor(y, y, 0) \
+		vcipher(x, x, 1) \
+		vcipher(y, y, 1) \
+		vcipher(x, x, 2) \
+		vcipher(y, y, 2) \
+		vcipher(x, x, 3) \
+		vcipher(y, y, 3) \
+		vcipher(x, x, 4) \
+		vcipher(y, y, 4) \
+		vcipher(x, x, 5) \
+		vcipher(y, y, 5) \
+		vcipher(x, x, 6) \
+		vcipher(y, y, 6) \
+		vcipher(x, x, 7) \
+		vcipher(y, y, 7) \
+		vcipher(x, x, 8) \
+		vcipher(y, y, 8) \
+		vcipher(x, x, 9) \
+		vcipher(y, y, 9) \
+		vcipherlast(x, x, 10) \
+		vcipherlast(y, y, 10)
+
+#define BLOCK_ENCRYPT_X2_192(x, y) \
+		vxor(x, x, 0) \
+		vxor(y, y, 0) \
+		vcipher(x, x, 1) \
+		vcipher(y, y, 1) \
+		vcipher(x, x, 2) \
+		vcipher(y, y, 2) \
+		vcipher(x, x, 3) \
+		vcipher(y, y, 3) \
+		vcipher(x, x, 4) \
+		vcipher(y, y, 4) \
+		vcipher(x, x, 5) \
+		vcipher(y, y, 5) \
+		vcipher(x, x, 6) \
+		vcipher(y, y, 6) \
+		vcipher(x, x, 7) \
+		vcipher(y, y, 7) \
+		vcipher(x, x, 8) \
+		vcipher(y, y, 8) \
+		vcipher(x, x, 9) \
+		vcipher(y, y, 9) \
+		vcipher(x, x, 10) \
+		vcipher(y, y, 10) \
+		vcipher(x, x, 11) \
+		vcipher(y, y, 11) \
+		vcipherlast(x, x, 12) \
+		vcipherlast(y, y, 12)
+
+#define BLOCK_ENCRYPT_X2_256(x, y) \
+		vxor(x, x, 0) \
+		vxor(y, y, 0) \
+		vcipher(x, x, 1) \
+		vcipher(y, y, 1) \
+		vcipher(x, x, 2) \
+		vcipher(y, y, 2) \
+		vcipher(x, x, 3) \
+		vcipher(y, y, 3) \
+		vcipher(x, x, 4) \
+		vcipher(y, y, 4) \
+		vcipher(x, x, 5) \
+		vcipher(y, y, 5) \
+		vcipher(x, x, 6) \
+		vcipher(y, y, 6) \
+		vcipher(x, x, 7) \
+		vcipher(y, y, 7) \
+		vcipher(x, x, 8) \
+		vcipher(y, y, 8) \
+		vcipher(x, x, 9) \
+		vcipher(y, y, 9) \
+		vcipher(x, x, 10) \
+		vcipher(y, y, 10) \
+		vcipher(x, x, 11) \
+		vcipher(y, y, 11) \
+		vcipher(x, x, 12) \
+		vcipher(y, y, 12) \
+		vcipher(x, x, 13) \
+		vcipher(y, y, 13) \
+		vcipherlast(x, x, 14) \
+		vcipherlast(y, y, 14)
+
+#define BLOCK_ENCRYPT_X4_128(x0, x1, x2, x3) \
+		vxor(x0, x0, 0) \
+		vxor(x1, x1, 0) \
+		vxor(x2, x2, 0) \
+		vxor(x3, x3, 0) \
+		vcipher(x0, x0, 1) \
+		vcipher(x1, x1, 1) \
+		vcipher(x2, x2, 1) \
+		vcipher(x3, x3, 1) \
+		vcipher(x0, x0, 2) \
+		vcipher(x1, x1, 2) \
+		vcipher(x2, x2, 2) \
+		vcipher(x3, x3, 2) \
+		vcipher(x0, x0, 3) \
+		vcipher(x1, x1, 3) \
+		vcipher(x2, x2, 3) \
+		vcipher(x3, x3, 3) \
+		vcipher(x0, x0, 4) \
+		vcipher(x1, x1, 4) \
+		vcipher(x2, x2, 4) \
+		vcipher(x3, x3, 4) \
+		vcipher(x0, x0, 5) \
+		vcipher(x1, x1, 5) \
+		vcipher(x2, x2, 5) \
+		vcipher(x3, x3, 5) \
+		vcipher(x0, x0, 6) \
+		vcipher(x1, x1, 6) \
+		vcipher(x2, x2, 6) \
+		vcipher(x3, x3, 6) \
+		vcipher(x0, x0, 7) \
+		vcipher(x1, x1, 7) \
+		vcipher(x2, x2, 7) \
+		vcipher(x3, x3, 7) \
+		vcipher(x0, x0, 8) \
+		vcipher(x1, x1, 8) \
+		vcipher(x2, x2, 8) \
+		vcipher(x3, x3, 8) \
+		vcipher(x0, x0, 9) \
+		vcipher(x1, x1, 9) \
+		vcipher(x2, x2, 9) \
+		vcipher(x3, x3, 9) \
+		vcipherlast(x0, x0, 10) \
+		vcipherlast(x1, x1, 10) \
+		vcipherlast(x2, x2, 10) \
+		vcipherlast(x3, x3, 10)
+
+#define BLOCK_ENCRYPT_X4_192(x0, x1, x2, x3) \
+		vxor(x0, x0, 0) \
+		vxor(x1, x1, 0) \
+		vxor(x2, x2, 0) \
+		vxor(x3, x3, 0) \
+		vcipher(x0, x0, 1) \
+		vcipher(x1, x1, 1) \
+		vcipher(x2, x2, 1) \
+		vcipher(x3, x3, 1) \
+		vcipher(x0, x0, 2) \
+		vcipher(x1, x1, 2) \
+		vcipher(x2, x2, 2) \
+		vcipher(x3, x3, 2) \
+		vcipher(x0, x0, 3) \
+		vcipher(x1, x1, 3) \
+		vcipher(x2, x2, 3) \
+		vcipher(x3, x3, 3) \
+		vcipher(x0, x0, 4) \
+		vcipher(x1, x1, 4) \
+		vcipher(x2, x2, 4) \
+		vcipher(x3, x3, 4) \
+		vcipher(x0, x0, 5) \
+		vcipher(x1, x1, 5) \
+		vcipher(x2, x2, 5) \
+		vcipher(x3, x3, 5) \
+		vcipher(x0, x0, 6) \
+		vcipher(x1, x1, 6) \
+		vcipher(x2, x2, 6) \
+		vcipher(x3, x3, 6) \
+		vcipher(x0, x0, 7) \
+		vcipher(x1, x1, 7) \
+		vcipher(x2, x2, 7) \
+		vcipher(x3, x3, 7) \
+		vcipher(x0, x0, 8) \
+		vcipher(x1, x1, 8) \
+		vcipher(x2, x2, 8) \
+		vcipher(x3, x3, 8) \
+		vcipher(x0, x0, 9) \
+		vcipher(x1, x1, 9) \
+		vcipher(x2, x2, 9) \
+		vcipher(x3, x3, 9) \
+		vcipher(x0, x0, 10) \
+		vcipher(x1, x1, 10) \
+		vcipher(x2, x2, 10) \
+		vcipher(x3, x3, 10) \
+		vcipher(x0, x0, 11) \
+		vcipher(x1, x1, 11) \
+		vcipher(x2, x2, 11) \
+		vcipher(x3, x3, 11) \
+		vcipherlast(x0, x0, 12) \
+		vcipherlast(x1, x1, 12) \
+		vcipherlast(x2, x2, 12) \
+		vcipherlast(x3, x3, 12)
+
+#define BLOCK_ENCRYPT_X4_256(x0, x1, x2, x3) \
+		vxor(x0, x0, 0) \
+		vxor(x1, x1, 0) \
+		vxor(x2, x2, 0) \
+		vxor(x3, x3, 0) \
+		vcipher(x0, x0, 1) \
+		vcipher(x1, x1, 1) \
+		vcipher(x2, x2, 1) \
+		vcipher(x3, x3, 1) \
+		vcipher(x0, x0, 2) \
+		vcipher(x1, x1, 2) \
+		vcipher(x2, x2, 2) \
+		vcipher(x3, x3, 2) \
+		vcipher(x0, x0, 3) \
+		vcipher(x1, x1, 3) \
+		vcipher(x2, x2, 3) \
+		vcipher(x3, x3, 3) \
+		vcipher(x0, x0, 4) \
+		vcipher(x1, x1, 4) \
+		vcipher(x2, x2, 4) \
+		vcipher(x3, x3, 4) \
+		vcipher(x0, x0, 5) \
+		vcipher(x1, x1, 5) \
+		vcipher(x2, x2, 5) \
+		vcipher(x3, x3, 5) \
+		vcipher(x0, x0, 6) \
+		vcipher(x1, x1, 6) \
+		vcipher(x2, x2, 6) \
+		vcipher(x3, x3, 6) \
+		vcipher(x0, x0, 7) \
+		vcipher(x1, x1, 7) \
+		vcipher(x2, x2, 7) \
+		vcipher(x3, x3, 7) \
+		vcipher(x0, x0, 8) \
+		vcipher(x1, x1, 8) \
+		vcipher(x2, x2, 8) \
+		vcipher(x3, x3, 8) \
+		vcipher(x0, x0, 9) \
+		vcipher(x1, x1, 9) \
+		vcipher(x2, x2, 9) \
+		vcipher(x3, x3, 9) \
+		vcipher(x0, x0, 10) \
+		vcipher(x1, x1, 10) \
+		vcipher(x2, x2, 10) \
+		vcipher(x3, x3, 10) \
+		vcipher(x0, x0, 11) \
+		vcipher(x1, x1, 11) \
+		vcipher(x2, x2, 11) \
+		vcipher(x3, x3, 11) \
+		vcipher(x0, x0, 12) \
+		vcipher(x1, x1, 12) \
+		vcipher(x2, x2, 12) \
+		vcipher(x3, x3, 12) \
+		vcipher(x0, x0, 13) \
+		vcipher(x1, x1, 13) \
+		vcipher(x2, x2, 13) \
+		vcipher(x3, x3, 13) \
+		vcipherlast(x0, x0, 14) \
+		vcipherlast(x1, x1, 14) \
+		vcipherlast(x2, x2, 14) \
+		vcipherlast(x3, x3, 14)
+
+#if BR_POWER8_LE
+static const uint32_t idx2be[] = {
+	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+};
+#define BYTESWAP_INIT     lxvw4x(47, 0, %[idx2be])
+#define BYTESWAP(x)       vperm(x, x, x, 15)
+#define BYTESWAPX(d, s)   vperm(d, s, s, 15)
+#define BYTESWAP_REG      , [idx2be] "b" (idx2be)
+#else
+#define BYTESWAP_INIT
+#define BYTESWAP(x)
+#define BYTESWAPX(d, s)   vand(d, s, s)
+#define BYTESWAP_REG
+#endif
+
+static const uint32_t ctrinc[] = {
+	0, 0, 0, 1
+};
+static const uint32_t ctrinc_x4[] = {
+	0, 0, 0, 4
+};
+#define INCR_128_INIT      lxvw4x(60, 0, %[ctrinc])
+#define INCR_128_X4_INIT   lxvw4x(60, 0, %[ctrinc_x4])
+#define INCR_128(d, s) \
+		vaddcuw(29, s, 28) \
+		vadduwm(d, s, 28) \
+		vsldoi(30, 29, 29, 4) \
+		vaddcuw(29, d, 30) \
+		vadduwm(d, d, 30) \
+		vsldoi(30, 29, 29, 4) \
+		vaddcuw(29, d, 30) \
+		vadduwm(d, d, 30) \
+		vsldoi(30, 29, 29, 4) \
+		vadduwm(d, d, 30)
+
+#define MKCTR(size) \
+static void \
+ctr_ ## size(const unsigned char *sk, \
+	unsigned char *ctrbuf, unsigned char *buf, size_t num_blocks_x4) \
+{ \
+	long cc, cc0, cc1, cc2, cc3; \
+ \
+	cc = 0; \
+	cc0 = 0; \
+	cc1 = 16; \
+	cc2 = 32; \
+	cc3 = 48; \
+	asm volatile ( \
+ \
+		/* \
+		 * Load subkeys into v0..v10 \
+		 */ \
+		LOAD_SUBKEYS_ ## size \
+		li(%[cc], 0) \
+ \
+		BYTESWAP_INIT \
+		INCR_128_X4_INIT \
+ \
+		/* \
+		 * Load current CTR counters into v16 to v19. \
+		 */ \
+		lxvw4x(48, %[cc0], %[ctrbuf]) \
+		lxvw4x(49, %[cc1], %[ctrbuf]) \
+		lxvw4x(50, %[cc2], %[ctrbuf]) \
+		lxvw4x(51, %[cc3], %[ctrbuf]) \
+		BYTESWAP(16) \
+		BYTESWAP(17) \
+		BYTESWAP(18) \
+		BYTESWAP(19) \
+ \
+		mtctr(%[num_blocks_x4]) \
+ \
+	label(loop) \
+		/* \
+		 * Compute next counter values into v20..v23. \
+		 */ \
+		INCR_128(20, 16) \
+		INCR_128(21, 17) \
+		INCR_128(22, 18) \
+		INCR_128(23, 19) \
+ \
+		/* \
+		 * Encrypt counter values and XOR into next data blocks. \
+		 */ \
+		lxvw4x(56, %[cc0], %[buf]) \
+		lxvw4x(57, %[cc1], %[buf]) \
+		lxvw4x(58, %[cc2], %[buf]) \
+		lxvw4x(59, %[cc3], %[buf]) \
+		BYTESWAP(24) \
+		BYTESWAP(25) \
+		BYTESWAP(26) \
+		BYTESWAP(27) \
+		BLOCK_ENCRYPT_X4_ ## size(16, 17, 18, 19) \
+		vxor(16, 16, 24) \
+		vxor(17, 17, 25) \
+		vxor(18, 18, 26) \
+		vxor(19, 19, 27) \
+		BYTESWAP(16) \
+		BYTESWAP(17) \
+		BYTESWAP(18) \
+		BYTESWAP(19) \
+		stxvw4x(48, %[cc0], %[buf]) \
+		stxvw4x(49, %[cc1], %[buf]) \
+		stxvw4x(50, %[cc2], %[buf]) \
+		stxvw4x(51, %[cc3], %[buf]) \
+ \
+		/* \
+		 * Update counters and data pointer. \
+		 */ \
+		vand(16, 20, 20) \
+		vand(17, 21, 21) \
+		vand(18, 22, 22) \
+		vand(19, 23, 23) \
+		addi(%[buf], %[buf], 64) \
+ \
+		bdnz(loop) \
+ \
+		/* \
+		 * Write back new counter values. \
+		 */ \
+		BYTESWAP(16) \
+		BYTESWAP(17) \
+		BYTESWAP(18) \
+		BYTESWAP(19) \
+		stxvw4x(48, %[cc0], %[ctrbuf]) \
+		stxvw4x(49, %[cc1], %[ctrbuf]) \
+		stxvw4x(50, %[cc2], %[ctrbuf]) \
+		stxvw4x(51, %[cc3], %[ctrbuf]) \
+ \
+: [cc] "+b" (cc), [buf] "+b" (buf), \
+	[cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3) \
+: [sk] "b" (sk), [ctrbuf] "b" (ctrbuf), \
+	[num_blocks_x4] "b" (num_blocks_x4), [ctrinc_x4] "b" (ctrinc_x4) \
+	BYTESWAP_REG \
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
+  "v30", "ctr", "memory" \
+	); \
+}
+
+MKCTR(128)
+MKCTR(192)
+MKCTR(256)
+
+#define MKCBCMAC(size) \
+static void \
+cbcmac_ ## size(const unsigned char *sk, \
+	unsigned char *cbcmac, const unsigned char *buf, size_t num_blocks) \
+{ \
+	long cc; \
+ \
+	cc = 0; \
+	asm volatile ( \
+ \
+		/* \
+		 * Load subkeys into v0..v10 \
+		 */ \
+		LOAD_SUBKEYS_ ## size \
+		li(%[cc], 0) \
+ \
+		BYTESWAP_INIT \
+ \
+		/* \
+		 * Load current CBC-MAC value into v16. \
+		 */ \
+		lxvw4x(48, %[cc], %[cbcmac]) \
+		BYTESWAP(16) \
+ \
+		mtctr(%[num_blocks]) \
+ \
+	label(loop) \
+		/* \
+		 * Load next block, XOR into current CBC-MAC value, \
+		 * and then encrypt it. \
+		 */ \
+		lxvw4x(49, %[cc], %[buf]) \
+		BYTESWAP(17) \
+		vxor(16, 16, 17) \
+		BLOCK_ENCRYPT_ ## size(16) \
+		addi(%[buf], %[buf], 16) \
+ \
+		bdnz(loop) \
+ \
+		/* \
+		 * Write back new CBC-MAC value. \
+		 */ \
+		BYTESWAP(16) \
+		stxvw4x(48, %[cc], %[cbcmac]) \
+ \
+: [cc] "+b" (cc), [buf] "+b" (buf) \
+: [sk] "b" (sk), [cbcmac] "b" (cbcmac), [num_blocks] "b" (num_blocks) \
+	BYTESWAP_REG \
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
+  "v30", "ctr", "memory" \
+	); \
+}
+
+MKCBCMAC(128)
+MKCBCMAC(192)
+MKCBCMAC(256)
+
+#define MKENCRYPT(size) \
+static void \
+ctrcbc_ ## size ## _encrypt(const unsigned char *sk, \
+	unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \
+	size_t num_blocks) \
+{ \
+	long cc; \
+ \
+	cc = 0; \
+	asm volatile ( \
+ \
+		/* \
+		 * Load subkeys into v0..v10 \
+		 */ \
+		LOAD_SUBKEYS_ ## size \
+		li(%[cc], 0) \
+ \
+		BYTESWAP_INIT \
+		INCR_128_INIT \
+ \
+		/* \
+		 * Load current CTR counter into v16, and current \
+		 * CBC-MAC IV into v17. \
+		 */ \
+		lxvw4x(48, %[cc], %[ctr]) \
+		lxvw4x(49, %[cc], %[cbcmac]) \
+		BYTESWAP(16) \
+		BYTESWAP(17) \
+ \
+		/* \
+		 * At each iteration, we do two parallel encryption: \
+		 *  - new counter value for encryption of the next block; \
+		 *  - CBC-MAC over the previous encrypted block. \
+		 * Thus, each plaintext block implies two AES instances, \
+		 * over two successive iterations. This requires a single \
+		 * counter encryption before the loop, and a single \
+		 * CBC-MAC encryption after the loop. \
+		 */ \
+ \
+		/* \
+		 * Encrypt first block (into v20). \
+		 */ \
+		lxvw4x(52, %[cc], %[buf]) \
+		BYTESWAP(20) \
+		INCR_128(22, 16) \
+		BLOCK_ENCRYPT_ ## size(16) \
+		vxor(20, 20, 16) \
+		BYTESWAPX(21, 20) \
+		stxvw4x(53, %[cc], %[buf]) \
+		vand(16, 22, 22) \
+		addi(%[buf], %[buf], 16) \
+ \
+		/* \
+		 * Load loop counter; skip the loop if there is only \
+		 * one block in total (already handled by the boundary \
+		 * conditions). \
+		 */ \
+		mtctr(%[num_blocks]) \
+		bdz(fastexit) \
+ \
+	label(loop) \
+		/* \
+		 * Upon loop entry: \
+		 *    v16   counter value for next block \
+		 *    v17   current CBC-MAC value \
+		 *    v20   encrypted previous block \
+		 */ \
+		vxor(17, 17, 20) \
+		INCR_128(22, 16) \
+		lxvw4x(52, %[cc], %[buf]) \
+		BYTESWAP(20) \
+		BLOCK_ENCRYPT_X2_ ## size(16, 17) \
+		vxor(20, 20, 16) \
+		BYTESWAPX(21, 20) \
+		stxvw4x(53, %[cc], %[buf]) \
+		addi(%[buf], %[buf], 16) \
+		vand(16, 22, 22) \
+ \
+		bdnz(loop) \
+ \
+	label(fastexit) \
+		vxor(17, 17, 20) \
+		BLOCK_ENCRYPT_ ## size(17) \
+		BYTESWAP(16) \
+		BYTESWAP(17) \
+		stxvw4x(48, %[cc], %[ctr]) \
+		stxvw4x(49, %[cc], %[cbcmac]) \
+ \
+: [cc] "+b" (cc), [buf] "+b" (buf) \
+: [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \
+	[num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \
+	BYTESWAP_REG \
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
+  "v30", "ctr", "memory" \
+	); \
+}
+
+MKENCRYPT(128)
+MKENCRYPT(192)
+MKENCRYPT(256)
+
+#define MKDECRYPT(size) \
+static void \
+ctrcbc_ ## size ## _decrypt(const unsigned char *sk, \
+	unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \
+	size_t num_blocks) \
+{ \
+	long cc; \
+ \
+	cc = 0; \
+	asm volatile ( \
+ \
+		/* \
+		 * Load subkeys into v0..v10 \
+		 */ \
+		LOAD_SUBKEYS_ ## size \
+		li(%[cc], 0) \
+ \
+		BYTESWAP_INIT \
+		INCR_128_INIT \
+ \
+		/* \
+		 * Load current CTR counter into v16, and current \
+		 * CBC-MAC IV into v17. \
+		 */ \
+		lxvw4x(48, %[cc], %[ctr]) \
+		lxvw4x(49, %[cc], %[cbcmac]) \
+		BYTESWAP(16) \
+		BYTESWAP(17) \
+ \
+		/* \
+		 * At each iteration, we do two parallel encryption: \
+		 *  - new counter value for decryption of the next block; \
+		 *  - CBC-MAC over the next encrypted block. \
+		 * Each iteration performs the two AES instances related \
+		 * to the current block; there is thus no need for some \
+		 * extra pre-loop and post-loop work as in encryption. \
+		 */ \
+ \
+		mtctr(%[num_blocks]) \
+ \
+	label(loop) \
+		/* \
+		 * Upon loop entry: \
+		 *    v16   counter value for next block \
+		 *    v17   current CBC-MAC value \
+		 */ \
+		lxvw4x(52, %[cc], %[buf]) \
+		BYTESWAP(20) \
+		vxor(17, 17, 20) \
+		INCR_128(22, 16) \
+		BLOCK_ENCRYPT_X2_ ## size(16, 17) \
+		vxor(20, 20, 16) \
+		BYTESWAPX(21, 20) \
+		stxvw4x(53, %[cc], %[buf]) \
+		addi(%[buf], %[buf], 16) \
+		vand(16, 22, 22) \
+ \
+		bdnz(loop) \
+ \
+		/* \
+		 * Store back counter and CBC-MAC value. \
+		 */ \
+		BYTESWAP(16) \
+		BYTESWAP(17) \
+		stxvw4x(48, %[cc], %[ctr]) \
+		stxvw4x(49, %[cc], %[cbcmac]) \
+ \
+: [cc] "+b" (cc), [buf] "+b" (buf) \
+: [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \
+	[num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \
+	BYTESWAP_REG \
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
+  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
+  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
+  "v30", "ctr", "memory" \
+	); \
+}
+
+MKDECRYPT(128)
+MKDECRYPT(192)
+MKDECRYPT(256)
+
+/* see bearssl_block.h */
+void
+br_aes_pwr8_ctrcbc_encrypt(const br_aes_pwr8_ctrcbc_keys *ctx,
+	void *ctr, void *cbcmac, void *data, size_t len)
+{
+	if (len == 0) {
+		return;
+	}
+	switch (ctx->num_rounds) {
+	case 10:
+		ctrcbc_128_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
+		break;
+	case 12:
+		ctrcbc_192_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
+		break;
+	default:
+		ctrcbc_256_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
+		break;
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_aes_pwr8_ctrcbc_decrypt(const br_aes_pwr8_ctrcbc_keys *ctx,
+	void *ctr, void *cbcmac, void *data, size_t len)
+{
+	if (len == 0) {
+		return;
+	}
+	switch (ctx->num_rounds) {
+	case 10:
+		ctrcbc_128_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
+		break;
+	case 12:
+		ctrcbc_192_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
+		break;
+	default:
+		ctrcbc_256_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
+		break;
+	}
+}
+
+static inline void
+incr_ctr(void *dst, const void *src)
+{
+	uint64_t hi, lo;
+
+	hi = br_dec64be(src);
+	lo = br_dec64be((const unsigned char *)src + 8);
+	lo ++;
+	hi += ((lo | -lo) >> 63) ^ (uint64_t)1;
+	br_enc64be(dst, hi);
+	br_enc64be((unsigned char *)dst + 8, lo);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_pwr8_ctrcbc_ctr(const br_aes_pwr8_ctrcbc_keys *ctx,
+	void *ctr, void *data, size_t len)
+{
+	unsigned char ctrbuf[64];
+
+	memcpy(ctrbuf, ctr, 16);
+	incr_ctr(ctrbuf + 16, ctrbuf);
+	incr_ctr(ctrbuf + 32, ctrbuf + 16);
+	incr_ctr(ctrbuf + 48, ctrbuf + 32);
+	if (len >= 64) {
+		switch (ctx->num_rounds) {
+		case 10:
+			ctr_128(ctx->skey.skni, ctrbuf, data, len >> 6);
+			break;
+		case 12:
+			ctr_192(ctx->skey.skni, ctrbuf, data, len >> 6);
+			break;
+		default:
+			ctr_256(ctx->skey.skni, ctrbuf, data, len >> 6);
+			break;
+		}
+		data = (unsigned char *)data + (len & ~(size_t)63);
+		len &= 63;
+	}
+	if (len > 0) {
+		unsigned char tmp[64];
+
+		if (len >= 32) {
+			if (len >= 48) {
+				memcpy(ctr, ctrbuf + 48, 16);
+			} else {
+				memcpy(ctr, ctrbuf + 32, 16);
+			}
+		} else {
+			if (len >= 16) {
+				memcpy(ctr, ctrbuf + 16, 16);
+			}
+		}
+		memcpy(tmp, data, len);
+		memset(tmp + len, 0, (sizeof tmp) - len);
+		switch (ctx->num_rounds) {
+		case 10:
+			ctr_128(ctx->skey.skni, ctrbuf, tmp, 1);
+			break;
+		case 12:
+			ctr_192(ctx->skey.skni, ctrbuf, tmp, 1);
+			break;
+		default:
+			ctr_256(ctx->skey.skni, ctrbuf, tmp, 1);
+			break;
+		}
+		memcpy(data, tmp, len);
+	} else {
+		memcpy(ctr, ctrbuf, 16);
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_aes_pwr8_ctrcbc_mac(const br_aes_pwr8_ctrcbc_keys *ctx,
+	void *cbcmac, const void *data, size_t len)
+{
+	if (len > 0) {
+		switch (ctx->num_rounds) {
+		case 10:
+			cbcmac_128(ctx->skey.skni, cbcmac, data, len >> 4);
+			break;
+		case 12:
+			cbcmac_192(ctx->skey.skni, cbcmac, data, len >> 4);
+			break;
+		default:
+			cbcmac_256(ctx->skey.skni, cbcmac, data, len >> 4);
+			break;
+		}
+	}
+}
+
+/* see bearssl_block.h */
+const br_block_ctrcbc_class br_aes_pwr8_ctrcbc_vtable = {
+	sizeof(br_aes_pwr8_ctrcbc_keys),
+	16,
+	4,
+	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
+		&br_aes_pwr8_ctrcbc_init,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, void *, size_t))
+		&br_aes_pwr8_ctrcbc_encrypt,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, void *, size_t))
+		&br_aes_pwr8_ctrcbc_decrypt,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, size_t))
+		&br_aes_pwr8_ctrcbc_ctr,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, const void *, size_t))
+		&br_aes_pwr8_ctrcbc_mac
+};
+
+#else
+
+/* see bearssl_block.h */
+const br_block_ctrcbc_class *
+br_aes_pwr8_ctrcbc_get_vtable(void)
+{
+	return NULL;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_small_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/aes_small_cbcdec.c
new file mode 100644
index 00000000..8567244b
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_small_cbcdec.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_small_cbcdec_init(br_aes_small_cbcdec_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_small_cbcdec_vtable;
+	ctx->num_rounds = br_aes_keysched(ctx->skey, key, len);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_small_cbcdec_run(const br_aes_small_cbcdec_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf, *ivbuf;
+
+	ivbuf = iv;
+	buf = data;
+	while (len > 0) {
+		unsigned char tmp[16];
+		int i;
+
+		memcpy(tmp, buf, 16);
+		br_aes_small_decrypt(ctx->num_rounds, ctx->skey, buf);
+		for (i = 0; i < 16; i ++) {
+			buf[i] ^= ivbuf[i];
+		}
+		memcpy(ivbuf, tmp, 16);
+		buf += 16;
+		len -= 16;
+	}
+}
+
+/* see bearssl_block.h */
+const br_block_cbcdec_class br_aes_small_cbcdec_vtable = {
+	sizeof(br_aes_small_cbcdec_keys),
+	16,
+	4,
+	(void (*)(const br_block_cbcdec_class **, const void *, size_t))
+		&br_aes_small_cbcdec_init,
+	(void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t))
+		&br_aes_small_cbcdec_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_small_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/aes_small_cbcenc.c
new file mode 100644
index 00000000..0dc2910a
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_small_cbcenc.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_small_cbcenc_init(br_aes_small_cbcenc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_small_cbcenc_vtable;
+	ctx->num_rounds = br_aes_keysched(ctx->skey, key, len);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_small_cbcenc_run(const br_aes_small_cbcenc_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf, *ivbuf;
+
+	ivbuf = iv;
+	buf = data;
+	while (len > 0) {
+		int i;
+
+		for (i = 0; i < 16; i ++) {
+			buf[i] ^= ivbuf[i];
+		}
+		br_aes_small_encrypt(ctx->num_rounds, ctx->skey, buf);
+		memcpy(ivbuf, buf, 16);
+		buf += 16;
+		len -= 16;
+	}
+}
+
+/* see bearssl_block.h */
+const br_block_cbcenc_class br_aes_small_cbcenc_vtable = {
+	sizeof(br_aes_small_cbcenc_keys),
+	16,
+	4,
+	(void (*)(const br_block_cbcenc_class **, const void *, size_t))
+		&br_aes_small_cbcenc_init,
+	(void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t))
+		&br_aes_small_cbcenc_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_small_ctr.c b/test/monniaux/BearSSL/src/symcipher/aes_small_ctr.c
new file mode 100644
index 00000000..d5d371c6
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_small_ctr.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_small_ctr_init(br_aes_small_ctr_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_small_ctr_vtable;
+	ctx->num_rounds = br_aes_keysched(ctx->skey, key, len);
+}
+
+static void
+xorbuf(void *dst, const void *src, size_t len)
+{
+	unsigned char *d;
+	const unsigned char *s;
+
+	d = dst;
+	s = src;
+	while (len -- > 0) {
+		*d ++ ^= *s ++;
+	}
+}
+
+/* see bearssl_block.h */
+uint32_t
+br_aes_small_ctr_run(const br_aes_small_ctr_keys *ctx,
+	const void *iv, uint32_t cc, void *data, size_t len)
+{
+	unsigned char *buf;
+
+	buf = data;
+	while (len > 0) {
+		unsigned char tmp[16];
+
+		memcpy(tmp, iv, 12);
+		br_enc32be(tmp + 12, cc ++);
+		br_aes_small_encrypt(ctx->num_rounds, ctx->skey, tmp);
+		if (len <= 16) {
+			xorbuf(buf, tmp, len);
+			break;
+		}
+		xorbuf(buf, tmp, 16);
+		buf += 16;
+		len -= 16;
+	}
+	return cc;
+}
+
+/* see bearssl_block.h */
+const br_block_ctr_class br_aes_small_ctr_vtable = {
+	sizeof(br_aes_small_ctr_keys),
+	16,
+	4,
+	(void (*)(const br_block_ctr_class **, const void *, size_t))
+		&br_aes_small_ctr_init,
+	(uint32_t (*)(const br_block_ctr_class *const *,
+		const void *, uint32_t, void *, size_t))
+		&br_aes_small_ctr_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_small_ctrcbc.c b/test/monniaux/BearSSL/src/symcipher/aes_small_ctrcbc.c
new file mode 100644
index 00000000..2d6ba329
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_small_ctrcbc.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_aes_small_ctrcbc_init(br_aes_small_ctrcbc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_small_ctrcbc_vtable;
+	ctx->num_rounds = br_aes_keysched(ctx->skey, key, len);
+}
+
+static void
+xorbuf(void *dst, const void *src, size_t len)
+{
+	unsigned char *d;
+	const unsigned char *s;
+
+	d = dst;
+	s = src;
+	while (len -- > 0) {
+		*d ++ ^= *s ++;
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_aes_small_ctrcbc_ctr(const br_aes_small_ctrcbc_keys *ctx,
+	void *ctr, void *data, size_t len)
+{
+	unsigned char *buf, *bctr;
+	uint32_t cc0, cc1, cc2, cc3;
+
+	buf = data;
+	bctr = ctr;
+	cc3 = br_dec32be(bctr +  0);
+	cc2 = br_dec32be(bctr +  4);
+	cc1 = br_dec32be(bctr +  8);
+	cc0 = br_dec32be(bctr + 12);
+	while (len > 0) {
+		unsigned char tmp[16];
+		uint32_t carry;
+
+		br_enc32be(tmp +  0, cc3);
+		br_enc32be(tmp +  4, cc2);
+		br_enc32be(tmp +  8, cc1);
+		br_enc32be(tmp + 12, cc0);
+		br_aes_small_encrypt(ctx->num_rounds, ctx->skey, tmp);
+		xorbuf(buf, tmp, 16);
+		buf += 16;
+		len -= 16;
+		cc0 ++;
+		carry = (~(cc0 | -cc0)) >> 31;
+		cc1 += carry;
+		carry &= (~(cc1 | -cc1)) >> 31;
+		cc2 += carry;
+		carry &= (~(cc2 | -cc2)) >> 31;
+		cc3 += carry;
+	}
+	br_enc32be(bctr +  0, cc3);
+	br_enc32be(bctr +  4, cc2);
+	br_enc32be(bctr +  8, cc1);
+	br_enc32be(bctr + 12, cc0);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_small_ctrcbc_mac(const br_aes_small_ctrcbc_keys *ctx,
+	void *cbcmac, const void *data, size_t len)
+{
+	const unsigned char *buf;
+
+	buf = data;
+	while (len > 0) {
+		xorbuf(cbcmac, buf, 16);
+		br_aes_small_encrypt(ctx->num_rounds, ctx->skey, cbcmac);
+		buf += 16;
+		len -= 16;
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_aes_small_ctrcbc_encrypt(const br_aes_small_ctrcbc_keys *ctx,
+	void *ctr, void *cbcmac, void *data, size_t len)
+{
+	br_aes_small_ctrcbc_ctr(ctx, ctr, data, len);
+	br_aes_small_ctrcbc_mac(ctx, cbcmac, data, len);
+}
+
+/* see bearssl_block.h */
+void
+br_aes_small_ctrcbc_decrypt(const br_aes_small_ctrcbc_keys *ctx,
+	void *ctr, void *cbcmac, void *data, size_t len)
+{
+	br_aes_small_ctrcbc_mac(ctx, cbcmac, data, len);
+	br_aes_small_ctrcbc_ctr(ctx, ctr, data, len);
+}
+
+/* see bearssl_block.h */
+const br_block_ctrcbc_class br_aes_small_ctrcbc_vtable = {
+	sizeof(br_aes_small_ctrcbc_keys),
+	16,
+	4,
+	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
+		&br_aes_small_ctrcbc_init,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, void *, size_t))
+		&br_aes_small_ctrcbc_encrypt,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, void *, size_t))
+		&br_aes_small_ctrcbc_decrypt,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, size_t))
+		&br_aes_small_ctrcbc_ctr,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, const void *, size_t))
+		&br_aes_small_ctrcbc_mac
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_small_dec.c b/test/monniaux/BearSSL/src/symcipher/aes_small_dec.c
new file mode 100644
index 00000000..59dca8ec
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_small_dec.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Inverse S-box.
+ */
+static const unsigned char iS[] = {
+	0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E,
+	0x81, 0xF3, 0xD7, 0xFB, 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87,
+	0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, 0x54, 0x7B, 0x94, 0x32,
+	0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E,
+	0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49,
+	0x6D, 0x8B, 0xD1, 0x25, 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16,
+	0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, 0x6C, 0x70, 0x48, 0x50,
+	0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84,
+	0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05,
+	0xB8, 0xB3, 0x45, 0x06, 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02,
+	0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, 0x3A, 0x91, 0x11, 0x41,
+	0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73,
+	0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8,
+	0x1C, 0x75, 0xDF, 0x6E, 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89,
+	0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, 0xFC, 0x56, 0x3E, 0x4B,
+	0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4,
+	0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59,
+	0x27, 0x80, 0xEC, 0x5F, 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D,
+	0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, 0xA0, 0xE0, 0x3B, 0x4D,
+	0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61,
+	0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63,
+	0x55, 0x21, 0x0C, 0x7D
+};
+
+static void
+add_round_key(unsigned *state, const uint32_t *skeys)
+{
+	int i;
+
+	for (i = 0; i < 16; i += 4) {
+		uint32_t k;
+
+		k = *skeys ++;
+		state[i + 0] ^= (unsigned)(k >> 24);
+		state[i + 1] ^= (unsigned)(k >> 16) & 0xFF;
+		state[i + 2] ^= (unsigned)(k >> 8) & 0xFF;
+		state[i + 3] ^= (unsigned)k & 0xFF;
+	}
+}
+
+static void
+inv_sub_bytes(unsigned *state)
+{
+	int i;
+
+	for (i = 0; i < 16; i ++) {
+		state[i] = iS[state[i]];
+	}
+}
+
+static void
+inv_shift_rows(unsigned *state)
+{
+	unsigned tmp;
+
+	tmp = state[13];
+	state[13] = state[9];
+	state[9] = state[5];
+	state[5] = state[1];
+	state[1] = tmp;
+
+	tmp = state[2];
+	state[2] = state[10];
+	state[10] = tmp;
+	tmp = state[6];
+	state[6] = state[14];
+	state[14] = tmp;
+
+	tmp = state[3];
+	state[3] = state[7];
+	state[7] = state[11];
+	state[11] = state[15];
+	state[15] = tmp;
+}
+
+static inline unsigned
+gf256red(unsigned x)
+{
+	unsigned y;
+
+	y = x >> 8;
+	return (x ^ y ^ (y << 1) ^ (y << 3) ^ (y << 4)) & 0xFF;
+}
+
+static void
+inv_mix_columns(unsigned *state)
+{
+	int i;
+
+	for (i = 0; i < 16; i += 4) {
+		unsigned s0, s1, s2, s3;
+		unsigned t0, t1, t2, t3;
+
+		s0 = state[i + 0];
+		s1 = state[i + 1];
+		s2 = state[i + 2];
+		s3 = state[i + 3];
+		t0 = (s0 << 1) ^ (s0 << 2) ^ (s0 << 3)
+			^ s1 ^ (s1 << 1) ^ (s1 << 3)
+			^ s2 ^ (s2 << 2) ^ (s2 << 3)
+			^ s3 ^ (s3 << 3);
+		t1 = s0 ^ (s0 << 3)
+			^ (s1 << 1) ^ (s1 << 2) ^ (s1 << 3)
+			^ s2 ^ (s2 << 1) ^ (s2 << 3)
+			^ s3 ^ (s3 << 2) ^ (s3 << 3);
+		t2 = s0 ^ (s0 << 2) ^ (s0 << 3)
+			^ s1 ^ (s1 << 3)
+			^ (s2 << 1) ^ (s2 << 2) ^ (s2 << 3)
+			^ s3 ^ (s3 << 1) ^ (s3 << 3);
+		t3 = s0 ^ (s0 << 1) ^ (s0 << 3)
+			^ s1 ^ (s1 << 2) ^ (s1 << 3)
+			^ s2 ^ (s2 << 3)
+			^ (s3 << 1) ^ (s3 << 2) ^ (s3 << 3);
+		state[i + 0] = gf256red(t0);
+		state[i + 1] = gf256red(t1);
+		state[i + 2] = gf256red(t2);
+		state[i + 3] = gf256red(t3);
+	}
+}
+
+/* see inner.h */
+void
+br_aes_small_decrypt(unsigned num_rounds, const uint32_t *skey, void *data)
+{
+	unsigned char *buf;
+	unsigned state[16];
+	unsigned u;
+
+	buf = data;
+	for (u = 0; u < 16; u ++) {
+		state[u] = buf[u];
+	}
+	add_round_key(state, skey + (num_rounds << 2));
+	for (u = num_rounds - 1; u > 0; u --) {
+		inv_shift_rows(state);
+		inv_sub_bytes(state);
+		add_round_key(state, skey + (u << 2));
+		inv_mix_columns(state);
+	}
+	inv_shift_rows(state);
+	inv_sub_bytes(state);
+	add_round_key(state, skey);
+	for (u = 0; u < 16; u ++) {
+		buf[u] = state[u];
+	}
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_small_enc.c b/test/monniaux/BearSSL/src/symcipher/aes_small_enc.c
new file mode 100644
index 00000000..29f48a8f
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_small_enc.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#define S   br_aes_S
+
+static void
+add_round_key(unsigned *state, const uint32_t *skeys)
+{
+	int i;
+
+	for (i = 0; i < 16; i += 4) {
+		uint32_t k;
+
+		k = *skeys ++;
+		state[i + 0] ^= (unsigned)(k >> 24);
+		state[i + 1] ^= (unsigned)(k >> 16) & 0xFF;
+		state[i + 2] ^= (unsigned)(k >> 8) & 0xFF;
+		state[i + 3] ^= (unsigned)k & 0xFF;
+	}
+}
+
+static void
+sub_bytes(unsigned *state)
+{
+	int i;
+
+	for (i = 0; i < 16; i ++) {
+		state[i] = S[state[i]];
+	}
+}
+
+static void
+shift_rows(unsigned *state)
+{
+	unsigned tmp;
+
+	tmp = state[1];
+	state[1] = state[5];
+	state[5] = state[9];
+	state[9] = state[13];
+	state[13] = tmp;
+
+	tmp = state[2];
+	state[2] = state[10];
+	state[10] = tmp;
+	tmp = state[6];
+	state[6] = state[14];
+	state[14] = tmp;
+
+	tmp = state[15];
+	state[15] = state[11];
+	state[11] = state[7];
+	state[7] = state[3];
+	state[3] = tmp;
+}
+
+static void
+mix_columns(unsigned *state)
+{
+	int i;
+
+	for (i = 0; i < 16; i += 4) {
+		unsigned s0, s1, s2, s3;
+		unsigned t0, t1, t2, t3;
+
+		s0 = state[i + 0];
+		s1 = state[i + 1];
+		s2 = state[i + 2];
+		s3 = state[i + 3];
+		t0 = (s0 << 1) ^ s1 ^ (s1 << 1) ^ s2 ^ s3;
+		t1 = s0 ^ (s1 << 1) ^ s2 ^ (s2 << 1) ^ s3;
+		t2 = s0 ^ s1 ^ (s2 << 1) ^ s3 ^ (s3 << 1);
+		t3 = s0 ^ (s0 << 1) ^ s1 ^ s2 ^ (s3 << 1);
+		state[i + 0] = t0 ^ ((unsigned)(-(int)(t0 >> 8)) & 0x11B);
+		state[i + 1] = t1 ^ ((unsigned)(-(int)(t1 >> 8)) & 0x11B);
+		state[i + 2] = t2 ^ ((unsigned)(-(int)(t2 >> 8)) & 0x11B);
+		state[i + 3] = t3 ^ ((unsigned)(-(int)(t3 >> 8)) & 0x11B);
+	}
+}
+
+/* see inner.h */
+void
+br_aes_small_encrypt(unsigned num_rounds, const uint32_t *skey, void *data)
+{
+	unsigned char *buf;
+	unsigned state[16];
+	unsigned u;
+
+	buf = data;
+	for (u = 0; u < 16; u ++) {
+		state[u] = buf[u];
+	}
+	add_round_key(state, skey);
+	for (u = 1; u < num_rounds; u ++) {
+		sub_bytes(state);
+		shift_rows(state);
+		mix_columns(state);
+		add_round_key(state, skey + (u << 2));
+	}
+	sub_bytes(state);
+	shift_rows(state);
+	add_round_key(state, skey + (num_rounds << 2));
+	for (u = 0; u < 16; u ++) {
+		buf[u] = state[u];
+	}
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_x86ni.c b/test/monniaux/BearSSL/src/symcipher/aes_x86ni.c
new file mode 100644
index 00000000..d5408f13
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_x86ni.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_ENABLE_INTRINSICS   1
+#include "inner.h"
+
+/*
+ * This code contains the AES key schedule implementation using the
+ * AES-NI opcodes.
+ */
+
+#if BR_AES_X86NI
+
+/* see inner.h */
+int
+br_aes_x86ni_supported(void)
+{
+	/*
+	 * Bit mask for features in ECX:
+	 *   19   SSE4.1 (used for _mm_insert_epi32(), for AES-CTR)
+	 *   25   AES-NI
+	 */
+	return br_cpuid(0, 0, 0x02080000, 0);
+}
+
+BR_TARGETS_X86_UP
+
+BR_TARGET("sse2,aes")
+static inline __m128i
+expand_step128(__m128i k, __m128i k2)
+{
+	k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
+	k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
+	k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
+	k2 = _mm_shuffle_epi32(k2, 0xFF);
+	return _mm_xor_si128(k, k2);
+}
+
+BR_TARGET("sse2,aes")
+static inline void
+expand_step192(__m128i *t1, __m128i *t2, __m128i *t3)
+{
+	__m128i t4;
+
+	*t2 = _mm_shuffle_epi32(*t2, 0x55);
+	t4 = _mm_slli_si128(*t1, 0x4);
+	*t1 = _mm_xor_si128(*t1, t4);
+	t4 = _mm_slli_si128(t4, 0x4);
+	*t1 = _mm_xor_si128(*t1, t4);
+	t4 = _mm_slli_si128(t4, 0x4);
+	*t1 = _mm_xor_si128(*t1, t4);
+	*t1 = _mm_xor_si128(*t1, *t2);
+	*t2 = _mm_shuffle_epi32(*t1, 0xFF);
+	t4 = _mm_slli_si128(*t3, 0x4);
+	*t3 = _mm_xor_si128(*t3, t4);
+	*t3 = _mm_xor_si128(*t3, *t2);
+}
+
+BR_TARGET("sse2,aes")
+static inline void
+expand_step256_1(__m128i *t1, __m128i *t2)
+{
+	__m128i t4;
+
+	*t2 = _mm_shuffle_epi32(*t2, 0xFF);
+	t4 = _mm_slli_si128(*t1, 0x4);
+	*t1 = _mm_xor_si128(*t1, t4);
+	t4 = _mm_slli_si128(t4, 0x4);
+	*t1 = _mm_xor_si128(*t1, t4);
+	t4 = _mm_slli_si128(t4, 0x4);
+	*t1 = _mm_xor_si128(*t1, t4);
+	*t1 = _mm_xor_si128(*t1, *t2);
+}
+
+BR_TARGET("sse2,aes")
+static inline void
+expand_step256_2(__m128i *t1, __m128i *t3)
+{
+	__m128i t2, t4;
+
+	t4 = _mm_aeskeygenassist_si128(*t1, 0x0);
+	t2 = _mm_shuffle_epi32(t4, 0xAA);
+	t4 = _mm_slli_si128(*t3, 0x4);
+	*t3 = _mm_xor_si128(*t3, t4);
+	t4 = _mm_slli_si128(t4, 0x4);
+	*t3 = _mm_xor_si128(*t3, t4);
+	t4 = _mm_slli_si128(t4, 0x4);
+	*t3 = _mm_xor_si128(*t3, t4);
+	*t3 = _mm_xor_si128(*t3, t2);
+}
+
+/*
+ * Perform key schedule for AES, encryption direction. Subkeys are written
+ * in sk[], and the number of rounds is returned. Key length MUST be 16,
+ * 24 or 32 bytes.
+ */
+BR_TARGET("sse2,aes")
+static unsigned
+x86ni_keysched(__m128i *sk, const void *key, size_t len)
+{
+	const unsigned char *kb;
+
+#define KEXP128(k, i, rcon)   do { \
+		k = expand_step128(k, _mm_aeskeygenassist_si128(k, rcon)); \
+		sk[i] = k; \
+	} while (0)
+
+#define KEXP192(i, rcon1, rcon2)   do { \
+		sk[(i) + 0] = t1; \
+		sk[(i) + 1] = t3; \
+		t2 = _mm_aeskeygenassist_si128(t3, rcon1); \
+		expand_step192(&t1, &t2, &t3); \
+		sk[(i) + 1] = _mm_castpd_si128(_mm_shuffle_pd( \
+			_mm_castsi128_pd(sk[(i) + 1]), \
+			_mm_castsi128_pd(t1), 0)); \
+		sk[(i) + 2] = _mm_castpd_si128(_mm_shuffle_pd( \
+			_mm_castsi128_pd(t1), \
+			_mm_castsi128_pd(t3), 1)); \
+		t2 = _mm_aeskeygenassist_si128(t3, rcon2); \
+		expand_step192(&t1, &t2, &t3); \
+	} while (0)
+
+#define KEXP256(i, rcon)   do { \
+		sk[(i) + 0] = t3; \
+		t2 = _mm_aeskeygenassist_si128(t3, rcon); \
+		expand_step256_1(&t1, &t2); \
+		sk[(i) + 1] = t1; \
+		expand_step256_2(&t1, &t3); \
+	} while (0)
+
+	kb = key;
+	switch (len) {
+		__m128i t1, t2, t3;
+
+	case 16:
+		t1 = _mm_loadu_si128((const void *)kb);
+		sk[0] = t1;
+		KEXP128(t1,  1, 0x01);
+		KEXP128(t1,  2, 0x02);
+		KEXP128(t1,  3, 0x04);
+		KEXP128(t1,  4, 0x08);
+		KEXP128(t1,  5, 0x10);
+		KEXP128(t1,  6, 0x20);
+		KEXP128(t1,  7, 0x40);
+		KEXP128(t1,  8, 0x80);
+		KEXP128(t1,  9, 0x1B);
+		KEXP128(t1, 10, 0x36);
+		return 10;
+
+	case 24:
+		t1 = _mm_loadu_si128((const void *)kb);
+		t3 = _mm_loadu_si128((const void *)(kb + 8));
+		t3 = _mm_shuffle_epi32(t3, 0x4E);
+		KEXP192(0, 0x01, 0x02);
+		KEXP192(3, 0x04, 0x08);
+		KEXP192(6, 0x10, 0x20);
+		KEXP192(9, 0x40, 0x80);
+		sk[12] = t1;
+		return 12;
+
+	case 32:
+		t1 = _mm_loadu_si128((const void *)kb);
+		t3 = _mm_loadu_si128((const void *)(kb + 16));
+		sk[0] = t1;
+		KEXP256( 1, 0x01);
+		KEXP256( 3, 0x02);
+		KEXP256( 5, 0x04);
+		KEXP256( 7, 0x08);
+		KEXP256( 9, 0x10);
+		KEXP256(11, 0x20);
+		sk[13] = t3;
+		t2 = _mm_aeskeygenassist_si128(t3, 0x40);
+		expand_step256_1(&t1, &t2);
+		sk[14] = t1;
+		return 14;
+
+	default:
+		return 0;
+	}
+
+#undef KEXP128
+#undef KEXP192
+#undef KEXP256
+}
+
+/* see inner.h */
+BR_TARGET("sse2,aes")
+unsigned
+br_aes_x86ni_keysched_enc(unsigned char *skni, const void *key, size_t len)
+{
+	__m128i sk[15];
+	unsigned num_rounds;
+
+	num_rounds = x86ni_keysched(sk, key, len);
+	memcpy(skni, sk, (num_rounds + 1) << 4);
+	return num_rounds;
+}
+
+/* see inner.h */
+BR_TARGET("sse2,aes")
+unsigned
+br_aes_x86ni_keysched_dec(unsigned char *skni, const void *key, size_t len)
+{
+	__m128i sk[15];
+	unsigned u, num_rounds;
+
+	num_rounds = x86ni_keysched(sk, key, len);
+	_mm_storeu_si128((void *)skni, sk[num_rounds]);
+	for (u = 1; u < num_rounds; u ++) {
+		_mm_storeu_si128((void *)(skni + (u << 4)),
+			_mm_aesimc_si128(sk[num_rounds - u]));
+	}
+	_mm_storeu_si128((void *)(skni + (num_rounds << 4)), sk[0]);
+	return num_rounds;
+}
+
+BR_TARGETS_X86_DOWN
+
+#endif
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcdec.c
new file mode 100644
index 00000000..862b1b5b
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcdec.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_ENABLE_INTRINSICS   1
+#include "inner.h"
+
+#if BR_AES_X86NI
+
+/* see bearssl_block.h */
+const br_block_cbcdec_class *
+br_aes_x86ni_cbcdec_get_vtable(void)
+{
+	return br_aes_x86ni_supported() ? &br_aes_x86ni_cbcdec_vtable : NULL;
+}
+
+/* see bearssl_block.h */
+void
+br_aes_x86ni_cbcdec_init(br_aes_x86ni_cbcdec_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_x86ni_cbcdec_vtable;
+	ctx->num_rounds = br_aes_x86ni_keysched_dec(ctx->skey.skni, key, len);
+}
+
+BR_TARGETS_X86_UP
+
+/* see bearssl_block.h */
+BR_TARGET("sse2,aes")
+void
+br_aes_x86ni_cbcdec_run(const br_aes_x86ni_cbcdec_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf;
+	unsigned num_rounds;
+	__m128i sk[15], ivx;
+	unsigned u;
+
+	buf = data;
+	ivx = _mm_loadu_si128(iv);
+	num_rounds = ctx->num_rounds;
+	for (u = 0; u <= num_rounds; u ++) {
+		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
+	}
+	while (len > 0) {
+		__m128i x0, x1, x2, x3, e0, e1, e2, e3;
+
+		x0 = _mm_loadu_si128((void *)(buf +  0));
+		if (len >= 64) {
+			x1 = _mm_loadu_si128((void *)(buf + 16));
+			x2 = _mm_loadu_si128((void *)(buf + 32));
+			x3 = _mm_loadu_si128((void *)(buf + 48));
+		} else {
+			x0 = _mm_loadu_si128((void *)(buf +  0));
+			if (len >= 32) {
+				x1 = _mm_loadu_si128((void *)(buf + 16));
+				if (len >= 48) {
+					x2 = _mm_loadu_si128(
+						(void *)(buf + 32));
+					x3 = x2;
+				} else {
+					x2 = x0;
+					x3 = x1;
+				}
+			} else {
+				x1 = x0;
+				x2 = x0;
+				x3 = x0;
+			}
+		}
+		e0 = x0;
+		e1 = x1;
+		e2 = x2;
+		e3 = x3;
+		x0 = _mm_xor_si128(x0, sk[0]);
+		x1 = _mm_xor_si128(x1, sk[0]);
+		x2 = _mm_xor_si128(x2, sk[0]);
+		x3 = _mm_xor_si128(x3, sk[0]);
+		x0 = _mm_aesdec_si128(x0, sk[1]);
+		x1 = _mm_aesdec_si128(x1, sk[1]);
+		x2 = _mm_aesdec_si128(x2, sk[1]);
+		x3 = _mm_aesdec_si128(x3, sk[1]);
+		x0 = _mm_aesdec_si128(x0, sk[2]);
+		x1 = _mm_aesdec_si128(x1, sk[2]);
+		x2 = _mm_aesdec_si128(x2, sk[2]);
+		x3 = _mm_aesdec_si128(x3, sk[2]);
+		x0 = _mm_aesdec_si128(x0, sk[3]);
+		x1 = _mm_aesdec_si128(x1, sk[3]);
+		x2 = _mm_aesdec_si128(x2, sk[3]);
+		x3 = _mm_aesdec_si128(x3, sk[3]);
+		x0 = _mm_aesdec_si128(x0, sk[4]);
+		x1 = _mm_aesdec_si128(x1, sk[4]);
+		x2 = _mm_aesdec_si128(x2, sk[4]);
+		x3 = _mm_aesdec_si128(x3, sk[4]);
+		x0 = _mm_aesdec_si128(x0, sk[5]);
+		x1 = _mm_aesdec_si128(x1, sk[5]);
+		x2 = _mm_aesdec_si128(x2, sk[5]);
+		x3 = _mm_aesdec_si128(x3, sk[5]);
+		x0 = _mm_aesdec_si128(x0, sk[6]);
+		x1 = _mm_aesdec_si128(x1, sk[6]);
+		x2 = _mm_aesdec_si128(x2, sk[6]);
+		x3 = _mm_aesdec_si128(x3, sk[6]);
+		x0 = _mm_aesdec_si128(x0, sk[7]);
+		x1 = _mm_aesdec_si128(x1, sk[7]);
+		x2 = _mm_aesdec_si128(x2, sk[7]);
+		x3 = _mm_aesdec_si128(x3, sk[7]);
+		x0 = _mm_aesdec_si128(x0, sk[8]);
+		x1 = _mm_aesdec_si128(x1, sk[8]);
+		x2 = _mm_aesdec_si128(x2, sk[8]);
+		x3 = _mm_aesdec_si128(x3, sk[8]);
+		x0 = _mm_aesdec_si128(x0, sk[9]);
+		x1 = _mm_aesdec_si128(x1, sk[9]);
+		x2 = _mm_aesdec_si128(x2, sk[9]);
+		x3 = _mm_aesdec_si128(x3, sk[9]);
+		if (num_rounds == 10) {
+			x0 = _mm_aesdeclast_si128(x0, sk[10]);
+			x1 = _mm_aesdeclast_si128(x1, sk[10]);
+			x2 = _mm_aesdeclast_si128(x2, sk[10]);
+			x3 = _mm_aesdeclast_si128(x3, sk[10]);
+		} else if (num_rounds == 12) {
+			x0 = _mm_aesdec_si128(x0, sk[10]);
+			x1 = _mm_aesdec_si128(x1, sk[10]);
+			x2 = _mm_aesdec_si128(x2, sk[10]);
+			x3 = _mm_aesdec_si128(x3, sk[10]);
+			x0 = _mm_aesdec_si128(x0, sk[11]);
+			x1 = _mm_aesdec_si128(x1, sk[11]);
+			x2 = _mm_aesdec_si128(x2, sk[11]);
+			x3 = _mm_aesdec_si128(x3, sk[11]);
+			x0 = _mm_aesdeclast_si128(x0, sk[12]);
+			x1 = _mm_aesdeclast_si128(x1, sk[12]);
+			x2 = _mm_aesdeclast_si128(x2, sk[12]);
+			x3 = _mm_aesdeclast_si128(x3, sk[12]);
+		} else {
+			x0 = _mm_aesdec_si128(x0, sk[10]);
+			x1 = _mm_aesdec_si128(x1, sk[10]);
+			x2 = _mm_aesdec_si128(x2, sk[10]);
+			x3 = _mm_aesdec_si128(x3, sk[10]);
+			x0 = _mm_aesdec_si128(x0, sk[11]);
+			x1 = _mm_aesdec_si128(x1, sk[11]);
+			x2 = _mm_aesdec_si128(x2, sk[11]);
+			x3 = _mm_aesdec_si128(x3, sk[11]);
+			x0 = _mm_aesdec_si128(x0, sk[12]);
+			x1 = _mm_aesdec_si128(x1, sk[12]);
+			x2 = _mm_aesdec_si128(x2, sk[12]);
+			x3 = _mm_aesdec_si128(x3, sk[12]);
+			x0 = _mm_aesdec_si128(x0, sk[13]);
+			x1 = _mm_aesdec_si128(x1, sk[13]);
+			x2 = _mm_aesdec_si128(x2, sk[13]);
+			x3 = _mm_aesdec_si128(x3, sk[13]);
+			x0 = _mm_aesdeclast_si128(x0, sk[14]);
+			x1 = _mm_aesdeclast_si128(x1, sk[14]);
+			x2 = _mm_aesdeclast_si128(x2, sk[14]);
+			x3 = _mm_aesdeclast_si128(x3, sk[14]);
+		}
+		x0 = _mm_xor_si128(x0, ivx);
+		x1 = _mm_xor_si128(x1, e0);
+		x2 = _mm_xor_si128(x2, e1);
+		x3 = _mm_xor_si128(x3, e2);
+		ivx = e3;
+		_mm_storeu_si128((void *)(buf +  0), x0);
+		if (len >= 64) {
+			_mm_storeu_si128((void *)(buf + 16), x1);
+			_mm_storeu_si128((void *)(buf + 32), x2);
+			_mm_storeu_si128((void *)(buf + 48), x3);
+			buf += 64;
+			len -= 64;
+		} else {
+			if (len >= 32) {
+				_mm_storeu_si128((void *)(buf + 16), x1);
+				if (len >= 48) {
+					_mm_storeu_si128(
+						(void *)(buf + 32), x2);
+				}
+			}
+			break;
+		}
+	}
+	_mm_storeu_si128(iv, ivx);
+}
+
+BR_TARGETS_X86_DOWN
+
+/* see bearssl_block.h */
+const br_block_cbcdec_class br_aes_x86ni_cbcdec_vtable = {
+	sizeof(br_aes_x86ni_cbcdec_keys),
+	16,
+	4,
+	(void (*)(const br_block_cbcdec_class **, const void *, size_t))
+		&br_aes_x86ni_cbcdec_init,
+	(void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t))
+		&br_aes_x86ni_cbcdec_run
+};
+
+#else
+
+/* see bearssl_block.h */
+const br_block_cbcdec_class *
+br_aes_x86ni_cbcdec_get_vtable(void)
+{
+	return NULL;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcenc.c
new file mode 100644
index 00000000..85feecdb
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcenc.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_ENABLE_INTRINSICS   1
+#include "inner.h"
+
+#if BR_AES_X86NI
+
+/* see bearssl_block.h */
+const br_block_cbcenc_class *
+br_aes_x86ni_cbcenc_get_vtable(void)
+{
+	return br_aes_x86ni_supported() ? &br_aes_x86ni_cbcenc_vtable : NULL;
+}
+
+/* see bearssl_block.h */
+void
+br_aes_x86ni_cbcenc_init(br_aes_x86ni_cbcenc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_x86ni_cbcenc_vtable;
+	ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
+}
+
+BR_TARGETS_X86_UP
+
+/* see bearssl_block.h */
+BR_TARGET("sse2,aes")
+void
+br_aes_x86ni_cbcenc_run(const br_aes_x86ni_cbcenc_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf;
+	unsigned num_rounds;
+	__m128i sk[15], ivx;
+	unsigned u;
+
+	buf = data;
+	ivx = _mm_loadu_si128(iv);
+	num_rounds = ctx->num_rounds;
+	for (u = 0; u <= num_rounds; u ++) {
+		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
+	}
+	while (len > 0) {
+		__m128i x;
+
+		x = _mm_xor_si128(_mm_loadu_si128((void *)buf), ivx);
+		x = _mm_xor_si128(x, sk[0]);
+		x = _mm_aesenc_si128(x, sk[1]);
+		x = _mm_aesenc_si128(x, sk[2]);
+		x = _mm_aesenc_si128(x, sk[3]);
+		x = _mm_aesenc_si128(x, sk[4]);
+		x = _mm_aesenc_si128(x, sk[5]);
+		x = _mm_aesenc_si128(x, sk[6]);
+		x = _mm_aesenc_si128(x, sk[7]);
+		x = _mm_aesenc_si128(x, sk[8]);
+		x = _mm_aesenc_si128(x, sk[9]);
+		if (num_rounds == 10) {
+			x = _mm_aesenclast_si128(x, sk[10]);
+		} else if (num_rounds == 12) {
+			x = _mm_aesenc_si128(x, sk[10]);
+			x = _mm_aesenc_si128(x, sk[11]);
+			x = _mm_aesenclast_si128(x, sk[12]);
+		} else {
+			x = _mm_aesenc_si128(x, sk[10]);
+			x = _mm_aesenc_si128(x, sk[11]);
+			x = _mm_aesenc_si128(x, sk[12]);
+			x = _mm_aesenc_si128(x, sk[13]);
+			x = _mm_aesenclast_si128(x, sk[14]);
+		}
+		ivx = x;
+		_mm_storeu_si128((void *)buf, x);
+		buf += 16;
+		len -= 16;
+	}
+	_mm_storeu_si128(iv, ivx);
+}
+
+BR_TARGETS_X86_DOWN
+
+/* see bearssl_block.h */
+const br_block_cbcenc_class br_aes_x86ni_cbcenc_vtable = {
+	sizeof(br_aes_x86ni_cbcenc_keys),
+	16,
+	4,
+	(void (*)(const br_block_cbcenc_class **, const void *, size_t))
+		&br_aes_x86ni_cbcenc_init,
+	(void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t))
+		&br_aes_x86ni_cbcenc_run
+};
+
+#else
+
+/* see bearssl_block.h */
+const br_block_cbcenc_class *
+br_aes_x86ni_cbcenc_get_vtable(void)
+{
+	return NULL;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctr.c b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctr.c
new file mode 100644
index 00000000..1cddd606
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctr.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_ENABLE_INTRINSICS   1
+#include "inner.h"
+
+#if BR_AES_X86NI
+
+/* see bearssl_block.h */
+const br_block_ctr_class *
+br_aes_x86ni_ctr_get_vtable(void)
+{
+	return br_aes_x86ni_supported() ? &br_aes_x86ni_ctr_vtable : NULL;
+}
+
+/* see bearssl_block.h */
+void
+br_aes_x86ni_ctr_init(br_aes_x86ni_ctr_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_x86ni_ctr_vtable;
+	ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
+}
+
+BR_TARGETS_X86_UP
+
+/* see bearssl_block.h */
+BR_TARGET("sse2,sse4.1,aes")
+uint32_t
+br_aes_x86ni_ctr_run(const br_aes_x86ni_ctr_keys *ctx,
+	const void *iv, uint32_t cc, void *data, size_t len)
+{
+	unsigned char *buf;
+	unsigned char ivbuf[16];
+	unsigned num_rounds;
+	__m128i sk[15];
+	__m128i ivx;
+	unsigned u;
+
+	buf = data;
+	memcpy(ivbuf, iv, 12);
+	num_rounds = ctx->num_rounds;
+	for (u = 0; u <= num_rounds; u ++) {
+		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
+	}
+	ivx = _mm_loadu_si128((void *)ivbuf);
+	while (len > 0) {
+		__m128i x0, x1, x2, x3;
+
+		x0 = _mm_insert_epi32(ivx, br_bswap32(cc + 0), 3);
+		x1 = _mm_insert_epi32(ivx, br_bswap32(cc + 1), 3);
+		x2 = _mm_insert_epi32(ivx, br_bswap32(cc + 2), 3);
+		x3 = _mm_insert_epi32(ivx, br_bswap32(cc + 3), 3);
+		x0 = _mm_xor_si128(x0, sk[0]);
+		x1 = _mm_xor_si128(x1, sk[0]);
+		x2 = _mm_xor_si128(x2, sk[0]);
+		x3 = _mm_xor_si128(x3, sk[0]);
+		x0 = _mm_aesenc_si128(x0, sk[1]);
+		x1 = _mm_aesenc_si128(x1, sk[1]);
+		x2 = _mm_aesenc_si128(x2, sk[1]);
+		x3 = _mm_aesenc_si128(x3, sk[1]);
+		x0 = _mm_aesenc_si128(x0, sk[2]);
+		x1 = _mm_aesenc_si128(x1, sk[2]);
+		x2 = _mm_aesenc_si128(x2, sk[2]);
+		x3 = _mm_aesenc_si128(x3, sk[2]);
+		x0 = _mm_aesenc_si128(x0, sk[3]);
+		x1 = _mm_aesenc_si128(x1, sk[3]);
+		x2 = _mm_aesenc_si128(x2, sk[3]);
+		x3 = _mm_aesenc_si128(x3, sk[3]);
+		x0 = _mm_aesenc_si128(x0, sk[4]);
+		x1 = _mm_aesenc_si128(x1, sk[4]);
+		x2 = _mm_aesenc_si128(x2, sk[4]);
+		x3 = _mm_aesenc_si128(x3, sk[4]);
+		x0 = _mm_aesenc_si128(x0, sk[5]);
+		x1 = _mm_aesenc_si128(x1, sk[5]);
+		x2 = _mm_aesenc_si128(x2, sk[5]);
+		x3 = _mm_aesenc_si128(x3, sk[5]);
+		x0 = _mm_aesenc_si128(x0, sk[6]);
+		x1 = _mm_aesenc_si128(x1, sk[6]);
+		x2 = _mm_aesenc_si128(x2, sk[6]);
+		x3 = _mm_aesenc_si128(x3, sk[6]);
+		x0 = _mm_aesenc_si128(x0, sk[7]);
+		x1 = _mm_aesenc_si128(x1, sk[7]);
+		x2 = _mm_aesenc_si128(x2, sk[7]);
+		x3 = _mm_aesenc_si128(x3, sk[7]);
+		x0 = _mm_aesenc_si128(x0, sk[8]);
+		x1 = _mm_aesenc_si128(x1, sk[8]);
+		x2 = _mm_aesenc_si128(x2, sk[8]);
+		x3 = _mm_aesenc_si128(x3, sk[8]);
+		x0 = _mm_aesenc_si128(x0, sk[9]);
+		x1 = _mm_aesenc_si128(x1, sk[9]);
+		x2 = _mm_aesenc_si128(x2, sk[9]);
+		x3 = _mm_aesenc_si128(x3, sk[9]);
+		if (num_rounds == 10) {
+			x0 = _mm_aesenclast_si128(x0, sk[10]);
+			x1 = _mm_aesenclast_si128(x1, sk[10]);
+			x2 = _mm_aesenclast_si128(x2, sk[10]);
+			x3 = _mm_aesenclast_si128(x3, sk[10]);
+		} else if (num_rounds == 12) {
+			x0 = _mm_aesenc_si128(x0, sk[10]);
+			x1 = _mm_aesenc_si128(x1, sk[10]);
+			x2 = _mm_aesenc_si128(x2, sk[10]);
+			x3 = _mm_aesenc_si128(x3, sk[10]);
+			x0 = _mm_aesenc_si128(x0, sk[11]);
+			x1 = _mm_aesenc_si128(x1, sk[11]);
+			x2 = _mm_aesenc_si128(x2, sk[11]);
+			x3 = _mm_aesenc_si128(x3, sk[11]);
+			x0 = _mm_aesenclast_si128(x0, sk[12]);
+			x1 = _mm_aesenclast_si128(x1, sk[12]);
+			x2 = _mm_aesenclast_si128(x2, sk[12]);
+			x3 = _mm_aesenclast_si128(x3, sk[12]);
+		} else {
+			x0 = _mm_aesenc_si128(x0, sk[10]);
+			x1 = _mm_aesenc_si128(x1, sk[10]);
+			x2 = _mm_aesenc_si128(x2, sk[10]);
+			x3 = _mm_aesenc_si128(x3, sk[10]);
+			x0 = _mm_aesenc_si128(x0, sk[11]);
+			x1 = _mm_aesenc_si128(x1, sk[11]);
+			x2 = _mm_aesenc_si128(x2, sk[11]);
+			x3 = _mm_aesenc_si128(x3, sk[11]);
+			x0 = _mm_aesenc_si128(x0, sk[12]);
+			x1 = _mm_aesenc_si128(x1, sk[12]);
+			x2 = _mm_aesenc_si128(x2, sk[12]);
+			x3 = _mm_aesenc_si128(x3, sk[12]);
+			x0 = _mm_aesenc_si128(x0, sk[13]);
+			x1 = _mm_aesenc_si128(x1, sk[13]);
+			x2 = _mm_aesenc_si128(x2, sk[13]);
+			x3 = _mm_aesenc_si128(x3, sk[13]);
+			x0 = _mm_aesenclast_si128(x0, sk[14]);
+			x1 = _mm_aesenclast_si128(x1, sk[14]);
+			x2 = _mm_aesenclast_si128(x2, sk[14]);
+			x3 = _mm_aesenclast_si128(x3, sk[14]);
+		}
+		if (len >= 64) {
+			x0 = _mm_xor_si128(x0,
+				_mm_loadu_si128((void *)(buf +  0)));
+			x1 = _mm_xor_si128(x1,
+				_mm_loadu_si128((void *)(buf + 16)));
+			x2 = _mm_xor_si128(x2,
+				_mm_loadu_si128((void *)(buf + 32)));
+			x3 = _mm_xor_si128(x3,
+				_mm_loadu_si128((void *)(buf + 48)));
+			_mm_storeu_si128((void *)(buf +  0), x0);
+			_mm_storeu_si128((void *)(buf + 16), x1);
+			_mm_storeu_si128((void *)(buf + 32), x2);
+			_mm_storeu_si128((void *)(buf + 48), x3);
+			buf += 64;
+			len -= 64;
+			cc += 4;
+		} else {
+			unsigned char tmp[64];
+
+			_mm_storeu_si128((void *)(tmp +  0), x0);
+			_mm_storeu_si128((void *)(tmp + 16), x1);
+			_mm_storeu_si128((void *)(tmp + 32), x2);
+			_mm_storeu_si128((void *)(tmp + 48), x3);
+			for (u = 0; u < len; u ++) {
+				buf[u] ^= tmp[u];
+			}
+			cc += (uint32_t)len >> 4;
+			break;
+		}
+	}
+	return cc;
+}
+
+BR_TARGETS_X86_DOWN
+
+/* see bearssl_block.h */
+const br_block_ctr_class br_aes_x86ni_ctr_vtable = {
+	sizeof(br_aes_x86ni_ctr_keys),
+	16,
+	4,
+	(void (*)(const br_block_ctr_class **, const void *, size_t))
+		&br_aes_x86ni_ctr_init,
+	(uint32_t (*)(const br_block_ctr_class *const *,
+		const void *, uint32_t, void *, size_t))
+		&br_aes_x86ni_ctr_run
+};
+
+#else
+
+/* see bearssl_block.h */
+const br_block_ctr_class *
+br_aes_x86ni_ctr_get_vtable(void)
+{
+	return NULL;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctrcbc.c b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctrcbc.c
new file mode 100644
index 00000000..f57fead6
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctrcbc.c
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_ENABLE_INTRINSICS   1
+#include "inner.h"
+
+#if BR_AES_X86NI
+
+/* see bearssl_block.h */
+const br_block_ctrcbc_class *
+br_aes_x86ni_ctrcbc_get_vtable(void)
+{
+	return br_aes_x86ni_supported() ? &br_aes_x86ni_ctrcbc_vtable : NULL;
+}
+
+/* see bearssl_block.h */
+void
+br_aes_x86ni_ctrcbc_init(br_aes_x86ni_ctrcbc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_aes_x86ni_ctrcbc_vtable;
+	ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
+}
+
+BR_TARGETS_X86_UP
+
+/* see bearssl_block.h */
+BR_TARGET("sse2,sse4.1,aes")
+void
+br_aes_x86ni_ctrcbc_ctr(const br_aes_x86ni_ctrcbc_keys *ctx,
+	void *ctr, void *data, size_t len)
+{
+	unsigned char *buf;
+	unsigned num_rounds;
+	__m128i sk[15];
+	__m128i ivx0, ivx1, ivx2, ivx3;
+	__m128i erev, zero, one, four, notthree;
+	unsigned u;
+
+	buf = data;
+	num_rounds = ctx->num_rounds;
+	for (u = 0; u <= num_rounds; u ++) {
+		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
+	}
+
+	/*
+	 * Some SSE2 constants.
+	 */
+	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
+		8, 9, 10, 11, 12, 13, 14, 15);
+	zero = _mm_setzero_si128();
+	one = _mm_set_epi64x(0, 1);
+	four = _mm_set_epi64x(0, 4);
+	notthree = _mm_sub_epi64(zero, four);
+
+	/*
+	 * Decode the counter in big-endian and pre-increment the other
+	 * three counters.
+	 */
+	ivx0 = _mm_shuffle_epi8(_mm_loadu_si128((void *)ctr), erev);
+	ivx1 = _mm_add_epi64(ivx0, one);
+	ivx1 = _mm_sub_epi64(ivx1,
+		_mm_slli_si128(_mm_cmpeq_epi64(ivx1, zero), 8));
+	ivx2 = _mm_add_epi64(ivx1, one);
+	ivx2 = _mm_sub_epi64(ivx2,
+		_mm_slli_si128(_mm_cmpeq_epi64(ivx2, zero), 8));
+	ivx3 = _mm_add_epi64(ivx2, one);
+	ivx3 = _mm_sub_epi64(ivx3,
+		_mm_slli_si128(_mm_cmpeq_epi64(ivx3, zero), 8));
+	while (len > 0) {
+		__m128i x0, x1, x2, x3;
+
+		/*
+		 * Load counter values; we need to byteswap them because
+		 * the specification says that they use big-endian.
+		 */
+		x0 = _mm_shuffle_epi8(ivx0, erev);
+		x1 = _mm_shuffle_epi8(ivx1, erev);
+		x2 = _mm_shuffle_epi8(ivx2, erev);
+		x3 = _mm_shuffle_epi8(ivx3, erev);
+
+		x0 = _mm_xor_si128(x0, sk[0]);
+		x1 = _mm_xor_si128(x1, sk[0]);
+		x2 = _mm_xor_si128(x2, sk[0]);
+		x3 = _mm_xor_si128(x3, sk[0]);
+		x0 = _mm_aesenc_si128(x0, sk[1]);
+		x1 = _mm_aesenc_si128(x1, sk[1]);
+		x2 = _mm_aesenc_si128(x2, sk[1]);
+		x3 = _mm_aesenc_si128(x3, sk[1]);
+		x0 = _mm_aesenc_si128(x0, sk[2]);
+		x1 = _mm_aesenc_si128(x1, sk[2]);
+		x2 = _mm_aesenc_si128(x2, sk[2]);
+		x3 = _mm_aesenc_si128(x3, sk[2]);
+		x0 = _mm_aesenc_si128(x0, sk[3]);
+		x1 = _mm_aesenc_si128(x1, sk[3]);
+		x2 = _mm_aesenc_si128(x2, sk[3]);
+		x3 = _mm_aesenc_si128(x3, sk[3]);
+		x0 = _mm_aesenc_si128(x0, sk[4]);
+		x1 = _mm_aesenc_si128(x1, sk[4]);
+		x2 = _mm_aesenc_si128(x2, sk[4]);
+		x3 = _mm_aesenc_si128(x3, sk[4]);
+		x0 = _mm_aesenc_si128(x0, sk[5]);
+		x1 = _mm_aesenc_si128(x1, sk[5]);
+		x2 = _mm_aesenc_si128(x2, sk[5]);
+		x3 = _mm_aesenc_si128(x3, sk[5]);
+		x0 = _mm_aesenc_si128(x0, sk[6]);
+		x1 = _mm_aesenc_si128(x1, sk[6]);
+		x2 = _mm_aesenc_si128(x2, sk[6]);
+		x3 = _mm_aesenc_si128(x3, sk[6]);
+		x0 = _mm_aesenc_si128(x0, sk[7]);
+		x1 = _mm_aesenc_si128(x1, sk[7]);
+		x2 = _mm_aesenc_si128(x2, sk[7]);
+		x3 = _mm_aesenc_si128(x3, sk[7]);
+		x0 = _mm_aesenc_si128(x0, sk[8]);
+		x1 = _mm_aesenc_si128(x1, sk[8]);
+		x2 = _mm_aesenc_si128(x2, sk[8]);
+		x3 = _mm_aesenc_si128(x3, sk[8]);
+		x0 = _mm_aesenc_si128(x0, sk[9]);
+		x1 = _mm_aesenc_si128(x1, sk[9]);
+		x2 = _mm_aesenc_si128(x2, sk[9]);
+		x3 = _mm_aesenc_si128(x3, sk[9]);
+		if (num_rounds == 10) {
+			x0 = _mm_aesenclast_si128(x0, sk[10]);
+			x1 = _mm_aesenclast_si128(x1, sk[10]);
+			x2 = _mm_aesenclast_si128(x2, sk[10]);
+			x3 = _mm_aesenclast_si128(x3, sk[10]);
+		} else if (num_rounds == 12) {
+			x0 = _mm_aesenc_si128(x0, sk[10]);
+			x1 = _mm_aesenc_si128(x1, sk[10]);
+			x2 = _mm_aesenc_si128(x2, sk[10]);
+			x3 = _mm_aesenc_si128(x3, sk[10]);
+			x0 = _mm_aesenc_si128(x0, sk[11]);
+			x1 = _mm_aesenc_si128(x1, sk[11]);
+			x2 = _mm_aesenc_si128(x2, sk[11]);
+			x3 = _mm_aesenc_si128(x3, sk[11]);
+			x0 = _mm_aesenclast_si128(x0, sk[12]);
+			x1 = _mm_aesenclast_si128(x1, sk[12]);
+			x2 = _mm_aesenclast_si128(x2, sk[12]);
+			x3 = _mm_aesenclast_si128(x3, sk[12]);
+		} else {
+			x0 = _mm_aesenc_si128(x0, sk[10]);
+			x1 = _mm_aesenc_si128(x1, sk[10]);
+			x2 = _mm_aesenc_si128(x2, sk[10]);
+			x3 = _mm_aesenc_si128(x3, sk[10]);
+			x0 = _mm_aesenc_si128(x0, sk[11]);
+			x1 = _mm_aesenc_si128(x1, sk[11]);
+			x2 = _mm_aesenc_si128(x2, sk[11]);
+			x3 = _mm_aesenc_si128(x3, sk[11]);
+			x0 = _mm_aesenc_si128(x0, sk[12]);
+			x1 = _mm_aesenc_si128(x1, sk[12]);
+			x2 = _mm_aesenc_si128(x2, sk[12]);
+			x3 = _mm_aesenc_si128(x3, sk[12]);
+			x0 = _mm_aesenc_si128(x0, sk[13]);
+			x1 = _mm_aesenc_si128(x1, sk[13]);
+			x2 = _mm_aesenc_si128(x2, sk[13]);
+			x3 = _mm_aesenc_si128(x3, sk[13]);
+			x0 = _mm_aesenclast_si128(x0, sk[14]);
+			x1 = _mm_aesenclast_si128(x1, sk[14]);
+			x2 = _mm_aesenclast_si128(x2, sk[14]);
+			x3 = _mm_aesenclast_si128(x3, sk[14]);
+		}
+		if (len >= 64) {
+			x0 = _mm_xor_si128(x0,
+				_mm_loadu_si128((void *)(buf +  0)));
+			x1 = _mm_xor_si128(x1,
+				_mm_loadu_si128((void *)(buf + 16)));
+			x2 = _mm_xor_si128(x2,
+				_mm_loadu_si128((void *)(buf + 32)));
+			x3 = _mm_xor_si128(x3,
+				_mm_loadu_si128((void *)(buf + 48)));
+			_mm_storeu_si128((void *)(buf +  0), x0);
+			_mm_storeu_si128((void *)(buf + 16), x1);
+			_mm_storeu_si128((void *)(buf + 32), x2);
+			_mm_storeu_si128((void *)(buf + 48), x3);
+			buf += 64;
+			len -= 64;
+		} else {
+			unsigned char tmp[64];
+
+			_mm_storeu_si128((void *)(tmp +  0), x0);
+			_mm_storeu_si128((void *)(tmp + 16), x1);
+			_mm_storeu_si128((void *)(tmp + 32), x2);
+			_mm_storeu_si128((void *)(tmp + 48), x3);
+			for (u = 0; u < len; u ++) {
+				buf[u] ^= tmp[u];
+			}
+			switch (len) {
+			case 16:
+				ivx0 = ivx1;
+				break;
+			case 32:
+				ivx0 = ivx2;
+				break;
+			case 48:
+				ivx0 = ivx3;
+				break;
+			}
+			break;
+		}
+
+		/*
+		 * Add 4 to each counter value. For carry propagation
+		 * into the upper 64-bit words, we would need to compare
+		 * the results with 4, but SSE2+ has only _signed_
+		 * comparisons. Instead, we mask out the low two bits,
+		 * and check whether the remaining bits are zero.
+		 */
+		ivx0 = _mm_add_epi64(ivx0, four);
+		ivx1 = _mm_add_epi64(ivx1, four);
+		ivx2 = _mm_add_epi64(ivx2, four);
+		ivx3 = _mm_add_epi64(ivx3, four);
+		ivx0 = _mm_sub_epi64(ivx0,
+			_mm_slli_si128(_mm_cmpeq_epi64(
+				_mm_and_si128(ivx0, notthree), zero), 8));
+		ivx1 = _mm_sub_epi64(ivx1,
+			_mm_slli_si128(_mm_cmpeq_epi64(
+				_mm_and_si128(ivx1, notthree), zero), 8));
+		ivx2 = _mm_sub_epi64(ivx2,
+			_mm_slli_si128(_mm_cmpeq_epi64(
+				_mm_and_si128(ivx2, notthree), zero), 8));
+		ivx3 = _mm_sub_epi64(ivx3,
+			_mm_slli_si128(_mm_cmpeq_epi64(
+				_mm_and_si128(ivx3, notthree), zero), 8));
+	}
+
+	/*
+	 * Write back new counter value. The loop took care to put the
+	 * right counter value in ivx0.
+	 */
+	_mm_storeu_si128((void *)ctr, _mm_shuffle_epi8(ivx0, erev));
+}
+
+/* see bearssl_block.h */
+BR_TARGET("sse2,sse4.1,aes")
+void
+br_aes_x86ni_ctrcbc_mac(const br_aes_x86ni_ctrcbc_keys *ctx,
+	void *cbcmac, const void *data, size_t len)
+{
+	const unsigned char *buf;
+	unsigned num_rounds;
+	__m128i sk[15], ivx;
+	unsigned u;
+
+	buf = data;
+	ivx = _mm_loadu_si128(cbcmac);
+	num_rounds = ctx->num_rounds;
+	for (u = 0; u <= num_rounds; u ++) {
+		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
+	}
+	while (len > 0) {
+		__m128i x;
+
+		x = _mm_xor_si128(_mm_loadu_si128((void *)buf), ivx);
+		x = _mm_xor_si128(x, sk[0]);
+		x = _mm_aesenc_si128(x, sk[1]);
+		x = _mm_aesenc_si128(x, sk[2]);
+		x = _mm_aesenc_si128(x, sk[3]);
+		x = _mm_aesenc_si128(x, sk[4]);
+		x = _mm_aesenc_si128(x, sk[5]);
+		x = _mm_aesenc_si128(x, sk[6]);
+		x = _mm_aesenc_si128(x, sk[7]);
+		x = _mm_aesenc_si128(x, sk[8]);
+		x = _mm_aesenc_si128(x, sk[9]);
+		if (num_rounds == 10) {
+			x = _mm_aesenclast_si128(x, sk[10]);
+		} else if (num_rounds == 12) {
+			x = _mm_aesenc_si128(x, sk[10]);
+			x = _mm_aesenc_si128(x, sk[11]);
+			x = _mm_aesenclast_si128(x, sk[12]);
+		} else {
+			x = _mm_aesenc_si128(x, sk[10]);
+			x = _mm_aesenc_si128(x, sk[11]);
+			x = _mm_aesenc_si128(x, sk[12]);
+			x = _mm_aesenc_si128(x, sk[13]);
+			x = _mm_aesenclast_si128(x, sk[14]);
+		}
+		ivx = x;
+		buf += 16;
+		len -= 16;
+	}
+	_mm_storeu_si128(cbcmac, ivx);
+}
+
+/* see bearssl_block.h */
+BR_TARGET("sse2,sse4.1,aes")
+void
+br_aes_x86ni_ctrcbc_encrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
+	void *ctr, void *cbcmac, void *data, size_t len)
+{
+	unsigned char *buf;
+	unsigned num_rounds;
+	__m128i sk[15];
+	__m128i ivx, cmx;
+	__m128i erev, zero, one;
+	unsigned u;
+	int first_iter;
+
+	num_rounds = ctx->num_rounds;
+	for (u = 0; u <= num_rounds; u ++) {
+		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
+	}
+
+	/*
+	 * Some SSE2 constants.
+	 */
+	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
+		8, 9, 10, 11, 12, 13, 14, 15);
+	zero = _mm_setzero_si128();
+	one = _mm_set_epi64x(0, 1);
+
+	/*
+	 * Decode the counter in big-endian.
+	 */
+	ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
+	cmx = _mm_loadu_si128(cbcmac);
+
+	buf = data;
+	first_iter = 1;
+	while (len > 0) {
+		__m128i dx, x0, x1;
+
+		/*
+		 * Load initial values:
+		 *   dx   encrypted block of data
+		 *   x0   counter (for CTR encryption)
+		 *   x1   input for CBC-MAC
+		 */
+		dx = _mm_loadu_si128((void *)buf);
+		x0 = _mm_shuffle_epi8(ivx, erev);
+		x1 = cmx;
+
+		x0 = _mm_xor_si128(x0, sk[0]);
+		x1 = _mm_xor_si128(x1, sk[0]);
+		x0 = _mm_aesenc_si128(x0, sk[1]);
+		x1 = _mm_aesenc_si128(x1, sk[1]);
+		x0 = _mm_aesenc_si128(x0, sk[2]);
+		x1 = _mm_aesenc_si128(x1, sk[2]);
+		x0 = _mm_aesenc_si128(x0, sk[3]);
+		x1 = _mm_aesenc_si128(x1, sk[3]);
+		x0 = _mm_aesenc_si128(x0, sk[4]);
+		x1 = _mm_aesenc_si128(x1, sk[4]);
+		x0 = _mm_aesenc_si128(x0, sk[5]);
+		x1 = _mm_aesenc_si128(x1, sk[5]);
+		x0 = _mm_aesenc_si128(x0, sk[6]);
+		x1 = _mm_aesenc_si128(x1, sk[6]);
+		x0 = _mm_aesenc_si128(x0, sk[7]);
+		x1 = _mm_aesenc_si128(x1, sk[7]);
+		x0 = _mm_aesenc_si128(x0, sk[8]);
+		x1 = _mm_aesenc_si128(x1, sk[8]);
+		x0 = _mm_aesenc_si128(x0, sk[9]);
+		x1 = _mm_aesenc_si128(x1, sk[9]);
+		if (num_rounds == 10) {
+			x0 = _mm_aesenclast_si128(x0, sk[10]);
+			x1 = _mm_aesenclast_si128(x1, sk[10]);
+		} else if (num_rounds == 12) {
+			x0 = _mm_aesenc_si128(x0, sk[10]);
+			x1 = _mm_aesenc_si128(x1, sk[10]);
+			x0 = _mm_aesenc_si128(x0, sk[11]);
+			x1 = _mm_aesenc_si128(x1, sk[11]);
+			x0 = _mm_aesenclast_si128(x0, sk[12]);
+			x1 = _mm_aesenclast_si128(x1, sk[12]);
+		} else {
+			x0 = _mm_aesenc_si128(x0, sk[10]);
+			x1 = _mm_aesenc_si128(x1, sk[10]);
+			x0 = _mm_aesenc_si128(x0, sk[11]);
+			x1 = _mm_aesenc_si128(x1, sk[11]);
+			x0 = _mm_aesenc_si128(x0, sk[12]);
+			x1 = _mm_aesenc_si128(x1, sk[12]);
+			x0 = _mm_aesenc_si128(x0, sk[13]);
+			x1 = _mm_aesenc_si128(x1, sk[13]);
+			x0 = _mm_aesenclast_si128(x0, sk[14]);
+			x1 = _mm_aesenclast_si128(x1, sk[14]);
+		}
+
+		x0 = _mm_xor_si128(x0, dx);
+		if (first_iter) {
+			cmx = _mm_xor_si128(cmx, x0);
+			first_iter = 0;
+		} else {
+			cmx = _mm_xor_si128(x1, x0);
+		}
+		_mm_storeu_si128((void *)buf, x0);
+
+		buf += 16;
+		len -= 16;
+
+		/*
+		 * Increment the counter value.
+		 */
+		ivx = _mm_add_epi64(ivx, one);
+		ivx = _mm_sub_epi64(ivx,
+			_mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));
+
+		/*
+		 * If this was the last iteration, then compute the
+		 * extra block encryption to complete CBC-MAC.
+		 */
+		if (len == 0) {
+			cmx = _mm_xor_si128(cmx, sk[0]);
+			cmx = _mm_aesenc_si128(cmx, sk[1]);
+			cmx = _mm_aesenc_si128(cmx, sk[2]);
+			cmx = _mm_aesenc_si128(cmx, sk[3]);
+			cmx = _mm_aesenc_si128(cmx, sk[4]);
+			cmx = _mm_aesenc_si128(cmx, sk[5]);
+			cmx = _mm_aesenc_si128(cmx, sk[6]);
+			cmx = _mm_aesenc_si128(cmx, sk[7]);
+			cmx = _mm_aesenc_si128(cmx, sk[8]);
+			cmx = _mm_aesenc_si128(cmx, sk[9]);
+			if (num_rounds == 10) {
+				cmx = _mm_aesenclast_si128(cmx, sk[10]);
+			} else if (num_rounds == 12) {
+				cmx = _mm_aesenc_si128(cmx, sk[10]);
+				cmx = _mm_aesenc_si128(cmx, sk[11]);
+				cmx = _mm_aesenclast_si128(cmx, sk[12]);
+			} else {
+				cmx = _mm_aesenc_si128(cmx, sk[10]);
+				cmx = _mm_aesenc_si128(cmx, sk[11]);
+				cmx = _mm_aesenc_si128(cmx, sk[12]);
+				cmx = _mm_aesenc_si128(cmx, sk[13]);
+				cmx = _mm_aesenclast_si128(cmx, sk[14]);
+			}
+			break;
+		}
+	}
+
+	/*
+	 * Write back new counter value and CBC-MAC value.
+	 */
+	_mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
+	_mm_storeu_si128(cbcmac, cmx);
+}
+
+/* see bearssl_block.h */
+BR_TARGET("sse2,sse4.1,aes")
+void
+br_aes_x86ni_ctrcbc_decrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
+	void *ctr, void *cbcmac, void *data, size_t len)
+{
+	unsigned char *buf;
+	unsigned num_rounds;
+	__m128i sk[15];
+	__m128i ivx, cmx;
+	__m128i erev, zero, one;
+	unsigned u;
+
+	num_rounds = ctx->num_rounds;
+	for (u = 0; u <= num_rounds; u ++) {
+		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
+	}
+
+	/*
+	 * Some SSE2 constants.
+	 */
+	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
+		8, 9, 10, 11, 12, 13, 14, 15);
+	zero = _mm_setzero_si128();
+	one = _mm_set_epi64x(0, 1);
+
+	/*
+	 * Decode the counter in big-endian.
+	 */
+	ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
+	cmx = _mm_loadu_si128(cbcmac);
+
+	buf = data;
+	while (len > 0) {
+		__m128i dx, x0, x1;
+
+		/*
+		 * Load initial values:
+		 *   dx   encrypted block of data
+		 *   x0   counter (for CTR encryption)
+		 *   x1   input for CBC-MAC
+		 */
+		dx = _mm_loadu_si128((void *)buf);
+		x0 = _mm_shuffle_epi8(ivx, erev);
+		x1 = _mm_xor_si128(cmx, dx);
+
+		x0 = _mm_xor_si128(x0, sk[0]);
+		x1 = _mm_xor_si128(x1, sk[0]);
+		x0 = _mm_aesenc_si128(x0, sk[1]);
+		x1 = _mm_aesenc_si128(x1, sk[1]);
+		x0 = _mm_aesenc_si128(x0, sk[2]);
+		x1 = _mm_aesenc_si128(x1, sk[2]);
+		x0 = _mm_aesenc_si128(x0, sk[3]);
+		x1 = _mm_aesenc_si128(x1, sk[3]);
+		x0 = _mm_aesenc_si128(x0, sk[4]);
+		x1 = _mm_aesenc_si128(x1, sk[4]);
+		x0 = _mm_aesenc_si128(x0, sk[5]);
+		x1 = _mm_aesenc_si128(x1, sk[5]);
+		x0 = _mm_aesenc_si128(x0, sk[6]);
+		x1 = _mm_aesenc_si128(x1, sk[6]);
+		x0 = _mm_aesenc_si128(x0, sk[7]);
+		x1 = _mm_aesenc_si128(x1, sk[7]);
+		x0 = _mm_aesenc_si128(x0, sk[8]);
+		x1 = _mm_aesenc_si128(x1, sk[8]);
+		x0 = _mm_aesenc_si128(x0, sk[9]);
+		x1 = _mm_aesenc_si128(x1, sk[9]);
+		if (num_rounds == 10) {
+			x0 = _mm_aesenclast_si128(x0, sk[10]);
+			x1 = _mm_aesenclast_si128(x1, sk[10]);
+		} else if (num_rounds == 12) {
+			x0 = _mm_aesenc_si128(x0, sk[10]);
+			x1 = _mm_aesenc_si128(x1, sk[10]);
+			x0 = _mm_aesenc_si128(x0, sk[11]);
+			x1 = _mm_aesenc_si128(x1, sk[11]);
+			x0 = _mm_aesenclast_si128(x0, sk[12]);
+			x1 = _mm_aesenclast_si128(x1, sk[12]);
+		} else {
+			x0 = _mm_aesenc_si128(x0, sk[10]);
+			x1 = _mm_aesenc_si128(x1, sk[10]);
+			x0 = _mm_aesenc_si128(x0, sk[11]);
+			x1 = _mm_aesenc_si128(x1, sk[11]);
+			x0 = _mm_aesenc_si128(x0, sk[12]);
+			x1 = _mm_aesenc_si128(x1, sk[12]);
+			x0 = _mm_aesenc_si128(x0, sk[13]);
+			x1 = _mm_aesenc_si128(x1, sk[13]);
+			x0 = _mm_aesenclast_si128(x0, sk[14]);
+			x1 = _mm_aesenclast_si128(x1, sk[14]);
+		}
+		x0 = _mm_xor_si128(x0, dx);
+		cmx = x1;
+		_mm_storeu_si128((void *)buf, x0);
+
+		buf += 16;
+		len -= 16;
+
+		/*
+		 * Increment the counter value.
+		 */
+		ivx = _mm_add_epi64(ivx, one);
+		ivx = _mm_sub_epi64(ivx,
+			_mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));
+	}
+
+	/*
+	 * Write back new counter value and CBC-MAC value.
+	 */
+	_mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
+	_mm_storeu_si128(cbcmac, cmx);
+}
+
+BR_TARGETS_X86_DOWN
+
+/* see bearssl_block.h */
+const br_block_ctrcbc_class br_aes_x86ni_ctrcbc_vtable = {
+	sizeof(br_aes_x86ni_ctrcbc_keys),
+	16,
+	4,
+	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
+		&br_aes_x86ni_ctrcbc_init,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, void *, size_t))
+		&br_aes_x86ni_ctrcbc_encrypt,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, void *, size_t))
+		&br_aes_x86ni_ctrcbc_decrypt,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, void *, size_t))
+		&br_aes_x86ni_ctrcbc_ctr,
+	(void (*)(const br_block_ctrcbc_class *const *,
+		void *, const void *, size_t))
+		&br_aes_x86ni_ctrcbc_mac
+};
+
+#else
+
+/* see bearssl_block.h */
+const br_block_ctrcbc_class *
+br_aes_x86ni_ctrcbc_get_vtable(void)
+{
+	return NULL;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/symcipher/chacha20_ct.c b/test/monniaux/BearSSL/src/symcipher/chacha20_ct.c
new file mode 100644
index 00000000..9961eb11
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/chacha20_ct.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+uint32_t
+br_chacha20_ct_run(const void *key,
+	const void *iv, uint32_t cc, void *data, size_t len)
+{
+	unsigned char *buf;
+	uint32_t kw[8], ivw[3];
+	size_t u;
+
+	static const uint32_t CW[] = {
+		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+	};
+
+	buf = data;
+	for (u = 0; u < 8; u ++) {
+		kw[u] = br_dec32le((const unsigned char *)key + (u << 2));
+	}
+	for (u = 0; u < 3; u ++) {
+		ivw[u] = br_dec32le((const unsigned char *)iv + (u << 2));
+	}
+	while (len > 0) {
+		uint32_t state[16];
+		int i;
+		size_t clen;
+		unsigned char tmp[64];
+
+		memcpy(&state[0], CW, sizeof CW);
+		memcpy(&state[4], kw, sizeof kw);
+		state[12] = cc;
+		memcpy(&state[13], ivw, sizeof ivw);
+		for (i = 0; i < 10; i ++) {
+
+#define QROUND(a, b, c, d)   do { \
+		state[a] += state[b]; \
+		state[d] ^= state[a]; \
+		state[d] = (state[d] << 16) | (state[d] >> 16); \
+		state[c] += state[d]; \
+		state[b] ^= state[c]; \
+		state[b] = (state[b] << 12) | (state[b] >> 20); \
+		state[a] += state[b]; \
+		state[d] ^= state[a]; \
+		state[d] = (state[d] <<  8) | (state[d] >> 24); \
+		state[c] += state[d]; \
+		state[b] ^= state[c]; \
+		state[b] = (state[b] <<  7) | (state[b] >> 25); \
+	} while (0)
+
+			QROUND( 0,  4,  8, 12);
+			QROUND( 1,  5,  9, 13);
+			QROUND( 2,  6, 10, 14);
+			QROUND( 3,  7, 11, 15);
+			QROUND( 0,  5, 10, 15);
+			QROUND( 1,  6, 11, 12);
+			QROUND( 2,  7,  8, 13);
+			QROUND( 3,  4,  9, 14);
+
+#undef QROUND
+
+		}
+		for (u = 0; u < 4; u ++) {
+			br_enc32le(&tmp[u << 2], state[u] + CW[u]);
+		}
+		for (u = 4; u < 12; u ++) {
+			br_enc32le(&tmp[u << 2], state[u] + kw[u - 4]);
+		}
+		br_enc32le(&tmp[48], state[12] + cc);
+		for (u = 13; u < 16; u ++) {
+			br_enc32le(&tmp[u << 2], state[u] + ivw[u - 13]);
+		}
+
+		clen = len < 64 ? len : 64;
+		for (u = 0; u < clen; u ++) {
+			buf[u] ^= tmp[u];
+		}
+		buf += clen;
+		len -= clen;
+		cc ++;
+	}
+	return cc;
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/chacha20_sse2.c b/test/monniaux/BearSSL/src/symcipher/chacha20_sse2.c
new file mode 100644
index 00000000..92b4a4a8
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/chacha20_sse2.c
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_ENABLE_INTRINSICS   1
+#include "inner.h"
+
+#if BR_SSE2
+
+/*
+ * This file contains a ChaCha20 implementation that leverages SSE2
+ * opcodes for better performance.
+ */
+
+/* see bearssl_block.h */
+br_chacha20_run
+br_chacha20_sse2_get(void)
+{
+	/*
+	 * If using 64-bit mode, then SSE2 opcodes should be automatically
+	 * available, since they are part of the ABI.
+	 *
+	 * In 32-bit mode, we use CPUID to detect the SSE2 feature.
+	 */
+
+#if BR_amd64
+	return &br_chacha20_sse2_run;
+#else
+
+	/*
+	 * SSE2 support is indicated by bit 26 in EDX.
+	 */
+	if (br_cpuid(0, 0, 0, 0x04000000)) {
+		return &br_chacha20_sse2_run;
+	} else {
+		return 0;
+	}
+#endif
+}
+
+BR_TARGETS_X86_UP
+
+/* see bearssl_block.h */
+BR_TARGET("sse2")
+uint32_t
+br_chacha20_sse2_run(const void *key,
+	const void *iv, uint32_t cc, void *data, size_t len)
+{
+	unsigned char *buf;
+	uint32_t ivtmp[4];
+	__m128i kw0, kw1;
+	__m128i iw, cw;
+	__m128i one;
+
+	static const uint32_t CW[] = {
+		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
+	};
+
+	buf = data;
+	kw0 = _mm_loadu_si128(key);
+	kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
+	ivtmp[0] = cc;
+	memcpy(ivtmp + 1, iv, 12);
+	iw = _mm_loadu_si128((const void *)ivtmp);
+	cw = _mm_loadu_si128((const void *)CW);
+	one = _mm_set_epi32(0, 0, 0, 1);
+
+	while (len > 0) {
+		/*
+		 * sj contains state words 4*j to 4*j+3.
+		 */
+		__m128i s0, s1, s2, s3;
+		int i;
+
+		s0 = cw;
+		s1 = kw0;
+		s2 = kw1;
+		s3 = iw;
+		for (i = 0; i < 10; i ++) {
+			/*
+			 * Even round is straightforward application on
+			 * the state words.
+			 */
+			s0 = _mm_add_epi32(s0, s1);
+			s3 = _mm_xor_si128(s3, s0);
+			s3 = _mm_or_si128(
+				_mm_slli_epi32(s3, 16),
+				_mm_srli_epi32(s3, 16));
+
+			s2 = _mm_add_epi32(s2, s3);
+			s1 = _mm_xor_si128(s1, s2);
+			s1 = _mm_or_si128(
+				_mm_slli_epi32(s1, 12),
+				_mm_srli_epi32(s1, 20));
+
+			s0 = _mm_add_epi32(s0, s1);
+			s3 = _mm_xor_si128(s3, s0);
+			s3 = _mm_or_si128(
+				_mm_slli_epi32(s3, 8),
+				_mm_srli_epi32(s3, 24));
+
+			s2 = _mm_add_epi32(s2, s3);
+			s1 = _mm_xor_si128(s1, s2);
+			s1 = _mm_or_si128(
+				_mm_slli_epi32(s1, 7),
+				_mm_srli_epi32(s1, 25));
+
+			/*
+			 * For the odd round, we must rotate some state
+			 * words so that the computations apply on the
+			 * right combinations of words.
+			 */
+			s1 = _mm_shuffle_epi32(s1, 0x39);
+			s2 = _mm_shuffle_epi32(s2, 0x4E);
+			s3 = _mm_shuffle_epi32(s3, 0x93);
+
+			s0 = _mm_add_epi32(s0, s1);
+			s3 = _mm_xor_si128(s3, s0);
+			s3 = _mm_or_si128(
+				_mm_slli_epi32(s3, 16),
+				_mm_srli_epi32(s3, 16));
+
+			s2 = _mm_add_epi32(s2, s3);
+			s1 = _mm_xor_si128(s1, s2);
+			s1 = _mm_or_si128(
+				_mm_slli_epi32(s1, 12),
+				_mm_srli_epi32(s1, 20));
+
+			s0 = _mm_add_epi32(s0, s1);
+			s3 = _mm_xor_si128(s3, s0);
+			s3 = _mm_or_si128(
+				_mm_slli_epi32(s3, 8),
+				_mm_srli_epi32(s3, 24));
+
+			s2 = _mm_add_epi32(s2, s3);
+			s1 = _mm_xor_si128(s1, s2);
+			s1 = _mm_or_si128(
+				_mm_slli_epi32(s1, 7),
+				_mm_srli_epi32(s1, 25));
+
+			/*
+			 * After the odd round, we rotate back the values
+			 * to undo the rotate at the start of the odd round.
+			 */
+			s1 = _mm_shuffle_epi32(s1, 0x93);
+			s2 = _mm_shuffle_epi32(s2, 0x4E);
+			s3 = _mm_shuffle_epi32(s3, 0x39);
+		}
+
+		/*
+		 * Addition with the initial state.
+		 */
+		s0 = _mm_add_epi32(s0, cw);
+		s1 = _mm_add_epi32(s1, kw0);
+		s2 = _mm_add_epi32(s2, kw1);
+		s3 = _mm_add_epi32(s3, iw);
+
+		/*
+		 * Increment block counter.
+		 */
+		iw = _mm_add_epi32(iw, one);
+
+		/*
+		 * XOR final state with the data.
+		 */
+		if (len < 64) {
+			unsigned char tmp[64];
+			size_t u;
+
+			_mm_storeu_si128((void *)(tmp +  0), s0);
+			_mm_storeu_si128((void *)(tmp + 16), s1);
+			_mm_storeu_si128((void *)(tmp + 32), s2);
+			_mm_storeu_si128((void *)(tmp + 48), s3);
+			for (u = 0; u < len; u ++) {
+				buf[u] ^= tmp[u];
+			}
+			break;
+		} else {
+			__m128i b0, b1, b2, b3;
+
+			b0 = _mm_loadu_si128((const void *)(buf +  0));
+			b1 = _mm_loadu_si128((const void *)(buf + 16));
+			b2 = _mm_loadu_si128((const void *)(buf + 32));
+			b3 = _mm_loadu_si128((const void *)(buf + 48));
+			b0 = _mm_xor_si128(b0, s0);
+			b1 = _mm_xor_si128(b1, s1);
+			b2 = _mm_xor_si128(b2, s2);
+			b3 = _mm_xor_si128(b3, s3);
+			_mm_storeu_si128((void *)(buf +  0), b0);
+			_mm_storeu_si128((void *)(buf + 16), b1);
+			_mm_storeu_si128((void *)(buf + 32), b2);
+			_mm_storeu_si128((void *)(buf + 48), b3);
+			buf += 64;
+			len -= 64;
+		}
+	}
+
+	/*
+	 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
+	 * raw SSE2, thus we use _mm_extract_epi16().
+	 */
+	return (uint32_t)_mm_extract_epi16(iw, 0)
+		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
+}
+
+BR_TARGETS_X86_DOWN
+
+#else
+
+/* see bearssl_block.h */
+br_chacha20_run
+br_chacha20_sse2_get(void)
+{
+	return 0;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/symcipher/des_ct.c b/test/monniaux/BearSSL/src/symcipher/des_ct.c
new file mode 100644
index 00000000..581c0ab2
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/des_ct.c
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * During key schedule, we need to apply bit extraction PC-2 then permute
+ * things into our bitslice representation. PC-2 extracts 48 bits out
+ * of two 28-bit words (kl and kr), and we store these bits into two
+ * 32-bit words sk0 and sk1.
+ *
+ *  -- bit 16+x of sk0 comes from bit QL0[x] of kl
+ *  -- bit x of sk0 comes from bit QR0[x] of kr
+ *  -- bit 16+x of sk1 comes from bit QL1[x] of kl
+ *  -- bit x of sk1 comes from bit QR1[x] of kr
+ */
+
+static const unsigned char QL0[] = {
+	17,  4, 27, 23, 13, 22,  7, 18,
+	16, 24,  2, 20,  1,  8, 15, 26
+};
+
+static const unsigned char QR0[] = {
+	25, 19,  9,  1,  5, 11, 23,  8,
+	17,  0, 22,  3,  6, 20, 27, 24
+};
+
+static const unsigned char QL1[] = {
+	28, 28, 14, 11, 28, 28, 25,  0,
+	28, 28,  5,  9, 28, 28, 12, 21
+};
+
+static const unsigned char QR1[] = {
+	28, 28, 15,  4, 28, 28, 26, 16,
+	28, 28, 12,  7, 28, 28, 10, 14
+};
+
+/*
+ * 32-bit rotation. The C compiler is supposed to recognize it as a
+ * rotation and use the local architecture rotation opcode (if available).
+ */
+static inline uint32_t
+rotl(uint32_t x, int n)
+{
+	return (x << n) | (x >> (32 - n));
+}
+
+/*
+ * Compute key schedule for 8 key bytes (produces 32 subkey words).
+ */
+static void
+keysched_unit(uint32_t *skey, const void *key)
+{
+	int i;
+
+	br_des_keysched_unit(skey, key);
+
+	/*
+	 * Apply PC-2 + bitslicing.
+	 */
+	for (i = 0; i < 16; i ++) {
+		uint32_t kl, kr, sk0, sk1;
+		int j;
+
+		kl = skey[(i << 1) + 0];
+		kr = skey[(i << 1) + 1];
+		sk0 = 0;
+		sk1 = 0;
+		for (j = 0; j < 16; j ++) {
+			sk0 <<= 1;
+			sk1 <<= 1;
+			sk0 |= ((kl >> QL0[j]) & (uint32_t)1) << 16;
+			sk0 |= (kr >> QR0[j]) & (uint32_t)1;
+			sk1 |= ((kl >> QL1[j]) & (uint32_t)1) << 16;
+			sk1 |= (kr >> QR1[j]) & (uint32_t)1;
+		}
+
+		skey[(i << 1) + 0] = sk0;
+		skey[(i << 1) + 1] = sk1;
+	}
+
+#if 0
+		/*
+		 * Speed-optimized version for PC-2 + bitslicing.
+		 * (Unused. Kept for reference only.)
+		 */
+		sk0 = kl & (uint32_t)0x00100000;
+		sk0 |= (kl & (uint32_t)0x08008000) << 2;
+		sk0 |= (kl & (uint32_t)0x00400000) << 4;
+		sk0 |= (kl & (uint32_t)0x00800000) << 5;
+		sk0 |= (kl & (uint32_t)0x00040000) << 6;
+		sk0 |= (kl & (uint32_t)0x00010000) << 7;
+		sk0 |= (kl & (uint32_t)0x00000100) << 10;
+		sk0 |= (kl & (uint32_t)0x00022000) << 14;
+		sk0 |= (kl & (uint32_t)0x00000082) << 18;
+		sk0 |= (kl & (uint32_t)0x00000004) << 19;
+		sk0 |= (kl & (uint32_t)0x04000000) >> 10;
+		sk0 |= (kl & (uint32_t)0x00000010) << 26;
+		sk0 |= (kl & (uint32_t)0x01000000) >> 2;
+
+		sk0 |= kr & (uint32_t)0x00000100;
+		sk0 |= (kr & (uint32_t)0x00000008) << 1;
+		sk0 |= (kr & (uint32_t)0x00000200) << 4;
+		sk0 |= rotl(kr & (uint32_t)0x08000021, 6);
+		sk0 |= (kr & (uint32_t)0x01000000) >> 24;
+		sk0 |= (kr & (uint32_t)0x00000002) << 11;
+		sk0 |= (kr & (uint32_t)0x00100000) >> 18;
+		sk0 |= (kr & (uint32_t)0x00400000) >> 17;
+		sk0 |= (kr & (uint32_t)0x00800000) >> 14;
+		sk0 |= (kr & (uint32_t)0x02020000) >> 10;
+		sk0 |= (kr & (uint32_t)0x00080000) >> 5;
+		sk0 |= (kr & (uint32_t)0x00000040) >> 3;
+		sk0 |= (kr & (uint32_t)0x00000800) >> 1;
+
+		sk1 = kl & (uint32_t)0x02000000;
+		sk1 |= (kl & (uint32_t)0x00001000) << 5;
+		sk1 |= (kl & (uint32_t)0x00000200) << 11;
+		sk1 |= (kl & (uint32_t)0x00004000) << 15;
+		sk1 |= (kl & (uint32_t)0x00000020) << 16;
+		sk1 |= (kl & (uint32_t)0x00000800) << 17;
+		sk1 |= (kl & (uint32_t)0x00000001) << 24;
+		sk1 |= (kl & (uint32_t)0x00200000) >> 5;
+
+		sk1 |= (kr & (uint32_t)0x00000010) << 8;
+		sk1 |= (kr & (uint32_t)0x04000000) >> 17;
+		sk1 |= (kr & (uint32_t)0x00004000) >> 14;
+		sk1 |= (kr & (uint32_t)0x00000400) >> 9;
+		sk1 |= (kr & (uint32_t)0x00010000) >> 8;
+		sk1 |= (kr & (uint32_t)0x00001000) >> 7;
+		sk1 |= (kr & (uint32_t)0x00000080) >> 3;
+		sk1 |= (kr & (uint32_t)0x00008000) >> 2;
+#endif
+}
+
+/* see inner.h */
+unsigned
+br_des_ct_keysched(uint32_t *skey, const void *key, size_t key_len)
+{
+	switch (key_len) {
+	case 8:
+		keysched_unit(skey, key);
+		return 1;
+	case 16:
+		keysched_unit(skey, key);
+		keysched_unit(skey + 32, (const unsigned char *)key + 8);
+		br_des_rev_skey(skey + 32);
+		memcpy(skey + 64, skey, 32 * sizeof *skey);
+		return 3;
+	default:
+		keysched_unit(skey, key);
+		keysched_unit(skey + 32, (const unsigned char *)key + 8);
+		br_des_rev_skey(skey + 32);
+		keysched_unit(skey + 64, (const unsigned char *)key + 16);
+		return 3;
+	}
+}
+
+/*
+ * DES confusion function. This function performs expansion E (32 to
+ * 48 bits), XOR with subkey, S-boxes, and permutation P.
+ */
+static inline uint32_t
+Fconf(uint32_t r0, const uint32_t *sk)
+{
+	/*
+	 * Each 6->4 S-box is virtually turned into four 6->1 boxes; we
+	 * thus end up with 32 boxes that we call "T-boxes" here. We will
+	 * evaluate them with bitslice code.
+	 *
+	 * Each T-box is a circuit of multiplexers (sort of) and thus
+	 * takes 70 inputs: the 6 actual T-box inputs, and 64 constants
+	 * that describe the T-box output for all combinations of the
+	 * 6 inputs. With this model, all T-boxes are identical (with
+	 * distinct inputs) and thus can be executed in parallel with
+	 * bitslice code.
+	 *
+	 * T-boxes are numbered from 0 to 31, in least-to-most
+	 * significant order. Thus, S-box S1 corresponds to T-boxes 31,
+	 * 30, 29 and 28, in that order. T-box 'n' is computed with the
+	 * bits at rank 'n' in the 32-bit words.
+	 *
+	 * Words x0 to x5 contain the T-box inputs 0 to 5.
+	 */
+	uint32_t x0, x1, x2, x3, x4, x5, z0;
+	uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9;
+	uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
+	uint32_t y20, y21, y22, y23, y24, y25, y26, y27, y28, y29;
+	uint32_t y30;
+
+	/*
+	 * Spread input bits over the 6 input words x*.
+	 */
+	x1 = r0 & (uint32_t)0x11111111;
+	x2 = (r0 >> 1) & (uint32_t)0x11111111;
+	x3 = (r0 >> 2) & (uint32_t)0x11111111;
+	x4 = (r0 >> 3) & (uint32_t)0x11111111;
+	x1 = (x1 << 4) - x1;
+	x2 = (x2 << 4) - x2;
+	x3 = (x3 << 4) - x3;
+	x4 = (x4 << 4) - x4;
+	x0 = (x4 << 4) | (x4 >> 28);
+	x5 = (x1 >> 4) | (x1 << 28);
+
+	/*
+	 * XOR with the subkey for this round.
+	 */
+	x0 ^= sk[0];
+	x1 ^= sk[1];
+	x2 ^= sk[2];
+	x3 ^= sk[3];
+	x4 ^= sk[4];
+	x5 ^= sk[5];
+
+	/*
+	 * The T-boxes are done in parallel, since they all use a
+	 * "tree of multiplexer". We use "fake multiplexers":
+	 *
+	 *   y = a ^ (x & b)
+	 *
+	 * computes y as either 'a' (if x == 0) or 'a ^ b' (if x == 1).
+	 */
+	y0 = (uint32_t)0xEFA72C4D ^ (x0 & (uint32_t)0xEC7AC69C);
+	y1 = (uint32_t)0xAEAAEDFF ^ (x0 & (uint32_t)0x500FB821);
+	y2 = (uint32_t)0x37396665 ^ (x0 & (uint32_t)0x40EFA809);
+	y3 = (uint32_t)0x68D7B833 ^ (x0 & (uint32_t)0xA5EC0B28);
+	y4 = (uint32_t)0xC9C755BB ^ (x0 & (uint32_t)0x252CF820);
+	y5 = (uint32_t)0x73FC3606 ^ (x0 & (uint32_t)0x40205801);
+	y6 = (uint32_t)0xA2A0A918 ^ (x0 & (uint32_t)0xE220F929);
+	y7 = (uint32_t)0x8222BD90 ^ (x0 & (uint32_t)0x44A3F9E1);
+	y8 = (uint32_t)0xD6B6AC77 ^ (x0 & (uint32_t)0x794F104A);
+	y9 = (uint32_t)0x3069300C ^ (x0 & (uint32_t)0x026F320B);
+	y10 = (uint32_t)0x6CE0D5CC ^ (x0 & (uint32_t)0x7640B01A);
+	y11 = (uint32_t)0x59A9A22D ^ (x0 & (uint32_t)0x238F1572);
+	y12 = (uint32_t)0xAC6D0BD4 ^ (x0 & (uint32_t)0x7A63C083);
+	y13 = (uint32_t)0x21C83200 ^ (x0 & (uint32_t)0x11CCA000);
+	y14 = (uint32_t)0xA0E62188 ^ (x0 & (uint32_t)0x202F69AA);
+	/* y15 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */
+	y16 = (uint32_t)0xAF7D655A ^ (x0 & (uint32_t)0x51B33BE9);
+	y17 = (uint32_t)0xF0168AA3 ^ (x0 & (uint32_t)0x3B0FE8AE);
+	y18 = (uint32_t)0x90AA30C6 ^ (x0 & (uint32_t)0x90BF8816);
+	y19 = (uint32_t)0x5AB2750A ^ (x0 & (uint32_t)0x09E34F9B);
+	y20 = (uint32_t)0x5391BE65 ^ (x0 & (uint32_t)0x0103BE88);
+	y21 = (uint32_t)0x93372BAF ^ (x0 & (uint32_t)0x49AC8E25);
+	y22 = (uint32_t)0xF288210C ^ (x0 & (uint32_t)0x922C313D);
+	y23 = (uint32_t)0x920AF5C0 ^ (x0 & (uint32_t)0x70EF31B0);
+	y24 = (uint32_t)0x63D312C0 ^ (x0 & (uint32_t)0x6A707100);
+	y25 = (uint32_t)0x537B3006 ^ (x0 & (uint32_t)0xB97C9011);
+	y26 = (uint32_t)0xA2EFB0A5 ^ (x0 & (uint32_t)0xA320C959);
+	y27 = (uint32_t)0xBC8F96A5 ^ (x0 & (uint32_t)0x6EA0AB4A);
+	y28 = (uint32_t)0xFAD176A5 ^ (x0 & (uint32_t)0x6953DDF8);
+	y29 = (uint32_t)0x665A14A3 ^ (x0 & (uint32_t)0xF74F3E2B);
+	y30 = (uint32_t)0xF2EFF0CC ^ (x0 & (uint32_t)0xF0306CAD);
+	/* y31 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */
+
+	y0 = y0 ^ (x1 & y1);
+	y1 = y2 ^ (x1 & y3);
+	y2 = y4 ^ (x1 & y5);
+	y3 = y6 ^ (x1 & y7);
+	y4 = y8 ^ (x1 & y9);
+	y5 = y10 ^ (x1 & y11);
+	y6 = y12 ^ (x1 & y13);
+	y7 = y14; /* was: y14 ^ (x1 & y15) */
+	y8 = y16 ^ (x1 & y17);
+	y9 = y18 ^ (x1 & y19);
+	y10 = y20 ^ (x1 & y21);
+	y11 = y22 ^ (x1 & y23);
+	y12 = y24 ^ (x1 & y25);
+	y13 = y26 ^ (x1 & y27);
+	y14 = y28 ^ (x1 & y29);
+	y15 = y30; /* was: y30 ^ (x1 & y31) */
+
+	y0 = y0 ^ (x2 & y1);
+	y1 = y2 ^ (x2 & y3);
+	y2 = y4 ^ (x2 & y5);
+	y3 = y6 ^ (x2 & y7);
+	y4 = y8 ^ (x2 & y9);
+	y5 = y10 ^ (x2 & y11);
+	y6 = y12 ^ (x2 & y13);
+	y7 = y14 ^ (x2 & y15);
+
+	y0 = y0 ^ (x3 & y1);
+	y1 = y2 ^ (x3 & y3);
+	y2 = y4 ^ (x3 & y5);
+	y3 = y6 ^ (x3 & y7);
+
+	y0 = y0 ^ (x4 & y1);
+	y1 = y2 ^ (x4 & y3);
+
+	y0 = y0 ^ (x5 & y1);
+
+	/*
+	 * The P permutation:
+	 * -- Each bit move is converted into a mask + left rotation.
+	 * -- Rotations that use the same movement are coalesced together.
+	 * -- Left and right shifts are used as alternatives to a rotation
+	 * where appropriate (this will help architectures that do not have
+	 * a rotation opcode).
+	 */
+	z0 = (y0 & (uint32_t)0x00000004) << 3;
+	z0 |= (y0 & (uint32_t)0x00004000) << 4;
+	z0 |= rotl(y0 & 0x12020120, 5);
+	z0 |= (y0 & (uint32_t)0x00100000) << 6;
+	z0 |= (y0 & (uint32_t)0x00008000) << 9;
+	z0 |= (y0 & (uint32_t)0x04000000) >> 22;
+	z0 |= (y0 & (uint32_t)0x00000001) << 11;
+	z0 |= rotl(y0 & 0x20000200, 12);
+	z0 |= (y0 & (uint32_t)0x00200000) >> 19;
+	z0 |= (y0 & (uint32_t)0x00000040) << 14;
+	z0 |= (y0 & (uint32_t)0x00010000) << 15;
+	z0 |= (y0 & (uint32_t)0x00000002) << 16;
+	z0 |= rotl(y0 & 0x40801800, 17);
+	z0 |= (y0 & (uint32_t)0x00080000) >> 13;
+	z0 |= (y0 & (uint32_t)0x00000010) << 21;
+	z0 |= (y0 & (uint32_t)0x01000000) >> 10;
+	z0 |= rotl(y0 & 0x88000008, 24);
+	z0 |= (y0 & (uint32_t)0x00000480) >> 7;
+	z0 |= (y0 & (uint32_t)0x00442000) >> 6;
+	return z0;
+}
+
+/*
+ * Process one block through 16 successive rounds, omitting the swap
+ * in the final round.
+ */
+static void
+process_block_unit(uint32_t *pl, uint32_t *pr, const uint32_t *sk_exp)
+{
+	int i;
+	uint32_t l, r;
+
+	l = *pl;
+	r = *pr;
+	for (i = 0; i < 16; i ++) {
+		uint32_t t;
+
+		t = l ^ Fconf(r, sk_exp);
+		l = r;
+		r = t;
+		sk_exp += 6;
+	}
+	*pl = r;
+	*pr = l;
+}
+
+/* see inner.h */
+void
+br_des_ct_process_block(unsigned num_rounds,
+	const uint32_t *sk_exp, void *block)
+{
+	unsigned char *buf;
+	uint32_t l, r;
+
+	buf = block;
+	l = br_dec32be(buf);
+	r = br_dec32be(buf + 4);
+	br_des_do_IP(&l, &r);
+	while (num_rounds -- > 0) {
+		process_block_unit(&l, &r, sk_exp);
+		sk_exp += 96;
+	}
+	br_des_do_invIP(&l, &r);
+	br_enc32be(buf, l);
+	br_enc32be(buf + 4, r);
+}
+
+/* see inner.h */
+void
+br_des_ct_skey_expand(uint32_t *sk_exp,
+	unsigned num_rounds, const uint32_t *skey)
+{
+	num_rounds <<= 4;
+	while (num_rounds -- > 0) {
+		uint32_t v, w0, w1, w2, w3;
+
+		v = *skey ++;
+		w0 = v & 0x11111111;
+		w1 = (v >> 1) & 0x11111111;
+		w2 = (v >> 2) & 0x11111111;
+		w3 = (v >> 3) & 0x11111111;
+		*sk_exp ++ = (w0 << 4) - w0;
+		*sk_exp ++ = (w1 << 4) - w1;
+		*sk_exp ++ = (w2 << 4) - w2;
+		*sk_exp ++ = (w3 << 4) - w3;
+		v = *skey ++;
+		w0 = v & 0x11111111;
+		w1 = (v >> 1) & 0x11111111;
+		*sk_exp ++ = (w0 << 4) - w0;
+		*sk_exp ++ = (w1 << 4) - w1;
+	}
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/des_ct_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/des_ct_cbcdec.c
new file mode 100644
index 00000000..d208a3d2
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/des_ct_cbcdec.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_des_ct_cbcdec_init(br_des_ct_cbcdec_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_des_ct_cbcdec_vtable;
+	ctx->num_rounds = br_des_ct_keysched(ctx->skey, key, len);
+	if (len == 8) {
+		br_des_rev_skey(ctx->skey);
+	} else {
+		int i;
+
+		for (i = 0; i < 48; i += 2) {
+			uint32_t t;
+
+			t = ctx->skey[i];
+			ctx->skey[i] = ctx->skey[94 - i];
+			ctx->skey[94 - i] = t;
+			t = ctx->skey[i + 1];
+			ctx->skey[i + 1] = ctx->skey[95 - i];
+			ctx->skey[95 - i] = t;
+		}
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_des_ct_cbcdec_run(const br_des_ct_cbcdec_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf, *ivbuf;
+	uint32_t sk_exp[288];
+
+	br_des_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+	ivbuf = iv;
+	buf = data;
+	while (len > 0) {
+		unsigned char tmp[8];
+		int i;
+
+		memcpy(tmp, buf, 8);
+		br_des_ct_process_block(ctx->num_rounds, sk_exp, buf);
+		for (i = 0; i < 8; i ++) {
+			buf[i] ^= ivbuf[i];
+		}
+		memcpy(ivbuf, tmp, 8);
+		buf += 8;
+		len -= 8;
+	}
+}
+
+/* see bearssl_block.h */
+const br_block_cbcdec_class br_des_ct_cbcdec_vtable = {
+	sizeof(br_des_ct_cbcdec_keys),
+	8,
+	3,
+	(void (*)(const br_block_cbcdec_class **, const void *, size_t))
+		&br_des_ct_cbcdec_init,
+	(void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t))
+		&br_des_ct_cbcdec_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/des_ct_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/des_ct_cbcenc.c
new file mode 100644
index 00000000..4b3610e0
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/des_ct_cbcenc.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_des_ct_cbcenc_init(br_des_ct_cbcenc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_des_ct_cbcenc_vtable;
+	ctx->num_rounds = br_des_ct_keysched(ctx->skey, key, len);
+}
+
+/* see bearssl_block.h */
+void
+br_des_ct_cbcenc_run(const br_des_ct_cbcenc_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf, *ivbuf;
+	uint32_t sk_exp[288];
+
+	br_des_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey);
+	ivbuf = iv;
+	buf = data;
+	while (len > 0) {
+		int i;
+
+		for (i = 0; i < 8; i ++) {
+			buf[i] ^= ivbuf[i];
+		}
+		br_des_ct_process_block(ctx->num_rounds, sk_exp, buf);
+		memcpy(ivbuf, buf, 8);
+		buf += 8;
+		len -= 8;
+	}
+}
+
+/* see bearssl_block.h */
+const br_block_cbcenc_class br_des_ct_cbcenc_vtable = {
+	sizeof(br_des_ct_cbcenc_keys),
+	8,
+	3,
+	(void (*)(const br_block_cbcenc_class **, const void *, size_t))
+		&br_des_ct_cbcenc_init,
+	(void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t))
+		&br_des_ct_cbcenc_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/des_support.c b/test/monniaux/BearSSL/src/symcipher/des_support.c
new file mode 100644
index 00000000..37f6db32
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/des_support.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see inner.h */
+void
+br_des_do_IP(uint32_t *xl, uint32_t *xr)
+{
+	/*
+	 * Permutation algorithm is initially from Richard Outerbridge;
+	 * implementation here is adapted from Crypto++ "des.cpp" file
+	 * (which is in public domain).
+	 */
+	uint32_t l, r, t;
+
+	l = *xl;
+	r = *xr;
+	t = ((l >>  4) ^ r) & (uint32_t)0x0F0F0F0F;
+	r ^= t;
+	l ^= t <<  4;
+	t = ((l >> 16) ^ r) & (uint32_t)0x0000FFFF;
+	r ^= t;
+	l ^= t << 16;
+	t = ((r >>  2) ^ l) & (uint32_t)0x33333333;
+	l ^= t;
+	r ^= t <<  2;
+	t = ((r >>  8) ^ l) & (uint32_t)0x00FF00FF;
+	l ^= t;
+	r ^= t <<  8;
+	t = ((l >>  1) ^ r) & (uint32_t)0x55555555;
+	r ^= t;
+	l ^= t <<  1;
+	*xl = l;
+	*xr = r;
+}
+
+/* see inner.h */
+void
+br_des_do_invIP(uint32_t *xl, uint32_t *xr)
+{
+	/*
+	 * See br_des_do_IP().
+	 */
+	uint32_t l, r, t;
+
+	l = *xl;
+	r = *xr;
+	t = ((l >>  1) ^ r) & 0x55555555;
+	r ^= t;
+	l ^= t <<  1;
+	t = ((r >>  8) ^ l) & 0x00FF00FF;
+	l ^= t;
+	r ^= t <<  8;
+	t = ((r >>  2) ^ l) & 0x33333333;
+	l ^= t;
+	r ^= t <<  2;
+	t = ((l >> 16) ^ r) & 0x0000FFFF;
+	r ^= t;
+	l ^= t << 16;
+	t = ((l >>  4) ^ r) & 0x0F0F0F0F;
+	r ^= t;
+	l ^= t <<  4;
+	*xl = l;
+	*xr = r;
+}
+
+/* see inner.h */
+void
+br_des_keysched_unit(uint32_t *skey, const void *key)
+{
+	uint32_t xl, xr, kl, kr;
+	int i;
+
+	xl = br_dec32be(key);
+	xr = br_dec32be((const unsigned char *)key + 4);
+
+	/*
+	 * Permutation PC-1 is quite similar to the IP permutation.
+	 * Definition of IP (in FIPS 46-3 notations) is:
+	 *   58 50 42 34 26 18 10 2
+	 *   60 52 44 36 28 20 12 4
+	 *   62 54 46 38 30 22 14 6
+	 *   64 56 48 40 32 24 16 8
+	 *   57 49 41 33 25 17  9 1
+	 *   59 51 43 35 27 19 11 3
+	 *   61 53 45 37 29 21 13 5
+	 *   63 55 47 39 31 23 15 7
+	 *
+	 * Definition of PC-1 is:
+	 *   57 49 41 33 25 17  9 1
+	 *   58 50 42 34 26 18 10 2
+	 *   59 51 43 35 27 19 11 3
+	 *   60 52 44 36
+	 *   63 55 47 39 31 23 15 7
+	 *   62 54 46 38 30 22 14 6
+	 *   61 53 45 37 29 21 13 5
+	 *   28 20 12  4
+	 */
+	br_des_do_IP(&xl, &xr);
+	kl = ((xr & (uint32_t)0xFF000000) >> 4)
+		| ((xl & (uint32_t)0xFF000000) >> 12)
+		| ((xr & (uint32_t)0x00FF0000) >> 12)
+		| ((xl & (uint32_t)0x00FF0000) >> 20);
+	kr = ((xr & (uint32_t)0x000000FF) << 20)
+		| ((xl & (uint32_t)0x0000FF00) << 4)
+		| ((xr & (uint32_t)0x0000FF00) >> 4)
+		| ((xl & (uint32_t)0x000F0000) >> 16);
+
+	/*
+	 * For each round, rotate the two 28-bit words kl and kr.
+	 * The extraction of the 48-bit subkey (PC-2) is not done yet.
+	 */
+	for (i = 0; i < 16; i ++) {
+		if ((1 << i) & 0x8103) {
+			kl = (kl << 1) | (kl >> 27);
+			kr = (kr << 1) | (kr >> 27);
+		} else {
+			kl = (kl << 2) | (kl >> 26);
+			kr = (kr << 2) | (kr >> 26);
+		}
+		kl &= (uint32_t)0x0FFFFFFF;
+		kr &= (uint32_t)0x0FFFFFFF;
+		skey[(i << 1) + 0] = kl;
+		skey[(i << 1) + 1] = kr;
+	}
+}
+
+/* see inner.h */
+void
+br_des_rev_skey(uint32_t *skey)
+{
+	int i;
+
+	for (i = 0; i < 16; i += 2) {
+		uint32_t t;
+
+		t = skey[i + 0];
+		skey[i + 0] = skey[30 - i];
+		skey[30 - i] = t;
+		t = skey[i + 1];
+		skey[i + 1] = skey[31 - i];
+		skey[31 - i] = t;
+	}
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/des_tab.c b/test/monniaux/BearSSL/src/symcipher/des_tab.c
new file mode 100644
index 00000000..3f8e4f9f
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/des_tab.c
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * PC2left[x] tells where bit x goes when applying PC-2. 'x' is a bit
+ * position in the left rotated key word. Both position are in normal
+ * order (rightmost bit is 0).
+ */
+static const unsigned char PC2left[] = {
+	16,  3,  7, 24, 20, 11, 24,
+	13,  2, 10, 24, 22,  5, 15,
+	23,  1,  9, 21, 12, 24,  6,
+	 4, 14, 18,  8, 17,  0, 19
+};
+
+/*
+ * Similar to PC2left[x], for the right rotated key word.
+ */
+static const unsigned char PC2right[] = {
+	 8, 18, 24,  6, 22, 15,  3,
+	10, 12, 19,  5, 14, 11, 24,
+	 4, 23, 16,  9, 24, 20,  2,
+	24,  7, 13,  0, 21, 17,  1
+};
+
+/*
+ * S-boxes and PC-1 merged.
+ */
+static const uint32_t S1[] = {
+	0x00808200, 0x00000000, 0x00008000, 0x00808202,
+	0x00808002, 0x00008202, 0x00000002, 0x00008000,
+	0x00000200, 0x00808200, 0x00808202, 0x00000200,
+	0x00800202, 0x00808002, 0x00800000, 0x00000002,
+	0x00000202, 0x00800200, 0x00800200, 0x00008200,
+	0x00008200, 0x00808000, 0x00808000, 0x00800202,
+	0x00008002, 0x00800002, 0x00800002, 0x00008002,
+	0x00000000, 0x00000202, 0x00008202, 0x00800000,
+	0x00008000, 0x00808202, 0x00000002, 0x00808000,
+	0x00808200, 0x00800000, 0x00800000, 0x00000200,
+	0x00808002, 0x00008000, 0x00008200, 0x00800002,
+	0x00000200, 0x00000002, 0x00800202, 0x00008202,
+	0x00808202, 0x00008002, 0x00808000, 0x00800202,
+	0x00800002, 0x00000202, 0x00008202, 0x00808200,
+	0x00000202, 0x00800200, 0x00800200, 0x00000000,
+	0x00008002, 0x00008200, 0x00000000, 0x00808002
+};
+
+static const uint32_t S2[] = {
+	0x40084010, 0x40004000, 0x00004000, 0x00084010,
+	0x00080000, 0x00000010, 0x40080010, 0x40004010,
+	0x40000010, 0x40084010, 0x40084000, 0x40000000,
+	0x40004000, 0x00080000, 0x00000010, 0x40080010,
+	0x00084000, 0x00080010, 0x40004010, 0x00000000,
+	0x40000000, 0x00004000, 0x00084010, 0x40080000,
+	0x00080010, 0x40000010, 0x00000000, 0x00084000,
+	0x00004010, 0x40084000, 0x40080000, 0x00004010,
+	0x00000000, 0x00084010, 0x40080010, 0x00080000,
+	0x40004010, 0x40080000, 0x40084000, 0x00004000,
+	0x40080000, 0x40004000, 0x00000010, 0x40084010,
+	0x00084010, 0x00000010, 0x00004000, 0x40000000,
+	0x00004010, 0x40084000, 0x00080000, 0x40000010,
+	0x00080010, 0x40004010, 0x40000010, 0x00080010,
+	0x00084000, 0x00000000, 0x40004000, 0x00004010,
+	0x40000000, 0x40080010, 0x40084010, 0x00084000
+};
+
+static const uint32_t S3[] = {
+	0x00000104, 0x04010100, 0x00000000, 0x04010004,
+	0x04000100, 0x00000000, 0x00010104, 0x04000100,
+	0x00010004, 0x04000004, 0x04000004, 0x00010000,
+	0x04010104, 0x00010004, 0x04010000, 0x00000104,
+	0x04000000, 0x00000004, 0x04010100, 0x00000100,
+	0x00010100, 0x04010000, 0x04010004, 0x00010104,
+	0x04000104, 0x00010100, 0x00010000, 0x04000104,
+	0x00000004, 0x04010104, 0x00000100, 0x04000000,
+	0x04010100, 0x04000000, 0x00010004, 0x00000104,
+	0x00010000, 0x04010100, 0x04000100, 0x00000000,
+	0x00000100, 0x00010004, 0x04010104, 0x04000100,
+	0x04000004, 0x00000100, 0x00000000, 0x04010004,
+	0x04000104, 0x00010000, 0x04000000, 0x04010104,
+	0x00000004, 0x00010104, 0x00010100, 0x04000004,
+	0x04010000, 0x04000104, 0x00000104, 0x04010000,
+	0x00010104, 0x00000004, 0x04010004, 0x00010100
+};
+
+static const uint32_t S4[] = {
+	0x80401000, 0x80001040, 0x80001040, 0x00000040,
+	0x00401040, 0x80400040, 0x80400000, 0x80001000,
+	0x00000000, 0x00401000, 0x00401000, 0x80401040,
+	0x80000040, 0x00000000, 0x00400040, 0x80400000,
+	0x80000000, 0x00001000, 0x00400000, 0x80401000,
+	0x00000040, 0x00400000, 0x80001000, 0x00001040,
+	0x80400040, 0x80000000, 0x00001040, 0x00400040,
+	0x00001000, 0x00401040, 0x80401040, 0x80000040,
+	0x00400040, 0x80400000, 0x00401000, 0x80401040,
+	0x80000040, 0x00000000, 0x00000000, 0x00401000,
+	0x00001040, 0x00400040, 0x80400040, 0x80000000,
+	0x80401000, 0x80001040, 0x80001040, 0x00000040,
+	0x80401040, 0x80000040, 0x80000000, 0x00001000,
+	0x80400000, 0x80001000, 0x00401040, 0x80400040,
+	0x80001000, 0x00001040, 0x00400000, 0x80401000,
+	0x00000040, 0x00400000, 0x00001000, 0x00401040
+};
+
+static const uint32_t S5[] = {
+	0x00000080, 0x01040080, 0x01040000, 0x21000080,
+	0x00040000, 0x00000080, 0x20000000, 0x01040000,
+	0x20040080, 0x00040000, 0x01000080, 0x20040080,
+	0x21000080, 0x21040000, 0x00040080, 0x20000000,
+	0x01000000, 0x20040000, 0x20040000, 0x00000000,
+	0x20000080, 0x21040080, 0x21040080, 0x01000080,
+	0x21040000, 0x20000080, 0x00000000, 0x21000000,
+	0x01040080, 0x01000000, 0x21000000, 0x00040080,
+	0x00040000, 0x21000080, 0x00000080, 0x01000000,
+	0x20000000, 0x01040000, 0x21000080, 0x20040080,
+	0x01000080, 0x20000000, 0x21040000, 0x01040080,
+	0x20040080, 0x00000080, 0x01000000, 0x21040000,
+	0x21040080, 0x00040080, 0x21000000, 0x21040080,
+	0x01040000, 0x00000000, 0x20040000, 0x21000000,
+	0x00040080, 0x01000080, 0x20000080, 0x00040000,
+	0x00000000, 0x20040000, 0x01040080, 0x20000080
+};
+
+static const uint32_t S6[] = {
+	0x10000008, 0x10200000, 0x00002000, 0x10202008,
+	0x10200000, 0x00000008, 0x10202008, 0x00200000,
+	0x10002000, 0x00202008, 0x00200000, 0x10000008,
+	0x00200008, 0x10002000, 0x10000000, 0x00002008,
+	0x00000000, 0x00200008, 0x10002008, 0x00002000,
+	0x00202000, 0x10002008, 0x00000008, 0x10200008,
+	0x10200008, 0x00000000, 0x00202008, 0x10202000,
+	0x00002008, 0x00202000, 0x10202000, 0x10000000,
+	0x10002000, 0x00000008, 0x10200008, 0x00202000,
+	0x10202008, 0x00200000, 0x00002008, 0x10000008,
+	0x00200000, 0x10002000, 0x10000000, 0x00002008,
+	0x10000008, 0x10202008, 0x00202000, 0x10200000,
+	0x00202008, 0x10202000, 0x00000000, 0x10200008,
+	0x00000008, 0x00002000, 0x10200000, 0x00202008,
+	0x00002000, 0x00200008, 0x10002008, 0x00000000,
+	0x10202000, 0x10000000, 0x00200008, 0x10002008
+};
+
+static const uint32_t S7[] = {
+	0x00100000, 0x02100001, 0x02000401, 0x00000000,
+	0x00000400, 0x02000401, 0x00100401, 0x02100400,
+	0x02100401, 0x00100000, 0x00000000, 0x02000001,
+	0x00000001, 0x02000000, 0x02100001, 0x00000401,
+	0x02000400, 0x00100401, 0x00100001, 0x02000400,
+	0x02000001, 0x02100000, 0x02100400, 0x00100001,
+	0x02100000, 0x00000400, 0x00000401, 0x02100401,
+	0x00100400, 0x00000001, 0x02000000, 0x00100400,
+	0x02000000, 0x00100400, 0x00100000, 0x02000401,
+	0x02000401, 0x02100001, 0x02100001, 0x00000001,
+	0x00100001, 0x02000000, 0x02000400, 0x00100000,
+	0x02100400, 0x00000401, 0x00100401, 0x02100400,
+	0x00000401, 0x02000001, 0x02100401, 0x02100000,
+	0x00100400, 0x00000000, 0x00000001, 0x02100401,
+	0x00000000, 0x00100401, 0x02100000, 0x00000400,
+	0x02000001, 0x02000400, 0x00000400, 0x00100001
+};
+
+static const uint32_t S8[] = {
+	0x08000820, 0x00000800, 0x00020000, 0x08020820,
+	0x08000000, 0x08000820, 0x00000020, 0x08000000,
+	0x00020020, 0x08020000, 0x08020820, 0x00020800,
+	0x08020800, 0x00020820, 0x00000800, 0x00000020,
+	0x08020000, 0x08000020, 0x08000800, 0x00000820,
+	0x00020800, 0x00020020, 0x08020020, 0x08020800,
+	0x00000820, 0x00000000, 0x00000000, 0x08020020,
+	0x08000020, 0x08000800, 0x00020820, 0x00020000,
+	0x00020820, 0x00020000, 0x08020800, 0x00000800,
+	0x00000020, 0x08020020, 0x00000800, 0x00020820,
+	0x08000800, 0x00000020, 0x08000020, 0x08020000,
+	0x08020020, 0x08000000, 0x00020000, 0x08000820,
+	0x00000000, 0x08020820, 0x00020020, 0x08000020,
+	0x08020000, 0x08000800, 0x08000820, 0x00000000,
+	0x08020820, 0x00020800, 0x00020800, 0x00000820,
+	0x00000820, 0x00020020, 0x08000000, 0x08020800
+};
+
+static inline uint32_t
+Fconf(uint32_t r0, uint32_t skl, uint32_t skr)
+{
+	uint32_t r1;
+
+	r1 = (r0 << 16) | (r0 >> 16);
+	return
+		  S1[((r1 >> 11) ^ (skl >> 18)) & 0x3F]
+		| S2[((r0 >> 23) ^ (skl >> 12)) & 0x3F]
+		| S3[((r0 >> 19) ^ (skl >>  6)) & 0x3F]
+		| S4[((r0 >> 15) ^ (skl      )) & 0x3F]
+		| S5[((r0 >> 11) ^ (skr >> 18)) & 0x3F]
+		| S6[((r0 >>  7) ^ (skr >> 12)) & 0x3F]
+		| S7[((r0 >>  3) ^ (skr >>  6)) & 0x3F]
+		| S8[((r1 >> 15) ^ (skr      )) & 0x3F];
+}
+
+static void
+process_block_unit(uint32_t *pl, uint32_t *pr, const uint32_t *skey)
+{
+	int i;
+	uint32_t l, r;
+
+	l = *pl;
+	r = *pr;
+	for (i = 0; i < 16; i ++) {
+		uint32_t t;
+
+		t = l ^ Fconf(r, skey[(i << 1) + 0], skey[(i << 1) + 1]);
+		l = r;
+		r = t;
+	}
+	*pl = r;
+	*pr = l;
+}
+
+/* see inner.h */
+void
+br_des_tab_process_block(unsigned num_rounds, const uint32_t *skey, void *block)
+{
+	unsigned char *buf;
+	uint32_t l, r;
+
+	buf = block;
+	l = br_dec32be(buf);
+	r = br_dec32be(buf + 4);
+	br_des_do_IP(&l, &r);
+	while (num_rounds -- > 0) {
+		process_block_unit(&l, &r, skey);
+		skey += 32;
+	}
+	br_des_do_invIP(&l, &r);
+	br_enc32be(buf, l);
+	br_enc32be(buf + 4, r);
+}
+
+static void
+keysched_unit(uint32_t *skey, const void *key)
+{
+	int i;
+
+	br_des_keysched_unit(skey, key);
+
+	/*
+	 * Apply PC-2 to get the 48-bit subkeys.
+	 */
+	for (i = 0; i < 16; i ++) {
+		uint32_t xl, xr, ul, ur;
+		int j;
+
+		xl = skey[(i << 1) + 0];
+		xr = skey[(i << 1) + 1];
+		ul = 0;
+		ur = 0;
+		for (j = 0; j < 28; j ++) {
+			ul |= (xl & 1) << PC2left[j];
+			ur |= (xr & 1) << PC2right[j];
+			xl >>= 1;
+			xr >>= 1;
+		}
+		skey[(i << 1) + 0] = ul;
+		skey[(i << 1) + 1] = ur;
+	}
+}
+
+/* see inner.h */
+unsigned
+br_des_tab_keysched(uint32_t *skey, const void *key, size_t key_len)
+{
+	switch (key_len) {
+	case 8:
+		keysched_unit(skey, key);
+		return 1;
+	case 16:
+		keysched_unit(skey, key);
+		keysched_unit(skey + 32, (const unsigned char *)key + 8);
+		br_des_rev_skey(skey + 32);
+		memcpy(skey + 64, skey, 32 * sizeof *skey);
+		return 3;
+	default:
+		keysched_unit(skey, key);
+		keysched_unit(skey + 32, (const unsigned char *)key + 8);
+		br_des_rev_skey(skey + 32);
+		keysched_unit(skey + 64, (const unsigned char *)key + 16);
+		return 3;
+	}
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/des_tab_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/des_tab_cbcdec.c
new file mode 100644
index 00000000..e7eabe9d
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/des_tab_cbcdec.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_des_tab_cbcdec_init(br_des_tab_cbcdec_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_des_tab_cbcdec_vtable;
+	ctx->num_rounds = br_des_tab_keysched(ctx->skey, key, len);
+	if (len == 8) {
+		br_des_rev_skey(ctx->skey);
+	} else {
+		int i;
+
+		for (i = 0; i < 48; i += 2) {
+			uint32_t t;
+
+			t = ctx->skey[i];
+			ctx->skey[i] = ctx->skey[94 - i];
+			ctx->skey[94 - i] = t;
+			t = ctx->skey[i + 1];
+			ctx->skey[i + 1] = ctx->skey[95 - i];
+			ctx->skey[95 - i] = t;
+		}
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_des_tab_cbcdec_run(const br_des_tab_cbcdec_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf, *ivbuf;
+
+	ivbuf = iv;
+	buf = data;
+	while (len > 0) {
+		unsigned char tmp[8];
+		int i;
+
+		memcpy(tmp, buf, 8);
+		br_des_tab_process_block(ctx->num_rounds, ctx->skey, buf);
+		for (i = 0; i < 8; i ++) {
+			buf[i] ^= ivbuf[i];
+		}
+		memcpy(ivbuf, tmp, 8);
+		buf += 8;
+		len -= 8;
+	}
+}
+
+/* see bearssl_block.h */
+const br_block_cbcdec_class br_des_tab_cbcdec_vtable = {
+	sizeof(br_des_tab_cbcdec_keys),
+	8,
+	3,
+	(void (*)(const br_block_cbcdec_class **, const void *, size_t))
+		&br_des_tab_cbcdec_init,
+	(void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t))
+		&br_des_tab_cbcdec_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/des_tab_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/des_tab_cbcenc.c
new file mode 100644
index 00000000..3a45ba3e
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/des_tab_cbcenc.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/* see bearssl_block.h */
+void
+br_des_tab_cbcenc_init(br_des_tab_cbcenc_keys *ctx,
+	const void *key, size_t len)
+{
+	ctx->vtable = &br_des_tab_cbcenc_vtable;
+	ctx->num_rounds = br_des_tab_keysched(ctx->skey, key, len);
+}
+
+/* see bearssl_block.h */
+void
+br_des_tab_cbcenc_run(const br_des_tab_cbcenc_keys *ctx,
+	void *iv, void *data, size_t len)
+{
+	unsigned char *buf, *ivbuf;
+
+	ivbuf = iv;
+	buf = data;
+	while (len > 0) {
+		int i;
+
+		for (i = 0; i < 8; i ++) {
+			buf[i] ^= ivbuf[i];
+		}
+		br_des_tab_process_block(ctx->num_rounds, ctx->skey, buf);
+		memcpy(ivbuf, buf, 8);
+		buf += 8;
+		len -= 8;
+	}
+}
+
+/* see bearssl_block.h */
+const br_block_cbcenc_class br_des_tab_cbcenc_vtable = {
+	sizeof(br_des_tab_cbcenc_keys),
+	8,
+	3,
+	(void (*)(const br_block_cbcenc_class **, const void *, size_t))
+		&br_des_tab_cbcenc_init,
+	(void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t))
+		&br_des_tab_cbcenc_run
+};
diff --git a/test/monniaux/BearSSL/src/symcipher/poly1305_ctmul.c b/test/monniaux/BearSSL/src/symcipher/poly1305_ctmul.c
new file mode 100644
index 00000000..150e610a
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/poly1305_ctmul.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Perform the inner processing of blocks for Poly1305. The accumulator
+ * and the r key are provided as arrays of 26-bit words (these words
+ * are allowed to have an extra bit, i.e. use 27 bits).
+ *
+ * On output, all accumulator words fit on 26 bits, except acc[1], which
+ * may be slightly larger (but by a very small amount only).
+ */
+static void
+poly1305_inner(uint32_t *acc, const uint32_t *r, const void *data, size_t len)
+{
+	/*
+	 * Implementation notes: we split the 130-bit values into five
+	 * 26-bit words. This gives us some space for carries.
+	 *
+	 * This code is inspired from the public-domain code available
+	 * on:
+	 *      https://github.com/floodyberry/poly1305-donna
+	 *
+	 * Since we compute modulo 2^130-5, the "upper words" become
+	 * low words with a factor of 5; that is, x*2^130 = x*5 mod p.
+	 */
+	const unsigned char *buf;
+	uint32_t a0, a1, a2, a3, a4;
+	uint32_t r0, r1, r2, r3, r4;
+	uint32_t u1, u2, u3, u4;
+
+	r0 = r[0];
+	r1 = r[1];
+	r2 = r[2];
+	r3 = r[3];
+	r4 = r[4];
+
+	u1 = r1 * 5;
+	u2 = r2 * 5;
+	u3 = r3 * 5;
+	u4 = r4 * 5;
+
+	a0 = acc[0];
+	a1 = acc[1];
+	a2 = acc[2];
+	a3 = acc[3];
+	a4 = acc[4];
+
+	buf = data;
+	while (len > 0) {
+		uint64_t w0, w1, w2, w3, w4;
+		uint64_t c;
+		unsigned char tmp[16];
+
+		/*
+		 * If there is a partial block, right-pad it with zeros.
+		 */
+		if (len < 16) {
+			memset(tmp, 0, sizeof tmp);
+			memcpy(tmp, buf, len);
+			buf = tmp;
+			len = 16;
+		}
+
+		/*
+		 * Decode next block and apply the "high bit"; that value
+		 * is added to the accumulator.
+		 */
+		a0 += br_dec32le(buf) & 0x03FFFFFF;
+		a1 += (br_dec32le(buf +  3) >> 2) & 0x03FFFFFF;
+		a2 += (br_dec32le(buf +  6) >> 4) & 0x03FFFFFF;
+		a3 += (br_dec32le(buf +  9) >> 6) & 0x03FFFFFF;
+		a4 += (br_dec32le(buf + 12) >> 8) | 0x01000000;
+
+		/*
+		 * Compute multiplication.
+		 */
+#define M(x, y)   ((uint64_t)(x) * (uint64_t)(y))
+
+		w0 = M(a0, r0) + M(a1, u4) + M(a2, u3) + M(a3, u2) + M(a4, u1);
+		w1 = M(a0, r1) + M(a1, r0) + M(a2, u4) + M(a3, u3) + M(a4, u2);
+		w2 = M(a0, r2) + M(a1, r1) + M(a2, r0) + M(a3, u4) + M(a4, u3);
+		w3 = M(a0, r3) + M(a1, r2) + M(a2, r1) + M(a3, r0) + M(a4, u4);
+		w4 = M(a0, r4) + M(a1, r3) + M(a2, r2) + M(a3, r1) + M(a4, r0);
+
+#undef M
+		/*
+		 * Perform some (partial) modular reduction. This step is
+		 * enough to keep values in ranges such that there won't
+		 * be carry overflows. Most of the reduction was done in
+		 * the multiplication step (by using the 'u*' values, and
+		 * using the fact that 2^130 = -5 mod p); here we perform
+		 * some carry propagation.
+		 */
+		c = w0 >> 26;
+		a0 = (uint32_t)w0 & 0x3FFFFFF;
+		w1 += c;
+		c = w1 >> 26;
+		a1 = (uint32_t)w1 & 0x3FFFFFF;
+		w2 += c;
+		c = w2 >> 26;
+		a2 = (uint32_t)w2 & 0x3FFFFFF;
+		w3 += c;
+		c = w3 >> 26;
+		a3 = (uint32_t)w3 & 0x3FFFFFF;
+		w4 += c;
+		c = w4 >> 26;
+		a4 = (uint32_t)w4 & 0x3FFFFFF;
+		a0 += (uint32_t)c * 5;
+		a1 += a0 >> 26;
+		a0 &= 0x3FFFFFF;
+
+		buf += 16;
+		len -= 16;
+	}
+
+	acc[0] = a0;
+	acc[1] = a1;
+	acc[2] = a2;
+	acc[3] = a3;
+	acc[4] = a4;
+}
+
+/* see bearssl_block.h */
+void
+br_poly1305_ctmul_run(const void *key, const void *iv,
+	void *data, size_t len, const void *aad, size_t aad_len,
+	void *tag, br_chacha20_run ichacha, int encrypt)
+{
+	unsigned char pkey[32], foot[16];
+	uint32_t r[5], acc[5], cc, ctl, hi;
+	uint64_t w;
+	int i;
+
+	/*
+	 * Compute the MAC key. The 'r' value is the first 16 bytes of
+	 * pkey[].
+	 */
+	memset(pkey, 0, sizeof pkey);
+	ichacha(key, iv, 0, pkey, sizeof pkey);
+
+	/*
+	 * If encrypting, ChaCha20 must run first, followed by Poly1305.
+	 * When decrypting, the operations are reversed.
+	 */
+	if (encrypt) {
+		ichacha(key, iv, 1, data, len);
+	}
+
+	/*
+	 * Run Poly1305. We must process the AAD, then ciphertext, then
+	 * the footer (with the lengths). Note that the AAD and ciphertext
+	 * are meant to be padded with zeros up to the next multiple of 16,
+	 * and the length of the footer is 16 bytes as well.
+	 */
+
+	/*
+	 * Decode the 'r' value into 26-bit words, with the "clamping"
+	 * operation applied.
+	 */
+	r[0] = br_dec32le(pkey) & 0x03FFFFFF;
+	r[1] = (br_dec32le(pkey +  3) >> 2) & 0x03FFFF03;
+	r[2] = (br_dec32le(pkey +  6) >> 4) & 0x03FFC0FF;
+	r[3] = (br_dec32le(pkey +  9) >> 6) & 0x03F03FFF;
+	r[4] = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF;
+
+	/*
+	 * Accumulator is 0.
+	 */
+	memset(acc, 0, sizeof acc);
+
+	/*
+	 * Process the additional authenticated data, ciphertext, and
+	 * footer in due order.
+	 */
+	br_enc64le(foot, (uint64_t)aad_len);
+	br_enc64le(foot + 8, (uint64_t)len);
+	poly1305_inner(acc, r, aad, aad_len);
+	poly1305_inner(acc, r, data, len);
+	poly1305_inner(acc, r, foot, sizeof foot);
+
+	/*
+	 * Finalise modular reduction. This is done with carry propagation
+	 * and applying the '2^130 = -5 mod p' rule. Note that the output
+	 * of poly1035_inner() is already mostly reduced, since only
+	 * acc[1] may be (very slightly) above 2^26. A single loop back
+	 * to acc[1] will be enough to make the value fit in 130 bits.
+	 */
+	cc = 0;
+	for (i = 1; i <= 6; i ++) {
+		int j;
+
+		j = (i >= 5) ? i - 5 : i;
+		acc[j] += cc;
+		cc = acc[j] >> 26;
+		acc[j] &= 0x03FFFFFF;
+	}
+
+	/*
+	 * We may still have a value in the 2^130-5..2^130-1 range, in
+	 * which case we must reduce it again. The code below selects,
+	 * in constant-time, between 'acc' and 'acc-p',
+	 */
+	ctl = GT(acc[0], 0x03FFFFFA);
+	for (i = 1; i < 5; i ++) {
+		ctl &= EQ(acc[i], 0x03FFFFFF);
+	}
+	cc = 5;
+	for (i = 0; i < 5; i ++) {
+		uint32_t t;
+
+		t = (acc[i] + cc);
+		cc = t >> 26;
+		t &= 0x03FFFFFF;
+		acc[i] = MUX(ctl, t, acc[i]);
+	}
+
+	/*
+	 * Convert back the accumulator to 32-bit words, and add the
+	 * 's' value (second half of pkey[]). That addition is done
+	 * modulo 2^128.
+	 */
+	w = (uint64_t)acc[0] + ((uint64_t)acc[1] << 26) + br_dec32le(pkey + 16);
+	br_enc32le((unsigned char *)tag, (uint32_t)w);
+	w = (w >> 32) + ((uint64_t)acc[2] << 20) + br_dec32le(pkey + 20);
+	br_enc32le((unsigned char *)tag + 4, (uint32_t)w);
+	w = (w >> 32) + ((uint64_t)acc[3] << 14) + br_dec32le(pkey + 24);
+	br_enc32le((unsigned char *)tag + 8, (uint32_t)w);
+	hi = (uint32_t)(w >> 32) + (acc[4] << 8) + br_dec32le(pkey + 28);
+	br_enc32le((unsigned char *)tag + 12, hi);
+
+	/*
+	 * If decrypting, then ChaCha20 runs _after_ Poly1305.
+	 */
+	if (!encrypt) {
+		ichacha(key, iv, 1, data, len);
+	}
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/poly1305_ctmul32.c b/test/monniaux/BearSSL/src/symcipher/poly1305_ctmul32.c
new file mode 100644
index 00000000..15d9635d
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/poly1305_ctmul32.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * Perform the inner processing of blocks for Poly1305.
+ */
+static void
+poly1305_inner(uint32_t *a, const uint32_t *r, const void *data, size_t len)
+{
+	/*
+	 * Implementation notes: we split the 130-bit values into ten
+	 * 13-bit words. This gives us some space for carries and allows
+	 * using only 32x32->32 multiplications, which are way faster than
+	 * 32x32->64 multiplications on the ARM Cortex-M0/M0+, and also
+	 * help in making constant-time code on the Cortex-M3.
+	 *
+	 * Since we compute modulo 2^130-5, the "upper words" become
+	 * low words with a factor of 5; that is, x*2^130 = x*5 mod p.
+	 * This has already been integrated in the r[] array, which
+	 * is extended to the 0..18 range.
+	 *
+	 * In each loop iteration, a[] and r[] words are 13-bit each,
+	 * except a[1] which may use 14 bits.
+	 */
+	const unsigned char *buf;
+
+	buf = data;
+	while (len > 0) {
+		unsigned char tmp[16];
+		uint32_t b[10];
+		unsigned u, v;
+		uint32_t z, cc1, cc2;
+
+		/*
+		 * If there is a partial block, right-pad it with zeros.
+		 */
+		if (len < 16) {
+			memset(tmp, 0, sizeof tmp);
+			memcpy(tmp, buf, len);
+			buf = tmp;
+			len = 16;
+		}
+
+		/*
+		 * Decode next block and apply the "high bit"; that value
+		 * is added to the accumulator.
+		 */
+		v = br_dec16le(buf);
+		a[0] += v & 0x01FFF;
+		v >>= 13;
+		v |= buf[2] << 3;
+		v |= buf[3] << 11;
+		a[1] += v & 0x01FFF;
+		v >>= 13;
+		v |= buf[4] << 6;
+		a[2] += v & 0x01FFF;
+		v >>= 13;
+		v |= buf[5] << 1;
+		v |= buf[6] << 9;
+		a[3] += v & 0x01FFF;
+		v >>= 13;
+		v |= buf[7] << 4;
+		v |= buf[8] << 12;
+		a[4] += v & 0x01FFF;
+		v >>= 13;
+		v |= buf[9] << 7;
+		a[5] += v & 0x01FFF;
+		v >>= 13;
+		v |= buf[10] << 2;
+		v |= buf[11] << 10;
+		a[6] += v & 0x01FFF;
+		v >>= 13;
+		v |= buf[12] << 5;
+		a[7] += v & 0x01FFF;
+		v = br_dec16le(buf + 13);
+		a[8] += v & 0x01FFF;
+		v >>= 13;
+		v |= buf[15] << 3;
+		a[9] += v | 0x00800;
+
+		/*
+		 * At that point, all a[] values fit on 14 bits, while
+		 * all r[] values fit on 13 bits. Thus products fit on
+		 * 27 bits, and we can accumulate up to 31 of them in
+		 * a 32-bit word and still have some room for carries.
+		 */
+
+		/*
+		 * Now a[] contains words with values up to 14 bits each.
+		 * We perform the multiplication with r[].
+		 *
+		 * The extended words of r[] may be larger than 13 bits
+		 * (they are 5 times a 13-bit word) so the full summation
+		 * may yield values up to 46 times a 27-bit word, which
+		 * does not fit on a 32-bit word. To avoid that issue, we
+		 * must split the loop below in two, with a carry
+		 * propagation operation in the middle.
+		 */
+		cc1 = 0;
+		for (u = 0; u < 10; u ++) {
+			uint32_t s;
+
+			s = cc1
+				+ MUL15(a[0], r[u + 9 - 0])
+				+ MUL15(a[1], r[u + 9 - 1])
+				+ MUL15(a[2], r[u + 9 - 2])
+				+ MUL15(a[3], r[u + 9 - 3])
+				+ MUL15(a[4], r[u + 9 - 4]);
+			b[u] = s & 0x1FFF;
+			cc1 = s >> 13;
+		}
+		cc2 = 0;
+		for (u = 0; u < 10; u ++) {
+			uint32_t s;
+
+			s = b[u] + cc2
+				+ MUL15(a[5], r[u + 9 - 5])
+				+ MUL15(a[6], r[u + 9 - 6])
+				+ MUL15(a[7], r[u + 9 - 7])
+				+ MUL15(a[8], r[u + 9 - 8])
+				+ MUL15(a[9], r[u + 9 - 9]);
+			b[u] = s & 0x1FFF;
+			cc2 = s >> 13;
+		}
+		memcpy(a, b, sizeof b);
+
+		/*
+		 * The two carries "loop back" with a factor of 5. We
+		 * propagate them into a[0] and a[1].
+		 */
+		z = cc1 + cc2;
+		z += (z << 2) + a[0];
+		a[0] = z & 0x1FFF;
+		a[1] += z >> 13;
+
+		buf += 16;
+		len -= 16;
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_poly1305_ctmul32_run(const void *key, const void *iv,
+	void *data, size_t len, const void *aad, size_t aad_len,
+	void *tag, br_chacha20_run ichacha, int encrypt)
+{
+	unsigned char pkey[32], foot[16];
+	uint32_t z, r[19], acc[10], cc, ctl;
+	int i;
+
+	/*
+	 * Compute the MAC key. The 'r' value is the first 16 bytes of
+	 * pkey[].
+	 */
+	memset(pkey, 0, sizeof pkey);
+	ichacha(key, iv, 0, pkey, sizeof pkey);
+
+	/*
+	 * If encrypting, ChaCha20 must run first, followed by Poly1305.
+	 * When decrypting, the operations are reversed.
+	 */
+	if (encrypt) {
+		ichacha(key, iv, 1, data, len);
+	}
+
+	/*
+	 * Run Poly1305. We must process the AAD, then ciphertext, then
+	 * the footer (with the lengths). Note that the AAD and ciphertext
+	 * are meant to be padded with zeros up to the next multiple of 16,
+	 * and the length of the footer is 16 bytes as well.
+	 */
+
+	/*
+	 * Decode the 'r' value into 13-bit words, with the "clamping"
+	 * operation applied.
+	 */
+	z = br_dec32le(pkey) & 0x03FFFFFF;
+	r[9] = z & 0x1FFF;
+	r[10] = z >> 13;
+	z = (br_dec32le(pkey +  3) >> 2) & 0x03FFFF03;
+	r[11] = z & 0x1FFF;
+	r[12] = z >> 13;
+	z = (br_dec32le(pkey +  6) >> 4) & 0x03FFC0FF;
+	r[13] = z & 0x1FFF;
+	r[14] = z >> 13;
+	z = (br_dec32le(pkey +  9) >> 6) & 0x03F03FFF;
+	r[15] = z & 0x1FFF;
+	r[16] = z >> 13;
+	z = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF;
+	r[17] = z & 0x1FFF;
+	r[18] = z >> 13;
+
+	/*
+	 * Extend r[] with the 5x factor pre-applied.
+	 */
+	for (i = 0; i < 9; i ++) {
+		r[i] = MUL15(5, r[i + 10]);
+	}
+
+	/*
+	 * Accumulator is 0.
+	 */
+	memset(acc, 0, sizeof acc);
+
+	/*
+	 * Process the additional authenticated data, ciphertext, and
+	 * footer in due order.
+	 */
+	br_enc64le(foot, (uint64_t)aad_len);
+	br_enc64le(foot + 8, (uint64_t)len);
+	poly1305_inner(acc, r, aad, aad_len);
+	poly1305_inner(acc, r, data, len);
+	poly1305_inner(acc, r, foot, sizeof foot);
+
+	/*
+	 * Finalise modular reduction. This is done with carry propagation
+	 * and applying the '2^130 = -5 mod p' rule. Note that the output
+	 * of poly1035_inner() is already mostly reduced, since only
+	 * acc[1] may be (very slightly) above 2^13. A single loop back
+	 * to acc[1] will be enough to make the value fit in 130 bits.
+	 */
+	cc = 0;
+	for (i = 1; i < 10; i ++) {
+		z = acc[i] + cc;
+		acc[i] = z & 0x1FFF;
+		cc = z >> 13;
+	}
+	z = acc[0] + cc + (cc << 2);
+	acc[0] = z & 0x1FFF;
+	acc[1] += z >> 13;
+
+	/*
+	 * We may still have a value in the 2^130-5..2^130-1 range, in
+	 * which case we must reduce it again. The code below selects,
+	 * in constant-time, between 'acc' and 'acc-p',
+	 */
+	ctl = GT(acc[0], 0x1FFA);
+	for (i = 1; i < 10; i ++) {
+		ctl &= EQ(acc[i], 0x1FFF);
+	}
+	acc[0] = MUX(ctl, acc[0] - 0x1FFB, acc[0]);
+	for (i = 1; i < 10; i ++) {
+		acc[i] &= ~(-ctl);
+	}
+
+	/*
+	 * Convert back the accumulator to 32-bit words, and add the
+	 * 's' value (second half of pkey[]). That addition is done
+	 * modulo 2^128.
+	 */
+	z = acc[0] + (acc[1] << 13) + br_dec16le(pkey + 16);
+	br_enc16le((unsigned char *)tag, z & 0xFFFF);
+	z = (z >> 16) + (acc[2] << 10) + br_dec16le(pkey + 18);
+	br_enc16le((unsigned char *)tag + 2, z & 0xFFFF);
+	z = (z >> 16) + (acc[3] << 7) + br_dec16le(pkey + 20);
+	br_enc16le((unsigned char *)tag + 4, z & 0xFFFF);
+	z = (z >> 16) + (acc[4] << 4) + br_dec16le(pkey + 22);
+	br_enc16le((unsigned char *)tag + 6, z & 0xFFFF);
+	z = (z >> 16) + (acc[5] << 1) + (acc[6] << 14) + br_dec16le(pkey + 24);
+	br_enc16le((unsigned char *)tag + 8, z & 0xFFFF);
+	z = (z >> 16) + (acc[7] << 11) + br_dec16le(pkey + 26);
+	br_enc16le((unsigned char *)tag + 10, z & 0xFFFF);
+	z = (z >> 16) + (acc[8] << 8) + br_dec16le(pkey + 28);
+	br_enc16le((unsigned char *)tag + 12, z & 0xFFFF);
+	z = (z >> 16) + (acc[9] << 5) + br_dec16le(pkey + 30);
+	br_enc16le((unsigned char *)tag + 14, z & 0xFFFF);
+
+	/*
+	 * If decrypting, then ChaCha20 runs _after_ Poly1305.
+	 */
+	if (!encrypt) {
+		ichacha(key, iv, 1, data, len);
+	}
+}
diff --git a/test/monniaux/BearSSL/src/symcipher/poly1305_ctmulq.c b/test/monniaux/BearSSL/src/symcipher/poly1305_ctmulq.c
new file mode 100644
index 00000000..b00683a6
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/poly1305_ctmulq.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+#if BR_INT128 || BR_UMUL128
+
+#if BR_INT128
+
+#define MUL128(hi, lo, x, y)   do { \
+		unsigned __int128 mul128tmp; \
+		mul128tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \
+		(hi) = (uint64_t)(mul128tmp >> 64); \
+		(lo) = (uint64_t)mul128tmp; \
+	} while (0)
+
+#elif BR_UMUL128
+
+#include <intrin.h>
+
+#define MUL128(hi, lo, x, y)   do { \
+		(lo) = _umul128((x), (y), &(hi)); \
+	} while (0)
+
+#endif
+
+#define MASK42   ((uint64_t)0x000003FFFFFFFFFF)
+#define MASK44   ((uint64_t)0x00000FFFFFFFFFFF)
+
+/*
+ * The "accumulator" word is nominally a 130-bit value. We split it into
+ * words of 44 bits, each held in a 64-bit variable.
+ *
+ * If the current accumulator is a = a0 + a1*W + a2*W^2 (where W = 2^44)
+ * and r = r0 + r1*W + r2*W^2, then:
+ *
+ *   a*r = (a0*r0)
+ *       + (a0*r1 + a1*r0) * W
+ *       + (a0*r2 + a1*r1 + a2*r0) * W^2
+ *       + (a1*r2 + a2*r1) * W^3
+ *       + (a2*r2) * W^4
+ *
+ * We want to reduce that value modulo p = 2^130-5, so W^3 = 20 mod p,
+ * and W^4 = 20*W mod p. Thus, if we define u1 = 20*r1 and u2 = 20*r2,
+ * then the equations above become:
+ *
+ *  b0 = a0*r0 + a1*u2 + a2*u1
+ *  b1 = a0*r1 + a1*r0 + a2*u2
+ *  b2 = a0*r2 + a1*r1 + a2*r0
+ *
+ * In order to make u1 fit in 44 bits, we can change these equations
+ * into:
+ *
+ *  b0 = a0*r0 + a1*u2 + a2*t1
+ *  b1 = a0*r1 + a1*r0 + a2*t2
+ *  b2 = a0*r2 + a1*r1 + a2*r0
+ *
+ * Where t1 is u1 truncated to 44 bits, and t2 is u2 added to the extra
+ * bits of u1. Note that since r is clamped down to a 124-bit value, the
+ * values u2 and t2 fit on 44 bits too.
+ *
+ * The bx values are larger than 44 bits, so we may split them into a
+ * lower half (cx, 44 bits) and an upper half (dx). The new values for
+ * the accumulator are then:
+ *
+ *  e0 = c0 + 20*d2
+ *  e1 = c1 + d0
+ *  e2 = c2 + d1
+ *
+ * The equations allow for some room, i.e. the ax values may be larger
+ * than 44 bits. Similarly, the ex values will usually be larger than
+ * the ax. Thus, some sort of carry propagation must be done regularly,
+ * though not necessarily at each iteration. In particular, we do not
+ * need to compute the additions (for the bx values) over 128-bit
+ * quantities; we can stick to 64-bit computations.
+ *
+ *
+ * Since the 128-bit result of a 64x64 multiplication is actually
+ * represented over two 64-bit registers, it is cheaper to arrange for
+ * any split that happens between the "high" and "low" halves to be on
+ * that 64-bit boundary. This is done by left shifting the rx, ux and tx
+ * by 20 bits (since they all fit on 44 bits each, this shift is
+ * always possible).
+ */
+
+static void
+poly1305_inner_big(uint64_t *acc, uint64_t *r, const void *data, size_t len)
+{
+
+#define MX(hi, lo, m0, m1, m2)   do { \
+		uint64_t mxhi, mxlo; \
+		MUL128(mxhi, mxlo, a0, m0); \
+		(hi) = mxhi; \
+		(lo) = mxlo >> 20; \
+		MUL128(mxhi, mxlo, a1, m1); \
+		(hi) += mxhi; \
+		(lo) += mxlo >> 20; \
+		MUL128(mxhi, mxlo, a2, m2); \
+		(hi) += mxhi; \
+		(lo) += mxlo >> 20; \
+	} while (0)
+
+	const unsigned char *buf;
+	uint64_t a0, a1, a2;
+	uint64_t r0, r1, r2, t1, t2, u2;
+
+	r0 = r[0];
+	r1 = r[1];
+	r2 = r[2];
+	t1 = r[3];
+	t2 = r[4];
+	u2 = r[5];
+	a0 = acc[0];
+	a1 = acc[1];
+	a2 = acc[2];
+	buf = data;
+
+	while (len > 0) {
+		uint64_t v0, v1, v2;
+		uint64_t c0, c1, c2, d0, d1, d2;
+
+		v0 = br_dec64le(buf + 0);
+		v1 = br_dec64le(buf + 8);
+		v2 = v1 >> 24;
+		v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;
+		v0 &= MASK44;
+		a0 += v0;
+		a1 += v1;
+		a2 += v2 + ((uint64_t)1 << 40);
+		MX(d0, c0, r0, u2, t1);
+		MX(d1, c1, r1, r0, t2);
+		MX(d2, c2, r2, r1, r0);
+		a0 = c0 + 20 * d2;
+		a1 = c1 + d0;
+		a2 = c2 + d1;
+
+		v0 = br_dec64le(buf + 16);
+		v1 = br_dec64le(buf + 24);
+		v2 = v1 >> 24;
+		v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;
+		v0 &= MASK44;
+		a0 += v0;
+		a1 += v1;
+		a2 += v2 + ((uint64_t)1 << 40);
+		MX(d0, c0, r0, u2, t1);
+		MX(d1, c1, r1, r0, t2);
+		MX(d2, c2, r2, r1, r0);
+		a0 = c0 + 20 * d2;
+		a1 = c1 + d0;
+		a2 = c2 + d1;
+
+		v0 = br_dec64le(buf + 32);
+		v1 = br_dec64le(buf + 40);
+		v2 = v1 >> 24;
+		v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;
+		v0 &= MASK44;
+		a0 += v0;
+		a1 += v1;
+		a2 += v2 + ((uint64_t)1 << 40);
+		MX(d0, c0, r0, u2, t1);
+		MX(d1, c1, r1, r0, t2);
+		MX(d2, c2, r2, r1, r0);
+		a0 = c0 + 20 * d2;
+		a1 = c1 + d0;
+		a2 = c2 + d1;
+
+		v0 = br_dec64le(buf + 48);
+		v1 = br_dec64le(buf + 56);
+		v2 = v1 >> 24;
+		v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;
+		v0 &= MASK44;
+		a0 += v0;
+		a1 += v1;
+		a2 += v2 + ((uint64_t)1 << 40);
+		MX(d0, c0, r0, u2, t1);
+		MX(d1, c1, r1, r0, t2);
+		MX(d2, c2, r2, r1, r0);
+		a0 = c0 + 20 * d2;
+		a1 = c1 + d0;
+		a2 = c2 + d1;
+
+		a1 += a0 >> 44;
+		a0 &= MASK44;
+		a2 += a1 >> 44;
+		a1 &= MASK44;
+		a0 += 20 * (a2 >> 44);
+		a2 &= MASK44;
+
+		buf += 64;
+		len -= 64;
+	}
+	acc[0] = a0;
+	acc[1] = a1;
+	acc[2] = a2;
+
+#undef MX
+}
+
+static void
+poly1305_inner_small(uint64_t *acc, uint64_t *r, const void *data, size_t len)
+{
+	const unsigned char *buf;
+	uint64_t a0, a1, a2;
+	uint64_t r0, r1, r2, t1, t2, u2;
+
+	r0 = r[0];
+	r1 = r[1];
+	r2 = r[2];
+	t1 = r[3];
+	t2 = r[4];
+	u2 = r[5];
+	a0 = acc[0];
+	a1 = acc[1];
+	a2 = acc[2];
+	buf = data;
+
+	while (len > 0) {
+		uint64_t v0, v1, v2;
+		uint64_t c0, c1, c2, d0, d1, d2;
+		unsigned char tmp[16];
+
+		if (len < 16) {
+			memcpy(tmp, buf, len);
+			memset(tmp + len, 0, (sizeof tmp) - len);
+			buf = tmp;
+			len = 16;
+		}
+		v0 = br_dec64le(buf + 0);
+		v1 = br_dec64le(buf + 8);
+
+		v2 = v1 >> 24;
+		v1 = ((v0 >> 44) | (v1 << 20)) & MASK44;
+		v0 &= MASK44;
+
+		a0 += v0;
+		a1 += v1;
+		a2 += v2 + ((uint64_t)1 << 40);
+
+#define MX(hi, lo, m0, m1, m2)   do { \
+		uint64_t mxhi, mxlo; \
+		MUL128(mxhi, mxlo, a0, m0); \
+		(hi) = mxhi; \
+		(lo) = mxlo >> 20; \
+		MUL128(mxhi, mxlo, a1, m1); \
+		(hi) += mxhi; \
+		(lo) += mxlo >> 20; \
+		MUL128(mxhi, mxlo, a2, m2); \
+		(hi) += mxhi; \
+		(lo) += mxlo >> 20; \
+	} while (0)
+
+		MX(d0, c0, r0, u2, t1);
+		MX(d1, c1, r1, r0, t2);
+		MX(d2, c2, r2, r1, r0);
+
+#undef MX
+
+		a0 = c0 + 20 * d2;
+		a1 = c1 + d0;
+		a2 = c2 + d1;
+
+		a1 += a0 >> 44;
+		a0 &= MASK44;
+		a2 += a1 >> 44;
+		a1 &= MASK44;
+		a0 += 20 * (a2 >> 44);
+		a2 &= MASK44;
+
+		buf += 16;
+		len -= 16;
+	}
+	acc[0] = a0;
+	acc[1] = a1;
+	acc[2] = a2;
+}
+
+static inline void
+poly1305_inner(uint64_t *acc, uint64_t *r, const void *data, size_t len)
+{
+	if (len >= 64) {
+		size_t len2;
+
+		len2 = len & ~(size_t)63;
+		poly1305_inner_big(acc, r, data, len2);
+		data = (const unsigned char *)data + len2;
+		len -= len2;
+	}
+	if (len > 0) {
+		poly1305_inner_small(acc, r, data, len);
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_poly1305_ctmulq_run(const void *key, const void *iv,
+	void *data, size_t len, const void *aad, size_t aad_len,
+	void *tag, br_chacha20_run ichacha, int encrypt)
+{
+	unsigned char pkey[32], foot[16];
+	uint64_t r[6], acc[3], r0, r1;
+	uint32_t v0, v1, v2, v3, v4;
+	uint64_t w0, w1, w2, w3;
+	uint32_t ctl;
+
+	/*
+	 * Compute the MAC key. The 'r' value is the first 16 bytes of
+	 * pkey[].
+	 */
+	memset(pkey, 0, sizeof pkey);
+	ichacha(key, iv, 0, pkey, sizeof pkey);
+
+	/*
+	 * If encrypting, ChaCha20 must run first, followed by Poly1305.
+	 * When decrypting, the operations are reversed.
+	 */
+	if (encrypt) {
+		ichacha(key, iv, 1, data, len);
+	}
+
+	/*
+	 * Run Poly1305. We must process the AAD, then ciphertext, then
+	 * the footer (with the lengths). Note that the AAD and ciphertext
+	 * are meant to be padded with zeros up to the next multiple of 16,
+	 * and the length of the footer is 16 bytes as well.
+	 */
+
+	/*
+	 * Apply the "clamping" on r.
+	 */
+	pkey[ 3] &= 0x0F;
+	pkey[ 4] &= 0xFC;
+	pkey[ 7] &= 0x0F;
+	pkey[ 8] &= 0xFC;
+	pkey[11] &= 0x0F;
+	pkey[12] &= 0xFC;
+	pkey[15] &= 0x0F;
+
+	/*
+	 * Decode the 'r' value into 44-bit words, left-shifted by 20 bits.
+	 * Also compute the u1 and u2 values.
+	 */
+	r0 = br_dec64le(pkey +  0);
+	r1 = br_dec64le(pkey +  8);
+	r[0] = r0 << 20;
+	r[1] = ((r0 >> 24) | (r1 << 40)) & ~(uint64_t)0xFFFFF;
+	r[2] = (r1 >> 4) & ~(uint64_t)0xFFFFF;
+	r1 = 20 * (r[1] >> 20);
+	r[3] = r1 << 20;
+	r[5] = 20 * r[2];
+	r[4] = (r[5] + (r1 >> 24)) & ~(uint64_t)0xFFFFF;
+
+	/*
+	 * Accumulator is 0.
+	 */
+	acc[0] = 0;
+	acc[1] = 0;
+	acc[2] = 0;
+
+	/*
+	 * Process the additional authenticated data, ciphertext, and
+	 * footer in due order.
+	 */
+	br_enc64le(foot, (uint64_t)aad_len);
+	br_enc64le(foot + 8, (uint64_t)len);
+	poly1305_inner(acc, r, aad, aad_len);
+	poly1305_inner(acc, r, data, len);
+	poly1305_inner_small(acc, r, foot, sizeof foot);
+
+	/*
+	 * Finalise modular reduction. At that point, the value consists
+	 * in three 44-bit values (the lowest one might be slightly above
+	 * 2^44). Two loops shall be sufficient.
+	 */
+	acc[1] += (acc[0] >> 44);
+	acc[0] &= MASK44;
+	acc[2] += (acc[1] >> 44);
+	acc[1] &= MASK44;
+	acc[0] += 5 * (acc[2] >> 42);
+	acc[2] &= MASK42;
+	acc[1] += (acc[0] >> 44);
+	acc[0] &= MASK44;
+	acc[2] += (acc[1] >> 44);
+	acc[1] &= MASK44;
+	acc[0] += 5 * (acc[2] >> 42);
+	acc[2] &= MASK42;
+
+	/*
+	 * The value may still fall in the 2^130-5..2^130-1 range, in
+	 * which case we must reduce it again. The code below selects,
+	 * in constant-time, between 'acc' and 'acc-p'. We encode the
+	 * value over four 32-bit integers to finish the operation.
+	 */
+	v0 = (uint32_t)acc[0];
+	v1 = (uint32_t)(acc[0] >> 32) | ((uint32_t)acc[1] << 12);
+	v2 = (uint32_t)(acc[1] >> 20) | ((uint32_t)acc[2] << 24);
+	v3 = (uint32_t)(acc[2] >> 8);
+	v4 = (uint32_t)(acc[2] >> 40);
+
+	ctl = GT(v0, 0xFFFFFFFA);
+	ctl &= EQ(v1, 0xFFFFFFFF);
+	ctl &= EQ(v2, 0xFFFFFFFF);
+	ctl &= EQ(v3, 0xFFFFFFFF);
+	ctl &= EQ(v4, 0x00000003);
+	v0 = MUX(ctl, v0 + 5, v0);
+	v1 = MUX(ctl, 0, v1);
+	v2 = MUX(ctl, 0, v2);
+	v3 = MUX(ctl, 0, v3);
+
+	/*
+	 * Add the "s" value. This is done modulo 2^128. Don't forget
+	 * carry propagation...
+	 */
+	w0 = (uint64_t)v0 + (uint64_t)br_dec32le(pkey + 16);
+	w1 = (uint64_t)v1 + (uint64_t)br_dec32le(pkey + 20) + (w0 >> 32);
+	w2 = (uint64_t)v2 + (uint64_t)br_dec32le(pkey + 24) + (w1 >> 32);
+	w3 = (uint64_t)v3 + (uint64_t)br_dec32le(pkey + 28) + (w2 >> 32);
+	v0 = (uint32_t)w0;
+	v1 = (uint32_t)w1;
+	v2 = (uint32_t)w2;
+	v3 = (uint32_t)w3;
+
+	/*
+	 * Encode the tag.
+	 */
+	br_enc32le((unsigned char *)tag +  0, v0);
+	br_enc32le((unsigned char *)tag +  4, v1);
+	br_enc32le((unsigned char *)tag +  8, v2);
+	br_enc32le((unsigned char *)tag + 12, v3);
+
+	/*
+	 * If decrypting, then ChaCha20 runs _after_ Poly1305.
+	 */
+	if (!encrypt) {
+		ichacha(key, iv, 1, data, len);
+	}
+}
+
+/* see bearssl_block.h */
+br_poly1305_run
+br_poly1305_ctmulq_get(void)
+{
+	return &br_poly1305_ctmulq_run;
+}
+
+#else
+
+/* see bearssl_block.h */
+br_poly1305_run
+br_poly1305_ctmulq_get(void)
+{
+	return 0;
+}
+
+#endif
diff --git a/test/monniaux/BearSSL/src/symcipher/poly1305_i15.c b/test/monniaux/BearSSL/src/symcipher/poly1305_i15.c
new file mode 100644
index 00000000..6f892121
--- /dev/null
+++ b/test/monniaux/BearSSL/src/symcipher/poly1305_i15.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "inner.h"
+
+/*
+ * This is a "reference" implementation of Poly1305 that uses the
+ * generic "i15" code for big integers. It is slow, but it handles all
+ * big-integer operations with generic code, thereby avoiding most
+ * tricky situations with carry propagation and modular reduction.
+ */
+
+/*
+ * Modulus: 2^130-5.
+ */
+static const uint16_t P1305[] = {
+	0x008A,
+	0x7FFB, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x03FF
+};
+
+/*
+ * -p mod 2^15.
+ */
+#define P0I   0x4CCD
+
+/*
+ * R^2 mod p, for conversion to Montgomery representation (R = 2^135,
+ * since we use 9 words of 15 bits each, and 15*9 = 135).
+ */
+static const uint16_t R2[] = {
+	0x008A,
+	0x6400, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
+};
+
+/*
+ * Perform the inner processing of blocks for Poly1305. The "r" array
+ * is in Montgomery representation, while the "a" array is not.
+ */
+static void
+poly1305_inner(uint16_t *a, const uint16_t *r, const void *data, size_t len)
+{
+	const unsigned char *buf;
+
+	buf = data;
+	while (len > 0) {
+		unsigned char tmp[16], rev[16];
+		uint16_t b[10];
+		uint32_t ctl;
+		int i;
+
+		/*
+		 * If there is a partial block, right-pad it with zeros.
+		 */
+		if (len < 16) {
+			memset(tmp, 0, sizeof tmp);
+			memcpy(tmp, buf, len);
+			buf = tmp;
+			len = 16;
+		}
+
+		/*
+		 * Decode next block and apply the "high bit". Since
+		 * decoding is little-endian, we must byte-swap the buffer.
+		 */
+		for (i = 0; i < 16; i ++) {
+			rev[i] = buf[15 - i];
+		}
+		br_i15_decode_mod(b, rev, sizeof rev, P1305);
+		b[9] |= 0x0100;
+
+		/*
+		 * Add the accumulator to the decoded block (modular
+		 * addition).
+		 */
+		ctl = br_i15_add(b, a, 1);
+		ctl |= NOT(br_i15_sub(b, P1305, 0));
+		br_i15_sub(b, P1305, ctl);
+
+		/*
+		 * Multiply by r, result is the new accumulator value.
+		 */
+		br_i15_montymul(a, b, r, P1305, P0I);
+
+		buf += 16;
+		len -= 16;
+	}
+}
+
+/*
+ * Byteswap a 16-byte value.
+ */
+static void
+byteswap16(unsigned char *buf)
+{
+	int i;
+
+	for (i = 0; i < 8; i ++) {
+		unsigned x;
+
+		x = buf[i];
+		buf[i] = buf[15 - i];
+		buf[15 - i] = x;
+	}
+}
+
+/* see bearssl_block.h */
+void
+br_poly1305_i15_run(const void *key, const void *iv,
+	void *data, size_t len, const void *aad, size_t aad_len,
+	void *tag, br_chacha20_run ichacha, int encrypt)
+{
+	unsigned char pkey[32], foot[16];
+	uint16_t t[10], r[10], acc[10];
+
+	/*
+	 * Compute the MAC key. The 'r' value is the first 16 bytes of
+	 * pkey[].
+	 */
+	memset(pkey, 0, sizeof pkey);
+	ichacha(key, iv, 0, pkey, sizeof pkey);
+
+	/*
+	 * If encrypting, ChaCha20 must run first, followed by Poly1305.
+	 * When decrypting, the operations are reversed.
+	 */
+	if (encrypt) {
+		ichacha(key, iv, 1, data, len);
+	}
+
+	/*
+	 * Run Poly1305. We must process the AAD, then ciphertext, then
+	 * the footer (with the lengths). Note that the AAD and ciphertext
+	 * are meant to be padded with zeros up to the next multiple of 16,
+	 * and the length of the footer is 16 bytes as well.
+	 */
+
+	/*
+	 * Apply the "clamping" operation on the encoded 'r' value.
+	 */
+	pkey[ 3] &= 0x0F;
+	pkey[ 7] &= 0x0F;
+	pkey[11] &= 0x0F;
+	pkey[15] &= 0x0F;
+	pkey[ 4] &= 0xFC;
+	pkey[ 8] &= 0xFC;
+	pkey[12] &= 0xFC;
+
+	/*
+	 * Decode the clamped 'r' value. Decoding should use little-endian
+	 * so we must byteswap the value first.
+	 */
+	byteswap16(pkey);
+	br_i15_decode_mod(t, pkey, 16, P1305);
+
+	/*
+	 * Convert 'r' to Montgomery representation.
+	 */
+	br_i15_montymul(r, t, R2, P1305, P0I);
+
+	/*
+	 * Accumulator is 0.
+	 */
+	br_i15_zero(acc, 0x8A);
+
+	/*
+	 * Process the additional authenticated data, ciphertext, and
+	 * footer in due order.
+	 */
+	br_enc64le(foot, (uint64_t)aad_len);
+	br_enc64le(foot + 8, (uint64_t)len);
+	poly1305_inner(acc, r, aad, aad_len);
+	poly1305_inner(acc, r, data, len);
+	poly1305_inner(acc, r, foot, sizeof foot);
+
+	/*
+	 * Decode the value 's'. Again, a byteswap is needed.
+	 */
+	byteswap16(pkey + 16);
+	br_i15_decode_mod(t, pkey + 16, 16, P1305);
+
+	/*
+	 * Add the value 's' to the accumulator. That addition is done
+	 * modulo 2^128, so we just ignore the carry.
+	 */
+	br_i15_add(acc, t, 1);
+
+	/*
+	 * Encode the result (128 low bits) to the tag. Encoding should
+	 * be little-endian.
+	 */
+	br_i15_encode(tag, 16, acc);
+	byteswap16(tag);
+
+	/*
+	 * If decrypting, then ChaCha20 runs _after_ Poly1305.
+	 */
+	if (!encrypt) {
+		ichacha(key, iv, 1, data, len);
+	}
+}