From dc25573ed79a0d55c5a24b20474aa8504a758a2c Mon Sep 17 00:00:00 2001 From: David Monniaux Date: Sat, 2 Feb 2019 12:03:44 +0100 Subject: BearSSL --- .../BearSSL/src/symcipher/aes_big_cbcdec.c | 69 ++ .../BearSSL/src/symcipher/aes_big_cbcenc.c | 67 ++ test/monniaux/BearSSL/src/symcipher/aes_big_ctr.c | 84 ++ .../BearSSL/src/symcipher/aes_big_ctrcbc.c | 142 ++++ test/monniaux/BearSSL/src/symcipher/aes_big_dec.c | 254 ++++++ test/monniaux/BearSSL/src/symcipher/aes_big_enc.c | 157 ++++ test/monniaux/BearSSL/src/symcipher/aes_common.c | 112 +++ test/monniaux/BearSSL/src/symcipher/aes_ct.c | 328 +++++++ test/monniaux/BearSSL/src/symcipher/aes_ct64.c | 398 +++++++++ .../BearSSL/src/symcipher/aes_ct64_cbcdec.c | 104 +++ .../BearSSL/src/symcipher/aes_ct64_cbcenc.c | 81 ++ test/monniaux/BearSSL/src/symcipher/aes_ct64_ctr.c | 114 +++ .../BearSSL/src/symcipher/aes_ct64_ctrcbc.c | 433 ++++++++++ test/monniaux/BearSSL/src/symcipher/aes_ct64_dec.c | 159 ++++ test/monniaux/BearSSL/src/symcipher/aes_ct64_enc.c | 115 +++ .../monniaux/BearSSL/src/symcipher/aes_ct_cbcdec.c | 111 +++ .../monniaux/BearSSL/src/symcipher/aes_ct_cbcenc.c | 91 ++ test/monniaux/BearSSL/src/symcipher/aes_ct_ctr.c | 116 +++ .../monniaux/BearSSL/src/symcipher/aes_ct_ctrcbc.c | 422 +++++++++ test/monniaux/BearSSL/src/symcipher/aes_ct_dec.c | 170 ++++ test/monniaux/BearSSL/src/symcipher/aes_ct_enc.c | 112 +++ test/monniaux/BearSSL/src/symcipher/aes_pwr8.c | 445 ++++++++++ .../BearSSL/src/symcipher/aes_pwr8_cbcdec.c | 670 +++++++++++++++ .../BearSSL/src/symcipher/aes_pwr8_cbcenc.c | 417 +++++++++ test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctr.c | 717 ++++++++++++++++ .../BearSSL/src/symcipher/aes_pwr8_ctrcbc.c | 946 +++++++++++++++++++++ .../BearSSL/src/symcipher/aes_small_cbcdec.c | 69 ++ .../BearSSL/src/symcipher/aes_small_cbcenc.c | 67 ++ .../monniaux/BearSSL/src/symcipher/aes_small_ctr.c | 84 ++ .../BearSSL/src/symcipher/aes_small_ctrcbc.c | 142 ++++ .../monniaux/BearSSL/src/symcipher/aes_small_dec.c | 176 ++++ .../monniaux/BearSSL/src/symcipher/aes_small_enc.c | 129 +++ test/monniaux/BearSSL/src/symcipher/aes_x86ni.c | 240 ++++++ .../BearSSL/src/symcipher/aes_x86ni_cbcdec.c | 223 +++++ .../BearSSL/src/symcipher/aes_x86ni_cbcenc.c | 122 +++ .../monniaux/BearSSL/src/symcipher/aes_x86ni_ctr.c | 211 +++++ .../BearSSL/src/symcipher/aes_x86ni_ctrcbc.c | 596 +++++++++++++ test/monniaux/BearSSL/src/symcipher/chacha20_ct.c | 106 +++ .../monniaux/BearSSL/src/symcipher/chacha20_sse2.c | 237 ++++++ test/monniaux/BearSSL/src/symcipher/des_ct.c | 411 +++++++++ .../monniaux/BearSSL/src/symcipher/des_ct_cbcdec.c | 87 ++ .../monniaux/BearSSL/src/symcipher/des_ct_cbcenc.c | 69 ++ test/monniaux/BearSSL/src/symcipher/des_support.c | 166 ++++ test/monniaux/BearSSL/src/symcipher/des_tab.c | 310 +++++++ .../BearSSL/src/symcipher/des_tab_cbcdec.c | 85 ++ .../BearSSL/src/symcipher/des_tab_cbcenc.c | 67 ++ .../BearSSL/src/symcipher/poly1305_ctmul.c | 260 ++++++ .../BearSSL/src/symcipher/poly1305_ctmul32.c | 297 +++++++ .../BearSSL/src/symcipher/poly1305_ctmulq.c | 475 +++++++++++ test/monniaux/BearSSL/src/symcipher/poly1305_i15.c | 221 +++++ 50 files changed, 11684 insertions(+) create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_big_cbcdec.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_big_cbcenc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_big_ctr.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_big_ctrcbc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_big_dec.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_big_enc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_common.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct64.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcdec.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcenc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct64_ctr.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct64_ctrcbc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct64_dec.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct64_enc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct_cbcdec.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct_cbcenc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct_ctr.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct_ctrcbc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct_dec.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_ct_enc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_pwr8.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcdec.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcenc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctr.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctrcbc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_small_cbcdec.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_small_cbcenc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_small_ctr.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_small_ctrcbc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_small_dec.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_small_enc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_x86ni.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcdec.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcenc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctr.c create mode 100644 test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctrcbc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/chacha20_ct.c create mode 100644 test/monniaux/BearSSL/src/symcipher/chacha20_sse2.c create mode 100644 test/monniaux/BearSSL/src/symcipher/des_ct.c create mode 100644 test/monniaux/BearSSL/src/symcipher/des_ct_cbcdec.c create mode 100644 test/monniaux/BearSSL/src/symcipher/des_ct_cbcenc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/des_support.c create mode 100644 test/monniaux/BearSSL/src/symcipher/des_tab.c create mode 100644 test/monniaux/BearSSL/src/symcipher/des_tab_cbcdec.c create mode 100644 test/monniaux/BearSSL/src/symcipher/des_tab_cbcenc.c create mode 100644 test/monniaux/BearSSL/src/symcipher/poly1305_ctmul.c create mode 100644 test/monniaux/BearSSL/src/symcipher/poly1305_ctmul32.c create mode 100644 test/monniaux/BearSSL/src/symcipher/poly1305_ctmulq.c create mode 100644 test/monniaux/BearSSL/src/symcipher/poly1305_i15.c (limited to 'test/monniaux/BearSSL/src/symcipher') diff --git a/test/monniaux/BearSSL/src/symcipher/aes_big_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/aes_big_cbcdec.c new file mode 100644 index 00000000..d969a3bf --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_big_cbcdec.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_big_cbcdec_init(br_aes_big_cbcdec_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_big_cbcdec_vtable; + ctx->num_rounds = br_aes_big_keysched_inv(ctx->skey, key, len); +} + +/* see bearssl_block.h */ +void +br_aes_big_cbcdec_run(const br_aes_big_cbcdec_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf, *ivbuf; + + ivbuf = iv; + buf = data; + while (len > 0) { + unsigned char tmp[16]; + int i; + + memcpy(tmp, buf, 16); + br_aes_big_decrypt(ctx->num_rounds, ctx->skey, buf); + for (i = 0; i < 16; i ++) { + buf[i] ^= ivbuf[i]; + } + memcpy(ivbuf, tmp, 16); + buf += 16; + len -= 16; + } +} + +/* see bearssl_block.h */ +const br_block_cbcdec_class br_aes_big_cbcdec_vtable = { + sizeof(br_aes_big_cbcdec_keys), + 16, + 4, + (void (*)(const br_block_cbcdec_class **, const void *, size_t)) + &br_aes_big_cbcdec_init, + (void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t)) + &br_aes_big_cbcdec_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_big_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/aes_big_cbcenc.c new file mode 100644 index 00000000..265e53b8 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_big_cbcenc.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_big_cbcenc_init(br_aes_big_cbcenc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_big_cbcenc_vtable; + ctx->num_rounds = br_aes_keysched(ctx->skey, key, len); +} + +/* see bearssl_block.h */ +void +br_aes_big_cbcenc_run(const br_aes_big_cbcenc_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf, *ivbuf; + + ivbuf = iv; + buf = data; + while (len > 0) { + int i; + + for (i = 0; i < 16; i ++) { + buf[i] ^= ivbuf[i]; + } + br_aes_big_encrypt(ctx->num_rounds, ctx->skey, buf); + memcpy(ivbuf, buf, 16); + buf += 16; + len -= 16; + } +} + +/* see bearssl_block.h */ +const br_block_cbcenc_class br_aes_big_cbcenc_vtable = { + sizeof(br_aes_big_cbcenc_keys), + 16, + 4, + (void (*)(const br_block_cbcenc_class **, const void *, size_t)) + &br_aes_big_cbcenc_init, + (void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t)) + &br_aes_big_cbcenc_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_big_ctr.c b/test/monniaux/BearSSL/src/symcipher/aes_big_ctr.c new file mode 100644 index 00000000..18fbb846 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_big_ctr.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_big_ctr_init(br_aes_big_ctr_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_big_ctr_vtable; + ctx->num_rounds = br_aes_keysched(ctx->skey, key, len); +} + +static void +xorbuf(void *dst, const void *src, size_t len) +{ + unsigned char *d; + const unsigned char *s; + + d = dst; + s = src; + while (len -- > 0) { + *d ++ ^= *s ++; + } +} + +/* see bearssl_block.h */ +uint32_t +br_aes_big_ctr_run(const br_aes_big_ctr_keys *ctx, + const void *iv, uint32_t cc, void *data, size_t len) +{ + unsigned char *buf; + + buf = data; + while (len > 0) { + unsigned char tmp[16]; + + memcpy(tmp, iv, 12); + br_enc32be(tmp + 12, cc ++); + br_aes_big_encrypt(ctx->num_rounds, ctx->skey, tmp); + if (len <= 16) { + xorbuf(buf, tmp, len); + break; + } + xorbuf(buf, tmp, 16); + buf += 16; + len -= 16; + } + return cc; +} + +/* see bearssl_block.h */ +const br_block_ctr_class br_aes_big_ctr_vtable = { + sizeof(br_aes_big_ctr_keys), + 16, + 4, + (void (*)(const br_block_ctr_class **, const void *, size_t)) + &br_aes_big_ctr_init, + (uint32_t (*)(const br_block_ctr_class *const *, + const void *, uint32_t, void *, size_t)) + &br_aes_big_ctr_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_big_ctrcbc.c b/test/monniaux/BearSSL/src/symcipher/aes_big_ctrcbc.c new file mode 100644 index 00000000..d45ca769 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_big_ctrcbc.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_big_ctrcbc_init(br_aes_big_ctrcbc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_big_ctrcbc_vtable; + ctx->num_rounds = br_aes_keysched(ctx->skey, key, len); +} + +static void +xorbuf(void *dst, const void *src, size_t len) +{ + unsigned char *d; + const unsigned char *s; + + d = dst; + s = src; + while (len -- > 0) { + *d ++ ^= *s ++; + } +} + +/* see bearssl_block.h */ +void +br_aes_big_ctrcbc_ctr(const br_aes_big_ctrcbc_keys *ctx, + void *ctr, void *data, size_t len) +{ + unsigned char *buf, *bctr; + uint32_t cc0, cc1, cc2, cc3; + + buf = data; + bctr = ctr; + cc3 = br_dec32be(bctr + 0); + cc2 = br_dec32be(bctr + 4); + cc1 = br_dec32be(bctr + 8); + cc0 = br_dec32be(bctr + 12); + while (len > 0) { + unsigned char tmp[16]; + uint32_t carry; + + br_enc32be(tmp + 0, cc3); + br_enc32be(tmp + 4, cc2); + br_enc32be(tmp + 8, cc1); + br_enc32be(tmp + 12, cc0); + br_aes_big_encrypt(ctx->num_rounds, ctx->skey, tmp); + xorbuf(buf, tmp, 16); + buf += 16; + len -= 16; + cc0 ++; + carry = (~(cc0 | -cc0)) >> 31; + cc1 += carry; + carry &= (~(cc1 | -cc1)) >> 31; + cc2 += carry; + carry &= (~(cc2 | -cc2)) >> 31; + cc3 += carry; + } + br_enc32be(bctr + 0, cc3); + br_enc32be(bctr + 4, cc2); + br_enc32be(bctr + 8, cc1); + br_enc32be(bctr + 12, cc0); +} + +/* see bearssl_block.h */ +void +br_aes_big_ctrcbc_mac(const br_aes_big_ctrcbc_keys *ctx, + void *cbcmac, const void *data, size_t len) +{ + const unsigned char *buf; + + buf = data; + while (len > 0) { + xorbuf(cbcmac, buf, 16); + br_aes_big_encrypt(ctx->num_rounds, ctx->skey, cbcmac); + buf += 16; + len -= 16; + } +} + +/* see bearssl_block.h */ +void +br_aes_big_ctrcbc_encrypt(const br_aes_big_ctrcbc_keys *ctx, + void *ctr, void *cbcmac, void *data, size_t len) +{ + br_aes_big_ctrcbc_ctr(ctx, ctr, data, len); + br_aes_big_ctrcbc_mac(ctx, cbcmac, data, len); +} + +/* see bearssl_block.h */ +void +br_aes_big_ctrcbc_decrypt(const br_aes_big_ctrcbc_keys *ctx, + void *ctr, void *cbcmac, void *data, size_t len) +{ + br_aes_big_ctrcbc_mac(ctx, cbcmac, data, len); + br_aes_big_ctrcbc_ctr(ctx, ctr, data, len); +} + +/* see bearssl_block.h */ +const br_block_ctrcbc_class br_aes_big_ctrcbc_vtable = { + sizeof(br_aes_big_ctrcbc_keys), + 16, + 4, + (void (*)(const br_block_ctrcbc_class **, const void *, size_t)) + &br_aes_big_ctrcbc_init, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, void *, size_t)) + &br_aes_big_ctrcbc_encrypt, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, void *, size_t)) + &br_aes_big_ctrcbc_decrypt, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, size_t)) + &br_aes_big_ctrcbc_ctr, + (void (*)(const br_block_ctrcbc_class *const *, + void *, const void *, size_t)) + &br_aes_big_ctrcbc_mac +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_big_dec.c b/test/monniaux/BearSSL/src/symcipher/aes_big_dec.c new file mode 100644 index 00000000..a5d0e3c6 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_big_dec.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * Inverse S-box (used in key schedule for decryption). + */ +static const unsigned char iS[] = { + 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, + 0x81, 0xF3, 0xD7, 0xFB, 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, + 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, 0x54, 0x7B, 0x94, 0x32, + 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, + 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, + 0x6D, 0x8B, 0xD1, 0x25, 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, 0x6C, 0x70, 0x48, 0x50, + 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, + 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, + 0xB8, 0xB3, 0x45, 0x06, 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, + 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, 0x3A, 0x91, 0x11, 0x41, + 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, + 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, + 0x1C, 0x75, 0xDF, 0x6E, 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, + 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, 0xFC, 0x56, 0x3E, 0x4B, + 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, + 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, + 0x27, 0x80, 0xEC, 0x5F, 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, + 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, 0xA0, 0xE0, 0x3B, 0x4D, + 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, + 0x55, 0x21, 0x0C, 0x7D +}; + +static const uint32_t iSsm0[] = { + 0x51F4A750, 0x7E416553, 0x1A17A4C3, 0x3A275E96, 0x3BAB6BCB, 0x1F9D45F1, + 0xACFA58AB, 0x4BE30393, 0x2030FA55, 0xAD766DF6, 0x88CC7691, 0xF5024C25, + 0x4FE5D7FC, 0xC52ACBD7, 0x26354480, 0xB562A38F, 0xDEB15A49, 0x25BA1B67, + 0x45EA0E98, 0x5DFEC0E1, 0xC32F7502, 0x814CF012, 0x8D4697A3, 0x6BD3F9C6, + 0x038F5FE7, 0x15929C95, 0xBF6D7AEB, 0x955259DA, 0xD4BE832D, 0x587421D3, + 0x49E06929, 0x8EC9C844, 0x75C2896A, 0xF48E7978, 0x99583E6B, 0x27B971DD, + 0xBEE14FB6, 0xF088AD17, 0xC920AC66, 0x7DCE3AB4, 0x63DF4A18, 0xE51A3182, + 0x97513360, 0x62537F45, 0xB16477E0, 0xBB6BAE84, 0xFE81A01C, 0xF9082B94, + 0x70486858, 0x8F45FD19, 0x94DE6C87, 0x527BF8B7, 0xAB73D323, 0x724B02E2, + 0xE31F8F57, 0x6655AB2A, 0xB2EB2807, 0x2FB5C203, 0x86C57B9A, 0xD33708A5, + 0x302887F2, 0x23BFA5B2, 0x02036ABA, 0xED16825C, 0x8ACF1C2B, 0xA779B492, + 0xF307F2F0, 0x4E69E2A1, 0x65DAF4CD, 0x0605BED5, 0xD134621F, 0xC4A6FE8A, + 0x342E539D, 0xA2F355A0, 0x058AE132, 0xA4F6EB75, 0x0B83EC39, 0x4060EFAA, + 0x5E719F06, 0xBD6E1051, 0x3E218AF9, 0x96DD063D, 0xDD3E05AE, 0x4DE6BD46, + 0x91548DB5, 0x71C45D05, 0x0406D46F, 0x605015FF, 0x1998FB24, 0xD6BDE997, + 0x894043CC, 0x67D99E77, 0xB0E842BD, 0x07898B88, 0xE7195B38, 0x79C8EEDB, + 0xA17C0A47, 0x7C420FE9, 0xF8841EC9, 0x00000000, 0x09808683, 0x322BED48, + 0x1E1170AC, 0x6C5A724E, 0xFD0EFFFB, 0x0F853856, 0x3DAED51E, 0x362D3927, + 0x0A0FD964, 0x685CA621, 0x9B5B54D1, 0x24362E3A, 0x0C0A67B1, 0x9357E70F, + 0xB4EE96D2, 0x1B9B919E, 0x80C0C54F, 0x61DC20A2, 0x5A774B69, 0x1C121A16, + 0xE293BA0A, 0xC0A02AE5, 0x3C22E043, 0x121B171D, 0x0E090D0B, 0xF28BC7AD, + 0x2DB6A8B9, 0x141EA9C8, 0x57F11985, 0xAF75074C, 0xEE99DDBB, 0xA37F60FD, + 0xF701269F, 0x5C72F5BC, 0x44663BC5, 0x5BFB7E34, 0x8B432976, 0xCB23C6DC, + 0xB6EDFC68, 0xB8E4F163, 0xD731DCCA, 0x42638510, 0x13972240, 0x84C61120, + 0x854A247D, 0xD2BB3DF8, 0xAEF93211, 0xC729A16D, 0x1D9E2F4B, 0xDCB230F3, + 0x0D8652EC, 0x77C1E3D0, 0x2BB3166C, 0xA970B999, 0x119448FA, 0x47E96422, + 0xA8FC8CC4, 0xA0F03F1A, 0x567D2CD8, 0x223390EF, 0x87494EC7, 0xD938D1C1, + 0x8CCAA2FE, 0x98D40B36, 0xA6F581CF, 0xA57ADE28, 0xDAB78E26, 0x3FADBFA4, + 0x2C3A9DE4, 0x5078920D, 0x6A5FCC9B, 0x547E4662, 0xF68D13C2, 0x90D8B8E8, + 0x2E39F75E, 0x82C3AFF5, 0x9F5D80BE, 0x69D0937C, 0x6FD52DA9, 0xCF2512B3, + 0xC8AC993B, 0x10187DA7, 0xE89C636E, 0xDB3BBB7B, 0xCD267809, 0x6E5918F4, + 0xEC9AB701, 0x834F9AA8, 0xE6956E65, 0xAAFFE67E, 0x21BCCF08, 0xEF15E8E6, + 0xBAE79BD9, 0x4A6F36CE, 0xEA9F09D4, 0x29B07CD6, 0x31A4B2AF, 0x2A3F2331, + 0xC6A59430, 0x35A266C0, 0x744EBC37, 0xFC82CAA6, 0xE090D0B0, 0x33A7D815, + 0xF104984A, 0x41ECDAF7, 0x7FCD500E, 0x1791F62F, 0x764DD68D, 0x43EFB04D, + 0xCCAA4D54, 0xE49604DF, 0x9ED1B5E3, 0x4C6A881B, 0xC12C1FB8, 0x4665517F, + 0x9D5EEA04, 0x018C355D, 0xFA877473, 0xFB0B412E, 0xB3671D5A, 0x92DBD252, + 0xE9105633, 0x6DD64713, 0x9AD7618C, 0x37A10C7A, 0x59F8148E, 0xEB133C89, + 0xCEA927EE, 0xB761C935, 0xE11CE5ED, 0x7A47B13C, 0x9CD2DF59, 0x55F2733F, + 0x1814CE79, 0x73C737BF, 0x53F7CDEA, 0x5FFDAA5B, 0xDF3D6F14, 0x7844DB86, + 0xCAAFF381, 0xB968C43E, 0x3824342C, 0xC2A3405F, 0x161DC372, 0xBCE2250C, + 0x283C498B, 0xFF0D9541, 0x39A80171, 0x080CB3DE, 0xD8B4E49C, 0x6456C190, + 0x7BCB8461, 0xD532B670, 0x486C5C74, 0xD0B85742 +}; + +static unsigned +mul2(unsigned x) +{ + x <<= 1; + return x ^ ((unsigned)(-(int)(x >> 8)) & 0x11B); +} + +static unsigned +mul9(unsigned x) +{ + return x ^ mul2(mul2(mul2(x))); +} + +static unsigned +mulb(unsigned x) +{ + unsigned x2; + + x2 = mul2(x); + return x ^ x2 ^ mul2(mul2(x2)); +} + +static unsigned +muld(unsigned x) +{ + unsigned x4; + + x4 = mul2(mul2(x)); + return x ^ x4 ^ mul2(x4); +} + +static unsigned +mule(unsigned x) +{ + unsigned x2, x4; + + x2 = mul2(x); + x4 = mul2(x2); + return x2 ^ x4 ^ mul2(x4); +} + +/* see inner.h */ +unsigned +br_aes_big_keysched_inv(uint32_t *skey, const void *key, size_t key_len) +{ + unsigned num_rounds; + int i, m; + + /* + * Sub-keys for decryption are distinct from encryption sub-keys + * in that InvMixColumns() is already applied for the inner + * rounds. + */ + num_rounds = br_aes_keysched(skey, key, key_len); + m = (int)(num_rounds << 2); + for (i = 4; i < m; i ++) { + uint32_t p; + unsigned p0, p1, p2, p3; + uint32_t q0, q1, q2, q3; + + p = skey[i]; + p0 = p >> 24; + p1 = (p >> 16) & 0xFF; + p2 = (p >> 8) & 0xFF; + p3 = p & 0xFF; + q0 = mule(p0) ^ mulb(p1) ^ muld(p2) ^ mul9(p3); + q1 = mul9(p0) ^ mule(p1) ^ mulb(p2) ^ muld(p3); + q2 = muld(p0) ^ mul9(p1) ^ mule(p2) ^ mulb(p3); + q3 = mulb(p0) ^ muld(p1) ^ mul9(p2) ^ mule(p3); + skey[i] = (q0 << 24) | (q1 << 16) | (q2 << 8) | q3; + } + return num_rounds; +} + +static inline uint32_t +rotr(uint32_t x, int n) +{ + return (x << (32 - n)) | (x >> n); +} + +#define iSboxExt0(x) (iSsm0[x]) +#define iSboxExt1(x) (rotr(iSsm0[x], 8)) +#define iSboxExt2(x) (rotr(iSsm0[x], 16)) +#define iSboxExt3(x) (rotr(iSsm0[x], 24)) + +/* see bearssl.h */ +void +br_aes_big_decrypt(unsigned num_rounds, const uint32_t *skey, void *data) +{ + unsigned char *buf; + uint32_t s0, s1, s2, s3; + uint32_t t0, t1, t2, t3; + unsigned u; + + buf = data; + s0 = br_dec32be(buf); + s1 = br_dec32be(buf + 4); + s2 = br_dec32be(buf + 8); + s3 = br_dec32be(buf + 12); + s0 ^= skey[(num_rounds << 2) + 0]; + s1 ^= skey[(num_rounds << 2) + 1]; + s2 ^= skey[(num_rounds << 2) + 2]; + s3 ^= skey[(num_rounds << 2) + 3]; + for (u = num_rounds - 1; u > 0; u --) { + uint32_t v0 = iSboxExt0(s0 >> 24) + ^ iSboxExt1((s3 >> 16) & 0xFF) + ^ iSboxExt2((s2 >> 8) & 0xFF) + ^ iSboxExt3(s1 & 0xFF); + uint32_t v1 = iSboxExt0(s1 >> 24) + ^ iSboxExt1((s0 >> 16) & 0xFF) + ^ iSboxExt2((s3 >> 8) & 0xFF) + ^ iSboxExt3(s2 & 0xFF); + uint32_t v2 = iSboxExt0(s2 >> 24) + ^ iSboxExt1((s1 >> 16) & 0xFF) + ^ iSboxExt2((s0 >> 8) & 0xFF) + ^ iSboxExt3(s3 & 0xFF); + uint32_t v3 = iSboxExt0(s3 >> 24) + ^ iSboxExt1((s2 >> 16) & 0xFF) + ^ iSboxExt2((s1 >> 8) & 0xFF) + ^ iSboxExt3(s0 & 0xFF); + s0 = v0; + s1 = v1; + s2 = v2; + s3 = v3; + s0 ^= skey[u << 2]; + s1 ^= skey[(u << 2) + 1]; + s2 ^= skey[(u << 2) + 2]; + s3 ^= skey[(u << 2) + 3]; + } + t0 = ((uint32_t)iS[s0 >> 24] << 24) + | ((uint32_t)iS[(s3 >> 16) & 0xFF] << 16) + | ((uint32_t)iS[(s2 >> 8) & 0xFF] << 8) + | (uint32_t)iS[s1 & 0xFF]; + t1 = ((uint32_t)iS[s1 >> 24] << 24) + | ((uint32_t)iS[(s0 >> 16) & 0xFF] << 16) + | ((uint32_t)iS[(s3 >> 8) & 0xFF] << 8) + | (uint32_t)iS[s2 & 0xFF]; + t2 = ((uint32_t)iS[s2 >> 24] << 24) + | ((uint32_t)iS[(s1 >> 16) & 0xFF] << 16) + | ((uint32_t)iS[(s0 >> 8) & 0xFF] << 8) + | (uint32_t)iS[s3 & 0xFF]; + t3 = ((uint32_t)iS[s3 >> 24] << 24) + | ((uint32_t)iS[(s2 >> 16) & 0xFF] << 16) + | ((uint32_t)iS[(s1 >> 8) & 0xFF] << 8) + | (uint32_t)iS[s0 & 0xFF]; + s0 = t0 ^ skey[0]; + s1 = t1 ^ skey[1]; + s2 = t2 ^ skey[2]; + s3 = t3 ^ skey[3]; + br_enc32be(buf, s0); + br_enc32be(buf + 4, s1); + br_enc32be(buf + 8, s2); + br_enc32be(buf + 12, s3); +} diff --git a/test/monniaux/BearSSL/src/symcipher/aes_big_enc.c b/test/monniaux/BearSSL/src/symcipher/aes_big_enc.c new file mode 100644 index 00000000..bbabb9a6 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_big_enc.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +#define S br_aes_S + +static const uint32_t Ssm0[] = { + 0xC66363A5, 0xF87C7C84, 0xEE777799, 0xF67B7B8D, 0xFFF2F20D, 0xD66B6BBD, + 0xDE6F6FB1, 0x91C5C554, 0x60303050, 0x02010103, 0xCE6767A9, 0x562B2B7D, + 0xE7FEFE19, 0xB5D7D762, 0x4DABABE6, 0xEC76769A, 0x8FCACA45, 0x1F82829D, + 0x89C9C940, 0xFA7D7D87, 0xEFFAFA15, 0xB25959EB, 0x8E4747C9, 0xFBF0F00B, + 0x41ADADEC, 0xB3D4D467, 0x5FA2A2FD, 0x45AFAFEA, 0x239C9CBF, 0x53A4A4F7, + 0xE4727296, 0x9BC0C05B, 0x75B7B7C2, 0xE1FDFD1C, 0x3D9393AE, 0x4C26266A, + 0x6C36365A, 0x7E3F3F41, 0xF5F7F702, 0x83CCCC4F, 0x6834345C, 0x51A5A5F4, + 0xD1E5E534, 0xF9F1F108, 0xE2717193, 0xABD8D873, 0x62313153, 0x2A15153F, + 0x0804040C, 0x95C7C752, 0x46232365, 0x9DC3C35E, 0x30181828, 0x379696A1, + 0x0A05050F, 0x2F9A9AB5, 0x0E070709, 0x24121236, 0x1B80809B, 0xDFE2E23D, + 0xCDEBEB26, 0x4E272769, 0x7FB2B2CD, 0xEA75759F, 0x1209091B, 0x1D83839E, + 0x582C2C74, 0x341A1A2E, 0x361B1B2D, 0xDC6E6EB2, 0xB45A5AEE, 0x5BA0A0FB, + 0xA45252F6, 0x763B3B4D, 0xB7D6D661, 0x7DB3B3CE, 0x5229297B, 0xDDE3E33E, + 0x5E2F2F71, 0x13848497, 0xA65353F5, 0xB9D1D168, 0x00000000, 0xC1EDED2C, + 0x40202060, 0xE3FCFC1F, 0x79B1B1C8, 0xB65B5BED, 0xD46A6ABE, 0x8DCBCB46, + 0x67BEBED9, 0x7239394B, 0x944A4ADE, 0x984C4CD4, 0xB05858E8, 0x85CFCF4A, + 0xBBD0D06B, 0xC5EFEF2A, 0x4FAAAAE5, 0xEDFBFB16, 0x864343C5, 0x9A4D4DD7, + 0x66333355, 0x11858594, 0x8A4545CF, 0xE9F9F910, 0x04020206, 0xFE7F7F81, + 0xA05050F0, 0x783C3C44, 0x259F9FBA, 0x4BA8A8E3, 0xA25151F3, 0x5DA3A3FE, + 0x804040C0, 0x058F8F8A, 0x3F9292AD, 0x219D9DBC, 0x70383848, 0xF1F5F504, + 0x63BCBCDF, 0x77B6B6C1, 0xAFDADA75, 0x42212163, 0x20101030, 0xE5FFFF1A, + 0xFDF3F30E, 0xBFD2D26D, 0x81CDCD4C, 0x180C0C14, 0x26131335, 0xC3ECEC2F, + 0xBE5F5FE1, 0x359797A2, 0x884444CC, 0x2E171739, 0x93C4C457, 0x55A7A7F2, + 0xFC7E7E82, 0x7A3D3D47, 0xC86464AC, 0xBA5D5DE7, 0x3219192B, 0xE6737395, + 0xC06060A0, 0x19818198, 0x9E4F4FD1, 0xA3DCDC7F, 0x44222266, 0x542A2A7E, + 0x3B9090AB, 0x0B888883, 0x8C4646CA, 0xC7EEEE29, 0x6BB8B8D3, 0x2814143C, + 0xA7DEDE79, 0xBC5E5EE2, 0x160B0B1D, 0xADDBDB76, 0xDBE0E03B, 0x64323256, + 0x743A3A4E, 0x140A0A1E, 0x924949DB, 0x0C06060A, 0x4824246C, 0xB85C5CE4, + 0x9FC2C25D, 0xBDD3D36E, 0x43ACACEF, 0xC46262A6, 0x399191A8, 0x319595A4, + 0xD3E4E437, 0xF279798B, 0xD5E7E732, 0x8BC8C843, 0x6E373759, 0xDA6D6DB7, + 0x018D8D8C, 0xB1D5D564, 0x9C4E4ED2, 0x49A9A9E0, 0xD86C6CB4, 0xAC5656FA, + 0xF3F4F407, 0xCFEAEA25, 0xCA6565AF, 0xF47A7A8E, 0x47AEAEE9, 0x10080818, + 0x6FBABAD5, 0xF0787888, 0x4A25256F, 0x5C2E2E72, 0x381C1C24, 0x57A6A6F1, + 0x73B4B4C7, 0x97C6C651, 0xCBE8E823, 0xA1DDDD7C, 0xE874749C, 0x3E1F1F21, + 0x964B4BDD, 0x61BDBDDC, 0x0D8B8B86, 0x0F8A8A85, 0xE0707090, 0x7C3E3E42, + 0x71B5B5C4, 0xCC6666AA, 0x904848D8, 0x06030305, 0xF7F6F601, 0x1C0E0E12, + 0xC26161A3, 0x6A35355F, 0xAE5757F9, 0x69B9B9D0, 0x17868691, 0x99C1C158, + 0x3A1D1D27, 0x279E9EB9, 0xD9E1E138, 0xEBF8F813, 0x2B9898B3, 0x22111133, + 0xD26969BB, 0xA9D9D970, 0x078E8E89, 0x339494A7, 0x2D9B9BB6, 0x3C1E1E22, + 0x15878792, 0xC9E9E920, 0x87CECE49, 0xAA5555FF, 0x50282878, 0xA5DFDF7A, + 0x038C8C8F, 0x59A1A1F8, 0x09898980, 0x1A0D0D17, 0x65BFBFDA, 0xD7E6E631, + 0x844242C6, 0xD06868B8, 0x824141C3, 0x299999B0, 0x5A2D2D77, 0x1E0F0F11, + 0x7BB0B0CB, 0xA85454FC, 0x6DBBBBD6, 0x2C16163A +}; + +static inline uint32_t +rotr(uint32_t x, int n) +{ + return (x << (32 - n)) | (x >> n); +} + +#define SboxExt0(x) (Ssm0[x]) +#define SboxExt1(x) (rotr(Ssm0[x], 8)) +#define SboxExt2(x) (rotr(Ssm0[x], 16)) +#define SboxExt3(x) (rotr(Ssm0[x], 24)) + + +/* see bearssl.h */ +void +br_aes_big_encrypt(unsigned num_rounds, const uint32_t *skey, void *data) +{ + unsigned char *buf; + uint32_t s0, s1, s2, s3; + uint32_t t0, t1, t2, t3; + unsigned u; + + buf = data; + s0 = br_dec32be(buf); + s1 = br_dec32be(buf + 4); + s2 = br_dec32be(buf + 8); + s3 = br_dec32be(buf + 12); + s0 ^= skey[0]; + s1 ^= skey[1]; + s2 ^= skey[2]; + s3 ^= skey[3]; + for (u = 1; u < num_rounds; u ++) { + uint32_t v0, v1, v2, v3; + + v0 = SboxExt0(s0 >> 24) + ^ SboxExt1((s1 >> 16) & 0xFF) + ^ SboxExt2((s2 >> 8) & 0xFF) + ^ SboxExt3(s3 & 0xFF); + v1 = SboxExt0(s1 >> 24) + ^ SboxExt1((s2 >> 16) & 0xFF) + ^ SboxExt2((s3 >> 8) & 0xFF) + ^ SboxExt3(s0 & 0xFF); + v2 = SboxExt0(s2 >> 24) + ^ SboxExt1((s3 >> 16) & 0xFF) + ^ SboxExt2((s0 >> 8) & 0xFF) + ^ SboxExt3(s1 & 0xFF); + v3 = SboxExt0(s3 >> 24) + ^ SboxExt1((s0 >> 16) & 0xFF) + ^ SboxExt2((s1 >> 8) & 0xFF) + ^ SboxExt3(s2 & 0xFF); + s0 = v0; + s1 = v1; + s2 = v2; + s3 = v3; + s0 ^= skey[u << 2]; + s1 ^= skey[(u << 2) + 1]; + s2 ^= skey[(u << 2) + 2]; + s3 ^= skey[(u << 2) + 3]; + } + t0 = ((uint32_t)S[s0 >> 24] << 24) + | ((uint32_t)S[(s1 >> 16) & 0xFF] << 16) + | ((uint32_t)S[(s2 >> 8) & 0xFF] << 8) + | (uint32_t)S[s3 & 0xFF]; + t1 = ((uint32_t)S[s1 >> 24] << 24) + | ((uint32_t)S[(s2 >> 16) & 0xFF] << 16) + | ((uint32_t)S[(s3 >> 8) & 0xFF] << 8) + | (uint32_t)S[s0 & 0xFF]; + t2 = ((uint32_t)S[s2 >> 24] << 24) + | ((uint32_t)S[(s3 >> 16) & 0xFF] << 16) + | ((uint32_t)S[(s0 >> 8) & 0xFF] << 8) + | (uint32_t)S[s1 & 0xFF]; + t3 = ((uint32_t)S[s3 >> 24] << 24) + | ((uint32_t)S[(s0 >> 16) & 0xFF] << 16) + | ((uint32_t)S[(s1 >> 8) & 0xFF] << 8) + | (uint32_t)S[s2 & 0xFF]; + s0 = t0 ^ skey[num_rounds << 2]; + s1 = t1 ^ skey[(num_rounds << 2) + 1]; + s2 = t2 ^ skey[(num_rounds << 2) + 2]; + s3 = t3 ^ skey[(num_rounds << 2) + 3]; + br_enc32be(buf, s0); + br_enc32be(buf + 4, s1); + br_enc32be(buf + 8, s2); + br_enc32be(buf + 12, s3); +} diff --git a/test/monniaux/BearSSL/src/symcipher/aes_common.c b/test/monniaux/BearSSL/src/symcipher/aes_common.c new file mode 100644 index 00000000..72c64fb1 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_common.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +static const uint32_t Rcon[] = { + 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, + 0x40000000, 0x80000000, 0x1B000000, 0x36000000 +}; + +#define S br_aes_S + +/* see inner.h */ +const unsigned char br_aes_S[] = { + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, + 0xFE, 0xD7, 0xAB, 0x76, 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, + 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, 0xB7, 0xFD, 0x93, 0x26, + 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, + 0xEB, 0x27, 0xB2, 0x75, 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, + 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, 0x53, 0xD1, 0x00, 0xED, + 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, + 0x50, 0x3C, 0x9F, 0xA8, 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, + 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, 0xCD, 0x0C, 0x13, 0xEC, + 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, + 0xDE, 0x5E, 0x0B, 0xDB, 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, + 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, 0xE7, 0xC8, 0x37, 0x6D, + 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, + 0x4B, 0xBD, 0x8B, 0x8A, 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, + 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, 0xE1, 0xF8, 0x98, 0x11, + 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, + 0xB0, 0x54, 0xBB, 0x16 +}; + +static uint32_t +SubWord(uint32_t x) +{ + return ((uint32_t)S[x >> 24] << 24) + | ((uint32_t)S[(x >> 16) & 0xFF] << 16) + | ((uint32_t)S[(x >> 8) & 0xFF] << 8) + | (uint32_t)S[x & 0xFF]; +} + +/* see inner.h */ +unsigned +br_aes_keysched(uint32_t *skey, const void *key, size_t key_len) +{ + unsigned num_rounds; + int i, j, k, nk, nkf; + + switch (key_len) { + case 16: + num_rounds = 10; + break; + case 24: + num_rounds = 12; + break; + case 32: + num_rounds = 14; + break; + default: + /* abort(); */ + return 0; + } + nk = (int)(key_len >> 2); + nkf = (int)((num_rounds + 1) << 2); + for (i = 0; i < nk; i ++) { + skey[i] = br_dec32be((const unsigned char *)key + (i << 2)); + } + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + uint32_t tmp; + + tmp = skey[i - 1]; + if (j == 0) { + tmp = (tmp << 8) | (tmp >> 24); + tmp = SubWord(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = SubWord(tmp); + } + skey[i] = skey[i - nk] ^ tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + return num_rounds; +} diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct.c b/test/monniaux/BearSSL/src/symcipher/aes_ct.c new file mode 100644 index 00000000..66776d9e --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see inner.h */ +void +br_aes_ct_bitslice_Sbox(uint32_t *q) +{ + /* + * This S-box implementation is a straightforward translation of + * the circuit described by Boyar and Peralta in "A new + * combinational logic minimization technique with applications + * to cryptology" (https://eprint.iacr.org/2009/191.pdf). + * + * Note that variables x* (input) and s* (output) are numbered + * in "reverse" order (x0 is the high bit, x7 is the low bit). + */ + + uint32_t x0, x1, x2, x3, x4, x5, x6, x7; + uint32_t y1, y2, y3, y4, y5, y6, y7, y8, y9; + uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + uint32_t y20, y21; + uint32_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + uint32_t z10, z11, z12, z13, z14, z15, z16, z17; + uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + uint32_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + uint32_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + uint32_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + uint32_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + uint32_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + uint32_t t60, t61, t62, t63, t64, t65, t66, t67; + uint32_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = q[7]; + x1 = q[6]; + x2 = q[5]; + x3 = q[4]; + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. + */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. + */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. + */ + t46 = z15 ^ z16; + t47 = z10 ^ z11; + t48 = z5 ^ z13; + t49 = z9 ^ z10; + t50 = z2 ^ z12; + t51 = z2 ^ z5; + t52 = z7 ^ z8; + t53 = z0 ^ z3; + t54 = z6 ^ z7; + t55 = z16 ^ z17; + t56 = z12 ^ t48; + t57 = t50 ^ t53; + t58 = z4 ^ t46; + t59 = z3 ^ t54; + t60 = t46 ^ t57; + t61 = z14 ^ t57; + t62 = t52 ^ t58; + t63 = t49 ^ t58; + t64 = z4 ^ t59; + t65 = t61 ^ t62; + t66 = z1 ^ t63; + s0 = t59 ^ t63; + s6 = t56 ^ ~t62; + s7 = t48 ^ ~t60; + t67 = t64 ^ t65; + s3 = t53 ^ t66; + s4 = t51 ^ t66; + s5 = t47 ^ t65; + s1 = t64 ^ ~s3; + s2 = t55 ^ ~t67; + + q[7] = s0; + q[6] = s1; + q[5] = s2; + q[4] = s3; + q[3] = s4; + q[2] = s5; + q[1] = s6; + q[0] = s7; +} + +/* see inner.h */ +void +br_aes_ct_ortho(uint32_t *q) +{ +#define SWAPN(cl, ch, s, x, y) do { \ + uint32_t a, b; \ + a = (x); \ + b = (y); \ + (x) = (a & (uint32_t)cl) | ((b & (uint32_t)cl) << (s)); \ + (y) = ((a & (uint32_t)ch) >> (s)) | (b & (uint32_t)ch); \ + } while (0) + +#define SWAP2(x, y) SWAPN(0x55555555, 0xAAAAAAAA, 1, x, y) +#define SWAP4(x, y) SWAPN(0x33333333, 0xCCCCCCCC, 2, x, y) +#define SWAP8(x, y) SWAPN(0x0F0F0F0F, 0xF0F0F0F0, 4, x, y) + + SWAP2(q[0], q[1]); + SWAP2(q[2], q[3]); + SWAP2(q[4], q[5]); + SWAP2(q[6], q[7]); + + SWAP4(q[0], q[2]); + SWAP4(q[1], q[3]); + SWAP4(q[4], q[6]); + SWAP4(q[5], q[7]); + + SWAP8(q[0], q[4]); + SWAP8(q[1], q[5]); + SWAP8(q[2], q[6]); + SWAP8(q[3], q[7]); +} + +static const unsigned char Rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 +}; + +static uint32_t +sub_word(uint32_t x) +{ + uint32_t q[8]; + int i; + + for (i = 0; i < 8; i ++) { + q[i] = x; + } + br_aes_ct_ortho(q); + br_aes_ct_bitslice_Sbox(q); + br_aes_ct_ortho(q); + return q[0]; +} + +/* see inner.h */ +unsigned +br_aes_ct_keysched(uint32_t *comp_skey, const void *key, size_t key_len) +{ + unsigned num_rounds; + int i, j, k, nk, nkf; + uint32_t tmp; + uint32_t skey[120]; + + switch (key_len) { + case 16: + num_rounds = 10; + break; + case 24: + num_rounds = 12; + break; + case 32: + num_rounds = 14; + break; + default: + /* abort(); */ + return 0; + } + nk = (int)(key_len >> 2); + nkf = (int)((num_rounds + 1) << 2); + tmp = 0; + for (i = 0; i < nk; i ++) { + tmp = br_dec32le((const unsigned char *)key + (i << 2)); + skey[(i << 1) + 0] = tmp; + skey[(i << 1) + 1] = tmp; + } + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[(i - nk) << 1]; + skey[(i << 1) + 0] = tmp; + skey[(i << 1) + 1] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + for (i = 0; i < nkf; i += 4) { + br_aes_ct_ortho(skey + (i << 1)); + } + for (i = 0, j = 0; i < nkf; i ++, j += 2) { + comp_skey[i] = (skey[j + 0] & 0x55555555) + | (skey[j + 1] & 0xAAAAAAAA); + } + return num_rounds; +} + +/* see inner.h */ +void +br_aes_ct_skey_expand(uint32_t *skey, + unsigned num_rounds, const uint32_t *comp_skey) +{ + unsigned u, v, n; + + n = (num_rounds + 1) << 2; + for (u = 0, v = 0; u < n; u ++, v += 2) { + uint32_t x, y; + + x = y = comp_skey[u]; + x &= 0x55555555; + skey[v + 0] = x | (x << 1); + y &= 0xAAAAAAAA; + skey[v + 1] = y | (y >> 1); + } +} diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64.c new file mode 100644 index 00000000..15238116 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64.c @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see inner.h */ +void +br_aes_ct64_bitslice_Sbox(uint64_t *q) +{ + /* + * This S-box implementation is a straightforward translation of + * the circuit described by Boyar and Peralta in "A new + * combinational logic minimization technique with applications + * to cryptology" (https://eprint.iacr.org/2009/191.pdf). + * + * Note that variables x* (input) and s* (output) are numbered + * in "reverse" order (x0 is the high bit, x7 is the low bit). + */ + + uint64_t x0, x1, x2, x3, x4, x5, x6, x7; + uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9; + uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + uint64_t y20, y21; + uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + uint64_t z10, z11, z12, z13, z14, z15, z16, z17; + uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + uint64_t t60, t61, t62, t63, t64, t65, t66, t67; + uint64_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = q[7]; + x1 = q[6]; + x2 = q[5]; + x3 = q[4]; + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. + */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. + */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. + */ + t46 = z15 ^ z16; + t47 = z10 ^ z11; + t48 = z5 ^ z13; + t49 = z9 ^ z10; + t50 = z2 ^ z12; + t51 = z2 ^ z5; + t52 = z7 ^ z8; + t53 = z0 ^ z3; + t54 = z6 ^ z7; + t55 = z16 ^ z17; + t56 = z12 ^ t48; + t57 = t50 ^ t53; + t58 = z4 ^ t46; + t59 = z3 ^ t54; + t60 = t46 ^ t57; + t61 = z14 ^ t57; + t62 = t52 ^ t58; + t63 = t49 ^ t58; + t64 = z4 ^ t59; + t65 = t61 ^ t62; + t66 = z1 ^ t63; + s0 = t59 ^ t63; + s6 = t56 ^ ~t62; + s7 = t48 ^ ~t60; + t67 = t64 ^ t65; + s3 = t53 ^ t66; + s4 = t51 ^ t66; + s5 = t47 ^ t65; + s1 = t64 ^ ~s3; + s2 = t55 ^ ~t67; + + q[7] = s0; + q[6] = s1; + q[5] = s2; + q[4] = s3; + q[3] = s4; + q[2] = s5; + q[1] = s6; + q[0] = s7; +} + +/* see inner.h */ +void +br_aes_ct64_ortho(uint64_t *q) +{ +#define SWAPN(cl, ch, s, x, y) do { \ + uint64_t a, b; \ + a = (x); \ + b = (y); \ + (x) = (a & (uint64_t)cl) | ((b & (uint64_t)cl) << (s)); \ + (y) = ((a & (uint64_t)ch) >> (s)) | (b & (uint64_t)ch); \ + } while (0) + +#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y) +#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y) +#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y) + + SWAP2(q[0], q[1]); + SWAP2(q[2], q[3]); + SWAP2(q[4], q[5]); + SWAP2(q[6], q[7]); + + SWAP4(q[0], q[2]); + SWAP4(q[1], q[3]); + SWAP4(q[4], q[6]); + SWAP4(q[5], q[7]); + + SWAP8(q[0], q[4]); + SWAP8(q[1], q[5]); + SWAP8(q[2], q[6]); + SWAP8(q[3], q[7]); +} + +/* see inner.h */ +void +br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) +{ + uint64_t x0, x1, x2, x3; + + x0 = w[0]; + x1 = w[1]; + x2 = w[2]; + x3 = w[3]; + x0 |= (x0 << 16); + x1 |= (x1 << 16); + x2 |= (x2 << 16); + x3 |= (x3 << 16); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + x0 |= (x0 << 8); + x1 |= (x1 << 8); + x2 |= (x2 << 8); + x3 |= (x3 << 8); + x0 &= (uint64_t)0x00FF00FF00FF00FF; + x1 &= (uint64_t)0x00FF00FF00FF00FF; + x2 &= (uint64_t)0x00FF00FF00FF00FF; + x3 &= (uint64_t)0x00FF00FF00FF00FF; + *q0 = x0 | (x2 << 8); + *q1 = x1 | (x3 << 8); +} + +/* see inner.h */ +void +br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) +{ + uint64_t x0, x1, x2, x3; + + x0 = q0 & (uint64_t)0x00FF00FF00FF00FF; + x1 = q1 & (uint64_t)0x00FF00FF00FF00FF; + x2 = (q0 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x3 = (q1 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x0 |= (x0 >> 8); + x1 |= (x1 >> 8); + x2 |= (x2 >> 8); + x3 |= (x3 >> 8); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + w[0] = (uint32_t)x0 | (uint32_t)(x0 >> 16); + w[1] = (uint32_t)x1 | (uint32_t)(x1 >> 16); + w[2] = (uint32_t)x2 | (uint32_t)(x2 >> 16); + w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16); +} + +static const unsigned char Rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 +}; + +static uint32_t +sub_word(uint32_t x) +{ + uint64_t q[8]; + + memset(q, 0, sizeof q); + q[0] = x; + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_Sbox(q); + br_aes_ct64_ortho(q); + return (uint32_t)q[0]; +} + +/* see inner.h */ +unsigned +br_aes_ct64_keysched(uint64_t *comp_skey, const void *key, size_t key_len) +{ + unsigned num_rounds; + int i, j, k, nk, nkf; + uint32_t tmp; + uint32_t skey[60]; + + switch (key_len) { + case 16: + num_rounds = 10; + break; + case 24: + num_rounds = 12; + break; + case 32: + num_rounds = 14; + break; + default: + /* abort(); */ + return 0; + } + nk = (int)(key_len >> 2); + nkf = (int)((num_rounds + 1) << 2); + br_range_dec32le(skey, (key_len >> 2), key); + tmp = skey[(key_len >> 2) - 1]; + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[i - nk]; + skey[i] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + + for (i = 0, j = 0; i < nkf; i += 4, j += 2) { + uint64_t q[8]; + + br_aes_ct64_interleave_in(&q[0], &q[4], skey + i); + q[1] = q[0]; + q[2] = q[0]; + q[3] = q[0]; + q[5] = q[4]; + q[6] = q[4]; + q[7] = q[4]; + br_aes_ct64_ortho(q); + comp_skey[j + 0] = + (q[0] & (uint64_t)0x1111111111111111) + | (q[1] & (uint64_t)0x2222222222222222) + | (q[2] & (uint64_t)0x4444444444444444) + | (q[3] & (uint64_t)0x8888888888888888); + comp_skey[j + 1] = + (q[4] & (uint64_t)0x1111111111111111) + | (q[5] & (uint64_t)0x2222222222222222) + | (q[6] & (uint64_t)0x4444444444444444) + | (q[7] & (uint64_t)0x8888888888888888); + } + return num_rounds; +} + +/* see inner.h */ +void +br_aes_ct64_skey_expand(uint64_t *skey, + unsigned num_rounds, const uint64_t *comp_skey) +{ + unsigned u, v, n; + + n = (num_rounds + 1) << 1; + for (u = 0, v = 0; u < n; u ++, v += 4) { + uint64_t x0, x1, x2, x3; + + x0 = x1 = x2 = x3 = comp_skey[u]; + x0 &= (uint64_t)0x1111111111111111; + x1 &= (uint64_t)0x2222222222222222; + x2 &= (uint64_t)0x4444444444444444; + x3 &= (uint64_t)0x8888888888888888; + x1 >>= 1; + x2 >>= 2; + x3 >>= 3; + skey[v + 0] = (x0 << 4) - x0; + skey[v + 1] = (x1 << 4) - x1; + skey[v + 2] = (x2 << 4) - x2; + skey[v + 3] = (x3 << 4) - x3; + } +} diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcdec.c new file mode 100644 index 00000000..5a7360bc --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcdec.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_ct64_cbcdec_init(br_aes_ct64_cbcdec_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_ct64_cbcdec_vtable; + ctx->num_rounds = br_aes_ct64_keysched(ctx->skey, key, len); +} + +/* see bearssl_block.h */ +void +br_aes_ct64_cbcdec_run(const br_aes_ct64_cbcdec_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf; + uint64_t sk_exp[120]; + uint32_t ivw[4]; + + br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + br_range_dec32le(ivw, 4, iv); + buf = data; + while (len > 0) { + uint64_t q[8]; + uint32_t w1[16], w2[16]; + int i; + + if (len >= 64) { + br_range_dec32le(w1, 16, buf); + } else { + br_range_dec32le(w1, len >> 2, buf); + } + for (i = 0; i < 4; i ++) { + br_aes_ct64_interleave_in( + &q[i], &q[i + 4], w1 + (i << 2)); + } + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_decrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct64_ortho(q); + for (i = 0; i < 4; i ++) { + br_aes_ct64_interleave_out( + w2 + (i << 2), q[i], q[i + 4]); + } + for (i = 0; i < 4; i ++) { + w2[i] ^= ivw[i]; + } + if (len >= 64) { + for (i = 4; i < 16; i ++) { + w2[i] ^= w1[i - 4]; + } + memcpy(ivw, w1 + 12, sizeof ivw); + br_range_enc32le(buf, w2, 16); + } else { + int j; + + j = (int)(len >> 2); + for (i = 4; i < j; i ++) { + w2[i] ^= w1[i - 4]; + } + memcpy(ivw, w1 + j - 4, sizeof ivw); + br_range_enc32le(buf, w2, j); + break; + } + buf += 64; + len -= 64; + } + br_range_enc32le(iv, ivw, 4); +} + +/* see bearssl_block.h */ +const br_block_cbcdec_class br_aes_ct64_cbcdec_vtable = { + sizeof(br_aes_ct64_cbcdec_keys), + 16, + 4, + (void (*)(const br_block_cbcdec_class **, const void *, size_t)) + &br_aes_ct64_cbcdec_init, + (void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t)) + &br_aes_ct64_cbcdec_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcenc.c new file mode 100644 index 00000000..6cb9dece --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64_cbcenc.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_ct64_cbcenc_init(br_aes_ct64_cbcenc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_ct64_cbcenc_vtable; + ctx->num_rounds = br_aes_ct64_keysched(ctx->skey, key, len); +} + +/* see bearssl_block.h */ +void +br_aes_ct64_cbcenc_run(const br_aes_ct64_cbcenc_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf; + uint64_t sk_exp[120]; + uint32_t ivw[4]; + + br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + br_range_dec32le(ivw, 4, iv); + buf = data; + while (len > 0) { + uint32_t w[4]; + uint64_t q[8]; + + w[0] = ivw[0] ^ br_dec32le(buf); + w[1] = ivw[1] ^ br_dec32le(buf + 4); + w[2] = ivw[2] ^ br_dec32le(buf + 8); + w[3] = ivw[3] ^ br_dec32le(buf + 12); + br_aes_ct64_interleave_in(&q[0], &q[4], w); + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct64_ortho(q); + br_aes_ct64_interleave_out(w, q[0], q[4]); + memcpy(ivw, w, sizeof w); + br_enc32le(buf, w[0]); + br_enc32le(buf + 4, w[1]); + br_enc32le(buf + 8, w[2]); + br_enc32le(buf + 12, w[3]); + buf += 16; + len -= 16; + } + br_range_enc32le(iv, ivw, 4); +} + +/* see bearssl_block.h */ +const br_block_cbcenc_class br_aes_ct64_cbcenc_vtable = { + sizeof(br_aes_ct64_cbcenc_keys), + 16, + 4, + (void (*)(const br_block_cbcenc_class **, const void *, size_t)) + &br_aes_ct64_cbcenc_init, + (void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t)) + &br_aes_ct64_cbcenc_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64_ctr.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64_ctr.c new file mode 100644 index 00000000..1275873d --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64_ctr.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_ct64_ctr_init(br_aes_ct64_ctr_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_ct64_ctr_vtable; + ctx->num_rounds = br_aes_ct64_keysched(ctx->skey, key, len); +} + +static void +xorbuf(void *dst, const void *src, size_t len) +{ + unsigned char *d; + const unsigned char *s; + + d = dst; + s = src; + while (len -- > 0) { + *d ++ ^= *s ++; + } +} + +/* see bearssl_block.h */ +uint32_t +br_aes_ct64_ctr_run(const br_aes_ct64_ctr_keys *ctx, + const void *iv, uint32_t cc, void *data, size_t len) +{ + unsigned char *buf; + uint32_t ivw[16]; + uint64_t sk_exp[120]; + + br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + br_range_dec32le(ivw, 3, iv); + memcpy(ivw + 4, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 8, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t)); + buf = data; + while (len > 0) { + uint64_t q[8]; + uint32_t w[16]; + unsigned char tmp[64]; + int i; + + /* + * TODO: see if we can save on the first br_aes_ct64_ortho() + * call, since iv0/iv1/iv2 are constant for the whole run. + */ + memcpy(w, ivw, sizeof ivw); + w[3] = br_swap32(cc); + w[7] = br_swap32(cc + 1); + w[11] = br_swap32(cc + 2); + w[15] = br_swap32(cc + 3); + for (i = 0; i < 4; i ++) { + br_aes_ct64_interleave_in( + &q[i], &q[i + 4], w + (i << 2)); + } + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct64_ortho(q); + for (i = 0; i < 4; i ++) { + br_aes_ct64_interleave_out( + w + (i << 2), q[i], q[i + 4]); + } + br_range_enc32le(tmp, w, 16); + if (len <= 64) { + xorbuf(buf, tmp, len); + cc += (uint32_t)len >> 4; + break; + } + xorbuf(buf, tmp, 64); + buf += 64; + len -= 64; + cc += 4; + } + return cc; +} + +/* see bearssl_block.h */ +const br_block_ctr_class br_aes_ct64_ctr_vtable = { + sizeof(br_aes_ct64_ctr_keys), + 16, + 4, + (void (*)(const br_block_ctr_class **, const void *, size_t)) + &br_aes_ct64_ctr_init, + (uint32_t (*)(const br_block_ctr_class *const *, + const void *, uint32_t, void *, size_t)) + &br_aes_ct64_ctr_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64_ctrcbc.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64_ctrcbc.c new file mode 100644 index 00000000..21bb8efa --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64_ctrcbc.c @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_ct64_ctrcbc_init(br_aes_ct64_ctrcbc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_ct64_ctrcbc_vtable; + ctx->num_rounds = br_aes_ct64_keysched(ctx->skey, key, len); +} + +static void +xorbuf(void *dst, const void *src, size_t len) +{ + unsigned char *d; + const unsigned char *s; + + d = dst; + s = src; + while (len -- > 0) { + *d ++ ^= *s ++; + } +} + +/* see bearssl_block.h */ +void +br_aes_ct64_ctrcbc_ctr(const br_aes_ct64_ctrcbc_keys *ctx, + void *ctr, void *data, size_t len) +{ + unsigned char *buf; + unsigned char *ivbuf; + uint32_t iv0, iv1, iv2, iv3; + uint64_t sk_exp[120]; + + br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + + /* + * We keep the counter as four 32-bit values, with big-endian + * convention, because that's what is expected for purposes of + * incrementing the counter value. + */ + ivbuf = ctr; + iv0 = br_dec32be(ivbuf + 0); + iv1 = br_dec32be(ivbuf + 4); + iv2 = br_dec32be(ivbuf + 8); + iv3 = br_dec32be(ivbuf + 12); + + buf = data; + while (len > 0) { + uint64_t q[8]; + uint32_t w[16]; + unsigned char tmp[64]; + int i, j; + + /* + * The bitslice implementation expects values in + * little-endian convention, so we have to byteswap them. + */ + j = (len >= 64) ? 16 : (int)(len >> 2); + for (i = 0; i < j; i += 4) { + uint32_t carry; + + w[i + 0] = br_swap32(iv0); + w[i + 1] = br_swap32(iv1); + w[i + 2] = br_swap32(iv2); + w[i + 3] = br_swap32(iv3); + iv3 ++; + carry = ~(iv3 | -iv3) >> 31; + iv2 += carry; + carry &= -(~(iv2 | -iv2) >> 31); + iv1 += carry; + carry &= -(~(iv1 | -iv1) >> 31); + iv0 += carry; + } + memset(w + i, 0, (16 - i) * sizeof(uint32_t)); + + for (i = 0; i < 4; i ++) { + br_aes_ct64_interleave_in( + &q[i], &q[i + 4], w + (i << 2)); + } + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct64_ortho(q); + for (i = 0; i < 4; i ++) { + br_aes_ct64_interleave_out( + w + (i << 2), q[i], q[i + 4]); + } + + br_range_enc32le(tmp, w, 16); + if (len <= 64) { + xorbuf(buf, tmp, len); + break; + } + xorbuf(buf, tmp, 64); + buf += 64; + len -= 64; + } + br_enc32be(ivbuf + 0, iv0); + br_enc32be(ivbuf + 4, iv1); + br_enc32be(ivbuf + 8, iv2); + br_enc32be(ivbuf + 12, iv3); +} + +/* see bearssl_block.h */ +void +br_aes_ct64_ctrcbc_mac(const br_aes_ct64_ctrcbc_keys *ctx, + void *cbcmac, const void *data, size_t len) +{ + const unsigned char *buf; + uint32_t cm0, cm1, cm2, cm3; + uint64_t q[8]; + uint64_t sk_exp[120]; + + br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + + cm0 = br_dec32le((unsigned char *)cbcmac + 0); + cm1 = br_dec32le((unsigned char *)cbcmac + 4); + cm2 = br_dec32le((unsigned char *)cbcmac + 8); + cm3 = br_dec32le((unsigned char *)cbcmac + 12); + + buf = data; + memset(q, 0, sizeof q); + while (len > 0) { + uint32_t w[4]; + + w[0] = cm0 ^ br_dec32le(buf + 0); + w[1] = cm1 ^ br_dec32le(buf + 4); + w[2] = cm2 ^ br_dec32le(buf + 8); + w[3] = cm3 ^ br_dec32le(buf + 12); + + br_aes_ct64_interleave_in(&q[0], &q[4], w); + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct64_ortho(q); + br_aes_ct64_interleave_out(w, q[0], q[4]); + + cm0 = w[0]; + cm1 = w[1]; + cm2 = w[2]; + cm3 = w[3]; + buf += 16; + len -= 16; + } + + br_enc32le((unsigned char *)cbcmac + 0, cm0); + br_enc32le((unsigned char *)cbcmac + 4, cm1); + br_enc32le((unsigned char *)cbcmac + 8, cm2); + br_enc32le((unsigned char *)cbcmac + 12, cm3); +} + +/* see bearssl_block.h */ +void +br_aes_ct64_ctrcbc_encrypt(const br_aes_ct64_ctrcbc_keys *ctx, + void *ctr, void *cbcmac, void *data, size_t len) +{ + /* + * When encrypting, the CBC-MAC processing must be lagging by + * one block, since it operates on the encrypted values, so + * it must wait for that encryption to complete. + */ + + unsigned char *buf; + unsigned char *ivbuf; + uint32_t iv0, iv1, iv2, iv3; + uint32_t cm0, cm1, cm2, cm3; + uint64_t sk_exp[120]; + uint64_t q[8]; + int first_iter; + + br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + + /* + * We keep the counter as four 32-bit values, with big-endian + * convention, because that's what is expected for purposes of + * incrementing the counter value. + */ + ivbuf = ctr; + iv0 = br_dec32be(ivbuf + 0); + iv1 = br_dec32be(ivbuf + 4); + iv2 = br_dec32be(ivbuf + 8); + iv3 = br_dec32be(ivbuf + 12); + + /* + * The current CBC-MAC value is kept in little-endian convention. + */ + cm0 = br_dec32le((unsigned char *)cbcmac + 0); + cm1 = br_dec32le((unsigned char *)cbcmac + 4); + cm2 = br_dec32le((unsigned char *)cbcmac + 8); + cm3 = br_dec32le((unsigned char *)cbcmac + 12); + + buf = data; + first_iter = 1; + memset(q, 0, sizeof q); + while (len > 0) { + uint32_t w[8], carry; + + /* + * The bitslice implementation expects values in + * little-endian convention, so we have to byteswap them. + */ + w[0] = br_swap32(iv0); + w[1] = br_swap32(iv1); + w[2] = br_swap32(iv2); + w[3] = br_swap32(iv3); + iv3 ++; + carry = ~(iv3 | -iv3) >> 31; + iv2 += carry; + carry &= -(~(iv2 | -iv2) >> 31); + iv1 += carry; + carry &= -(~(iv1 | -iv1) >> 31); + iv0 += carry; + + /* + * The block for CBC-MAC. + */ + w[4] = cm0; + w[5] = cm1; + w[6] = cm2; + w[7] = cm3; + + br_aes_ct64_interleave_in(&q[0], &q[4], w); + br_aes_ct64_interleave_in(&q[1], &q[5], w + 4); + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct64_ortho(q); + br_aes_ct64_interleave_out(w, q[0], q[4]); + br_aes_ct64_interleave_out(w + 4, q[1], q[5]); + + /* + * We do the XOR with the plaintext in 32-bit registers, + * so that the value are available for CBC-MAC processing + * as well. + */ + w[0] ^= br_dec32le(buf + 0); + w[1] ^= br_dec32le(buf + 4); + w[2] ^= br_dec32le(buf + 8); + w[3] ^= br_dec32le(buf + 12); + br_enc32le(buf + 0, w[0]); + br_enc32le(buf + 4, w[1]); + br_enc32le(buf + 8, w[2]); + br_enc32le(buf + 12, w[3]); + + buf += 16; + len -= 16; + + /* + * We set the cm* values to the block to encrypt in the + * next iteration. + */ + if (first_iter) { + first_iter = 0; + cm0 ^= w[0]; + cm1 ^= w[1]; + cm2 ^= w[2]; + cm3 ^= w[3]; + } else { + cm0 = w[0] ^ w[4]; + cm1 = w[1] ^ w[5]; + cm2 = w[2] ^ w[6]; + cm3 = w[3] ^ w[7]; + } + + /* + * If this was the last iteration, then compute the + * extra block encryption to complete CBC-MAC. + */ + if (len == 0) { + w[0] = cm0; + w[1] = cm1; + w[2] = cm2; + w[3] = cm3; + br_aes_ct64_interleave_in(&q[0], &q[4], w); + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt( + ctx->num_rounds, sk_exp, q); + br_aes_ct64_ortho(q); + br_aes_ct64_interleave_out(w, q[0], q[4]); + cm0 = w[0]; + cm1 = w[1]; + cm2 = w[2]; + cm3 = w[3]; + break; + } + } + + br_enc32be(ivbuf + 0, iv0); + br_enc32be(ivbuf + 4, iv1); + br_enc32be(ivbuf + 8, iv2); + br_enc32be(ivbuf + 12, iv3); + br_enc32le((unsigned char *)cbcmac + 0, cm0); + br_enc32le((unsigned char *)cbcmac + 4, cm1); + br_enc32le((unsigned char *)cbcmac + 8, cm2); + br_enc32le((unsigned char *)cbcmac + 12, cm3); +} + +/* see bearssl_block.h */ +void +br_aes_ct64_ctrcbc_decrypt(const br_aes_ct64_ctrcbc_keys *ctx, + void *ctr, void *cbcmac, void *data, size_t len) +{ + unsigned char *buf; + unsigned char *ivbuf; + uint32_t iv0, iv1, iv2, iv3; + uint32_t cm0, cm1, cm2, cm3; + uint64_t sk_exp[120]; + uint64_t q[8]; + + br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + + /* + * We keep the counter as four 32-bit values, with big-endian + * convention, because that's what is expected for purposes of + * incrementing the counter value. + */ + ivbuf = ctr; + iv0 = br_dec32be(ivbuf + 0); + iv1 = br_dec32be(ivbuf + 4); + iv2 = br_dec32be(ivbuf + 8); + iv3 = br_dec32be(ivbuf + 12); + + /* + * The current CBC-MAC value is kept in little-endian convention. + */ + cm0 = br_dec32le((unsigned char *)cbcmac + 0); + cm1 = br_dec32le((unsigned char *)cbcmac + 4); + cm2 = br_dec32le((unsigned char *)cbcmac + 8); + cm3 = br_dec32le((unsigned char *)cbcmac + 12); + + buf = data; + memset(q, 0, sizeof q); + while (len > 0) { + uint32_t w[8], carry; + unsigned char tmp[16]; + + /* + * The bitslice implementation expects values in + * little-endian convention, so we have to byteswap them. + */ + w[0] = br_swap32(iv0); + w[1] = br_swap32(iv1); + w[2] = br_swap32(iv2); + w[3] = br_swap32(iv3); + iv3 ++; + carry = ~(iv3 | -iv3) >> 31; + iv2 += carry; + carry &= -(~(iv2 | -iv2) >> 31); + iv1 += carry; + carry &= -(~(iv1 | -iv1) >> 31); + iv0 += carry; + + /* + * The block for CBC-MAC. + */ + w[4] = cm0 ^ br_dec32le(buf + 0); + w[5] = cm1 ^ br_dec32le(buf + 4); + w[6] = cm2 ^ br_dec32le(buf + 8); + w[7] = cm3 ^ br_dec32le(buf + 12); + + br_aes_ct64_interleave_in(&q[0], &q[4], w); + br_aes_ct64_interleave_in(&q[1], &q[5], w + 4); + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct64_ortho(q); + br_aes_ct64_interleave_out(w, q[0], q[4]); + br_aes_ct64_interleave_out(w + 4, q[1], q[5]); + + br_enc32le(tmp + 0, w[0]); + br_enc32le(tmp + 4, w[1]); + br_enc32le(tmp + 8, w[2]); + br_enc32le(tmp + 12, w[3]); + xorbuf(buf, tmp, 16); + cm0 = w[4]; + cm1 = w[5]; + cm2 = w[6]; + cm3 = w[7]; + buf += 16; + len -= 16; + } + + br_enc32be(ivbuf + 0, iv0); + br_enc32be(ivbuf + 4, iv1); + br_enc32be(ivbuf + 8, iv2); + br_enc32be(ivbuf + 12, iv3); + br_enc32le((unsigned char *)cbcmac + 0, cm0); + br_enc32le((unsigned char *)cbcmac + 4, cm1); + br_enc32le((unsigned char *)cbcmac + 8, cm2); + br_enc32le((unsigned char *)cbcmac + 12, cm3); +} + +/* see bearssl_block.h */ +const br_block_ctrcbc_class br_aes_ct64_ctrcbc_vtable = { + sizeof(br_aes_ct64_ctrcbc_keys), + 16, + 4, + (void (*)(const br_block_ctrcbc_class **, const void *, size_t)) + &br_aes_ct64_ctrcbc_init, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, void *, size_t)) + &br_aes_ct64_ctrcbc_encrypt, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, void *, size_t)) + &br_aes_ct64_ctrcbc_decrypt, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, size_t)) + &br_aes_ct64_ctrcbc_ctr, + (void (*)(const br_block_ctrcbc_class *const *, + void *, const void *, size_t)) + &br_aes_ct64_ctrcbc_mac +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64_dec.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64_dec.c new file mode 100644 index 00000000..ab00e099 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64_dec.c @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see inner.h */ +void +br_aes_ct64_bitslice_invSbox(uint64_t *q) +{ + /* + * See br_aes_ct_bitslice_invSbox(). This is the natural extension + * to 64-bit registers. + */ + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = ~q[5]; + q6 = ~q[6]; + q7 = q[7]; + q[7] = q1 ^ q4 ^ q6; + q[6] = q0 ^ q3 ^ q5; + q[5] = q7 ^ q2 ^ q4; + q[4] = q6 ^ q1 ^ q3; + q[3] = q5 ^ q0 ^ q2; + q[2] = q4 ^ q7 ^ q1; + q[1] = q3 ^ q6 ^ q0; + q[0] = q2 ^ q5 ^ q7; + + br_aes_ct64_bitslice_Sbox(q); + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = ~q[5]; + q6 = ~q[6]; + q7 = q[7]; + q[7] = q1 ^ q4 ^ q6; + q[6] = q0 ^ q3 ^ q5; + q[5] = q7 ^ q2 ^ q4; + q[4] = q6 ^ q1 ^ q3; + q[3] = q5 ^ q0 ^ q2; + q[2] = q4 ^ q7 ^ q1; + q[1] = q3 ^ q6 ^ q0; + q[0] = q2 ^ q5 ^ q7; +} + +static void +add_round_key(uint64_t *q, const uint64_t *sk) +{ + int i; + + for (i = 0; i < 8; i ++) { + q[i] ^= sk[i]; + } +} + +static void +inv_shift_rows(uint64_t *q) +{ + int i; + + for (i = 0; i < 8; i ++) { + uint64_t x; + + x = q[i]; + q[i] = (x & (uint64_t)0x000000000000FFFF) + | ((x & (uint64_t)0x000000000FFF0000) << 4) + | ((x & (uint64_t)0x00000000F0000000) >> 12) + | ((x & (uint64_t)0x000000FF00000000) << 8) + | ((x & (uint64_t)0x0000FF0000000000) >> 8) + | ((x & (uint64_t)0x000F000000000000) << 12) + | ((x & (uint64_t)0xFFF0000000000000) >> 4); + } +} + +static inline uint64_t +rotr32(uint64_t x) +{ + return (x << 32) | (x >> 32); +} + +static void +inv_mix_columns(uint64_t *q) +{ + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 16) | (q0 << 48); + r1 = (q1 >> 16) | (q1 << 48); + r2 = (q2 >> 16) | (q2 << 48); + r3 = (q3 >> 16) | (q3 << 48); + r4 = (q4 >> 16) | (q4 << 48); + r5 = (q5 >> 16) | (q5 << 48); + r6 = (q6 >> 16) | (q6 << 48); + r7 = (q7 >> 16) | (q7 << 48); + + q[0] = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr32(q0 ^ q5 ^ q6 ^ r0 ^ r5); + q[1] = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6); + q[2] = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr32(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7); + q[3] = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr32(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7); + q[4] = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6); + q[5] = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr32(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7); + q[6] = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr32(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7); + q[7] = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr32(q4 ^ q5 ^ q7 ^ r4 ^ r7); +} + +/* see inner.h */ +void +br_aes_ct64_bitslice_decrypt(unsigned num_rounds, + const uint64_t *skey, uint64_t *q) +{ + unsigned u; + + add_round_key(q, skey + (num_rounds << 3)); + for (u = num_rounds - 1; u > 0; u --) { + inv_shift_rows(q); + br_aes_ct64_bitslice_invSbox(q); + add_round_key(q, skey + (u << 3)); + inv_mix_columns(q); + } + inv_shift_rows(q); + br_aes_ct64_bitslice_invSbox(q); + add_round_key(q, skey); +} diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct64_enc.c b/test/monniaux/BearSSL/src/symcipher/aes_ct64_enc.c new file mode 100644 index 00000000..78631ced --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct64_enc.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +static inline void +add_round_key(uint64_t *q, const uint64_t *sk) +{ + q[0] ^= sk[0]; + q[1] ^= sk[1]; + q[2] ^= sk[2]; + q[3] ^= sk[3]; + q[4] ^= sk[4]; + q[5] ^= sk[5]; + q[6] ^= sk[6]; + q[7] ^= sk[7]; +} + +static inline void +shift_rows(uint64_t *q) +{ + int i; + + for (i = 0; i < 8; i ++) { + uint64_t x; + + x = q[i]; + q[i] = (x & (uint64_t)0x000000000000FFFF) + | ((x & (uint64_t)0x00000000FFF00000) >> 4) + | ((x & (uint64_t)0x00000000000F0000) << 12) + | ((x & (uint64_t)0x0000FF0000000000) >> 8) + | ((x & (uint64_t)0x000000FF00000000) << 8) + | ((x & (uint64_t)0xF000000000000000) >> 12) + | ((x & (uint64_t)0x0FFF000000000000) << 4); + } +} + +static inline uint64_t +rotr32(uint64_t x) +{ + return (x << 32) | (x >> 32); +} + +static inline void +mix_columns(uint64_t *q) +{ + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 16) | (q0 << 48); + r1 = (q1 >> 16) | (q1 << 48); + r2 = (q2 >> 16) | (q2 << 48); + r3 = (q3 >> 16) | (q3 << 48); + r4 = (q4 >> 16) | (q4 << 48); + r5 = (q5 >> 16) | (q5 << 48); + r6 = (q6 >> 16) | (q6 << 48); + r7 = (q7 >> 16) | (q7 << 48); + + q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); + q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); + q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); + q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); + q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4); + q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); + q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); + q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); +} + +/* see inner.h */ +void +br_aes_ct64_bitslice_encrypt(unsigned num_rounds, + const uint64_t *skey, uint64_t *q) +{ + unsigned u; + + add_round_key(q, skey); + for (u = 1; u < num_rounds; u ++) { + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + mix_columns(q); + add_round_key(q, skey + (u << 3)); + } + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + add_round_key(q, skey + (num_rounds << 3)); +} diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/aes_ct_cbcdec.c new file mode 100644 index 00000000..522645ad --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct_cbcdec.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_ct_cbcdec_init(br_aes_ct_cbcdec_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_ct_cbcdec_vtable; + ctx->num_rounds = br_aes_ct_keysched(ctx->skey, key, len); +} + +/* see bearssl_block.h */ +void +br_aes_ct_cbcdec_run(const br_aes_ct_cbcdec_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf, *ivbuf; + uint32_t iv0, iv1, iv2, iv3; + uint32_t sk_exp[120]; + + br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + ivbuf = iv; + iv0 = br_dec32le(ivbuf); + iv1 = br_dec32le(ivbuf + 4); + iv2 = br_dec32le(ivbuf + 8); + iv3 = br_dec32le(ivbuf + 12); + buf = data; + while (len > 0) { + uint32_t q[8], sq[8]; + + q[0] = br_dec32le(buf); + q[2] = br_dec32le(buf + 4); + q[4] = br_dec32le(buf + 8); + q[6] = br_dec32le(buf + 12); + if (len >= 32) { + q[1] = br_dec32le(buf + 16); + q[3] = br_dec32le(buf + 20); + q[5] = br_dec32le(buf + 24); + q[7] = br_dec32le(buf + 28); + } else { + q[1] = 0; + q[3] = 0; + q[5] = 0; + q[7] = 0; + } + memcpy(sq, q, sizeof q); + br_aes_ct_ortho(q); + br_aes_ct_bitslice_decrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct_ortho(q); + br_enc32le(buf, q[0] ^ iv0); + br_enc32le(buf + 4, q[2] ^ iv1); + br_enc32le(buf + 8, q[4] ^ iv2); + br_enc32le(buf + 12, q[6] ^ iv3); + if (len < 32) { + iv0 = sq[0]; + iv1 = sq[2]; + iv2 = sq[4]; + iv3 = sq[6]; + break; + } + br_enc32le(buf + 16, q[1] ^ sq[0]); + br_enc32le(buf + 20, q[3] ^ sq[2]); + br_enc32le(buf + 24, q[5] ^ sq[4]); + br_enc32le(buf + 28, q[7] ^ sq[6]); + iv0 = sq[1]; + iv1 = sq[3]; + iv2 = sq[5]; + iv3 = sq[7]; + buf += 32; + len -= 32; + } + br_enc32le(ivbuf, iv0); + br_enc32le(ivbuf + 4, iv1); + br_enc32le(ivbuf + 8, iv2); + br_enc32le(ivbuf + 12, iv3); +} + +/* see bearssl_block.h */ +const br_block_cbcdec_class br_aes_ct_cbcdec_vtable = { + sizeof(br_aes_ct_cbcdec_keys), + 16, + 4, + (void (*)(const br_block_cbcdec_class **, const void *, size_t)) + &br_aes_ct_cbcdec_init, + (void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t)) + &br_aes_ct_cbcdec_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/aes_ct_cbcenc.c new file mode 100644 index 00000000..cb85977b --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct_cbcenc.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_ct_cbcenc_init(br_aes_ct_cbcenc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_ct_cbcenc_vtable; + ctx->num_rounds = br_aes_ct_keysched(ctx->skey, key, len); +} + +/* see bearssl_block.h */ +void +br_aes_ct_cbcenc_run(const br_aes_ct_cbcenc_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf, *ivbuf; + uint32_t q[8]; + uint32_t iv0, iv1, iv2, iv3; + uint32_t sk_exp[120]; + + q[1] = 0; + q[3] = 0; + q[5] = 0; + q[7] = 0; + br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + ivbuf = iv; + iv0 = br_dec32le(ivbuf); + iv1 = br_dec32le(ivbuf + 4); + iv2 = br_dec32le(ivbuf + 8); + iv3 = br_dec32le(ivbuf + 12); + buf = data; + while (len > 0) { + q[0] = iv0 ^ br_dec32le(buf); + q[2] = iv1 ^ br_dec32le(buf + 4); + q[4] = iv2 ^ br_dec32le(buf + 8); + q[6] = iv3 ^ br_dec32le(buf + 12); + br_aes_ct_ortho(q); + br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct_ortho(q); + iv0 = q[0]; + iv1 = q[2]; + iv2 = q[4]; + iv3 = q[6]; + br_enc32le(buf, iv0); + br_enc32le(buf + 4, iv1); + br_enc32le(buf + 8, iv2); + br_enc32le(buf + 12, iv3); + buf += 16; + len -= 16; + } + br_enc32le(ivbuf, iv0); + br_enc32le(ivbuf + 4, iv1); + br_enc32le(ivbuf + 8, iv2); + br_enc32le(ivbuf + 12, iv3); +} + +/* see bearssl_block.h */ +const br_block_cbcenc_class br_aes_ct_cbcenc_vtable = { + sizeof(br_aes_ct_cbcenc_keys), + 16, + 4, + (void (*)(const br_block_cbcenc_class **, const void *, size_t)) + &br_aes_ct_cbcenc_init, + (void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t)) + &br_aes_ct_cbcenc_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct_ctr.c b/test/monniaux/BearSSL/src/symcipher/aes_ct_ctr.c new file mode 100644 index 00000000..f407689e --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct_ctr.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_ct_ctr_init(br_aes_ct_ctr_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_ct_ctr_vtable; + ctx->num_rounds = br_aes_ct_keysched(ctx->skey, key, len); +} + +static void +xorbuf(void *dst, const void *src, size_t len) +{ + unsigned char *d; + const unsigned char *s; + + d = dst; + s = src; + while (len -- > 0) { + *d ++ ^= *s ++; + } +} + +/* see bearssl_block.h */ +uint32_t +br_aes_ct_ctr_run(const br_aes_ct_ctr_keys *ctx, + const void *iv, uint32_t cc, void *data, size_t len) +{ + unsigned char *buf; + const unsigned char *ivbuf; + uint32_t iv0, iv1, iv2; + uint32_t sk_exp[120]; + + br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + ivbuf = iv; + iv0 = br_dec32le(ivbuf); + iv1 = br_dec32le(ivbuf + 4); + iv2 = br_dec32le(ivbuf + 8); + buf = data; + while (len > 0) { + uint32_t q[8]; + unsigned char tmp[32]; + + /* + * TODO: see if we can save on the first br_aes_ct_ortho() + * call, since iv0/iv1/iv2 are constant for the whole run. + */ + q[0] = q[1] = iv0; + q[2] = q[3] = iv1; + q[4] = q[5] = iv2; + q[6] = br_swap32(cc); + q[7] = br_swap32(cc + 1); + br_aes_ct_ortho(q); + br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct_ortho(q); + br_enc32le(tmp, q[0]); + br_enc32le(tmp + 4, q[2]); + br_enc32le(tmp + 8, q[4]); + br_enc32le(tmp + 12, q[6]); + br_enc32le(tmp + 16, q[1]); + br_enc32le(tmp + 20, q[3]); + br_enc32le(tmp + 24, q[5]); + br_enc32le(tmp + 28, q[7]); + + if (len <= 32) { + xorbuf(buf, tmp, len); + cc ++; + if (len > 16) { + cc ++; + } + break; + } + xorbuf(buf, tmp, 32); + buf += 32; + len -= 32; + cc += 2; + } + return cc; +} + +/* see bearssl_block.h */ +const br_block_ctr_class br_aes_ct_ctr_vtable = { + sizeof(br_aes_ct_ctr_keys), + 16, + 4, + (void (*)(const br_block_ctr_class **, const void *, size_t)) + &br_aes_ct_ctr_init, + (uint32_t (*)(const br_block_ctr_class *const *, + const void *, uint32_t, void *, size_t)) + &br_aes_ct_ctr_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct_ctrcbc.c b/test/monniaux/BearSSL/src/symcipher/aes_ct_ctrcbc.c new file mode 100644 index 00000000..8ae9fc75 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct_ctrcbc.c @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_ct_ctrcbc_init(br_aes_ct_ctrcbc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_ct_ctrcbc_vtable; + ctx->num_rounds = br_aes_ct_keysched(ctx->skey, key, len); +} + +static void +xorbuf(void *dst, const void *src, size_t len) +{ + unsigned char *d; + const unsigned char *s; + + d = dst; + s = src; + while (len -- > 0) { + *d ++ ^= *s ++; + } +} + +/* see bearssl_block.h */ +void +br_aes_ct_ctrcbc_ctr(const br_aes_ct_ctrcbc_keys *ctx, + void *ctr, void *data, size_t len) +{ + unsigned char *buf; + unsigned char *ivbuf; + uint32_t iv0, iv1, iv2, iv3; + uint32_t sk_exp[120]; + + br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + + /* + * We keep the counter as four 32-bit values, with big-endian + * convention, because that's what is expected for purposes of + * incrementing the counter value. + */ + ivbuf = ctr; + iv0 = br_dec32be(ivbuf + 0); + iv1 = br_dec32be(ivbuf + 4); + iv2 = br_dec32be(ivbuf + 8); + iv3 = br_dec32be(ivbuf + 12); + + buf = data; + while (len > 0) { + uint32_t q[8], carry; + unsigned char tmp[32]; + + /* + * The bitslice implementation expects values in + * little-endian convention, so we have to byteswap them. + */ + q[0] = br_swap32(iv0); + q[2] = br_swap32(iv1); + q[4] = br_swap32(iv2); + q[6] = br_swap32(iv3); + iv3 ++; + carry = ~(iv3 | -iv3) >> 31; + iv2 += carry; + carry &= -(~(iv2 | -iv2) >> 31); + iv1 += carry; + carry &= -(~(iv1 | -iv1) >> 31); + iv0 += carry; + q[1] = br_swap32(iv0); + q[3] = br_swap32(iv1); + q[5] = br_swap32(iv2); + q[7] = br_swap32(iv3); + if (len > 16) { + iv3 ++; + carry = ~(iv3 | -iv3) >> 31; + iv2 += carry; + carry &= -(~(iv2 | -iv2) >> 31); + iv1 += carry; + carry &= -(~(iv1 | -iv1) >> 31); + iv0 += carry; + } + + br_aes_ct_ortho(q); + br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct_ortho(q); + + br_enc32le(tmp, q[0]); + br_enc32le(tmp + 4, q[2]); + br_enc32le(tmp + 8, q[4]); + br_enc32le(tmp + 12, q[6]); + br_enc32le(tmp + 16, q[1]); + br_enc32le(tmp + 20, q[3]); + br_enc32le(tmp + 24, q[5]); + br_enc32le(tmp + 28, q[7]); + + if (len <= 32) { + xorbuf(buf, tmp, len); + break; + } + xorbuf(buf, tmp, 32); + buf += 32; + len -= 32; + } + br_enc32be(ivbuf + 0, iv0); + br_enc32be(ivbuf + 4, iv1); + br_enc32be(ivbuf + 8, iv2); + br_enc32be(ivbuf + 12, iv3); +} + +/* see bearssl_block.h */ +void +br_aes_ct_ctrcbc_mac(const br_aes_ct_ctrcbc_keys *ctx, + void *cbcmac, const void *data, size_t len) +{ + const unsigned char *buf; + uint32_t cm0, cm1, cm2, cm3; + uint32_t q[8]; + uint32_t sk_exp[120]; + + br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + + buf = data; + cm0 = br_dec32le((unsigned char *)cbcmac + 0); + cm1 = br_dec32le((unsigned char *)cbcmac + 4); + cm2 = br_dec32le((unsigned char *)cbcmac + 8); + cm3 = br_dec32le((unsigned char *)cbcmac + 12); + q[1] = 0; + q[3] = 0; + q[5] = 0; + q[7] = 0; + + while (len > 0) { + q[0] = cm0 ^ br_dec32le(buf + 0); + q[2] = cm1 ^ br_dec32le(buf + 4); + q[4] = cm2 ^ br_dec32le(buf + 8); + q[6] = cm3 ^ br_dec32le(buf + 12); + + br_aes_ct_ortho(q); + br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct_ortho(q); + + cm0 = q[0]; + cm1 = q[2]; + cm2 = q[4]; + cm3 = q[6]; + buf += 16; + len -= 16; + } + + br_enc32le((unsigned char *)cbcmac + 0, cm0); + br_enc32le((unsigned char *)cbcmac + 4, cm1); + br_enc32le((unsigned char *)cbcmac + 8, cm2); + br_enc32le((unsigned char *)cbcmac + 12, cm3); +} + +/* see bearssl_block.h */ +void +br_aes_ct_ctrcbc_encrypt(const br_aes_ct_ctrcbc_keys *ctx, + void *ctr, void *cbcmac, void *data, size_t len) +{ + /* + * When encrypting, the CBC-MAC processing must be lagging by + * one block, since it operates on the encrypted values, so + * it must wait for that encryption to complete. + */ + + unsigned char *buf; + unsigned char *ivbuf; + uint32_t iv0, iv1, iv2, iv3; + uint32_t cm0, cm1, cm2, cm3; + uint32_t sk_exp[120]; + int first_iter; + + br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + + /* + * We keep the counter as four 32-bit values, with big-endian + * convention, because that's what is expected for purposes of + * incrementing the counter value. + */ + ivbuf = ctr; + iv0 = br_dec32be(ivbuf + 0); + iv1 = br_dec32be(ivbuf + 4); + iv2 = br_dec32be(ivbuf + 8); + iv3 = br_dec32be(ivbuf + 12); + + /* + * The current CBC-MAC value is kept in little-endian convention. + */ + cm0 = br_dec32le((unsigned char *)cbcmac + 0); + cm1 = br_dec32le((unsigned char *)cbcmac + 4); + cm2 = br_dec32le((unsigned char *)cbcmac + 8); + cm3 = br_dec32le((unsigned char *)cbcmac + 12); + + buf = data; + first_iter = 1; + while (len > 0) { + uint32_t q[8], carry; + + /* + * The bitslice implementation expects values in + * little-endian convention, so we have to byteswap them. + */ + q[0] = br_swap32(iv0); + q[2] = br_swap32(iv1); + q[4] = br_swap32(iv2); + q[6] = br_swap32(iv3); + iv3 ++; + carry = ~(iv3 | -iv3) >> 31; + iv2 += carry; + carry &= -(~(iv2 | -iv2) >> 31); + iv1 += carry; + carry &= -(~(iv1 | -iv1) >> 31); + iv0 += carry; + + /* + * The odd values are used for CBC-MAC. + */ + q[1] = cm0; + q[3] = cm1; + q[5] = cm2; + q[7] = cm3; + + br_aes_ct_ortho(q); + br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct_ortho(q); + + /* + * We do the XOR with the plaintext in 32-bit registers, + * so that the value are available for CBC-MAC processing + * as well. + */ + q[0] ^= br_dec32le(buf + 0); + q[2] ^= br_dec32le(buf + 4); + q[4] ^= br_dec32le(buf + 8); + q[6] ^= br_dec32le(buf + 12); + br_enc32le(buf + 0, q[0]); + br_enc32le(buf + 4, q[2]); + br_enc32le(buf + 8, q[4]); + br_enc32le(buf + 12, q[6]); + + buf += 16; + len -= 16; + + /* + * We set the cm* values to the block to encrypt in the + * next iteration. + */ + if (first_iter) { + first_iter = 0; + cm0 ^= q[0]; + cm1 ^= q[2]; + cm2 ^= q[4]; + cm3 ^= q[6]; + } else { + cm0 = q[0] ^ q[1]; + cm1 = q[2] ^ q[3]; + cm2 = q[4] ^ q[5]; + cm3 = q[6] ^ q[7]; + } + + /* + * If this was the last iteration, then compute the + * extra block encryption to complete CBC-MAC. + */ + if (len == 0) { + q[0] = cm0; + q[2] = cm1; + q[4] = cm2; + q[6] = cm3; + br_aes_ct_ortho(q); + br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct_ortho(q); + cm0 = q[0]; + cm1 = q[2]; + cm2 = q[4]; + cm3 = q[6]; + break; + } + } + + br_enc32be(ivbuf + 0, iv0); + br_enc32be(ivbuf + 4, iv1); + br_enc32be(ivbuf + 8, iv2); + br_enc32be(ivbuf + 12, iv3); + br_enc32le((unsigned char *)cbcmac + 0, cm0); + br_enc32le((unsigned char *)cbcmac + 4, cm1); + br_enc32le((unsigned char *)cbcmac + 8, cm2); + br_enc32le((unsigned char *)cbcmac + 12, cm3); +} + +/* see bearssl_block.h */ +void +br_aes_ct_ctrcbc_decrypt(const br_aes_ct_ctrcbc_keys *ctx, + void *ctr, void *cbcmac, void *data, size_t len) +{ + unsigned char *buf; + unsigned char *ivbuf; + uint32_t iv0, iv1, iv2, iv3; + uint32_t cm0, cm1, cm2, cm3; + uint32_t sk_exp[120]; + + br_aes_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + + /* + * We keep the counter as four 32-bit values, with big-endian + * convention, because that's what is expected for purposes of + * incrementing the counter value. + */ + ivbuf = ctr; + iv0 = br_dec32be(ivbuf + 0); + iv1 = br_dec32be(ivbuf + 4); + iv2 = br_dec32be(ivbuf + 8); + iv3 = br_dec32be(ivbuf + 12); + + /* + * The current CBC-MAC value is kept in little-endian convention. + */ + cm0 = br_dec32le((unsigned char *)cbcmac + 0); + cm1 = br_dec32le((unsigned char *)cbcmac + 4); + cm2 = br_dec32le((unsigned char *)cbcmac + 8); + cm3 = br_dec32le((unsigned char *)cbcmac + 12); + + buf = data; + while (len > 0) { + uint32_t q[8], carry; + unsigned char tmp[16]; + + /* + * The bitslice implementation expects values in + * little-endian convention, so we have to byteswap them. + */ + q[0] = br_swap32(iv0); + q[2] = br_swap32(iv1); + q[4] = br_swap32(iv2); + q[6] = br_swap32(iv3); + iv3 ++; + carry = ~(iv3 | -iv3) >> 31; + iv2 += carry; + carry &= -(~(iv2 | -iv2) >> 31); + iv1 += carry; + carry &= -(~(iv1 | -iv1) >> 31); + iv0 += carry; + + /* + * The odd values are used for CBC-MAC. + */ + q[1] = cm0 ^ br_dec32le(buf + 0); + q[3] = cm1 ^ br_dec32le(buf + 4); + q[5] = cm2 ^ br_dec32le(buf + 8); + q[7] = cm3 ^ br_dec32le(buf + 12); + + br_aes_ct_ortho(q); + br_aes_ct_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct_ortho(q); + + br_enc32le(tmp + 0, q[0]); + br_enc32le(tmp + 4, q[2]); + br_enc32le(tmp + 8, q[4]); + br_enc32le(tmp + 12, q[6]); + xorbuf(buf, tmp, 16); + cm0 = q[1]; + cm1 = q[3]; + cm2 = q[5]; + cm3 = q[7]; + buf += 16; + len -= 16; + } + + br_enc32be(ivbuf + 0, iv0); + br_enc32be(ivbuf + 4, iv1); + br_enc32be(ivbuf + 8, iv2); + br_enc32be(ivbuf + 12, iv3); + br_enc32le((unsigned char *)cbcmac + 0, cm0); + br_enc32le((unsigned char *)cbcmac + 4, cm1); + br_enc32le((unsigned char *)cbcmac + 8, cm2); + br_enc32le((unsigned char *)cbcmac + 12, cm3); +} + +/* see bearssl_block.h */ +const br_block_ctrcbc_class br_aes_ct_ctrcbc_vtable = { + sizeof(br_aes_ct_ctrcbc_keys), + 16, + 4, + (void (*)(const br_block_ctrcbc_class **, const void *, size_t)) + &br_aes_ct_ctrcbc_init, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, void *, size_t)) + &br_aes_ct_ctrcbc_encrypt, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, void *, size_t)) + &br_aes_ct_ctrcbc_decrypt, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, size_t)) + &br_aes_ct_ctrcbc_ctr, + (void (*)(const br_block_ctrcbc_class *const *, + void *, const void *, size_t)) + &br_aes_ct_ctrcbc_mac +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct_dec.c b/test/monniaux/BearSSL/src/symcipher/aes_ct_dec.c new file mode 100644 index 00000000..7f32d2bd --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct_dec.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see inner.h */ +void +br_aes_ct_bitslice_invSbox(uint32_t *q) +{ + /* + * AES S-box is: + * S(x) = A(I(x)) ^ 0x63 + * where I() is inversion in GF(256), and A() is a linear + * transform (0 is formally defined to be its own inverse). + * Since inversion is an involution, the inverse S-box can be + * computed from the S-box as: + * iS(x) = B(S(B(x ^ 0x63)) ^ 0x63) + * where B() is the inverse of A(). Indeed, for any y in GF(256): + * iS(S(y)) = B(A(I(B(A(I(y)) ^ 0x63 ^ 0x63))) ^ 0x63 ^ 0x63) = y + * + * Note: we reuse the implementation of the forward S-box, + * instead of duplicating it here, so that total code size is + * lower. By merging the B() transforms into the S-box circuit + * we could make faster CBC decryption, but CBC decryption is + * already quite faster than CBC encryption because we can + * process two blocks in parallel. + */ + uint32_t q0, q1, q2, q3, q4, q5, q6, q7; + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = ~q[5]; + q6 = ~q[6]; + q7 = q[7]; + q[7] = q1 ^ q4 ^ q6; + q[6] = q0 ^ q3 ^ q5; + q[5] = q7 ^ q2 ^ q4; + q[4] = q6 ^ q1 ^ q3; + q[3] = q5 ^ q0 ^ q2; + q[2] = q4 ^ q7 ^ q1; + q[1] = q3 ^ q6 ^ q0; + q[0] = q2 ^ q5 ^ q7; + + br_aes_ct_bitslice_Sbox(q); + + q0 = ~q[0]; + q1 = ~q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = ~q[5]; + q6 = ~q[6]; + q7 = q[7]; + q[7] = q1 ^ q4 ^ q6; + q[6] = q0 ^ q3 ^ q5; + q[5] = q7 ^ q2 ^ q4; + q[4] = q6 ^ q1 ^ q3; + q[3] = q5 ^ q0 ^ q2; + q[2] = q4 ^ q7 ^ q1; + q[1] = q3 ^ q6 ^ q0; + q[0] = q2 ^ q5 ^ q7; +} + +static void +add_round_key(uint32_t *q, const uint32_t *sk) +{ + int i; + + for (i = 0; i < 8; i ++) { + q[i] ^= sk[i]; + } +} + +static void +inv_shift_rows(uint32_t *q) +{ + int i; + + for (i = 0; i < 8; i ++) { + uint32_t x; + + x = q[i]; + q[i] = (x & 0x000000FF) + | ((x & 0x00003F00) << 2) | ((x & 0x0000C000) >> 6) + | ((x & 0x000F0000) << 4) | ((x & 0x00F00000) >> 4) + | ((x & 0x03000000) << 6) | ((x & 0xFC000000) >> 2); + } +} + +static inline uint32_t +rotr16(uint32_t x) +{ + return (x << 16) | (x >> 16); +} + +static void +inv_mix_columns(uint32_t *q) +{ + uint32_t q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 8) | (q0 << 24); + r1 = (q1 >> 8) | (q1 << 24); + r2 = (q2 >> 8) | (q2 << 24); + r3 = (q3 >> 8) | (q3 << 24); + r4 = (q4 >> 8) | (q4 << 24); + r5 = (q5 >> 8) | (q5 << 24); + r6 = (q6 >> 8) | (q6 << 24); + r7 = (q7 >> 8) | (q7 << 24); + + q[0] = q5 ^ q6 ^ q7 ^ r0 ^ r5 ^ r7 ^ rotr16(q0 ^ q5 ^ q6 ^ r0 ^ r5); + q[1] = q0 ^ q5 ^ r0 ^ r1 ^ r5 ^ r6 ^ r7 ^ rotr16(q1 ^ q5 ^ q7 ^ r1 ^ r5 ^ r6); + q[2] = q0 ^ q1 ^ q6 ^ r1 ^ r2 ^ r6 ^ r7 ^ rotr16(q0 ^ q2 ^ q6 ^ r2 ^ r6 ^ r7); + q[3] = q0 ^ q1 ^ q2 ^ q5 ^ q6 ^ r0 ^ r2 ^ r3 ^ r5 ^ rotr16(q0 ^ q1 ^ q3 ^ q5 ^ q6 ^ q7 ^ r0 ^ r3 ^ r5 ^ r7); + q[4] = q1 ^ q2 ^ q3 ^ q5 ^ r1 ^ r3 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr16(q1 ^ q2 ^ q4 ^ q5 ^ q7 ^ r1 ^ r4 ^ r5 ^ r6); + q[5] = q2 ^ q3 ^ q4 ^ q6 ^ r2 ^ r4 ^ r5 ^ r6 ^ r7 ^ rotr16(q2 ^ q3 ^ q5 ^ q6 ^ r2 ^ r5 ^ r6 ^ r7); + q[6] = q3 ^ q4 ^ q5 ^ q7 ^ r3 ^ r5 ^ r6 ^ r7 ^ rotr16(q3 ^ q4 ^ q6 ^ q7 ^ r3 ^ r6 ^ r7); + q[7] = q4 ^ q5 ^ q6 ^ r4 ^ r6 ^ r7 ^ rotr16(q4 ^ q5 ^ q7 ^ r4 ^ r7); +} + +/* see inner.h */ +void +br_aes_ct_bitslice_decrypt(unsigned num_rounds, + const uint32_t *skey, uint32_t *q) +{ + unsigned u; + + add_round_key(q, skey + (num_rounds << 3)); + for (u = num_rounds - 1; u > 0; u --) { + inv_shift_rows(q); + br_aes_ct_bitslice_invSbox(q); + add_round_key(q, skey + (u << 3)); + inv_mix_columns(q); + } + inv_shift_rows(q); + br_aes_ct_bitslice_invSbox(q); + add_round_key(q, skey); +} diff --git a/test/monniaux/BearSSL/src/symcipher/aes_ct_enc.c b/test/monniaux/BearSSL/src/symcipher/aes_ct_enc.c new file mode 100644 index 00000000..089bf356 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_ct_enc.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +static inline void +add_round_key(uint32_t *q, const uint32_t *sk) +{ + q[0] ^= sk[0]; + q[1] ^= sk[1]; + q[2] ^= sk[2]; + q[3] ^= sk[3]; + q[4] ^= sk[4]; + q[5] ^= sk[5]; + q[6] ^= sk[6]; + q[7] ^= sk[7]; +} + +static inline void +shift_rows(uint32_t *q) +{ + int i; + + for (i = 0; i < 8; i ++) { + uint32_t x; + + x = q[i]; + q[i] = (x & 0x000000FF) + | ((x & 0x0000FC00) >> 2) | ((x & 0x00000300) << 6) + | ((x & 0x00F00000) >> 4) | ((x & 0x000F0000) << 4) + | ((x & 0xC0000000) >> 6) | ((x & 0x3F000000) << 2); + } +} + +static inline uint32_t +rotr16(uint32_t x) +{ + return (x << 16) | (x >> 16); +} + +static inline void +mix_columns(uint32_t *q) +{ + uint32_t q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 8) | (q0 << 24); + r1 = (q1 >> 8) | (q1 << 24); + r2 = (q2 >> 8) | (q2 << 24); + r3 = (q3 >> 8) | (q3 << 24); + r4 = (q4 >> 8) | (q4 << 24); + r5 = (q5 >> 8) | (q5 << 24); + r6 = (q6 >> 8) | (q6 << 24); + r7 = (q7 >> 8) | (q7 << 24); + + q[0] = q7 ^ r7 ^ r0 ^ rotr16(q0 ^ r0); + q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr16(q1 ^ r1); + q[2] = q1 ^ r1 ^ r2 ^ rotr16(q2 ^ r2); + q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr16(q3 ^ r3); + q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr16(q4 ^ r4); + q[5] = q4 ^ r4 ^ r5 ^ rotr16(q5 ^ r5); + q[6] = q5 ^ r5 ^ r6 ^ rotr16(q6 ^ r6); + q[7] = q6 ^ r6 ^ r7 ^ rotr16(q7 ^ r7); +} + +/* see inner.h */ +void +br_aes_ct_bitslice_encrypt(unsigned num_rounds, + const uint32_t *skey, uint32_t *q) +{ + unsigned u; + + add_round_key(q, skey); + for (u = 1; u < num_rounds; u ++) { + br_aes_ct_bitslice_Sbox(q); + shift_rows(q); + mix_columns(q); + add_round_key(q, skey + (u << 3)); + } + br_aes_ct_bitslice_Sbox(q); + shift_rows(q); + add_round_key(q, skey + (num_rounds << 3)); +} diff --git a/test/monniaux/BearSSL/src/symcipher/aes_pwr8.c b/test/monniaux/BearSSL/src/symcipher/aes_pwr8.c new file mode 100644 index 00000000..b2c63c32 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_pwr8.c @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_POWER_ASM_MACROS 1 +#include "inner.h" + +/* + * This code contains the AES key schedule implementation using the + * POWER8 opcodes. + */ + +#if BR_POWER8 + +static void +key_schedule_128(unsigned char *sk, const unsigned char *key) +{ + long cc; + + static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B }; +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc = 0; + + /* + * We use the VSX instructions for loading and storing the + * key/subkeys, since they support unaligned accesses. The rest + * of the computation is VMX only. VMX register 0 is VSX + * register 32. + */ + asm volatile ( + + /* + * v0 = all-zero word + * v1 = constant -8 / +8, copied into four words + * v2 = current subkey + * v3 = Rcon (x4 words) + * v6 = constant 8, copied into four words + * v7 = constant 0x11B, copied into four words + * v8 = constant for byteswapping words + */ + vspltisw(0, 0) +#if BR_POWER8_LE + vspltisw(1, -8) +#else + vspltisw(1, 8) +#endif + lxvw4x(34, 0, %[key]) + vspltisw(3, 1) + vspltisw(6, 8) + lxvw4x(39, 0, %[fmod]) +#if BR_POWER8_LE + lxvw4x(40, 0, %[idx2be]) +#endif + + /* + * First subkey is a copy of the key itself. + */ +#if BR_POWER8_LE + vperm(4, 2, 2, 8) + stxvw4x(36, 0, %[sk]) +#else + stxvw4x(34, 0, %[sk]) +#endif + + /* + * Loop must run 10 times. + */ + li(%[cc], 10) + mtctr(%[cc]) + label(loop) + /* Increment subkey address */ + addi(%[sk], %[sk], 16) + + /* Compute SubWord(RotWord(temp)) xor Rcon (into v4, splat) */ + vrlw(4, 2, 1) + vsbox(4, 4) +#if BR_POWER8_LE + vxor(4, 4, 3) +#else + vsldoi(5, 3, 0, 3) + vxor(4, 4, 5) +#endif + vspltw(4, 4, 3) + + /* XOR words for next subkey */ + vsldoi(5, 0, 2, 12) + vxor(2, 2, 5) + vsldoi(5, 0, 2, 12) + vxor(2, 2, 5) + vsldoi(5, 0, 2, 12) + vxor(2, 2, 5) + vxor(2, 2, 4) + + /* Store next subkey */ +#if BR_POWER8_LE + vperm(4, 2, 2, 8) + stxvw4x(36, 0, %[sk]) +#else + stxvw4x(34, 0, %[sk]) +#endif + + /* Update Rcon */ + vadduwm(3, 3, 3) + vsrw(4, 3, 6) + vsubuwm(4, 0, 4) + vand(4, 4, 7) + vxor(3, 3, 4) + + bdnz(loop) + +: [sk] "+b" (sk), [cc] "+b" (cc) +: [key] "b" (key), [fmod] "b" (fmod) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ctr", "memory" + ); +} + +static void +key_schedule_192(unsigned char *sk, const unsigned char *key) +{ + long cc; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc = 0; + + /* + * We use the VSX instructions for loading and storing the + * key/subkeys, since they support unaligned accesses. The rest + * of the computation is VMX only. VMX register 0 is VSX + * register 32. + */ + asm volatile ( + + /* + * v0 = all-zero word + * v1 = constant -8 / +8, copied into four words + * v2, v3 = current subkey + * v5 = Rcon (x4 words) (already shifted on big-endian) + * v6 = constant 8, copied into four words + * v8 = constant for byteswapping words + * + * The left two words of v3 are ignored. + */ + vspltisw(0, 0) +#if BR_POWER8_LE + vspltisw(1, -8) +#else + vspltisw(1, 8) +#endif + li(%[cc], 8) + lxvw4x(34, 0, %[key]) + lxvw4x(35, %[cc], %[key]) + vsldoi(3, 3, 0, 8) + vspltisw(5, 1) +#if !BR_POWER8_LE + vsldoi(5, 5, 0, 3) +#endif + vspltisw(6, 8) +#if BR_POWER8_LE + lxvw4x(40, 0, %[idx2be]) +#endif + + /* + * Loop must run 8 times. Each iteration produces 256 + * bits of subkeys, with a 64-bit overlap. + */ + li(%[cc], 8) + mtctr(%[cc]) + li(%[cc], 16) + label(loop) + + /* + * Last 6 words in v2:v3l. Compute next 6 words into + * v3r:v4. + */ + vrlw(10, 3, 1) + vsbox(10, 10) + vxor(10, 10, 5) + vspltw(10, 10, 1) + vsldoi(11, 0, 10, 8) + + vsldoi(12, 0, 2, 12) + vxor(12, 2, 12) + vsldoi(13, 0, 12, 12) + vxor(12, 12, 13) + vsldoi(13, 0, 12, 12) + vxor(12, 12, 13) + + vspltw(13, 12, 3) + vxor(13, 13, 3) + vsldoi(14, 0, 3, 12) + vxor(13, 13, 14) + + vsldoi(4, 12, 13, 8) + vsldoi(14, 0, 3, 8) + vsldoi(3, 14, 12, 8) + + vxor(3, 3, 11) + vxor(4, 4, 10) + + /* + * Update Rcon. Since for a 192-bit key, we use only 8 + * such constants, we will not hit the field modulus, + * so a simple shift (addition) works well. + */ + vadduwm(5, 5, 5) + + /* + * Write out the two left 128-bit words + */ +#if BR_POWER8_LE + vperm(10, 2, 2, 8) + vperm(11, 3, 3, 8) + stxvw4x(42, 0, %[sk]) + stxvw4x(43, %[cc], %[sk]) +#else + stxvw4x(34, 0, %[sk]) + stxvw4x(35, %[cc], %[sk]) +#endif + addi(%[sk], %[sk], 24) + + /* + * Shift words for next iteration. + */ + vsldoi(2, 3, 4, 8) + vsldoi(3, 4, 0, 8) + + bdnz(loop) + + /* + * The loop wrote the first 50 subkey words, but we need + * to produce 52, so we must do one last write. + */ +#if BR_POWER8_LE + vperm(10, 2, 2, 8) + stxvw4x(42, 0, %[sk]) +#else + stxvw4x(34, 0, %[sk]) +#endif + +: [sk] "+b" (sk), [cc] "+b" (cc) +: [key] "b" (key) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory" + ); +} + +static void +key_schedule_256(unsigned char *sk, const unsigned char *key) +{ + long cc; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc = 0; + + /* + * We use the VSX instructions for loading and storing the + * key/subkeys, since they support unaligned accesses. The rest + * of the computation is VMX only. VMX register 0 is VSX + * register 32. + */ + asm volatile ( + + /* + * v0 = all-zero word + * v1 = constant -8 / +8, copied into four words + * v2, v3 = current subkey + * v6 = Rcon (x4 words) (already shifted on big-endian) + * v7 = constant 8, copied into four words + * v8 = constant for byteswapping words + * + * The left two words of v3 are ignored. + */ + vspltisw(0, 0) +#if BR_POWER8_LE + vspltisw(1, -8) +#else + vspltisw(1, 8) +#endif + li(%[cc], 16) + lxvw4x(34, 0, %[key]) + lxvw4x(35, %[cc], %[key]) + vspltisw(6, 1) +#if !BR_POWER8_LE + vsldoi(6, 6, 0, 3) +#endif + vspltisw(7, 8) +#if BR_POWER8_LE + lxvw4x(40, 0, %[idx2be]) +#endif + + /* + * Loop must run 7 times. Each iteration produces two + * subkeys. + */ + li(%[cc], 7) + mtctr(%[cc]) + li(%[cc], 16) + label(loop) + + /* + * Current words are in v2:v3. Compute next word in v4. + */ + vrlw(10, 3, 1) + vsbox(10, 10) + vxor(10, 10, 6) + vspltw(10, 10, 3) + + vsldoi(4, 0, 2, 12) + vxor(4, 2, 4) + vsldoi(5, 0, 4, 12) + vxor(4, 4, 5) + vsldoi(5, 0, 4, 12) + vxor(4, 4, 5) + vxor(4, 4, 10) + + /* + * Then other word in v5. + */ + vsbox(10, 4) + vspltw(10, 10, 3) + + vsldoi(5, 0, 3, 12) + vxor(5, 3, 5) + vsldoi(11, 0, 5, 12) + vxor(5, 5, 11) + vsldoi(11, 0, 5, 12) + vxor(5, 5, 11) + vxor(5, 5, 10) + + /* + * Update Rcon. Since for a 256-bit key, we use only 7 + * such constants, we will not hit the field modulus, + * so a simple shift (addition) works well. + */ + vadduwm(6, 6, 6) + + /* + * Write out the two left 128-bit words + */ +#if BR_POWER8_LE + vperm(10, 2, 2, 8) + vperm(11, 3, 3, 8) + stxvw4x(42, 0, %[sk]) + stxvw4x(43, %[cc], %[sk]) +#else + stxvw4x(34, 0, %[sk]) + stxvw4x(35, %[cc], %[sk]) +#endif + addi(%[sk], %[sk], 32) + + /* + * Replace v2:v3 with v4:v5. + */ + vxor(2, 0, 4) + vxor(3, 0, 5) + + bdnz(loop) + + /* + * The loop wrote the first 14 subkeys, but we need 15, + * so we must do an extra write. + */ +#if BR_POWER8_LE + vperm(10, 2, 2, 8) + stxvw4x(42, 0, %[sk]) +#else + stxvw4x(34, 0, %[sk]) +#endif + +: [sk] "+b" (sk), [cc] "+b" (cc) +: [key] "b" (key) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory" + ); +} + +/* see inner.h */ +int +br_aes_pwr8_supported(void) +{ + return 1; +} + +/* see inner.h */ +unsigned +br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len) +{ + switch (len) { + case 16: + key_schedule_128(sk, key); + return 10; + case 24: + key_schedule_192(sk, key); + return 12; + default: + key_schedule_256(sk, key); + return 14; + } +} + +#endif diff --git a/test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcdec.c new file mode 100644 index 00000000..e535ba6f --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcdec.c @@ -0,0 +1,670 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_POWER_ASM_MACROS 1 +#include "inner.h" + +#if BR_POWER8 + +/* see bearssl_block.h */ +void +br_aes_pwr8_cbcdec_init(br_aes_pwr8_cbcdec_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_pwr8_cbcdec_vtable; + ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len); +} + +static void +cbcdec_128(const unsigned char *sk, + const unsigned char *iv, unsigned char *buf, size_t num_blocks) +{ + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + + /* + * Load subkeys into v0..v10 + */ + lxvw4x(32, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(33, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(34, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(35, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(36, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(37, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(38, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(39, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(40, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(41, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(42, %[cc0], %[sk]) + li(%[cc0], 0) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * Load IV into v24. + */ + lxvw4x(56, 0, %[iv]) +#if BR_POWER8_LE + vperm(24, 24, 24, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Load next ciphertext words in v16..v19. Also save them + * in v20..v23. + */ + lxvw4x(48, %[cc0], %[buf]) + lxvw4x(49, %[cc1], %[buf]) + lxvw4x(50, %[cc2], %[buf]) + lxvw4x(51, %[cc3], %[buf]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + vand(20, 16, 16) + vand(21, 17, 17) + vand(22, 18, 18) + vand(23, 19, 19) + + /* + * Decrypt the blocks. + */ + vxor(16, 16, 10) + vxor(17, 17, 10) + vxor(18, 18, 10) + vxor(19, 19, 10) + vncipher(16, 16, 9) + vncipher(17, 17, 9) + vncipher(18, 18, 9) + vncipher(19, 19, 9) + vncipher(16, 16, 8) + vncipher(17, 17, 8) + vncipher(18, 18, 8) + vncipher(19, 19, 8) + vncipher(16, 16, 7) + vncipher(17, 17, 7) + vncipher(18, 18, 7) + vncipher(19, 19, 7) + vncipher(16, 16, 6) + vncipher(17, 17, 6) + vncipher(18, 18, 6) + vncipher(19, 19, 6) + vncipher(16, 16, 5) + vncipher(17, 17, 5) + vncipher(18, 18, 5) + vncipher(19, 19, 5) + vncipher(16, 16, 4) + vncipher(17, 17, 4) + vncipher(18, 18, 4) + vncipher(19, 19, 4) + vncipher(16, 16, 3) + vncipher(17, 17, 3) + vncipher(18, 18, 3) + vncipher(19, 19, 3) + vncipher(16, 16, 2) + vncipher(17, 17, 2) + vncipher(18, 18, 2) + vncipher(19, 19, 2) + vncipher(16, 16, 1) + vncipher(17, 17, 1) + vncipher(18, 18, 1) + vncipher(19, 19, 1) + vncipherlast(16, 16, 0) + vncipherlast(17, 17, 0) + vncipherlast(18, 18, 0) + vncipherlast(19, 19, 0) + + /* + * XOR decrypted blocks with IV / previous block. + */ + vxor(16, 16, 24) + vxor(17, 17, 20) + vxor(18, 18, 21) + vxor(19, 19, 22) + + /* + * Store back result (with byteswap) + */ +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + stxvw4x(48, %[cc0], %[buf]) + stxvw4x(49, %[cc1], %[buf]) + stxvw4x(50, %[cc2], %[buf]) + stxvw4x(51, %[cc3], %[buf]) + + /* + * Fourth encrypted block is IV for next run. + */ + vand(24, 23, 23) + + addi(%[buf], %[buf], 64) + + bdnz(loop) + +: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3), + [buf] "+b" (buf) +: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (num_blocks >> 2) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +static void +cbcdec_192(const unsigned char *sk, + const unsigned char *iv, unsigned char *buf, size_t num_blocks) +{ + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + + /* + * Load subkeys into v0..v12 + */ + lxvw4x(32, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(33, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(34, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(35, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(36, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(37, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(38, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(39, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(40, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(41, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(42, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(43, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(44, %[cc0], %[sk]) + li(%[cc0], 0) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * Load IV into v24. + */ + lxvw4x(56, 0, %[iv]) +#if BR_POWER8_LE + vperm(24, 24, 24, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Load next ciphertext words in v16..v19. Also save them + * in v20..v23. + */ + lxvw4x(48, %[cc0], %[buf]) + lxvw4x(49, %[cc1], %[buf]) + lxvw4x(50, %[cc2], %[buf]) + lxvw4x(51, %[cc3], %[buf]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + vand(20, 16, 16) + vand(21, 17, 17) + vand(22, 18, 18) + vand(23, 19, 19) + + /* + * Decrypt the blocks. + */ + vxor(16, 16, 12) + vxor(17, 17, 12) + vxor(18, 18, 12) + vxor(19, 19, 12) + vncipher(16, 16, 11) + vncipher(17, 17, 11) + vncipher(18, 18, 11) + vncipher(19, 19, 11) + vncipher(16, 16, 10) + vncipher(17, 17, 10) + vncipher(18, 18, 10) + vncipher(19, 19, 10) + vncipher(16, 16, 9) + vncipher(17, 17, 9) + vncipher(18, 18, 9) + vncipher(19, 19, 9) + vncipher(16, 16, 8) + vncipher(17, 17, 8) + vncipher(18, 18, 8) + vncipher(19, 19, 8) + vncipher(16, 16, 7) + vncipher(17, 17, 7) + vncipher(18, 18, 7) + vncipher(19, 19, 7) + vncipher(16, 16, 6) + vncipher(17, 17, 6) + vncipher(18, 18, 6) + vncipher(19, 19, 6) + vncipher(16, 16, 5) + vncipher(17, 17, 5) + vncipher(18, 18, 5) + vncipher(19, 19, 5) + vncipher(16, 16, 4) + vncipher(17, 17, 4) + vncipher(18, 18, 4) + vncipher(19, 19, 4) + vncipher(16, 16, 3) + vncipher(17, 17, 3) + vncipher(18, 18, 3) + vncipher(19, 19, 3) + vncipher(16, 16, 2) + vncipher(17, 17, 2) + vncipher(18, 18, 2) + vncipher(19, 19, 2) + vncipher(16, 16, 1) + vncipher(17, 17, 1) + vncipher(18, 18, 1) + vncipher(19, 19, 1) + vncipherlast(16, 16, 0) + vncipherlast(17, 17, 0) + vncipherlast(18, 18, 0) + vncipherlast(19, 19, 0) + + /* + * XOR decrypted blocks with IV / previous block. + */ + vxor(16, 16, 24) + vxor(17, 17, 20) + vxor(18, 18, 21) + vxor(19, 19, 22) + + /* + * Store back result (with byteswap) + */ +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + stxvw4x(48, %[cc0], %[buf]) + stxvw4x(49, %[cc1], %[buf]) + stxvw4x(50, %[cc2], %[buf]) + stxvw4x(51, %[cc3], %[buf]) + + /* + * Fourth encrypted block is IV for next run. + */ + vand(24, 23, 23) + + addi(%[buf], %[buf], 64) + + bdnz(loop) + +: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3), + [buf] "+b" (buf) +: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (num_blocks >> 2) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +static void +cbcdec_256(const unsigned char *sk, + const unsigned char *iv, unsigned char *buf, size_t num_blocks) +{ + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + + /* + * Load subkeys into v0..v14 + */ + lxvw4x(32, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(33, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(34, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(35, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(36, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(37, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(38, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(39, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(40, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(41, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(42, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(43, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(44, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(45, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(46, %[cc0], %[sk]) + li(%[cc0], 0) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * Load IV into v24. + */ + lxvw4x(56, 0, %[iv]) +#if BR_POWER8_LE + vperm(24, 24, 24, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Load next ciphertext words in v16..v19. Also save them + * in v20..v23. + */ + lxvw4x(48, %[cc0], %[buf]) + lxvw4x(49, %[cc1], %[buf]) + lxvw4x(50, %[cc2], %[buf]) + lxvw4x(51, %[cc3], %[buf]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + vand(20, 16, 16) + vand(21, 17, 17) + vand(22, 18, 18) + vand(23, 19, 19) + + /* + * Decrypt the blocks. + */ + vxor(16, 16, 14) + vxor(17, 17, 14) + vxor(18, 18, 14) + vxor(19, 19, 14) + vncipher(16, 16, 13) + vncipher(17, 17, 13) + vncipher(18, 18, 13) + vncipher(19, 19, 13) + vncipher(16, 16, 12) + vncipher(17, 17, 12) + vncipher(18, 18, 12) + vncipher(19, 19, 12) + vncipher(16, 16, 11) + vncipher(17, 17, 11) + vncipher(18, 18, 11) + vncipher(19, 19, 11) + vncipher(16, 16, 10) + vncipher(17, 17, 10) + vncipher(18, 18, 10) + vncipher(19, 19, 10) + vncipher(16, 16, 9) + vncipher(17, 17, 9) + vncipher(18, 18, 9) + vncipher(19, 19, 9) + vncipher(16, 16, 8) + vncipher(17, 17, 8) + vncipher(18, 18, 8) + vncipher(19, 19, 8) + vncipher(16, 16, 7) + vncipher(17, 17, 7) + vncipher(18, 18, 7) + vncipher(19, 19, 7) + vncipher(16, 16, 6) + vncipher(17, 17, 6) + vncipher(18, 18, 6) + vncipher(19, 19, 6) + vncipher(16, 16, 5) + vncipher(17, 17, 5) + vncipher(18, 18, 5) + vncipher(19, 19, 5) + vncipher(16, 16, 4) + vncipher(17, 17, 4) + vncipher(18, 18, 4) + vncipher(19, 19, 4) + vncipher(16, 16, 3) + vncipher(17, 17, 3) + vncipher(18, 18, 3) + vncipher(19, 19, 3) + vncipher(16, 16, 2) + vncipher(17, 17, 2) + vncipher(18, 18, 2) + vncipher(19, 19, 2) + vncipher(16, 16, 1) + vncipher(17, 17, 1) + vncipher(18, 18, 1) + vncipher(19, 19, 1) + vncipherlast(16, 16, 0) + vncipherlast(17, 17, 0) + vncipherlast(18, 18, 0) + vncipherlast(19, 19, 0) + + /* + * XOR decrypted blocks with IV / previous block. + */ + vxor(16, 16, 24) + vxor(17, 17, 20) + vxor(18, 18, 21) + vxor(19, 19, 22) + + /* + * Store back result (with byteswap) + */ +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + stxvw4x(48, %[cc0], %[buf]) + stxvw4x(49, %[cc1], %[buf]) + stxvw4x(50, %[cc2], %[buf]) + stxvw4x(51, %[cc3], %[buf]) + + /* + * Fourth encrypted block is IV for next run. + */ + vand(24, 23, 23) + + addi(%[buf], %[buf], 64) + + bdnz(loop) + +: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3), + [buf] "+b" (buf) +: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (num_blocks >> 2) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +/* see bearssl_block.h */ +void +br_aes_pwr8_cbcdec_run(const br_aes_pwr8_cbcdec_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char nextiv[16]; + unsigned char *buf; + + if (len == 0) { + return; + } + buf = data; + memcpy(nextiv, buf + len - 16, 16); + if (len >= 64) { + size_t num_blocks; + unsigned char tmp[16]; + + num_blocks = (len >> 4) & ~(size_t)3; + memcpy(tmp, buf + (num_blocks << 4) - 16, 16); + switch (ctx->num_rounds) { + case 10: + cbcdec_128(ctx->skey.skni, iv, buf, num_blocks); + break; + case 12: + cbcdec_192(ctx->skey.skni, iv, buf, num_blocks); + break; + default: + cbcdec_256(ctx->skey.skni, iv, buf, num_blocks); + break; + } + buf += num_blocks << 4; + len &= 63; + memcpy(iv, tmp, 16); + } + if (len > 0) { + unsigned char tmp[64]; + + memcpy(tmp, buf, len); + memset(tmp + len, 0, (sizeof tmp) - len); + switch (ctx->num_rounds) { + case 10: + cbcdec_128(ctx->skey.skni, iv, tmp, 4); + break; + case 12: + cbcdec_192(ctx->skey.skni, iv, tmp, 4); + break; + default: + cbcdec_256(ctx->skey.skni, iv, tmp, 4); + break; + } + memcpy(buf, tmp, len); + } + memcpy(iv, nextiv, 16); +} + +/* see bearssl_block.h */ +const br_block_cbcdec_class br_aes_pwr8_cbcdec_vtable = { + sizeof(br_aes_pwr8_cbcdec_keys), + 16, + 4, + (void (*)(const br_block_cbcdec_class **, const void *, size_t)) + &br_aes_pwr8_cbcdec_init, + (void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t)) + &br_aes_pwr8_cbcdec_run +}; + +/* see bearssl_block.h */ +const br_block_cbcdec_class * +br_aes_pwr8_cbcdec_get_vtable(void) +{ + return br_aes_pwr8_supported() ? &br_aes_pwr8_cbcdec_vtable : NULL; +} + +#else + +/* see bearssl_block.h */ +const br_block_cbcdec_class * +br_aes_pwr8_cbcdec_get_vtable(void) +{ + return NULL; +} + +#endif diff --git a/test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcenc.c new file mode 100644 index 00000000..00f8eca7 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_cbcenc.c @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_POWER_ASM_MACROS 1 +#include "inner.h" + +#if BR_POWER8 + +/* see bearssl_block.h */ +void +br_aes_pwr8_cbcenc_init(br_aes_pwr8_cbcenc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_pwr8_cbcenc_vtable; + ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len); +} + +static void +cbcenc_128(const unsigned char *sk, + const unsigned char *iv, unsigned char *buf, size_t len) +{ + long cc; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc = 0; + asm volatile ( + + /* + * Load subkeys into v0..v10 + */ + lxvw4x(32, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(33, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(34, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(35, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(36, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(37, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(38, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(39, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(40, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(41, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(42, %[cc], %[sk]) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * Load IV into v16. + */ + lxvw4x(48, 0, %[iv]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Load next plaintext word and XOR with current IV. + */ + lxvw4x(49, 0, %[buf]) +#if BR_POWER8_LE + vperm(17, 17, 17, 15) +#endif + vxor(16, 16, 17) + + /* + * Encrypt the block. + */ + vxor(16, 16, 0) + vcipher(16, 16, 1) + vcipher(16, 16, 2) + vcipher(16, 16, 3) + vcipher(16, 16, 4) + vcipher(16, 16, 5) + vcipher(16, 16, 6) + vcipher(16, 16, 7) + vcipher(16, 16, 8) + vcipher(16, 16, 9) + vcipherlast(16, 16, 10) + + /* + * Store back result (with byteswap) + */ +#if BR_POWER8_LE + vperm(17, 16, 16, 15) + stxvw4x(49, 0, %[buf]) +#else + stxvw4x(48, 0, %[buf]) +#endif + addi(%[buf], %[buf], 16) + + bdnz(loop) + +: [cc] "+b" (cc), [buf] "+b" (buf) +: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (len >> 4) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "ctr", "memory" + ); +} + +static void +cbcenc_192(const unsigned char *sk, + const unsigned char *iv, unsigned char *buf, size_t len) +{ + long cc; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc = 0; + asm volatile ( + + /* + * Load subkeys into v0..v12 + */ + lxvw4x(32, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(33, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(34, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(35, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(36, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(37, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(38, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(39, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(40, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(41, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(42, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(43, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(44, %[cc], %[sk]) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * Load IV into v16. + */ + lxvw4x(48, 0, %[iv]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Load next plaintext word and XOR with current IV. + */ + lxvw4x(49, 0, %[buf]) +#if BR_POWER8_LE + vperm(17, 17, 17, 15) +#endif + vxor(16, 16, 17) + + /* + * Encrypt the block. + */ + vxor(16, 16, 0) + vcipher(16, 16, 1) + vcipher(16, 16, 2) + vcipher(16, 16, 3) + vcipher(16, 16, 4) + vcipher(16, 16, 5) + vcipher(16, 16, 6) + vcipher(16, 16, 7) + vcipher(16, 16, 8) + vcipher(16, 16, 9) + vcipher(16, 16, 10) + vcipher(16, 16, 11) + vcipherlast(16, 16, 12) + + /* + * Store back result (with byteswap) + */ +#if BR_POWER8_LE + vperm(17, 16, 16, 15) + stxvw4x(49, 0, %[buf]) +#else + stxvw4x(48, 0, %[buf]) +#endif + addi(%[buf], %[buf], 16) + + bdnz(loop) + +: [cc] "+b" (cc), [buf] "+b" (buf) +: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (len >> 4) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "ctr", "memory" + ); +} + +static void +cbcenc_256(const unsigned char *sk, + const unsigned char *iv, unsigned char *buf, size_t len) +{ + long cc; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + + cc = 0; + asm volatile ( + + /* + * Load subkeys into v0..v14 + */ + lxvw4x(32, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(33, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(34, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(35, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(36, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(37, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(38, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(39, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(40, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(41, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(42, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(43, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(44, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(45, %[cc], %[sk]) + addi(%[cc], %[cc], 16) + lxvw4x(46, %[cc], %[sk]) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * Load IV into v16. + */ + lxvw4x(48, 0, %[iv]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Load next plaintext word and XOR with current IV. + */ + lxvw4x(49, 0, %[buf]) +#if BR_POWER8_LE + vperm(17, 17, 17, 15) +#endif + vxor(16, 16, 17) + + /* + * Encrypt the block. + */ + vxor(16, 16, 0) + vcipher(16, 16, 1) + vcipher(16, 16, 2) + vcipher(16, 16, 3) + vcipher(16, 16, 4) + vcipher(16, 16, 5) + vcipher(16, 16, 6) + vcipher(16, 16, 7) + vcipher(16, 16, 8) + vcipher(16, 16, 9) + vcipher(16, 16, 10) + vcipher(16, 16, 11) + vcipher(16, 16, 12) + vcipher(16, 16, 13) + vcipherlast(16, 16, 14) + + /* + * Store back result (with byteswap) + */ +#if BR_POWER8_LE + vperm(17, 16, 16, 15) + stxvw4x(49, 0, %[buf]) +#else + stxvw4x(48, 0, %[buf]) +#endif + addi(%[buf], %[buf], 16) + + bdnz(loop) + +: [cc] "+b" (cc), [buf] "+b" (buf) +: [sk] "b" (sk), [iv] "b" (iv), [num_blocks] "b" (len >> 4) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "ctr", "memory" + ); +} + +/* see bearssl_block.h */ +void +br_aes_pwr8_cbcenc_run(const br_aes_pwr8_cbcenc_keys *ctx, + void *iv, void *data, size_t len) +{ + if (len > 0) { + switch (ctx->num_rounds) { + case 10: + cbcenc_128(ctx->skey.skni, iv, data, len); + break; + case 12: + cbcenc_192(ctx->skey.skni, iv, data, len); + break; + default: + cbcenc_256(ctx->skey.skni, iv, data, len); + break; + } + memcpy(iv, (unsigned char *)data + (len - 16), 16); + } +} + +/* see bearssl_block.h */ +const br_block_cbcenc_class br_aes_pwr8_cbcenc_vtable = { + sizeof(br_aes_pwr8_cbcenc_keys), + 16, + 4, + (void (*)(const br_block_cbcenc_class **, const void *, size_t)) + &br_aes_pwr8_cbcenc_init, + (void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t)) + &br_aes_pwr8_cbcenc_run +}; + +/* see bearssl_block.h */ +const br_block_cbcenc_class * +br_aes_pwr8_cbcenc_get_vtable(void) +{ + return br_aes_pwr8_supported() ? &br_aes_pwr8_cbcenc_vtable : NULL; +} + +#else + +/* see bearssl_block.h */ +const br_block_cbcenc_class * +br_aes_pwr8_cbcenc_get_vtable(void) +{ + return NULL; +} + +#endif diff --git a/test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctr.c b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctr.c new file mode 100644 index 00000000..f5d20c0b --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctr.c @@ -0,0 +1,717 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_POWER_ASM_MACROS 1 +#include "inner.h" + +#if BR_POWER8 + +/* see bearssl_block.h */ +void +br_aes_pwr8_ctr_init(br_aes_pwr8_ctr_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_pwr8_ctr_vtable; + ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len); +} + +static void +ctr_128(const unsigned char *sk, const unsigned char *ivbuf, + unsigned char *buf, size_t num_blocks) +{ + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + static const uint32_t ctrinc[] = { + 0, 0, 0, 4 + }; + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + + /* + * Load subkeys into v0..v10 + */ + lxvw4x(32, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(33, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(34, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(35, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(36, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(37, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(38, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(39, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(40, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(41, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(42, %[cc0], %[sk]) + li(%[cc0], 0) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * v28 = increment for IV counter. + */ + lxvw4x(60, 0, %[ctrinc]) + + /* + * Load IV into v16..v19 + */ + lxvw4x(48, %[cc0], %[ivbuf]) + lxvw4x(49, %[cc1], %[ivbuf]) + lxvw4x(50, %[cc2], %[ivbuf]) + lxvw4x(51, %[cc3], %[ivbuf]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Compute next IV into v24..v27 + */ + vadduwm(24, 16, 28) + vadduwm(25, 17, 28) + vadduwm(26, 18, 28) + vadduwm(27, 19, 28) + + /* + * Load next data blocks. We do this early on but we + * won't need them until IV encryption is done. + */ + lxvw4x(52, %[cc0], %[buf]) + lxvw4x(53, %[cc1], %[buf]) + lxvw4x(54, %[cc2], %[buf]) + lxvw4x(55, %[cc3], %[buf]) + + /* + * Encrypt the current IV. + */ + vxor(16, 16, 0) + vxor(17, 17, 0) + vxor(18, 18, 0) + vxor(19, 19, 0) + vcipher(16, 16, 1) + vcipher(17, 17, 1) + vcipher(18, 18, 1) + vcipher(19, 19, 1) + vcipher(16, 16, 2) + vcipher(17, 17, 2) + vcipher(18, 18, 2) + vcipher(19, 19, 2) + vcipher(16, 16, 3) + vcipher(17, 17, 3) + vcipher(18, 18, 3) + vcipher(19, 19, 3) + vcipher(16, 16, 4) + vcipher(17, 17, 4) + vcipher(18, 18, 4) + vcipher(19, 19, 4) + vcipher(16, 16, 5) + vcipher(17, 17, 5) + vcipher(18, 18, 5) + vcipher(19, 19, 5) + vcipher(16, 16, 6) + vcipher(17, 17, 6) + vcipher(18, 18, 6) + vcipher(19, 19, 6) + vcipher(16, 16, 7) + vcipher(17, 17, 7) + vcipher(18, 18, 7) + vcipher(19, 19, 7) + vcipher(16, 16, 8) + vcipher(17, 17, 8) + vcipher(18, 18, 8) + vcipher(19, 19, 8) + vcipher(16, 16, 9) + vcipher(17, 17, 9) + vcipher(18, 18, 9) + vcipher(19, 19, 9) + vcipherlast(16, 16, 10) + vcipherlast(17, 17, 10) + vcipherlast(18, 18, 10) + vcipherlast(19, 19, 10) + +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + + /* + * Load next plaintext word and XOR with encrypted IV. + */ + vxor(16, 20, 16) + vxor(17, 21, 17) + vxor(18, 22, 18) + vxor(19, 23, 19) + stxvw4x(48, %[cc0], %[buf]) + stxvw4x(49, %[cc1], %[buf]) + stxvw4x(50, %[cc2], %[buf]) + stxvw4x(51, %[cc3], %[buf]) + + addi(%[buf], %[buf], 64) + + /* + * Update IV. + */ + vand(16, 24, 24) + vand(17, 25, 25) + vand(18, 26, 26) + vand(19, 27, 27) + + bdnz(loop) + +: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3), + [buf] "+b" (buf) +: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2), + [ctrinc] "b" (ctrinc) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +static void +ctr_192(const unsigned char *sk, const unsigned char *ivbuf, + unsigned char *buf, size_t num_blocks) +{ + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + static const uint32_t ctrinc[] = { + 0, 0, 0, 4 + }; + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + + /* + * Load subkeys into v0..v12 + */ + lxvw4x(32, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(33, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(34, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(35, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(36, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(37, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(38, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(39, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(40, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(41, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(42, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(43, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(44, %[cc0], %[sk]) + li(%[cc0], 0) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * v28 = increment for IV counter. + */ + lxvw4x(60, 0, %[ctrinc]) + + /* + * Load IV into v16..v19 + */ + lxvw4x(48, %[cc0], %[ivbuf]) + lxvw4x(49, %[cc1], %[ivbuf]) + lxvw4x(50, %[cc2], %[ivbuf]) + lxvw4x(51, %[cc3], %[ivbuf]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Compute next IV into v24..v27 + */ + vadduwm(24, 16, 28) + vadduwm(25, 17, 28) + vadduwm(26, 18, 28) + vadduwm(27, 19, 28) + + /* + * Load next data blocks. We do this early on but we + * won't need them until IV encryption is done. + */ + lxvw4x(52, %[cc0], %[buf]) + lxvw4x(53, %[cc1], %[buf]) + lxvw4x(54, %[cc2], %[buf]) + lxvw4x(55, %[cc3], %[buf]) + + /* + * Encrypt the current IV. + */ + vxor(16, 16, 0) + vxor(17, 17, 0) + vxor(18, 18, 0) + vxor(19, 19, 0) + vcipher(16, 16, 1) + vcipher(17, 17, 1) + vcipher(18, 18, 1) + vcipher(19, 19, 1) + vcipher(16, 16, 2) + vcipher(17, 17, 2) + vcipher(18, 18, 2) + vcipher(19, 19, 2) + vcipher(16, 16, 3) + vcipher(17, 17, 3) + vcipher(18, 18, 3) + vcipher(19, 19, 3) + vcipher(16, 16, 4) + vcipher(17, 17, 4) + vcipher(18, 18, 4) + vcipher(19, 19, 4) + vcipher(16, 16, 5) + vcipher(17, 17, 5) + vcipher(18, 18, 5) + vcipher(19, 19, 5) + vcipher(16, 16, 6) + vcipher(17, 17, 6) + vcipher(18, 18, 6) + vcipher(19, 19, 6) + vcipher(16, 16, 7) + vcipher(17, 17, 7) + vcipher(18, 18, 7) + vcipher(19, 19, 7) + vcipher(16, 16, 8) + vcipher(17, 17, 8) + vcipher(18, 18, 8) + vcipher(19, 19, 8) + vcipher(16, 16, 9) + vcipher(17, 17, 9) + vcipher(18, 18, 9) + vcipher(19, 19, 9) + vcipher(16, 16, 10) + vcipher(17, 17, 10) + vcipher(18, 18, 10) + vcipher(19, 19, 10) + vcipher(16, 16, 11) + vcipher(17, 17, 11) + vcipher(18, 18, 11) + vcipher(19, 19, 11) + vcipherlast(16, 16, 12) + vcipherlast(17, 17, 12) + vcipherlast(18, 18, 12) + vcipherlast(19, 19, 12) + +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + + /* + * Load next plaintext word and XOR with encrypted IV. + */ + vxor(16, 20, 16) + vxor(17, 21, 17) + vxor(18, 22, 18) + vxor(19, 23, 19) + stxvw4x(48, %[cc0], %[buf]) + stxvw4x(49, %[cc1], %[buf]) + stxvw4x(50, %[cc2], %[buf]) + stxvw4x(51, %[cc3], %[buf]) + + addi(%[buf], %[buf], 64) + + /* + * Update IV. + */ + vand(16, 24, 24) + vand(17, 25, 25) + vand(18, 26, 26) + vand(19, 27, 27) + + bdnz(loop) + +: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3), + [buf] "+b" (buf) +: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2), + [ctrinc] "b" (ctrinc) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +static void +ctr_256(const unsigned char *sk, const unsigned char *ivbuf, + unsigned char *buf, size_t num_blocks) +{ + long cc0, cc1, cc2, cc3; + +#if BR_POWER8_LE + static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C + }; +#endif + static const uint32_t ctrinc[] = { + 0, 0, 0, 4 + }; + + cc0 = 0; + cc1 = 16; + cc2 = 32; + cc3 = 48; + asm volatile ( + + /* + * Load subkeys into v0..v14 + */ + lxvw4x(32, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(33, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(34, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(35, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(36, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(37, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(38, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(39, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(40, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(41, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(42, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(43, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(44, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(45, %[cc0], %[sk]) + addi(%[cc0], %[cc0], 16) + lxvw4x(46, %[cc0], %[sk]) + li(%[cc0], 0) + +#if BR_POWER8_LE + /* + * v15 = constant for byteswapping words + */ + lxvw4x(47, 0, %[idx2be]) +#endif + /* + * v28 = increment for IV counter. + */ + lxvw4x(60, 0, %[ctrinc]) + + /* + * Load IV into v16..v19 + */ + lxvw4x(48, %[cc0], %[ivbuf]) + lxvw4x(49, %[cc1], %[ivbuf]) + lxvw4x(50, %[cc2], %[ivbuf]) + lxvw4x(51, %[cc3], %[ivbuf]) +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + + mtctr(%[num_blocks]) + label(loop) + /* + * Compute next IV into v24..v27 + */ + vadduwm(24, 16, 28) + vadduwm(25, 17, 28) + vadduwm(26, 18, 28) + vadduwm(27, 19, 28) + + /* + * Load next data blocks. We do this early on but we + * won't need them until IV encryption is done. + */ + lxvw4x(52, %[cc0], %[buf]) + lxvw4x(53, %[cc1], %[buf]) + lxvw4x(54, %[cc2], %[buf]) + lxvw4x(55, %[cc3], %[buf]) + + /* + * Encrypt the current IV. + */ + vxor(16, 16, 0) + vxor(17, 17, 0) + vxor(18, 18, 0) + vxor(19, 19, 0) + vcipher(16, 16, 1) + vcipher(17, 17, 1) + vcipher(18, 18, 1) + vcipher(19, 19, 1) + vcipher(16, 16, 2) + vcipher(17, 17, 2) + vcipher(18, 18, 2) + vcipher(19, 19, 2) + vcipher(16, 16, 3) + vcipher(17, 17, 3) + vcipher(18, 18, 3) + vcipher(19, 19, 3) + vcipher(16, 16, 4) + vcipher(17, 17, 4) + vcipher(18, 18, 4) + vcipher(19, 19, 4) + vcipher(16, 16, 5) + vcipher(17, 17, 5) + vcipher(18, 18, 5) + vcipher(19, 19, 5) + vcipher(16, 16, 6) + vcipher(17, 17, 6) + vcipher(18, 18, 6) + vcipher(19, 19, 6) + vcipher(16, 16, 7) + vcipher(17, 17, 7) + vcipher(18, 18, 7) + vcipher(19, 19, 7) + vcipher(16, 16, 8) + vcipher(17, 17, 8) + vcipher(18, 18, 8) + vcipher(19, 19, 8) + vcipher(16, 16, 9) + vcipher(17, 17, 9) + vcipher(18, 18, 9) + vcipher(19, 19, 9) + vcipher(16, 16, 10) + vcipher(17, 17, 10) + vcipher(18, 18, 10) + vcipher(19, 19, 10) + vcipher(16, 16, 11) + vcipher(17, 17, 11) + vcipher(18, 18, 11) + vcipher(19, 19, 11) + vcipher(16, 16, 12) + vcipher(17, 17, 12) + vcipher(18, 18, 12) + vcipher(19, 19, 12) + vcipher(16, 16, 13) + vcipher(17, 17, 13) + vcipher(18, 18, 13) + vcipher(19, 19, 13) + vcipherlast(16, 16, 14) + vcipherlast(17, 17, 14) + vcipherlast(18, 18, 14) + vcipherlast(19, 19, 14) + +#if BR_POWER8_LE + vperm(16, 16, 16, 15) + vperm(17, 17, 17, 15) + vperm(18, 18, 18, 15) + vperm(19, 19, 19, 15) +#endif + + /* + * Load next plaintext word and XOR with encrypted IV. + */ + vxor(16, 20, 16) + vxor(17, 21, 17) + vxor(18, 22, 18) + vxor(19, 23, 19) + stxvw4x(48, %[cc0], %[buf]) + stxvw4x(49, %[cc1], %[buf]) + stxvw4x(50, %[cc2], %[buf]) + stxvw4x(51, %[cc3], %[buf]) + + addi(%[buf], %[buf], 64) + + /* + * Update IV. + */ + vand(16, 24, 24) + vand(17, 25, 25) + vand(18, 26, 26) + vand(19, 27, 27) + + bdnz(loop) + +: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3), + [buf] "+b" (buf) +: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2), + [ctrinc] "b" (ctrinc) +#if BR_POWER8_LE + , [idx2be] "b" (idx2be) +#endif +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "ctr", "memory" + ); +} + +/* see bearssl_block.h */ +uint32_t +br_aes_pwr8_ctr_run(const br_aes_pwr8_ctr_keys *ctx, + const void *iv, uint32_t cc, void *data, size_t len) +{ + unsigned char *buf; + unsigned char ivbuf[64]; + + buf = data; + memcpy(ivbuf + 0, iv, 12); + memcpy(ivbuf + 16, iv, 12); + memcpy(ivbuf + 32, iv, 12); + memcpy(ivbuf + 48, iv, 12); + if (len >= 64) { + br_enc32be(ivbuf + 12, cc + 0); + br_enc32be(ivbuf + 28, cc + 1); + br_enc32be(ivbuf + 44, cc + 2); + br_enc32be(ivbuf + 60, cc + 3); + switch (ctx->num_rounds) { + case 10: + ctr_128(ctx->skey.skni, ivbuf, buf, + (len >> 4) & ~(size_t)3); + break; + case 12: + ctr_192(ctx->skey.skni, ivbuf, buf, + (len >> 4) & ~(size_t)3); + break; + default: + ctr_256(ctx->skey.skni, ivbuf, buf, + (len >> 4) & ~(size_t)3); + break; + } + cc += (len >> 4) & ~(size_t)3; + buf += len & ~(size_t)63; + len &= 63; + } + if (len > 0) { + unsigned char tmp[64]; + + memcpy(tmp, buf, len); + memset(tmp + len, 0, (sizeof tmp) - len); + br_enc32be(ivbuf + 12, cc + 0); + br_enc32be(ivbuf + 28, cc + 1); + br_enc32be(ivbuf + 44, cc + 2); + br_enc32be(ivbuf + 60, cc + 3); + switch (ctx->num_rounds) { + case 10: + ctr_128(ctx->skey.skni, ivbuf, tmp, 4); + break; + case 12: + ctr_192(ctx->skey.skni, ivbuf, tmp, 4); + break; + default: + ctr_256(ctx->skey.skni, ivbuf, tmp, 4); + break; + } + memcpy(buf, tmp, len); + cc += (len + 15) >> 4; + } + return cc; +} + +/* see bearssl_block.h */ +const br_block_ctr_class br_aes_pwr8_ctr_vtable = { + sizeof(br_aes_pwr8_ctr_keys), + 16, + 4, + (void (*)(const br_block_ctr_class **, const void *, size_t)) + &br_aes_pwr8_ctr_init, + (uint32_t (*)(const br_block_ctr_class *const *, + const void *, uint32_t, void *, size_t)) + &br_aes_pwr8_ctr_run +}; + +/* see bearssl_block.h */ +const br_block_ctr_class * +br_aes_pwr8_ctr_get_vtable(void) +{ + return br_aes_pwr8_supported() ? &br_aes_pwr8_ctr_vtable : NULL; +} + +#else + +/* see bearssl_block.h */ +const br_block_ctr_class * +br_aes_pwr8_ctr_get_vtable(void) +{ + return NULL; +} + +#endif diff --git a/test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctrcbc.c b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctrcbc.c new file mode 100644 index 00000000..a67d30b6 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_pwr8_ctrcbc.c @@ -0,0 +1,946 @@ +/* + * Copyright (c) 2018 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_POWER_ASM_MACROS 1 +#include "inner.h" + +#if BR_POWER8 + +/* see bearssl_block.h */ +const br_block_ctrcbc_class * +br_aes_pwr8_ctrcbc_get_vtable(void) +{ + return br_aes_pwr8_supported() ? &br_aes_pwr8_ctrcbc_vtable : NULL; +} + +/* see bearssl_block.h */ +void +br_aes_pwr8_ctrcbc_init(br_aes_pwr8_ctrcbc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_pwr8_ctrcbc_vtable; + ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len); +} + +/* + * Register conventions for CTR + CBC-MAC: + * + * AES subkeys are in registers 0 to 10/12/14 (depending on keys size) + * Register v15 contains the byteswap index register (little-endian only) + * Register v16 contains the CTR counter value + * Register v17 contains the CBC-MAC current value + * Registers v18 to v27 are scratch + * Counter increment uses v28, v29 and v30 + * + * For CTR alone: + * + * AES subkeys are in registers 0 to 10/12/14 (depending on keys size) + * Register v15 contains the byteswap index register (little-endian only) + * Registers v16 to v19 contain the CTR counter values (four blocks) + * Registers v20 to v27 are scratch + * Counter increment uses v28, v29 and v30 + */ + +#define LOAD_SUBKEYS_128 \ + lxvw4x(32, %[cc], %[sk]) \ + addi(%[cc], %[cc], 16) \ + lxvw4x(33, %[cc], %[sk]) \ + addi(%[cc], %[cc], 16) \ + lxvw4x(34, %[cc], %[sk]) \ + addi(%[cc], %[cc], 16) \ + lxvw4x(35, %[cc], %[sk]) \ + addi(%[cc], %[cc], 16) \ + lxvw4x(36, %[cc], %[sk]) \ + addi(%[cc], %[cc], 16) \ + lxvw4x(37, %[cc], %[sk]) \ + addi(%[cc], %[cc], 16) \ + lxvw4x(38, %[cc], %[sk]) \ + addi(%[cc], %[cc], 16) \ + lxvw4x(39, %[cc], %[sk]) \ + addi(%[cc], %[cc], 16) \ + lxvw4x(40, %[cc], %[sk]) \ + addi(%[cc], %[cc], 16) \ + lxvw4x(41, %[cc], %[sk]) \ + addi(%[cc], %[cc], 16) \ + lxvw4x(42, %[cc], %[sk]) + +#define LOAD_SUBKEYS_192 \ + LOAD_SUBKEYS_128 \ + addi(%[cc], %[cc], 16) \ + lxvw4x(43, %[cc], %[sk]) \ + addi(%[cc], %[cc], 16) \ + lxvw4x(44, %[cc], %[sk]) + +#define LOAD_SUBKEYS_256 \ + LOAD_SUBKEYS_192 \ + addi(%[cc], %[cc], 16) \ + lxvw4x(45, %[cc], %[sk]) \ + addi(%[cc], %[cc], 16) \ + lxvw4x(46, %[cc], %[sk]) + +#define BLOCK_ENCRYPT_128(x) \ + vxor(x, x, 0) \ + vcipher(x, x, 1) \ + vcipher(x, x, 2) \ + vcipher(x, x, 3) \ + vcipher(x, x, 4) \ + vcipher(x, x, 5) \ + vcipher(x, x, 6) \ + vcipher(x, x, 7) \ + vcipher(x, x, 8) \ + vcipher(x, x, 9) \ + vcipherlast(x, x, 10) + +#define BLOCK_ENCRYPT_192(x) \ + vxor(x, x, 0) \ + vcipher(x, x, 1) \ + vcipher(x, x, 2) \ + vcipher(x, x, 3) \ + vcipher(x, x, 4) \ + vcipher(x, x, 5) \ + vcipher(x, x, 6) \ + vcipher(x, x, 7) \ + vcipher(x, x, 8) \ + vcipher(x, x, 9) \ + vcipher(x, x, 10) \ + vcipher(x, x, 11) \ + vcipherlast(x, x, 12) + +#define BLOCK_ENCRYPT_256(x) \ + vxor(x, x, 0) \ + vcipher(x, x, 1) \ + vcipher(x, x, 2) \ + vcipher(x, x, 3) \ + vcipher(x, x, 4) \ + vcipher(x, x, 5) \ + vcipher(x, x, 6) \ + vcipher(x, x, 7) \ + vcipher(x, x, 8) \ + vcipher(x, x, 9) \ + vcipher(x, x, 10) \ + vcipher(x, x, 11) \ + vcipher(x, x, 12) \ + vcipher(x, x, 13) \ + vcipherlast(x, x, 14) + +#define BLOCK_ENCRYPT_X2_128(x, y) \ + vxor(x, x, 0) \ + vxor(y, y, 0) \ + vcipher(x, x, 1) \ + vcipher(y, y, 1) \ + vcipher(x, x, 2) \ + vcipher(y, y, 2) \ + vcipher(x, x, 3) \ + vcipher(y, y, 3) \ + vcipher(x, x, 4) \ + vcipher(y, y, 4) \ + vcipher(x, x, 5) \ + vcipher(y, y, 5) \ + vcipher(x, x, 6) \ + vcipher(y, y, 6) \ + vcipher(x, x, 7) \ + vcipher(y, y, 7) \ + vcipher(x, x, 8) \ + vcipher(y, y, 8) \ + vcipher(x, x, 9) \ + vcipher(y, y, 9) \ + vcipherlast(x, x, 10) \ + vcipherlast(y, y, 10) + +#define BLOCK_ENCRYPT_X2_192(x, y) \ + vxor(x, x, 0) \ + vxor(y, y, 0) \ + vcipher(x, x, 1) \ + vcipher(y, y, 1) \ + vcipher(x, x, 2) \ + vcipher(y, y, 2) \ + vcipher(x, x, 3) \ + vcipher(y, y, 3) \ + vcipher(x, x, 4) \ + vcipher(y, y, 4) \ + vcipher(x, x, 5) \ + vcipher(y, y, 5) \ + vcipher(x, x, 6) \ + vcipher(y, y, 6) \ + vcipher(x, x, 7) \ + vcipher(y, y, 7) \ + vcipher(x, x, 8) \ + vcipher(y, y, 8) \ + vcipher(x, x, 9) \ + vcipher(y, y, 9) \ + vcipher(x, x, 10) \ + vcipher(y, y, 10) \ + vcipher(x, x, 11) \ + vcipher(y, y, 11) \ + vcipherlast(x, x, 12) \ + vcipherlast(y, y, 12) + +#define BLOCK_ENCRYPT_X2_256(x, y) \ + vxor(x, x, 0) \ + vxor(y, y, 0) \ + vcipher(x, x, 1) \ + vcipher(y, y, 1) \ + vcipher(x, x, 2) \ + vcipher(y, y, 2) \ + vcipher(x, x, 3) \ + vcipher(y, y, 3) \ + vcipher(x, x, 4) \ + vcipher(y, y, 4) \ + vcipher(x, x, 5) \ + vcipher(y, y, 5) \ + vcipher(x, x, 6) \ + vcipher(y, y, 6) \ + vcipher(x, x, 7) \ + vcipher(y, y, 7) \ + vcipher(x, x, 8) \ + vcipher(y, y, 8) \ + vcipher(x, x, 9) \ + vcipher(y, y, 9) \ + vcipher(x, x, 10) \ + vcipher(y, y, 10) \ + vcipher(x, x, 11) \ + vcipher(y, y, 11) \ + vcipher(x, x, 12) \ + vcipher(y, y, 12) \ + vcipher(x, x, 13) \ + vcipher(y, y, 13) \ + vcipherlast(x, x, 14) \ + vcipherlast(y, y, 14) + +#define BLOCK_ENCRYPT_X4_128(x0, x1, x2, x3) \ + vxor(x0, x0, 0) \ + vxor(x1, x1, 0) \ + vxor(x2, x2, 0) \ + vxor(x3, x3, 0) \ + vcipher(x0, x0, 1) \ + vcipher(x1, x1, 1) \ + vcipher(x2, x2, 1) \ + vcipher(x3, x3, 1) \ + vcipher(x0, x0, 2) \ + vcipher(x1, x1, 2) \ + vcipher(x2, x2, 2) \ + vcipher(x3, x3, 2) \ + vcipher(x0, x0, 3) \ + vcipher(x1, x1, 3) \ + vcipher(x2, x2, 3) \ + vcipher(x3, x3, 3) \ + vcipher(x0, x0, 4) \ + vcipher(x1, x1, 4) \ + vcipher(x2, x2, 4) \ + vcipher(x3, x3, 4) \ + vcipher(x0, x0, 5) \ + vcipher(x1, x1, 5) \ + vcipher(x2, x2, 5) \ + vcipher(x3, x3, 5) \ + vcipher(x0, x0, 6) \ + vcipher(x1, x1, 6) \ + vcipher(x2, x2, 6) \ + vcipher(x3, x3, 6) \ + vcipher(x0, x0, 7) \ + vcipher(x1, x1, 7) \ + vcipher(x2, x2, 7) \ + vcipher(x3, x3, 7) \ + vcipher(x0, x0, 8) \ + vcipher(x1, x1, 8) \ + vcipher(x2, x2, 8) \ + vcipher(x3, x3, 8) \ + vcipher(x0, x0, 9) \ + vcipher(x1, x1, 9) \ + vcipher(x2, x2, 9) \ + vcipher(x3, x3, 9) \ + vcipherlast(x0, x0, 10) \ + vcipherlast(x1, x1, 10) \ + vcipherlast(x2, x2, 10) \ + vcipherlast(x3, x3, 10) + +#define BLOCK_ENCRYPT_X4_192(x0, x1, x2, x3) \ + vxor(x0, x0, 0) \ + vxor(x1, x1, 0) \ + vxor(x2, x2, 0) \ + vxor(x3, x3, 0) \ + vcipher(x0, x0, 1) \ + vcipher(x1, x1, 1) \ + vcipher(x2, x2, 1) \ + vcipher(x3, x3, 1) \ + vcipher(x0, x0, 2) \ + vcipher(x1, x1, 2) \ + vcipher(x2, x2, 2) \ + vcipher(x3, x3, 2) \ + vcipher(x0, x0, 3) \ + vcipher(x1, x1, 3) \ + vcipher(x2, x2, 3) \ + vcipher(x3, x3, 3) \ + vcipher(x0, x0, 4) \ + vcipher(x1, x1, 4) \ + vcipher(x2, x2, 4) \ + vcipher(x3, x3, 4) \ + vcipher(x0, x0, 5) \ + vcipher(x1, x1, 5) \ + vcipher(x2, x2, 5) \ + vcipher(x3, x3, 5) \ + vcipher(x0, x0, 6) \ + vcipher(x1, x1, 6) \ + vcipher(x2, x2, 6) \ + vcipher(x3, x3, 6) \ + vcipher(x0, x0, 7) \ + vcipher(x1, x1, 7) \ + vcipher(x2, x2, 7) \ + vcipher(x3, x3, 7) \ + vcipher(x0, x0, 8) \ + vcipher(x1, x1, 8) \ + vcipher(x2, x2, 8) \ + vcipher(x3, x3, 8) \ + vcipher(x0, x0, 9) \ + vcipher(x1, x1, 9) \ + vcipher(x2, x2, 9) \ + vcipher(x3, x3, 9) \ + vcipher(x0, x0, 10) \ + vcipher(x1, x1, 10) \ + vcipher(x2, x2, 10) \ + vcipher(x3, x3, 10) \ + vcipher(x0, x0, 11) \ + vcipher(x1, x1, 11) \ + vcipher(x2, x2, 11) \ + vcipher(x3, x3, 11) \ + vcipherlast(x0, x0, 12) \ + vcipherlast(x1, x1, 12) \ + vcipherlast(x2, x2, 12) \ + vcipherlast(x3, x3, 12) + +#define BLOCK_ENCRYPT_X4_256(x0, x1, x2, x3) \ + vxor(x0, x0, 0) \ + vxor(x1, x1, 0) \ + vxor(x2, x2, 0) \ + vxor(x3, x3, 0) \ + vcipher(x0, x0, 1) \ + vcipher(x1, x1, 1) \ + vcipher(x2, x2, 1) \ + vcipher(x3, x3, 1) \ + vcipher(x0, x0, 2) \ + vcipher(x1, x1, 2) \ + vcipher(x2, x2, 2) \ + vcipher(x3, x3, 2) \ + vcipher(x0, x0, 3) \ + vcipher(x1, x1, 3) \ + vcipher(x2, x2, 3) \ + vcipher(x3, x3, 3) \ + vcipher(x0, x0, 4) \ + vcipher(x1, x1, 4) \ + vcipher(x2, x2, 4) \ + vcipher(x3, x3, 4) \ + vcipher(x0, x0, 5) \ + vcipher(x1, x1, 5) \ + vcipher(x2, x2, 5) \ + vcipher(x3, x3, 5) \ + vcipher(x0, x0, 6) \ + vcipher(x1, x1, 6) \ + vcipher(x2, x2, 6) \ + vcipher(x3, x3, 6) \ + vcipher(x0, x0, 7) \ + vcipher(x1, x1, 7) \ + vcipher(x2, x2, 7) \ + vcipher(x3, x3, 7) \ + vcipher(x0, x0, 8) \ + vcipher(x1, x1, 8) \ + vcipher(x2, x2, 8) \ + vcipher(x3, x3, 8) \ + vcipher(x0, x0, 9) \ + vcipher(x1, x1, 9) \ + vcipher(x2, x2, 9) \ + vcipher(x3, x3, 9) \ + vcipher(x0, x0, 10) \ + vcipher(x1, x1, 10) \ + vcipher(x2, x2, 10) \ + vcipher(x3, x3, 10) \ + vcipher(x0, x0, 11) \ + vcipher(x1, x1, 11) \ + vcipher(x2, x2, 11) \ + vcipher(x3, x3, 11) \ + vcipher(x0, x0, 12) \ + vcipher(x1, x1, 12) \ + vcipher(x2, x2, 12) \ + vcipher(x3, x3, 12) \ + vcipher(x0, x0, 13) \ + vcipher(x1, x1, 13) \ + vcipher(x2, x2, 13) \ + vcipher(x3, x3, 13) \ + vcipherlast(x0, x0, 14) \ + vcipherlast(x1, x1, 14) \ + vcipherlast(x2, x2, 14) \ + vcipherlast(x3, x3, 14) + +#if BR_POWER8_LE +static const uint32_t idx2be[] = { + 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C +}; +#define BYTESWAP_INIT lxvw4x(47, 0, %[idx2be]) +#define BYTESWAP(x) vperm(x, x, x, 15) +#define BYTESWAPX(d, s) vperm(d, s, s, 15) +#define BYTESWAP_REG , [idx2be] "b" (idx2be) +#else +#define BYTESWAP_INIT +#define BYTESWAP(x) +#define BYTESWAPX(d, s) vand(d, s, s) +#define BYTESWAP_REG +#endif + +static const uint32_t ctrinc[] = { + 0, 0, 0, 1 +}; +static const uint32_t ctrinc_x4[] = { + 0, 0, 0, 4 +}; +#define INCR_128_INIT lxvw4x(60, 0, %[ctrinc]) +#define INCR_128_X4_INIT lxvw4x(60, 0, %[ctrinc_x4]) +#define INCR_128(d, s) \ + vaddcuw(29, s, 28) \ + vadduwm(d, s, 28) \ + vsldoi(30, 29, 29, 4) \ + vaddcuw(29, d, 30) \ + vadduwm(d, d, 30) \ + vsldoi(30, 29, 29, 4) \ + vaddcuw(29, d, 30) \ + vadduwm(d, d, 30) \ + vsldoi(30, 29, 29, 4) \ + vadduwm(d, d, 30) + +#define MKCTR(size) \ +static void \ +ctr_ ## size(const unsigned char *sk, \ + unsigned char *ctrbuf, unsigned char *buf, size_t num_blocks_x4) \ +{ \ + long cc, cc0, cc1, cc2, cc3; \ + \ + cc = 0; \ + cc0 = 0; \ + cc1 = 16; \ + cc2 = 32; \ + cc3 = 48; \ + asm volatile ( \ + \ + /* \ + * Load subkeys into v0..v10 \ + */ \ + LOAD_SUBKEYS_ ## size \ + li(%[cc], 0) \ + \ + BYTESWAP_INIT \ + INCR_128_X4_INIT \ + \ + /* \ + * Load current CTR counters into v16 to v19. \ + */ \ + lxvw4x(48, %[cc0], %[ctrbuf]) \ + lxvw4x(49, %[cc1], %[ctrbuf]) \ + lxvw4x(50, %[cc2], %[ctrbuf]) \ + lxvw4x(51, %[cc3], %[ctrbuf]) \ + BYTESWAP(16) \ + BYTESWAP(17) \ + BYTESWAP(18) \ + BYTESWAP(19) \ + \ + mtctr(%[num_blocks_x4]) \ + \ + label(loop) \ + /* \ + * Compute next counter values into v20..v23. \ + */ \ + INCR_128(20, 16) \ + INCR_128(21, 17) \ + INCR_128(22, 18) \ + INCR_128(23, 19) \ + \ + /* \ + * Encrypt counter values and XOR into next data blocks. \ + */ \ + lxvw4x(56, %[cc0], %[buf]) \ + lxvw4x(57, %[cc1], %[buf]) \ + lxvw4x(58, %[cc2], %[buf]) \ + lxvw4x(59, %[cc3], %[buf]) \ + BYTESWAP(24) \ + BYTESWAP(25) \ + BYTESWAP(26) \ + BYTESWAP(27) \ + BLOCK_ENCRYPT_X4_ ## size(16, 17, 18, 19) \ + vxor(16, 16, 24) \ + vxor(17, 17, 25) \ + vxor(18, 18, 26) \ + vxor(19, 19, 27) \ + BYTESWAP(16) \ + BYTESWAP(17) \ + BYTESWAP(18) \ + BYTESWAP(19) \ + stxvw4x(48, %[cc0], %[buf]) \ + stxvw4x(49, %[cc1], %[buf]) \ + stxvw4x(50, %[cc2], %[buf]) \ + stxvw4x(51, %[cc3], %[buf]) \ + \ + /* \ + * Update counters and data pointer. \ + */ \ + vand(16, 20, 20) \ + vand(17, 21, 21) \ + vand(18, 22, 22) \ + vand(19, 23, 23) \ + addi(%[buf], %[buf], 64) \ + \ + bdnz(loop) \ + \ + /* \ + * Write back new counter values. \ + */ \ + BYTESWAP(16) \ + BYTESWAP(17) \ + BYTESWAP(18) \ + BYTESWAP(19) \ + stxvw4x(48, %[cc0], %[ctrbuf]) \ + stxvw4x(49, %[cc1], %[ctrbuf]) \ + stxvw4x(50, %[cc2], %[ctrbuf]) \ + stxvw4x(51, %[cc3], %[ctrbuf]) \ + \ +: [cc] "+b" (cc), [buf] "+b" (buf), \ + [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3) \ +: [sk] "b" (sk), [ctrbuf] "b" (ctrbuf), \ + [num_blocks_x4] "b" (num_blocks_x4), [ctrinc_x4] "b" (ctrinc_x4) \ + BYTESWAP_REG \ +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \ + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \ + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \ + "v30", "ctr", "memory" \ + ); \ +} + +MKCTR(128) +MKCTR(192) +MKCTR(256) + +#define MKCBCMAC(size) \ +static void \ +cbcmac_ ## size(const unsigned char *sk, \ + unsigned char *cbcmac, const unsigned char *buf, size_t num_blocks) \ +{ \ + long cc; \ + \ + cc = 0; \ + asm volatile ( \ + \ + /* \ + * Load subkeys into v0..v10 \ + */ \ + LOAD_SUBKEYS_ ## size \ + li(%[cc], 0) \ + \ + BYTESWAP_INIT \ + \ + /* \ + * Load current CBC-MAC value into v16. \ + */ \ + lxvw4x(48, %[cc], %[cbcmac]) \ + BYTESWAP(16) \ + \ + mtctr(%[num_blocks]) \ + \ + label(loop) \ + /* \ + * Load next block, XOR into current CBC-MAC value, \ + * and then encrypt it. \ + */ \ + lxvw4x(49, %[cc], %[buf]) \ + BYTESWAP(17) \ + vxor(16, 16, 17) \ + BLOCK_ENCRYPT_ ## size(16) \ + addi(%[buf], %[buf], 16) \ + \ + bdnz(loop) \ + \ + /* \ + * Write back new CBC-MAC value. \ + */ \ + BYTESWAP(16) \ + stxvw4x(48, %[cc], %[cbcmac]) \ + \ +: [cc] "+b" (cc), [buf] "+b" (buf) \ +: [sk] "b" (sk), [cbcmac] "b" (cbcmac), [num_blocks] "b" (num_blocks) \ + BYTESWAP_REG \ +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \ + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \ + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \ + "v30", "ctr", "memory" \ + ); \ +} + +MKCBCMAC(128) +MKCBCMAC(192) +MKCBCMAC(256) + +#define MKENCRYPT(size) \ +static void \ +ctrcbc_ ## size ## _encrypt(const unsigned char *sk, \ + unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \ + size_t num_blocks) \ +{ \ + long cc; \ + \ + cc = 0; \ + asm volatile ( \ + \ + /* \ + * Load subkeys into v0..v10 \ + */ \ + LOAD_SUBKEYS_ ## size \ + li(%[cc], 0) \ + \ + BYTESWAP_INIT \ + INCR_128_INIT \ + \ + /* \ + * Load current CTR counter into v16, and current \ + * CBC-MAC IV into v17. \ + */ \ + lxvw4x(48, %[cc], %[ctr]) \ + lxvw4x(49, %[cc], %[cbcmac]) \ + BYTESWAP(16) \ + BYTESWAP(17) \ + \ + /* \ + * At each iteration, we do two parallel encryption: \ + * - new counter value for encryption of the next block; \ + * - CBC-MAC over the previous encrypted block. \ + * Thus, each plaintext block implies two AES instances, \ + * over two successive iterations. This requires a single \ + * counter encryption before the loop, and a single \ + * CBC-MAC encryption after the loop. \ + */ \ + \ + /* \ + * Encrypt first block (into v20). \ + */ \ + lxvw4x(52, %[cc], %[buf]) \ + BYTESWAP(20) \ + INCR_128(22, 16) \ + BLOCK_ENCRYPT_ ## size(16) \ + vxor(20, 20, 16) \ + BYTESWAPX(21, 20) \ + stxvw4x(53, %[cc], %[buf]) \ + vand(16, 22, 22) \ + addi(%[buf], %[buf], 16) \ + \ + /* \ + * Load loop counter; skip the loop if there is only \ + * one block in total (already handled by the boundary \ + * conditions). \ + */ \ + mtctr(%[num_blocks]) \ + bdz(fastexit) \ + \ + label(loop) \ + /* \ + * Upon loop entry: \ + * v16 counter value for next block \ + * v17 current CBC-MAC value \ + * v20 encrypted previous block \ + */ \ + vxor(17, 17, 20) \ + INCR_128(22, 16) \ + lxvw4x(52, %[cc], %[buf]) \ + BYTESWAP(20) \ + BLOCK_ENCRYPT_X2_ ## size(16, 17) \ + vxor(20, 20, 16) \ + BYTESWAPX(21, 20) \ + stxvw4x(53, %[cc], %[buf]) \ + addi(%[buf], %[buf], 16) \ + vand(16, 22, 22) \ + \ + bdnz(loop) \ + \ + label(fastexit) \ + vxor(17, 17, 20) \ + BLOCK_ENCRYPT_ ## size(17) \ + BYTESWAP(16) \ + BYTESWAP(17) \ + stxvw4x(48, %[cc], %[ctr]) \ + stxvw4x(49, %[cc], %[cbcmac]) \ + \ +: [cc] "+b" (cc), [buf] "+b" (buf) \ +: [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \ + [num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \ + BYTESWAP_REG \ +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \ + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \ + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \ + "v30", "ctr", "memory" \ + ); \ +} + +MKENCRYPT(128) +MKENCRYPT(192) +MKENCRYPT(256) + +#define MKDECRYPT(size) \ +static void \ +ctrcbc_ ## size ## _decrypt(const unsigned char *sk, \ + unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \ + size_t num_blocks) \ +{ \ + long cc; \ + \ + cc = 0; \ + asm volatile ( \ + \ + /* \ + * Load subkeys into v0..v10 \ + */ \ + LOAD_SUBKEYS_ ## size \ + li(%[cc], 0) \ + \ + BYTESWAP_INIT \ + INCR_128_INIT \ + \ + /* \ + * Load current CTR counter into v16, and current \ + * CBC-MAC IV into v17. \ + */ \ + lxvw4x(48, %[cc], %[ctr]) \ + lxvw4x(49, %[cc], %[cbcmac]) \ + BYTESWAP(16) \ + BYTESWAP(17) \ + \ + /* \ + * At each iteration, we do two parallel encryption: \ + * - new counter value for decryption of the next block; \ + * - CBC-MAC over the next encrypted block. \ + * Each iteration performs the two AES instances related \ + * to the current block; there is thus no need for some \ + * extra pre-loop and post-loop work as in encryption. \ + */ \ + \ + mtctr(%[num_blocks]) \ + \ + label(loop) \ + /* \ + * Upon loop entry: \ + * v16 counter value for next block \ + * v17 current CBC-MAC value \ + */ \ + lxvw4x(52, %[cc], %[buf]) \ + BYTESWAP(20) \ + vxor(17, 17, 20) \ + INCR_128(22, 16) \ + BLOCK_ENCRYPT_X2_ ## size(16, 17) \ + vxor(20, 20, 16) \ + BYTESWAPX(21, 20) \ + stxvw4x(53, %[cc], %[buf]) \ + addi(%[buf], %[buf], 16) \ + vand(16, 22, 22) \ + \ + bdnz(loop) \ + \ + /* \ + * Store back counter and CBC-MAC value. \ + */ \ + BYTESWAP(16) \ + BYTESWAP(17) \ + stxvw4x(48, %[cc], %[ctr]) \ + stxvw4x(49, %[cc], %[cbcmac]) \ + \ +: [cc] "+b" (cc), [buf] "+b" (buf) \ +: [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \ + [num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \ + BYTESWAP_REG \ +: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \ + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \ + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \ + "v30", "ctr", "memory" \ + ); \ +} + +MKDECRYPT(128) +MKDECRYPT(192) +MKDECRYPT(256) + +/* see bearssl_block.h */ +void +br_aes_pwr8_ctrcbc_encrypt(const br_aes_pwr8_ctrcbc_keys *ctx, + void *ctr, void *cbcmac, void *data, size_t len) +{ + if (len == 0) { + return; + } + switch (ctx->num_rounds) { + case 10: + ctrcbc_128_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4); + break; + case 12: + ctrcbc_192_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4); + break; + default: + ctrcbc_256_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4); + break; + } +} + +/* see bearssl_block.h */ +void +br_aes_pwr8_ctrcbc_decrypt(const br_aes_pwr8_ctrcbc_keys *ctx, + void *ctr, void *cbcmac, void *data, size_t len) +{ + if (len == 0) { + return; + } + switch (ctx->num_rounds) { + case 10: + ctrcbc_128_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4); + break; + case 12: + ctrcbc_192_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4); + break; + default: + ctrcbc_256_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4); + break; + } +} + +static inline void +incr_ctr(void *dst, const void *src) +{ + uint64_t hi, lo; + + hi = br_dec64be(src); + lo = br_dec64be((const unsigned char *)src + 8); + lo ++; + hi += ((lo | -lo) >> 63) ^ (uint64_t)1; + br_enc64be(dst, hi); + br_enc64be((unsigned char *)dst + 8, lo); +} + +/* see bearssl_block.h */ +void +br_aes_pwr8_ctrcbc_ctr(const br_aes_pwr8_ctrcbc_keys *ctx, + void *ctr, void *data, size_t len) +{ + unsigned char ctrbuf[64]; + + memcpy(ctrbuf, ctr, 16); + incr_ctr(ctrbuf + 16, ctrbuf); + incr_ctr(ctrbuf + 32, ctrbuf + 16); + incr_ctr(ctrbuf + 48, ctrbuf + 32); + if (len >= 64) { + switch (ctx->num_rounds) { + case 10: + ctr_128(ctx->skey.skni, ctrbuf, data, len >> 6); + break; + case 12: + ctr_192(ctx->skey.skni, ctrbuf, data, len >> 6); + break; + default: + ctr_256(ctx->skey.skni, ctrbuf, data, len >> 6); + break; + } + data = (unsigned char *)data + (len & ~(size_t)63); + len &= 63; + } + if (len > 0) { + unsigned char tmp[64]; + + if (len >= 32) { + if (len >= 48) { + memcpy(ctr, ctrbuf + 48, 16); + } else { + memcpy(ctr, ctrbuf + 32, 16); + } + } else { + if (len >= 16) { + memcpy(ctr, ctrbuf + 16, 16); + } + } + memcpy(tmp, data, len); + memset(tmp + len, 0, (sizeof tmp) - len); + switch (ctx->num_rounds) { + case 10: + ctr_128(ctx->skey.skni, ctrbuf, tmp, 1); + break; + case 12: + ctr_192(ctx->skey.skni, ctrbuf, tmp, 1); + break; + default: + ctr_256(ctx->skey.skni, ctrbuf, tmp, 1); + break; + } + memcpy(data, tmp, len); + } else { + memcpy(ctr, ctrbuf, 16); + } +} + +/* see bearssl_block.h */ +void +br_aes_pwr8_ctrcbc_mac(const br_aes_pwr8_ctrcbc_keys *ctx, + void *cbcmac, const void *data, size_t len) +{ + if (len > 0) { + switch (ctx->num_rounds) { + case 10: + cbcmac_128(ctx->skey.skni, cbcmac, data, len >> 4); + break; + case 12: + cbcmac_192(ctx->skey.skni, cbcmac, data, len >> 4); + break; + default: + cbcmac_256(ctx->skey.skni, cbcmac, data, len >> 4); + break; + } + } +} + +/* see bearssl_block.h */ +const br_block_ctrcbc_class br_aes_pwr8_ctrcbc_vtable = { + sizeof(br_aes_pwr8_ctrcbc_keys), + 16, + 4, + (void (*)(const br_block_ctrcbc_class **, const void *, size_t)) + &br_aes_pwr8_ctrcbc_init, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, void *, size_t)) + &br_aes_pwr8_ctrcbc_encrypt, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, void *, size_t)) + &br_aes_pwr8_ctrcbc_decrypt, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, size_t)) + &br_aes_pwr8_ctrcbc_ctr, + (void (*)(const br_block_ctrcbc_class *const *, + void *, const void *, size_t)) + &br_aes_pwr8_ctrcbc_mac +}; + +#else + +/* see bearssl_block.h */ +const br_block_ctrcbc_class * +br_aes_pwr8_ctrcbc_get_vtable(void) +{ + return NULL; +} + +#endif diff --git a/test/monniaux/BearSSL/src/symcipher/aes_small_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/aes_small_cbcdec.c new file mode 100644 index 00000000..8567244b --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_small_cbcdec.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_small_cbcdec_init(br_aes_small_cbcdec_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_small_cbcdec_vtable; + ctx->num_rounds = br_aes_keysched(ctx->skey, key, len); +} + +/* see bearssl_block.h */ +void +br_aes_small_cbcdec_run(const br_aes_small_cbcdec_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf, *ivbuf; + + ivbuf = iv; + buf = data; + while (len > 0) { + unsigned char tmp[16]; + int i; + + memcpy(tmp, buf, 16); + br_aes_small_decrypt(ctx->num_rounds, ctx->skey, buf); + for (i = 0; i < 16; i ++) { + buf[i] ^= ivbuf[i]; + } + memcpy(ivbuf, tmp, 16); + buf += 16; + len -= 16; + } +} + +/* see bearssl_block.h */ +const br_block_cbcdec_class br_aes_small_cbcdec_vtable = { + sizeof(br_aes_small_cbcdec_keys), + 16, + 4, + (void (*)(const br_block_cbcdec_class **, const void *, size_t)) + &br_aes_small_cbcdec_init, + (void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t)) + &br_aes_small_cbcdec_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_small_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/aes_small_cbcenc.c new file mode 100644 index 00000000..0dc2910a --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_small_cbcenc.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_small_cbcenc_init(br_aes_small_cbcenc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_small_cbcenc_vtable; + ctx->num_rounds = br_aes_keysched(ctx->skey, key, len); +} + +/* see bearssl_block.h */ +void +br_aes_small_cbcenc_run(const br_aes_small_cbcenc_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf, *ivbuf; + + ivbuf = iv; + buf = data; + while (len > 0) { + int i; + + for (i = 0; i < 16; i ++) { + buf[i] ^= ivbuf[i]; + } + br_aes_small_encrypt(ctx->num_rounds, ctx->skey, buf); + memcpy(ivbuf, buf, 16); + buf += 16; + len -= 16; + } +} + +/* see bearssl_block.h */ +const br_block_cbcenc_class br_aes_small_cbcenc_vtable = { + sizeof(br_aes_small_cbcenc_keys), + 16, + 4, + (void (*)(const br_block_cbcenc_class **, const void *, size_t)) + &br_aes_small_cbcenc_init, + (void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t)) + &br_aes_small_cbcenc_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_small_ctr.c b/test/monniaux/BearSSL/src/symcipher/aes_small_ctr.c new file mode 100644 index 00000000..d5d371c6 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_small_ctr.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_small_ctr_init(br_aes_small_ctr_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_small_ctr_vtable; + ctx->num_rounds = br_aes_keysched(ctx->skey, key, len); +} + +static void +xorbuf(void *dst, const void *src, size_t len) +{ + unsigned char *d; + const unsigned char *s; + + d = dst; + s = src; + while (len -- > 0) { + *d ++ ^= *s ++; + } +} + +/* see bearssl_block.h */ +uint32_t +br_aes_small_ctr_run(const br_aes_small_ctr_keys *ctx, + const void *iv, uint32_t cc, void *data, size_t len) +{ + unsigned char *buf; + + buf = data; + while (len > 0) { + unsigned char tmp[16]; + + memcpy(tmp, iv, 12); + br_enc32be(tmp + 12, cc ++); + br_aes_small_encrypt(ctx->num_rounds, ctx->skey, tmp); + if (len <= 16) { + xorbuf(buf, tmp, len); + break; + } + xorbuf(buf, tmp, 16); + buf += 16; + len -= 16; + } + return cc; +} + +/* see bearssl_block.h */ +const br_block_ctr_class br_aes_small_ctr_vtable = { + sizeof(br_aes_small_ctr_keys), + 16, + 4, + (void (*)(const br_block_ctr_class **, const void *, size_t)) + &br_aes_small_ctr_init, + (uint32_t (*)(const br_block_ctr_class *const *, + const void *, uint32_t, void *, size_t)) + &br_aes_small_ctr_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_small_ctrcbc.c b/test/monniaux/BearSSL/src/symcipher/aes_small_ctrcbc.c new file mode 100644 index 00000000..2d6ba329 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_small_ctrcbc.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_small_ctrcbc_init(br_aes_small_ctrcbc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_small_ctrcbc_vtable; + ctx->num_rounds = br_aes_keysched(ctx->skey, key, len); +} + +static void +xorbuf(void *dst, const void *src, size_t len) +{ + unsigned char *d; + const unsigned char *s; + + d = dst; + s = src; + while (len -- > 0) { + *d ++ ^= *s ++; + } +} + +/* see bearssl_block.h */ +void +br_aes_small_ctrcbc_ctr(const br_aes_small_ctrcbc_keys *ctx, + void *ctr, void *data, size_t len) +{ + unsigned char *buf, *bctr; + uint32_t cc0, cc1, cc2, cc3; + + buf = data; + bctr = ctr; + cc3 = br_dec32be(bctr + 0); + cc2 = br_dec32be(bctr + 4); + cc1 = br_dec32be(bctr + 8); + cc0 = br_dec32be(bctr + 12); + while (len > 0) { + unsigned char tmp[16]; + uint32_t carry; + + br_enc32be(tmp + 0, cc3); + br_enc32be(tmp + 4, cc2); + br_enc32be(tmp + 8, cc1); + br_enc32be(tmp + 12, cc0); + br_aes_small_encrypt(ctx->num_rounds, ctx->skey, tmp); + xorbuf(buf, tmp, 16); + buf += 16; + len -= 16; + cc0 ++; + carry = (~(cc0 | -cc0)) >> 31; + cc1 += carry; + carry &= (~(cc1 | -cc1)) >> 31; + cc2 += carry; + carry &= (~(cc2 | -cc2)) >> 31; + cc3 += carry; + } + br_enc32be(bctr + 0, cc3); + br_enc32be(bctr + 4, cc2); + br_enc32be(bctr + 8, cc1); + br_enc32be(bctr + 12, cc0); +} + +/* see bearssl_block.h */ +void +br_aes_small_ctrcbc_mac(const br_aes_small_ctrcbc_keys *ctx, + void *cbcmac, const void *data, size_t len) +{ + const unsigned char *buf; + + buf = data; + while (len > 0) { + xorbuf(cbcmac, buf, 16); + br_aes_small_encrypt(ctx->num_rounds, ctx->skey, cbcmac); + buf += 16; + len -= 16; + } +} + +/* see bearssl_block.h */ +void +br_aes_small_ctrcbc_encrypt(const br_aes_small_ctrcbc_keys *ctx, + void *ctr, void *cbcmac, void *data, size_t len) +{ + br_aes_small_ctrcbc_ctr(ctx, ctr, data, len); + br_aes_small_ctrcbc_mac(ctx, cbcmac, data, len); +} + +/* see bearssl_block.h */ +void +br_aes_small_ctrcbc_decrypt(const br_aes_small_ctrcbc_keys *ctx, + void *ctr, void *cbcmac, void *data, size_t len) +{ + br_aes_small_ctrcbc_mac(ctx, cbcmac, data, len); + br_aes_small_ctrcbc_ctr(ctx, ctr, data, len); +} + +/* see bearssl_block.h */ +const br_block_ctrcbc_class br_aes_small_ctrcbc_vtable = { + sizeof(br_aes_small_ctrcbc_keys), + 16, + 4, + (void (*)(const br_block_ctrcbc_class **, const void *, size_t)) + &br_aes_small_ctrcbc_init, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, void *, size_t)) + &br_aes_small_ctrcbc_encrypt, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, void *, size_t)) + &br_aes_small_ctrcbc_decrypt, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, size_t)) + &br_aes_small_ctrcbc_ctr, + (void (*)(const br_block_ctrcbc_class *const *, + void *, const void *, size_t)) + &br_aes_small_ctrcbc_mac +}; diff --git a/test/monniaux/BearSSL/src/symcipher/aes_small_dec.c b/test/monniaux/BearSSL/src/symcipher/aes_small_dec.c new file mode 100644 index 00000000..59dca8ec --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_small_dec.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * Inverse S-box. + */ +static const unsigned char iS[] = { + 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, + 0x81, 0xF3, 0xD7, 0xFB, 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, + 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, 0x54, 0x7B, 0x94, 0x32, + 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, + 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, + 0x6D, 0x8B, 0xD1, 0x25, 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, 0x6C, 0x70, 0x48, 0x50, + 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, + 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, + 0xB8, 0xB3, 0x45, 0x06, 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, + 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, 0x3A, 0x91, 0x11, 0x41, + 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, + 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, + 0x1C, 0x75, 0xDF, 0x6E, 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, + 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, 0xFC, 0x56, 0x3E, 0x4B, + 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, + 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, + 0x27, 0x80, 0xEC, 0x5F, 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, + 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, 0xA0, 0xE0, 0x3B, 0x4D, + 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, + 0x55, 0x21, 0x0C, 0x7D +}; + +static void +add_round_key(unsigned *state, const uint32_t *skeys) +{ + int i; + + for (i = 0; i < 16; i += 4) { + uint32_t k; + + k = *skeys ++; + state[i + 0] ^= (unsigned)(k >> 24); + state[i + 1] ^= (unsigned)(k >> 16) & 0xFF; + state[i + 2] ^= (unsigned)(k >> 8) & 0xFF; + state[i + 3] ^= (unsigned)k & 0xFF; + } +} + +static void +inv_sub_bytes(unsigned *state) +{ + int i; + + for (i = 0; i < 16; i ++) { + state[i] = iS[state[i]]; + } +} + +static void +inv_shift_rows(unsigned *state) +{ + unsigned tmp; + + tmp = state[13]; + state[13] = state[9]; + state[9] = state[5]; + state[5] = state[1]; + state[1] = tmp; + + tmp = state[2]; + state[2] = state[10]; + state[10] = tmp; + tmp = state[6]; + state[6] = state[14]; + state[14] = tmp; + + tmp = state[3]; + state[3] = state[7]; + state[7] = state[11]; + state[11] = state[15]; + state[15] = tmp; +} + +static inline unsigned +gf256red(unsigned x) +{ + unsigned y; + + y = x >> 8; + return (x ^ y ^ (y << 1) ^ (y << 3) ^ (y << 4)) & 0xFF; +} + +static void +inv_mix_columns(unsigned *state) +{ + int i; + + for (i = 0; i < 16; i += 4) { + unsigned s0, s1, s2, s3; + unsigned t0, t1, t2, t3; + + s0 = state[i + 0]; + s1 = state[i + 1]; + s2 = state[i + 2]; + s3 = state[i + 3]; + t0 = (s0 << 1) ^ (s0 << 2) ^ (s0 << 3) + ^ s1 ^ (s1 << 1) ^ (s1 << 3) + ^ s2 ^ (s2 << 2) ^ (s2 << 3) + ^ s3 ^ (s3 << 3); + t1 = s0 ^ (s0 << 3) + ^ (s1 << 1) ^ (s1 << 2) ^ (s1 << 3) + ^ s2 ^ (s2 << 1) ^ (s2 << 3) + ^ s3 ^ (s3 << 2) ^ (s3 << 3); + t2 = s0 ^ (s0 << 2) ^ (s0 << 3) + ^ s1 ^ (s1 << 3) + ^ (s2 << 1) ^ (s2 << 2) ^ (s2 << 3) + ^ s3 ^ (s3 << 1) ^ (s3 << 3); + t3 = s0 ^ (s0 << 1) ^ (s0 << 3) + ^ s1 ^ (s1 << 2) ^ (s1 << 3) + ^ s2 ^ (s2 << 3) + ^ (s3 << 1) ^ (s3 << 2) ^ (s3 << 3); + state[i + 0] = gf256red(t0); + state[i + 1] = gf256red(t1); + state[i + 2] = gf256red(t2); + state[i + 3] = gf256red(t3); + } +} + +/* see inner.h */ +void +br_aes_small_decrypt(unsigned num_rounds, const uint32_t *skey, void *data) +{ + unsigned char *buf; + unsigned state[16]; + unsigned u; + + buf = data; + for (u = 0; u < 16; u ++) { + state[u] = buf[u]; + } + add_round_key(state, skey + (num_rounds << 2)); + for (u = num_rounds - 1; u > 0; u --) { + inv_shift_rows(state); + inv_sub_bytes(state); + add_round_key(state, skey + (u << 2)); + inv_mix_columns(state); + } + inv_shift_rows(state); + inv_sub_bytes(state); + add_round_key(state, skey); + for (u = 0; u < 16; u ++) { + buf[u] = state[u]; + } +} diff --git a/test/monniaux/BearSSL/src/symcipher/aes_small_enc.c b/test/monniaux/BearSSL/src/symcipher/aes_small_enc.c new file mode 100644 index 00000000..29f48a8f --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_small_enc.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +#define S br_aes_S + +static void +add_round_key(unsigned *state, const uint32_t *skeys) +{ + int i; + + for (i = 0; i < 16; i += 4) { + uint32_t k; + + k = *skeys ++; + state[i + 0] ^= (unsigned)(k >> 24); + state[i + 1] ^= (unsigned)(k >> 16) & 0xFF; + state[i + 2] ^= (unsigned)(k >> 8) & 0xFF; + state[i + 3] ^= (unsigned)k & 0xFF; + } +} + +static void +sub_bytes(unsigned *state) +{ + int i; + + for (i = 0; i < 16; i ++) { + state[i] = S[state[i]]; + } +} + +static void +shift_rows(unsigned *state) +{ + unsigned tmp; + + tmp = state[1]; + state[1] = state[5]; + state[5] = state[9]; + state[9] = state[13]; + state[13] = tmp; + + tmp = state[2]; + state[2] = state[10]; + state[10] = tmp; + tmp = state[6]; + state[6] = state[14]; + state[14] = tmp; + + tmp = state[15]; + state[15] = state[11]; + state[11] = state[7]; + state[7] = state[3]; + state[3] = tmp; +} + +static void +mix_columns(unsigned *state) +{ + int i; + + for (i = 0; i < 16; i += 4) { + unsigned s0, s1, s2, s3; + unsigned t0, t1, t2, t3; + + s0 = state[i + 0]; + s1 = state[i + 1]; + s2 = state[i + 2]; + s3 = state[i + 3]; + t0 = (s0 << 1) ^ s1 ^ (s1 << 1) ^ s2 ^ s3; + t1 = s0 ^ (s1 << 1) ^ s2 ^ (s2 << 1) ^ s3; + t2 = s0 ^ s1 ^ (s2 << 1) ^ s3 ^ (s3 << 1); + t3 = s0 ^ (s0 << 1) ^ s1 ^ s2 ^ (s3 << 1); + state[i + 0] = t0 ^ ((unsigned)(-(int)(t0 >> 8)) & 0x11B); + state[i + 1] = t1 ^ ((unsigned)(-(int)(t1 >> 8)) & 0x11B); + state[i + 2] = t2 ^ ((unsigned)(-(int)(t2 >> 8)) & 0x11B); + state[i + 3] = t3 ^ ((unsigned)(-(int)(t3 >> 8)) & 0x11B); + } +} + +/* see inner.h */ +void +br_aes_small_encrypt(unsigned num_rounds, const uint32_t *skey, void *data) +{ + unsigned char *buf; + unsigned state[16]; + unsigned u; + + buf = data; + for (u = 0; u < 16; u ++) { + state[u] = buf[u]; + } + add_round_key(state, skey); + for (u = 1; u < num_rounds; u ++) { + sub_bytes(state); + shift_rows(state); + mix_columns(state); + add_round_key(state, skey + (u << 2)); + } + sub_bytes(state); + shift_rows(state); + add_round_key(state, skey + (num_rounds << 2)); + for (u = 0; u < 16; u ++) { + buf[u] = state[u]; + } +} diff --git a/test/monniaux/BearSSL/src/symcipher/aes_x86ni.c b/test/monniaux/BearSSL/src/symcipher/aes_x86ni.c new file mode 100644 index 00000000..d5408f13 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_x86ni.c @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_ENABLE_INTRINSICS 1 +#include "inner.h" + +/* + * This code contains the AES key schedule implementation using the + * AES-NI opcodes. + */ + +#if BR_AES_X86NI + +/* see inner.h */ +int +br_aes_x86ni_supported(void) +{ + /* + * Bit mask for features in ECX: + * 19 SSE4.1 (used for _mm_insert_epi32(), for AES-CTR) + * 25 AES-NI + */ + return br_cpuid(0, 0, 0x02080000, 0); +} + +BR_TARGETS_X86_UP + +BR_TARGET("sse2,aes") +static inline __m128i +expand_step128(__m128i k, __m128i k2) +{ + k = _mm_xor_si128(k, _mm_slli_si128(k, 4)); + k = _mm_xor_si128(k, _mm_slli_si128(k, 4)); + k = _mm_xor_si128(k, _mm_slli_si128(k, 4)); + k2 = _mm_shuffle_epi32(k2, 0xFF); + return _mm_xor_si128(k, k2); +} + +BR_TARGET("sse2,aes") +static inline void +expand_step192(__m128i *t1, __m128i *t2, __m128i *t3) +{ + __m128i t4; + + *t2 = _mm_shuffle_epi32(*t2, 0x55); + t4 = _mm_slli_si128(*t1, 0x4); + *t1 = _mm_xor_si128(*t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + *t1 = _mm_xor_si128(*t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + *t1 = _mm_xor_si128(*t1, t4); + *t1 = _mm_xor_si128(*t1, *t2); + *t2 = _mm_shuffle_epi32(*t1, 0xFF); + t4 = _mm_slli_si128(*t3, 0x4); + *t3 = _mm_xor_si128(*t3, t4); + *t3 = _mm_xor_si128(*t3, *t2); +} + +BR_TARGET("sse2,aes") +static inline void +expand_step256_1(__m128i *t1, __m128i *t2) +{ + __m128i t4; + + *t2 = _mm_shuffle_epi32(*t2, 0xFF); + t4 = _mm_slli_si128(*t1, 0x4); + *t1 = _mm_xor_si128(*t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + *t1 = _mm_xor_si128(*t1, t4); + t4 = _mm_slli_si128(t4, 0x4); + *t1 = _mm_xor_si128(*t1, t4); + *t1 = _mm_xor_si128(*t1, *t2); +} + +BR_TARGET("sse2,aes") +static inline void +expand_step256_2(__m128i *t1, __m128i *t3) +{ + __m128i t2, t4; + + t4 = _mm_aeskeygenassist_si128(*t1, 0x0); + t2 = _mm_shuffle_epi32(t4, 0xAA); + t4 = _mm_slli_si128(*t3, 0x4); + *t3 = _mm_xor_si128(*t3, t4); + t4 = _mm_slli_si128(t4, 0x4); + *t3 = _mm_xor_si128(*t3, t4); + t4 = _mm_slli_si128(t4, 0x4); + *t3 = _mm_xor_si128(*t3, t4); + *t3 = _mm_xor_si128(*t3, t2); +} + +/* + * Perform key schedule for AES, encryption direction. Subkeys are written + * in sk[], and the number of rounds is returned. Key length MUST be 16, + * 24 or 32 bytes. + */ +BR_TARGET("sse2,aes") +static unsigned +x86ni_keysched(__m128i *sk, const void *key, size_t len) +{ + const unsigned char *kb; + +#define KEXP128(k, i, rcon) do { \ + k = expand_step128(k, _mm_aeskeygenassist_si128(k, rcon)); \ + sk[i] = k; \ + } while (0) + +#define KEXP192(i, rcon1, rcon2) do { \ + sk[(i) + 0] = t1; \ + sk[(i) + 1] = t3; \ + t2 = _mm_aeskeygenassist_si128(t3, rcon1); \ + expand_step192(&t1, &t2, &t3); \ + sk[(i) + 1] = _mm_castpd_si128(_mm_shuffle_pd( \ + _mm_castsi128_pd(sk[(i) + 1]), \ + _mm_castsi128_pd(t1), 0)); \ + sk[(i) + 2] = _mm_castpd_si128(_mm_shuffle_pd( \ + _mm_castsi128_pd(t1), \ + _mm_castsi128_pd(t3), 1)); \ + t2 = _mm_aeskeygenassist_si128(t3, rcon2); \ + expand_step192(&t1, &t2, &t3); \ + } while (0) + +#define KEXP256(i, rcon) do { \ + sk[(i) + 0] = t3; \ + t2 = _mm_aeskeygenassist_si128(t3, rcon); \ + expand_step256_1(&t1, &t2); \ + sk[(i) + 1] = t1; \ + expand_step256_2(&t1, &t3); \ + } while (0) + + kb = key; + switch (len) { + __m128i t1, t2, t3; + + case 16: + t1 = _mm_loadu_si128((const void *)kb); + sk[0] = t1; + KEXP128(t1, 1, 0x01); + KEXP128(t1, 2, 0x02); + KEXP128(t1, 3, 0x04); + KEXP128(t1, 4, 0x08); + KEXP128(t1, 5, 0x10); + KEXP128(t1, 6, 0x20); + KEXP128(t1, 7, 0x40); + KEXP128(t1, 8, 0x80); + KEXP128(t1, 9, 0x1B); + KEXP128(t1, 10, 0x36); + return 10; + + case 24: + t1 = _mm_loadu_si128((const void *)kb); + t3 = _mm_loadu_si128((const void *)(kb + 8)); + t3 = _mm_shuffle_epi32(t3, 0x4E); + KEXP192(0, 0x01, 0x02); + KEXP192(3, 0x04, 0x08); + KEXP192(6, 0x10, 0x20); + KEXP192(9, 0x40, 0x80); + sk[12] = t1; + return 12; + + case 32: + t1 = _mm_loadu_si128((const void *)kb); + t3 = _mm_loadu_si128((const void *)(kb + 16)); + sk[0] = t1; + KEXP256( 1, 0x01); + KEXP256( 3, 0x02); + KEXP256( 5, 0x04); + KEXP256( 7, 0x08); + KEXP256( 9, 0x10); + KEXP256(11, 0x20); + sk[13] = t3; + t2 = _mm_aeskeygenassist_si128(t3, 0x40); + expand_step256_1(&t1, &t2); + sk[14] = t1; + return 14; + + default: + return 0; + } + +#undef KEXP128 +#undef KEXP192 +#undef KEXP256 +} + +/* see inner.h */ +BR_TARGET("sse2,aes") +unsigned +br_aes_x86ni_keysched_enc(unsigned char *skni, const void *key, size_t len) +{ + __m128i sk[15]; + unsigned num_rounds; + + num_rounds = x86ni_keysched(sk, key, len); + memcpy(skni, sk, (num_rounds + 1) << 4); + return num_rounds; +} + +/* see inner.h */ +BR_TARGET("sse2,aes") +unsigned +br_aes_x86ni_keysched_dec(unsigned char *skni, const void *key, size_t len) +{ + __m128i sk[15]; + unsigned u, num_rounds; + + num_rounds = x86ni_keysched(sk, key, len); + _mm_storeu_si128((void *)skni, sk[num_rounds]); + for (u = 1; u < num_rounds; u ++) { + _mm_storeu_si128((void *)(skni + (u << 4)), + _mm_aesimc_si128(sk[num_rounds - u])); + } + _mm_storeu_si128((void *)(skni + (num_rounds << 4)), sk[0]); + return num_rounds; +} + +BR_TARGETS_X86_DOWN + +#endif diff --git a/test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcdec.c new file mode 100644 index 00000000..862b1b5b --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcdec.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_ENABLE_INTRINSICS 1 +#include "inner.h" + +#if BR_AES_X86NI + +/* see bearssl_block.h */ +const br_block_cbcdec_class * +br_aes_x86ni_cbcdec_get_vtable(void) +{ + return br_aes_x86ni_supported() ? &br_aes_x86ni_cbcdec_vtable : NULL; +} + +/* see bearssl_block.h */ +void +br_aes_x86ni_cbcdec_init(br_aes_x86ni_cbcdec_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_x86ni_cbcdec_vtable; + ctx->num_rounds = br_aes_x86ni_keysched_dec(ctx->skey.skni, key, len); +} + +BR_TARGETS_X86_UP + +/* see bearssl_block.h */ +BR_TARGET("sse2,aes") +void +br_aes_x86ni_cbcdec_run(const br_aes_x86ni_cbcdec_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf; + unsigned num_rounds; + __m128i sk[15], ivx; + unsigned u; + + buf = data; + ivx = _mm_loadu_si128(iv); + num_rounds = ctx->num_rounds; + for (u = 0; u <= num_rounds; u ++) { + sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4))); + } + while (len > 0) { + __m128i x0, x1, x2, x3, e0, e1, e2, e3; + + x0 = _mm_loadu_si128((void *)(buf + 0)); + if (len >= 64) { + x1 = _mm_loadu_si128((void *)(buf + 16)); + x2 = _mm_loadu_si128((void *)(buf + 32)); + x3 = _mm_loadu_si128((void *)(buf + 48)); + } else { + x0 = _mm_loadu_si128((void *)(buf + 0)); + if (len >= 32) { + x1 = _mm_loadu_si128((void *)(buf + 16)); + if (len >= 48) { + x2 = _mm_loadu_si128( + (void *)(buf + 32)); + x3 = x2; + } else { + x2 = x0; + x3 = x1; + } + } else { + x1 = x0; + x2 = x0; + x3 = x0; + } + } + e0 = x0; + e1 = x1; + e2 = x2; + e3 = x3; + x0 = _mm_xor_si128(x0, sk[0]); + x1 = _mm_xor_si128(x1, sk[0]); + x2 = _mm_xor_si128(x2, sk[0]); + x3 = _mm_xor_si128(x3, sk[0]); + x0 = _mm_aesdec_si128(x0, sk[1]); + x1 = _mm_aesdec_si128(x1, sk[1]); + x2 = _mm_aesdec_si128(x2, sk[1]); + x3 = _mm_aesdec_si128(x3, sk[1]); + x0 = _mm_aesdec_si128(x0, sk[2]); + x1 = _mm_aesdec_si128(x1, sk[2]); + x2 = _mm_aesdec_si128(x2, sk[2]); + x3 = _mm_aesdec_si128(x3, sk[2]); + x0 = _mm_aesdec_si128(x0, sk[3]); + x1 = _mm_aesdec_si128(x1, sk[3]); + x2 = _mm_aesdec_si128(x2, sk[3]); + x3 = _mm_aesdec_si128(x3, sk[3]); + x0 = _mm_aesdec_si128(x0, sk[4]); + x1 = _mm_aesdec_si128(x1, sk[4]); + x2 = _mm_aesdec_si128(x2, sk[4]); + x3 = _mm_aesdec_si128(x3, sk[4]); + x0 = _mm_aesdec_si128(x0, sk[5]); + x1 = _mm_aesdec_si128(x1, sk[5]); + x2 = _mm_aesdec_si128(x2, sk[5]); + x3 = _mm_aesdec_si128(x3, sk[5]); + x0 = _mm_aesdec_si128(x0, sk[6]); + x1 = _mm_aesdec_si128(x1, sk[6]); + x2 = _mm_aesdec_si128(x2, sk[6]); + x3 = _mm_aesdec_si128(x3, sk[6]); + x0 = _mm_aesdec_si128(x0, sk[7]); + x1 = _mm_aesdec_si128(x1, sk[7]); + x2 = _mm_aesdec_si128(x2, sk[7]); + x3 = _mm_aesdec_si128(x3, sk[7]); + x0 = _mm_aesdec_si128(x0, sk[8]); + x1 = _mm_aesdec_si128(x1, sk[8]); + x2 = _mm_aesdec_si128(x2, sk[8]); + x3 = _mm_aesdec_si128(x3, sk[8]); + x0 = _mm_aesdec_si128(x0, sk[9]); + x1 = _mm_aesdec_si128(x1, sk[9]); + x2 = _mm_aesdec_si128(x2, sk[9]); + x3 = _mm_aesdec_si128(x3, sk[9]); + if (num_rounds == 10) { + x0 = _mm_aesdeclast_si128(x0, sk[10]); + x1 = _mm_aesdeclast_si128(x1, sk[10]); + x2 = _mm_aesdeclast_si128(x2, sk[10]); + x3 = _mm_aesdeclast_si128(x3, sk[10]); + } else if (num_rounds == 12) { + x0 = _mm_aesdec_si128(x0, sk[10]); + x1 = _mm_aesdec_si128(x1, sk[10]); + x2 = _mm_aesdec_si128(x2, sk[10]); + x3 = _mm_aesdec_si128(x3, sk[10]); + x0 = _mm_aesdec_si128(x0, sk[11]); + x1 = _mm_aesdec_si128(x1, sk[11]); + x2 = _mm_aesdec_si128(x2, sk[11]); + x3 = _mm_aesdec_si128(x3, sk[11]); + x0 = _mm_aesdeclast_si128(x0, sk[12]); + x1 = _mm_aesdeclast_si128(x1, sk[12]); + x2 = _mm_aesdeclast_si128(x2, sk[12]); + x3 = _mm_aesdeclast_si128(x3, sk[12]); + } else { + x0 = _mm_aesdec_si128(x0, sk[10]); + x1 = _mm_aesdec_si128(x1, sk[10]); + x2 = _mm_aesdec_si128(x2, sk[10]); + x3 = _mm_aesdec_si128(x3, sk[10]); + x0 = _mm_aesdec_si128(x0, sk[11]); + x1 = _mm_aesdec_si128(x1, sk[11]); + x2 = _mm_aesdec_si128(x2, sk[11]); + x3 = _mm_aesdec_si128(x3, sk[11]); + x0 = _mm_aesdec_si128(x0, sk[12]); + x1 = _mm_aesdec_si128(x1, sk[12]); + x2 = _mm_aesdec_si128(x2, sk[12]); + x3 = _mm_aesdec_si128(x3, sk[12]); + x0 = _mm_aesdec_si128(x0, sk[13]); + x1 = _mm_aesdec_si128(x1, sk[13]); + x2 = _mm_aesdec_si128(x2, sk[13]); + x3 = _mm_aesdec_si128(x3, sk[13]); + x0 = _mm_aesdeclast_si128(x0, sk[14]); + x1 = _mm_aesdeclast_si128(x1, sk[14]); + x2 = _mm_aesdeclast_si128(x2, sk[14]); + x3 = _mm_aesdeclast_si128(x3, sk[14]); + } + x0 = _mm_xor_si128(x0, ivx); + x1 = _mm_xor_si128(x1, e0); + x2 = _mm_xor_si128(x2, e1); + x3 = _mm_xor_si128(x3, e2); + ivx = e3; + _mm_storeu_si128((void *)(buf + 0), x0); + if (len >= 64) { + _mm_storeu_si128((void *)(buf + 16), x1); + _mm_storeu_si128((void *)(buf + 32), x2); + _mm_storeu_si128((void *)(buf + 48), x3); + buf += 64; + len -= 64; + } else { + if (len >= 32) { + _mm_storeu_si128((void *)(buf + 16), x1); + if (len >= 48) { + _mm_storeu_si128( + (void *)(buf + 32), x2); + } + } + break; + } + } + _mm_storeu_si128(iv, ivx); +} + +BR_TARGETS_X86_DOWN + +/* see bearssl_block.h */ +const br_block_cbcdec_class br_aes_x86ni_cbcdec_vtable = { + sizeof(br_aes_x86ni_cbcdec_keys), + 16, + 4, + (void (*)(const br_block_cbcdec_class **, const void *, size_t)) + &br_aes_x86ni_cbcdec_init, + (void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t)) + &br_aes_x86ni_cbcdec_run +}; + +#else + +/* see bearssl_block.h */ +const br_block_cbcdec_class * +br_aes_x86ni_cbcdec_get_vtable(void) +{ + return NULL; +} + +#endif diff --git a/test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcenc.c new file mode 100644 index 00000000..85feecdb --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_cbcenc.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_ENABLE_INTRINSICS 1 +#include "inner.h" + +#if BR_AES_X86NI + +/* see bearssl_block.h */ +const br_block_cbcenc_class * +br_aes_x86ni_cbcenc_get_vtable(void) +{ + return br_aes_x86ni_supported() ? &br_aes_x86ni_cbcenc_vtable : NULL; +} + +/* see bearssl_block.h */ +void +br_aes_x86ni_cbcenc_init(br_aes_x86ni_cbcenc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_x86ni_cbcenc_vtable; + ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len); +} + +BR_TARGETS_X86_UP + +/* see bearssl_block.h */ +BR_TARGET("sse2,aes") +void +br_aes_x86ni_cbcenc_run(const br_aes_x86ni_cbcenc_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf; + unsigned num_rounds; + __m128i sk[15], ivx; + unsigned u; + + buf = data; + ivx = _mm_loadu_si128(iv); + num_rounds = ctx->num_rounds; + for (u = 0; u <= num_rounds; u ++) { + sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4))); + } + while (len > 0) { + __m128i x; + + x = _mm_xor_si128(_mm_loadu_si128((void *)buf), ivx); + x = _mm_xor_si128(x, sk[0]); + x = _mm_aesenc_si128(x, sk[1]); + x = _mm_aesenc_si128(x, sk[2]); + x = _mm_aesenc_si128(x, sk[3]); + x = _mm_aesenc_si128(x, sk[4]); + x = _mm_aesenc_si128(x, sk[5]); + x = _mm_aesenc_si128(x, sk[6]); + x = _mm_aesenc_si128(x, sk[7]); + x = _mm_aesenc_si128(x, sk[8]); + x = _mm_aesenc_si128(x, sk[9]); + if (num_rounds == 10) { + x = _mm_aesenclast_si128(x, sk[10]); + } else if (num_rounds == 12) { + x = _mm_aesenc_si128(x, sk[10]); + x = _mm_aesenc_si128(x, sk[11]); + x = _mm_aesenclast_si128(x, sk[12]); + } else { + x = _mm_aesenc_si128(x, sk[10]); + x = _mm_aesenc_si128(x, sk[11]); + x = _mm_aesenc_si128(x, sk[12]); + x = _mm_aesenc_si128(x, sk[13]); + x = _mm_aesenclast_si128(x, sk[14]); + } + ivx = x; + _mm_storeu_si128((void *)buf, x); + buf += 16; + len -= 16; + } + _mm_storeu_si128(iv, ivx); +} + +BR_TARGETS_X86_DOWN + +/* see bearssl_block.h */ +const br_block_cbcenc_class br_aes_x86ni_cbcenc_vtable = { + sizeof(br_aes_x86ni_cbcenc_keys), + 16, + 4, + (void (*)(const br_block_cbcenc_class **, const void *, size_t)) + &br_aes_x86ni_cbcenc_init, + (void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t)) + &br_aes_x86ni_cbcenc_run +}; + +#else + +/* see bearssl_block.h */ +const br_block_cbcenc_class * +br_aes_x86ni_cbcenc_get_vtable(void) +{ + return NULL; +} + +#endif diff --git a/test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctr.c b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctr.c new file mode 100644 index 00000000..1cddd606 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctr.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_ENABLE_INTRINSICS 1 +#include "inner.h" + +#if BR_AES_X86NI + +/* see bearssl_block.h */ +const br_block_ctr_class * +br_aes_x86ni_ctr_get_vtable(void) +{ + return br_aes_x86ni_supported() ? &br_aes_x86ni_ctr_vtable : NULL; +} + +/* see bearssl_block.h */ +void +br_aes_x86ni_ctr_init(br_aes_x86ni_ctr_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_x86ni_ctr_vtable; + ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len); +} + +BR_TARGETS_X86_UP + +/* see bearssl_block.h */ +BR_TARGET("sse2,sse4.1,aes") +uint32_t +br_aes_x86ni_ctr_run(const br_aes_x86ni_ctr_keys *ctx, + const void *iv, uint32_t cc, void *data, size_t len) +{ + unsigned char *buf; + unsigned char ivbuf[16]; + unsigned num_rounds; + __m128i sk[15]; + __m128i ivx; + unsigned u; + + buf = data; + memcpy(ivbuf, iv, 12); + num_rounds = ctx->num_rounds; + for (u = 0; u <= num_rounds; u ++) { + sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4))); + } + ivx = _mm_loadu_si128((void *)ivbuf); + while (len > 0) { + __m128i x0, x1, x2, x3; + + x0 = _mm_insert_epi32(ivx, br_bswap32(cc + 0), 3); + x1 = _mm_insert_epi32(ivx, br_bswap32(cc + 1), 3); + x2 = _mm_insert_epi32(ivx, br_bswap32(cc + 2), 3); + x3 = _mm_insert_epi32(ivx, br_bswap32(cc + 3), 3); + x0 = _mm_xor_si128(x0, sk[0]); + x1 = _mm_xor_si128(x1, sk[0]); + x2 = _mm_xor_si128(x2, sk[0]); + x3 = _mm_xor_si128(x3, sk[0]); + x0 = _mm_aesenc_si128(x0, sk[1]); + x1 = _mm_aesenc_si128(x1, sk[1]); + x2 = _mm_aesenc_si128(x2, sk[1]); + x3 = _mm_aesenc_si128(x3, sk[1]); + x0 = _mm_aesenc_si128(x0, sk[2]); + x1 = _mm_aesenc_si128(x1, sk[2]); + x2 = _mm_aesenc_si128(x2, sk[2]); + x3 = _mm_aesenc_si128(x3, sk[2]); + x0 = _mm_aesenc_si128(x0, sk[3]); + x1 = _mm_aesenc_si128(x1, sk[3]); + x2 = _mm_aesenc_si128(x2, sk[3]); + x3 = _mm_aesenc_si128(x3, sk[3]); + x0 = _mm_aesenc_si128(x0, sk[4]); + x1 = _mm_aesenc_si128(x1, sk[4]); + x2 = _mm_aesenc_si128(x2, sk[4]); + x3 = _mm_aesenc_si128(x3, sk[4]); + x0 = _mm_aesenc_si128(x0, sk[5]); + x1 = _mm_aesenc_si128(x1, sk[5]); + x2 = _mm_aesenc_si128(x2, sk[5]); + x3 = _mm_aesenc_si128(x3, sk[5]); + x0 = _mm_aesenc_si128(x0, sk[6]); + x1 = _mm_aesenc_si128(x1, sk[6]); + x2 = _mm_aesenc_si128(x2, sk[6]); + x3 = _mm_aesenc_si128(x3, sk[6]); + x0 = _mm_aesenc_si128(x0, sk[7]); + x1 = _mm_aesenc_si128(x1, sk[7]); + x2 = _mm_aesenc_si128(x2, sk[7]); + x3 = _mm_aesenc_si128(x3, sk[7]); + x0 = _mm_aesenc_si128(x0, sk[8]); + x1 = _mm_aesenc_si128(x1, sk[8]); + x2 = _mm_aesenc_si128(x2, sk[8]); + x3 = _mm_aesenc_si128(x3, sk[8]); + x0 = _mm_aesenc_si128(x0, sk[9]); + x1 = _mm_aesenc_si128(x1, sk[9]); + x2 = _mm_aesenc_si128(x2, sk[9]); + x3 = _mm_aesenc_si128(x3, sk[9]); + if (num_rounds == 10) { + x0 = _mm_aesenclast_si128(x0, sk[10]); + x1 = _mm_aesenclast_si128(x1, sk[10]); + x2 = _mm_aesenclast_si128(x2, sk[10]); + x3 = _mm_aesenclast_si128(x3, sk[10]); + } else if (num_rounds == 12) { + x0 = _mm_aesenc_si128(x0, sk[10]); + x1 = _mm_aesenc_si128(x1, sk[10]); + x2 = _mm_aesenc_si128(x2, sk[10]); + x3 = _mm_aesenc_si128(x3, sk[10]); + x0 = _mm_aesenc_si128(x0, sk[11]); + x1 = _mm_aesenc_si128(x1, sk[11]); + x2 = _mm_aesenc_si128(x2, sk[11]); + x3 = _mm_aesenc_si128(x3, sk[11]); + x0 = _mm_aesenclast_si128(x0, sk[12]); + x1 = _mm_aesenclast_si128(x1, sk[12]); + x2 = _mm_aesenclast_si128(x2, sk[12]); + x3 = _mm_aesenclast_si128(x3, sk[12]); + } else { + x0 = _mm_aesenc_si128(x0, sk[10]); + x1 = _mm_aesenc_si128(x1, sk[10]); + x2 = _mm_aesenc_si128(x2, sk[10]); + x3 = _mm_aesenc_si128(x3, sk[10]); + x0 = _mm_aesenc_si128(x0, sk[11]); + x1 = _mm_aesenc_si128(x1, sk[11]); + x2 = _mm_aesenc_si128(x2, sk[11]); + x3 = _mm_aesenc_si128(x3, sk[11]); + x0 = _mm_aesenc_si128(x0, sk[12]); + x1 = _mm_aesenc_si128(x1, sk[12]); + x2 = _mm_aesenc_si128(x2, sk[12]); + x3 = _mm_aesenc_si128(x3, sk[12]); + x0 = _mm_aesenc_si128(x0, sk[13]); + x1 = _mm_aesenc_si128(x1, sk[13]); + x2 = _mm_aesenc_si128(x2, sk[13]); + x3 = _mm_aesenc_si128(x3, sk[13]); + x0 = _mm_aesenclast_si128(x0, sk[14]); + x1 = _mm_aesenclast_si128(x1, sk[14]); + x2 = _mm_aesenclast_si128(x2, sk[14]); + x3 = _mm_aesenclast_si128(x3, sk[14]); + } + if (len >= 64) { + x0 = _mm_xor_si128(x0, + _mm_loadu_si128((void *)(buf + 0))); + x1 = _mm_xor_si128(x1, + _mm_loadu_si128((void *)(buf + 16))); + x2 = _mm_xor_si128(x2, + _mm_loadu_si128((void *)(buf + 32))); + x3 = _mm_xor_si128(x3, + _mm_loadu_si128((void *)(buf + 48))); + _mm_storeu_si128((void *)(buf + 0), x0); + _mm_storeu_si128((void *)(buf + 16), x1); + _mm_storeu_si128((void *)(buf + 32), x2); + _mm_storeu_si128((void *)(buf + 48), x3); + buf += 64; + len -= 64; + cc += 4; + } else { + unsigned char tmp[64]; + + _mm_storeu_si128((void *)(tmp + 0), x0); + _mm_storeu_si128((void *)(tmp + 16), x1); + _mm_storeu_si128((void *)(tmp + 32), x2); + _mm_storeu_si128((void *)(tmp + 48), x3); + for (u = 0; u < len; u ++) { + buf[u] ^= tmp[u]; + } + cc += (uint32_t)len >> 4; + break; + } + } + return cc; +} + +BR_TARGETS_X86_DOWN + +/* see bearssl_block.h */ +const br_block_ctr_class br_aes_x86ni_ctr_vtable = { + sizeof(br_aes_x86ni_ctr_keys), + 16, + 4, + (void (*)(const br_block_ctr_class **, const void *, size_t)) + &br_aes_x86ni_ctr_init, + (uint32_t (*)(const br_block_ctr_class *const *, + const void *, uint32_t, void *, size_t)) + &br_aes_x86ni_ctr_run +}; + +#else + +/* see bearssl_block.h */ +const br_block_ctr_class * +br_aes_x86ni_ctr_get_vtable(void) +{ + return NULL; +} + +#endif diff --git a/test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctrcbc.c b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctrcbc.c new file mode 100644 index 00000000..f57fead6 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/aes_x86ni_ctrcbc.c @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_ENABLE_INTRINSICS 1 +#include "inner.h" + +#if BR_AES_X86NI + +/* see bearssl_block.h */ +const br_block_ctrcbc_class * +br_aes_x86ni_ctrcbc_get_vtable(void) +{ + return br_aes_x86ni_supported() ? &br_aes_x86ni_ctrcbc_vtable : NULL; +} + +/* see bearssl_block.h */ +void +br_aes_x86ni_ctrcbc_init(br_aes_x86ni_ctrcbc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_x86ni_ctrcbc_vtable; + ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len); +} + +BR_TARGETS_X86_UP + +/* see bearssl_block.h */ +BR_TARGET("sse2,sse4.1,aes") +void +br_aes_x86ni_ctrcbc_ctr(const br_aes_x86ni_ctrcbc_keys *ctx, + void *ctr, void *data, size_t len) +{ + unsigned char *buf; + unsigned num_rounds; + __m128i sk[15]; + __m128i ivx0, ivx1, ivx2, ivx3; + __m128i erev, zero, one, four, notthree; + unsigned u; + + buf = data; + num_rounds = ctx->num_rounds; + for (u = 0; u <= num_rounds; u ++) { + sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4))); + } + + /* + * Some SSE2 constants. + */ + erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + zero = _mm_setzero_si128(); + one = _mm_set_epi64x(0, 1); + four = _mm_set_epi64x(0, 4); + notthree = _mm_sub_epi64(zero, four); + + /* + * Decode the counter in big-endian and pre-increment the other + * three counters. + */ + ivx0 = _mm_shuffle_epi8(_mm_loadu_si128((void *)ctr), erev); + ivx1 = _mm_add_epi64(ivx0, one); + ivx1 = _mm_sub_epi64(ivx1, + _mm_slli_si128(_mm_cmpeq_epi64(ivx1, zero), 8)); + ivx2 = _mm_add_epi64(ivx1, one); + ivx2 = _mm_sub_epi64(ivx2, + _mm_slli_si128(_mm_cmpeq_epi64(ivx2, zero), 8)); + ivx3 = _mm_add_epi64(ivx2, one); + ivx3 = _mm_sub_epi64(ivx3, + _mm_slli_si128(_mm_cmpeq_epi64(ivx3, zero), 8)); + while (len > 0) { + __m128i x0, x1, x2, x3; + + /* + * Load counter values; we need to byteswap them because + * the specification says that they use big-endian. + */ + x0 = _mm_shuffle_epi8(ivx0, erev); + x1 = _mm_shuffle_epi8(ivx1, erev); + x2 = _mm_shuffle_epi8(ivx2, erev); + x3 = _mm_shuffle_epi8(ivx3, erev); + + x0 = _mm_xor_si128(x0, sk[0]); + x1 = _mm_xor_si128(x1, sk[0]); + x2 = _mm_xor_si128(x2, sk[0]); + x3 = _mm_xor_si128(x3, sk[0]); + x0 = _mm_aesenc_si128(x0, sk[1]); + x1 = _mm_aesenc_si128(x1, sk[1]); + x2 = _mm_aesenc_si128(x2, sk[1]); + x3 = _mm_aesenc_si128(x3, sk[1]); + x0 = _mm_aesenc_si128(x0, sk[2]); + x1 = _mm_aesenc_si128(x1, sk[2]); + x2 = _mm_aesenc_si128(x2, sk[2]); + x3 = _mm_aesenc_si128(x3, sk[2]); + x0 = _mm_aesenc_si128(x0, sk[3]); + x1 = _mm_aesenc_si128(x1, sk[3]); + x2 = _mm_aesenc_si128(x2, sk[3]); + x3 = _mm_aesenc_si128(x3, sk[3]); + x0 = _mm_aesenc_si128(x0, sk[4]); + x1 = _mm_aesenc_si128(x1, sk[4]); + x2 = _mm_aesenc_si128(x2, sk[4]); + x3 = _mm_aesenc_si128(x3, sk[4]); + x0 = _mm_aesenc_si128(x0, sk[5]); + x1 = _mm_aesenc_si128(x1, sk[5]); + x2 = _mm_aesenc_si128(x2, sk[5]); + x3 = _mm_aesenc_si128(x3, sk[5]); + x0 = _mm_aesenc_si128(x0, sk[6]); + x1 = _mm_aesenc_si128(x1, sk[6]); + x2 = _mm_aesenc_si128(x2, sk[6]); + x3 = _mm_aesenc_si128(x3, sk[6]); + x0 = _mm_aesenc_si128(x0, sk[7]); + x1 = _mm_aesenc_si128(x1, sk[7]); + x2 = _mm_aesenc_si128(x2, sk[7]); + x3 = _mm_aesenc_si128(x3, sk[7]); + x0 = _mm_aesenc_si128(x0, sk[8]); + x1 = _mm_aesenc_si128(x1, sk[8]); + x2 = _mm_aesenc_si128(x2, sk[8]); + x3 = _mm_aesenc_si128(x3, sk[8]); + x0 = _mm_aesenc_si128(x0, sk[9]); + x1 = _mm_aesenc_si128(x1, sk[9]); + x2 = _mm_aesenc_si128(x2, sk[9]); + x3 = _mm_aesenc_si128(x3, sk[9]); + if (num_rounds == 10) { + x0 = _mm_aesenclast_si128(x0, sk[10]); + x1 = _mm_aesenclast_si128(x1, sk[10]); + x2 = _mm_aesenclast_si128(x2, sk[10]); + x3 = _mm_aesenclast_si128(x3, sk[10]); + } else if (num_rounds == 12) { + x0 = _mm_aesenc_si128(x0, sk[10]); + x1 = _mm_aesenc_si128(x1, sk[10]); + x2 = _mm_aesenc_si128(x2, sk[10]); + x3 = _mm_aesenc_si128(x3, sk[10]); + x0 = _mm_aesenc_si128(x0, sk[11]); + x1 = _mm_aesenc_si128(x1, sk[11]); + x2 = _mm_aesenc_si128(x2, sk[11]); + x3 = _mm_aesenc_si128(x3, sk[11]); + x0 = _mm_aesenclast_si128(x0, sk[12]); + x1 = _mm_aesenclast_si128(x1, sk[12]); + x2 = _mm_aesenclast_si128(x2, sk[12]); + x3 = _mm_aesenclast_si128(x3, sk[12]); + } else { + x0 = _mm_aesenc_si128(x0, sk[10]); + x1 = _mm_aesenc_si128(x1, sk[10]); + x2 = _mm_aesenc_si128(x2, sk[10]); + x3 = _mm_aesenc_si128(x3, sk[10]); + x0 = _mm_aesenc_si128(x0, sk[11]); + x1 = _mm_aesenc_si128(x1, sk[11]); + x2 = _mm_aesenc_si128(x2, sk[11]); + x3 = _mm_aesenc_si128(x3, sk[11]); + x0 = _mm_aesenc_si128(x0, sk[12]); + x1 = _mm_aesenc_si128(x1, sk[12]); + x2 = _mm_aesenc_si128(x2, sk[12]); + x3 = _mm_aesenc_si128(x3, sk[12]); + x0 = _mm_aesenc_si128(x0, sk[13]); + x1 = _mm_aesenc_si128(x1, sk[13]); + x2 = _mm_aesenc_si128(x2, sk[13]); + x3 = _mm_aesenc_si128(x3, sk[13]); + x0 = _mm_aesenclast_si128(x0, sk[14]); + x1 = _mm_aesenclast_si128(x1, sk[14]); + x2 = _mm_aesenclast_si128(x2, sk[14]); + x3 = _mm_aesenclast_si128(x3, sk[14]); + } + if (len >= 64) { + x0 = _mm_xor_si128(x0, + _mm_loadu_si128((void *)(buf + 0))); + x1 = _mm_xor_si128(x1, + _mm_loadu_si128((void *)(buf + 16))); + x2 = _mm_xor_si128(x2, + _mm_loadu_si128((void *)(buf + 32))); + x3 = _mm_xor_si128(x3, + _mm_loadu_si128((void *)(buf + 48))); + _mm_storeu_si128((void *)(buf + 0), x0); + _mm_storeu_si128((void *)(buf + 16), x1); + _mm_storeu_si128((void *)(buf + 32), x2); + _mm_storeu_si128((void *)(buf + 48), x3); + buf += 64; + len -= 64; + } else { + unsigned char tmp[64]; + + _mm_storeu_si128((void *)(tmp + 0), x0); + _mm_storeu_si128((void *)(tmp + 16), x1); + _mm_storeu_si128((void *)(tmp + 32), x2); + _mm_storeu_si128((void *)(tmp + 48), x3); + for (u = 0; u < len; u ++) { + buf[u] ^= tmp[u]; + } + switch (len) { + case 16: + ivx0 = ivx1; + break; + case 32: + ivx0 = ivx2; + break; + case 48: + ivx0 = ivx3; + break; + } + break; + } + + /* + * Add 4 to each counter value. For carry propagation + * into the upper 64-bit words, we would need to compare + * the results with 4, but SSE2+ has only _signed_ + * comparisons. Instead, we mask out the low two bits, + * and check whether the remaining bits are zero. + */ + ivx0 = _mm_add_epi64(ivx0, four); + ivx1 = _mm_add_epi64(ivx1, four); + ivx2 = _mm_add_epi64(ivx2, four); + ivx3 = _mm_add_epi64(ivx3, four); + ivx0 = _mm_sub_epi64(ivx0, + _mm_slli_si128(_mm_cmpeq_epi64( + _mm_and_si128(ivx0, notthree), zero), 8)); + ivx1 = _mm_sub_epi64(ivx1, + _mm_slli_si128(_mm_cmpeq_epi64( + _mm_and_si128(ivx1, notthree), zero), 8)); + ivx2 = _mm_sub_epi64(ivx2, + _mm_slli_si128(_mm_cmpeq_epi64( + _mm_and_si128(ivx2, notthree), zero), 8)); + ivx3 = _mm_sub_epi64(ivx3, + _mm_slli_si128(_mm_cmpeq_epi64( + _mm_and_si128(ivx3, notthree), zero), 8)); + } + + /* + * Write back new counter value. The loop took care to put the + * right counter value in ivx0. + */ + _mm_storeu_si128((void *)ctr, _mm_shuffle_epi8(ivx0, erev)); +} + +/* see bearssl_block.h */ +BR_TARGET("sse2,sse4.1,aes") +void +br_aes_x86ni_ctrcbc_mac(const br_aes_x86ni_ctrcbc_keys *ctx, + void *cbcmac, const void *data, size_t len) +{ + const unsigned char *buf; + unsigned num_rounds; + __m128i sk[15], ivx; + unsigned u; + + buf = data; + ivx = _mm_loadu_si128(cbcmac); + num_rounds = ctx->num_rounds; + for (u = 0; u <= num_rounds; u ++) { + sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4))); + } + while (len > 0) { + __m128i x; + + x = _mm_xor_si128(_mm_loadu_si128((void *)buf), ivx); + x = _mm_xor_si128(x, sk[0]); + x = _mm_aesenc_si128(x, sk[1]); + x = _mm_aesenc_si128(x, sk[2]); + x = _mm_aesenc_si128(x, sk[3]); + x = _mm_aesenc_si128(x, sk[4]); + x = _mm_aesenc_si128(x, sk[5]); + x = _mm_aesenc_si128(x, sk[6]); + x = _mm_aesenc_si128(x, sk[7]); + x = _mm_aesenc_si128(x, sk[8]); + x = _mm_aesenc_si128(x, sk[9]); + if (num_rounds == 10) { + x = _mm_aesenclast_si128(x, sk[10]); + } else if (num_rounds == 12) { + x = _mm_aesenc_si128(x, sk[10]); + x = _mm_aesenc_si128(x, sk[11]); + x = _mm_aesenclast_si128(x, sk[12]); + } else { + x = _mm_aesenc_si128(x, sk[10]); + x = _mm_aesenc_si128(x, sk[11]); + x = _mm_aesenc_si128(x, sk[12]); + x = _mm_aesenc_si128(x, sk[13]); + x = _mm_aesenclast_si128(x, sk[14]); + } + ivx = x; + buf += 16; + len -= 16; + } + _mm_storeu_si128(cbcmac, ivx); +} + +/* see bearssl_block.h */ +BR_TARGET("sse2,sse4.1,aes") +void +br_aes_x86ni_ctrcbc_encrypt(const br_aes_x86ni_ctrcbc_keys *ctx, + void *ctr, void *cbcmac, void *data, size_t len) +{ + unsigned char *buf; + unsigned num_rounds; + __m128i sk[15]; + __m128i ivx, cmx; + __m128i erev, zero, one; + unsigned u; + int first_iter; + + num_rounds = ctx->num_rounds; + for (u = 0; u <= num_rounds; u ++) { + sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4))); + } + + /* + * Some SSE2 constants. + */ + erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + zero = _mm_setzero_si128(); + one = _mm_set_epi64x(0, 1); + + /* + * Decode the counter in big-endian. + */ + ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev); + cmx = _mm_loadu_si128(cbcmac); + + buf = data; + first_iter = 1; + while (len > 0) { + __m128i dx, x0, x1; + + /* + * Load initial values: + * dx encrypted block of data + * x0 counter (for CTR encryption) + * x1 input for CBC-MAC + */ + dx = _mm_loadu_si128((void *)buf); + x0 = _mm_shuffle_epi8(ivx, erev); + x1 = cmx; + + x0 = _mm_xor_si128(x0, sk[0]); + x1 = _mm_xor_si128(x1, sk[0]); + x0 = _mm_aesenc_si128(x0, sk[1]); + x1 = _mm_aesenc_si128(x1, sk[1]); + x0 = _mm_aesenc_si128(x0, sk[2]); + x1 = _mm_aesenc_si128(x1, sk[2]); + x0 = _mm_aesenc_si128(x0, sk[3]); + x1 = _mm_aesenc_si128(x1, sk[3]); + x0 = _mm_aesenc_si128(x0, sk[4]); + x1 = _mm_aesenc_si128(x1, sk[4]); + x0 = _mm_aesenc_si128(x0, sk[5]); + x1 = _mm_aesenc_si128(x1, sk[5]); + x0 = _mm_aesenc_si128(x0, sk[6]); + x1 = _mm_aesenc_si128(x1, sk[6]); + x0 = _mm_aesenc_si128(x0, sk[7]); + x1 = _mm_aesenc_si128(x1, sk[7]); + x0 = _mm_aesenc_si128(x0, sk[8]); + x1 = _mm_aesenc_si128(x1, sk[8]); + x0 = _mm_aesenc_si128(x0, sk[9]); + x1 = _mm_aesenc_si128(x1, sk[9]); + if (num_rounds == 10) { + x0 = _mm_aesenclast_si128(x0, sk[10]); + x1 = _mm_aesenclast_si128(x1, sk[10]); + } else if (num_rounds == 12) { + x0 = _mm_aesenc_si128(x0, sk[10]); + x1 = _mm_aesenc_si128(x1, sk[10]); + x0 = _mm_aesenc_si128(x0, sk[11]); + x1 = _mm_aesenc_si128(x1, sk[11]); + x0 = _mm_aesenclast_si128(x0, sk[12]); + x1 = _mm_aesenclast_si128(x1, sk[12]); + } else { + x0 = _mm_aesenc_si128(x0, sk[10]); + x1 = _mm_aesenc_si128(x1, sk[10]); + x0 = _mm_aesenc_si128(x0, sk[11]); + x1 = _mm_aesenc_si128(x1, sk[11]); + x0 = _mm_aesenc_si128(x0, sk[12]); + x1 = _mm_aesenc_si128(x1, sk[12]); + x0 = _mm_aesenc_si128(x0, sk[13]); + x1 = _mm_aesenc_si128(x1, sk[13]); + x0 = _mm_aesenclast_si128(x0, sk[14]); + x1 = _mm_aesenclast_si128(x1, sk[14]); + } + + x0 = _mm_xor_si128(x0, dx); + if (first_iter) { + cmx = _mm_xor_si128(cmx, x0); + first_iter = 0; + } else { + cmx = _mm_xor_si128(x1, x0); + } + _mm_storeu_si128((void *)buf, x0); + + buf += 16; + len -= 16; + + /* + * Increment the counter value. + */ + ivx = _mm_add_epi64(ivx, one); + ivx = _mm_sub_epi64(ivx, + _mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8)); + + /* + * If this was the last iteration, then compute the + * extra block encryption to complete CBC-MAC. + */ + if (len == 0) { + cmx = _mm_xor_si128(cmx, sk[0]); + cmx = _mm_aesenc_si128(cmx, sk[1]); + cmx = _mm_aesenc_si128(cmx, sk[2]); + cmx = _mm_aesenc_si128(cmx, sk[3]); + cmx = _mm_aesenc_si128(cmx, sk[4]); + cmx = _mm_aesenc_si128(cmx, sk[5]); + cmx = _mm_aesenc_si128(cmx, sk[6]); + cmx = _mm_aesenc_si128(cmx, sk[7]); + cmx = _mm_aesenc_si128(cmx, sk[8]); + cmx = _mm_aesenc_si128(cmx, sk[9]); + if (num_rounds == 10) { + cmx = _mm_aesenclast_si128(cmx, sk[10]); + } else if (num_rounds == 12) { + cmx = _mm_aesenc_si128(cmx, sk[10]); + cmx = _mm_aesenc_si128(cmx, sk[11]); + cmx = _mm_aesenclast_si128(cmx, sk[12]); + } else { + cmx = _mm_aesenc_si128(cmx, sk[10]); + cmx = _mm_aesenc_si128(cmx, sk[11]); + cmx = _mm_aesenc_si128(cmx, sk[12]); + cmx = _mm_aesenc_si128(cmx, sk[13]); + cmx = _mm_aesenclast_si128(cmx, sk[14]); + } + break; + } + } + + /* + * Write back new counter value and CBC-MAC value. + */ + _mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev)); + _mm_storeu_si128(cbcmac, cmx); +} + +/* see bearssl_block.h */ +BR_TARGET("sse2,sse4.1,aes") +void +br_aes_x86ni_ctrcbc_decrypt(const br_aes_x86ni_ctrcbc_keys *ctx, + void *ctr, void *cbcmac, void *data, size_t len) +{ + unsigned char *buf; + unsigned num_rounds; + __m128i sk[15]; + __m128i ivx, cmx; + __m128i erev, zero, one; + unsigned u; + + num_rounds = ctx->num_rounds; + for (u = 0; u <= num_rounds; u ++) { + sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4))); + } + + /* + * Some SSE2 constants. + */ + erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15); + zero = _mm_setzero_si128(); + one = _mm_set_epi64x(0, 1); + + /* + * Decode the counter in big-endian. + */ + ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev); + cmx = _mm_loadu_si128(cbcmac); + + buf = data; + while (len > 0) { + __m128i dx, x0, x1; + + /* + * Load initial values: + * dx encrypted block of data + * x0 counter (for CTR encryption) + * x1 input for CBC-MAC + */ + dx = _mm_loadu_si128((void *)buf); + x0 = _mm_shuffle_epi8(ivx, erev); + x1 = _mm_xor_si128(cmx, dx); + + x0 = _mm_xor_si128(x0, sk[0]); + x1 = _mm_xor_si128(x1, sk[0]); + x0 = _mm_aesenc_si128(x0, sk[1]); + x1 = _mm_aesenc_si128(x1, sk[1]); + x0 = _mm_aesenc_si128(x0, sk[2]); + x1 = _mm_aesenc_si128(x1, sk[2]); + x0 = _mm_aesenc_si128(x0, sk[3]); + x1 = _mm_aesenc_si128(x1, sk[3]); + x0 = _mm_aesenc_si128(x0, sk[4]); + x1 = _mm_aesenc_si128(x1, sk[4]); + x0 = _mm_aesenc_si128(x0, sk[5]); + x1 = _mm_aesenc_si128(x1, sk[5]); + x0 = _mm_aesenc_si128(x0, sk[6]); + x1 = _mm_aesenc_si128(x1, sk[6]); + x0 = _mm_aesenc_si128(x0, sk[7]); + x1 = _mm_aesenc_si128(x1, sk[7]); + x0 = _mm_aesenc_si128(x0, sk[8]); + x1 = _mm_aesenc_si128(x1, sk[8]); + x0 = _mm_aesenc_si128(x0, sk[9]); + x1 = _mm_aesenc_si128(x1, sk[9]); + if (num_rounds == 10) { + x0 = _mm_aesenclast_si128(x0, sk[10]); + x1 = _mm_aesenclast_si128(x1, sk[10]); + } else if (num_rounds == 12) { + x0 = _mm_aesenc_si128(x0, sk[10]); + x1 = _mm_aesenc_si128(x1, sk[10]); + x0 = _mm_aesenc_si128(x0, sk[11]); + x1 = _mm_aesenc_si128(x1, sk[11]); + x0 = _mm_aesenclast_si128(x0, sk[12]); + x1 = _mm_aesenclast_si128(x1, sk[12]); + } else { + x0 = _mm_aesenc_si128(x0, sk[10]); + x1 = _mm_aesenc_si128(x1, sk[10]); + x0 = _mm_aesenc_si128(x0, sk[11]); + x1 = _mm_aesenc_si128(x1, sk[11]); + x0 = _mm_aesenc_si128(x0, sk[12]); + x1 = _mm_aesenc_si128(x1, sk[12]); + x0 = _mm_aesenc_si128(x0, sk[13]); + x1 = _mm_aesenc_si128(x1, sk[13]); + x0 = _mm_aesenclast_si128(x0, sk[14]); + x1 = _mm_aesenclast_si128(x1, sk[14]); + } + x0 = _mm_xor_si128(x0, dx); + cmx = x1; + _mm_storeu_si128((void *)buf, x0); + + buf += 16; + len -= 16; + + /* + * Increment the counter value. + */ + ivx = _mm_add_epi64(ivx, one); + ivx = _mm_sub_epi64(ivx, + _mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8)); + } + + /* + * Write back new counter value and CBC-MAC value. + */ + _mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev)); + _mm_storeu_si128(cbcmac, cmx); +} + +BR_TARGETS_X86_DOWN + +/* see bearssl_block.h */ +const br_block_ctrcbc_class br_aes_x86ni_ctrcbc_vtable = { + sizeof(br_aes_x86ni_ctrcbc_keys), + 16, + 4, + (void (*)(const br_block_ctrcbc_class **, const void *, size_t)) + &br_aes_x86ni_ctrcbc_init, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, void *, size_t)) + &br_aes_x86ni_ctrcbc_encrypt, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, void *, size_t)) + &br_aes_x86ni_ctrcbc_decrypt, + (void (*)(const br_block_ctrcbc_class *const *, + void *, void *, size_t)) + &br_aes_x86ni_ctrcbc_ctr, + (void (*)(const br_block_ctrcbc_class *const *, + void *, const void *, size_t)) + &br_aes_x86ni_ctrcbc_mac +}; + +#else + +/* see bearssl_block.h */ +const br_block_ctrcbc_class * +br_aes_x86ni_ctrcbc_get_vtable(void) +{ + return NULL; +} + +#endif diff --git a/test/monniaux/BearSSL/src/symcipher/chacha20_ct.c b/test/monniaux/BearSSL/src/symcipher/chacha20_ct.c new file mode 100644 index 00000000..9961eb11 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/chacha20_ct.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +uint32_t +br_chacha20_ct_run(const void *key, + const void *iv, uint32_t cc, void *data, size_t len) +{ + unsigned char *buf; + uint32_t kw[8], ivw[3]; + size_t u; + + static const uint32_t CW[] = { + 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 + }; + + buf = data; + for (u = 0; u < 8; u ++) { + kw[u] = br_dec32le((const unsigned char *)key + (u << 2)); + } + for (u = 0; u < 3; u ++) { + ivw[u] = br_dec32le((const unsigned char *)iv + (u << 2)); + } + while (len > 0) { + uint32_t state[16]; + int i; + size_t clen; + unsigned char tmp[64]; + + memcpy(&state[0], CW, sizeof CW); + memcpy(&state[4], kw, sizeof kw); + state[12] = cc; + memcpy(&state[13], ivw, sizeof ivw); + for (i = 0; i < 10; i ++) { + +#define QROUND(a, b, c, d) do { \ + state[a] += state[b]; \ + state[d] ^= state[a]; \ + state[d] = (state[d] << 16) | (state[d] >> 16); \ + state[c] += state[d]; \ + state[b] ^= state[c]; \ + state[b] = (state[b] << 12) | (state[b] >> 20); \ + state[a] += state[b]; \ + state[d] ^= state[a]; \ + state[d] = (state[d] << 8) | (state[d] >> 24); \ + state[c] += state[d]; \ + state[b] ^= state[c]; \ + state[b] = (state[b] << 7) | (state[b] >> 25); \ + } while (0) + + QROUND( 0, 4, 8, 12); + QROUND( 1, 5, 9, 13); + QROUND( 2, 6, 10, 14); + QROUND( 3, 7, 11, 15); + QROUND( 0, 5, 10, 15); + QROUND( 1, 6, 11, 12); + QROUND( 2, 7, 8, 13); + QROUND( 3, 4, 9, 14); + +#undef QROUND + + } + for (u = 0; u < 4; u ++) { + br_enc32le(&tmp[u << 2], state[u] + CW[u]); + } + for (u = 4; u < 12; u ++) { + br_enc32le(&tmp[u << 2], state[u] + kw[u - 4]); + } + br_enc32le(&tmp[48], state[12] + cc); + for (u = 13; u < 16; u ++) { + br_enc32le(&tmp[u << 2], state[u] + ivw[u - 13]); + } + + clen = len < 64 ? len : 64; + for (u = 0; u < clen; u ++) { + buf[u] ^= tmp[u]; + } + buf += clen; + len -= clen; + cc ++; + } + return cc; +} diff --git a/test/monniaux/BearSSL/src/symcipher/chacha20_sse2.c b/test/monniaux/BearSSL/src/symcipher/chacha20_sse2.c new file mode 100644 index 00000000..92b4a4a8 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/chacha20_sse2.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define BR_ENABLE_INTRINSICS 1 +#include "inner.h" + +#if BR_SSE2 + +/* + * This file contains a ChaCha20 implementation that leverages SSE2 + * opcodes for better performance. + */ + +/* see bearssl_block.h */ +br_chacha20_run +br_chacha20_sse2_get(void) +{ + /* + * If using 64-bit mode, then SSE2 opcodes should be automatically + * available, since they are part of the ABI. + * + * In 32-bit mode, we use CPUID to detect the SSE2 feature. + */ + +#if BR_amd64 + return &br_chacha20_sse2_run; +#else + + /* + * SSE2 support is indicated by bit 26 in EDX. + */ + if (br_cpuid(0, 0, 0, 0x04000000)) { + return &br_chacha20_sse2_run; + } else { + return 0; + } +#endif +} + +BR_TARGETS_X86_UP + +/* see bearssl_block.h */ +BR_TARGET("sse2") +uint32_t +br_chacha20_sse2_run(const void *key, + const void *iv, uint32_t cc, void *data, size_t len) +{ + unsigned char *buf; + uint32_t ivtmp[4]; + __m128i kw0, kw1; + __m128i iw, cw; + __m128i one; + + static const uint32_t CW[] = { + 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 + }; + + buf = data; + kw0 = _mm_loadu_si128(key); + kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16)); + ivtmp[0] = cc; + memcpy(ivtmp + 1, iv, 12); + iw = _mm_loadu_si128((const void *)ivtmp); + cw = _mm_loadu_si128((const void *)CW); + one = _mm_set_epi32(0, 0, 0, 1); + + while (len > 0) { + /* + * sj contains state words 4*j to 4*j+3. + */ + __m128i s0, s1, s2, s3; + int i; + + s0 = cw; + s1 = kw0; + s2 = kw1; + s3 = iw; + for (i = 0; i < 10; i ++) { + /* + * Even round is straightforward application on + * the state words. + */ + s0 = _mm_add_epi32(s0, s1); + s3 = _mm_xor_si128(s3, s0); + s3 = _mm_or_si128( + _mm_slli_epi32(s3, 16), + _mm_srli_epi32(s3, 16)); + + s2 = _mm_add_epi32(s2, s3); + s1 = _mm_xor_si128(s1, s2); + s1 = _mm_or_si128( + _mm_slli_epi32(s1, 12), + _mm_srli_epi32(s1, 20)); + + s0 = _mm_add_epi32(s0, s1); + s3 = _mm_xor_si128(s3, s0); + s3 = _mm_or_si128( + _mm_slli_epi32(s3, 8), + _mm_srli_epi32(s3, 24)); + + s2 = _mm_add_epi32(s2, s3); + s1 = _mm_xor_si128(s1, s2); + s1 = _mm_or_si128( + _mm_slli_epi32(s1, 7), + _mm_srli_epi32(s1, 25)); + + /* + * For the odd round, we must rotate some state + * words so that the computations apply on the + * right combinations of words. + */ + s1 = _mm_shuffle_epi32(s1, 0x39); + s2 = _mm_shuffle_epi32(s2, 0x4E); + s3 = _mm_shuffle_epi32(s3, 0x93); + + s0 = _mm_add_epi32(s0, s1); + s3 = _mm_xor_si128(s3, s0); + s3 = _mm_or_si128( + _mm_slli_epi32(s3, 16), + _mm_srli_epi32(s3, 16)); + + s2 = _mm_add_epi32(s2, s3); + s1 = _mm_xor_si128(s1, s2); + s1 = _mm_or_si128( + _mm_slli_epi32(s1, 12), + _mm_srli_epi32(s1, 20)); + + s0 = _mm_add_epi32(s0, s1); + s3 = _mm_xor_si128(s3, s0); + s3 = _mm_or_si128( + _mm_slli_epi32(s3, 8), + _mm_srli_epi32(s3, 24)); + + s2 = _mm_add_epi32(s2, s3); + s1 = _mm_xor_si128(s1, s2); + s1 = _mm_or_si128( + _mm_slli_epi32(s1, 7), + _mm_srli_epi32(s1, 25)); + + /* + * After the odd round, we rotate back the values + * to undo the rotate at the start of the odd round. + */ + s1 = _mm_shuffle_epi32(s1, 0x93); + s2 = _mm_shuffle_epi32(s2, 0x4E); + s3 = _mm_shuffle_epi32(s3, 0x39); + } + + /* + * Addition with the initial state. + */ + s0 = _mm_add_epi32(s0, cw); + s1 = _mm_add_epi32(s1, kw0); + s2 = _mm_add_epi32(s2, kw1); + s3 = _mm_add_epi32(s3, iw); + + /* + * Increment block counter. + */ + iw = _mm_add_epi32(iw, one); + + /* + * XOR final state with the data. + */ + if (len < 64) { + unsigned char tmp[64]; + size_t u; + + _mm_storeu_si128((void *)(tmp + 0), s0); + _mm_storeu_si128((void *)(tmp + 16), s1); + _mm_storeu_si128((void *)(tmp + 32), s2); + _mm_storeu_si128((void *)(tmp + 48), s3); + for (u = 0; u < len; u ++) { + buf[u] ^= tmp[u]; + } + break; + } else { + __m128i b0, b1, b2, b3; + + b0 = _mm_loadu_si128((const void *)(buf + 0)); + b1 = _mm_loadu_si128((const void *)(buf + 16)); + b2 = _mm_loadu_si128((const void *)(buf + 32)); + b3 = _mm_loadu_si128((const void *)(buf + 48)); + b0 = _mm_xor_si128(b0, s0); + b1 = _mm_xor_si128(b1, s1); + b2 = _mm_xor_si128(b2, s2); + b3 = _mm_xor_si128(b3, s3); + _mm_storeu_si128((void *)(buf + 0), b0); + _mm_storeu_si128((void *)(buf + 16), b1); + _mm_storeu_si128((void *)(buf + 32), b2); + _mm_storeu_si128((void *)(buf + 48), b3); + buf += 64; + len -= 64; + } + } + + /* + * _mm_extract_epi32() requires SSE4.1. We prefer to stick to + * raw SSE2, thus we use _mm_extract_epi16(). + */ + return (uint32_t)_mm_extract_epi16(iw, 0) + | ((uint32_t)_mm_extract_epi16(iw, 1) << 16); +} + +BR_TARGETS_X86_DOWN + +#else + +/* see bearssl_block.h */ +br_chacha20_run +br_chacha20_sse2_get(void) +{ + return 0; +} + +#endif diff --git a/test/monniaux/BearSSL/src/symcipher/des_ct.c b/test/monniaux/BearSSL/src/symcipher/des_ct.c new file mode 100644 index 00000000..581c0ab2 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/des_ct.c @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * During key schedule, we need to apply bit extraction PC-2 then permute + * things into our bitslice representation. PC-2 extracts 48 bits out + * of two 28-bit words (kl and kr), and we store these bits into two + * 32-bit words sk0 and sk1. + * + * -- bit 16+x of sk0 comes from bit QL0[x] of kl + * -- bit x of sk0 comes from bit QR0[x] of kr + * -- bit 16+x of sk1 comes from bit QL1[x] of kl + * -- bit x of sk1 comes from bit QR1[x] of kr + */ + +static const unsigned char QL0[] = { + 17, 4, 27, 23, 13, 22, 7, 18, + 16, 24, 2, 20, 1, 8, 15, 26 +}; + +static const unsigned char QR0[] = { + 25, 19, 9, 1, 5, 11, 23, 8, + 17, 0, 22, 3, 6, 20, 27, 24 +}; + +static const unsigned char QL1[] = { + 28, 28, 14, 11, 28, 28, 25, 0, + 28, 28, 5, 9, 28, 28, 12, 21 +}; + +static const unsigned char QR1[] = { + 28, 28, 15, 4, 28, 28, 26, 16, + 28, 28, 12, 7, 28, 28, 10, 14 +}; + +/* + * 32-bit rotation. The C compiler is supposed to recognize it as a + * rotation and use the local architecture rotation opcode (if available). + */ +static inline uint32_t +rotl(uint32_t x, int n) +{ + return (x << n) | (x >> (32 - n)); +} + +/* + * Compute key schedule for 8 key bytes (produces 32 subkey words). + */ +static void +keysched_unit(uint32_t *skey, const void *key) +{ + int i; + + br_des_keysched_unit(skey, key); + + /* + * Apply PC-2 + bitslicing. + */ + for (i = 0; i < 16; i ++) { + uint32_t kl, kr, sk0, sk1; + int j; + + kl = skey[(i << 1) + 0]; + kr = skey[(i << 1) + 1]; + sk0 = 0; + sk1 = 0; + for (j = 0; j < 16; j ++) { + sk0 <<= 1; + sk1 <<= 1; + sk0 |= ((kl >> QL0[j]) & (uint32_t)1) << 16; + sk0 |= (kr >> QR0[j]) & (uint32_t)1; + sk1 |= ((kl >> QL1[j]) & (uint32_t)1) << 16; + sk1 |= (kr >> QR1[j]) & (uint32_t)1; + } + + skey[(i << 1) + 0] = sk0; + skey[(i << 1) + 1] = sk1; + } + +#if 0 + /* + * Speed-optimized version for PC-2 + bitslicing. + * (Unused. Kept for reference only.) + */ + sk0 = kl & (uint32_t)0x00100000; + sk0 |= (kl & (uint32_t)0x08008000) << 2; + sk0 |= (kl & (uint32_t)0x00400000) << 4; + sk0 |= (kl & (uint32_t)0x00800000) << 5; + sk0 |= (kl & (uint32_t)0x00040000) << 6; + sk0 |= (kl & (uint32_t)0x00010000) << 7; + sk0 |= (kl & (uint32_t)0x00000100) << 10; + sk0 |= (kl & (uint32_t)0x00022000) << 14; + sk0 |= (kl & (uint32_t)0x00000082) << 18; + sk0 |= (kl & (uint32_t)0x00000004) << 19; + sk0 |= (kl & (uint32_t)0x04000000) >> 10; + sk0 |= (kl & (uint32_t)0x00000010) << 26; + sk0 |= (kl & (uint32_t)0x01000000) >> 2; + + sk0 |= kr & (uint32_t)0x00000100; + sk0 |= (kr & (uint32_t)0x00000008) << 1; + sk0 |= (kr & (uint32_t)0x00000200) << 4; + sk0 |= rotl(kr & (uint32_t)0x08000021, 6); + sk0 |= (kr & (uint32_t)0x01000000) >> 24; + sk0 |= (kr & (uint32_t)0x00000002) << 11; + sk0 |= (kr & (uint32_t)0x00100000) >> 18; + sk0 |= (kr & (uint32_t)0x00400000) >> 17; + sk0 |= (kr & (uint32_t)0x00800000) >> 14; + sk0 |= (kr & (uint32_t)0x02020000) >> 10; + sk0 |= (kr & (uint32_t)0x00080000) >> 5; + sk0 |= (kr & (uint32_t)0x00000040) >> 3; + sk0 |= (kr & (uint32_t)0x00000800) >> 1; + + sk1 = kl & (uint32_t)0x02000000; + sk1 |= (kl & (uint32_t)0x00001000) << 5; + sk1 |= (kl & (uint32_t)0x00000200) << 11; + sk1 |= (kl & (uint32_t)0x00004000) << 15; + sk1 |= (kl & (uint32_t)0x00000020) << 16; + sk1 |= (kl & (uint32_t)0x00000800) << 17; + sk1 |= (kl & (uint32_t)0x00000001) << 24; + sk1 |= (kl & (uint32_t)0x00200000) >> 5; + + sk1 |= (kr & (uint32_t)0x00000010) << 8; + sk1 |= (kr & (uint32_t)0x04000000) >> 17; + sk1 |= (kr & (uint32_t)0x00004000) >> 14; + sk1 |= (kr & (uint32_t)0x00000400) >> 9; + sk1 |= (kr & (uint32_t)0x00010000) >> 8; + sk1 |= (kr & (uint32_t)0x00001000) >> 7; + sk1 |= (kr & (uint32_t)0x00000080) >> 3; + sk1 |= (kr & (uint32_t)0x00008000) >> 2; +#endif +} + +/* see inner.h */ +unsigned +br_des_ct_keysched(uint32_t *skey, const void *key, size_t key_len) +{ + switch (key_len) { + case 8: + keysched_unit(skey, key); + return 1; + case 16: + keysched_unit(skey, key); + keysched_unit(skey + 32, (const unsigned char *)key + 8); + br_des_rev_skey(skey + 32); + memcpy(skey + 64, skey, 32 * sizeof *skey); + return 3; + default: + keysched_unit(skey, key); + keysched_unit(skey + 32, (const unsigned char *)key + 8); + br_des_rev_skey(skey + 32); + keysched_unit(skey + 64, (const unsigned char *)key + 16); + return 3; + } +} + +/* + * DES confusion function. This function performs expansion E (32 to + * 48 bits), XOR with subkey, S-boxes, and permutation P. + */ +static inline uint32_t +Fconf(uint32_t r0, const uint32_t *sk) +{ + /* + * Each 6->4 S-box is virtually turned into four 6->1 boxes; we + * thus end up with 32 boxes that we call "T-boxes" here. We will + * evaluate them with bitslice code. + * + * Each T-box is a circuit of multiplexers (sort of) and thus + * takes 70 inputs: the 6 actual T-box inputs, and 64 constants + * that describe the T-box output for all combinations of the + * 6 inputs. With this model, all T-boxes are identical (with + * distinct inputs) and thus can be executed in parallel with + * bitslice code. + * + * T-boxes are numbered from 0 to 31, in least-to-most + * significant order. Thus, S-box S1 corresponds to T-boxes 31, + * 30, 29 and 28, in that order. T-box 'n' is computed with the + * bits at rank 'n' in the 32-bit words. + * + * Words x0 to x5 contain the T-box inputs 0 to 5. + */ + uint32_t x0, x1, x2, x3, x4, x5, z0; + uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; + uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + uint32_t y20, y21, y22, y23, y24, y25, y26, y27, y28, y29; + uint32_t y30; + + /* + * Spread input bits over the 6 input words x*. + */ + x1 = r0 & (uint32_t)0x11111111; + x2 = (r0 >> 1) & (uint32_t)0x11111111; + x3 = (r0 >> 2) & (uint32_t)0x11111111; + x4 = (r0 >> 3) & (uint32_t)0x11111111; + x1 = (x1 << 4) - x1; + x2 = (x2 << 4) - x2; + x3 = (x3 << 4) - x3; + x4 = (x4 << 4) - x4; + x0 = (x4 << 4) | (x4 >> 28); + x5 = (x1 >> 4) | (x1 << 28); + + /* + * XOR with the subkey for this round. + */ + x0 ^= sk[0]; + x1 ^= sk[1]; + x2 ^= sk[2]; + x3 ^= sk[3]; + x4 ^= sk[4]; + x5 ^= sk[5]; + + /* + * The T-boxes are done in parallel, since they all use a + * "tree of multiplexer". We use "fake multiplexers": + * + * y = a ^ (x & b) + * + * computes y as either 'a' (if x == 0) or 'a ^ b' (if x == 1). + */ + y0 = (uint32_t)0xEFA72C4D ^ (x0 & (uint32_t)0xEC7AC69C); + y1 = (uint32_t)0xAEAAEDFF ^ (x0 & (uint32_t)0x500FB821); + y2 = (uint32_t)0x37396665 ^ (x0 & (uint32_t)0x40EFA809); + y3 = (uint32_t)0x68D7B833 ^ (x0 & (uint32_t)0xA5EC0B28); + y4 = (uint32_t)0xC9C755BB ^ (x0 & (uint32_t)0x252CF820); + y5 = (uint32_t)0x73FC3606 ^ (x0 & (uint32_t)0x40205801); + y6 = (uint32_t)0xA2A0A918 ^ (x0 & (uint32_t)0xE220F929); + y7 = (uint32_t)0x8222BD90 ^ (x0 & (uint32_t)0x44A3F9E1); + y8 = (uint32_t)0xD6B6AC77 ^ (x0 & (uint32_t)0x794F104A); + y9 = (uint32_t)0x3069300C ^ (x0 & (uint32_t)0x026F320B); + y10 = (uint32_t)0x6CE0D5CC ^ (x0 & (uint32_t)0x7640B01A); + y11 = (uint32_t)0x59A9A22D ^ (x0 & (uint32_t)0x238F1572); + y12 = (uint32_t)0xAC6D0BD4 ^ (x0 & (uint32_t)0x7A63C083); + y13 = (uint32_t)0x21C83200 ^ (x0 & (uint32_t)0x11CCA000); + y14 = (uint32_t)0xA0E62188 ^ (x0 & (uint32_t)0x202F69AA); + /* y15 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */ + y16 = (uint32_t)0xAF7D655A ^ (x0 & (uint32_t)0x51B33BE9); + y17 = (uint32_t)0xF0168AA3 ^ (x0 & (uint32_t)0x3B0FE8AE); + y18 = (uint32_t)0x90AA30C6 ^ (x0 & (uint32_t)0x90BF8816); + y19 = (uint32_t)0x5AB2750A ^ (x0 & (uint32_t)0x09E34F9B); + y20 = (uint32_t)0x5391BE65 ^ (x0 & (uint32_t)0x0103BE88); + y21 = (uint32_t)0x93372BAF ^ (x0 & (uint32_t)0x49AC8E25); + y22 = (uint32_t)0xF288210C ^ (x0 & (uint32_t)0x922C313D); + y23 = (uint32_t)0x920AF5C0 ^ (x0 & (uint32_t)0x70EF31B0); + y24 = (uint32_t)0x63D312C0 ^ (x0 & (uint32_t)0x6A707100); + y25 = (uint32_t)0x537B3006 ^ (x0 & (uint32_t)0xB97C9011); + y26 = (uint32_t)0xA2EFB0A5 ^ (x0 & (uint32_t)0xA320C959); + y27 = (uint32_t)0xBC8F96A5 ^ (x0 & (uint32_t)0x6EA0AB4A); + y28 = (uint32_t)0xFAD176A5 ^ (x0 & (uint32_t)0x6953DDF8); + y29 = (uint32_t)0x665A14A3 ^ (x0 & (uint32_t)0xF74F3E2B); + y30 = (uint32_t)0xF2EFF0CC ^ (x0 & (uint32_t)0xF0306CAD); + /* y31 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */ + + y0 = y0 ^ (x1 & y1); + y1 = y2 ^ (x1 & y3); + y2 = y4 ^ (x1 & y5); + y3 = y6 ^ (x1 & y7); + y4 = y8 ^ (x1 & y9); + y5 = y10 ^ (x1 & y11); + y6 = y12 ^ (x1 & y13); + y7 = y14; /* was: y14 ^ (x1 & y15) */ + y8 = y16 ^ (x1 & y17); + y9 = y18 ^ (x1 & y19); + y10 = y20 ^ (x1 & y21); + y11 = y22 ^ (x1 & y23); + y12 = y24 ^ (x1 & y25); + y13 = y26 ^ (x1 & y27); + y14 = y28 ^ (x1 & y29); + y15 = y30; /* was: y30 ^ (x1 & y31) */ + + y0 = y0 ^ (x2 & y1); + y1 = y2 ^ (x2 & y3); + y2 = y4 ^ (x2 & y5); + y3 = y6 ^ (x2 & y7); + y4 = y8 ^ (x2 & y9); + y5 = y10 ^ (x2 & y11); + y6 = y12 ^ (x2 & y13); + y7 = y14 ^ (x2 & y15); + + y0 = y0 ^ (x3 & y1); + y1 = y2 ^ (x3 & y3); + y2 = y4 ^ (x3 & y5); + y3 = y6 ^ (x3 & y7); + + y0 = y0 ^ (x4 & y1); + y1 = y2 ^ (x4 & y3); + + y0 = y0 ^ (x5 & y1); + + /* + * The P permutation: + * -- Each bit move is converted into a mask + left rotation. + * -- Rotations that use the same movement are coalesced together. + * -- Left and right shifts are used as alternatives to a rotation + * where appropriate (this will help architectures that do not have + * a rotation opcode). + */ + z0 = (y0 & (uint32_t)0x00000004) << 3; + z0 |= (y0 & (uint32_t)0x00004000) << 4; + z0 |= rotl(y0 & 0x12020120, 5); + z0 |= (y0 & (uint32_t)0x00100000) << 6; + z0 |= (y0 & (uint32_t)0x00008000) << 9; + z0 |= (y0 & (uint32_t)0x04000000) >> 22; + z0 |= (y0 & (uint32_t)0x00000001) << 11; + z0 |= rotl(y0 & 0x20000200, 12); + z0 |= (y0 & (uint32_t)0x00200000) >> 19; + z0 |= (y0 & (uint32_t)0x00000040) << 14; + z0 |= (y0 & (uint32_t)0x00010000) << 15; + z0 |= (y0 & (uint32_t)0x00000002) << 16; + z0 |= rotl(y0 & 0x40801800, 17); + z0 |= (y0 & (uint32_t)0x00080000) >> 13; + z0 |= (y0 & (uint32_t)0x00000010) << 21; + z0 |= (y0 & (uint32_t)0x01000000) >> 10; + z0 |= rotl(y0 & 0x88000008, 24); + z0 |= (y0 & (uint32_t)0x00000480) >> 7; + z0 |= (y0 & (uint32_t)0x00442000) >> 6; + return z0; +} + +/* + * Process one block through 16 successive rounds, omitting the swap + * in the final round. + */ +static void +process_block_unit(uint32_t *pl, uint32_t *pr, const uint32_t *sk_exp) +{ + int i; + uint32_t l, r; + + l = *pl; + r = *pr; + for (i = 0; i < 16; i ++) { + uint32_t t; + + t = l ^ Fconf(r, sk_exp); + l = r; + r = t; + sk_exp += 6; + } + *pl = r; + *pr = l; +} + +/* see inner.h */ +void +br_des_ct_process_block(unsigned num_rounds, + const uint32_t *sk_exp, void *block) +{ + unsigned char *buf; + uint32_t l, r; + + buf = block; + l = br_dec32be(buf); + r = br_dec32be(buf + 4); + br_des_do_IP(&l, &r); + while (num_rounds -- > 0) { + process_block_unit(&l, &r, sk_exp); + sk_exp += 96; + } + br_des_do_invIP(&l, &r); + br_enc32be(buf, l); + br_enc32be(buf + 4, r); +} + +/* see inner.h */ +void +br_des_ct_skey_expand(uint32_t *sk_exp, + unsigned num_rounds, const uint32_t *skey) +{ + num_rounds <<= 4; + while (num_rounds -- > 0) { + uint32_t v, w0, w1, w2, w3; + + v = *skey ++; + w0 = v & 0x11111111; + w1 = (v >> 1) & 0x11111111; + w2 = (v >> 2) & 0x11111111; + w3 = (v >> 3) & 0x11111111; + *sk_exp ++ = (w0 << 4) - w0; + *sk_exp ++ = (w1 << 4) - w1; + *sk_exp ++ = (w2 << 4) - w2; + *sk_exp ++ = (w3 << 4) - w3; + v = *skey ++; + w0 = v & 0x11111111; + w1 = (v >> 1) & 0x11111111; + *sk_exp ++ = (w0 << 4) - w0; + *sk_exp ++ = (w1 << 4) - w1; + } +} diff --git a/test/monniaux/BearSSL/src/symcipher/des_ct_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/des_ct_cbcdec.c new file mode 100644 index 00000000..d208a3d2 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/des_ct_cbcdec.c @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_des_ct_cbcdec_init(br_des_ct_cbcdec_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_des_ct_cbcdec_vtable; + ctx->num_rounds = br_des_ct_keysched(ctx->skey, key, len); + if (len == 8) { + br_des_rev_skey(ctx->skey); + } else { + int i; + + for (i = 0; i < 48; i += 2) { + uint32_t t; + + t = ctx->skey[i]; + ctx->skey[i] = ctx->skey[94 - i]; + ctx->skey[94 - i] = t; + t = ctx->skey[i + 1]; + ctx->skey[i + 1] = ctx->skey[95 - i]; + ctx->skey[95 - i] = t; + } + } +} + +/* see bearssl_block.h */ +void +br_des_ct_cbcdec_run(const br_des_ct_cbcdec_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf, *ivbuf; + uint32_t sk_exp[288]; + + br_des_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + ivbuf = iv; + buf = data; + while (len > 0) { + unsigned char tmp[8]; + int i; + + memcpy(tmp, buf, 8); + br_des_ct_process_block(ctx->num_rounds, sk_exp, buf); + for (i = 0; i < 8; i ++) { + buf[i] ^= ivbuf[i]; + } + memcpy(ivbuf, tmp, 8); + buf += 8; + len -= 8; + } +} + +/* see bearssl_block.h */ +const br_block_cbcdec_class br_des_ct_cbcdec_vtable = { + sizeof(br_des_ct_cbcdec_keys), + 8, + 3, + (void (*)(const br_block_cbcdec_class **, const void *, size_t)) + &br_des_ct_cbcdec_init, + (void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t)) + &br_des_ct_cbcdec_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/des_ct_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/des_ct_cbcenc.c new file mode 100644 index 00000000..4b3610e0 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/des_ct_cbcenc.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_des_ct_cbcenc_init(br_des_ct_cbcenc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_des_ct_cbcenc_vtable; + ctx->num_rounds = br_des_ct_keysched(ctx->skey, key, len); +} + +/* see bearssl_block.h */ +void +br_des_ct_cbcenc_run(const br_des_ct_cbcenc_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf, *ivbuf; + uint32_t sk_exp[288]; + + br_des_ct_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + ivbuf = iv; + buf = data; + while (len > 0) { + int i; + + for (i = 0; i < 8; i ++) { + buf[i] ^= ivbuf[i]; + } + br_des_ct_process_block(ctx->num_rounds, sk_exp, buf); + memcpy(ivbuf, buf, 8); + buf += 8; + len -= 8; + } +} + +/* see bearssl_block.h */ +const br_block_cbcenc_class br_des_ct_cbcenc_vtable = { + sizeof(br_des_ct_cbcenc_keys), + 8, + 3, + (void (*)(const br_block_cbcenc_class **, const void *, size_t)) + &br_des_ct_cbcenc_init, + (void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t)) + &br_des_ct_cbcenc_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/des_support.c b/test/monniaux/BearSSL/src/symcipher/des_support.c new file mode 100644 index 00000000..37f6db32 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/des_support.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see inner.h */ +void +br_des_do_IP(uint32_t *xl, uint32_t *xr) +{ + /* + * Permutation algorithm is initially from Richard Outerbridge; + * implementation here is adapted from Crypto++ "des.cpp" file + * (which is in public domain). + */ + uint32_t l, r, t; + + l = *xl; + r = *xr; + t = ((l >> 4) ^ r) & (uint32_t)0x0F0F0F0F; + r ^= t; + l ^= t << 4; + t = ((l >> 16) ^ r) & (uint32_t)0x0000FFFF; + r ^= t; + l ^= t << 16; + t = ((r >> 2) ^ l) & (uint32_t)0x33333333; + l ^= t; + r ^= t << 2; + t = ((r >> 8) ^ l) & (uint32_t)0x00FF00FF; + l ^= t; + r ^= t << 8; + t = ((l >> 1) ^ r) & (uint32_t)0x55555555; + r ^= t; + l ^= t << 1; + *xl = l; + *xr = r; +} + +/* see inner.h */ +void +br_des_do_invIP(uint32_t *xl, uint32_t *xr) +{ + /* + * See br_des_do_IP(). + */ + uint32_t l, r, t; + + l = *xl; + r = *xr; + t = ((l >> 1) ^ r) & 0x55555555; + r ^= t; + l ^= t << 1; + t = ((r >> 8) ^ l) & 0x00FF00FF; + l ^= t; + r ^= t << 8; + t = ((r >> 2) ^ l) & 0x33333333; + l ^= t; + r ^= t << 2; + t = ((l >> 16) ^ r) & 0x0000FFFF; + r ^= t; + l ^= t << 16; + t = ((l >> 4) ^ r) & 0x0F0F0F0F; + r ^= t; + l ^= t << 4; + *xl = l; + *xr = r; +} + +/* see inner.h */ +void +br_des_keysched_unit(uint32_t *skey, const void *key) +{ + uint32_t xl, xr, kl, kr; + int i; + + xl = br_dec32be(key); + xr = br_dec32be((const unsigned char *)key + 4); + + /* + * Permutation PC-1 is quite similar to the IP permutation. + * Definition of IP (in FIPS 46-3 notations) is: + * 58 50 42 34 26 18 10 2 + * 60 52 44 36 28 20 12 4 + * 62 54 46 38 30 22 14 6 + * 64 56 48 40 32 24 16 8 + * 57 49 41 33 25 17 9 1 + * 59 51 43 35 27 19 11 3 + * 61 53 45 37 29 21 13 5 + * 63 55 47 39 31 23 15 7 + * + * Definition of PC-1 is: + * 57 49 41 33 25 17 9 1 + * 58 50 42 34 26 18 10 2 + * 59 51 43 35 27 19 11 3 + * 60 52 44 36 + * 63 55 47 39 31 23 15 7 + * 62 54 46 38 30 22 14 6 + * 61 53 45 37 29 21 13 5 + * 28 20 12 4 + */ + br_des_do_IP(&xl, &xr); + kl = ((xr & (uint32_t)0xFF000000) >> 4) + | ((xl & (uint32_t)0xFF000000) >> 12) + | ((xr & (uint32_t)0x00FF0000) >> 12) + | ((xl & (uint32_t)0x00FF0000) >> 20); + kr = ((xr & (uint32_t)0x000000FF) << 20) + | ((xl & (uint32_t)0x0000FF00) << 4) + | ((xr & (uint32_t)0x0000FF00) >> 4) + | ((xl & (uint32_t)0x000F0000) >> 16); + + /* + * For each round, rotate the two 28-bit words kl and kr. + * The extraction of the 48-bit subkey (PC-2) is not done yet. + */ + for (i = 0; i < 16; i ++) { + if ((1 << i) & 0x8103) { + kl = (kl << 1) | (kl >> 27); + kr = (kr << 1) | (kr >> 27); + } else { + kl = (kl << 2) | (kl >> 26); + kr = (kr << 2) | (kr >> 26); + } + kl &= (uint32_t)0x0FFFFFFF; + kr &= (uint32_t)0x0FFFFFFF; + skey[(i << 1) + 0] = kl; + skey[(i << 1) + 1] = kr; + } +} + +/* see inner.h */ +void +br_des_rev_skey(uint32_t *skey) +{ + int i; + + for (i = 0; i < 16; i += 2) { + uint32_t t; + + t = skey[i + 0]; + skey[i + 0] = skey[30 - i]; + skey[30 - i] = t; + t = skey[i + 1]; + skey[i + 1] = skey[31 - i]; + skey[31 - i] = t; + } +} diff --git a/test/monniaux/BearSSL/src/symcipher/des_tab.c b/test/monniaux/BearSSL/src/symcipher/des_tab.c new file mode 100644 index 00000000..3f8e4f9f --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/des_tab.c @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * PC2left[x] tells where bit x goes when applying PC-2. 'x' is a bit + * position in the left rotated key word. Both position are in normal + * order (rightmost bit is 0). + */ +static const unsigned char PC2left[] = { + 16, 3, 7, 24, 20, 11, 24, + 13, 2, 10, 24, 22, 5, 15, + 23, 1, 9, 21, 12, 24, 6, + 4, 14, 18, 8, 17, 0, 19 +}; + +/* + * Similar to PC2left[x], for the right rotated key word. + */ +static const unsigned char PC2right[] = { + 8, 18, 24, 6, 22, 15, 3, + 10, 12, 19, 5, 14, 11, 24, + 4, 23, 16, 9, 24, 20, 2, + 24, 7, 13, 0, 21, 17, 1 +}; + +/* + * S-boxes and PC-1 merged. + */ +static const uint32_t S1[] = { + 0x00808200, 0x00000000, 0x00008000, 0x00808202, + 0x00808002, 0x00008202, 0x00000002, 0x00008000, + 0x00000200, 0x00808200, 0x00808202, 0x00000200, + 0x00800202, 0x00808002, 0x00800000, 0x00000002, + 0x00000202, 0x00800200, 0x00800200, 0x00008200, + 0x00008200, 0x00808000, 0x00808000, 0x00800202, + 0x00008002, 0x00800002, 0x00800002, 0x00008002, + 0x00000000, 0x00000202, 0x00008202, 0x00800000, + 0x00008000, 0x00808202, 0x00000002, 0x00808000, + 0x00808200, 0x00800000, 0x00800000, 0x00000200, + 0x00808002, 0x00008000, 0x00008200, 0x00800002, + 0x00000200, 0x00000002, 0x00800202, 0x00008202, + 0x00808202, 0x00008002, 0x00808000, 0x00800202, + 0x00800002, 0x00000202, 0x00008202, 0x00808200, + 0x00000202, 0x00800200, 0x00800200, 0x00000000, + 0x00008002, 0x00008200, 0x00000000, 0x00808002 +}; + +static const uint32_t S2[] = { + 0x40084010, 0x40004000, 0x00004000, 0x00084010, + 0x00080000, 0x00000010, 0x40080010, 0x40004010, + 0x40000010, 0x40084010, 0x40084000, 0x40000000, + 0x40004000, 0x00080000, 0x00000010, 0x40080010, + 0x00084000, 0x00080010, 0x40004010, 0x00000000, + 0x40000000, 0x00004000, 0x00084010, 0x40080000, + 0x00080010, 0x40000010, 0x00000000, 0x00084000, + 0x00004010, 0x40084000, 0x40080000, 0x00004010, + 0x00000000, 0x00084010, 0x40080010, 0x00080000, + 0x40004010, 0x40080000, 0x40084000, 0x00004000, + 0x40080000, 0x40004000, 0x00000010, 0x40084010, + 0x00084010, 0x00000010, 0x00004000, 0x40000000, + 0x00004010, 0x40084000, 0x00080000, 0x40000010, + 0x00080010, 0x40004010, 0x40000010, 0x00080010, + 0x00084000, 0x00000000, 0x40004000, 0x00004010, + 0x40000000, 0x40080010, 0x40084010, 0x00084000 +}; + +static const uint32_t S3[] = { + 0x00000104, 0x04010100, 0x00000000, 0x04010004, + 0x04000100, 0x00000000, 0x00010104, 0x04000100, + 0x00010004, 0x04000004, 0x04000004, 0x00010000, + 0x04010104, 0x00010004, 0x04010000, 0x00000104, + 0x04000000, 0x00000004, 0x04010100, 0x00000100, + 0x00010100, 0x04010000, 0x04010004, 0x00010104, + 0x04000104, 0x00010100, 0x00010000, 0x04000104, + 0x00000004, 0x04010104, 0x00000100, 0x04000000, + 0x04010100, 0x04000000, 0x00010004, 0x00000104, + 0x00010000, 0x04010100, 0x04000100, 0x00000000, + 0x00000100, 0x00010004, 0x04010104, 0x04000100, + 0x04000004, 0x00000100, 0x00000000, 0x04010004, + 0x04000104, 0x00010000, 0x04000000, 0x04010104, + 0x00000004, 0x00010104, 0x00010100, 0x04000004, + 0x04010000, 0x04000104, 0x00000104, 0x04010000, + 0x00010104, 0x00000004, 0x04010004, 0x00010100 +}; + +static const uint32_t S4[] = { + 0x80401000, 0x80001040, 0x80001040, 0x00000040, + 0x00401040, 0x80400040, 0x80400000, 0x80001000, + 0x00000000, 0x00401000, 0x00401000, 0x80401040, + 0x80000040, 0x00000000, 0x00400040, 0x80400000, + 0x80000000, 0x00001000, 0x00400000, 0x80401000, + 0x00000040, 0x00400000, 0x80001000, 0x00001040, + 0x80400040, 0x80000000, 0x00001040, 0x00400040, + 0x00001000, 0x00401040, 0x80401040, 0x80000040, + 0x00400040, 0x80400000, 0x00401000, 0x80401040, + 0x80000040, 0x00000000, 0x00000000, 0x00401000, + 0x00001040, 0x00400040, 0x80400040, 0x80000000, + 0x80401000, 0x80001040, 0x80001040, 0x00000040, + 0x80401040, 0x80000040, 0x80000000, 0x00001000, + 0x80400000, 0x80001000, 0x00401040, 0x80400040, + 0x80001000, 0x00001040, 0x00400000, 0x80401000, + 0x00000040, 0x00400000, 0x00001000, 0x00401040 +}; + +static const uint32_t S5[] = { + 0x00000080, 0x01040080, 0x01040000, 0x21000080, + 0x00040000, 0x00000080, 0x20000000, 0x01040000, + 0x20040080, 0x00040000, 0x01000080, 0x20040080, + 0x21000080, 0x21040000, 0x00040080, 0x20000000, + 0x01000000, 0x20040000, 0x20040000, 0x00000000, + 0x20000080, 0x21040080, 0x21040080, 0x01000080, + 0x21040000, 0x20000080, 0x00000000, 0x21000000, + 0x01040080, 0x01000000, 0x21000000, 0x00040080, + 0x00040000, 0x21000080, 0x00000080, 0x01000000, + 0x20000000, 0x01040000, 0x21000080, 0x20040080, + 0x01000080, 0x20000000, 0x21040000, 0x01040080, + 0x20040080, 0x00000080, 0x01000000, 0x21040000, + 0x21040080, 0x00040080, 0x21000000, 0x21040080, + 0x01040000, 0x00000000, 0x20040000, 0x21000000, + 0x00040080, 0x01000080, 0x20000080, 0x00040000, + 0x00000000, 0x20040000, 0x01040080, 0x20000080 +}; + +static const uint32_t S6[] = { + 0x10000008, 0x10200000, 0x00002000, 0x10202008, + 0x10200000, 0x00000008, 0x10202008, 0x00200000, + 0x10002000, 0x00202008, 0x00200000, 0x10000008, + 0x00200008, 0x10002000, 0x10000000, 0x00002008, + 0x00000000, 0x00200008, 0x10002008, 0x00002000, + 0x00202000, 0x10002008, 0x00000008, 0x10200008, + 0x10200008, 0x00000000, 0x00202008, 0x10202000, + 0x00002008, 0x00202000, 0x10202000, 0x10000000, + 0x10002000, 0x00000008, 0x10200008, 0x00202000, + 0x10202008, 0x00200000, 0x00002008, 0x10000008, + 0x00200000, 0x10002000, 0x10000000, 0x00002008, + 0x10000008, 0x10202008, 0x00202000, 0x10200000, + 0x00202008, 0x10202000, 0x00000000, 0x10200008, + 0x00000008, 0x00002000, 0x10200000, 0x00202008, + 0x00002000, 0x00200008, 0x10002008, 0x00000000, + 0x10202000, 0x10000000, 0x00200008, 0x10002008 +}; + +static const uint32_t S7[] = { + 0x00100000, 0x02100001, 0x02000401, 0x00000000, + 0x00000400, 0x02000401, 0x00100401, 0x02100400, + 0x02100401, 0x00100000, 0x00000000, 0x02000001, + 0x00000001, 0x02000000, 0x02100001, 0x00000401, + 0x02000400, 0x00100401, 0x00100001, 0x02000400, + 0x02000001, 0x02100000, 0x02100400, 0x00100001, + 0x02100000, 0x00000400, 0x00000401, 0x02100401, + 0x00100400, 0x00000001, 0x02000000, 0x00100400, + 0x02000000, 0x00100400, 0x00100000, 0x02000401, + 0x02000401, 0x02100001, 0x02100001, 0x00000001, + 0x00100001, 0x02000000, 0x02000400, 0x00100000, + 0x02100400, 0x00000401, 0x00100401, 0x02100400, + 0x00000401, 0x02000001, 0x02100401, 0x02100000, + 0x00100400, 0x00000000, 0x00000001, 0x02100401, + 0x00000000, 0x00100401, 0x02100000, 0x00000400, + 0x02000001, 0x02000400, 0x00000400, 0x00100001 +}; + +static const uint32_t S8[] = { + 0x08000820, 0x00000800, 0x00020000, 0x08020820, + 0x08000000, 0x08000820, 0x00000020, 0x08000000, + 0x00020020, 0x08020000, 0x08020820, 0x00020800, + 0x08020800, 0x00020820, 0x00000800, 0x00000020, + 0x08020000, 0x08000020, 0x08000800, 0x00000820, + 0x00020800, 0x00020020, 0x08020020, 0x08020800, + 0x00000820, 0x00000000, 0x00000000, 0x08020020, + 0x08000020, 0x08000800, 0x00020820, 0x00020000, + 0x00020820, 0x00020000, 0x08020800, 0x00000800, + 0x00000020, 0x08020020, 0x00000800, 0x00020820, + 0x08000800, 0x00000020, 0x08000020, 0x08020000, + 0x08020020, 0x08000000, 0x00020000, 0x08000820, + 0x00000000, 0x08020820, 0x00020020, 0x08000020, + 0x08020000, 0x08000800, 0x08000820, 0x00000000, + 0x08020820, 0x00020800, 0x00020800, 0x00000820, + 0x00000820, 0x00020020, 0x08000000, 0x08020800 +}; + +static inline uint32_t +Fconf(uint32_t r0, uint32_t skl, uint32_t skr) +{ + uint32_t r1; + + r1 = (r0 << 16) | (r0 >> 16); + return + S1[((r1 >> 11) ^ (skl >> 18)) & 0x3F] + | S2[((r0 >> 23) ^ (skl >> 12)) & 0x3F] + | S3[((r0 >> 19) ^ (skl >> 6)) & 0x3F] + | S4[((r0 >> 15) ^ (skl )) & 0x3F] + | S5[((r0 >> 11) ^ (skr >> 18)) & 0x3F] + | S6[((r0 >> 7) ^ (skr >> 12)) & 0x3F] + | S7[((r0 >> 3) ^ (skr >> 6)) & 0x3F] + | S8[((r1 >> 15) ^ (skr )) & 0x3F]; +} + +static void +process_block_unit(uint32_t *pl, uint32_t *pr, const uint32_t *skey) +{ + int i; + uint32_t l, r; + + l = *pl; + r = *pr; + for (i = 0; i < 16; i ++) { + uint32_t t; + + t = l ^ Fconf(r, skey[(i << 1) + 0], skey[(i << 1) + 1]); + l = r; + r = t; + } + *pl = r; + *pr = l; +} + +/* see inner.h */ +void +br_des_tab_process_block(unsigned num_rounds, const uint32_t *skey, void *block) +{ + unsigned char *buf; + uint32_t l, r; + + buf = block; + l = br_dec32be(buf); + r = br_dec32be(buf + 4); + br_des_do_IP(&l, &r); + while (num_rounds -- > 0) { + process_block_unit(&l, &r, skey); + skey += 32; + } + br_des_do_invIP(&l, &r); + br_enc32be(buf, l); + br_enc32be(buf + 4, r); +} + +static void +keysched_unit(uint32_t *skey, const void *key) +{ + int i; + + br_des_keysched_unit(skey, key); + + /* + * Apply PC-2 to get the 48-bit subkeys. + */ + for (i = 0; i < 16; i ++) { + uint32_t xl, xr, ul, ur; + int j; + + xl = skey[(i << 1) + 0]; + xr = skey[(i << 1) + 1]; + ul = 0; + ur = 0; + for (j = 0; j < 28; j ++) { + ul |= (xl & 1) << PC2left[j]; + ur |= (xr & 1) << PC2right[j]; + xl >>= 1; + xr >>= 1; + } + skey[(i << 1) + 0] = ul; + skey[(i << 1) + 1] = ur; + } +} + +/* see inner.h */ +unsigned +br_des_tab_keysched(uint32_t *skey, const void *key, size_t key_len) +{ + switch (key_len) { + case 8: + keysched_unit(skey, key); + return 1; + case 16: + keysched_unit(skey, key); + keysched_unit(skey + 32, (const unsigned char *)key + 8); + br_des_rev_skey(skey + 32); + memcpy(skey + 64, skey, 32 * sizeof *skey); + return 3; + default: + keysched_unit(skey, key); + keysched_unit(skey + 32, (const unsigned char *)key + 8); + br_des_rev_skey(skey + 32); + keysched_unit(skey + 64, (const unsigned char *)key + 16); + return 3; + } +} diff --git a/test/monniaux/BearSSL/src/symcipher/des_tab_cbcdec.c b/test/monniaux/BearSSL/src/symcipher/des_tab_cbcdec.c new file mode 100644 index 00000000..e7eabe9d --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/des_tab_cbcdec.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_des_tab_cbcdec_init(br_des_tab_cbcdec_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_des_tab_cbcdec_vtable; + ctx->num_rounds = br_des_tab_keysched(ctx->skey, key, len); + if (len == 8) { + br_des_rev_skey(ctx->skey); + } else { + int i; + + for (i = 0; i < 48; i += 2) { + uint32_t t; + + t = ctx->skey[i]; + ctx->skey[i] = ctx->skey[94 - i]; + ctx->skey[94 - i] = t; + t = ctx->skey[i + 1]; + ctx->skey[i + 1] = ctx->skey[95 - i]; + ctx->skey[95 - i] = t; + } + } +} + +/* see bearssl_block.h */ +void +br_des_tab_cbcdec_run(const br_des_tab_cbcdec_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf, *ivbuf; + + ivbuf = iv; + buf = data; + while (len > 0) { + unsigned char tmp[8]; + int i; + + memcpy(tmp, buf, 8); + br_des_tab_process_block(ctx->num_rounds, ctx->skey, buf); + for (i = 0; i < 8; i ++) { + buf[i] ^= ivbuf[i]; + } + memcpy(ivbuf, tmp, 8); + buf += 8; + len -= 8; + } +} + +/* see bearssl_block.h */ +const br_block_cbcdec_class br_des_tab_cbcdec_vtable = { + sizeof(br_des_tab_cbcdec_keys), + 8, + 3, + (void (*)(const br_block_cbcdec_class **, const void *, size_t)) + &br_des_tab_cbcdec_init, + (void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t)) + &br_des_tab_cbcdec_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/des_tab_cbcenc.c b/test/monniaux/BearSSL/src/symcipher/des_tab_cbcenc.c new file mode 100644 index 00000000..3a45ba3e --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/des_tab_cbcenc.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* see bearssl_block.h */ +void +br_des_tab_cbcenc_init(br_des_tab_cbcenc_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_des_tab_cbcenc_vtable; + ctx->num_rounds = br_des_tab_keysched(ctx->skey, key, len); +} + +/* see bearssl_block.h */ +void +br_des_tab_cbcenc_run(const br_des_tab_cbcenc_keys *ctx, + void *iv, void *data, size_t len) +{ + unsigned char *buf, *ivbuf; + + ivbuf = iv; + buf = data; + while (len > 0) { + int i; + + for (i = 0; i < 8; i ++) { + buf[i] ^= ivbuf[i]; + } + br_des_tab_process_block(ctx->num_rounds, ctx->skey, buf); + memcpy(ivbuf, buf, 8); + buf += 8; + len -= 8; + } +} + +/* see bearssl_block.h */ +const br_block_cbcenc_class br_des_tab_cbcenc_vtable = { + sizeof(br_des_tab_cbcenc_keys), + 8, + 3, + (void (*)(const br_block_cbcenc_class **, const void *, size_t)) + &br_des_tab_cbcenc_init, + (void (*)(const br_block_cbcenc_class *const *, void *, void *, size_t)) + &br_des_tab_cbcenc_run +}; diff --git a/test/monniaux/BearSSL/src/symcipher/poly1305_ctmul.c b/test/monniaux/BearSSL/src/symcipher/poly1305_ctmul.c new file mode 100644 index 00000000..150e610a --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/poly1305_ctmul.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * Perform the inner processing of blocks for Poly1305. The accumulator + * and the r key are provided as arrays of 26-bit words (these words + * are allowed to have an extra bit, i.e. use 27 bits). + * + * On output, all accumulator words fit on 26 bits, except acc[1], which + * may be slightly larger (but by a very small amount only). + */ +static void +poly1305_inner(uint32_t *acc, const uint32_t *r, const void *data, size_t len) +{ + /* + * Implementation notes: we split the 130-bit values into five + * 26-bit words. This gives us some space for carries. + * + * This code is inspired from the public-domain code available + * on: + * https://github.com/floodyberry/poly1305-donna + * + * Since we compute modulo 2^130-5, the "upper words" become + * low words with a factor of 5; that is, x*2^130 = x*5 mod p. + */ + const unsigned char *buf; + uint32_t a0, a1, a2, a3, a4; + uint32_t r0, r1, r2, r3, r4; + uint32_t u1, u2, u3, u4; + + r0 = r[0]; + r1 = r[1]; + r2 = r[2]; + r3 = r[3]; + r4 = r[4]; + + u1 = r1 * 5; + u2 = r2 * 5; + u3 = r3 * 5; + u4 = r4 * 5; + + a0 = acc[0]; + a1 = acc[1]; + a2 = acc[2]; + a3 = acc[3]; + a4 = acc[4]; + + buf = data; + while (len > 0) { + uint64_t w0, w1, w2, w3, w4; + uint64_t c; + unsigned char tmp[16]; + + /* + * If there is a partial block, right-pad it with zeros. + */ + if (len < 16) { + memset(tmp, 0, sizeof tmp); + memcpy(tmp, buf, len); + buf = tmp; + len = 16; + } + + /* + * Decode next block and apply the "high bit"; that value + * is added to the accumulator. + */ + a0 += br_dec32le(buf) & 0x03FFFFFF; + a1 += (br_dec32le(buf + 3) >> 2) & 0x03FFFFFF; + a2 += (br_dec32le(buf + 6) >> 4) & 0x03FFFFFF; + a3 += (br_dec32le(buf + 9) >> 6) & 0x03FFFFFF; + a4 += (br_dec32le(buf + 12) >> 8) | 0x01000000; + + /* + * Compute multiplication. + */ +#define M(x, y) ((uint64_t)(x) * (uint64_t)(y)) + + w0 = M(a0, r0) + M(a1, u4) + M(a2, u3) + M(a3, u2) + M(a4, u1); + w1 = M(a0, r1) + M(a1, r0) + M(a2, u4) + M(a3, u3) + M(a4, u2); + w2 = M(a0, r2) + M(a1, r1) + M(a2, r0) + M(a3, u4) + M(a4, u3); + w3 = M(a0, r3) + M(a1, r2) + M(a2, r1) + M(a3, r0) + M(a4, u4); + w4 = M(a0, r4) + M(a1, r3) + M(a2, r2) + M(a3, r1) + M(a4, r0); + +#undef M + /* + * Perform some (partial) modular reduction. This step is + * enough to keep values in ranges such that there won't + * be carry overflows. Most of the reduction was done in + * the multiplication step (by using the 'u*' values, and + * using the fact that 2^130 = -5 mod p); here we perform + * some carry propagation. + */ + c = w0 >> 26; + a0 = (uint32_t)w0 & 0x3FFFFFF; + w1 += c; + c = w1 >> 26; + a1 = (uint32_t)w1 & 0x3FFFFFF; + w2 += c; + c = w2 >> 26; + a2 = (uint32_t)w2 & 0x3FFFFFF; + w3 += c; + c = w3 >> 26; + a3 = (uint32_t)w3 & 0x3FFFFFF; + w4 += c; + c = w4 >> 26; + a4 = (uint32_t)w4 & 0x3FFFFFF; + a0 += (uint32_t)c * 5; + a1 += a0 >> 26; + a0 &= 0x3FFFFFF; + + buf += 16; + len -= 16; + } + + acc[0] = a0; + acc[1] = a1; + acc[2] = a2; + acc[3] = a3; + acc[4] = a4; +} + +/* see bearssl_block.h */ +void +br_poly1305_ctmul_run(const void *key, const void *iv, + void *data, size_t len, const void *aad, size_t aad_len, + void *tag, br_chacha20_run ichacha, int encrypt) +{ + unsigned char pkey[32], foot[16]; + uint32_t r[5], acc[5], cc, ctl, hi; + uint64_t w; + int i; + + /* + * Compute the MAC key. The 'r' value is the first 16 bytes of + * pkey[]. + */ + memset(pkey, 0, sizeof pkey); + ichacha(key, iv, 0, pkey, sizeof pkey); + + /* + * If encrypting, ChaCha20 must run first, followed by Poly1305. + * When decrypting, the operations are reversed. + */ + if (encrypt) { + ichacha(key, iv, 1, data, len); + } + + /* + * Run Poly1305. We must process the AAD, then ciphertext, then + * the footer (with the lengths). Note that the AAD and ciphertext + * are meant to be padded with zeros up to the next multiple of 16, + * and the length of the footer is 16 bytes as well. + */ + + /* + * Decode the 'r' value into 26-bit words, with the "clamping" + * operation applied. + */ + r[0] = br_dec32le(pkey) & 0x03FFFFFF; + r[1] = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03; + r[2] = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF; + r[3] = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF; + r[4] = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF; + + /* + * Accumulator is 0. + */ + memset(acc, 0, sizeof acc); + + /* + * Process the additional authenticated data, ciphertext, and + * footer in due order. + */ + br_enc64le(foot, (uint64_t)aad_len); + br_enc64le(foot + 8, (uint64_t)len); + poly1305_inner(acc, r, aad, aad_len); + poly1305_inner(acc, r, data, len); + poly1305_inner(acc, r, foot, sizeof foot); + + /* + * Finalise modular reduction. This is done with carry propagation + * and applying the '2^130 = -5 mod p' rule. Note that the output + * of poly1035_inner() is already mostly reduced, since only + * acc[1] may be (very slightly) above 2^26. A single loop back + * to acc[1] will be enough to make the value fit in 130 bits. + */ + cc = 0; + for (i = 1; i <= 6; i ++) { + int j; + + j = (i >= 5) ? i - 5 : i; + acc[j] += cc; + cc = acc[j] >> 26; + acc[j] &= 0x03FFFFFF; + } + + /* + * We may still have a value in the 2^130-5..2^130-1 range, in + * which case we must reduce it again. The code below selects, + * in constant-time, between 'acc' and 'acc-p', + */ + ctl = GT(acc[0], 0x03FFFFFA); + for (i = 1; i < 5; i ++) { + ctl &= EQ(acc[i], 0x03FFFFFF); + } + cc = 5; + for (i = 0; i < 5; i ++) { + uint32_t t; + + t = (acc[i] + cc); + cc = t >> 26; + t &= 0x03FFFFFF; + acc[i] = MUX(ctl, t, acc[i]); + } + + /* + * Convert back the accumulator to 32-bit words, and add the + * 's' value (second half of pkey[]). That addition is done + * modulo 2^128. + */ + w = (uint64_t)acc[0] + ((uint64_t)acc[1] << 26) + br_dec32le(pkey + 16); + br_enc32le((unsigned char *)tag, (uint32_t)w); + w = (w >> 32) + ((uint64_t)acc[2] << 20) + br_dec32le(pkey + 20); + br_enc32le((unsigned char *)tag + 4, (uint32_t)w); + w = (w >> 32) + ((uint64_t)acc[3] << 14) + br_dec32le(pkey + 24); + br_enc32le((unsigned char *)tag + 8, (uint32_t)w); + hi = (uint32_t)(w >> 32) + (acc[4] << 8) + br_dec32le(pkey + 28); + br_enc32le((unsigned char *)tag + 12, hi); + + /* + * If decrypting, then ChaCha20 runs _after_ Poly1305. + */ + if (!encrypt) { + ichacha(key, iv, 1, data, len); + } +} diff --git a/test/monniaux/BearSSL/src/symcipher/poly1305_ctmul32.c b/test/monniaux/BearSSL/src/symcipher/poly1305_ctmul32.c new file mode 100644 index 00000000..15d9635d --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/poly1305_ctmul32.c @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * Perform the inner processing of blocks for Poly1305. + */ +static void +poly1305_inner(uint32_t *a, const uint32_t *r, const void *data, size_t len) +{ + /* + * Implementation notes: we split the 130-bit values into ten + * 13-bit words. This gives us some space for carries and allows + * using only 32x32->32 multiplications, which are way faster than + * 32x32->64 multiplications on the ARM Cortex-M0/M0+, and also + * help in making constant-time code on the Cortex-M3. + * + * Since we compute modulo 2^130-5, the "upper words" become + * low words with a factor of 5; that is, x*2^130 = x*5 mod p. + * This has already been integrated in the r[] array, which + * is extended to the 0..18 range. + * + * In each loop iteration, a[] and r[] words are 13-bit each, + * except a[1] which may use 14 bits. + */ + const unsigned char *buf; + + buf = data; + while (len > 0) { + unsigned char tmp[16]; + uint32_t b[10]; + unsigned u, v; + uint32_t z, cc1, cc2; + + /* + * If there is a partial block, right-pad it with zeros. + */ + if (len < 16) { + memset(tmp, 0, sizeof tmp); + memcpy(tmp, buf, len); + buf = tmp; + len = 16; + } + + /* + * Decode next block and apply the "high bit"; that value + * is added to the accumulator. + */ + v = br_dec16le(buf); + a[0] += v & 0x01FFF; + v >>= 13; + v |= buf[2] << 3; + v |= buf[3] << 11; + a[1] += v & 0x01FFF; + v >>= 13; + v |= buf[4] << 6; + a[2] += v & 0x01FFF; + v >>= 13; + v |= buf[5] << 1; + v |= buf[6] << 9; + a[3] += v & 0x01FFF; + v >>= 13; + v |= buf[7] << 4; + v |= buf[8] << 12; + a[4] += v & 0x01FFF; + v >>= 13; + v |= buf[9] << 7; + a[5] += v & 0x01FFF; + v >>= 13; + v |= buf[10] << 2; + v |= buf[11] << 10; + a[6] += v & 0x01FFF; + v >>= 13; + v |= buf[12] << 5; + a[7] += v & 0x01FFF; + v = br_dec16le(buf + 13); + a[8] += v & 0x01FFF; + v >>= 13; + v |= buf[15] << 3; + a[9] += v | 0x00800; + + /* + * At that point, all a[] values fit on 14 bits, while + * all r[] values fit on 13 bits. Thus products fit on + * 27 bits, and we can accumulate up to 31 of them in + * a 32-bit word and still have some room for carries. + */ + + /* + * Now a[] contains words with values up to 14 bits each. + * We perform the multiplication with r[]. + * + * The extended words of r[] may be larger than 13 bits + * (they are 5 times a 13-bit word) so the full summation + * may yield values up to 46 times a 27-bit word, which + * does not fit on a 32-bit word. To avoid that issue, we + * must split the loop below in two, with a carry + * propagation operation in the middle. + */ + cc1 = 0; + for (u = 0; u < 10; u ++) { + uint32_t s; + + s = cc1 + + MUL15(a[0], r[u + 9 - 0]) + + MUL15(a[1], r[u + 9 - 1]) + + MUL15(a[2], r[u + 9 - 2]) + + MUL15(a[3], r[u + 9 - 3]) + + MUL15(a[4], r[u + 9 - 4]); + b[u] = s & 0x1FFF; + cc1 = s >> 13; + } + cc2 = 0; + for (u = 0; u < 10; u ++) { + uint32_t s; + + s = b[u] + cc2 + + MUL15(a[5], r[u + 9 - 5]) + + MUL15(a[6], r[u + 9 - 6]) + + MUL15(a[7], r[u + 9 - 7]) + + MUL15(a[8], r[u + 9 - 8]) + + MUL15(a[9], r[u + 9 - 9]); + b[u] = s & 0x1FFF; + cc2 = s >> 13; + } + memcpy(a, b, sizeof b); + + /* + * The two carries "loop back" with a factor of 5. We + * propagate them into a[0] and a[1]. + */ + z = cc1 + cc2; + z += (z << 2) + a[0]; + a[0] = z & 0x1FFF; + a[1] += z >> 13; + + buf += 16; + len -= 16; + } +} + +/* see bearssl_block.h */ +void +br_poly1305_ctmul32_run(const void *key, const void *iv, + void *data, size_t len, const void *aad, size_t aad_len, + void *tag, br_chacha20_run ichacha, int encrypt) +{ + unsigned char pkey[32], foot[16]; + uint32_t z, r[19], acc[10], cc, ctl; + int i; + + /* + * Compute the MAC key. The 'r' value is the first 16 bytes of + * pkey[]. + */ + memset(pkey, 0, sizeof pkey); + ichacha(key, iv, 0, pkey, sizeof pkey); + + /* + * If encrypting, ChaCha20 must run first, followed by Poly1305. + * When decrypting, the operations are reversed. + */ + if (encrypt) { + ichacha(key, iv, 1, data, len); + } + + /* + * Run Poly1305. We must process the AAD, then ciphertext, then + * the footer (with the lengths). Note that the AAD and ciphertext + * are meant to be padded with zeros up to the next multiple of 16, + * and the length of the footer is 16 bytes as well. + */ + + /* + * Decode the 'r' value into 13-bit words, with the "clamping" + * operation applied. + */ + z = br_dec32le(pkey) & 0x03FFFFFF; + r[9] = z & 0x1FFF; + r[10] = z >> 13; + z = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03; + r[11] = z & 0x1FFF; + r[12] = z >> 13; + z = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF; + r[13] = z & 0x1FFF; + r[14] = z >> 13; + z = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF; + r[15] = z & 0x1FFF; + r[16] = z >> 13; + z = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF; + r[17] = z & 0x1FFF; + r[18] = z >> 13; + + /* + * Extend r[] with the 5x factor pre-applied. + */ + for (i = 0; i < 9; i ++) { + r[i] = MUL15(5, r[i + 10]); + } + + /* + * Accumulator is 0. + */ + memset(acc, 0, sizeof acc); + + /* + * Process the additional authenticated data, ciphertext, and + * footer in due order. + */ + br_enc64le(foot, (uint64_t)aad_len); + br_enc64le(foot + 8, (uint64_t)len); + poly1305_inner(acc, r, aad, aad_len); + poly1305_inner(acc, r, data, len); + poly1305_inner(acc, r, foot, sizeof foot); + + /* + * Finalise modular reduction. This is done with carry propagation + * and applying the '2^130 = -5 mod p' rule. Note that the output + * of poly1035_inner() is already mostly reduced, since only + * acc[1] may be (very slightly) above 2^13. A single loop back + * to acc[1] will be enough to make the value fit in 130 bits. + */ + cc = 0; + for (i = 1; i < 10; i ++) { + z = acc[i] + cc; + acc[i] = z & 0x1FFF; + cc = z >> 13; + } + z = acc[0] + cc + (cc << 2); + acc[0] = z & 0x1FFF; + acc[1] += z >> 13; + + /* + * We may still have a value in the 2^130-5..2^130-1 range, in + * which case we must reduce it again. The code below selects, + * in constant-time, between 'acc' and 'acc-p', + */ + ctl = GT(acc[0], 0x1FFA); + for (i = 1; i < 10; i ++) { + ctl &= EQ(acc[i], 0x1FFF); + } + acc[0] = MUX(ctl, acc[0] - 0x1FFB, acc[0]); + for (i = 1; i < 10; i ++) { + acc[i] &= ~(-ctl); + } + + /* + * Convert back the accumulator to 32-bit words, and add the + * 's' value (second half of pkey[]). That addition is done + * modulo 2^128. + */ + z = acc[0] + (acc[1] << 13) + br_dec16le(pkey + 16); + br_enc16le((unsigned char *)tag, z & 0xFFFF); + z = (z >> 16) + (acc[2] << 10) + br_dec16le(pkey + 18); + br_enc16le((unsigned char *)tag + 2, z & 0xFFFF); + z = (z >> 16) + (acc[3] << 7) + br_dec16le(pkey + 20); + br_enc16le((unsigned char *)tag + 4, z & 0xFFFF); + z = (z >> 16) + (acc[4] << 4) + br_dec16le(pkey + 22); + br_enc16le((unsigned char *)tag + 6, z & 0xFFFF); + z = (z >> 16) + (acc[5] << 1) + (acc[6] << 14) + br_dec16le(pkey + 24); + br_enc16le((unsigned char *)tag + 8, z & 0xFFFF); + z = (z >> 16) + (acc[7] << 11) + br_dec16le(pkey + 26); + br_enc16le((unsigned char *)tag + 10, z & 0xFFFF); + z = (z >> 16) + (acc[8] << 8) + br_dec16le(pkey + 28); + br_enc16le((unsigned char *)tag + 12, z & 0xFFFF); + z = (z >> 16) + (acc[9] << 5) + br_dec16le(pkey + 30); + br_enc16le((unsigned char *)tag + 14, z & 0xFFFF); + + /* + * If decrypting, then ChaCha20 runs _after_ Poly1305. + */ + if (!encrypt) { + ichacha(key, iv, 1, data, len); + } +} diff --git a/test/monniaux/BearSSL/src/symcipher/poly1305_ctmulq.c b/test/monniaux/BearSSL/src/symcipher/poly1305_ctmulq.c new file mode 100644 index 00000000..b00683a6 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/poly1305_ctmulq.c @@ -0,0 +1,475 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +#if BR_INT128 || BR_UMUL128 + +#if BR_INT128 + +#define MUL128(hi, lo, x, y) do { \ + unsigned __int128 mul128tmp; \ + mul128tmp = (unsigned __int128)(x) * (unsigned __int128)(y); \ + (hi) = (uint64_t)(mul128tmp >> 64); \ + (lo) = (uint64_t)mul128tmp; \ + } while (0) + +#elif BR_UMUL128 + +#include + +#define MUL128(hi, lo, x, y) do { \ + (lo) = _umul128((x), (y), &(hi)); \ + } while (0) + +#endif + +#define MASK42 ((uint64_t)0x000003FFFFFFFFFF) +#define MASK44 ((uint64_t)0x00000FFFFFFFFFFF) + +/* + * The "accumulator" word is nominally a 130-bit value. We split it into + * words of 44 bits, each held in a 64-bit variable. + * + * If the current accumulator is a = a0 + a1*W + a2*W^2 (where W = 2^44) + * and r = r0 + r1*W + r2*W^2, then: + * + * a*r = (a0*r0) + * + (a0*r1 + a1*r0) * W + * + (a0*r2 + a1*r1 + a2*r0) * W^2 + * + (a1*r2 + a2*r1) * W^3 + * + (a2*r2) * W^4 + * + * We want to reduce that value modulo p = 2^130-5, so W^3 = 20 mod p, + * and W^4 = 20*W mod p. Thus, if we define u1 = 20*r1 and u2 = 20*r2, + * then the equations above become: + * + * b0 = a0*r0 + a1*u2 + a2*u1 + * b1 = a0*r1 + a1*r0 + a2*u2 + * b2 = a0*r2 + a1*r1 + a2*r0 + * + * In order to make u1 fit in 44 bits, we can change these equations + * into: + * + * b0 = a0*r0 + a1*u2 + a2*t1 + * b1 = a0*r1 + a1*r0 + a2*t2 + * b2 = a0*r2 + a1*r1 + a2*r0 + * + * Where t1 is u1 truncated to 44 bits, and t2 is u2 added to the extra + * bits of u1. Note that since r is clamped down to a 124-bit value, the + * values u2 and t2 fit on 44 bits too. + * + * The bx values are larger than 44 bits, so we may split them into a + * lower half (cx, 44 bits) and an upper half (dx). The new values for + * the accumulator are then: + * + * e0 = c0 + 20*d2 + * e1 = c1 + d0 + * e2 = c2 + d1 + * + * The equations allow for some room, i.e. the ax values may be larger + * than 44 bits. Similarly, the ex values will usually be larger than + * the ax. Thus, some sort of carry propagation must be done regularly, + * though not necessarily at each iteration. In particular, we do not + * need to compute the additions (for the bx values) over 128-bit + * quantities; we can stick to 64-bit computations. + * + * + * Since the 128-bit result of a 64x64 multiplication is actually + * represented over two 64-bit registers, it is cheaper to arrange for + * any split that happens between the "high" and "low" halves to be on + * that 64-bit boundary. This is done by left shifting the rx, ux and tx + * by 20 bits (since they all fit on 44 bits each, this shift is + * always possible). + */ + +static void +poly1305_inner_big(uint64_t *acc, uint64_t *r, const void *data, size_t len) +{ + +#define MX(hi, lo, m0, m1, m2) do { \ + uint64_t mxhi, mxlo; \ + MUL128(mxhi, mxlo, a0, m0); \ + (hi) = mxhi; \ + (lo) = mxlo >> 20; \ + MUL128(mxhi, mxlo, a1, m1); \ + (hi) += mxhi; \ + (lo) += mxlo >> 20; \ + MUL128(mxhi, mxlo, a2, m2); \ + (hi) += mxhi; \ + (lo) += mxlo >> 20; \ + } while (0) + + const unsigned char *buf; + uint64_t a0, a1, a2; + uint64_t r0, r1, r2, t1, t2, u2; + + r0 = r[0]; + r1 = r[1]; + r2 = r[2]; + t1 = r[3]; + t2 = r[4]; + u2 = r[5]; + a0 = acc[0]; + a1 = acc[1]; + a2 = acc[2]; + buf = data; + + while (len > 0) { + uint64_t v0, v1, v2; + uint64_t c0, c1, c2, d0, d1, d2; + + v0 = br_dec64le(buf + 0); + v1 = br_dec64le(buf + 8); + v2 = v1 >> 24; + v1 = ((v0 >> 44) | (v1 << 20)) & MASK44; + v0 &= MASK44; + a0 += v0; + a1 += v1; + a2 += v2 + ((uint64_t)1 << 40); + MX(d0, c0, r0, u2, t1); + MX(d1, c1, r1, r0, t2); + MX(d2, c2, r2, r1, r0); + a0 = c0 + 20 * d2; + a1 = c1 + d0; + a2 = c2 + d1; + + v0 = br_dec64le(buf + 16); + v1 = br_dec64le(buf + 24); + v2 = v1 >> 24; + v1 = ((v0 >> 44) | (v1 << 20)) & MASK44; + v0 &= MASK44; + a0 += v0; + a1 += v1; + a2 += v2 + ((uint64_t)1 << 40); + MX(d0, c0, r0, u2, t1); + MX(d1, c1, r1, r0, t2); + MX(d2, c2, r2, r1, r0); + a0 = c0 + 20 * d2; + a1 = c1 + d0; + a2 = c2 + d1; + + v0 = br_dec64le(buf + 32); + v1 = br_dec64le(buf + 40); + v2 = v1 >> 24; + v1 = ((v0 >> 44) | (v1 << 20)) & MASK44; + v0 &= MASK44; + a0 += v0; + a1 += v1; + a2 += v2 + ((uint64_t)1 << 40); + MX(d0, c0, r0, u2, t1); + MX(d1, c1, r1, r0, t2); + MX(d2, c2, r2, r1, r0); + a0 = c0 + 20 * d2; + a1 = c1 + d0; + a2 = c2 + d1; + + v0 = br_dec64le(buf + 48); + v1 = br_dec64le(buf + 56); + v2 = v1 >> 24; + v1 = ((v0 >> 44) | (v1 << 20)) & MASK44; + v0 &= MASK44; + a0 += v0; + a1 += v1; + a2 += v2 + ((uint64_t)1 << 40); + MX(d0, c0, r0, u2, t1); + MX(d1, c1, r1, r0, t2); + MX(d2, c2, r2, r1, r0); + a0 = c0 + 20 * d2; + a1 = c1 + d0; + a2 = c2 + d1; + + a1 += a0 >> 44; + a0 &= MASK44; + a2 += a1 >> 44; + a1 &= MASK44; + a0 += 20 * (a2 >> 44); + a2 &= MASK44; + + buf += 64; + len -= 64; + } + acc[0] = a0; + acc[1] = a1; + acc[2] = a2; + +#undef MX +} + +static void +poly1305_inner_small(uint64_t *acc, uint64_t *r, const void *data, size_t len) +{ + const unsigned char *buf; + uint64_t a0, a1, a2; + uint64_t r0, r1, r2, t1, t2, u2; + + r0 = r[0]; + r1 = r[1]; + r2 = r[2]; + t1 = r[3]; + t2 = r[4]; + u2 = r[5]; + a0 = acc[0]; + a1 = acc[1]; + a2 = acc[2]; + buf = data; + + while (len > 0) { + uint64_t v0, v1, v2; + uint64_t c0, c1, c2, d0, d1, d2; + unsigned char tmp[16]; + + if (len < 16) { + memcpy(tmp, buf, len); + memset(tmp + len, 0, (sizeof tmp) - len); + buf = tmp; + len = 16; + } + v0 = br_dec64le(buf + 0); + v1 = br_dec64le(buf + 8); + + v2 = v1 >> 24; + v1 = ((v0 >> 44) | (v1 << 20)) & MASK44; + v0 &= MASK44; + + a0 += v0; + a1 += v1; + a2 += v2 + ((uint64_t)1 << 40); + +#define MX(hi, lo, m0, m1, m2) do { \ + uint64_t mxhi, mxlo; \ + MUL128(mxhi, mxlo, a0, m0); \ + (hi) = mxhi; \ + (lo) = mxlo >> 20; \ + MUL128(mxhi, mxlo, a1, m1); \ + (hi) += mxhi; \ + (lo) += mxlo >> 20; \ + MUL128(mxhi, mxlo, a2, m2); \ + (hi) += mxhi; \ + (lo) += mxlo >> 20; \ + } while (0) + + MX(d0, c0, r0, u2, t1); + MX(d1, c1, r1, r0, t2); + MX(d2, c2, r2, r1, r0); + +#undef MX + + a0 = c0 + 20 * d2; + a1 = c1 + d0; + a2 = c2 + d1; + + a1 += a0 >> 44; + a0 &= MASK44; + a2 += a1 >> 44; + a1 &= MASK44; + a0 += 20 * (a2 >> 44); + a2 &= MASK44; + + buf += 16; + len -= 16; + } + acc[0] = a0; + acc[1] = a1; + acc[2] = a2; +} + +static inline void +poly1305_inner(uint64_t *acc, uint64_t *r, const void *data, size_t len) +{ + if (len >= 64) { + size_t len2; + + len2 = len & ~(size_t)63; + poly1305_inner_big(acc, r, data, len2); + data = (const unsigned char *)data + len2; + len -= len2; + } + if (len > 0) { + poly1305_inner_small(acc, r, data, len); + } +} + +/* see bearssl_block.h */ +void +br_poly1305_ctmulq_run(const void *key, const void *iv, + void *data, size_t len, const void *aad, size_t aad_len, + void *tag, br_chacha20_run ichacha, int encrypt) +{ + unsigned char pkey[32], foot[16]; + uint64_t r[6], acc[3], r0, r1; + uint32_t v0, v1, v2, v3, v4; + uint64_t w0, w1, w2, w3; + uint32_t ctl; + + /* + * Compute the MAC key. The 'r' value is the first 16 bytes of + * pkey[]. + */ + memset(pkey, 0, sizeof pkey); + ichacha(key, iv, 0, pkey, sizeof pkey); + + /* + * If encrypting, ChaCha20 must run first, followed by Poly1305. + * When decrypting, the operations are reversed. + */ + if (encrypt) { + ichacha(key, iv, 1, data, len); + } + + /* + * Run Poly1305. We must process the AAD, then ciphertext, then + * the footer (with the lengths). Note that the AAD and ciphertext + * are meant to be padded with zeros up to the next multiple of 16, + * and the length of the footer is 16 bytes as well. + */ + + /* + * Apply the "clamping" on r. + */ + pkey[ 3] &= 0x0F; + pkey[ 4] &= 0xFC; + pkey[ 7] &= 0x0F; + pkey[ 8] &= 0xFC; + pkey[11] &= 0x0F; + pkey[12] &= 0xFC; + pkey[15] &= 0x0F; + + /* + * Decode the 'r' value into 44-bit words, left-shifted by 20 bits. + * Also compute the u1 and u2 values. + */ + r0 = br_dec64le(pkey + 0); + r1 = br_dec64le(pkey + 8); + r[0] = r0 << 20; + r[1] = ((r0 >> 24) | (r1 << 40)) & ~(uint64_t)0xFFFFF; + r[2] = (r1 >> 4) & ~(uint64_t)0xFFFFF; + r1 = 20 * (r[1] >> 20); + r[3] = r1 << 20; + r[5] = 20 * r[2]; + r[4] = (r[5] + (r1 >> 24)) & ~(uint64_t)0xFFFFF; + + /* + * Accumulator is 0. + */ + acc[0] = 0; + acc[1] = 0; + acc[2] = 0; + + /* + * Process the additional authenticated data, ciphertext, and + * footer in due order. + */ + br_enc64le(foot, (uint64_t)aad_len); + br_enc64le(foot + 8, (uint64_t)len); + poly1305_inner(acc, r, aad, aad_len); + poly1305_inner(acc, r, data, len); + poly1305_inner_small(acc, r, foot, sizeof foot); + + /* + * Finalise modular reduction. At that point, the value consists + * in three 44-bit values (the lowest one might be slightly above + * 2^44). Two loops shall be sufficient. + */ + acc[1] += (acc[0] >> 44); + acc[0] &= MASK44; + acc[2] += (acc[1] >> 44); + acc[1] &= MASK44; + acc[0] += 5 * (acc[2] >> 42); + acc[2] &= MASK42; + acc[1] += (acc[0] >> 44); + acc[0] &= MASK44; + acc[2] += (acc[1] >> 44); + acc[1] &= MASK44; + acc[0] += 5 * (acc[2] >> 42); + acc[2] &= MASK42; + + /* + * The value may still fall in the 2^130-5..2^130-1 range, in + * which case we must reduce it again. The code below selects, + * in constant-time, between 'acc' and 'acc-p'. We encode the + * value over four 32-bit integers to finish the operation. + */ + v0 = (uint32_t)acc[0]; + v1 = (uint32_t)(acc[0] >> 32) | ((uint32_t)acc[1] << 12); + v2 = (uint32_t)(acc[1] >> 20) | ((uint32_t)acc[2] << 24); + v3 = (uint32_t)(acc[2] >> 8); + v4 = (uint32_t)(acc[2] >> 40); + + ctl = GT(v0, 0xFFFFFFFA); + ctl &= EQ(v1, 0xFFFFFFFF); + ctl &= EQ(v2, 0xFFFFFFFF); + ctl &= EQ(v3, 0xFFFFFFFF); + ctl &= EQ(v4, 0x00000003); + v0 = MUX(ctl, v0 + 5, v0); + v1 = MUX(ctl, 0, v1); + v2 = MUX(ctl, 0, v2); + v3 = MUX(ctl, 0, v3); + + /* + * Add the "s" value. This is done modulo 2^128. Don't forget + * carry propagation... + */ + w0 = (uint64_t)v0 + (uint64_t)br_dec32le(pkey + 16); + w1 = (uint64_t)v1 + (uint64_t)br_dec32le(pkey + 20) + (w0 >> 32); + w2 = (uint64_t)v2 + (uint64_t)br_dec32le(pkey + 24) + (w1 >> 32); + w3 = (uint64_t)v3 + (uint64_t)br_dec32le(pkey + 28) + (w2 >> 32); + v0 = (uint32_t)w0; + v1 = (uint32_t)w1; + v2 = (uint32_t)w2; + v3 = (uint32_t)w3; + + /* + * Encode the tag. + */ + br_enc32le((unsigned char *)tag + 0, v0); + br_enc32le((unsigned char *)tag + 4, v1); + br_enc32le((unsigned char *)tag + 8, v2); + br_enc32le((unsigned char *)tag + 12, v3); + + /* + * If decrypting, then ChaCha20 runs _after_ Poly1305. + */ + if (!encrypt) { + ichacha(key, iv, 1, data, len); + } +} + +/* see bearssl_block.h */ +br_poly1305_run +br_poly1305_ctmulq_get(void) +{ + return &br_poly1305_ctmulq_run; +} + +#else + +/* see bearssl_block.h */ +br_poly1305_run +br_poly1305_ctmulq_get(void) +{ + return 0; +} + +#endif diff --git a/test/monniaux/BearSSL/src/symcipher/poly1305_i15.c b/test/monniaux/BearSSL/src/symcipher/poly1305_i15.c new file mode 100644 index 00000000..6f892121 --- /dev/null +++ b/test/monniaux/BearSSL/src/symcipher/poly1305_i15.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "inner.h" + +/* + * This is a "reference" implementation of Poly1305 that uses the + * generic "i15" code for big integers. It is slow, but it handles all + * big-integer operations with generic code, thereby avoiding most + * tricky situations with carry propagation and modular reduction. + */ + +/* + * Modulus: 2^130-5. + */ +static const uint16_t P1305[] = { + 0x008A, + 0x7FFB, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x03FF +}; + +/* + * -p mod 2^15. + */ +#define P0I 0x4CCD + +/* + * R^2 mod p, for conversion to Montgomery representation (R = 2^135, + * since we use 9 words of 15 bits each, and 15*9 = 135). + */ +static const uint16_t R2[] = { + 0x008A, + 0x6400, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 +}; + +/* + * Perform the inner processing of blocks for Poly1305. The "r" array + * is in Montgomery representation, while the "a" array is not. + */ +static void +poly1305_inner(uint16_t *a, const uint16_t *r, const void *data, size_t len) +{ + const unsigned char *buf; + + buf = data; + while (len > 0) { + unsigned char tmp[16], rev[16]; + uint16_t b[10]; + uint32_t ctl; + int i; + + /* + * If there is a partial block, right-pad it with zeros. + */ + if (len < 16) { + memset(tmp, 0, sizeof tmp); + memcpy(tmp, buf, len); + buf = tmp; + len = 16; + } + + /* + * Decode next block and apply the "high bit". Since + * decoding is little-endian, we must byte-swap the buffer. + */ + for (i = 0; i < 16; i ++) { + rev[i] = buf[15 - i]; + } + br_i15_decode_mod(b, rev, sizeof rev, P1305); + b[9] |= 0x0100; + + /* + * Add the accumulator to the decoded block (modular + * addition). + */ + ctl = br_i15_add(b, a, 1); + ctl |= NOT(br_i15_sub(b, P1305, 0)); + br_i15_sub(b, P1305, ctl); + + /* + * Multiply by r, result is the new accumulator value. + */ + br_i15_montymul(a, b, r, P1305, P0I); + + buf += 16; + len -= 16; + } +} + +/* + * Byteswap a 16-byte value. + */ +static void +byteswap16(unsigned char *buf) +{ + int i; + + for (i = 0; i < 8; i ++) { + unsigned x; + + x = buf[i]; + buf[i] = buf[15 - i]; + buf[15 - i] = x; + } +} + +/* see bearssl_block.h */ +void +br_poly1305_i15_run(const void *key, const void *iv, + void *data, size_t len, const void *aad, size_t aad_len, + void *tag, br_chacha20_run ichacha, int encrypt) +{ + unsigned char pkey[32], foot[16]; + uint16_t t[10], r[10], acc[10]; + + /* + * Compute the MAC key. The 'r' value is the first 16 bytes of + * pkey[]. + */ + memset(pkey, 0, sizeof pkey); + ichacha(key, iv, 0, pkey, sizeof pkey); + + /* + * If encrypting, ChaCha20 must run first, followed by Poly1305. + * When decrypting, the operations are reversed. + */ + if (encrypt) { + ichacha(key, iv, 1, data, len); + } + + /* + * Run Poly1305. We must process the AAD, then ciphertext, then + * the footer (with the lengths). Note that the AAD and ciphertext + * are meant to be padded with zeros up to the next multiple of 16, + * and the length of the footer is 16 bytes as well. + */ + + /* + * Apply the "clamping" operation on the encoded 'r' value. + */ + pkey[ 3] &= 0x0F; + pkey[ 7] &= 0x0F; + pkey[11] &= 0x0F; + pkey[15] &= 0x0F; + pkey[ 4] &= 0xFC; + pkey[ 8] &= 0xFC; + pkey[12] &= 0xFC; + + /* + * Decode the clamped 'r' value. Decoding should use little-endian + * so we must byteswap the value first. + */ + byteswap16(pkey); + br_i15_decode_mod(t, pkey, 16, P1305); + + /* + * Convert 'r' to Montgomery representation. + */ + br_i15_montymul(r, t, R2, P1305, P0I); + + /* + * Accumulator is 0. + */ + br_i15_zero(acc, 0x8A); + + /* + * Process the additional authenticated data, ciphertext, and + * footer in due order. + */ + br_enc64le(foot, (uint64_t)aad_len); + br_enc64le(foot + 8, (uint64_t)len); + poly1305_inner(acc, r, aad, aad_len); + poly1305_inner(acc, r, data, len); + poly1305_inner(acc, r, foot, sizeof foot); + + /* + * Decode the value 's'. Again, a byteswap is needed. + */ + byteswap16(pkey + 16); + br_i15_decode_mod(t, pkey + 16, 16, P1305); + + /* + * Add the value 's' to the accumulator. That addition is done + * modulo 2^128, so we just ignore the carry. + */ + br_i15_add(acc, t, 1); + + /* + * Encode the result (128 low bits) to the tag. Encoding should + * be little-endian. + */ + br_i15_encode(tag, 16, acc); + byteswap16(tag); + + /* + * If decrypting, then ChaCha20 runs _after_ Poly1305. + */ + if (!encrypt) { + ichacha(key, iv, 1, data, len); + } +} -- cgit