diff options
Diffstat (limited to 'test')
21 files changed, 1916 insertions, 1082 deletions
diff --git a/test/monniaux/bitsliced-aes/bs.c b/test/monniaux/bitsliced-aes/bs.c index df5c1f6b..4a9df4aa 100644 --- a/test/monniaux/bitsliced-aes/bs.c +++ b/test/monniaux/bitsliced-aes/bs.c @@ -14,6 +14,11 @@ #error "endianness not supported" #endif +#if 1 +#define TERNARY_XY0(t, x) ((-((t) != 0)) & (x)) +#else +#define TERNARY_XY0(t, x) (((t) != 0) ? (x) : (0)) +#endif void bs_addroundkey(word_t * B, word_t * rk) { @@ -388,11 +393,14 @@ void bs_transpose_dst(word_t * transpose, word_t * blocks) int offset = i << MUL_SHIFT; #ifndef UNROLL_TRANSPOSE + /* DM experiments */ + /* The normal ternary operator costs us a lot! + from 10145951 to 7995063 */ int j; for(j=0; j < WORD_SIZE; j++) { // TODO make const time - transpose[offset + j] |= (w & (ONE << j)) ? bitpos : 0; + transpose[offset + j] |= TERNARY_XY0(w & (ONE << j), bitpos); } #else @@ -488,7 +496,7 @@ void bs_transpose_rev(word_t * blocks) int j; for(j=0; j < WORD_SIZE; j++) { - word_t bit = (w & (ONE << j)) ? (ONE << (k % WORD_SIZE)) : 0; + word_t bit = TERNARY_XY0((w & (ONE << j)), (ONE << (k % WORD_SIZE))); transpose[j * WORDS_PER_BLOCK + (offset)] |= bit; } #else diff --git a/test/monniaux/bitsliced-aes/one_file/bitsliced-aes.c b/test/monniaux/bitsliced-aes/one_file/bitsliced-aes.c new file mode 100644 index 00000000..bfa9dba8 --- /dev/null +++ b/test/monniaux/bitsliced-aes/one_file/bitsliced-aes.c @@ -0,0 +1,1542 @@ +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include "/home/monniaux/work/Kalray/CompCert/test/monniaux/clock.h" + +#define EXIT1 + +void aes_ecb_encrypt(uint8_t * outputb, uint8_t * inputb, size_t size, uint8_t * key); +void aes_ecb_decrypt(uint8_t * outputb, uint8_t * inputb, size_t size, uint8_t * key); + +void aes_ctr_encrypt(uint8_t * outputb, uint8_t * inputb, size_t size, uint8_t * key, uint8_t * iv); +#define aes_ctr_decrypt(outputb,inputb,size,key,iv) aes_ctr_encrypt(outputb,inputb,size,key,iv) + +#define BLOCK_SIZE 128 +#define KEY_SCHEDULE_SIZE 176 +#define WORD_SIZE 64 +#define BS_BLOCK_SIZE (BLOCK_SIZE * WORD_SIZE / 8) +#define WORDS_PER_BLOCK (BLOCK_SIZE / WORD_SIZE) + +#if (WORD_SIZE==64) + typedef uint64_t word_t; + #define ONE 1ULL + #define MUL_SHIFT 6 + #define WFMT "lx" + #define WPAD "016" + #define __builtin_bswap_wordsize(x) __builtin_bswap64(x) +#elif (WORD_SIZE==32) + typedef uint32_t word_t; + #define ONE 1UL + #define MUL_SHIFT 5 + #define WFMT "x" + #define WPAD "08" + #define __builtin_bswap_wordsize(x) __builtin_bswap32(x) +#elif (WORD_SIZE==16) + typedef uint16_t word_t; + #define ONE 1 + #define MUL_SHIFT 4 + #define WFMT "hx" + #define WPAD "04" + #define __builtin_bswap_wordsize(x) __builtin_bswap16(x) +#elif (WORD_SIZE==8) + typedef uint8_t word_t; + #define ONE 1 + #define MUL_SHIFT 3 + #define WFMT "hhx" + #define WPAD "02" + #define __builtin_bswap_wordsize(x) (x) +#else +#error "invalid word size" +#endif + +void bs_transpose(word_t * blocks); +void bs_transpose_rev(word_t * blocks); +void bs_transpose_dst(word_t * transpose, word_t * blocks); + +void bs_sbox(word_t U[8]); +void bs_sbox_rev(word_t U[8]); + +void bs_shiftrows(word_t * B); +void bs_shiftrows_rev(word_t * B); + +void bs_mixcolumns(word_t * B); +void bs_mixcolumns_rev(word_t * B); + +void bs_shiftmix(word_t * B); + +void bs_addroundkey(word_t * B, word_t * rk); +void bs_apply_sbox(word_t * input); +void bs_apply_sbox_rev(word_t * input); + + +void expand_key(unsigned char *in); +void bs_expand_key(word_t (* rk)[BLOCK_SIZE], uint8_t * key); + +void bs_cipher(word_t state[BLOCK_SIZE], word_t (* rk)[BLOCK_SIZE]); +void bs_cipher_rev(word_t state[BLOCK_SIZE], word_t (* rk)[BLOCK_SIZE]); + + +void dump_hex(uint8_t * h, int len); +void dump_word(word_t * h, int len); +void dump_block(word_t * h, int len); + +#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) +#define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) + +void aes_ecb_encrypt(uint8_t * outputb, uint8_t * inputb, size_t size, uint8_t * key) +{ + word_t input_space[BLOCK_SIZE]; + word_t rk[11][BLOCK_SIZE]; + + memset(outputb,0,size); + word_t * state = (word_t *)outputb; + + bs_expand_key(rk, key); + + while (size > 0) + { + if (size < BS_BLOCK_SIZE) + { + memset(input_space,0,BS_BLOCK_SIZE); + memmove(input_space, inputb, size); + bs_cipher(input_space,rk); + memmove(outputb, input_space, size); + size = 0; + state += size; + } + else + { + memmove(state,inputb,BS_BLOCK_SIZE); + bs_cipher(state,rk); + size -= BS_BLOCK_SIZE; + state += BS_BLOCK_SIZE; + } + + } +} + +void aes_ecb_decrypt(uint8_t * outputb, uint8_t * inputb, size_t size, uint8_t * key) +{ + word_t input_space[BLOCK_SIZE]; + word_t rk[11][BLOCK_SIZE]; + + memset(outputb,0,size); + word_t * state = (word_t *)outputb; + + bs_expand_key(rk, key); + + while (size > 0) + { + if (size < BS_BLOCK_SIZE) + { + memset(input_space,0,BS_BLOCK_SIZE); + memmove(input_space, inputb, size); + bs_cipher_rev(input_space,rk); + memmove(outputb, input_space, size); + size = 0; + state += size; + } + else + { + memmove(state,inputb,BS_BLOCK_SIZE); + bs_cipher_rev(state,rk); + size -= BS_BLOCK_SIZE; + state += BS_BLOCK_SIZE; + } + + } +} + +static void INC_CTR(uint8_t * ctr, uint8_t i) +{ + ctr += BLOCK_SIZE/8 - 1; + uint8_t n = *(ctr); + *ctr += i; + while(*ctr < n) + { + ctr--; + n = *ctr; + (*ctr)++; + } +} + +void aes_ctr_encrypt(uint8_t * outputb, uint8_t * inputb, size_t size, uint8_t * key, uint8_t * iv) +{ + word_t rk[11][BLOCK_SIZE]; + word_t ctr[BLOCK_SIZE]; + uint8_t iv_copy[BLOCK_SIZE/8]; + + memset(outputb,0,size); + memset(ctr,0,sizeof(ctr)); + memmove(iv_copy,iv,BLOCK_SIZE/8); + + word_t * state = (word_t *)outputb; + bs_expand_key(rk, key); + + do + { + int chunk = MIN(size, BS_BLOCK_SIZE); + int blocks = chunk / (BLOCK_SIZE/8); + if (chunk % (BLOCK_SIZE/8)) + { + blocks++; + } + + int i; + for (i = 0; i < blocks; i++) + { + memmove(ctr + (i * WORDS_PER_BLOCK), iv_copy, BLOCK_SIZE/8); + INC_CTR(iv_copy,1); + } + + bs_cipher(ctr, rk); + size -= chunk; + + uint8_t * ctr_p = (uint8_t *) ctr; + while(chunk--) + { + *outputb++ = *ctr_p++ ^ *inputb++; + } + + } + while(size); + +} + +void dump_hex(uint8_t * h, int len) +{ + while(len--) + printf("%02hhx",*h++); + printf("\n"); +} + +void dump_word(word_t * h, int len) +{ + while(len--) + if ((len+1) % 8) printf("%" WPAD WFMT "\n",*h++); + else printf("%d:\n%" WPAD WFMT "\n",128-len-1,*h++); + + printf("\n"); +} + +void dump_block(word_t * h, int len) +{ + while(len-=2 >= 0) + printf("%" WPAD WFMT"%" WPAD WFMT "\n",*h++,*h++); + printf("\n"); +} + +static const uint8_t sbox[256] = { + //0 1 2 3 4 5 6 7 8 9 A B C D E F + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; + +static void rotate(unsigned char *in) { + unsigned char a,c; + a = in[0]; + for(c=0;c<3;c++) + in[c] = in[c + 1]; + in[3] = a; + return; +} + +/* Calculate the rcon used in key expansion */ +static unsigned char rcon(unsigned char in) { + unsigned char c=1; + if(in == 0) + return 0; + while(in != 1) { + unsigned char b; + b = c & 0x80; + c <<= 1; + if(b == 0x80) { + c ^= 0x1b; + } + in--; + } + return c; +} + +/* This is the core key expansion, which, given a 4-byte value, + * does some scrambling */ +static void schedule_core(unsigned char *in, unsigned char i) { + char a; + /* Rotate the input 8 bits to the left */ + rotate(in); + /* Apply Rijndael's s-box on all 4 bytes */ + for(a = 0; a < 4; a++) + in[a] = sbox[in[a]]; + /* On just the first byte, add 2^i to the byte */ + in[0] ^= rcon(i); +} + +void expand_key(unsigned char *in) { + unsigned char t[4]; + /* c is 16 because the first sub-key is the user-supplied key */ + unsigned char c = 16; + unsigned char i = 1; + unsigned char a; + + /* We need 11 sets of sixteen bytes each for 128-bit mode */ + while(c < 176) { + /* Copy the temporary variable over from the last 4-byte + * block */ + for(a = 0; a < 4; a++) + t[a] = in[a + c - 4]; + /* Every four blocks (of four bytes), + * do a complex calculation */ + if(c % 16 == 0) { + schedule_core(t,i); + i++; + } + for(a = 0; a < 4; a++) { + in[c] = in[c - 16] ^ t[a]; + c++; + } + } +} + +#if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) ||\ + defined(__amd64__) || defined(__amd32__)|| defined(__amd16__) +#define bs2le(x) (x) +#define bs2be(x) (x) +#elif (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) ||\ + (defined(__sparc__)) +#define bs2le(x) __builtin_bswap_wordsize(x) +#define bs2be(x) __builtin_bswap_wordsize(x) +#else +#error "endianness not supported" +#endif + + +void bs_addroundkey(word_t * B, word_t * rk) +{ + int i; + for (i = 0; i < BLOCK_SIZE; i++) + B[i] ^= rk[i]; +} + +void bs_apply_sbox(word_t * input) +{ + int i; + for(i=0; i < BLOCK_SIZE; i+=8) + { + bs_sbox(input+i); + } +} + +void bs_apply_sbox_rev(word_t * input) +{ + int i; + for(i=0; i < BLOCK_SIZE; i+=8) + { + bs_sbox_rev(input+i); + } +} + +/*July 2011*/ +/*Straight-line program for AES s box*/ + +/*Input is U[0], U[1],...,U[7]*/ +/*Output is S[0], S[1],...,S[7]*/ +// http://cs-www.cs.yale.edu/homes/peralta/CircuitStuff/CMT.html +void bs_sbox_rev(word_t U[8]) +{ + word_t W[8]; + word_t + T1,T2,T3,T4,T5,T6,T8, + T9,T10,T13,T14,T15,T16, + T17,T18,T19,T20,T22,T23,T24, + T25, T26, T27; + + word_t + M1,M2,M3,M4,M5,M6,M7,M8, + M9,M10,M11,M12,M13,M14,M15, + M16,M17,M18,M19,M20,M21,M22, + M23,M24,M25,M26,M27,M28,M29, + M30,M31,M32,M33,M34,M35,M36, + M37,M38,M39,M40,M41,M42,M43, + M44,M45,M46,M47,M48,M49,M50, + M51,M52,M53,M54,M55,M56,M57, + M58,M59,M60,M61,M62,M63; + + word_t + P0,P1,P2,P3,P4,P5,P6,P7,P8, + P9,P10,P11,P12,P13,P14, + P15,P16,P17,P18,P19,P20, + P21,P22,P23,P24,P25,P26, + P27,P28,P29; + + word_t Y5, + R5, R13, R17, R18, R19; + + + T23 = U[7] ^ U[4]; + T22 = ~(U[6] ^ U[4]); + T2 = ~(U[7] ^ U[6]); + T1 = U[4] ^ U[3]; + T24 = ~(U[3] ^ U[0]); + R5 = U[1] ^ U[0]; + T8 = ~(U[6] ^ T23); + T19 = T22 ^ R5; + T9 = ~(U[0] ^ T1); + T10 = T2 ^ T24; + T13 = T2 ^ R5; + T3 = T1 ^ R5; + T25 = ~(U[5] ^ T1); + R13 = U[6] ^ U[1]; + T17 = ~(U[5] ^ T19); + T20 = T24 ^ R13; + T4 = U[3] ^ T8; + R17 = ~(U[5] ^ U[2]); + R18 = ~(U[2] ^ U[1]); + R19 = ~(U[5] ^ U[3]); + Y5 = U[7] ^ R17; + T6 = T22 ^ R17; + T16 = R13 ^ R19; + T27 = T1 ^ R18; + T15 = T10 ^ T27; + T14 = T10 ^ R18; + T26 = T3 ^ T16; + M1 = T13 & T6; + M2 = T23 & T8; + M3 = T14 ^ M1; + M4 = T19 & Y5; + M5 = M4 ^ M1; + M6 = T3 & T16; + M7 = T22 & T9; + M8 = T26 ^ M6; + M9 = T20 & T17; + M10 = M9 ^ M6; + M11 = T1 & T15; + M12 = T4 & T27; + M13 = M12 ^ M11; + M14 = T2 & T10; + M15 = M14 ^ M11; + M16 = M3 ^ M2; + M17 = M5 ^ T24; + M18 = M8 ^ M7; + M19 = M10 ^ M15; + M20 = M16 ^ M13; + M21 = M17 ^ M15; + M22 = M18 ^ M13; + M23 = M19 ^ T25; + M24 = M22 ^ M23; + M25 = M22 & M20; + M26 = M21 ^ M25; + M27 = M20 ^ M21; + M28 = M23 ^ M25; + M29 = M28 & M27; + M30 = M26 & M24; + M31 = M20 & M23; + M32 = M27 & M31; + M33 = M27 ^ M25; + M34 = M21 & M22; + M35 = M24 & M34; + M36 = M24 ^ M25; + M37 = M21 ^ M29; + M38 = M32 ^ M33; + M39 = M23 ^ M30; + M40 = M35 ^ M36; + M41 = M38 ^ M40; + M42 = M37 ^ M39; + M43 = M37 ^ M38; + M44 = M39 ^ M40; + M45 = M42 ^ M41; + M46 = M44 & T6; + M47 = M40 & T8; + M48 = M39 & Y5; + M49 = M43 & T16; + M50 = M38 & T9; + M51 = M37 & T17; + M52 = M42 & T15; + M53 = M45 & T27; + M54 = M41 & T10; + M55 = M44 & T13; + M56 = M40 & T23; + M57 = M39 & T19; + M58 = M43 & T3; + M59 = M38 & T22; + M60 = M37 & T20; + M61 = M42 & T1; + M62 = M45 & T4; + M63 = M41 & T2; + P0 = M52 ^ M61; + P1 = M58 ^ M59; + P2 = M54 ^ M62; + P3 = M47 ^ M50; + P4 = M48 ^ M56; + P5 = M46 ^ M51; + P6 = M49 ^ M60; + P7 = P0 ^ P1; + P8 = M50 ^ M53; + P9 = M55 ^ M63; + P10 = M57 ^ P4; + P11 = P0 ^ P3; + P12 = M46 ^ M48; + P13 = M49 ^ M51; + P14 = M49 ^ M62; + P15 = M54 ^ M59; + P16 = M57 ^ M61; + P17 = M58 ^ P2; + P18 = M63 ^ P5; + P19 = P2 ^ P3; + P20 = P4 ^ P6; + P22 = P2 ^ P7; + P23 = P7 ^ P8; + P24 = P5 ^ P7; + P25 = P6 ^ P10; + P26 = P9 ^ P11; + P27 = P10 ^ P18; + P28 = P11 ^ P25; + P29 = P15 ^ P20; + W[7] = P13 ^ P22; + W[6] = P26 ^ P29; + W[5] = P17 ^ P28; + W[4] = P12 ^ P22; + W[3] = P23 ^ P27; + W[2] = P19 ^ P24; + W[1] = P14 ^ P23; + W[0] = P9 ^ P16; + + memmove(U,W,sizeof(W)); +} + +void bs_sbox(word_t U[8]) +{ + word_t S[8]; + word_t + T1,T2,T3,T4,T5,T6,T7,T8, + T9,T10,T11,T12,T13,T14,T15,T16, + T17,T18,T19,T20,T21,T22,T23,T24, + T25, T26, T27; + + word_t + M1,M2,M3,M4,M5,M6,M7,M8, + M9,M10,M11,M12,M13,M14,M15, + M16,M17,M18,M19,M20,M21,M22, + M23,M24,M25,M26,M27,M28,M29, + M30,M31,M32,M33,M34,M35,M36, + M37,M38,M39,M40,M41,M42,M43, + M44,M45,M46,M47,M48,M49,M50, + M51,M52,M53,M54,M55,M56,M57, + M58,M59,M60,M61,M62,M63; + + word_t + L0,L1,L2,L3,L4,L5,L6,L7,L8, + L9,L10,L11,L12,L13,L14, + L15,L16,L17,L18,L19,L20, + L21,L22,L23,L24,L25,L26, + L27,L28,L29; + + T1 = U[7] ^ U[4]; + T2 = U[7] ^ U[2]; + T3 = U[7] ^ U[1]; + T4 = U[4] ^ U[2]; + T5 = U[3] ^ U[1]; + T6 = T1 ^ T5; + T7 = U[6] ^ U[5]; + T8 = U[0] ^ T6; + T9 = U[0] ^ T7; + T10 = T6 ^ T7; + T11 = U[6] ^ U[2]; + T12 = U[5] ^ U[2]; + T13 = T3 ^ T4; + T14 = T6 ^ T11; + T15 = T5 ^ T11; + T16 = T5 ^ T12; + T17 = T9 ^ T16; + T18 = U[4] ^ U[0]; + T19 = T7 ^ T18; + T20 = T1 ^ T19; + T21 = U[1] ^ U[0]; + T22 = T7 ^ T21; + T23 = T2 ^ T22; + T24 = T2 ^ T10; + T25 = T20 ^ T17; + T26 = T3 ^ T16; + T27 = T1 ^ T12; + M1 = T13 & T6; + M2 = T23 & T8; + M3 = T14 ^ M1; + M4 = T19 & U[0]; + M5 = M4 ^ M1; + M6 = T3 & T16; + M7 = T22 & T9; + M8 = T26 ^ M6; + M9 = T20 & T17; + M10 = M9 ^ M6; + M11 = T1 & T15; + M12 = T4 & T27; + M13 = M12 ^ M11; + M14 = T2 & T10; + M15 = M14 ^ M11; + M16 = M3 ^ M2; + M17 = M5 ^ T24; + M18 = M8 ^ M7; + M19 = M10 ^ M15; + M20 = M16 ^ M13; + M21 = M17 ^ M15; + M22 = M18 ^ M13; + M23 = M19 ^ T25; + M24 = M22 ^ M23; + M25 = M22 & M20; + M26 = M21 ^ M25; + M27 = M20 ^ M21; + M28 = M23 ^ M25; + M29 = M28 & M27; + M30 = M26 & M24; + M31 = M20 & M23; + M32 = M27 & M31; + M33 = M27 ^ M25; + M34 = M21 & M22; + M35 = M24 & M34; + M36 = M24 ^ M25; + M37 = M21 ^ M29; + M38 = M32 ^ M33; + M39 = M23 ^ M30; + M40 = M35 ^ M36; + M41 = M38 ^ M40; + M42 = M37 ^ M39; + M43 = M37 ^ M38; + M44 = M39 ^ M40; + M45 = M42 ^ M41; + M46 = M44 & T6; + M47 = M40 & T8; + M48 = M39 & U[0]; + M49 = M43 & T16; + M50 = M38 & T9; + M51 = M37 & T17; + M52 = M42 & T15; + M53 = M45 & T27; + M54 = M41 & T10; + M55 = M44 & T13; + M56 = M40 & T23; + M57 = M39 & T19; + M58 = M43 & T3; + M59 = M38 & T22; + M60 = M37 & T20; + M61 = M42 & T1; + M62 = M45 & T4; + M63 = M41 & T2; + L0 = M61 ^ M62; + L1 = M50 ^ M56; + L2 = M46 ^ M48; + L3 = M47 ^ M55; + L4 = M54 ^ M58; + L5 = M49 ^ M61; + L6 = M62 ^ L5; + L7 = M46 ^ L3; + L8 = M51 ^ M59; + L9 = M52 ^ M53; + L10 = M53 ^ L4; + L11 = M60 ^ L2; + L12 = M48 ^ M51; + L13 = M50 ^ L0; + L14 = M52 ^ M61; + L15 = M55 ^ L1; + L16 = M56 ^ L0; + L17 = M57 ^ L1; + L18 = M58 ^ L8; + L19 = M63 ^ L4; + L20 = L0 ^ L1; + L21 = L1 ^ L7; + L22 = L3 ^ L12; + L23 = L18 ^ L2; + L24 = L15 ^ L9; + L25 = L6 ^ L10; + L26 = L7 ^ L9; + L27 = L8 ^ L10; + L28 = L11 ^ L14; + L29 = L11 ^ L17; + S[7] = L6 ^ L24; + S[6] = ~(L16 ^ L26); + S[5] = ~(L19 ^ L28); + S[4] = L6 ^ L21; + S[3] = L20 ^ L22; + S[2] = L25 ^ L29; + S[1] = ~(L13 ^ L27); + S[0] = ~(L6 ^ L23); + + memmove(U,S,sizeof(S)); +} + +void bs_transpose(word_t * blocks) +{ + word_t transpose[BLOCK_SIZE]; + memset(transpose, 0, sizeof(transpose)); + bs_transpose_dst(transpose,blocks); + memmove(blocks,transpose,sizeof(transpose)); +} + +void bs_transpose_dst(word_t * transpose, word_t * blocks) +{ + int i,k; + word_t w; + for(k=0; k < WORD_SIZE; k++) + { + int bitpos = ONE << k; + for (i=0; i < WORDS_PER_BLOCK; i++) + { + w = bs2le(blocks[k * WORDS_PER_BLOCK + i]); + int offset = i << MUL_SHIFT; + +#ifndef UNROLL_TRANSPOSE + int j; + for(j=0; j < WORD_SIZE; j++) + { + // TODO make const time + transpose[offset + j] |= (w & (ONE << j)) ? bitpos : 0; + } +#else + + transpose[(offset)+ 0 ] |= (w & (ONE << 0 )) ? (bitpos) : 0; + transpose[(offset)+ 1 ] |= (w & (ONE << 1 )) ? (bitpos) : 0; + transpose[(offset)+ 2 ] |= (w & (ONE << 2 )) ? (bitpos) : 0; + transpose[(offset)+ 3 ] |= (w & (ONE << 3 )) ? (bitpos) : 0; + transpose[(offset)+ 4 ] |= (w & (ONE << 4 )) ? (bitpos) : 0; + transpose[(offset)+ 5 ] |= (w & (ONE << 5 )) ? (bitpos) : 0; + transpose[(offset)+ 6 ] |= (w & (ONE << 6 )) ? (bitpos) : 0; + transpose[(offset)+ 7 ] |= (w & (ONE << 7 )) ? (bitpos) : 0; +#if WORD_SIZE > 8 + transpose[(offset)+ 8 ] |= (w & (ONE << 8 )) ? (bitpos) : 0; + transpose[(offset)+ 9 ] |= (w & (ONE << 9 )) ? (bitpos) : 0; + transpose[(offset)+ 10] |= (w & (ONE << 10)) ? (bitpos) : 0; + transpose[(offset)+ 11] |= (w & (ONE << 11)) ? (bitpos) : 0; + transpose[(offset)+ 12] |= (w & (ONE << 12)) ? (bitpos) : 0; + transpose[(offset)+ 13] |= (w & (ONE << 13)) ? (bitpos) : 0; + transpose[(offset)+ 14] |= (w & (ONE << 14)) ? (bitpos) : 0; + transpose[(offset)+ 15] |= (w & (ONE << 15)) ? (bitpos) : 0; +#endif +#if WORD_SIZE > 16 + transpose[(offset)+ 16] |= (w & (ONE << 16)) ? (bitpos) : 0; + transpose[(offset)+ 17] |= (w & (ONE << 17)) ? (bitpos) : 0; + transpose[(offset)+ 18] |= (w & (ONE << 18)) ? (bitpos) : 0; + transpose[(offset)+ 19] |= (w & (ONE << 19)) ? (bitpos) : 0; + transpose[(offset)+ 20] |= (w & (ONE << 20)) ? (bitpos) : 0; + transpose[(offset)+ 21] |= (w & (ONE << 21)) ? (bitpos) : 0; + transpose[(offset)+ 22] |= (w & (ONE << 22)) ? (bitpos) : 0; + transpose[(offset)+ 23] |= (w & (ONE << 23)) ? (bitpos) : 0; + transpose[(offset)+ 24] |= (w & (ONE << 24)) ? (bitpos) : 0; + transpose[(offset)+ 25] |= (w & (ONE << 25)) ? (bitpos) : 0; + transpose[(offset)+ 26] |= (w & (ONE << 26)) ? (bitpos) : 0; + transpose[(offset)+ 27] |= (w & (ONE << 27)) ? (bitpos) : 0; + transpose[(offset)+ 28] |= (w & (ONE << 28)) ? (bitpos) : 0; + transpose[(offset)+ 29] |= (w & (ONE << 29)) ? (bitpos) : 0; + transpose[(offset)+ 30] |= (w & (ONE << 30)) ? (bitpos) : 0; + transpose[(offset)+ 31] |= (w & (ONE << 31)) ? (bitpos) : 0; +#endif +#if WORD_SIZE > 32 + transpose[(offset)+ 32] |= (w & (ONE << 32)) ? (bitpos) : 0; + transpose[(offset)+ 33] |= (w & (ONE << 33)) ? (bitpos) : 0; + transpose[(offset)+ 34] |= (w & (ONE << 34)) ? (bitpos) : 0; + transpose[(offset)+ 35] |= (w & (ONE << 35)) ? (bitpos) : 0; + transpose[(offset)+ 36] |= (w & (ONE << 36)) ? (bitpos) : 0; + transpose[(offset)+ 37] |= (w & (ONE << 37)) ? (bitpos) : 0; + transpose[(offset)+ 38] |= (w & (ONE << 38)) ? (bitpos) : 0; + transpose[(offset)+ 39] |= (w & (ONE << 39)) ? (bitpos) : 0; + transpose[(offset)+ 40] |= (w & (ONE << 40)) ? (bitpos) : 0; + transpose[(offset)+ 41] |= (w & (ONE << 41)) ? (bitpos) : 0; + transpose[(offset)+ 42] |= (w & (ONE << 42)) ? (bitpos) : 0; + transpose[(offset)+ 43] |= (w & (ONE << 43)) ? (bitpos) : 0; + transpose[(offset)+ 44] |= (w & (ONE << 44)) ? (bitpos) : 0; + transpose[(offset)+ 45] |= (w & (ONE << 45)) ? (bitpos) : 0; + transpose[(offset)+ 46] |= (w & (ONE << 46)) ? (bitpos) : 0; + transpose[(offset)+ 47] |= (w & (ONE << 47)) ? (bitpos) : 0; + transpose[(offset)+ 48] |= (w & (ONE << 48)) ? (bitpos) : 0; + transpose[(offset)+ 49] |= (w & (ONE << 49)) ? (bitpos) : 0; + transpose[(offset)+ 50] |= (w & (ONE << 50)) ? (bitpos) : 0; + transpose[(offset)+ 51] |= (w & (ONE << 51)) ? (bitpos) : 0; + transpose[(offset)+ 52] |= (w & (ONE << 52)) ? (bitpos) : 0; + transpose[(offset)+ 53] |= (w & (ONE << 53)) ? (bitpos) : 0; + transpose[(offset)+ 54] |= (w & (ONE << 54)) ? (bitpos) : 0; + transpose[(offset)+ 55] |= (w & (ONE << 55)) ? (bitpos) : 0; + transpose[(offset)+ 56] |= (w & (ONE << 56)) ? (bitpos) : 0; + transpose[(offset)+ 57] |= (w & (ONE << 57)) ? (bitpos) : 0; + transpose[(offset)+ 58] |= (w & (ONE << 58)) ? (bitpos) : 0; + transpose[(offset)+ 59] |= (w & (ONE << 59)) ? (bitpos) : 0; + transpose[(offset)+ 60] |= (w & (ONE << 60)) ? (bitpos) : 0; + transpose[(offset)+ 61] |= (w & (ONE << 61)) ? (bitpos) : 0; + transpose[(offset)+ 62] |= (w & (ONE << 62)) ? (bitpos) : 0; + transpose[(offset)+ 63] |= (w & (ONE << 63)) ? (bitpos) : 0; +#endif +#endif + // constant time: + //transpose[(i<<MUL_SHIFT)+ j] |= (((int64_t)((w & (ONE << j)) << (WORD_SIZE-1-j)))>>(WORD_SIZE-1)) & (ONE<<k); + } + } +} + +void bs_transpose_rev(word_t * blocks) +{ + int i,k; + word_t w; + word_t transpose[BLOCK_SIZE]; + memset(transpose, 0, sizeof(transpose)); + for(k=0; k < BLOCK_SIZE; k++) + { + w = blocks[k]; + word_t bitpos = bs2be(ONE << (k % WORD_SIZE)); + word_t offset = k / WORD_SIZE; +#ifndef UNROLL_TRANSPOSE + int j; + for(j=0; j < WORD_SIZE; j++) + { + word_t bit = (w & (ONE << j)) ? (ONE << (k % WORD_SIZE)) : 0; + transpose[j * WORDS_PER_BLOCK + (offset)] |= bit; + } +#else + transpose[0 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 0 )) ? bitpos : 0; + transpose[1 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 1 )) ? bitpos : 0; + transpose[2 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 2 )) ? bitpos : 0; + transpose[3 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 3 )) ? bitpos : 0; + transpose[4 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 4 )) ? bitpos : 0; + transpose[5 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 5 )) ? bitpos : 0; + transpose[6 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 6 )) ? bitpos : 0; + transpose[7 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 7 )) ? bitpos : 0; +#if WORD_SIZE > 8 + transpose[8 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 8 )) ? bitpos : 0; + transpose[9 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 9 )) ? bitpos : 0; + transpose[10 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 10)) ? bitpos : 0; + transpose[11 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 11)) ? bitpos : 0; + transpose[12 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 12)) ? bitpos : 0; + transpose[13 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 13)) ? bitpos : 0; + transpose[14 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 14)) ? bitpos : 0; + transpose[15 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 15)) ? bitpos : 0; +#endif +#if WORD_SIZE > 16 + transpose[16 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 16)) ? bitpos : 0; + transpose[17 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 17)) ? bitpos : 0; + transpose[18 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 18)) ? bitpos : 0; + transpose[19 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 19)) ? bitpos : 0; + transpose[20 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 20)) ? bitpos : 0; + transpose[21 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 21)) ? bitpos : 0; + transpose[22 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 22)) ? bitpos : 0; + transpose[23 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 23)) ? bitpos : 0; + transpose[24 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 24)) ? bitpos : 0; + transpose[25 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 25)) ? bitpos : 0; + transpose[26 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 26)) ? bitpos : 0; + transpose[27 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 27)) ? bitpos : 0; + transpose[28 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 28)) ? bitpos : 0; + transpose[29 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 29)) ? bitpos : 0; + transpose[30 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 30)) ? bitpos : 0; + transpose[31 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 31)) ? bitpos : 0; +#endif +#if WORD_SIZE > 32 + transpose[32 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 32)) ? bitpos : 0; + transpose[33 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 33)) ? bitpos : 0; + transpose[34 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 34)) ? bitpos : 0; + transpose[35 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 35)) ? bitpos : 0; + transpose[36 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 36)) ? bitpos : 0; + transpose[37 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 37)) ? bitpos : 0; + transpose[38 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 38)) ? bitpos : 0; + transpose[39 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 39)) ? bitpos : 0; + transpose[40 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 40)) ? bitpos : 0; + transpose[41 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 41)) ? bitpos : 0; + transpose[42 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 42)) ? bitpos : 0; + transpose[43 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 43)) ? bitpos : 0; + transpose[44 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 44)) ? bitpos : 0; + transpose[45 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 45)) ? bitpos : 0; + transpose[46 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 46)) ? bitpos : 0; + transpose[47 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 47)) ? bitpos : 0; + transpose[48 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 48)) ? bitpos : 0; + transpose[49 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 49)) ? bitpos : 0; + transpose[50 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 50)) ? bitpos : 0; + transpose[51 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 51)) ? bitpos : 0; + transpose[52 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 52)) ? bitpos : 0; + transpose[53 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 53)) ? bitpos : 0; + transpose[54 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 54)) ? bitpos : 0; + transpose[55 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 55)) ? bitpos : 0; + transpose[56 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 56)) ? bitpos : 0; + transpose[57 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 57)) ? bitpos : 0; + transpose[58 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 58)) ? bitpos : 0; + transpose[59 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 59)) ? bitpos : 0; + transpose[60 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 60)) ? bitpos : 0; + transpose[61 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 61)) ? bitpos : 0; + transpose[62 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 62)) ? bitpos : 0; + transpose[63 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 63)) ? bitpos : 0; +#endif +#endif + } + memmove(blocks,transpose,sizeof(transpose)); +} + + +#define R0 0 +#define R1 8 +#define R2 16 +#define R3 24 + +#define B0 0 +#define B1 32 +#define B2 64 +#define B3 96 + +#define R0_shift (BLOCK_SIZE/4)*0 +#define R1_shift (BLOCK_SIZE/4)*1 +#define R2_shift (BLOCK_SIZE/4)*2 +#define R3_shift (BLOCK_SIZE/4)*3 +#define B_MOD (BLOCK_SIZE) + + +void bs_shiftrows(word_t * B) +{ + word_t Bp_space[BLOCK_SIZE]; + word_t * Bp = Bp_space; + word_t * Br0 = B + 0; + word_t * Br1 = B + 32; + word_t * Br2 = B + 64; + word_t * Br3 = B + 96; + uint8_t offsetr0 = 0; + uint8_t offsetr1 = 32; + uint8_t offsetr2 = 64; + uint8_t offsetr3 = 96; + + + int i; + for(i=0; i<4; i++) + { + Bp[B0 + 0] = Br0[0]; + Bp[B0 + 1] = Br0[1]; + Bp[B0 + 2] = Br0[2]; + Bp[B0 + 3] = Br0[3]; + Bp[B0 + 4] = Br0[4]; + Bp[B0 + 5] = Br0[5]; + Bp[B0 + 6] = Br0[6]; + Bp[B0 + 7] = Br0[7]; + Bp[B1 + 0] = Br1[0]; + Bp[B1 + 1] = Br1[1]; + Bp[B1 + 2] = Br1[2]; + Bp[B1 + 3] = Br1[3]; + Bp[B1 + 4] = Br1[4]; + Bp[B1 + 5] = Br1[5]; + Bp[B1 + 6] = Br1[6]; + Bp[B1 + 7] = Br1[7]; + Bp[B2 + 0] = Br2[0]; + Bp[B2 + 1] = Br2[1]; + Bp[B2 + 2] = Br2[2]; + Bp[B2 + 3] = Br2[3]; + Bp[B2 + 4] = Br2[4]; + Bp[B2 + 5] = Br2[5]; + Bp[B2 + 6] = Br2[6]; + Bp[B2 + 7] = Br2[7]; + Bp[B3 + 0] = Br3[0]; + Bp[B3 + 1] = Br3[1]; + Bp[B3 + 2] = Br3[2]; + Bp[B3 + 3] = Br3[3]; + Bp[B3 + 4] = Br3[4]; + Bp[B3 + 5] = Br3[5]; + Bp[B3 + 6] = Br3[6]; + Bp[B3 + 7] = Br3[7]; + + offsetr0 = (offsetr0 + BLOCK_SIZE/16 + BLOCK_SIZE/4) & 0x7f; + offsetr1 = (offsetr1 + BLOCK_SIZE/16 + BLOCK_SIZE/4) & 0x7f; + offsetr2 = (offsetr2 + BLOCK_SIZE/16 + BLOCK_SIZE/4) & 0x7f; + offsetr3 = (offsetr3 + BLOCK_SIZE/16 + BLOCK_SIZE/4) & 0x7f; + + Br0 = B + offsetr0; + Br1 = B + offsetr1; + Br2 = B + offsetr2; + Br3 = B + offsetr3; + + Bp += 8; + } + memmove(B,Bp_space,sizeof(Bp_space)); +} + + +void bs_shiftrows_rev(word_t * B) +{ + word_t Bp_space[BLOCK_SIZE]; + word_t * Bp = Bp_space; + word_t * Br0 = Bp + 0; + word_t * Br1 = Bp + 32; + word_t * Br2 = Bp + 64; + word_t * Br3 = Bp + 96; + uint8_t offsetr0 = 0; + uint8_t offsetr1 = 32; + uint8_t offsetr2 = 64; + uint8_t offsetr3 = 96; + + + int i; + for(i=0; i<4; i++) + { + Br0[0] = B[B0 + 0]; + Br0[1] = B[B0 + 1]; + Br0[2] = B[B0 + 2]; + Br0[3] = B[B0 + 3]; + Br0[4] = B[B0 + 4]; + Br0[5] = B[B0 + 5]; + Br0[6] = B[B0 + 6]; + Br0[7] = B[B0 + 7]; + Br1[0] = B[B1 + 0]; + Br1[1] = B[B1 + 1]; + Br1[2] = B[B1 + 2]; + Br1[3] = B[B1 + 3]; + Br1[4] = B[B1 + 4]; + Br1[5] = B[B1 + 5]; + Br1[6] = B[B1 + 6]; + Br1[7] = B[B1 + 7]; + Br2[0] = B[B2 + 0]; + Br2[1] = B[B2 + 1]; + Br2[2] = B[B2 + 2]; + Br2[3] = B[B2 + 3]; + Br2[4] = B[B2 + 4]; + Br2[5] = B[B2 + 5]; + Br2[6] = B[B2 + 6]; + Br2[7] = B[B2 + 7]; + Br3[0] = B[B3 + 0]; + Br3[1] = B[B3 + 1]; + Br3[2] = B[B3 + 2]; + Br3[3] = B[B3 + 3]; + Br3[4] = B[B3 + 4]; + Br3[5] = B[B3 + 5]; + Br3[6] = B[B3 + 6]; + Br3[7] = B[B3 + 7]; + + offsetr0 = (offsetr0 + BLOCK_SIZE/16 + BLOCK_SIZE/4) & 0x7f; + offsetr1 = (offsetr1 + BLOCK_SIZE/16 + BLOCK_SIZE/4) & 0x7f; + offsetr2 = (offsetr2 + BLOCK_SIZE/16 + BLOCK_SIZE/4) & 0x7f; + offsetr3 = (offsetr3 + BLOCK_SIZE/16 + BLOCK_SIZE/4) & 0x7f; + + Br0 = Bp + offsetr0; + Br1 = Bp + offsetr1; + Br2 = Bp + offsetr2; + Br3 = Bp + offsetr3; + + B += 8; + } + memmove(B - 8 * 4,Bp_space,sizeof(Bp_space)); +} + + +#define A0 0 +#define A1 8 +#define A2 16 +#define A3 24 + +// Does shift rows and mix columns in same step +void bs_shiftmix(word_t * B) +{ + word_t Bp_space[BLOCK_SIZE]; + word_t * Bp = Bp_space; + + word_t * Br0 = B + 0; + word_t * Br1 = B + 32; + word_t * Br2 = B + 64; + word_t * Br3 = B + 96; + + uint8_t offsetr0 = 0; + uint8_t offsetr1 = 32; + uint8_t offsetr2 = 64; + uint8_t offsetr3 = 96; + + Br0 = B + offsetr0; + Br1 = B + offsetr1; + Br2 = B + offsetr2; + Br3 = B + offsetr3; + + + int i; + for (i = 0; i < 4; i++) + { + // B0 + // 2*A0 2*A1 A1 A2 A3 + word_t of =Br0[R0+7]^ Br1[R1+7]; + Bp[A0+0] = Br1[R1+0] ^ Br2[R2+0] ^ Br3[R3+0] ^ of; + Bp[A0+1] = Br0[R0+0] ^ Br1[R1+0] ^ Br1[R1+1] ^ Br2[R2+1] ^ Br3[R3+1] ^ of; + Bp[A0+2] = Br0[R0+1] ^ Br1[R1+1] ^ Br1[R1+2] ^ Br2[R2+2] ^ Br3[R3+2]; + Bp[A0+3] = Br0[R0+2] ^ Br1[R1+2] ^ Br1[R1+3] ^ Br2[R2+3] ^ Br3[R3+3] ^ of; + Bp[A0+4] = Br0[R0+3] ^ Br1[R1+3] ^ Br1[R1+4] ^ Br2[R2+4] ^ Br3[R3+4] ^ of; + Bp[A0+5] = Br0[R0+4] ^ Br1[R1+4] ^ Br1[R1+5] ^ Br2[R2+5] ^ Br3[R3+5]; + Bp[A0+6] = Br0[R0+5] ^ Br1[R1+5] ^ Br1[R1+6] ^ Br2[R2+6] ^ Br3[R3+6]; + Bp[A0+7] = Br0[R0+6] ^ Br1[R1+6] ^ Br1[R1+7] ^ Br2[R2+7] ^ Br3[R3+7]; + + // A0 2*A1 2*A2 A2 A3 + of = Br1[R1+7] ^ Br2[R2+7]; + Bp[A1+0] = Br0[R0+0] ^ Br2[R2+0] ^ Br3[R3+0] ^ of; + Bp[A1+1] = Br0[R0+1] ^ Br1[R1+0] ^ Br2[R2+0] ^ Br2[R2+1] ^ Br3[R3+1] ^ of; + Bp[A1+2] = Br0[R0+2] ^ Br1[R1+1] ^ Br2[R2+1] ^ Br2[R2+2] ^ Br3[R3+2]; + Bp[A1+3] = Br0[R0+3] ^ Br1[R1+2] ^ Br2[R2+2] ^ Br2[R2+3] ^ Br3[R3+3] ^ of; + Bp[A1+4] = Br0[R0+4] ^ Br1[R1+3] ^ Br2[R2+3] ^ Br2[R2+4] ^ Br3[R3+4] ^ of; + Bp[A1+5] = Br0[R0+5] ^ Br1[R1+4] ^ Br2[R2+4] ^ Br2[R2+5] ^ Br3[R3+5]; + Bp[A1+6] = Br0[R0+6] ^ Br1[R1+5] ^ Br2[R2+5] ^ Br2[R2+6] ^ Br3[R3+6]; + Bp[A1+7] = Br0[R0+7] ^ Br1[R1+6] ^ Br2[R2+6] ^ Br2[R2+7] ^ Br3[R3+7]; + + // A0 A1 2*A2 2*A3 A3 + of = Br2[R2+7] ^ Br3[R3+7]; + Bp[A2+0] = Br0[R0+0] ^ Br1[R1+0] ^ Br3[R3+0] ^ of; + Bp[A2+1] = Br0[R0+1] ^ Br1[R1+1] ^ Br2[R2+0] ^ Br3[R3+0] ^ Br3[R3+1] ^ of; + Bp[A2+2] = Br0[R0+2] ^ Br1[R1+2] ^ Br2[R2+1] ^ Br3[R3+1] ^ Br3[R3+2]; + Bp[A2+3] = Br0[R0+3] ^ Br1[R1+3] ^ Br2[R2+2] ^ Br3[R3+2] ^ Br3[R3+3] ^ of; + Bp[A2+4] = Br0[R0+4] ^ Br1[R1+4] ^ Br2[R2+3] ^ Br3[R3+3] ^ Br3[R3+4] ^ of; + Bp[A2+5] = Br0[R0+5] ^ Br1[R1+5] ^ Br2[R2+4] ^ Br3[R3+4] ^ Br3[R3+5]; + Bp[A2+6] = Br0[R0+6] ^ Br1[R1+6] ^ Br2[R2+5] ^ Br3[R3+5] ^ Br3[R3+6]; + Bp[A2+7] = Br0[R0+7] ^ Br1[R1+7] ^ Br2[R2+6] ^ Br3[R3+6] ^ Br3[R3+7]; + + // A0 2*A0 A1 A2 2*A3 + of = Br0[R0+7] ^ Br3[R3+7]; + Bp[A3+0] = Br0[R0+0] ^ Br1[R1+0] ^ Br2[R2+0] ^ of; + Bp[A3+1] = Br0[R0+1] ^ Br0[R0+0] ^ Br1[R1+1] ^ Br2[R2+1] ^ Br3[R3+0] ^ of; + Bp[A3+2] = Br0[R0+2] ^ Br0[R0+1] ^ Br1[R1+2] ^ Br2[R2+2] ^ Br3[R3+1]; + Bp[A3+3] = Br0[R0+3] ^ Br0[R0+2] ^ Br1[R1+3] ^ Br2[R2+3] ^ Br3[R3+2] ^ of; + Bp[A3+4] = Br0[R0+4] ^ Br0[R0+3] ^ Br1[R1+4] ^ Br2[R2+4] ^ Br3[R3+3] ^ of; + Bp[A3+5] = Br0[R0+5] ^ Br0[R0+4] ^ Br1[R1+5] ^ Br2[R2+5] ^ Br3[R3+4]; + Bp[A3+6] = Br0[R0+6] ^ Br0[R0+5] ^ Br1[R1+6] ^ Br2[R2+6] ^ Br3[R3+5]; + Bp[A3+7] = Br0[R0+7] ^ Br0[R0+6] ^ Br1[R1+7] ^ Br2[R2+7] ^ Br3[R3+6]; + + Bp += BLOCK_SIZE/4; + + offsetr0 = (offsetr0 + BLOCK_SIZE/4) & 0x7f; + offsetr1 = (offsetr1 + BLOCK_SIZE/4) & 0x7f; + offsetr2 = (offsetr2 + BLOCK_SIZE/4) & 0x7f; + offsetr3 = (offsetr3 + BLOCK_SIZE/4) & 0x7f; + + Br0 = B + offsetr0; + Br1 = B + offsetr1; + Br2 = B + offsetr2; + Br3 = B + offsetr3; + } + + memmove(B,Bp_space,sizeof(Bp_space)); +} + + + +void bs_mixcolumns(word_t * B) +{ + word_t Bp_space[BLOCK_SIZE]; + word_t * Bp = Bp_space; + // to understand this, see + // https://en.wikipedia.org/wiki/Rijndael_mix_columns + + int i = 0; + for (; i < 4; i++) + { + // of = A0 ^ A1; + // A0 = A0 ^ (0x1b & ((signed char)of>>7)); + + //// 2 * A0 + // A0 = A0 ^ (A0 << 1) + + //// + 3 * A1 + // A0 = A0 ^ (A1) + // A0 = A0 ^ (A1<<1) + + //// + A2 + A3 + // A0 = A0 ^ (A2) + // A0 = A0 ^ (A3) + // A0.7 A1.7 + word_t of = B[A0+7] ^ B[A1+7]; + + // 2*A0 2*A1 A1 A2 A3 + Bp[A0+0] = B[A1+0] ^ B[A2+0] ^ B[A3+0] ^ of; + Bp[A0+1] = B[A0+0] ^ B[A1+0] ^ B[A1+1] ^ B[A2+1] ^ B[A3+1] ^ of; + Bp[A0+2] = B[A0+1] ^ B[A1+1] ^ B[A1+2] ^ B[A2+2] ^ B[A3+2]; + Bp[A0+3] = B[A0+2] ^ B[A1+2] ^ B[A1+3] ^ B[A2+3] ^ B[A3+3] ^ of; + Bp[A0+4] = B[A0+3] ^ B[A1+3] ^ B[A1+4] ^ B[A2+4] ^ B[A3+4] ^ of; + Bp[A0+5] = B[A0+4] ^ B[A1+4] ^ B[A1+5] ^ B[A2+5] ^ B[A3+5]; + Bp[A0+6] = B[A0+5] ^ B[A1+5] ^ B[A1+6] ^ B[A2+6] ^ B[A3+6]; + Bp[A0+7] = B[A0+6] ^ B[A1+6] ^ B[A1+7] ^ B[A2+7] ^ B[A3+7]; + + + + // of = A1 ^ A2 + // A1 = A1 ^ (0x1b & ((signed char)of>>7)); + + //// A0 + // A1 = A1 ^ (A0) + + //// + 2 * A1 + // A1 = A1 ^ (A1 << 1) + + //// + 3 * A2 + // A1 = A1 ^ (A2) + // A1 = A1 ^ (A2<<1) + + //// + A3 + // A1 = A1 ^ (A3) + + of = B[A1+7] ^ B[A2+7]; + + // A0 2*A1 2*A2 A2 A3 + Bp[A1+0] = B[A0+0] ^ B[A2+0] ^ B[A3+0] ^ of; + Bp[A1+1] = B[A0+1] ^ B[A1+0] ^ B[A2+0] ^ B[A2+1] ^ B[A3+1] ^ of; + Bp[A1+2] = B[A0+2] ^ B[A1+1] ^ B[A2+1] ^ B[A2+2] ^ B[A3+2]; + Bp[A1+3] = B[A0+3] ^ B[A1+2] ^ B[A2+2] ^ B[A2+3] ^ B[A3+3] ^ of; + Bp[A1+4] = B[A0+4] ^ B[A1+3] ^ B[A2+3] ^ B[A2+4] ^ B[A3+4] ^ of; + Bp[A1+5] = B[A0+5] ^ B[A1+4] ^ B[A2+4] ^ B[A2+5] ^ B[A3+5]; + Bp[A1+6] = B[A0+6] ^ B[A1+5] ^ B[A2+5] ^ B[A2+6] ^ B[A3+6]; + Bp[A1+7] = B[A0+7] ^ B[A1+6] ^ B[A2+6] ^ B[A2+7] ^ B[A3+7]; + + + // of = A2 ^ A3 + // A2 = A2 ^ (0x1b & ((signed char)of>>7)); + + //// A0 + A1 + // A2 = A2 ^ (A0) + // A2 = A2 ^ (A1) + + //// + 2 * A2 + // A2 = A2 ^ (A2 << 1) + + //// + 3 * A3 + // A2 = A2 ^ (A3) + // A2 = A2 ^ (A3<<1) + + + of = B[A2+7] ^ B[A3+7]; + + // A0 A1 2*A2 2*A3 A3 + Bp[A2+0] = B[A0+0] ^ B[A1+0] ^ B[A3+0] ^ of; + Bp[A2+1] = B[A0+1] ^ B[A1+1] ^ B[A2+0] ^ B[A3+0] ^ B[A3+1] ^ of; + Bp[A2+2] = B[A0+2] ^ B[A1+2] ^ B[A2+1] ^ B[A3+1] ^ B[A3+2]; + Bp[A2+3] = B[A0+3] ^ B[A1+3] ^ B[A2+2] ^ B[A3+2] ^ B[A3+3] ^ of; + Bp[A2+4] = B[A0+4] ^ B[A1+4] ^ B[A2+3] ^ B[A3+3] ^ B[A3+4] ^ of; + Bp[A2+5] = B[A0+5] ^ B[A1+5] ^ B[A2+4] ^ B[A3+4] ^ B[A3+5]; + Bp[A2+6] = B[A0+6] ^ B[A1+6] ^ B[A2+5] ^ B[A3+5] ^ B[A3+6]; + Bp[A2+7] = B[A0+7] ^ B[A1+7] ^ B[A2+6] ^ B[A3+6] ^ B[A3+7]; + + + // A3 = A0 ^ A3 + // A3 = A3 ^ (0x1b & ((signed char)of>>7)); + + //// 3 * A0 + // A3 = A3 ^ (A0) + // A3 = A3 ^ (A0 << 1) + + //// + A1 + A2 + // A3 = A3 ^ A1 + // A3 = A3 ^ A2 + + //// + 2 * A3 + // A3 = A3 ^ (A3<<1) + + of = B[A0+7] ^ B[A3+7]; + + // 2*A0 A0 A1 A2 2*A3 + Bp[A3+0] = B[A0+0] ^ B[A1+0] ^ B[A2+0] ^ of; + Bp[A3+1] = B[A0+1] ^ B[A0+0] ^ B[A1+1] ^ B[A2+1] ^ B[A3+0] ^ of; + Bp[A3+2] = B[A0+2] ^ B[A0+1] ^ B[A1+2] ^ B[A2+2] ^ B[A3+1]; + Bp[A3+3] = B[A0+3] ^ B[A0+2] ^ B[A1+3] ^ B[A2+3] ^ B[A3+2] ^ of; + Bp[A3+4] = B[A0+4] ^ B[A0+3] ^ B[A1+4] ^ B[A2+4] ^ B[A3+3] ^ of; + Bp[A3+5] = B[A0+5] ^ B[A0+4] ^ B[A1+5] ^ B[A2+5] ^ B[A3+4]; + Bp[A3+6] = B[A0+6] ^ B[A0+5] ^ B[A1+6] ^ B[A2+6] ^ B[A3+5]; + Bp[A3+7] = B[A0+7] ^ B[A0+6] ^ B[A1+7] ^ B[A2+7] ^ B[A3+6]; + + + // + Bp += BLOCK_SIZE/4; + B += BLOCK_SIZE/4; + } + + + memmove(B - BLOCK_SIZE,Bp - BLOCK_SIZE,sizeof(Bp_space)); +} + +void bs_mixcolumns_rev(word_t * B) +{ + // to understand this, see + // https://en.wikipedia.org/wiki/Rijndael_mix_columns + // TODO combine with shiftrows for performance on decryption + word_t Bp_space[BLOCK_SIZE]; + word_t * Bp = Bp_space; + + + int i = 0; + for (; i < BLOCK_SIZE / 4; i += BLOCK_SIZE / 16) + { + + //// state[i][0] = A0*0x0e + A1*0x0b + A2*0x0d + A3*0x09 + // overflow: + /* A0 * 0b1110 */ /* A1 * 0b1011 */ /* A2 * 0b1101 */ /* A3 * 0b1001 */ + word_t of0 = ( (B[A0+7] ^ B[A0+6] ^ B[A0+5]) ^ (B[A1 + 7] ^ B[A1+5]) ^ (B[A2+6] ^ B[A2+5]) ^ ( B[A3+5] )); // 2 bit + word_t of1 = ( (B[A0+7] ^ B[A0+6]) ^ ( B[A1+6]) ^ (B[A2+7] ^ B[A2+6]) ^ ( B[A3+6] )); // 3 bit + word_t of2 = ( (B[A0+7]) ^ ( B[A1+7]) ^ ( B[A2+7]) ^ ( B[A3+7] )); // 4 bit + + // inverse: + // 1110 1011 1101 1001 + // A0 = A0 * 14 + A1 * 11 + A2 * 13 + A3 * 9 + // A0 = A0 * (2+4+8) + A1 * (1+2+8) + A2 * (1+4+8) + A3 * (1+8) + + // (2*A0 + 4*A0 + 8*A0 ) + (8*A1 + 2*A1 + A1 ) + ( A2 + 4*A2 + 8*A2 ) + ( A3 + 8*A3) + Bp[A0+0] = B[A1+0] ^ B[A2+0] ^ B[A3+0] ^ of0; + Bp[A0+1] = B[A0+0] ^ B[A1+0] ^ B[A1+1] ^ B[A2+1] ^ B[A3+1] ^ of0 ^ of1; + Bp[A0+2] = B[A0+1] ^ B[A0+0] ^ B[A1+1] ^ B[A1+2] ^ B[A2+2] ^ B[A2+0] ^ B[A3+2] ^ of1 ^ of2; + Bp[A0+3] = B[A0+2] ^ B[A0+1] ^ B[A0+0] ^ B[A1+0] ^ B[A1+2] ^ B[A1+3] ^ B[A2+3] ^ B[A2+1] ^ B[A2+0] ^ B[A3+3] ^ B[A3+0] ^ of0 ^ of2; + Bp[A0+4] = B[A0+3] ^ B[A0+2] ^ B[A0+1] ^ B[A1+1] ^ B[A1+3] ^ B[A1+4] ^ B[A2+4] ^ B[A2+2] ^ B[A2+1] ^ B[A3+4] ^ B[A3+1] ^ of0 ^ of1; + Bp[A0+5] = B[A0+4] ^ B[A0+3] ^ B[A0+2] ^ B[A1+2] ^ B[A1+4] ^ B[A1+5] ^ B[A2+5] ^ B[A2+3] ^ B[A2+2] ^ B[A3+5] ^ B[A3+2] ^ of1 ^ of2; + Bp[A0+6] = B[A0+5] ^ B[A0+4] ^ B[A0+3] ^ B[A1+3] ^ B[A1+5] ^ B[A1+6] ^ B[A2+6] ^ B[A2+4] ^ B[A2+3] ^ B[A3+6] ^ B[A3+3] ^ of2; + Bp[A0+7] = B[A0+6] ^ B[A0+5] ^ B[A0+4] ^ B[A1+4] ^ B[A1+6] ^ B[A1+7] ^ B[A2+7] ^ B[A2+5] ^ B[A2+4] ^ B[A3+7] ^ B[A3+4]; + + + + //// state[i][1] = A0*0x09 + A1*0xe + A2*0x0b + A3*0x0d + // overflow: + /* A0 * 0b1001 */ /* A1 * 0b1110 */ /* A2 * 0b101 1 */ /* A3 * 0b1101 */ + of0 = ( (B[A0+5]) ^ (B[A1+7] ^ B[A1+6] ^ B[A1+5]) ^ (B[A2 + 7] ^ B[A2+5]) ^ (B[A3+6] ^ B[A3+5])); // 2 bit + of1 = ( (B[A0+6]) ^ (B[A1+7] ^ B[A1+6]) ^ ( B[A2+6]) ^ (B[A3+7] ^ B[A3+6])); // 3 bit + of2 = ( (B[A0+7]) ^ (B[A1+7]) ^ ( B[A2+7]) ^ ( B[A3+7])); // 4 bit + + // inverse: + // 1001 1110 1011 1101 + // A1 = A0 * 9 + A1 * 14 + A2 * 11 + A3 * 13 + // A1 = A0 * (1+8) + A1 * (2+4+8) + A2 * (1+2+8) + A3 * (1+4+8) + + // (1*A0 + 8*A0 ) +(2*A1 + 4*A1 + 8*A1 ) + (1*A2 + 2*A2 + 8*A2 ) + (1*A3 + 4*A3 + 8*A3) + Bp[A1+0] = B[A0+0] ^ B[A2+0] ^ B[A3+0] ^ of0; + Bp[A1+1] = B[A0+1] ^ B[A1+0] ^ B[A2+1] ^ B[A2+0] ^ B[A3+1] ^ of0 ^ of1; + Bp[A1+2] = B[A0+2] ^ B[A1+1] ^ B[A1+0] ^ B[A2+2] ^ B[A2+1] ^ B[A3+2] ^ B[A3+0] ^ of1 ^ of2; + Bp[A1+3] = B[A0+3] ^ B[A0+0] ^ B[A1+2] ^ B[A1+1] ^ B[A1+0] ^ B[A2+3] ^ B[A2+2] ^ B[A2+0] ^ B[A3+3] ^ B[A3+1] ^ B[A3+0] ^ of0 ^ of2; + Bp[A1+4] = B[A0+4] ^ B[A0+1] ^ B[A1+3] ^ B[A1+2] ^ B[A1+1] ^ B[A2+4] ^ B[A2+3] ^ B[A2+1] ^ B[A3+4] ^ B[A3+2] ^ B[A3+1] ^ of0 ^ of1; + Bp[A1+5] = B[A0+5] ^ B[A0+2] ^ B[A1+4] ^ B[A1+3] ^ B[A1+2] ^ B[A2+5] ^ B[A2+4] ^ B[A2+2] ^ B[A3+5] ^ B[A3+3] ^ B[A3+2] ^ of1 ^ of2; + Bp[A1+6] = B[A0+6] ^ B[A0+3] ^ B[A1+5] ^ B[A1+4] ^ B[A1+3] ^ B[A2+6] ^ B[A2+5] ^ B[A2+3] ^ B[A3+6] ^ B[A3+4] ^ B[A3+3] ^ of2; + Bp[A1+7] = B[A0+7] ^ B[A0+4] ^ B[A1+6] ^ B[A1+5] ^ B[A1+4] ^ B[A2+7] ^ B[A2+6] ^ B[A2+4] ^ B[A3+7] ^ B[A3+5] ^ B[A3+4]; + + + //// state[i][2] = A0*0x0d + A1*0x09 + A2*0x0e + A3*0x0b + // overflow: + /* A1 * 0b1001 */ /* A2 * 0b1110 */ /* A3 * 0b1011 */ /* A0 * 0b1101 */ + of0 = ( (B[A1+5]) ^ (B[A2+7] ^ B[A2+6] ^ B[A2+5]) ^ (B[A3 + 7] ^ B[A3+5]) ^ (B[A0+6] ^ B[A0+5])); // 2 bit + of1 = ( (B[A1+6]) ^ (B[A2+7] ^ B[A2+6]) ^ ( B[A3+6]) ^ (B[A0+7] ^ B[A0+6])); // 3 bit + of2 = ( (B[A1+7]) ^ (B[A2+7]) ^ ( B[A3+7]) ^ ( B[A0+7])); // 4 bit + + // inverse: + // 1001 1110 1011 1101 + // A2 = A1 * 9 + A2 * 14 + A3 * 11 + A0 * 13 + // A2 = A1 * (1+8) + A2 * (2+4+8) + A3 * (1+2+8) + A0 * (1+4+8) + + // (1*A1 + 8*A1) + ( 2*A2 + 4*A2 + 8*A2) + (1*A3 2*A2 + 8*A2) + (1*A0 + 4*A0 + 8*A0) + Bp[A2+0] = B[A1+0] ^ B[A3+0] ^ B[A0+0] ^ of0; + Bp[A2+1] = B[A1+1] ^ B[A2+0] ^ B[A3+1] ^ B[A3+0] ^ B[A0+1] ^ of0 ^ of1; + Bp[A2+2] = B[A1+2] ^ B[A2+1] ^ B[A2+0] ^ B[A3+2] ^ B[A3+1] ^ B[A0+2] ^ B[A0+0] ^ of1 ^ of2; + Bp[A2+3] = B[A1+3] ^ B[A1+0] ^ B[A2+2] ^ B[A2+1] ^ B[A2+0] ^ B[A3+3] ^ B[A3+2] ^ B[A3+0] ^ B[A0+3] ^ B[A0+1] ^ B[A0+0] ^ of0 ^ of2; + Bp[A2+4] = B[A1+4] ^ B[A1+1] ^ B[A2+3] ^ B[A2+2] ^ B[A2+1] ^ B[A3+4] ^ B[A3+3] ^ B[A3+1] ^ B[A0+4] ^ B[A0+2] ^ B[A0+1] ^ of0 ^ of1; + Bp[A2+5] = B[A1+5] ^ B[A1+2] ^ B[A2+4] ^ B[A2+3] ^ B[A2+2] ^ B[A3+5] ^ B[A3+4] ^ B[A3+2] ^ B[A0+5] ^ B[A0+3] ^ B[A0+2] ^ of1 ^ of2; + Bp[A2+6] = B[A1+6] ^ B[A1+3] ^ B[A2+5] ^ B[A2+4] ^ B[A2+3] ^ B[A3+6] ^ B[A3+5] ^ B[A3+3] ^ B[A0+6] ^ B[A0+4] ^ B[A0+3] ^ of2; + Bp[A2+7] = B[A1+7] ^ B[A1+4] ^ B[A2+6] ^ B[A2+5] ^ B[A2+4] ^ B[A3+7] ^ B[A3+6] ^ B[A3+4] ^ B[A0+7] ^ B[A0+5] ^ B[A0+4]; + + + + //// state[i][3] = A0*0x0b + A1*0x0d + A2*0x09 + A3*0x0e + // overflow: + /* A2 * 0b1001 */ /* A3 * 0b1110 */ /* A0 * 0b1011 */ /* A1 * 0b1101 */ + of0 = ( (B[A2+5]) ^ (B[A3+7] ^ B[A3+6] ^ B[A3+5]) ^ (B[A0 + 7] ^ B[A0+5]) ^ (B[A1+6] ^ B[A1+5])); // 2 bit + of1 = ( (B[A2+6]) ^ (B[A3+7] ^ B[A3+6]) ^ ( B[A0+6]) ^ (B[A1+7] ^ B[A1+6])); // 3 bit + of2 = ( (B[A2+7]) ^ (B[A3+7]) ^ ( B[A0+7]) ^ ( B[A1+7])); // 4 bit + + // inverse: + // 1001 1110 1011 1101 + // A2 = A2 * 9 + A3 * 14 + A0 * 11 + A1 * 13 + // A2 = A2 * (1+8) + A3 * (2+4+8) + A0 * (1+2+8) + A1 * (1+4+8) + + // (1*A2 + 8*A2) + ( 2*A3 + 4*A3 + 8*A3) + (1*A0 2*A0 + 8*A0) + (1*A1 + 4*A1 + 8*A1) + Bp[A3+0] = B[A2+0] ^ B[A0+0] ^ B[A1+0] ^ of0; + Bp[A3+1] = B[A2+1] ^ B[A3+0] ^ B[A0+1] ^ B[A0+0] ^ B[A1+1] ^ of0 ^ of1; + Bp[A3+2] = B[A2+2] ^ B[A3+1] ^ B[A3+0] ^ B[A0+2] ^ B[A0+1] ^ B[A1+2] ^ B[A1+0] ^ of1 ^ of2; + Bp[A3+3] = B[A2+3] ^ B[A2+0] ^ B[A3+2] ^ B[A3+1] ^ B[A3+0] ^ B[A0+3] ^ B[A0+2] ^ B[A0+0] ^ B[A1+3] ^ B[A1+1] ^ B[A1+0] ^ of0 ^ of2; + Bp[A3+4] = B[A2+4] ^ B[A2+1] ^ B[A3+3] ^ B[A3+2] ^ B[A3+1] ^ B[A0+4] ^ B[A0+3] ^ B[A0+1] ^ B[A1+4] ^ B[A1+2] ^ B[A1+1] ^ of0 ^ of1; + Bp[A3+5] = B[A2+5] ^ B[A2+2] ^ B[A3+4] ^ B[A3+3] ^ B[A3+2] ^ B[A0+5] ^ B[A0+4] ^ B[A0+2] ^ B[A1+5] ^ B[A1+3] ^ B[A1+2] ^ of1 ^ of2; + Bp[A3+6] = B[A2+6] ^ B[A2+3] ^ B[A3+5] ^ B[A3+4] ^ B[A3+3] ^ B[A0+6] ^ B[A0+5] ^ B[A0+3] ^ B[A1+6] ^ B[A1+4] ^ B[A1+3] ^ of2; + Bp[A3+7] = B[A2+7] ^ B[A2+4] ^ B[A3+6] ^ B[A3+5] ^ B[A3+4] ^ B[A0+7] ^ B[A0+6] ^ B[A0+4] ^ B[A1+7] ^ B[A1+5] ^ B[A1+4]; + + Bp += BLOCK_SIZE/4; + B += BLOCK_SIZE/4; + } + + memmove(B - BLOCK_SIZE, Bp - BLOCK_SIZE,sizeof(Bp_space)); + +} + +void bs_expand_key(word_t (* rk)[BLOCK_SIZE], uint8_t * _key) +{ + // TODO integrate this better + uint8_t key[KEY_SCHEDULE_SIZE]; + memmove(key,_key,BLOCK_SIZE/8); + expand_key(key); + + int i, j = 0, k, l; + for (i = 0; i < KEY_SCHEDULE_SIZE; i += (BLOCK_SIZE/8)) + { + memmove(rk[j], key + i, BLOCK_SIZE / 8); + + for (k = WORDS_PER_BLOCK; k < 128; k += WORDS_PER_BLOCK) + { + for (l = 0; l < WORDS_PER_BLOCK; l++) + { + rk[j][k + l] = rk[j][l]; + } + } + bs_transpose(rk[j]); + j++; + } + +} + +void bs_cipher(word_t state[BLOCK_SIZE], word_t (* rk)[BLOCK_SIZE]) +{ + int round; + bs_transpose(state); + + + bs_addroundkey(state,rk[0]); + for (round = 1; round < 10; round++) + { + bs_apply_sbox(state); + /*bs_shiftrows(state);*/ + /*bs_mixcolumns(state);*/ + bs_shiftmix(state); + bs_addroundkey(state,rk[round]); + } + bs_apply_sbox(state); + bs_shiftrows(state); + bs_addroundkey(state,rk[10]); + bs_transpose_rev(state); +} + +void bs_cipher_rev(word_t state[BLOCK_SIZE], word_t (* rk)[BLOCK_SIZE]) +{ + int round; + bs_transpose(state); + + bs_addroundkey(state,rk[10]); + for (round = 9; round > 0; round--) + { + bs_shiftrows_rev(state); + bs_apply_sbox_rev(state); + bs_addroundkey(state,rk[round]); + bs_mixcolumns_rev(state); + } + bs_shiftrows_rev(state); + bs_apply_sbox_rev(state); + bs_addroundkey(state,rk[0]); + + bs_transpose_rev(state); +} + +void aes_ecb_test() +{ + uint8_t key_vector[16] = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6\xab\xf7\x15\x88\x09\xcf\x4f\x3c"; + uint8_t pt_vector[16] = "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a"; + uint8_t ct_vector[16] = "\x3a\xd7\x7b\xb4\x0d\x7a\x36\x60\xa8\x9e\xca\xf3\x24\x66\xef\x97"; + uint8_t output[16]; + uint8_t input[16]; + + printf("AES ECB\n"); + + aes_ecb_encrypt(output, pt_vector,16,key_vector); + + + printf("cipher text: \n"); + dump_hex(output, 16); + + aes_ecb_decrypt(input, output, 16, key_vector); + + printf("plain text: \n"); + dump_hex((uint8_t * )input,16); + + if (memcmp(pt_vector, input, 16) != 0) + { + fprintf(stderr,"error: decrypted ciphertext is not the same as the input plaintext\n"); + EXIT1; + } + else if (memcmp(ct_vector, output, 16) != 0) + { + fprintf(stderr,"error: ciphertext is not the same as the test vector\n"); + EXIT1; + } + else + { + printf("ECB passes test vector\n\n"); + } +} + +void aes_ctr_test() +{ +// Test vector from NIST for 4 input blocks +#define AES_CTR_TESTS_BYTES 64 + + uint8_t key_vector[16] = + "\x2b\x7e\x15\x16\x28\xae\xd2\xa6\xab\xf7\x15\x88\x09\xcf\x4f\x3c"; + + uint8_t iv_vector[16] = + "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff"; + + uint8_t pt_vector[AES_CTR_TESTS_BYTES] = + "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96\xe9\x3d\x7e\x11\x73\x93\x17\x2a" + "\xae\x2d\x8a\x57\x1e\x03\xac\x9c\x9e\xb7\x6f\xac\x45\xaf\x8e\x51" + "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11\xe5\xfb\xc1\x19\x1a\x0a\x52\xef" + "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17\xad\x2b\x41\x7b\xe6\x6c\x37\x10" + ; + + uint8_t ct_vector[AES_CTR_TESTS_BYTES] = + "\x87\x4d\x61\x91\xb6\x20\xe3\x26\x1b\xef\x68\x64\x99\x0d\xb6\xce" + "\x98\x06\xf6\x6b\x79\x70\xfd\xff\x86\x17\x18\x7b\xb9\xff\xfd\xff" + "\x5a\xe4\xdf\x3e\xdb\xd5\xd3\x5e\x5b\x4f\x09\x02\x0d\xb0\x3e\xab" + "\x1e\x03\x1d\xda\x2f\xbe\x03\xd1\x79\x21\x70\xa0\xf3\x00\x9c\xee" + ; + + uint8_t output[AES_CTR_TESTS_BYTES]; + uint8_t input[AES_CTR_TESTS_BYTES]; + + printf("AES CTR\n"); + + aes_ctr_encrypt(output,pt_vector,AES_CTR_TESTS_BYTES,key_vector, iv_vector); + + printf("cipher text: \n"); + dump_hex(output,AES_CTR_TESTS_BYTES); + + aes_ctr_decrypt(input,output,AES_CTR_TESTS_BYTES,key_vector, iv_vector); + + printf("plain text: \n"); + dump_hex(input,AES_CTR_TESTS_BYTES); + + if (memcmp(pt_vector, input, AES_CTR_TESTS_BYTES) != 0) + { + fprintf(stderr,"error: decrypted ciphertext is not the same as the input plaintext\n"); + EXIT1; + } + else if (memcmp(ct_vector, output, AES_CTR_TESTS_BYTES) != 0) + { + fprintf(stderr,"error: ciphertext is not the same as the test vector\n"); + EXIT1; + } + else + { + printf("CTR passes test vector\n\n"); + } + +} + + +int main(int argc, char * argv[]) +{ + clock_prepare(); + + clock_start(); + + aes_ecb_test(); + aes_ctr_test(); + + + clock_stop(); + print_total_clock(); + + return 0; +} diff --git a/test/monniaux/bitsliced-aes/one_file/compare.sh b/test/monniaux/bitsliced-aes/one_file/compare.sh new file mode 100755 index 00000000..276a95ee --- /dev/null +++ b/test/monniaux/bitsliced-aes/one_file/compare.sh @@ -0,0 +1,12 @@ +#!/bin/bash +ROOT=/home/monniaux/work/Kalray/CompCert +SRC=bitsliced-aes.c +k1-mbr-gcc -Werror=implicit -Werror=uninitialized -O3 $SRC $ROOT/test/monniaux/clock.gcc.k1c.o -o bitsliced-aes.gcc.k1c && +$ROOT/ccomp -O3 -fno-unprototyped -O3 $SRC $ROOT/test/monniaux/clock.gcc.k1c.o -o bitsliced-aes.ccomp.k1c && +gcc -Werror=implicit -Werror=uninitialized -O3 $SRC $ROOT/test/monniaux/clock.gcc.host.o -o bitsliced-aes.gcc.host && +valgrind ./bitsliced-aes.gcc.host && +k1-cluster -- ./bitsliced-aes.gcc.k1c > ./bitsliced-aes.gcc.k1c.out && +k1-cluster -- ./bitsliced-aes.ccomp.k1c > ./bitsliced-aes.ccomp.k1c.out && +grep cycles ./bitsliced-aes.gcc.k1c.out | sed -e 's/cycles: //' > ./bitsliced-aes.gcc.k1c.cycles && +grep cycles ./bitsliced-aes.ccomp.k1c.out | sed -e 's/cycles: //' > ./bitsliced-aes.ccomp.k1c.cycles && +test $(cat ./bitsliced-aes.ccomp.k1c.cycles) -gt $(expr 2 '*' $(cat ./bitsliced-aes.gcc.k1c.cycles)) diff --git a/test/monniaux/bitsliced-aes/one_file/reduce/bitsliced-aes_compute.c b/test/monniaux/bitsliced-aes/one_file/reduce/bitsliced-aes_compute.c new file mode 100644 index 00000000..5294ff1d --- /dev/null +++ b/test/monniaux/bitsliced-aes/one_file/reduce/bitsliced-aes_compute.c @@ -0,0 +1,32 @@ +#include <stdint.h> +#include <string.h> +int i[1]; +int j, bs_transpose_dst_k, k, s, o; +void a(int (*)[], uint8_t *); +void b(uint8_t c, uint8_t d, size_t e, uint8_t f, uint8_t g) { + int l[1]; + a(l, f); +} +void a(int (*l)[], uint8_t *m) { + for (; o < 76; o += 8) { + { + int *n = i; + bs_transpose_dst_k = 0; + for (; bs_transpose_dst_k < 64; bs_transpose_dst_k++) { + j = 0; + for (; j < 64; j++) { + k = &s; + n[j] = k & 1; + } + } + } + } +} +void aes_ecb_test() {} +void aes_ctr_test() { + uint8_t p = ""; + uint8_t q = ""; + uint8_t r = ""; + uint8_t output[4]; + b(output, r, 4, p, q); +} diff --git a/test/monniaux/bitsliced-aes/one_file/reduce/bitsliced-aes_main.c b/test/monniaux/bitsliced-aes/one_file/reduce/bitsliced-aes_main.c new file mode 100644 index 00000000..0d48b3b8 --- /dev/null +++ b/test/monniaux/bitsliced-aes/one_file/reduce/bitsliced-aes_main.c @@ -0,0 +1,20 @@ +#include "/home/monniaux/work/Kalray/CompCert/test/monniaux/clock.h" + +void aes_ecb_test(void); +void aes_ctr_test(void); + +int main(int argc, char * argv[]) +{ + clock_prepare(); + + clock_start(); + + aes_ecb_test(); + aes_ctr_test(); + + + clock_stop(); + print_total_clock(); + + return 0; +} diff --git a/test/monniaux/bitsliced-aes/one_file/reduce/compare.sh b/test/monniaux/bitsliced-aes/one_file/reduce/compare.sh new file mode 100755 index 00000000..a21bb465 --- /dev/null +++ b/test/monniaux/bitsliced-aes/one_file/reduce/compare.sh @@ -0,0 +1,16 @@ +#!/bin/bash +ROOT=/home/monniaux/work/Kalray/CompCert +SRC=bitsliced-aes_compute.c +MAIN=/home/monniaux/work/Kalray/CompCert/test/monniaux/bitsliced-aes/one_file/reduce/bitsliced-aes_main +k1-mbr-gcc -Werror=implicit -Werror=uninitialized -O3 $SRC $ROOT/test/monniaux/clock.gcc.k1c.o $MAIN.gcc.k1c.o -o bitsliced-aes.gcc.k1c && +$ROOT/ccomp -O3 -fno-unprototyped -O3 $SRC $ROOT/test/monniaux/clock.gcc.k1c.o $MAIN.gcc.k1c.o -o bitsliced-aes.ccomp.k1c && +gcc -Werror=implicit -Werror=uninitialized -O3 $SRC $ROOT/test/monniaux/clock.gcc.host.o $MAIN.c -o bitsliced-aes.gcc.host && +valgrind ./bitsliced-aes.gcc.host && +k1-cluster --cycle-based -- ./bitsliced-aes.gcc.k1c > ./bitsliced-aes.gcc.k1c.out && +k1-cluster --cycle-based -- ./bitsliced-aes.ccomp.k1c > ./bitsliced-aes.ccomp.k1c.out && +grep cycles ./bitsliced-aes.gcc.k1c.out > ./bitsliced-aes.gcc.k1c.cycles && +grep cycles ./bitsliced-aes.ccomp.k1c.out > ./bitsliced-aes.ccomp.k1c.cycles && +sed -i -e 's/cycles: //' ./bitsliced-aes.gcc.k1c.cycles && +sed -i -e 's/cycles: //' ./bitsliced-aes.ccomp.k1c.cycles && +test $(cat ./bitsliced-aes.gcc.k1c.cycles) -gt 100000 && +test $(cat ./bitsliced-aes.ccomp.k1c.cycles) -gt $(expr 2 '*' $(cat ./bitsliced-aes.gcc.k1c.cycles)) diff --git a/test/monniaux/ocaml/Makefile b/test/monniaux/ocaml/Makefile index 46ce8994..0ae7c22f 100644 --- a/test/monniaux/ocaml/Makefile +++ b/test/monniaux/ocaml/Makefile @@ -1,7 +1,29 @@ -test: byterun/ocamlrun - k1-cluster --syscall=libstd_scalls.so -- byterun/ocamlrun examples/quicksort +ALL_CFLAGS=-Ibyterun +EXECUTE_ARGS=examples/quicksort -byterun/ocamlrun: - (cd byterun ; $(MAKE)) +include ../rules.mk -.PHONY: test +ALL_CCOMPFLAGS= +LDLIBS=-lm + +CFILES=$(wildcard byterun/*.c) + +CCOMP_K1C_S=$(patsubst %.c,%.ccomp.k1c.s,$(CFILES)) +CCOMP_HOST_S=$(patsubst %.c,%.ccomp.host.s,$(CFILES)) + +GCC_K1C_S=$(patsubst %.c,%.gcc.k1c.s,$(CFILES)) +GCC_HOST_S=$(patsubst %.c,%.gcc.host.s,$(CFILES)) + +all: $(CCOMP_K1C_S) $(GCC_K1C_S) ocamlrun.ccomp.k1c.out ocamlrun.gcc.k1c.out + +ocamlrun.ccomp.k1c : $(CCOMP_K1C_S) ../clock.gcc.k1c.o + $(K1C_CCOMP) $(K1C_CCOMPFLAGS) $+ -o $@ $(LDLIBS) + +ocamlrun.ccomp.host : $(CCOMP_HOST_S) ../clock.gcc.host.o + $(CCOMP) $(CCOMPFLAGS) $+ -o $@ $(LDLIBS) + +ocamlrun.gcc.k1c : $(GCC_K1C_S) ../clock.gcc.k1c.o + $(K1C_CC) $(K1C_CFLAGS) $+ -o $@ $(LDLIBS) + +ocamlrun.gcc.host : $(GCC_HOST_S) ../clock.gcc.host.o + $(CC) $(CFLAGS) $+ -o $@ $(LDLIBS) diff --git a/test/monniaux/ocaml/byterun/caml/finalise.h b/test/monniaux/ocaml/byterun/caml/finalise.h index 5315ac21..b2052c21 100644 --- a/test/monniaux/ocaml/byterun/caml/finalise.h +++ b/test/monniaux/ocaml/byterun/caml/finalise.h @@ -25,7 +25,7 @@ void caml_final_update_clean_phase (void); void caml_final_do_calls (void); void caml_final_do_roots (scanning_action f); void caml_final_invert_finalisable_values (); -void caml_final_oldify_young_roots (); +void caml_final_oldify_young_roots (void); void caml_final_empty_young (void); void caml_final_update_minor_roots(void); value caml_final_register (value f, value v); diff --git a/test/monniaux/ocaml/byterun/caml/version.h b/test/monniaux/ocaml/byterun/caml/version.h new file mode 100644 index 00000000..68d7000e --- /dev/null +++ b/test/monniaux/ocaml/byterun/caml/version.h @@ -0,0 +1,6 @@ +#define OCAML_VERSION_MAJOR 4 +#define OCAML_VERSION_MINOR 7 +#define OCAML_VERSION_PATCHLEVEL 1 +#undef OCAML_VERSION_ADDITIONAL +#define OCAML_VERSION 40701 +#define OCAML_VERSION_STRING "4.07.1" diff --git a/test/monniaux/ocaml/byterun/compact.c b/test/monniaux/ocaml/byterun/compact.c index 7b7188ab..83e7ed0a 100644 --- a/test/monniaux/ocaml/byterun/compact.c +++ b/test/monniaux/ocaml/byterun/compact.c @@ -32,6 +32,7 @@ extern uintnat caml_percent_free; /* major_gc.c */ extern void caml_shrink_heap (char *); /* memory.c */ +extern void caml_final_invert_finalisable_values (void); /* Encoded headers: the color is stored in the 2 least significant bits. (For pointer inversion, we need to distinguish headers from pointers.) diff --git a/test/monniaux/ocaml/byterun/main.c b/test/monniaux/ocaml/byterun/main.c index 5e5839ff..498f3d18 100644 --- a/test/monniaux/ocaml/byterun/main.c +++ b/test/monniaux/ocaml/byterun/main.c @@ -13,6 +13,7 @@ /* */ /**************************************************************************/ +#define VERIMAG_MEASUREMENTS #define CAML_INTERNALS /* Main entry point (can be overridden by a user-provided main() @@ -26,6 +27,10 @@ #include <windows.h> #endif +#ifdef VERIMAG_MEASUREMENTS +#include "../../clock.h" +#endif + CAMLextern void caml_main (char_os **); #ifdef _WIN32 @@ -41,7 +46,15 @@ int main(int argc, char **argv) caml_expand_command_line(&argc, &argv); #endif +#ifdef VERIMAG_MEASUREMENTS + clock_prepare(); + clock_start(); +#endif caml_main(argv); +#ifdef VERIMAG_MEASUREMENTS + clock_stop(); + print_total_clock(); +#endif caml_sys_exit(Val_int(0)); return 0; /* not reached */ } diff --git a/test/monniaux/ocaml/byterun/win32.c b/test/monniaux/ocaml/byterun/win32.c deleted file mode 100644 index 1ce8ad5e..00000000 --- a/test/monniaux/ocaml/byterun/win32.c +++ /dev/null @@ -1,1019 +0,0 @@ -/**************************************************************************/ -/* */ -/* OCaml */ -/* */ -/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ -/* */ -/* Copyright 1996 Institut National de Recherche en Informatique et */ -/* en Automatique. */ -/* */ -/* All rights reserved. This file is distributed under the terms of */ -/* the GNU Lesser General Public License version 2.1, with the */ -/* special exception on linking described in the file LICENSE. */ -/* */ -/**************************************************************************/ - -#define CAML_INTERNALS - -/* Win32-specific stuff */ - -/* FILE_INFO_BY_HANDLE_CLASS and FILE_NAME_INFO are only available from Windows - Vista onwards */ -#undef _WIN32_WINNT -#define _WIN32_WINNT 0x0600 - -#define WIN32_LEAN_AND_MEAN -#include <wtypes.h> -#include <winbase.h> -#include <winsock2.h> -#include <stdlib.h> -#include <stdio.h> -#include <stdarg.h> -#include <io.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <ctype.h> -#include <errno.h> -#include <string.h> -#include <signal.h> -#include "caml/alloc.h" -#include "caml/address_class.h" -#include "caml/fail.h" -#include "caml/io.h" -#include "caml/memory.h" -#include "caml/misc.h" -#include "caml/osdeps.h" -#include "caml/signals.h" -#include "caml/sys.h" - -#include "caml/config.h" -#ifdef SUPPORT_DYNAMIC_LINKING -#include <flexdll.h> -#endif - -#ifndef S_ISREG -#define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) -#endif - -unsigned short caml_win32_major = 0; -unsigned short caml_win32_minor = 0; -unsigned short caml_win32_build = 0; -unsigned short caml_win32_revision = 0; - -CAMLnoreturn_start -static void caml_win32_sys_error (int errnum) -CAMLnoreturn_end; - -static void caml_win32_sys_error(int errnum) -{ - wchar_t buffer[512]; - value msg; - if (FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, - NULL, - errnum, - 0, - buffer, - sizeof(buffer)/sizeof(wchar_t), - NULL)) { - msg = caml_copy_string_of_utf16(buffer); - } else { - msg = caml_alloc_sprintf("unknown error #%d", errnum); - } - caml_raise_sys_error(msg); -} - -int caml_read_fd(int fd, int flags, void * buf, int n) -{ - int retcode; - if ((flags & CHANNEL_FLAG_FROM_SOCKET) == 0) { - caml_enter_blocking_section(); - retcode = read(fd, buf, n); - /* Large reads from console can fail with ENOMEM. Reduce requested size - and try again. */ - if (retcode == -1 && errno == ENOMEM && n > 16384) { - retcode = read(fd, buf, 16384); - } - caml_leave_blocking_section(); - if (retcode == -1) caml_sys_io_error(NO_ARG); - } else { - caml_enter_blocking_section(); - retcode = recv((SOCKET) _get_osfhandle(fd), buf, n, 0); - caml_leave_blocking_section(); - if (retcode == -1) caml_win32_sys_error(WSAGetLastError()); - } - return retcode; -} - -int caml_write_fd(int fd, int flags, void * buf, int n) -{ - int retcode; - if ((flags & CHANNEL_FLAG_FROM_SOCKET) == 0) { -#if defined(NATIVE_CODE) && defined(WITH_SPACETIME) - if (flags & CHANNEL_FLAG_BLOCKING_WRITE) { - retcode = write(fd, buf, n); - } else { -#endif - caml_enter_blocking_section(); - retcode = write(fd, buf, n); - caml_leave_blocking_section(); -#if defined(NATIVE_CODE) && defined(WITH_SPACETIME) - } -#endif - if (retcode == -1) caml_sys_io_error(NO_ARG); - } else { - caml_enter_blocking_section(); - retcode = send((SOCKET) _get_osfhandle(fd), buf, n, 0); - caml_leave_blocking_section(); - if (retcode == -1) caml_win32_sys_error(WSAGetLastError()); - } - CAMLassert (retcode > 0); - return retcode; -} - -wchar_t * caml_decompose_path(struct ext_table * tbl, wchar_t * path) -{ - wchar_t * p, * q; - int n; - - if (path == NULL) return NULL; - p = caml_stat_wcsdup(path); - q = p; - while (1) { - for (n = 0; q[n] != 0 && q[n] != L';'; n++) /*nothing*/; - caml_ext_table_add(tbl, q); - q = q + n; - if (*q == 0) break; - *q = 0; - q += 1; - } - return p; -} - -wchar_t * caml_search_in_path(struct ext_table * path, const wchar_t * name) -{ - wchar_t * dir, * fullname; - char * u8; - const wchar_t * p; - int i; - struct _stati64 st; - - for (p = name; *p != 0; p++) { - if (*p == '/' || *p == '\\') goto not_found; - } - for (i = 0; i < path->size; i++) { - dir = path->contents[i]; - if (dir[0] == 0) continue; - /* not sure what empty path components mean under Windows */ - fullname = caml_stat_wcsconcat(3, dir, L"\\", name); - u8 = caml_stat_strdup_of_utf16(fullname); - caml_gc_message(0x100, "Searching %s\n", u8); - caml_stat_free(u8); - if (_wstati64(fullname, &st) == 0 && S_ISREG(st.st_mode)) - return fullname; - caml_stat_free(fullname); - } - not_found: - u8 = caml_stat_strdup_of_utf16(name); - caml_gc_message(0x100, "%s not found in search path\n", u8); - caml_stat_free(u8); - return caml_stat_wcsdup(name); -} - -CAMLexport wchar_t * caml_search_exe_in_path(const wchar_t * name) -{ - wchar_t * fullname, * filepart; - char * u8; - size_t fullnamelen; - DWORD retcode; - - fullnamelen = wcslen(name) + 1; - if (fullnamelen < 256) fullnamelen = 256; - while (1) { - fullname = caml_stat_alloc(fullnamelen*sizeof(wchar_t)); - retcode = SearchPath(NULL, /* use system search path */ - name, - L".exe", /* add .exe extension if needed */ - fullnamelen, - fullname, - &filepart); - if (retcode == 0) { - u8 = caml_stat_strdup_of_utf16(name); - caml_gc_message(0x100, "%s not found in search path\n", u8); - caml_stat_free(u8); - caml_stat_free(fullname); - return caml_stat_strdup_os(name); - } - if (retcode < fullnamelen) - return fullname; - caml_stat_free(fullname); - fullnamelen = retcode + 1; - } -} - -wchar_t * caml_search_dll_in_path(struct ext_table * path, const wchar_t * name) -{ - wchar_t * dllname; - wchar_t * res; - - dllname = caml_stat_wcsconcat(2, name, L".dll"); - res = caml_search_in_path(path, dllname); - caml_stat_free(dllname); - return res; -} - -#ifdef SUPPORT_DYNAMIC_LINKING - -void * caml_dlopen(wchar_t * libname, int for_execution, int global) -{ - void *handle; - int flags = (global ? FLEXDLL_RTLD_GLOBAL : 0); - if (!for_execution) flags |= FLEXDLL_RTLD_NOEXEC; - handle = flexdll_wdlopen(libname, flags); - if ((handle != NULL) && ((caml_verb_gc & 0x100) != 0)) { - flexdll_dump_exports(handle); - fflush(stdout); - } - return handle; -} - -void caml_dlclose(void * handle) -{ - flexdll_dlclose(handle); -} - -void * caml_dlsym(void * handle, const char * name) -{ - return flexdll_dlsym(handle, name); -} - -void * caml_globalsym(const char * name) -{ - return flexdll_dlsym(flexdll_dlopen(NULL,0), name); -} - -char * caml_dlerror(void) -{ - return flexdll_dlerror(); -} - -#else - -void * caml_dlopen(wchar_t * libname, int for_execution, int global) -{ - return NULL; -} - -void caml_dlclose(void * handle) -{ -} - -void * caml_dlsym(void * handle, const char * name) -{ - return NULL; -} - -void * caml_globalsym(const char * name) -{ - return NULL; -} - -char * caml_dlerror(void) -{ - return "dynamic loading not supported on this platform"; -} - -#endif - -/* Proper emulation of signal(), including ctrl-C and ctrl-break */ - -typedef void (*sighandler)(int sig); -static int ctrl_handler_installed = 0; -static volatile sighandler ctrl_handler_action = SIG_DFL; - -static BOOL WINAPI ctrl_handler(DWORD event) -{ - /* Only ctrl-C and ctrl-Break are handled */ - if (event != CTRL_C_EVENT && event != CTRL_BREAK_EVENT) return FALSE; - /* Default behavior is to exit, which we get by not handling the event */ - if (ctrl_handler_action == SIG_DFL) return FALSE; - /* Ignore behavior is to do nothing, which we get by claiming that we - have handled the event */ - if (ctrl_handler_action == SIG_IGN) return TRUE; - /* Win32 doesn't like it when we do a longjmp() at this point - (it looks like we're running in a different thread than - the main program!). So, just record the signal. */ - caml_record_signal(SIGINT); - /* We have handled the event */ - return TRUE; -} - -sighandler caml_win32_signal(int sig, sighandler action) -{ - sighandler oldaction; - - if (sig != SIGINT) return signal(sig, action); - if (! ctrl_handler_installed) { - SetConsoleCtrlHandler(ctrl_handler, TRUE); - ctrl_handler_installed = 1; - } - oldaction = ctrl_handler_action; - ctrl_handler_action = action; - return oldaction; -} - -/* Expansion of @responsefile and *? file patterns in the command line */ - -static int argc; -static wchar_t ** argv; -static int argvsize; - -static void store_argument(wchar_t * arg); -static void expand_argument(wchar_t * arg); -static void expand_pattern(wchar_t * arg); - -static void out_of_memory(void) -{ - fprintf(stderr, "Out of memory while expanding command line\n"); - exit(2); -} - -static void store_argument(wchar_t * arg) -{ - if (argc + 1 >= argvsize) { - argvsize *= 2; - argv = (wchar_t **) caml_stat_resize_noexc(argv, argvsize * sizeof(wchar_t *)); - if (argv == NULL) out_of_memory(); - } - argv[argc++] = arg; -} - -static void expand_argument(wchar_t * arg) -{ - wchar_t * p; - - for (p = arg; *p != 0; p++) { - if (*p == L'*' || *p == L'?') { - expand_pattern(arg); - return; - } - } - store_argument(arg); -} - -static void expand_pattern(wchar_t * pat) -{ - wchar_t * prefix, * p, * name; - intptr_t handle; - struct _wfinddata_t ffblk; - size_t i; - - handle = _wfindfirst(pat, &ffblk); - if (handle == -1) { - store_argument(pat); /* a la Bourne shell */ - return; - } - prefix = caml_stat_wcsdup(pat); - /* We need to stop at the first directory or drive boundary, because the - * _findata_t structure contains the filename, not the leading directory. */ - for (i = wcslen(prefix); i > 0; i--) { - wchar_t c = prefix[i - 1]; - if (c == L'\\' || c == L'/' || c == L':') { prefix[i] = 0; break; } - } - /* No separator was found, it's a filename pattern without a leading directory. */ - if (i == 0) - prefix[0] = 0; - do { - name = caml_stat_wcsconcat(2, prefix, ffblk.name); - store_argument(name); - } while (_wfindnext(handle, &ffblk) != -1); - _findclose(handle); - caml_stat_free(prefix); -} - - -CAMLexport void caml_expand_command_line(int * argcp, wchar_t *** argvp) -{ - int i; - argc = 0; - argvsize = 16; - argv = (wchar_t **) caml_stat_alloc_noexc(argvsize * sizeof(wchar_t *)); - if (argv == NULL) out_of_memory(); - for (i = 0; i < *argcp; i++) expand_argument((*argvp)[i]); - argv[argc] = NULL; - *argcp = argc; - *argvp = argv; -} - -/* Add to [contents] the (short) names of the files contained in - the directory named [dirname]. No entries are added for [.] and [..]. - Return 0 on success, -1 on error; set errno in the case of error. */ - -int caml_read_directory(wchar_t * dirname, struct ext_table * contents) -{ - size_t dirnamelen; - wchar_t * template; - intptr_t h; - struct _wfinddata_t fileinfo; - - dirnamelen = wcslen(dirname); - if (dirnamelen > 0 && - (dirname[dirnamelen - 1] == L'/' - || dirname[dirnamelen - 1] == L'\\' - || dirname[dirnamelen - 1] == L':')) - template = caml_stat_wcsconcat(2, dirname, L"*.*"); - else - template = caml_stat_wcsconcat(2, dirname, L"\\*.*"); - h = _wfindfirst(template, &fileinfo); - if (h == -1) { - caml_stat_free(template); - return errno == ENOENT ? 0 : -1; - } - do { - if (wcscmp(fileinfo.name, L".") != 0 && wcscmp(fileinfo.name, L"..") != 0) { - caml_ext_table_add(contents, caml_stat_strdup_of_utf16(fileinfo.name)); - } - } while (_wfindnext(h, &fileinfo) == 0); - _findclose(h); - caml_stat_free(template); - return 0; -} - -#ifndef NATIVE_CODE - -/* Set up a new thread for control-C emulation and termination */ - -void caml_signal_thread(void * lpParam) -{ - wchar_t *endptr; - HANDLE h; - /* Get an hexa-code raw handle through the environment */ - h = (HANDLE) (uintptr_t) - wcstol(caml_secure_getenv(_T("CAMLSIGPIPE")), &endptr, 16); - while (1) { - DWORD numread; - BOOL ret; - char iobuf[2]; - /* This shall always return a single character */ - ret = ReadFile(h, iobuf, 1, &numread, NULL); - if (!ret || numread != 1) caml_sys_exit(Val_int(2)); - switch (iobuf[0]) { - case 'C': - caml_record_signal(SIGINT); - break; - case 'T': - raise(SIGTERM); - return; - } - } -} - -#endif /* NATIVE_CODE */ - -#if defined(NATIVE_CODE) - -/* Handling of system stack overflow. - * Based on code provided by Olivier Andrieu. - - * An EXCEPTION_STACK_OVERFLOW is signaled when the guard page at the - * end of the stack has been accessed. Windows clears the PAGE_GUARD - * protection (making it a regular PAGE_READWRITE) and then calls our - * exception handler. This means that although we're handling an "out - * of stack" condition, there is a bit of stack available to call - * functions and allocate temporaries. - * - * PAGE_GUARD is a one-shot access protection mechanism: we need to - * restore the PAGE_GUARD protection on this page otherwise the next - * stack overflow won't be detected and the program will abruptly exit - * with STATUS_ACCESS_VIOLATION. - * - * Visual Studio 2003 and later (_MSC_VER >= 1300) have a - * _resetstkoflw() function that resets this protection. - * Unfortunately, it cannot work when called directly from the - * exception handler because at this point we are using the page that - * is to be protected. - * - * A solution is to use an alternate stack when restoring the - * protection. However it's not possible to use _resetstkoflw() then - * since it determines the stack pointer by calling alloca(): it would - * try to protect the alternate stack. - * - * Finally, we call caml_raise_stack_overflow; it will either call - * caml_raise_exception which switches back to the normal stack, or - * call caml_fatal_uncaught_exception which terminates the program - * quickly. - */ - -static uintnat win32_alt_stack[0x100]; - -static void caml_reset_stack (void *faulting_address) -{ - SYSTEM_INFO si; - DWORD page_size; - MEMORY_BASIC_INFORMATION mbi; - DWORD oldprot; - - /* get the system's page size. */ - GetSystemInfo (&si); - page_size = si.dwPageSize; - - /* get some information on the page the fault occurred */ - if (! VirtualQuery (faulting_address, &mbi, sizeof mbi)) - goto failed; - - VirtualProtect (mbi.BaseAddress, page_size, - mbi.Protect | PAGE_GUARD, &oldprot); - - failed: - caml_raise_stack_overflow(); -} - - -#ifndef _WIN64 -static LONG CALLBACK - caml_stack_overflow_VEH (EXCEPTION_POINTERS* exn_info) -{ - DWORD code = exn_info->ExceptionRecord->ExceptionCode; - CONTEXT *ctx = exn_info->ContextRecord; - DWORD *ctx_ip = &(ctx->Eip); - DWORD *ctx_sp = &(ctx->Esp); - - if (code == EXCEPTION_STACK_OVERFLOW && Is_in_code_area (*ctx_ip)) - { - uintnat faulting_address; - uintnat * alt_esp; - - /* grab the address that caused the fault */ - faulting_address = exn_info->ExceptionRecord->ExceptionInformation[1]; - - /* call caml_reset_stack(faulting_address) using the alternate stack */ - alt_esp = win32_alt_stack + sizeof(win32_alt_stack) / sizeof(uintnat); - *--alt_esp = faulting_address; - *ctx_sp = (uintnat) (alt_esp - 1); - *ctx_ip = (uintnat) &caml_reset_stack; - - return EXCEPTION_CONTINUE_EXECUTION; - } - - return EXCEPTION_CONTINUE_SEARCH; -} - -#else -extern char *caml_exception_pointer; -extern value *caml_young_ptr; - -/* Do not use the macro from address_class.h here. */ -#undef Is_in_code_area -#define Is_in_code_area(pc) \ - ( ((char *)(pc) >= caml_code_area_start && \ - (char *)(pc) <= caml_code_area_end) \ -|| ((char *)(pc) >= &caml_system__code_begin && \ - (char *)(pc) <= &caml_system__code_end) \ -|| (Classify_addr(pc) & In_code_area) ) -extern char caml_system__code_begin, caml_system__code_end; - - -static LONG CALLBACK - caml_stack_overflow_VEH (EXCEPTION_POINTERS* exn_info) -{ - DWORD code = exn_info->ExceptionRecord->ExceptionCode; - CONTEXT *ctx = exn_info->ContextRecord; - - if (code == EXCEPTION_STACK_OVERFLOW && Is_in_code_area (ctx->Rip)) - { - uintnat faulting_address; - uintnat * alt_rsp; - - /* grab the address that caused the fault */ - faulting_address = exn_info->ExceptionRecord->ExceptionInformation[1]; - - /* refresh runtime parameters from registers */ - caml_exception_pointer = (char *) ctx->R14; - caml_young_ptr = (value *) ctx->R15; - - /* call caml_reset_stack(faulting_address) using the alternate stack */ - alt_rsp = win32_alt_stack + sizeof(win32_alt_stack) / sizeof(uintnat); - ctx->Rcx = faulting_address; - ctx->Rsp = (uintnat) (alt_rsp - 4 - 1); - ctx->Rip = (uintnat) &caml_reset_stack; - - return EXCEPTION_CONTINUE_EXECUTION; - } - - return EXCEPTION_CONTINUE_SEARCH; -} -#endif /* _WIN64 */ - -void caml_win32_overflow_detection(void) -{ - AddVectoredExceptionHandler(1, caml_stack_overflow_VEH); -} - -#endif /* NATIVE_CODE */ - -/* Seeding of pseudo-random number generators */ - -int caml_win32_random_seed (intnat data[16]) -{ - /* For better randomness, consider: - http://msdn.microsoft.com/library/en-us/seccrypto/security/rtlgenrandom.asp - http://blogs.msdn.com/b/michael_howard/archive/2005/01/14/353379.aspx - */ - FILETIME t; - LARGE_INTEGER pc; - GetSystemTimeAsFileTime(&t); - QueryPerformanceCounter(&pc); /* PR#6032 */ - data[0] = t.dwLowDateTime; - data[1] = t.dwHighDateTime; - data[2] = GetCurrentProcessId(); - data[3] = pc.LowPart; - data[4] = pc.HighPart; - return 5; -} - - -#if defined(_MSC_VER) && __STDC_SECURE_LIB__ >= 200411L - -static void invalid_parameter_handler(const wchar_t* expression, - const wchar_t* function, - const wchar_t* file, - unsigned int line, - uintptr_t pReserved) -{ - /* no crash box */ -} - - -void caml_install_invalid_parameter_handler() -{ - _set_invalid_parameter_handler(invalid_parameter_handler); -} - -#endif - - -/* Recover executable name */ - -wchar_t * caml_executable_name(void) -{ - wchar_t * name; - DWORD namelen, ret; - - namelen = 256; - while (1) { - name = caml_stat_alloc(namelen*sizeof(wchar_t)); - ret = GetModuleFileName(NULL, name, namelen); - if (ret == 0) { caml_stat_free(name); return NULL; } - if (ret < namelen) break; - caml_stat_free(name); - if (namelen >= 1024*1024) return NULL; /* avoid runaway and overflow */ - namelen *= 2; - } - return name; -} - -/* snprintf emulation */ - -#ifdef LACKS_VSCPRINTF -/* No _vscprintf until Visual Studio .NET 2002 and sadly no version number - in the CRT headers until Visual Studio 2005 so forced to predicate this - on the compiler version instead */ -int _vscprintf(const char * format, va_list args) -{ - int n; - int sz = 5; - char* buf = (char*)malloc(sz); - n = _vsnprintf(buf, sz, format, args); - while (n < 0 || n > sz) { - sz += 512; - buf = (char*)realloc(buf, sz); - n = _vsnprintf(buf, sz, format, args); - } - free(buf); - return n; -} -#endif - -#if defined(_WIN32) && !defined(_UCRT) -int caml_snprintf(char * buf, size_t size, const char * format, ...) -{ - int len; - va_list args; - - if (size > 0) { - va_start(args, format); - len = _vsnprintf(buf, size, format, args); - va_end(args); - if (len >= 0 && len < size) { - /* [len] characters were stored in [buf], - a null-terminator was appended. */ - return len; - } - /* [size] characters were stored in [buf], without null termination. - Put a null terminator, truncating the output. */ - buf[size - 1] = 0; - } - /* Compute the actual length of output, excluding null terminator */ - va_start(args, format); - len = _vscprintf(format, args); - va_end(args); - return len; -} -#endif - -wchar_t *caml_secure_getenv (wchar_t const *var) -{ - /* Win32 doesn't have a notion of setuid bit, so getenv is safe. */ - return _wgetenv(var); -} - -/* caml_win32_getenv is used to implement Sys.getenv and Unix.getenv in such a - way that they get direct access to the Win32 environment rather than to the - copy that is cached by the C runtime system. The result of caml_win32_getenv - is dynamically allocated and must be explicitly deallocated. - - In contrast, the OCaml runtime system still calls _wgetenv from the C runtime - system, via caml_secure_getenv. The result is statically allocated and needs - no deallocation. */ -CAMLexport wchar_t *caml_win32_getenv(wchar_t const *lpName) -{ - wchar_t * lpBuffer; - DWORD nSize = 256, res; - - lpBuffer = caml_stat_alloc_noexc(nSize * sizeof(wchar_t)); - - if (lpBuffer == NULL) - return NULL; - - res = GetEnvironmentVariable(lpName, lpBuffer, nSize); - - if (res == 0) { - caml_stat_free(lpBuffer); - return NULL; - } - - if (res < nSize) - return lpBuffer; - - nSize = res; - lpBuffer = caml_stat_resize_noexc(lpBuffer, nSize * sizeof(wchar_t)); - - if (lpBuffer == NULL) - return NULL; - - res = GetEnvironmentVariable(lpName, lpBuffer, nSize); - - if (res == 0 || res >= nSize) { - caml_stat_free(lpBuffer); - return NULL; - } - - return lpBuffer; -} - -/* The rename() implementation in MSVC's CRT is based on MoveFile() - and therefore fails if the new name exists. This is inconsistent - with POSIX and a problem in practice. Here we reimplement - rename() using MoveFileEx() to make it more POSIX-like. - There are no official guarantee that the rename operation is atomic, - but it is widely believed to be atomic on NTFS. */ - -int caml_win32_rename(const wchar_t * oldpath, const wchar_t * newpath) -{ - /* MOVEFILE_REPLACE_EXISTING: to be closer to POSIX - MOVEFILE_COPY_ALLOWED: MoveFile performs a copy if old and new - paths are on different devices, so we do the same here for - compatibility with the old rename()-based implementation. - MOVEFILE_WRITE_THROUGH: not sure it's useful; affects only - the case where a copy is done. */ - if (MoveFileEx(oldpath, newpath, - MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH | - MOVEFILE_COPY_ALLOWED)) { - return 0; - } - /* Modest attempt at mapping Win32 error codes to POSIX error codes. - The __dosmaperr() function from the CRT does a better job but is - generally not accessible. */ - switch (GetLastError()) { - case ERROR_FILE_NOT_FOUND: case ERROR_PATH_NOT_FOUND: - errno = ENOENT; break; - case ERROR_ACCESS_DENIED: case ERROR_WRITE_PROTECT: case ERROR_CANNOT_MAKE: - errno = EACCES; break; - case ERROR_CURRENT_DIRECTORY: case ERROR_BUSY: - errno = EBUSY; break; - case ERROR_NOT_SAME_DEVICE: - errno = EXDEV; break; - case ERROR_ALREADY_EXISTS: - errno = EEXIST; break; - default: - errno = EINVAL; - } - return -1; -} - -/* Windows Unicode support */ -static uintnat windows_unicode_enabled = WINDOWS_UNICODE; - -/* If [windows_unicode_strict] is non-zero, then illegal UTF-8 characters (on - the OCaml side) or illegal UTF-16 characters (on the Windows side) cause an - error to be signaled. What happens then depends on the variable - [windows_unicode_fallback]. - - If [windows_unicode_strict] is zero, then illegal characters are silently - dropped. */ -static uintnat windows_unicode_strict = 1; - -/* If [windows_unicode_fallback] is non-zero, then if an error is signaled when - translating to UTF-16, the translation is re-done under the assumption that - the argument string is encoded in the local codepage. */ -static uintnat windows_unicode_fallback = 1; - -CAMLexport int win_multi_byte_to_wide_char(const char *s, int slen, wchar_t *out, int outlen) -{ - int retcode; - - CAMLassert (s != NULL); - - if (slen == 0) - return 0; - - if (windows_unicode_enabled != 0) { - retcode = MultiByteToWideChar(CP_UTF8, windows_unicode_strict ? MB_ERR_INVALID_CHARS : 0, s, slen, out, outlen); - if (retcode == 0 && windows_unicode_fallback != 0) - retcode = MultiByteToWideChar(CP_THREAD_ACP, 0, s, slen, out, outlen); - } else { - retcode = MultiByteToWideChar(CP_THREAD_ACP, 0, s, slen, out, outlen); - } - - if (retcode == 0) - caml_win32_sys_error(GetLastError()); - - return retcode; -} - -#ifndef WC_ERR_INVALID_CHARS /* For old versions of Windows we simply ignore the flag */ -#define WC_ERR_INVALID_CHARS 0 -#endif - -CAMLexport int win_wide_char_to_multi_byte(const wchar_t *s, int slen, char *out, int outlen) -{ - int retcode; - - CAMLassert(s != NULL); - - if (slen == 0) - return 0; - - if (windows_unicode_enabled != 0) - retcode = WideCharToMultiByte(CP_UTF8, windows_unicode_strict ? WC_ERR_INVALID_CHARS : 0, s, slen, out, outlen, NULL, NULL); - else - retcode = WideCharToMultiByte(CP_THREAD_ACP, 0, s, slen, out, outlen, NULL, NULL); - - if (retcode == 0) - caml_win32_sys_error(GetLastError()); - - return retcode; -} - -CAMLexport value caml_copy_string_of_utf16(const wchar_t *s) -{ - int retcode, slen; - value v; - - slen = wcslen(s); - retcode = win_wide_char_to_multi_byte(s, slen, NULL, 0); /* Do not include final NULL */ - v = caml_alloc_string(retcode); - win_wide_char_to_multi_byte(s, slen, String_val(v), retcode); - - return v; -} - -CAMLexport inline wchar_t* caml_stat_strdup_to_utf16(const char *s) -{ - wchar_t * ws; - int retcode; - - retcode = win_multi_byte_to_wide_char(s, -1, NULL, 0); - ws = malloc(retcode * sizeof(*ws)); - win_multi_byte_to_wide_char(s, -1, ws, retcode); - - return ws; -} - -CAMLexport caml_stat_string caml_stat_strdup_of_utf16(const wchar_t *s) -{ - caml_stat_string out; - int retcode; - - retcode = win_wide_char_to_multi_byte(s, -1, NULL, 0); - out = caml_stat_alloc(retcode); - win_wide_char_to_multi_byte(s, -1, out, retcode); - - return out; -} - -void caml_probe_win32_version(void) -{ - /* Determine the version of Windows we're running, and cache it */ - WCHAR fileName[MAX_PATH]; - DWORD size = - GetModuleFileName(GetModuleHandle(L"kernel32"), fileName, MAX_PATH); - DWORD dwHandle = 0; - BYTE* versionInfo; - fileName[size] = 0; - size = GetFileVersionInfoSize(fileName, &dwHandle); - versionInfo = (BYTE*)malloc(size * sizeof(BYTE)); - if (GetFileVersionInfo(fileName, 0, size, versionInfo)) { - UINT len = 0; - VS_FIXEDFILEINFO* vsfi = NULL; - VerQueryValue(versionInfo, L"\\", (void**)&vsfi, &len); - caml_win32_major = HIWORD(vsfi->dwProductVersionMS); - caml_win32_minor = LOWORD(vsfi->dwProductVersionMS); - caml_win32_build = HIWORD(vsfi->dwProductVersionLS); - caml_win32_revision = LOWORD(vsfi->dwProductVersionLS); - } - free(versionInfo); -} - -static UINT startup_codepage = 0; - -void caml_setup_win32_terminal(void) -{ - if (caml_win32_major >= 10) { - startup_codepage = GetConsoleOutputCP(); - if (startup_codepage != CP_UTF8) - SetConsoleOutputCP(CP_UTF8); - } -} - -void caml_restore_win32_terminal(void) -{ - if (startup_codepage != 0) - SetConsoleOutputCP(startup_codepage); -} - -/* Detect if a named pipe corresponds to a Cygwin/MSYS pty: see - https://github.com/mirror/newlib-cygwin/blob/00e9bf2/winsup/cygwin/dtable.cc#L932 -*/ -typedef -BOOL (WINAPI *tGetFileInformationByHandleEx)(HANDLE, FILE_INFO_BY_HANDLE_CLASS, - LPVOID, DWORD); - -static int caml_win32_is_cygwin_pty(HANDLE hFile) -{ - char buffer[1024]; - FILE_NAME_INFO * nameinfo = (FILE_NAME_INFO *) buffer; - static tGetFileInformationByHandleEx pGetFileInformationByHandleEx = INVALID_HANDLE_VALUE; - - if (pGetFileInformationByHandleEx == INVALID_HANDLE_VALUE) - pGetFileInformationByHandleEx = - (tGetFileInformationByHandleEx)GetProcAddress(GetModuleHandle(L"KERNEL32.DLL"), - "GetFileInformationByHandleEx"); - - if (pGetFileInformationByHandleEx == NULL) - return 0; - - /* Get pipe name. GetFileInformationByHandleEx does not NULL-terminate the string, so reduce - the buffer size to allow for adding one. */ - if (! pGetFileInformationByHandleEx(hFile, FileNameInfo, buffer, sizeof(buffer) - sizeof(WCHAR))) - return 0; - - nameinfo->FileName[nameinfo->FileNameLength / sizeof(WCHAR)] = L'\0'; - - /* check if this could be a msys pty pipe ('msys-XXXX-ptyN-XX') - or a cygwin pty pipe ('cygwin-XXXX-ptyN-XX') */ - if ((wcsstr(nameinfo->FileName, L"msys-") || - wcsstr(nameinfo->FileName, L"cygwin-")) && wcsstr(nameinfo->FileName, L"-pty")) - return 1; - - return 0; -} - -CAMLexport int caml_win32_isatty(int fd) -{ - DWORD lpMode; - HANDLE hFile = (HANDLE)_get_osfhandle(fd); - - if (hFile == INVALID_HANDLE_VALUE) - return 0; - - switch (GetFileType(hFile)) { - case FILE_TYPE_CHAR: - /* Both console handles and the NUL device are FILE_TYPE_CHAR. The NUL - device returns FALSE for a GetConsoleMode call. _isatty incorrectly - only uses GetFileType (see GPR#1321). */ - return GetConsoleMode(hFile, &lpMode); - case FILE_TYPE_PIPE: - /* Cygwin PTYs are implemented using named pipes */ - return caml_win32_is_cygwin_pty(hFile); - default: - break; - } - - return 0; -} - -int caml_num_rows_fd(int fd) -{ - return -1; -} diff --git a/test/monniaux/picosat-965/Makefile b/test/monniaux/picosat-965/Makefile new file mode 100644 index 00000000..69613a79 --- /dev/null +++ b/test/monniaux/picosat-965/Makefile @@ -0,0 +1,34 @@ +EXECUTE_ARGS=sudoku.sat + +include ../rules.mk + +ALL_CFLAGS = -DNDEBUG +EMBEDDED_CFLAGS = -DNALARM -DNZIP -DNGETRUSAGE +K1C_CFLAGS += $(EMBEDDED_CFLAGS) +K1C_CCOMPFLAGS += $(EMBEDDED_CFLAGS) +CCOMPFLAGS += -fbitfields +K1C_CCOMPFLAGS += -fbitfields + +K1C_CFLAGS += $(ALL_CFLAGS) +K1C_CCOMPFLAGS += $(ALL_CFLAGS) +CCOMPFLAGS += $(ALL_CFLAGS) +CFLAGS += $(ALL_CFLAGS) + +all: picosat.ccomp.k1c.s version.ccomp.k1c.s app.ccomp.k1c.s main.ccomp.k1c.s picosat.gcc.k1c.s version.gcc.k1c.s app.gcc.k1c.s main.gcc.k1c.s picosat.ccomp.k1c.out picosat.gcc.k1c.out picosat.ccomp.host.out picosat.gcc.host.out + +picosat.ccomp.k1c : picosat.ccomp.k1c.s version.ccomp.k1c.s app.ccomp.k1c.s main.ccomp.k1c.s ../clock.gcc.k1c.o + $(K1C_CCOMP) $(K1C_CCOMPFLAGS) $+ -o $@ + +picosat.gcc.k1c : picosat.gcc.k1c.s version.gcc.k1c.s app.gcc.k1c.s main.gcc.k1c.s ../clock.gcc.k1c.o + $(K1C_CC) $(K1C_CFLAGS) $+ -o $@ + +picosat.ccomp.host : picosat.ccomp.host.s version.ccomp.host.s app.ccomp.host.s main.ccomp.host.s ../clock.gcc.host.o + $(CCOMP) $(CCOMPFLAGS) $+ -o $@ + +picosat.gcc.host : picosat.gcc.host.s version.gcc.host.s app.gcc.host.s main.gcc.host.s ../clock.gcc.host.o + $(CC) $(FLAGS) $+ -o $@ + +clean: + -rm -f *.s *.k1c *.out + +.PHONY: clean diff --git a/test/monniaux/picosat-965/app.c b/test/monniaux/picosat-965/app.c index d817cf21..64ebdbd0 100644 --- a/test/monniaux/picosat-965/app.c +++ b/test/monniaux/picosat-965/app.c @@ -12,7 +12,7 @@ #define BUNZIP2 "bzcat %s" #define GZIP "gzip -c -f > %s" -#if 0 +#ifndef NZIP FILE * popen (const char *, const char*); int pclose (FILE *); #endif @@ -542,7 +542,7 @@ picosat_main (int argc, char **argv) unsigned seed; FILE *file; int trace; - + start_time = picosat_time_stamp (); sargc = argc; diff --git a/test/monniaux/picosat-965/main.c b/test/monniaux/picosat-965/main.c index 03fad79f..13d7b0e5 100644 --- a/test/monniaux/picosat-965/main.c +++ b/test/monniaux/picosat-965/main.c @@ -1,7 +1,25 @@ +#define VERIMAG_MEASUREMENTS +#ifdef VERIMAG_MEASUREMENTS +#include "../clock.h" +#endif + int picosat_main (int, char **); int main (int argc, char **argv) { - return picosat_main (argc, argv); + +#ifdef VERIMAG_MEASUREMENTS + clock_prepare(); + clock_start(); +#endif + + int ret= picosat_main (argc, argv); + +#ifdef VERIMAG_MEASUREMENTS + clock_stop(); + print_total_clock(); +#endif + + return ret; } diff --git a/test/monniaux/picosat-965/picosat.c b/test/monniaux/picosat-965/picosat.c index aca9d962..21442f44 100644 --- a/test/monniaux/picosat-965/picosat.c +++ b/test/monniaux/picosat-965/picosat.c @@ -31,6 +31,8 @@ IN THE SOFTWARE. #include "picosat.h" +#define INLINE inline + /* By default code for 'all different constraints' is disabled, since 'NADC' * is defined. */ @@ -730,7 +732,7 @@ struct PicoSAT typedef PicoSAT PS; -static Flt +static INLINE Flt packflt (unsigned m, int e) { Flt res; @@ -942,13 +944,13 @@ flt2double (Flt f) #endif -static int +static INLINE int log2flt (Flt a) { return FLTEXPONENT (a) + 24; } -static int +static INLINE int cmpflt (Flt a, Flt b) { if (a < b) @@ -1058,19 +1060,19 @@ resize (PS * ps, void *void_ptr, size_t old_size, size_t new_size) return b->data; } -static unsigned +static INLINE unsigned int2unsigned (int l) { return (l < 0) ? 1 + 2 * -l : 2 * l; } -static Lit * +static INLINE Lit * int2lit (PS * ps, int l) { return ps->lits + int2unsigned (l); } -static Lit ** +static INLINE Lit ** end_of_lits (Cls * c) { return (Lit**)c->lits + c->size; @@ -1153,7 +1155,7 @@ dumpcnf (PS * ps) #endif -static void +static INLINE void delete_prefix (PS * ps) { if (!ps->prefix) @@ -1437,7 +1439,7 @@ lrelease (PS * ps, Ltk * stk) #ifndef NADC -static unsigned +static INLINE unsigned llength (Lit ** a) { Lit ** p; @@ -1446,7 +1448,7 @@ llength (Lit ** a) return p - a; } -static void +static INLINE void resetadoconflict (PS * ps) { assert (ps->adoconflict); @@ -1454,7 +1456,7 @@ resetadoconflict (PS * ps) ps->adoconflict = 0; } -static void +static INLINE void reset_ados (PS * ps) { Lit *** p; @@ -1565,7 +1567,7 @@ tpush (PS * ps, Lit * lit) *ps->thead++ = lit; } -static void +static INLINE void assign_reason (PS * ps, Var * v, Cls * reason) { #if defined(NO_BINARY_CLAUSES) && !defined(NDEBUG) @@ -1665,7 +1667,7 @@ cmp_added (PS * ps, Lit * k, Lit * l) return u - v; /* smaller index first */ } -static void +static INLINE void sorttwolits (Lit ** v) { Lit * a = v[0], * b = v[1]; @@ -1689,7 +1691,7 @@ sortlits (PS * ps, Lit ** v, unsigned size) } #ifdef NO_BINARY_CLAUSES -static Cls * +static INLINE Cls * setimpl (PS * ps, Lit * a, Lit * b) { assert (!ps->implvalid); @@ -1704,7 +1706,7 @@ setimpl (PS * ps, Lit * a, Lit * b) return &ps->impl; } -static void +static INLINE void resetimpl (PS * ps) { ps->implvalid = 0; @@ -1725,7 +1727,7 @@ setcimpl (PS * ps, Lit * a, Lit * b) return &ps->cimpl; } -static void +static INLINE void resetcimpl (PS * ps) { assert (ps->cimplvalid); @@ -1734,7 +1736,7 @@ resetcimpl (PS * ps) #endif -static int +static INLINE int cmp_ptr (PS * ps, void *l, void *k) { (void) ps; @@ -1831,7 +1833,7 @@ add_antecedent (PS * ps, Cls * c) #endif /* TRACE */ -static void +static INLINE void add_lit (PS * ps, Lit * lit) { assert (lit); @@ -1842,7 +1844,7 @@ add_lit (PS * ps, Lit * lit) *ps->ahead++ = lit; } -static void +static INLINE void push_var_as_marked (PS * ps, Var * v) { if (ps->mhead == ps->eom) @@ -1851,7 +1853,7 @@ push_var_as_marked (PS * ps, Var * v) *ps->mhead++ = v; } -static void +static INLINE void mark_var (PS * ps, Var * v) { assert (!v->mark); @@ -1960,7 +1962,7 @@ fixvar (PS * ps, Var * v) hup (ps, r); } -static void +static INLINE void use_var (PS * ps, Var * v) { if (v->used) @@ -2104,7 +2106,7 @@ zpush (PS * ps, Zhn * zhain) *ps->zhead++ = zhain; } -static int +static INLINE int cmp_resolved (PS * ps, Cls * c, Cls * d) { #ifndef NDEBUG @@ -2115,7 +2117,7 @@ cmp_resolved (PS * ps, Cls * c, Cls * d) return CLS2IDX (c) - CLS2IDX (d); } -static void +static INLINE void bpushc (PS * ps, unsigned char ch) { if (ps->bhead == ps->eob) @@ -2124,7 +2126,7 @@ bpushc (PS * ps, unsigned char ch) *ps->bhead++ = ch; } -static void +static INLINE void bpushu (PS * ps, unsigned u) { while (u & ~0x7f) @@ -2136,7 +2138,7 @@ bpushu (PS * ps, unsigned u) bpushc (ps, u); } -static void +static INLINE void bpushd (PS * ps, unsigned prev, unsigned this) { unsigned delta; @@ -2802,7 +2804,7 @@ hpush (PS * ps, Rnk * r) hup (ps, r); } -static void +static INLINE void fix_trail_lits (PS * ps, long delta) { Lit **p; @@ -2847,7 +2849,7 @@ fix_clause_lits (PS * ps, long delta) } } -static void +static INLINE void fix_added_lits (PS * ps, long delta) { Lit **p; @@ -2855,7 +2857,7 @@ fix_added_lits (PS * ps, long delta) *p += delta; } -static void +static INLINE void fix_assumed_lits (PS * ps, long delta) { Lit **p; @@ -2863,7 +2865,7 @@ fix_assumed_lits (PS * ps, long delta) *p += delta; } -static void +static INLINE void fix_cls_lits (PS * ps, long delta) { Lit **p; @@ -2871,7 +2873,7 @@ fix_cls_lits (PS * ps, long delta) *p += delta; } -static void +static INLINE void fix_heap_rnks (PS * ps, long delta) { Rnk **p; @@ -2882,7 +2884,7 @@ fix_heap_rnks (PS * ps, long delta) #ifndef NADC -static void +static INLINE void fix_ado (long delta, Lit ** ado) { Lit ** p; @@ -2890,7 +2892,7 @@ fix_ado (long delta, Lit ** ado) *p += delta; } -static void +static INLINE void fix_ados (PS * ps, long delta) { Lit *** p; @@ -3051,7 +3053,7 @@ var2reason (PS * ps, Var * var) return res; } -static void +static INLINE void mark_clause_to_be_collected (Cls * c) { assert (!c->collect); @@ -3171,7 +3173,7 @@ mb (PS * ps) return ps->current_bytes / (double) (1 << 20); } -static double +static INLINE double avglevel (PS * ps) { return ps->decisions ? ps->levelsum / ps->decisions : 0.0; @@ -3497,13 +3499,13 @@ inc_activity (PS * ps, Cls * c) *p = addflt (*p, ps->cinc); } -static unsigned +static INLINE unsigned hashlevel (unsigned l) { return 1u << (l & 31); } -static void +static INLINE void push (PS * ps, Var * v) { if (ps->dhead == ps->eod) @@ -3512,7 +3514,7 @@ push (PS * ps, Var * v) *ps->dhead++ = v; } -static Var * +static INLINE Var * pop (PS * ps) { assert (ps->dfs < ps->dhead); @@ -4551,7 +4553,7 @@ force (PS * ps, Cls * c) assign_forced (ps, forced, reason); } -static void +static INLINE void inc_lreduce (PS * ps) { #ifdef STATS @@ -4811,7 +4813,7 @@ collect_clauses (PS * ps) return res; } -static int +static INLINE int need_to_reduce (PS * ps) { return ps->nlclauses >= reduce_limit_on_lclauses (ps); @@ -4975,7 +4977,7 @@ assign_decision (PS * ps, Lit * lit) #ifndef NFL -static int +static INLINE int lit_has_binary_clauses (PS * ps, Lit * lit) { #ifdef NO_BINARY_CLAUSES @@ -4998,7 +5000,7 @@ flbcp (PS * ps) #endif } -inline static int +inline static INLINE int cmp_inverse_rnk (PS * ps, Rnk * a, Rnk * b) { (void) ps; @@ -5635,7 +5637,7 @@ init_reduce (PS * ps) ps->prefix, ps->prefix, ps->lreduce, ps->prefix); } -static unsigned +static INLINE unsigned rng (PS * ps) { unsigned res = ps->srng; @@ -6429,25 +6431,25 @@ reset_assumptions (PS * ps) ps->adecidelevel = 0; } -static void +static INLINE void check_ready (PS * ps) { ABORTIF (!ps || ps->state == RESET, "API usage: uninitialized"); } -static void +static INLINE void check_sat_state (PS * ps) { ABORTIF (ps->state != SAT, "API usage: expected to be in SAT state"); } -static void +static INLINE void check_unsat_state (PS * ps) { ABORTIF (ps->state != UNSAT, "API usage: expected to be in UNSAT state"); } -static void +static INLINE void check_sat_or_unsat_or_unknown_state (PS * ps) { ABORTIF (ps->state != SAT && ps->state != UNSAT && ps->state != UNKNOWN, @@ -6525,7 +6527,7 @@ enter (PS * ps) ps->entered = picosat_time_stamp (); } -static void +static INLINE void leave (PS * ps) { assert (ps->nentered); diff --git a/test/monniaux/rules.mk b/test/monniaux/rules.mk index c8412479..09b845e5 100644 --- a/test/monniaux/rules.mk +++ b/test/monniaux/rules.mk @@ -1,15 +1,17 @@ +ALL_CCOMPFLAGS=-fno-unprototyped CCOMP=ccomp -CCOMPFLAGS=-g -O3 -Wall -fno-unprototyped +CCOMPFLAGS=-g -O3 -Wall $(ALL_CCOMPFLAGS) $(ALL_CFLAGS) -CFLAGS=-g -std=c99 -O3 -Wall -Wextra -Werror=implicit +CFLAGS=-g -std=c99 -O3 -Wall -Wextra -Werror=implicit $(ALL_CFLAGS) K1C_CC=k1-mbr-gcc -K1C_CFLAGS =-g -std=c99 -O2 -Wall -Wextra -Werror=implicit +K1C_CFLAGS =-g -std=c99 -O2 -Wall -Wextra -Werror=implicit $(ALL_CFLAGS) K1C_CCOMP = ../../../ccomp -K1C_CCOMPFLAGS=-O3 -Wall -Wno-c11-extensions -fno-unprototyped # -fpostpass-ilp +K1C_CCOMPFLAGS=-O3 -Wall -Wno-c11-extensions $(ALL_CCOMPFLAGS) $(ALL_CFLAGS) # -fpostpass-ilp EXECUTE=k1-cluster --syscall=libstd_scalls.so -- +EXECUTE_CYCLES=k1-cluster --syscall=libstd_scalls.so --cycle-based -- %.gcc.host.o : %.gcc.host.s $(CC) $(CFLAGS) -c -o $@ $< @@ -48,7 +50,7 @@ EXECUTE=k1-cluster --syscall=libstd_scalls.so -- # $(CCOMP) $(CCOMPFLAGS) $+ -o $@ %.k1c.out : %.k1c - k1-cluster --cycle-based -- $< |tee $@ + $(EXECUTE_CYCLES) $< $(EXECUTE_ARGS) |tee $@ %.host.out : %.host - ./$< |tee $@ + ./$< $(EXECUTE_ARGS) |tee $@ diff --git a/test/monniaux/ternary/Makefile b/test/monniaux/ternary/Makefile new file mode 100644 index 00000000..b051b397 --- /dev/null +++ b/test/monniaux/ternary/Makefile @@ -0,0 +1,26 @@ +include ../rules.mk + +PRODUCTS=ternary.gcc.host.out ternary.ccomp.host.out \ + ternary.gcc.k1c.out ternary.ccomp.k1c.out \ + ternary.gcc.k1c.s ternary.ccomp.k1c.s + +all: $(PRODUCTS) + +ternary.gcc.host.s ternary.ccomp.host.s ternary.gcc.k1c.s ternary.ccomp.k1c.s : ../clock.h + +ternary.ccomp.host: ternary.ccomp.host.o ../clock.gcc.host.o + $(CCOMP) $(CCOMPFLAGS) $+ -o $@ + +ternary.gcc.host: ternary.gcc.host.o ../clock.gcc.host.o + $(CC) $(CFLAGS) $+ -o $@ + +ternary.gcc.k1c: ternary.gcc.k1c.o ../clock.gcc.k1c.o + $(K1C_CC) $(K1C_CFLAGS) $+ -o $@ + +ternary.ccomp.k1c: ternary.ccomp.k1c.o ../clock.gcc.k1c.o + $(K1C_CCOMP) $(K1C_CCOMPFLAGS) $+ -o $@ + +clean: + -rm -f *.o *.s *.k1c + +.PHONY: clean diff --git a/test/monniaux/ternary/ternary.c b/test/monniaux/ternary/ternary.c new file mode 100644 index 00000000..79025639 --- /dev/null +++ b/test/monniaux/ternary/ternary.c @@ -0,0 +1,29 @@ +#include <stdint.h> +#include <stdio.h> +#include <inttypes.h> +#include "../clock.h" + +typedef uint32_t data; + +#if 0 +#define TERNARY(a, b, c) ((a) ? (b) : (c)) +#else +#define TERNARY(a, b, c) (((-(a)) & (b)) | ((-1+(a)) & (c))) +#endif + +data silly_computation(void) { + data x = 1; + for(int i=0; i<10000; i++) { + x = x * TERNARY(((x & 0x100) != 0), 45561U, 337777U); + } + return x; +} + +int main() { + clock_prepare(); + clock_start(); + data result = silly_computation(); + clock_stop(); + printf("result=%" PRIu32 "\ncycles=%" PRIu64 "\n", result, get_total_clock()); + return 0; +} diff --git a/test/monniaux/too_slow/Makefile b/test/monniaux/too_slow/Makefile new file mode 100644 index 00000000..bdc23def --- /dev/null +++ b/test/monniaux/too_slow/Makefile @@ -0,0 +1,27 @@ +include ../rules.mk + +PRODUCTS=memset_from_bitsliced-aes.gcc.host.out memset_from_bitsliced-aes.ccomp.host.out \ + memset_from_bitsliced-aes.gcc.k1c.out memset_from_bitsliced-aes.ccomp.k1c.out \ + memset_from_bitsliced-aes.gcc.k1c.s memset_from_bitsliced-aes.ccomp.k1c.s + +all: $(PRODUCTS) + +memset_from_bitsliced-aes.gcc.host.s memset_from_bitsliced-aes.ccomp.host.s memset_from_bitsliced-aes.gcc.k1c.s memset_from_bitsliced-aes.ccomp.k1c.s : ../clock.h + +memset_from_bitsliced-aes.ccomp.host: memset_from_bitsliced-aes.ccomp.host.o ../clock.gcc.host.o + $(CCOMP) $(CCOMPFLAGS) $+ -o $@ + +memset_from_bitsliced-aes.gcc.host: memset_from_bitsliced-aes.gcc.host.o ../clock.gcc.host.o + $(CC) $(CFLAGS) $+ -o $@ + +memset_from_bitsliced-aes.gcc.k1c: memset_from_bitsliced-aes.gcc.k1c.o ../clock.gcc.k1c.o + $(K1C_CC) $(K1C_CFLAGS) $+ -o $@ + +memset_from_bitsliced-aes.ccomp.k1c: memset_from_bitsliced-aes.ccomp.k1c.o ../clock.gcc.k1c.o + $(K1C_CCOMP) $(K1C_CCOMPFLAGS) $+ -o $@ + +clean: + -rm -f *.o *.s *.k1c + +.PHONY: clean + diff --git a/test/monniaux/too_slow/memset_from_bitsliced-aes.c b/test/monniaux/too_slow/memset_from_bitsliced-aes.c new file mode 100644 index 00000000..32137b55 --- /dev/null +++ b/test/monniaux/too_slow/memset_from_bitsliced-aes.c @@ -0,0 +1,43 @@ +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include "../clock.h" + +typedef uint64_t a; +a n[128]; +int o, bs_expand_key_k; +void b(a (*)[], uint8_t *); +void c(uint8_t d, uint8_t e, size_t f, uint8_t g, uint8_t iv) { + a i[1]; + b(i, g); +} + +void b(a (*i)[], uint8_t *j) { + for (; o < 176; o += 8) { + bs_expand_key_k = 4; + for (; bs_expand_key_k < 128; bs_expand_key_k += 128 / 64) + ; + memset(n, 0, sizeof(n)); + } +} + +void aes_ctr_test() { + uint8_t k = ""; + uint8_t l = ""; + uint8_t m = ""; + uint8_t output[4]; + c(output, m, 4, k, l); +} + +int main(int argc, char * argv[]) +{ + clock_prepare(); + + clock_start(); + + aes_ctr_test(); + clock_stop(); + print_total_clock(); + + return 0; +} |