aboutsummaryrefslogtreecommitdiffstats
path: root/test/monniaux/bitsliced-aes
diff options
context:
space:
mode:
author David Monniaux <david.monniaux@univ-grenoble-alpes.fr> 2019-03-27 08:18:59 +0100
committer David Monniaux <david.monniaux@univ-grenoble-alpes.fr> 2019-03-27 08:18:59 +0100
commit 820c0fef3aa0715c15988800564612dcc86f81b4 (patch)
tree d7f6e82c71c3f3818e02e558efd540b402a57bd4 /test/monniaux/bitsliced-aes
parent 0d8f4f46407b1634fba4f6cd46ba4955a7859863 (diff)
download compcert-kvx-820c0fef3aa0715c15988800564612dcc86f81b4.tar.gz
compcert-kvx-820c0fef3aa0715c15988800564612dcc86f81b4.zip
hand optimized
Diffstat (limited to 'test/monniaux/bitsliced-aes')
-rw-r--r-- test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized 3268
-rw-r--r-- test/monniaux/bitsliced-aes/notes.txt 3
2 files changed, 3271 insertions, 0 deletions
diff --git a/test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized b/test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized
new file mode 100644
index 00000000..d939f856
--- /dev/null
+++ b/test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized
@@ -0,0 +1,3268 @@
+# File generated by CompCert 3.5
+# Command line: -O3 -Wall -Wno-c11-extensions -fno-unprototyped -S bs.c -o bs.ccomp.k1c.s
+ .text
+ .balign 2
+ .globl bs_addroundkey
+bs_addroundkey:  # XORs 128 64-bit words read from [$r1] into the 128 words at [$r0], in place
+ addd $r17 = $r12, 0  # r17 = value of $r12 (used as stack pointer) on entry
+ addd $r12 = $r12, -16  # reserve a 16-byte frame
+;;
+ sd 0[$r12] = $r17  # frame+0: entry stack pointer
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16  # frame+8: saved $ra, reloaded before ret
+;;
+ make $r5, 0  # r5 = loop index i
+;;
+.L100:
+ sxwd $r6 = $r5  # r6 = i; relies on same-bundle reads seeing r5 before the increment below
+ addw $r5 = $r5, 1  # i = i + 1
+ make $r32, 128  # loop bound
+;;
+ slld $r2 = $r6, 3  # byte offset = i * 8
+ compw.lt $r32 = $r5, $r32  # r32 = (incremented i is below 128)
+;;
+ addd $r3 = $r0, $r2  # address of destination word i
+ addd $r4 = $r1, $r2  # address of source word i
+;;
+ ld $r7 = 0[$r3]
+;;
+ ld $r9 = 0[$r4]
+;;
+ xord $r6 = $r7, $r9  # destination word XOR source word
+;;
+ sd 0[$r3] = $r6  # written back to the destination array
+;;
+ cb.wnez $r32? .L100  # repeat until 128 words are processed
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16  # restore return address
+;;
+ addd $r12 = $r12, 16  # release the frame
+;;
+ ret
+;;
+ .type bs_addroundkey, @function
+ .size bs_addroundkey, . - bs_addroundkey
+ .text
+ .balign 2
+ .globl bs_apply_sbox
+bs_apply_sbox:  # calls bs_sbox on every 8-word (64-byte) group of the 128-word array at $r0
+ addd $r17 = $r12, 0  # r17 = value of $r12 (used as stack pointer) on entry
+ addd $r12 = $r12, -32  # 32-byte frame: entry SP, $ra, r18, r19
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16  # frame+8: saved $ra, reloaded after the loop of calls
+;;
+ sd 16[$r12] = $r18  # r18 is saved here and restored before ret
+ addd $r18 = $r0, 0  # r18 = array base (r0 is reused as the call argument)
+;;
+ sd 24[$r12] = $r19  # r19 is saved here and restored before ret
+ make $r19, 0  # r19 = word index i
+;;
+.L101:
+ sxwd $r1 = $r19
+;;
+ slld $r0 = $r1, 3  # byte offset = i * 8
+;;
+ addd $r0 = $r18, $r0  # argument in r0: address of word i
+ call bs_sbox
+;;
+ addw $r19 = $r19, 8  # advance to the next group of 8 words
+ make $r32, 128
+;;
+ compw.lt $r32 = $r19, $r32
+;;
+ cb.wnez $r32? .L101  # loop while i is below 128 (16 calls in total)
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16  # restore return address
+;;
+ addd $r12 = $r12, 32  # release the frame
+;;
+ ret
+;;
+ .type bs_apply_sbox, @function
+ .size bs_apply_sbox, . - bs_apply_sbox
+ .text
+ .balign 2
+ .globl bs_apply_sbox_rev
+bs_apply_sbox_rev:  # calls bs_sbox_rev on every 8-word (64-byte) group of the 128-word array at $r0
+ addd $r17 = $r12, 0  # r17 = value of $r12 (used as stack pointer) on entry
+ addd $r12 = $r12, -32  # 32-byte frame: entry SP, $ra, r18, r19
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16  # frame+8: saved $ra, reloaded after the loop of calls
+;;
+ sd 16[$r12] = $r18  # r18 is saved here and restored before ret
+ addd $r18 = $r0, 0  # r18 = array base (r0 is reused as the call argument)
+;;
+ sd 24[$r12] = $r19  # r19 is saved here and restored before ret
+ make $r19, 0  # r19 = word index i
+;;
+.L102:
+ sxwd $r1 = $r19
+;;
+ slld $r0 = $r1, 3  # byte offset = i * 8
+;;
+ addd $r0 = $r18, $r0  # argument in r0: address of word i
+ call bs_sbox_rev
+;;
+ addw $r19 = $r19, 8  # advance to the next group of 8 words
+ make $r32, 128
+;;
+ compw.lt $r32 = $r19, $r32
+;;
+ cb.wnez $r32? .L102  # loop while i is below 128 (16 calls in total)
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16  # restore return address
+;;
+ addd $r12 = $r12, 32  # release the frame
+;;
+ ret
+;;
+ .type bs_apply_sbox_rev, @function
+ .size bs_apply_sbox_rev, . - bs_apply_sbox_rev
+ .text
+ .balign 2
+ .globl bs_sbox_rev
+bs_sbox_rev:  # reads 8 words at $r0, runs them through an XOR/XNOR/AND network, copies the 8 results back to $r0
+ addd $r17 = $r12, 0  # r17 = value of $r12 (used as stack pointer) on entry
+ addd $r12 = $r12, -96  # 96-byte frame: 32 bytes of saves + 64-byte result buffer at frame+32
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16  # frame+8: saved $ra (memmove is called below)
+;;
+ sd 16[$r12] = $r18  # r18 is used as scratch by the network below
+;;
+ sd 24[$r12] = $r19  # r19 is used as scratch by the network below
+;;
+ ld $r7 = 48[$r0]  # input word 6
+;;
+ ld $r3 = 56[$r0]  # input word 7
+;;
+ ld $r4 = 32[$r0]  # input word 4
+ nxord $r40 = $r3, $r7
+;;
+ xord $r10 = $r3, $r4
+ nxord $r11 = $r7, $r4
+ ld $r2 = 8[$r0]  # input word 1
+;;
+ ld $r5 = 24[$r0]  # input word 3
+ nxord $r45 = $r7, $r10
+ xord $r59 = $r7, $r2
+;;
+ xord $r41 = $r4, $r5
+ ld $r1 = 0[$r0]  # input word 0
+ xord $r60 = $r5, $r45
+ andd $r33 = $r10, $r45
+;;
+ nxord $r35 = $r5, $r1
+ xord $r63 = $r2, $r1
+ nxord $r39 = $r1, $r41
+ ld $r6 = 40[$r0]  # input word 5
+;;
+ xord $r46 = $r11, $r63
+ xord $r54 = $r40, $r35
+ ld $r7 = 16[$r0]  # input word 2 (r7 is reused; word 6 is no longer read from it)
+ nxord $r57 = $r6, $r5
+;;
+ xord $r52 = $r40, $r63
+ xord $r50 = $r41, $r63
+ nxord $r47 = $r6, $r7
+ nxord $r38 = $r7, $r2
+;;
+ nxord $r58 = $r6, $r46
+ xord $r19 = $r11, $r47
+ xord $r63 = $r59, $r57
+ xord $r7 = $r41, $r38
+;;
+ xord $r44 = $r35, $r59
+ xord $r18 = $r3, $r47
+ xord $r3 = $r54, $r7
+ xord $r55 = $r54, $r38
+;;
+ nxord $r34 = $r6, $r41
+ xord $r2 = $r50, $r63
+ andd $r57 = $r52, $r19
+ andd $r17 = $r50, $r63
+;;
+ xord $r36 = $r55, $r57
+ andd $r62 = $r46, $r18
+ andd $r53 = $r11, $r39
+ xord $r6 = $r2, $r17
+;;
+ andd $r42 = $r44, $r58
+ andd $r15 = $r41, $r3
+ andd $r2 = $r60, $r7
+ andd $r37 = $r40, $r54
+;;
+ xord $r59 = $r62, $r57
+ xord $r51 = $r42, $r17
+ xord $r8 = $r2, $r15
+ xord $r4 = $r37, $r15
+;;
+ xord $r5 = $r36, $r33
+ xord $r38 = $r59, $r35
+ xord $r48 = $r6, $r53
+ xord $r47 = $r51, $r4
+;;
+ xord $r53 = $r5, $r8
+ xord $r43 = $r38, $r4
+ xord $r56 = $r48, $r8
+ xord $r57 = $r47, $r34
+;;
+ xord $r49 = $r56, $r57
+ andd $r48 = $r56, $r53
+ xord $r47 = $r53, $r43
+ andd $r9 = $r53, $r57
+;;
+ xord $r36 = $r43, $r48
+ xord $r35 = $r57, $r48
+ andd $r62 = $r47, $r9
+ xord $r17 = $r47, $r48
+;;
+ andd $r15 = $r35, $r47
+ andd $r42 = $r36, $r49
+ andd $r47 = $r43, $r56
+ xord $r59 = $r49, $r48
+;;
+ andd $r37 = $r49, $r47
+ xord $r5 = $r43, $r15
+ xord $r4 = $r62, $r17
+ xord $r55 = $r57, $r42
+;;
+ xord $r1 = $r37, $r59
+ xord $r2 = $r5, $r55
+ xord $r47 = $r5, $r4
+ andd $r35 = $r4, $r39
+;;
+ xord $r61 = $r4, $r1
+ xord $r33 = $r55, $r1
+ andd $r62 = $r1, $r45
+ andd $r45 = $r55, $r18
+;;
+ xord $r48 = $r2, $r61
+ andd $r49 = $r2, $r3
+ andd $r6 = $r1, $r10
+ andd $r3 = $r47, $r50
+;;
+ andd $r56 = $r47, $r63
+ andd $r42 = $r5, $r58
+ andd $r1 = $r4, $r11
+ andd $r57 = $r2, $r41
+;;
+ andd $r9 = $r61, $r54
+ andd $r51 = $r33, $r52
+ andd $r58 = $r55, $r46
+ andd $r53 = $r5, $r44
+;;
+ andd $r41 = $r48, $r60
+ andd $r10 = $r61, $r40
+ xord $r59 = $r49, $r57
+ xord $r61 = $r3, $r1
+;;
+ andd $r34 = $r33, $r19
+ andd $r39 = $r48, $r7
+ xord $r55 = $r9, $r41
+ xord $r60 = $r45, $r6
+;;
+ xord $r48 = $r62, $r35
+ xord $r15 = $r56, $r53
+ xord $r44 = $r59, $r61
+ xord $r49 = $r51, $r10
+;;
+ xord $r54 = $r34, $r42
+ xord $r51 = $r58, $r60
+ xord $r59 = $r59, $r48
+ xord $r8 = $r56, $r42
+;;
+ xord $r47 = $r9, $r1
+ xord $r11 = $r60, $r15
+ xord $r40 = $r55, $r44
+ xord $r60 = $r15, $r51
+;;
+ xord $r52 = $r56, $r41
+ xord $r56 = $r10, $r54
+ xord $r2 = $r49, $r59
+ xord $r5 = $r59, $r60
+;;
+ xord $r7 = $r3, $r55
+ xord $r61 = $r51, $r56
+ xord $r59 = $r47, $r11
+ xord $r47 = $r8, $r40
+;;
+ xord $r63 = $r35, $r39
+ xord $r4 = $r34, $r45
+ sd 88[$r12] = $r47  # result word 7 (buffer starts at frame+32)
+ xord $r51 = $r2, $r59
+;;
+ xord $r10 = $r55, $r48
+ xord $r50 = $r44, $r63
+ sd 80[$r12] = $r51  # result word 6
+ xord $r37 = $r7, $r5
+;;
+ xord $r53 = $r54, $r44
+ sd 72[$r12] = $r37  # result word 5
+ xord $r37 = $r4, $r40
+ xord $r40 = $r50, $r61
+;;
+ xord $r1 = $r58, $r57
+ sd 64[$r12] = $r37  # result word 4
+ xord $r46 = $r10, $r53
+ xord $r7 = $r52, $r50
+;;
+ sd 56[$r12] = $r40  # result word 3
+ xord $r49 = $r49, $r1  # uses the r1 computed in the previous bundle, before r1 is overwritten below
+ addd $r1 = $r12, 32  # memmove argument r1 = address of the result buffer
+ make $r2, 64  # memmove argument r2 = 64 bytes
+;;
+ sd 48[$r12] = $r46  # result word 2
+;;
+ sd 40[$r12] = $r7  # result word 1
+;;
+ sd 32[$r12] = $r49  # result word 0
+ call memmove  # r0 was never written in this function, so it is still the incoming pointer
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16  # restore return address
+;;
+ addd $r12 = $r12, 96  # release the frame
+;;
+ ret
+;;
+ .type bs_sbox_rev, @function
+ .size bs_sbox_rev, . - bs_sbox_rev
+ .text
+ .balign 2
+ .globl bs_sbox
+bs_sbox:  # reads 8 words at $r0, runs them through an XOR/XNOR/AND network, copies the 8 results back to $r0
+ addd $r17 = $r12, 0  # r17 = value of $r12 (used as stack pointer) on entry
+ addd $r12 = $r12, -80  # 80-byte frame: 16 bytes of saves + 64-byte result buffer at frame+16
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16  # frame+8: saved $ra (memmove is called below)
+;;
+ ld $r5 = 56[$r0]  # input word 7
+;;
+ ld $r6 = 32[$r0]  # input word 4
+;;
+ xord $r41 = $r5, $r6
+ ld $r2 = 16[$r0]  # input word 2
+;;
+ xord $r42 = $r5, $r2
+ ld $r4 = 8[$r0]  # input word 1
+ xord $r49 = $r6, $r2
+;;
+ xord $r48 = $r5, $r4
+ ld $r55 = 24[$r0]  # input word 3
+;;
+ xord $r9 = $r55, $r4
+ ld $r3 = 48[$r0]  # input word 6
+;;
+ xord $r5 = $r41, $r9
+ ld $r7 = 40[$r0]  # input word 5
+ xord $r34 = $r3, $r2
+;;
+ xord $r10 = $r3, $r7
+ ld $r1 = 0[$r0]  # input word 0
+ xord $r11 = $r7, $r2
+ xord $r3 = $r48, $r49
+;;
+ xord $r33 = $r9, $r34
+ xord $r8 = $r9, $r11
+ xord $r44 = $r6, $r1
+ xord $r47 = $r4, $r1
+;;
+ xord $r59 = $r1, $r10
+ xord $r7 = $r5, $r34
+ xord $r9 = $r10, $r44
+ xord $r4 = $r10, $r47
+;;
+ xord $r61 = $r1, $r5
+ xord $r57 = $r5, $r10
+ andd $r50 = $r3, $r5
+ andd $r43 = $r9, $r1
+;;
+ xord $r34 = $r59, $r8
+ xord $r6 = $r41, $r9
+ xord $r35 = $r42, $r4
+ xord $r36 = $r48, $r8
+;;
+ xord $r38 = $r41, $r11
+ xord $r40 = $r7, $r50
+ xord $r11 = $r43, $r50
+ andd $r50 = $r48, $r8
+;;
+ xord $r15 = $r42, $r57
+ andd $r62 = $r35, $r61
+ andd $r37 = $r4, $r59
+ xord $r52 = $r36, $r50
+;;
+ andd $r53 = $r6, $r34
+ andd $r55 = $r41, $r33
+ andd $r46 = $r49, $r38
+ andd $r54 = $r42, $r57
+;;
+ xord $r39 = $r53, $r50
+ xord $r60 = $r46, $r55
+ xord $r55 = $r54, $r55
+ xord $r10 = $r40, $r62
+;;
+ xord $r44 = $r6, $r34
+ xord $r43 = $r11, $r15
+ xord $r15 = $r52, $r37
+ xord $r17 = $r39, $r55
+;;
+ xord $r45 = $r10, $r60
+ xord $r55 = $r43, $r55
+ xord $r50 = $r15, $r60
+ xord $r46 = $r17, $r44
+;;
+ xord $r63 = $r50, $r46
+ andd $r43 = $r50, $r45
+ xord $r56 = $r45, $r55
+ andd $r54 = $r45, $r46
+;;
+ xord $r36 = $r55, $r43
+ xord $r47 = $r46, $r43
+ andd $r40 = $r56, $r54
+ andd $r60 = $r55, $r50
+;;
+ andd $r2 = $r47, $r56
+ andd $r58 = $r36, $r63
+ xord $r36 = $r56, $r43
+ andd $r15 = $r63, $r60
+;;
+ xord $r47 = $r63, $r43
+ xord $r17 = $r55, $r2
+ xord $r50 = $r40, $r36
+ xord $r52 = $r46, $r58
+;;
+ xord $r58 = $r15, $r47
+ xord $r51 = $r17, $r52
+ xord $r7 = $r17, $r50
+ andd $r43 = $r52, $r1
+;;
+ xord $r53 = $r50, $r58
+ xord $r62 = $r52, $r58
+ andd $r44 = $r7, $r8
+ andd $r8 = $r50, $r59
+;;
+ xord $r40 = $r51, $r53
+ andd $r45 = $r58, $r61
+ andd $r54 = $r51, $r33
+ andd $r10 = $r58, $r35
+;;
+ andd $r47 = $r40, $r38
+ andd $r46 = $r62, $r3
+ andd $r2 = $r51, $r41
+ xord $r35 = $r8, $r10
+;;
+ andd $r5 = $r62, $r5
+ andd $r36 = $r17, $r34
+ andd $r39 = $r53, $r57
+ andd $r56 = $r7, $r48
+;;
+ andd $r41 = $r40, $r49
+ xord $r34 = $r45, $r46
+ xord $r51 = $r44, $r2
+ xord $r62 = $r54, $r47
+;;
+ andd $r38 = $r52, $r9
+ andd $r37 = $r50, $r4
+ andd $r9 = $r17, $r6
+ xord $r59 = $r46, $r35
+;;
+ andd $r61 = $r53, $r42
+ xord $r1 = $r2, $r41
+ xord $r63 = $r5, $r43
+ xord $r33 = $r39, $r56
+;;
+ xord $r42 = $r41, $r51
+ xord $r51 = $r5, $r34
+ xord $r52 = $r36, $r37
+ xord $r49 = $r59, $r62
+;;
+ xord $r57 = $r47, $r33
+ xord $r3 = $r9, $r63
+ xord $r11 = $r54, $r2
+ xord $r50 = $r10, $r1
+;;
+ xord $r37 = $r43, $r36
+ xord $r6 = $r56, $r52
+ xord $r9 = $r51, $r62
+ xord $r40 = $r42, $r49
+;;
+ xord $r36 = $r61, $r33
+ xord $r10 = $r42, $r57
+ xord $r56 = $r52, $r57
+ xord $r57 = $r3, $r11
+;;
+ xord $r5 = $r35, $r51
+ sd 72[$r12] = $r40  # result word 7 (buffer starts at frame+16)
+ nxord $r43 = $r50, $r9
+ nxord $r17 = $r36, $r57
+;;
+ xord $r39 = $r8, $r1
+ xord $r53 = $r38, $r35
+ xord $r8 = $r1, $r35
+ sd 64[$r12] = $r43  # result word 6
+;;
+ xord $r7 = $r34, $r37
+ xord $r58 = $r3, $r53
+ sd 56[$r12] = $r17  # result word 5
+ xord $r38 = $r42, $r5
+;;
+ xord $r48 = $r6, $r63
+ sd 48[$r12] = $r38  # result word 4
+ xord $r1 = $r8, $r7
+ xord $r43 = $r10, $r58
+;;
+ sd 40[$r12] = $r1  # result word 3; stores the r1 from the previous bundle, before r1 is overwritten below
+ nxord $r4 = $r39, $r56
+ nxord $r34 = $r42, $r48
+ addd $r1 = $r12, 16  # memmove argument r1 = address of the result buffer
+;;
+ sd 32[$r12] = $r43  # result word 2
+ make $r2, 64  # memmove argument r2 = 64 bytes
+;;
+ sd 24[$r12] = $r4  # result word 1
+;;
+ sd 16[$r12] = $r34  # result word 0
+ call memmove  # r0 was never written in this function, so it is still the incoming pointer
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16  # restore return address
+;;
+ addd $r12 = $r12, 80  # release the frame
+;;
+ ret
+;;
+ .type bs_sbox, @function
+ .size bs_sbox, . - bs_sbox
+ .text
+ .balign 2
+ .globl bs_transpose
+bs_transpose:  # zeroes a 1024-byte stack buffer, fills it via bs_transpose_dst(buffer, $r0), then copies it over [$r0]
+ addd $r17 = $r12, 0  # r17 = value of $r12 (used as stack pointer) on entry
+ addd $r12 = $r12, -1056  # frame: 24 bytes of saves + 1024-byte buffer at frame+24 (+8 unused)
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16  # frame+8: saved $ra, reloaded after the three calls
+;;
+ sd 16[$r12] = $r18  # r18 is saved here and restored before ret
+ addd $r18 = $r0, 0  # r18 = incoming pointer; reads r0 before it is overwritten in this bundle
+ addd $r0 = $r12, 24  # memset argument r0 = buffer address
+ make $r1, 0  # memset argument r1 = fill value 0
+;;
+ make $r2, 1024  # memset argument r2 = 1024 bytes
+ call memset
+;;
+ addd $r0 = $r12, 24  # bs_transpose_dst argument r0 = buffer (destination)
+ addd $r1 = $r18, 0  # bs_transpose_dst argument r1 = caller's array (source)
+ call bs_transpose_dst
+;;
+ addd $r1 = $r12, 24  # memmove argument r1 = buffer
+ make $r2, 1024  # memmove argument r2 = 1024 bytes
+ addd $r0 = $r18, 0  # memmove argument r0 = caller's array
+ call memmove
+;;
+ ld $r16 = 8[$r12]
+;;
+ ld $r18 = 16[$r12]
+;;
+ set $ra = $r16  # restore return address
+;;
+ addd $r12 = $r12, 1056  # release the frame
+;;
+ ret
+;;
+ .type bs_transpose, @function
+ .size bs_transpose, . - bs_transpose
+ .text
+ .balign 2
+ .globl bs_transpose_dst
+bs_transpose_dst:  # for k in 0..63, i in 0..1, j in 0..63: ORs a k-dependent mask into word (i*64+j) at $r0 when bit j of word (k*2+i) at $r1 is set
+ addd $r17 = $r12, 0  # r17 = value of $r12 (used as stack pointer) on entry
+ addd $r12 = $r12, -16  # reserve a 16-byte frame
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16  # frame+8: saved $ra, reloaded before ret
+;;
+ make $r4, 0  # r4 = outer index k
+;;
+.L103:
+ make $r35, 1
+ make $r17, 0  # r17 = middle index i
+;;
+ slld $r41 = $r35, $r4  # 1 shifted left by k (64-bit shift)
+;;
+ addw $r9 = $r41, 0  # r9 = mask via a 32-bit (w) add; NOTE(review): presumably keeps only the low 32 bits - confirm
+;;
+.L104:
+ sllw $r10 = $r4, 1  # k * 2
+ sllw $r42 = $r17, 6  # i * 64 = first destination word index for this i
+ make $r6, 0  # r6 = inner index j
+;;
+ addw $r36 = $r10, $r17  # source word index = k*2 + i
+;;
+ sxwd $r15 = $r36
+;;
+ slld $r2 = $r15, 3  # byte offset = index * 8
+;;
+ addd $r8 = $r1, $r2
+;;
+ ld $r11 = 0[$r8]  # r11 = source word (k*2+i)
+;;
+.L105:
+ addw $r40 = $r42, $r6  # destination word index = i*64 + j
+ make $r2, 0  # default OR value when the tested bit is clear
+ make $r44, 1
+ make $r32, 64  # inner loop bound
+;;
+ sxwd $r34 = $r40
+ sxwd $r39 = $r9  # mask sign-extended from 32 bits; NOTE(review): appears to be 0 for k >= 32 - confirm intended
+ slld $r37 = $r44, $r6  # 1 shifted left by j (reads j before the increment in this bundle)
+ addw $r6 = $r6, 1  # j = j + 1
+;;
+ ld.xs $r7 = $r34[$r0]  # load destination word; .xs presumably scales the index by 8 - confirm
+ andd $r33 = $r11, $r37  # isolate bit j of the source word
+ compw.lt $r32 = $r6, $r32  # r32 = (incremented j is below 64)
+;;
+ cmoved.dnez $r33? $r2 = $r39  # if bit j is set, OR value = mask; otherwise it stays 0
+;;
+ ord $r38 = $r7, $r2
+;;
+ sd.xs $r34[$r0] = $r38  # write the updated destination word back
+ cb.wnez $r32? .L105
+;;
+ addw $r17 = $r17, 1  # i = i + 1
+ make $r32, 2
+;;
+ compw.lt $r32 = $r17, $r32
+;;
+ cb.wnez $r32? .L104  # loop while i is below 2
+;;
+ addw $r4 = $r4, 1  # k = k + 1
+ make $r32, 64
+;;
+ compw.lt $r32 = $r4, $r32
+;;
+ cb.wnez $r32? .L103  # loop while k is below 64
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16  # restore return address
+;;
+ addd $r12 = $r12, 16  # release the frame
+;;
+ ret
+;;
+ .type bs_transpose_dst, @function
+ .size bs_transpose_dst, . - bs_transpose_dst
+ .text
+ .balign 2
+ .globl bs_transpose_rev
+bs_transpose_rev:  # builds a 128-word result in a zeroed stack buffer from the bits of the 128 words at $r0, then copies it over [$r0]
+ addd $r17 = $r12, 0  # r17 = value of $r12 (used as stack pointer) on entry
+ addd $r12 = $r12, -1056  # frame: 24 bytes of saves + 1024-byte buffer at frame+24 (+8 unused)
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16  # frame+8: saved $ra, reloaded after the calls
+;;
+ sd 16[$r12] = $r18  # r18 is saved here and restored before ret
+ addd $r18 = $r0, 0  # r18 = incoming pointer; reads r0 before it is overwritten in this bundle
+ addd $r0 = $r12, 24  # memset argument r0 = buffer address
+ make $r1, 0  # memset argument r1 = fill value 0
+;;
+ make $r2, 1024  # memset argument r2 = 1024 bytes
+ call memset
+;;
+ make $r3, 0  # r3 = outer index k (source word index)
+;;
+.L106:
+ sxwd $r8 = $r3
+ sraw $r32 = $r3, 31  # sign mask of k: start of a round-toward-zero divide by 64
+ make $r11, 0  # r11 = inner index j (bit position)
+;;
+ slld $r34 = $r8, 3  # byte offset = k * 8
+ srlw $r32 = $r32, 26  # bias of 63 when k is negative, else 0
+;;
+ addd $r6 = $r18, $r34
+ addw $r32 = $r3, $r32
+;;
+ sraw $r2 = $r32, 6  # k / 64
+;;
+ sxwd $r5 = $r2  # r5 = k / 64, used in the destination index
+;;
+ ld $r36 = 0[$r6]  # r36 = source word k
+;;
+.L107:
+ make $r39, 1
+;;
+ slld $r38 = $r39, $r11  # 1 shifted left by j
+;;
+ andd $r17 = $r36, $r38  # isolate bit j of the source word
+;;
+ cb.deqz $r17? .L108  # bit clear: OR value is 0
+;;
+ make $r44, 1
+ sraw $r32 = $r3, 31  # recompute k / 64 with the same sequence as above
+;;
+ srlw $r32 = $r32, 26
+;;
+ addw $r32 = $r3, $r32
+;;
+ sraw $r40 = $r32, 6
+;;
+ sllw $r9 = $r40, 6  # (k / 64) * 64
+;;
+ sbfw $r45 = $r9, $r3  # presumably r3 - r9 (sbf = subtract-from), i.e. k mod 64 - confirm operand order
+;;
+ slld $r0 = $r44, $r45  # OR value = 1 shifted left by that amount
+ goto .L109
+;;
+.L108:
+ make $r0, 0
+;;
+.L109:
+ addd $r37 = $r12, 24  # buffer base
+ sllw $r46 = $r11, 1  # j * 2 (reads j before the increment in this bundle)
+ addw $r11 = $r11, 1  # j = j + 1
+ make $r32, 64
+;;
+ sxwd $r7 = $r46
+ compw.lt $r32 = $r11, $r32  # r32 = (incremented j is below 64)
+;;
+ addd $r4 = $r7, $r5  # destination word index = j*2 + k/64
+;;
+ slld $r10 = $r4, 3
+;;
+ addd $r1 = $r37, $r10
+;;
+ ld $r41 = 0[$r1]
+;;
+ ord $r35 = $r41, $r0  # OR the selected value into the buffer word
+;;
+ sd 0[$r1] = $r35
+;;
+ cb.wnez $r32? .L107
+;;
+ addw $r3 = $r3, 1  # k = k + 1
+ make $r32, 128
+;;
+ compw.lt $r32 = $r3, $r32
+;;
+ cb.wnez $r32? .L106  # loop while k is below 128
+;;
+ addd $r1 = $r12, 24  # memmove argument r1 = buffer
+ make $r2, 1024  # memmove argument r2 = 1024 bytes
+ addd $r0 = $r18, 0  # memmove argument r0 = caller's array
+ call memmove
+;;
+ ld $r16 = 8[$r12]
+;;
+ ld $r18 = 16[$r12]
+;;
+ set $ra = $r16  # restore return address
+;;
+ addd $r12 = $r12, 1056  # release the frame
+;;
+ ret
+;;
+ .type bs_transpose_rev, @function
+ .size bs_transpose_rev, . - bs_transpose_rev
+ .text
+ .balign 2
+ .globl bs_shiftrows
+bs_shiftrows:  # gathers 8-word groups of the 128-word array at $r0 into a stack buffer in permuted order, then copies the buffer over [$r0]
+ addd $r17 = $r12, 0  # r17 = value of $r12 (used as stack pointer) on entry
+ addd $r12 = $r12, -1040  # frame: 16 bytes of saves + 1024-byte buffer at frame+16
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16  # frame+8: saved $ra (memmove is called below)
+;;
+ addd $r50 = $r12, 16  # r50 = destination cursor in the buffer
+ addd $r1 = $r0, 0  # r1 = source pointer 0 (word offset r15)
+ addd $r43 = $r0, 256  # r43 = source pointer 1 (word offset r52)
+ addd $r8 = $r0, 512  # r8 = source pointer 2 (word offset r3)
+;;
+ addd $r60 = $r0, 768  # r60 = source pointer 3 (word offset r36)
+ make $r15, 0  # word offsets of the four sources: 0, 32, 64, 96
+ make $r52, 32
+ make $r3, 64
+;;
+ make $r36, 96
+ make $r7, 0  # r7 = iteration counter
+;;
+.L110:
+ ld $r5 = 0[$r1]  # copy 8 words from source 0 to buffer bytes 0..63 of this row
+ addw $r59 = $r52, 40  # source 1 offset advances by 40 words ...
+ addw $r7 = $r7, 1
+ make $r32, 4
+;;
+ sd 0[$r50] = $r5
+ andw $r52 = $r59, 127  # ... modulo 128
+ addw $r63 = $r36, 40  # source 3 offset advances by 40 words ...
+ compw.lt $r32 = $r7, $r32  # r32 = (counter is below 4), tested at the loop end
+;;
+ andw $r36 = $r63, 127  # ... modulo 128
+ sxwd $r62 = $r52
+;;
+ slld $r53 = $r62, 3  # byte offset for the next source 1 pointer
+;;
+ ld $r11 = 8[$r1]
+;;
+ sd 8[$r50] = $r11
+;;
+ ld $r61 = 16[$r1]
+;;
+ sd 16[$r50] = $r61
+;;
+ ld $r6 = 24[$r1]
+;;
+ sd 24[$r50] = $r6
+;;
+ ld $r56 = 32[$r1]
+;;
+ sd 32[$r50] = $r56
+;;
+ ld $r2 = 40[$r1]
+;;
+ sd 40[$r50] = $r2  # stores the loaded word; r2 is reused as scratch in the same bundle
+ addw $r2 = $r3, 40  # source 2 offset advances by 40 words ...
+;;
+ andw $r3 = $r2, 127  # ... modulo 128
+ sxwd $r2 = $r36
+;;
+ sxwd $r5 = $r3
+ slld $r39 = $r2, 3  # byte offset for the next source 3 pointer
+;;
+ ld $r38 = 48[$r1]
+ slld $r46 = $r5, 3  # byte offset for the next source 2 pointer
+;;
+ sd 48[$r50] = $r38
+;;
+ ld $r54 = 56[$r1]
+;;
+ sd 56[$r50] = $r54
+;;
+ ld $r4 = 0[$r43]  # copy 8 words from source 1 to buffer bytes 256..319 of this row
+;;
+ sd 256[$r50] = $r4
+;;
+ ld $r58 = 8[$r43]
+;;
+ sd 264[$r50] = $r58
+;;
+ ld $r10 = 16[$r43]
+;;
+ sd 272[$r50] = $r10
+;;
+ ld $r34 = 24[$r43]
+;;
+ sd 280[$r50] = $r34
+;;
+ ld $r51 = 32[$r43]
+;;
+ sd 288[$r50] = $r51
+;;
+ ld $r9 = 40[$r43]
+;;
+ sd 296[$r50] = $r9
+;;
+ ld $r1 = 48[$r43]  # r1 is free as a temporary here: its loads are done and it is reassigned below
+;;
+ sd 304[$r50] = $r1
+;;
+ ld $r4 = 56[$r43]
+ addd $r43 = $r0, $r53  # next source 1 pointer (the load in this bundle still uses the old r43)
+;;
+ sd 312[$r50] = $r4
+;;
+ ld $r41 = 0[$r8]  # copy 8 words from source 2 to buffer bytes 512..575 of this row
+;;
+ sd 512[$r50] = $r41
+;;
+ ld $r9 = 8[$r8]
+;;
+ sd 520[$r50] = $r9
+;;
+ ld $r6 = 16[$r8]
+;;
+ sd 528[$r50] = $r6
+;;
+ ld $r9 = 24[$r8]
+;;
+ sd 536[$r50] = $r9
+;;
+ ld $r42 = 32[$r8]
+;;
+ sd 544[$r50] = $r42
+;;
+ ld $r35 = 40[$r8]
+;;
+ sd 552[$r50] = $r35
+;;
+ ld $r10 = 48[$r8]
+;;
+ sd 560[$r50] = $r10
+;;
+ ld $r57 = 56[$r8]
+;;
+ sd 568[$r50] = $r57
+;;
+ ld $r17 = 0[$r60]  # copy 8 words from source 3 to buffer bytes 768..831 of this row
+;;
+ sd 768[$r50] = $r17
+;;
+ ld $r8 = 8[$r60]  # r8 is free as a temporary here: its loads are done and it is reassigned below
+;;
+ sd 776[$r50] = $r8
+ addw $r8 = $r15, 40  # source 0 offset advances by 40 words ...
+;;
+ andw $r15 = $r8, 127  # ... modulo 128
+ addd $r8 = $r0, $r46  # next source 2 pointer
+;;
+ sxwd $r37 = $r15
+;;
+ ld $r48 = 16[$r60]
+ slld $r40 = $r37, 3
+;;
+ sd 784[$r50] = $r48
+ addd $r1 = $r0, $r40  # next source 0 pointer
+;;
+ ld $r33 = 24[$r60]
+;;
+ sd 792[$r50] = $r33
+;;
+ ld $r47 = 32[$r60]
+;;
+ sd 800[$r50] = $r47
+;;
+ ld $r4 = 40[$r60]
+;;
+ sd 808[$r50] = $r4
+;;
+ ld $r44 = 48[$r60]
+;;
+ sd 816[$r50] = $r44
+;;
+ ld $r49 = 56[$r60]
+ addd $r60 = $r0, $r39  # next source 3 pointer (the load in this bundle still uses the old r60)
+;;
+ sd 824[$r50] = $r49
+ addd $r50 = $r50, 64  # destination cursor moves on by 8 words
+ cb.wnez $r32? .L110  # four iterations in total
+;;
+ addd $r1 = $r12, 16  # memmove argument r1 = buffer
+ make $r2, 1024  # memmove argument r2 = 1024 bytes
+ call memmove  # r0 was never written in this function, so it is still the incoming pointer
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16  # restore return address
+;;
+ addd $r12 = $r12, 1040  # release the frame
+;;
+ ret
+;;
+ .type bs_shiftrows, @function
+ .size bs_shiftrows, . - bs_shiftrows
+ .text
+ .balign 2
+ .globl bs_shiftrows_rev
+bs_shiftrows_rev:  # scatters 8-word groups of the 128-word array at $r0 into a stack buffer in permuted order, then copies the buffer over [$r0]
+ addd $r17 = $r12, 0  # r17 = value of $r12 (used as stack pointer) on entry
+ addd $r12 = $r12, -1040  # frame: 16 bytes of saves + 1024-byte buffer at frame+16
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16  # frame+8: saved $ra (memmove is called below)
+;;
+ addd $r56 = $r12, 16  # r56 = buffer base, used to rebuild the destination pointers
+ addd $r34 = $r12, 16  # r34 = destination pointer 0 (word offset r4)
+ addd $r45 = $r12, 272  # r45 = destination pointer 1 (word offset r10)
+ addd $r6 = $r12, 528  # r6 = destination pointer 2 (word offset r55)
+;;
+ addd $r62 = $r12, 784  # r62 = destination pointer 3 (word offset r2)
+ make $r4, 0  # word offsets of the four destinations: 0, 32, 64, 96
+ make $r10, 32
+ make $r55, 64
+;;
+ make $r2, 96
+ make $r59, 0  # r59 = iteration counter
+;;
+.L111:
+ ld $r43 = 0[$r0]  # copy source bytes 0..63 of this row to destination 0
+ addw $r9 = $r4, 40  # destination 0 offset advances by 40 words ...
+ addw $r59 = $r59, 1
+ make $r32, 4
+;;
+ sd 0[$r34] = $r43
+ andw $r4 = $r9, 127  # ... modulo 128
+ addw $r51 = $r10, 40  # destination 1 offset advances by 40 words ...
+ compw.lt $r32 = $r59, $r32  # r32 = (counter is below 4), tested at the loop end
+;;
+ andw $r10 = $r51, 127  # ... modulo 128
+ sxwd $r39 = $r4
+;;
+ slld $r60 = $r39, 3  # byte offset for the next destination 0 pointer
+;;
+ ld $r57 = 8[$r0]
+;;
+ sd 8[$r34] = $r57
+;;
+ ld $r63 = 16[$r0]
+;;
+ sd 16[$r34] = $r63
+;;
+ ld $r7 = 24[$r0]
+;;
+ sd 24[$r34] = $r7
+;;
+ ld $r44 = 32[$r0]
+;;
+ sd 32[$r34] = $r44
+;;
+ ld $r42 = 40[$r0]
+;;
+ sd 40[$r34] = $r42
+;;
+ ld $r40 = 48[$r0]
+;;
+ sd 48[$r34] = $r40
+;;
+ ld $r61 = 56[$r0]
+;;
+ sd 56[$r34] = $r61  # the store in this bundle still uses the old r34
+ addd $r34 = $r56, $r60  # next destination 0 pointer
+;;
+ ld $r35 = 256[$r0]  # copy source bytes 256..319 of this row to destination 1
+;;
+ sd 0[$r45] = $r35
+;;
+ ld $r1 = 264[$r0]
+;;
+ sd 8[$r45] = $r1  # stores the loaded word; r1 is reused as scratch in the same bundle
+ addw $r1 = $r2, 40  # destination 3 offset advances by 40 words ...
+;;
+ andw $r2 = $r1, 127  # ... modulo 128
+;;
+ ld $r49 = 272[$r0]
+;;
+ sd 16[$r45] = $r49
+;;
+ ld $r37 = 280[$r0]
+;;
+ sd 24[$r45] = $r37
+;;
+ ld $r54 = 288[$r0]
+;;
+ sd 32[$r45] = $r54
+;;
+ ld $r15 = 296[$r0]
+;;
+ sd 40[$r45] = $r15
+;;
+ ld $r3 = 304[$r0]
+;;
+ sd 48[$r45] = $r3
+;;
+ ld $r5 = 312[$r0]
+;;
+ sd 56[$r45] = $r5
+ sxwd $r5 = $r2
+;;
+ slld $r38 = $r5, 3  # byte offset for the next destination 3 pointer
+;;
+ ld $r53 = 512[$r0]  # copy source bytes 512..575 of this row to destination 2
+;;
+ sd 0[$r6] = $r53
+;;
+ ld $r33 = 520[$r0]
+;;
+ sd 8[$r6] = $r33
+;;
+ ld $r8 = 528[$r0]
+;;
+ sd 16[$r6] = $r8
+;;
+ ld $r11 = 536[$r0]
+;;
+ sd 24[$r6] = $r11
+;;
+ ld $r47 = 544[$r0]
+;;
+ sd 32[$r6] = $r47
+;;
+ ld $r3 = 552[$r0]
+;;
+ sd 40[$r6] = $r3
+;;
+ ld $r17 = 560[$r0]
+;;
+ sd 48[$r6] = $r17
+;;
+ ld $r52 = 568[$r0]
+;;
+ sd 56[$r6] = $r52  # last store through r6 for this row; r6 is reused as scratch and reassigned below
+ sxwd $r6 = $r10
+;;
+ slld $r1 = $r6, 3
+;;
+ addd $r45 = $r56, $r1  # next destination 1 pointer
+;;
+ ld $r8 = 768[$r0]  # copy source bytes 768..831 of this row to destination 3
+;;
+ sd 0[$r62] = $r8
+;;
+ ld $r41 = 776[$r0]
+;;
+ sd 8[$r62] = $r41
+;;
+ ld $r3 = 784[$r0]
+;;
+ sd 16[$r62] = $r3
+ addw $r3 = $r55, 40  # destination 2 offset advances by 40 words ...
+;;
+ andw $r55 = $r3, 127  # ... modulo 128
+;;
+ sxwd $r7 = $r55
+;;
+ ld $r36 = 792[$r0]
+ slld $r58 = $r7, 3
+;;
+ sd 24[$r62] = $r36
+ addd $r6 = $r56, $r58  # next destination 2 pointer
+;;
+ ld $r48 = 800[$r0]
+;;
+ sd 32[$r62] = $r48
+;;
+ ld $r11 = 808[$r0]
+;;
+ sd 40[$r62] = $r11
+;;
+ ld $r46 = 816[$r0]
+;;
+ sd 48[$r62] = $r46
+;;
+ ld $r50 = 824[$r0]  # the load in this bundle still uses the old r0
+ addd $r0 = $r0, 64  # source cursor moves on by 8 words
+;;
+ sd 56[$r62] = $r50  # the store in this bundle still uses the old r62
+ addd $r62 = $r56, $r38  # next destination 3 pointer
+ cb.wnez $r32? .L111  # four iterations in total
+;;
+ addd $r0 = $r0, -256  # undo the four 64-byte advances: memmove argument r0 = incoming pointer
+ addd $r1 = $r12, 16  # memmove argument r1 = buffer
+ make $r2, 1024  # memmove argument r2 = 1024 bytes
+ call memmove
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16  # restore return address
+;;
+ addd $r12 = $r12, 1040  # release the frame
+;;
+ ret
+;;
+ .type bs_shiftrows_rev, @function
+ .size bs_shiftrows_rev, . - bs_shiftrows_rev
+ .text
+ .balign 2
+ .globl bs_shiftmix
+bs_shiftmix:  # four iterations, each XOR-combining words read through four rotating pointers into the 128-word array at $r0 to produce 32 words in a stack buffer; the buffer is then copied over [$r0]
+ addd $r17 = $r12, 0  # r17 = value of $r12 (used as stack pointer) on entry
+ addd $r12 = $r12, -1088  # frame: 64 bytes of saves + 1024-byte buffer at frame+64
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16  # frame+8: saved $ra (memmove is called below)
+;;
+ sd 16[$r12] = $r18  # r18..r23 are saved here and restored before ret
+ addd $r4 = $r0, 256  # r4 = source pointer, initially base + 32 words (offset tracked in r22)
+ addd $r1 = $r0, 512  # r1 = source pointer, initially base + 64 words (offset tracked in r20)
+ addd $r3 = $r0, 768  # r3 = source pointer, initially base + 96 words (offset tracked in r21)
+;;
+ sd 24[$r12] = $r19
+ addd $r19 = $r12, 64  # r19 = destination cursor in the buffer
+ make $r18, 0  # r18 = word offset for r2
+ addd $r2 = $r0, 0  # r2 = source pointer, initially base (offset tracked in r18)
+;;
+ sd 32[$r12] = $r20
+ make $r20, 64  # r20 = word offset for r1
+;;
+ sd 40[$r12] = $r21
+ make $r21, 96  # r21 = word offset for r3
+;;
+ sd 48[$r12] = $r22
+ make $r22, 32  # r22 = word offset for r4
+;;
+ sd 56[$r12] = $r23
+ make $r23, 0  # r23 = iteration counter
+;;
+.L112:
+ ld $r46 = 64[$r4]
+ addw $r23 = $r23, 1
+ make $r32, 4
+;;
+ ld $r8 = 128[$r1]
+ compw.lt $r32 = $r23, $r32  # r32 = (counter is below 4), tested at the loop end
+;;
+ ld $r5 = 56[$r2]
+ xord $r57 = $r46, $r8
+;;
+ ld $r59 = 120[$r4]
+;;
+ xord $r7 = $r5, $r59  # r7 = common term XORed into output words 0, 1, 3 and 4
+ ld $r17 = 192[$r3]
+;;
+ xord $r5 = $r57, $r17
+;;
+ xord $r61 = $r5, $r7
+;;
+ sd 0[$r19] = $r61  # output word 0
+;;
+ ld $r48 = 0[$r2]
+;;
+ ld $r62 = 64[$r4]
+;;
+ xord $r42 = $r48, $r62
+ ld $r60 = 72[$r4]
+;;
+ xord $r5 = $r42, $r60
+ ld $r61 = 136[$r1]
+;;
+ xord $r45 = $r5, $r61
+ ld $r40 = 200[$r3]
+;;
+ xord $r45 = $r45, $r40
+;;
+ xord $r5 = $r45, $r7
+;;
+ sd 8[$r19] = $r5  # output word 1
+;;
+ ld $r11 = 8[$r2]
+;;
+ ld $r51 = 72[$r4]
+;;
+ xord $r45 = $r11, $r51
+ ld $r40 = 80[$r4]
+;;
+ xord $r37 = $r45, $r40
+ ld $r39 = 144[$r1]
+;;
+ xord $r6 = $r37, $r39
+ ld $r42 = 208[$r3]
+;;
+ xord $r59 = $r6, $r42
+;;
+ sd 16[$r19] = $r59  # output word 2
+;;
+ ld $r6 = 16[$r2]
+;;
+ ld $r44 = 80[$r4]
+;;
+ xord $r43 = $r6, $r44
+ ld $r9 = 88[$r4]
+;;
+ xord $r52 = $r43, $r9
+ ld $r46 = 152[$r1]
+;;
+ xord $r42 = $r52, $r46
+ ld $r48 = 216[$r3]
+;;
+ xord $r5 = $r42, $r48
+;;
+ xord $r55 = $r5, $r7
+;;
+ sd 24[$r19] = $r55  # output word 3
+;;
+ ld $r34 = 24[$r2]
+;;
+ ld $r8 = 88[$r4]
+;;
+ xord $r62 = $r34, $r8
+ ld $r47 = 96[$r4]
+;;
+ xord $r38 = $r62, $r47
+ ld $r50 = 160[$r1]
+;;
+ xord $r34 = $r38, $r50
+ ld $r56 = 224[$r3]
+;;
+ xord $r8 = $r34, $r56
+;;
+ xord $r11 = $r8, $r7
+;;
+ sd 32[$r19] = $r11  # output word 4
+;;
+ ld $r5 = 96[$r4]
+;;
+ ld $r53 = 32[$r2]
+;;
+ xord $r44 = $r53, $r5
+ ld $r54 = 168[$r1]
+;;
+ ld $r5 = 104[$r4]
+;;
+ xord $r40 = $r44, $r5
+;;
+ xord $r10 = $r40, $r54
+;;
+ ld $r5 = 232[$r3]
+;;
+ xord $r39 = $r10, $r5
+;;
+ sd 40[$r19] = $r39  # output word 5
+;;
+ ld $r5 = 40[$r2]
+;;
+ ld $r58 = 104[$r4]
+;;
+ xord $r17 = $r5, $r58
+ ld $r15 = 112[$r4]
+;;
+ xord $r37 = $r17, $r15
+ ld $r5 = 176[$r1]
+;;
+ xord $r57 = $r37, $r5
+ ld $r51 = 240[$r3]
+;;
+ xord $r57 = $r57, $r51
+;;
+ sd 48[$r19] = $r57  # output word 6
+;;
+ ld $r40 = 48[$r2]
+;;
+ ld $r52 = 112[$r4]
+;;
+ xord $r35 = $r40, $r52
+ ld $r5 = 120[$r4]
+;;
+ xord $r5 = $r35, $r5
+ ld $r49 = 184[$r1]
+;;
+ xord $r15 = $r5, $r49
+ ld $r34 = 248[$r3]
+;;
+ xord $r46 = $r15, $r34
+;;
+ sd 56[$r19] = $r46  # output word 7
+;;
+ ld $r33 = 0[$r2]
+;;
+ ld $r36 = 128[$r1]
+;;
+ ld $r48 = 120[$r4]
+ xord $r42 = $r33, $r36
+;;
+ ld $r5 = 184[$r1]
+;;
+ xord $r34 = $r48, $r5  # r34 = common term XORed into output words 8, 9, 11 and 12
+ ld $r47 = 192[$r3]
+;;
+ xord $r60 = $r42, $r47
+;;
+ xord $r60 = $r60, $r34
+;;
+ sd 64[$r19] = $r60  # output word 8
+;;
+ ld $r43 = 8[$r2]
+;;
+ ld $r47 = 64[$r4]
+;;
+ xord $r63 = $r43, $r47
+ ld $r52 = 128[$r1]
+;;
+ xord $r5 = $r63, $r52
+ ld $r7 = 136[$r1]
+;;
+ xord $r60 = $r5, $r7
+ ld $r15 = 200[$r3]
+;;
+ xord $r55 = $r60, $r15
+;;
+ xord $r48 = $r55, $r34
+;;
+ sd 72[$r19] = $r48  # output word 9
+;;
+ ld $r56 = 16[$r2]
+;;
+ ld $r5 = 72[$r4]
+;;
+ xord $r7 = $r56, $r5
+ ld $r46 = 136[$r1]
+;;
+ xord $r41 = $r7, $r46
+ ld $r40 = 144[$r1]
+;;
+ xord $r5 = $r41, $r40
+ ld $r47 = 208[$r3]
+;;
+ xord $r5 = $r5, $r47
+;;
+ sd 80[$r19] = $r5  # output word 10
+;;
+ ld $r52 = 24[$r2]
+;;
+ ld $r54 = 80[$r4]
+;;
+ xord $r35 = $r52, $r54
+ ld $r63 = 144[$r1]
+;;
+ xord $r7 = $r35, $r63
+ ld $r8 = 152[$r1]
+;;
+ xord $r33 = $r7, $r8
+ ld $r37 = 216[$r3]
+;;
+ xord $r56 = $r33, $r37
+;;
+ xord $r54 = $r56, $r34
+;;
+ sd 88[$r19] = $r54  # output word 11
+;;
+ ld $r9 = 32[$r2]
+;;
+ ld $r6 = 88[$r4]
+;;
+ xord $r44 = $r9, $r6
+ ld $r51 = 152[$r1]
+;;
+ xord $r35 = $r44, $r51
+ ld $r52 = 160[$r1]
+;;
+ xord $r38 = $r35, $r52
+ ld $r9 = 224[$r3]
+;;
+ xord $r62 = $r38, $r9
+;;
+ xord $r6 = $r62, $r34
+;;
+ sd 96[$r19] = $r6  # output word 12
+;;
+ ld $r15 = 40[$r2]
+;;
+ ld $r17 = 96[$r4]
+;;
+ xord $r36 = $r15, $r17
+ ld $r5 = 160[$r1]
+;;
+ xord $r50 = $r36, $r5
+ ld $r51 = 168[$r1]
+;;
+ xord $r37 = $r50, $r51
+ ld $r42 = 232[$r3]
+;;
+ xord $r58 = $r37, $r42
+;;
+ sd 104[$r19] = $r58  # output word 13
+;;
+ ld $r56 = 48[$r2]
+;;
+ ld $r41 = 104[$r4]
+;;
+ xord $r11 = $r56, $r41
+ ld $r48 = 168[$r1]
+;;
+ xord $r51 = $r11, $r48
+ ld $r58 = 176[$r1]
+;;
+ xord $r61 = $r51, $r58
+ ld $r5 = 240[$r3]
+;;
+ xord $r61 = $r61, $r5
+;;
+ sd 112[$r19] = $r61  # output word 14
+;;
+ ld $r34 = 56[$r2]
+;;
+ ld $r56 = 112[$r4]
+;;
+ xord $r46 = $r34, $r56
+ ld $r9 = 176[$r1]
+;;
+ xord $r62 = $r46, $r9
+ ld $r33 = 184[$r1]
+;;
+ xord $r46 = $r62, $r33
+ ld $r61 = 248[$r3]
+;;
+ xord $r40 = $r46, $r61
+;;
+ sd 120[$r19] = $r40  # output word 15
+;;
+ ld $r5 = 184[$r1]
+;;
+ ld $r59 = 248[$r3]
+;;
+ xord $r43 = $r5, $r59  # r43 = common term XORed into output words 16, 17, 19 and 20
+ ld $r55 = 0[$r2]
+;;
+ ld $r5 = 64[$r4]
+;;
+ xord $r42 = $r55, $r5
+ ld $r35 = 192[$r3]
+;;
+ xord $r49 = $r42, $r35
+;;
+ xord $r5 = $r49, $r43
+;;
+ sd 128[$r19] = $r5  # output word 16
+;;
+ ld $r57 = 8[$r2]
+;;
+ ld $r5 = 72[$r4]
+;;
+ xord $r44 = $r57, $r5
+ ld $r45 = 128[$r1]
+;;
+ xord $r17 = $r44, $r45
+ ld $r33 = 192[$r3]
+;;
+ xord $r52 = $r17, $r33
+ ld $r39 = 200[$r3]
+;;
+ xord $r35 = $r52, $r39
+;;
+ xord $r62 = $r35, $r43
+;;
+ sd 136[$r19] = $r62  # output word 17
+;;
+ ld $r5 = 16[$r2]
+;;
+ ld $r39 = 80[$r4]
+;;
+ xord $r36 = $r5, $r39
+ ld $r41 = 136[$r1]
+;;
+ xord $r6 = $r36, $r41
+ ld $r5 = 200[$r3]
+;;
+ xord $r35 = $r6, $r5
+ ld $r11 = 208[$r3]
+;;
+ xord $r37 = $r35, $r11
+;;
+ sd 144[$r19] = $r37  # output word 18
+;;
+ ld $r5 = 24[$r2]
+;;
+ ld $r63 = 88[$r4]
+;;
+ xord $r33 = $r5, $r63
+ ld $r45 = 144[$r1]
+;;
+ xord $r49 = $r33, $r45
+ ld $r36 = 208[$r3]
+;;
+ xord $r55 = $r49, $r36
+ ld $r8 = 216[$r3]
+;;
+ xord $r41 = $r55, $r8
+;;
+ xord $r58 = $r41, $r43
+;;
+ sd 152[$r19] = $r58  # output word 19
+;;
+ ld $r6 = 32[$r2]
+;;
+ ld $r47 = 96[$r4]
+;;
+ xord $r11 = $r6, $r47
+ ld $r61 = 152[$r1]
+;;
+ xord $r44 = $r11, $r61
+ ld $r9 = 216[$r3]
+;;
+ xord $r59 = $r44, $r9
+ ld $r34 = 224[$r3]
+;;
+ xord $r7 = $r59, $r34
+;;
+ xord $r17 = $r7, $r43
+;;
+ sd 160[$r19] = $r17  # output word 20
+;;
+ ld $r54 = 40[$r2]
+;;
+ ld $r53 = 104[$r4]
+;;
+ xord $r7 = $r54, $r53
+ ld $r59 = 160[$r1]
+;;
+ xord $r37 = $r7, $r59
+ ld $r41 = 224[$r3]
+;;
+ xord $r10 = $r37, $r41
+ ld $r46 = 232[$r3]
+;;
+ xord $r10 = $r10, $r46
+;;
+ sd 168[$r19] = $r10  # output word 21
+;;
+ ld $r58 = 48[$r2]
+;;
+ ld $r5 = 112[$r4]
+;;
+ xord $r40 = $r58, $r5
+ ld $r38 = 168[$r1]
+;;
+ xord $r57 = $r40, $r38
+ ld $r51 = 232[$r3]
+;;
+ xord $r60 = $r57, $r51
+ ld $r55 = 240[$r3]
+;;
+ xord $r53 = $r60, $r55
+;;
+ sd 176[$r19] = $r53  # output word 22
+;;
+ ld $r45 = 56[$r2]
+;;
+ ld $r41 = 120[$r4]
+;;
+ xord $r5 = $r45, $r41
+ ld $r53 = 176[$r1]
+;;
+ xord $r38 = $r5, $r53
+ ld $r8 = 240[$r3]
+;;
+ xord $r43 = $r38, $r8
+ ld $r63 = 248[$r3]
+;;
+ xord $r6 = $r43, $r63
+;;
+ sd 184[$r19] = $r6  # output word 23
+;;
+ ld $r8 = 0[$r2]
+;;
+ ld $r58 = 64[$r4]
+;;
+ ld $r35 = 56[$r2]
+ xord $r54 = $r8, $r58
+;;
+ ld $r5 = 248[$r3]
+;;
+ xord $r50 = $r35, $r5  # r50 = common term XORed into output words 24, 25, 27 and 28
+ ld $r51 = 128[$r1]
+;;
+ xord $r11 = $r54, $r51
+;;
+ xord $r38 = $r11, $r50
+;;
+ sd 192[$r19] = $r38  # output word 24
+;;
+ ld $r63 = 8[$r2]
+;;
+ ld $r54 = 0[$r2]
+;;
+ xord $r54 = $r63, $r54
+ ld $r36 = 72[$r4]
+;;
+ xord $r5 = $r54, $r36
+ ld $r41 = 136[$r1]
+;;
+ xord $r39 = $r5, $r41
+ ld $r58 = 192[$r3]
+;;
+ xord $r44 = $r39, $r58
+;;
+ xord $r33 = $r44, $r50
+;;
+ sd 200[$r19] = $r33  # output word 25
+;;
+ ld $r5 = 8[$r2]
+;;
+ ld $r63 = 16[$r2]
+;;
+ xord $r54 = $r63, $r5  # reads the loaded r63 before it is overwritten in this bundle
+ ld $r49 = 80[$r4]
+ addw $r63 = $r18, 32  # offset for r2 advances by 32 words ...
+;;
+ xord $r51 = $r54, $r49
+ ld $r5 = 144[$r1]
+ andw $r18 = $r63, 127  # ... modulo 128
+;;
+ xord $r43 = $r51, $r5
+ ld $r57 = 200[$r3]
+;;
+ xord $r47 = $r43, $r57
+;;
+ sd 208[$r19] = $r47  # output word 26; stores r47 before it is reused as scratch in this bundle
+ addw $r47 = $r21, 32  # offset for r3 advances by 32 words ...
+;;
+ andw $r21 = $r47, 127  # ... modulo 128
+;;
+ ld $r7 = 24[$r2]
+;;
+ ld $r15 = 16[$r2]
+;;
+ xord $r56 = $r7, $r15
+ ld $r48 = 88[$r4]
+;;
+ xord $r10 = $r56, $r48
+ ld $r51 = 152[$r1]
+;;
+ xord $r39 = $r10, $r51  # reads r10 before it is reused as scratch in this bundle
+ addw $r10 = $r22, 32  # offset for r4 advances by 32 words ...
+;;
+ ld $r48 = 208[$r3]
+ andw $r22 = $r10, 127  # ... modulo 128
+;;
+ xord $r53 = $r39, $r48
+;;
+ xord $r37 = $r53, $r50
+;;
+ sd 216[$r19] = $r37  # output word 27
+;;
+ ld $r9 = 32[$r2]
+;;
+ ld $r15 = 24[$r2]
+;;
+ xord $r43 = $r9, $r15  # reads the loaded r15 before it is overwritten in this bundle
+ ld $r53 = 96[$r4]
+ addw $r15 = $r20, 32  # offset for r1 advances by 32 words ...
+;;
+ xord $r42 = $r43, $r53
+ ld $r17 = 160[$r1]
+ andw $r20 = $r15, 127  # ... modulo 128
+;;
+ xord $r55 = $r42, $r17
+ ld $r62 = 216[$r3]
+ sxwd $r8 = $r20
+;;
+ xord $r60 = $r55, $r62
+ slld $r43 = $r8, 3  # byte offset for the next r1
+;;
+ xord $r5 = $r60, $r50  # last use of the common term r50 before it is overwritten in this bundle
+ sxwd $r50 = $r18
+;;
+ sd 224[$r19] = $r5  # output word 28
+ slld $r39 = $r50, 3  # byte offset for the next r2
+;;
+ ld $r5 = 40[$r2]
+;;
+ ld $r51 = 32[$r2]
+;;
+ xord $r62 = $r5, $r51
+ ld $r45 = 168[$r1]
+;;
+ ld $r5 = 104[$r4]
+;;
+ xord $r9 = $r62, $r5
+;;
+ xord $r17 = $r9, $r45
+;;
+ ld $r5 = 224[$r3]
+;;
+ xord $r49 = $r17, $r5
+;;
+ sd 232[$r19] = $r49  # output word 29
+;;
+ ld $r33 = 48[$r2]
+;;
+ ld $r57 = 40[$r2]
+;;
+ xord $r49 = $r33, $r57
+ ld $r55 = 112[$r4]
+;;
+ xord $r59 = $r49, $r55
+ ld $r36 = 176[$r1]
+;;
+ xord $r61 = $r59, $r36
+ ld $r52 = 232[$r3]
+;;
+ xord $r6 = $r61, $r52
+;;
+ sd 240[$r19] = $r6  # output word 30
+;;
+ ld $r49 = 56[$r2]
+;;
+ ld $r45 = 48[$r2]  # last load through r2 for this iteration; it still uses the old r2
+ addd $r2 = $r0, $r39  # next r2 = base + r18 * 8
+;;
+ xord $r56 = $r49, $r45
+ ld $r59 = 120[$r4]
+;;
+ xord $r11 = $r56, $r59
+ ld $r38 = 184[$r1]
+;;
+ xord $r4 = $r11, $r38  # r4 is reused as scratch; it is reassigned before the next iteration
+ ld $r34 = 240[$r3]  # the load in this bundle still uses the old r3
+ sxwd $r38 = $r22
+ sxwd $r3 = $r21  # r3 is reused as scratch; it is reassigned before the next iteration
+;;
+ xord $r1 = $r4, $r34  # r1 is reused as scratch; it is reassigned before the next iteration
+ slld $r10 = $r38, 3  # byte offset for the next r4
+ slld $r36 = $r3, 3  # byte offset for the next r3
+;;
+ sd 248[$r19] = $r1  # output word 31; the store still uses the old r1 and r19
+ addd $r19 = $r19, 256  # destination cursor moves on by 32 words
+ addd $r4 = $r0, $r10  # next r4 = base + r22 * 8
+ addd $r1 = $r0, $r43  # next r1 = base + r20 * 8
+;;
+ addd $r3 = $r0, $r36  # next r3 = base + r21 * 8
+ cb.wnez $r32? .L112  # four iterations in total
+;;
+ addd $r1 = $r12, 64  # memmove argument r1 = buffer
+ make $r2, 1024  # memmove argument r2 = 1024 bytes
+ call memmove  # r0 was never written in this function, so it is still the incoming pointer
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r20 = 32[$r12]
+;;
+ ld $r21 = 40[$r12]
+;;
+ ld $r22 = 48[$r12]
+;;
+ ld $r23 = 56[$r12]
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16  # restore return address
+;;
+ addd $r12 = $r12, 1088  # release the frame
+;;
+ ret
+;;
+ .type bs_shiftmix, @function
+ .size bs_shiftmix, . - bs_shiftmix
+ .text
+ .balign 2
+ .globl bs_mixcolumns # bs_mixcolumns($r0 = data pointer): rewrites the 1024 bytes at $r0 in place, as four independent 256-byte groups
+bs_mixcolumns: # presumably the bitsliced AES MixColumns step (judging by the name) - TODO confirm against bs.c
+ addd $r17 = $r12, 0 # remember the caller's stack pointer
+ addd $r12 = $r12, -1040 # frame = 16 bytes of bookkeeping + 1024-byte scratch buffer
+;;
+ sd 0[$r12] = $r17 # saved caller stack pointer
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16 # saved return address (this function calls memmove)
+;;
+ addd $r46 = $r12, 16 # $r46 = output cursor, starts at the scratch buffer
+ make $r45, 0 # $r45 = group counter
+;;
+.L113: # notation: in[k] / out[k] = 64-bit word at byte offset k from $r0 / $r46 as they are at the top of this iteration
+ ld $r60 = 64[$r0] # formulas below assume instructions in one bundle read register values from before that bundle (the code relies on this, see the last bundle of the loop)
+ addw $r45 = $r45, 1
+ make $r32, 4
+;;
+ ld $r54 = 128[$r0]
+ compw.lt $r32 = $r45, $r32 # $r32 = (counter < 4): loop condition, consumed by the branch at the bottom
+;;
+ ld $r44 = 56[$r0]
+ xord $r49 = $r60, $r54
+;;
+ ld $r7 = 120[$r0]
+;;
+ xord $r57 = $r44, $r7 # t0 = in[56] ^ in[120]
+ ld $r5 = 192[$r0]
+;;
+ xord $r1 = $r49, $r5
+;;
+ xord $r40 = $r1, $r57
+;;
+ sd 0[$r46] = $r40 # out[0] = in[64] ^ in[128] ^ in[192] ^ t0
+;;
+ ld $r42 = 0[$r0]
+;;
+ xord $r53 = $r42, $r60 # $r53 = in[0] ^ in[64], reused for out[128] and out[192]
+ ld $r39 = 72[$r0]
+;;
+ xord $r43 = $r53, $r39
+ ld $r55 = 136[$r0]
+;;
+ xord $r6 = $r43, $r55
+ ld $r2 = 200[$r0]
+;;
+ xord $r15 = $r6, $r2
+;;
+ xord $r8 = $r15, $r57
+;;
+ sd 8[$r46] = $r8 # out[8] = in[0] ^ in[64] ^ in[72] ^ in[136] ^ in[200] ^ t0
+;;
+ ld $r35 = 8[$r0]
+;;
+ xord $r59 = $r35, $r39 # $r59 accumulates in[8] ^ in[72] ^ in[128] ^ in[192] ^ in[200] for out[136]
+ ld $r37 = 80[$r0]
+ xord $r60 = $r35, $r60 # $r60 now = in[8] ^ in[64] (for out[72]); the raw in[64] is no longer needed
+;;
+ xord $r56 = $r59, $r37
+ ld $r6 = 144[$r0]
+ xord $r59 = $r59, $r54
+;;
+ xord $r43 = $r56, $r6
+ ld $r51 = 208[$r0]
+ xord $r59 = $r59, $r5
+;;
+ xord $r11 = $r43, $r51
+ xord $r59 = $r59, $r2
+;;
+ sd 16[$r46] = $r11 # out[16] = in[8] ^ in[72] ^ in[80] ^ in[144] ^ in[208]
+;;
+ ld $r47 = 16[$r0]
+;;
+ xord $r11 = $r47, $r37 # $r11 = in[16] ^ in[80], reused for out[144]
+ ld $r52 = 88[$r0]
+;;
+ xord $r48 = $r11, $r52
+ ld $r17 = 152[$r0]
+;;
+ xord $r4 = $r48, $r17
+ ld $r1 = 216[$r0]
+;;
+ xord $r4 = $r4, $r1
+;;
+ xord $r3 = $r4, $r57
+;;
+ sd 24[$r46] = $r3 # out[24] = in[16] ^ in[80] ^ in[88] ^ in[152] ^ in[216] ^ t0
+;;
+ ld $r8 = 24[$r0]
+;;
+ xord $r58 = $r8, $r52
+ ld $r36 = 96[$r0]
+;;
+ xord $r9 = $r58, $r36
+ ld $r50 = 160[$r0]
+ xord $r58 = $r58, $r6 # $r58 = in[24] ^ in[88] ^ in[144], kept for out[152]
+;;
+ xord $r40 = $r9, $r50
+ ld $r4 = 224[$r0]
+;;
+ xord $r61 = $r40, $r4
+;;
+ xord $r48 = $r61, $r57
+;;
+ sd 32[$r46] = $r48 # out[32] = in[24] ^ in[88] ^ in[96] ^ in[160] ^ in[224] ^ t0
+;;
+ ld $r15 = 32[$r0]
+;;
+ xord $r57 = $r15, $r36 # $r57 is recycled: now in[32] ^ in[96] (t0 is not used past this point)
+ ld $r38 = 104[$r0]
+;;
+ xord $r61 = $r57, $r38
+ ld $r3 = 168[$r0]
+;;
+ xord $r9 = $r61, $r3
+ ld $r48 = 232[$r0]
+;;
+ xord $r9 = $r9, $r48
+;;
+ sd 40[$r46] = $r9 # out[40] = in[32] ^ in[96] ^ in[104] ^ in[168] ^ in[232]
+;;
+ ld $r43 = 40[$r0]
+;;
+ xord $r34 = $r43, $r38 # $r34 = in[40] ^ in[104], reused for out[168]
+ ld $r33 = 112[$r0]
+;;
+ xord $r40 = $r34, $r33
+ ld $r10 = 176[$r0]
+ xord $r63 = $r44, $r33 # $r63 = in[56] ^ in[112], kept for out[120]
+;;
+ xord $r49 = $r40, $r10
+ ld $r41 = 240[$r0]
+;;
+ xord $r62 = $r49, $r41
+;;
+ sd 48[$r46] = $r62 # out[48] = in[40] ^ in[104] ^ in[112] ^ in[176] ^ in[240]
+ xord $r62 = $r42, $r54
+;;
+ xord $r62 = $r62, $r5
+;;
+ ld $r9 = 48[$r0]
+;;
+ xord $r56 = $r9, $r33
+ ld $r40 = 184[$r0]
+;;
+ xord $r49 = $r56, $r7
+ xord $r56 = $r56, $r3 # $r56 = in[48] ^ in[112] ^ in[168], kept for out[176]
+;;
+ xord $r61 = $r49, $r40
+;;
+ ld $r49 = 248[$r0]
+ addd $r0 = $r0, 256 # advance the input pointer to the next group; no further loads from $r0 occur in this iteration
+;;
+ xord $r61 = $r61, $r49
+;;
+ sd 56[$r46] = $r61 # out[56] = in[48] ^ in[112] ^ in[120] ^ in[184] ^ in[248]
+ xord $r61 = $r7, $r40 # t1 = in[120] ^ in[184]
+;;
+ xord $r62 = $r62, $r61
+;;
+ sd 64[$r46] = $r62 # out[64] = in[0] ^ in[128] ^ in[192] ^ t1
+ xord $r62 = $r60, $r54
+;;
+ xord $r60 = $r62, $r55
+;;
+ xord $r60 = $r60, $r2
+;;
+ xord $r60 = $r60, $r61
+;;
+ sd 72[$r46] = $r60 # out[72] = in[8] ^ in[64] ^ in[128] ^ in[136] ^ in[200] ^ t1
+ xord $r60 = $r47, $r39
+;;
+ xord $r60 = $r60, $r55
+;;
+ xord $r60 = $r60, $r6
+;;
+ xord $r60 = $r60, $r51
+;;
+ sd 80[$r46] = $r60 # out[80] = in[16] ^ in[72] ^ in[136] ^ in[144] ^ in[208]
+ xord $r60 = $r8, $r37
+;;
+ xord $r60 = $r60, $r6
+;;
+ xord $r60 = $r60, $r17
+;;
+ xord $r60 = $r60, $r1
+;;
+ xord $r60 = $r60, $r61
+;;
+ sd 88[$r46] = $r60 # out[88] = in[24] ^ in[80] ^ in[144] ^ in[152] ^ in[216] ^ t1
+ xord $r60 = $r15, $r52
+;;
+ xord $r60 = $r60, $r17
+;;
+ xord $r60 = $r60, $r50
+;;
+ xord $r60 = $r60, $r4
+;;
+ xord $r60 = $r60, $r61
+ xord $r61 = $r53, $r5 # $r61 is recycled: in[0] ^ in[64] ^ in[192], the start of out[128]
+;;
+ sd 96[$r46] = $r60 # out[96] = in[32] ^ in[88] ^ in[152] ^ in[160] ^ in[224] ^ t1
+ xord $r60 = $r43, $r36
+;;
+ xord $r60 = $r60, $r50
+;;
+ xord $r60 = $r60, $r3
+;;
+ xord $r60 = $r60, $r48
+;;
+ sd 104[$r46] = $r60 # out[104] = in[40] ^ in[96] ^ in[160] ^ in[168] ^ in[232]
+ xord $r60 = $r9, $r38
+;;
+ xord $r60 = $r60, $r3
+;;
+ xord $r60 = $r60, $r10
+;;
+ xord $r60 = $r60, $r41
+;;
+ sd 112[$r46] = $r60 # out[112] = in[48] ^ in[104] ^ in[168] ^ in[176] ^ in[240]
+ xord $r60 = $r63, $r10
+;;
+ xord $r60 = $r60, $r40
+;;
+ xord $r60 = $r60, $r49
+;;
+ sd 120[$r46] = $r60 # out[120] = in[56] ^ in[112] ^ in[176] ^ in[184] ^ in[248]
+ xord $r60 = $r40, $r49 # t2 = in[184] ^ in[248]
+;;
+ xord $r61 = $r61, $r60
+ xord $r63 = $r59, $r60
+ xord $r59 = $r11, $r55
+;;
+ sd 128[$r46] = $r61 # out[128] = in[0] ^ in[64] ^ in[192] ^ t2
+ xord $r59 = $r59, $r2
+;;
+ sd 136[$r46] = $r63 # out[136] = in[8] ^ in[72] ^ in[128] ^ in[192] ^ in[200] ^ t2
+ xord $r11 = $r59, $r51
+ xord $r63 = $r58, $r51
+;;
+ sd 144[$r46] = $r11 # out[144] = in[16] ^ in[80] ^ in[136] ^ in[200] ^ in[208]
+ xord $r58 = $r63, $r1
+ xord $r11 = $r57, $r17
+;;
+ xord $r61 = $r58, $r60
+ xord $r57 = $r11, $r1
+;;
+ sd 152[$r46] = $r61 # out[152] = in[24] ^ in[88] ^ in[144] ^ in[208] ^ in[216] ^ t2
+ xord $r57 = $r57, $r4
+;;
+ xord $r57 = $r57, $r60
+;;
+ sd 160[$r46] = $r57 # out[160] = in[32] ^ in[96] ^ in[152] ^ in[216] ^ in[224] ^ t2
+ xord $r57 = $r34, $r50
+;;
+ xord $r57 = $r57, $r4
+;;
+ xord $r11 = $r57, $r48
+ xord $r57 = $r53, $r54 # $r57 = in[0] ^ in[64] ^ in[128], the start of out[192]
+;;
+ sd 168[$r46] = $r11 # out[168] = in[40] ^ in[104] ^ in[160] ^ in[224] ^ in[232]
+ xord $r11 = $r56, $r48
+;;
+ xord $r56 = $r11, $r41
+ xord $r11 = $r44, $r7
+;;
+ sd 176[$r46] = $r56 # out[176] = in[48] ^ in[112] ^ in[168] ^ in[232] ^ in[240]
+ xord $r34 = $r11, $r10
+;;
+ xord $r11 = $r34, $r41
+;;
+ xord $r56 = $r11, $r49
+ xord $r49 = $r44, $r49 # t3 = in[56] ^ in[248] (replaces the raw in[248] held in $r49)
+ xord $r11 = $r35, $r42
+ xord $r42 = $r9, $r43
+;;
+ sd 184[$r46] = $r56 # out[184] = in[56] ^ in[120] ^ in[176] ^ in[240] ^ in[248]
+ xord $r53 = $r57, $r49
+ xord $r11 = $r11, $r39
+ xord $r58 = $r42, $r33
+;;
+ sd 192[$r46] = $r53 # out[192] = in[0] ^ in[64] ^ in[128] ^ t3
+ xord $r61 = $r11, $r55
+ xord $r53 = $r47, $r35
+;;
+ xord $r39 = $r61, $r5
+ xord $r62 = $r53, $r37
+;;
+ xord $r34 = $r39, $r49
+ xord $r37 = $r62, $r6
+;;
+ sd 200[$r46] = $r34 # out[200] = in[0] ^ in[8] ^ in[72] ^ in[136] ^ in[192] ^ t3
+ xord $r57 = $r37, $r2
+ xord $r37 = $r8, $r47
+ xord $r34 = $r15, $r8
+;;
+ sd 208[$r46] = $r57 # out[208] = in[8] ^ in[16] ^ in[80] ^ in[144] ^ in[200]
+ xord $r35 = $r37, $r52
+;;
+ xord $r47 = $r35, $r17
+;;
+ xord $r47 = $r47, $r51
+;;
+ xord $r54 = $r47, $r49
+ xord $r47 = $r34, $r36
+;;
+ sd 216[$r46] = $r54 # out[216] = in[16] ^ in[24] ^ in[88] ^ in[152] ^ in[208] ^ t3
+ xord $r35 = $r47, $r50
+;;
+ xord $r39 = $r35, $r1
+;;
+ xord $r11 = $r39, $r49
+ xord $r49 = $r43, $r15 # $r49 is recycled (t3 has had its last use in this same bundle)
+;;
+ sd 224[$r46] = $r11 # out[224] = in[24] ^ in[32] ^ in[96] ^ in[160] ^ in[216] ^ t3
+ xord $r53 = $r49, $r38
+ xord $r11 = $r58, $r10
+ xord $r38 = $r44, $r9
+;;
+ xord $r6 = $r53, $r3
+ xord $r56 = $r11, $r48
+ xord $r52 = $r38, $r7
+;;
+ xord $r15 = $r6, $r4
+ xord $r34 = $r52, $r40
+;;
+ sd 232[$r46] = $r15 # out[232] = in[32] ^ in[40] ^ in[104] ^ in[168] ^ in[224]
+ xord $r58 = $r34, $r41
+;;
+ sd 240[$r46] = $r56 # out[240] = in[40] ^ in[48] ^ in[112] ^ in[176] ^ in[232]
+;;
+ sd 248[$r46] = $r58 # out[248] = in[48] ^ in[56] ^ in[120] ^ in[184] ^ in[240]
+ addd $r46 = $r46, 256 # advance the output cursor to the next group (same bundle as the last store)
+ cb.wnez $r32? .L113 # repeat while counter < 4, i.e. four groups in total
+;;
+ addd $r0 = $r0, -1024 # rewind $r0 to the start of the caller's data (4 x 256 bytes were consumed)
+ addd $r1 = $r46, -1024 # $r1 = start of the scratch buffer
+ make $r2, 1024
+ call memmove # memmove($r0, $r1, $r2): copy the 1024 result bytes back over the input
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16 # restore the return address clobbered by the call
+;;
+ addd $r12 = $r12, 1040 # release the frame
+;;
+ ret
+;;
+ .type bs_mixcolumns, @function
+ .size bs_mixcolumns, . - bs_mixcolumns
+ .text
+ .balign 2
+ .globl bs_mixcolumns_rev # bs_mixcolumns_rev($r0 = data pointer): rewrites the 1024 bytes at $r0 in place, as four independent 256-byte groups
+bs_mixcolumns_rev: # presumably the inverse of bs_mixcolumns (judging by the name) - TODO confirm against bs.c; the individual XOR formulas are not re-derived in these comments
+ addd $r17 = $r12, 0 # remember the caller's stack pointer
+ addd $r12 = $r12, -1040 # frame = 16 bytes of bookkeeping + 1024-byte scratch buffer
+;;
+ sd 0[$r12] = $r17 # saved caller stack pointer
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16 # saved return address (this function calls memmove)
+;;
+ addd $r49 = $r12, 16 # $r49 = output cursor, starts at the scratch buffer
+ make $r50, 0 # $r50 = loop counter, stepped by 8 per group
+;;
+.L114: # one iteration reads the 32 words at offsets 0..248 from $r0 and writes 32 words at offsets 0..248 from $r49
+ ld $r47 = 56[$r0]
+ addw $r50 = $r50, 8
+ make $r32, 32
+;;
+ ld $r11 = 48[$r0]
+ compw.lt $r32 = $r50, $r32 # $r32 = (counter < 32): loop condition, consumed by the branch at the bottom
+;;
+ xord $r56 = $r47, $r11 # $r56 = word[56] ^ word[48]; it is read again much later, so it must stay live
+ ld $r6 = 40[$r0]
+;;
+ xord $r2 = $r56, $r6
+ ld $r15 = 120[$r0]
+;;
+ ld $r33 = 104[$r0]
+ xord $r1 = $r47, $r15
+;;
+ xord $r60 = $r15, $r33
+ ld $r7 = 176[$r0]
+;;
+ xord $r57 = $r2, $r60
+ ld $r48 = 168[$r0]
+;;
+ xord $r63 = $r7, $r48
+ ld $r44 = 64[$r0]
+;;
+ xord $r62 = $r57, $r63
+ ld $r34 = 128[$r0]
+;;
+ ld $r3 = 232[$r0]
+ xord $r38 = $r44, $r34
+;;
+ xord $r9 = $r62, $r3
+ ld $r17 = 192[$r0]
+;;
+ ld $r51 = 112[$r0]
+ xord $r36 = $r38, $r17
+;;
+ xord $r8 = $r56, $r51
+ ld $r10 = 184[$r0]
+ xord $r40 = $r36, $r9
+;;
+ xord $r57 = $r10, $r7
+ ld $r5 = 240[$r0]
+ xord $r4 = $r1, $r10
+;;
+ xord $r54 = $r8, $r57
+ ld $r39 = 248[$r0]
+;;
+ xord $r35 = $r54, $r5
+ xord $r58 = $r4, $r39
+ sd 0[$r49] = $r40 # output word at offset 0
+;;
+ ld $r8 = 0[$r0]
+;;
+ xord $r36 = $r8, $r44
+ ld $r52 = 72[$r0]
+;;
+ xord $r59 = $r36, $r52
+ ld $r45 = 136[$r0]
+;;
+ xord $r59 = $r59, $r45
+ ld $r4 = 200[$r0]
+;;
+ xord $r59 = $r59, $r4
+;;
+ xord $r36 = $r59, $r9
+;;
+ xord $r61 = $r36, $r35
+;;
+ sd 8[$r49] = $r61
+;;
+ ld $r43 = 8[$r0]
+;;
+ xord $r53 = $r43, $r8
+ ld $r46 = 80[$r0]
+;;
+ xord $r53 = $r53, $r52
+ ld $r2 = 144[$r0]
+;;
+ xord $r36 = $r53, $r46
+ ld $r42 = 208[$r0]
+;;
+ xord $r59 = $r36, $r2
+;;
+ xord $r53 = $r59, $r34
+;;
+ xord $r53 = $r53, $r42
+;;
+ xord $r37 = $r53, $r35
+;;
+ xord $r53 = $r37, $r58
+;;
+ sd 16[$r49] = $r53
+;;
+ ld $r1 = 16[$r0]
+;;
+ xord $r41 = $r1, $r43
+ ld $r38 = 88[$r0]
+;;
+ xord $r36 = $r41, $r8
+;;
+ xord $r53 = $r36, $r44
+;;
+ xord $r36 = $r53, $r46
+ ld $r41 = 152[$r0]
+;;
+ xord $r36 = $r36, $r38
+;;
+ xord $r40 = $r36, $r41
+;;
+ xord $r53 = $r40, $r45
+;;
+ xord $r36 = $r53, $r34
+;;
+ ld $r53 = 216[$r0]
+;;
+ xord $r54 = $r36, $r53
+;;
+ xord $r36 = $r54, $r17
+;;
+ xord $r36 = $r36, $r9
+;;
+ xord $r36 = $r36, $r58
+;;
+ sd 24[$r49] = $r36
+;;
+ ld $r36 = 24[$r0]
+;;
+ xord $r54 = $r36, $r1
+ ld $r40 = 96[$r0]
+;;
+ xord $r61 = $r54, $r43
+ ld $r37 = 160[$r0]
+;;
+ xord $r54 = $r61, $r52
+;;
+ xord $r59 = $r54, $r38
+;;
+ xord $r59 = $r59, $r40
+;;
+ xord $r54 = $r59, $r37
+;;
+ xord $r54 = $r54, $r2
+;;
+ xord $r55 = $r54, $r45
+;;
+ ld $r54 = 224[$r0]
+;;
+ xord $r55 = $r55, $r54
+;;
+ xord $r55 = $r55, $r4
+;;
+ xord $r55 = $r55, $r9
+;;
+ xord $r9 = $r55, $r35
+;;
+ sd 32[$r49] = $r9
+;;
+ ld $r9 = 32[$r0] # last load of the iteration: all 32 input words are now in registers
+ addd $r0 = $r0, 256 # advance the input pointer to the next group; no further loads from $r0 occur in this iteration
+;;
+ xord $r55 = $r9, $r36
+ xord $r61 = $r6, $r9
+;;
+ xord $r55 = $r55, $r1
+;;
+ xord $r55 = $r55, $r46
+;;
+ xord $r55 = $r55, $r40
+;;
+ xord $r55 = $r55, $r33
+;;
+ xord $r55 = $r55, $r48
+;;
+ xord $r55 = $r55, $r41
+;;
+ xord $r55 = $r55, $r2
+;;
+ xord $r55 = $r55, $r3
+;;
+ xord $r55 = $r55, $r42
+;;
+ xord $r35 = $r55, $r35
+;;
+ xord $r60 = $r35, $r58
+ xord $r35 = $r61, $r36
+;;
+ sd 40[$r49] = $r60
+ xord $r55 = $r35, $r38
+;;
+ xord $r62 = $r55, $r33
+;;
+ xord $r59 = $r62, $r51
+ xord $r62 = $r8, $r34
+;;
+ xord $r35 = $r59, $r7
+ xord $r59 = $r11, $r6
+ xord $r62 = $r62, $r17
+;;
+ xord $r35 = $r35, $r37
+;;
+ xord $r35 = $r35, $r41
+;;
+ xord $r55 = $r35, $r5
+ xord $r35 = $r59, $r9
+;;
+ xord $r63 = $r55, $r53
+ xord $r60 = $r35, $r40
+;;
+ xord $r55 = $r63, $r58
+ xord $r35 = $r60, $r51
+;;
+ sd 48[$r49] = $r55
+ xord $r63 = $r35, $r15
+ xord $r55 = $r15, $r51
+;;
+ xord $r35 = $r63, $r10
+ xord $r63 = $r1, $r52
+;;
+ xord $r35 = $r35, $r48
+;;
+ xord $r35 = $r35, $r37
+;;
+ xord $r60 = $r35, $r39
+ xord $r35 = $r55, $r33
+;;
+ xord $r60 = $r60, $r54
+ xord $r35 = $r6, $r35
+;;
+ sd 56[$r49] = $r60
+ xord $r60 = $r10, $r48
+;;
+ xord $r61 = $r35, $r60
+ xord $r60 = $r5, $r3
+ xord $r35 = $r39, $r5
+;;
+ xord $r60 = $r61, $r60
+ xord $r61 = $r11, $r55
+;;
+ xord $r61 = $r61, $r7
+ xord $r62 = $r62, $r60
+;;
+ xord $r61 = $r61, $r35
+ sd 64[$r49] = $r62
+ xord $r62 = $r43, $r44
+;;
+ xord $r62 = $r62, $r45
+;;
+ xord $r62 = $r62, $r34
+;;
+ xord $r62 = $r62, $r4
+;;
+ xord $r62 = $r62, $r60
+;;
+ xord $r62 = $r62, $r61
+;;
+ sd 72[$r49] = $r62
+ xord $r62 = $r63, $r44
+;;
+ xord $r62 = $r62, $r2
+;;
+ xord $r62 = $r62, $r45
+;;
+ xord $r62 = $r62, $r42
+;;
+ xord $r62 = $r62, $r17
+;;
+ xord $r62 = $r62, $r61
+;;
+ xord $r62 = $r62, $r58
+;;
+ sd 80[$r49] = $r62
+ xord $r62 = $r36, $r8
+;;
+ xord $r62 = $r62, $r46
+;;
+ xord $r62 = $r62, $r52
+;;
+ xord $r62 = $r62, $r44
+;;
+ xord $r62 = $r62, $r41
+;;
+ xord $r62 = $r62, $r2
+;;
+ xord $r62 = $r62, $r34
+;;
+ xord $r62 = $r62, $r53
+;;
+ xord $r62 = $r62, $r4
+;;
+ xord $r62 = $r62, $r17
+;;
+ xord $r62 = $r62, $r60
+;;
+ xord $r62 = $r62, $r58
+;;
+ sd 88[$r49] = $r62
+ xord $r62 = $r9, $r43
+;;
+ xord $r62 = $r62, $r38
+;;
+ xord $r62 = $r62, $r46
+;;
+ xord $r62 = $r62, $r52
+;;
+ xord $r62 = $r62, $r37
+;;
+ xord $r62 = $r62, $r41
+;;
+ xord $r62 = $r62, $r45
+;;
+ xord $r62 = $r62, $r54
+;;
+ xord $r62 = $r62, $r42
+;;
+ xord $r62 = $r62, $r4
+;;
+ xord $r60 = $r62, $r60
+;;
+ xord $r60 = $r60, $r61
+;;
+ sd 96[$r49] = $r60
+ xord $r60 = $r6, $r1
+;;
+ xord $r60 = $r60, $r40
+;;
+ xord $r60 = $r60, $r38
+;;
+ xord $r60 = $r60, $r46
+;;
+ xord $r60 = $r60, $r48
+;;
+ xord $r60 = $r60, $r37
+;;
+ xord $r60 = $r60, $r2
+;;
+ xord $r60 = $r60, $r3
+;;
+ xord $r60 = $r60, $r53
+;;
+ xord $r60 = $r60, $r42
+;;
+ xord $r60 = $r60, $r61
+;;
+ xord $r60 = $r60, $r58
+;;
+ sd 104[$r49] = $r60
+ xord $r60 = $r11, $r36
+;;
+ xord $r60 = $r60, $r33
+;;
+ xord $r60 = $r60, $r40
+;;
+ xord $r60 = $r60, $r38
+;;
+ xord $r62 = $r60, $r7
+;;
+ xord $r60 = $r62, $r48
+;;
+ xord $r60 = $r60, $r41
+;;
+ xord $r60 = $r60, $r5
+;;
+ xord $r60 = $r60, $r54
+;;
+ xord $r60 = $r60, $r53
+;;
+ xord $r58 = $r60, $r58
+;;
+ sd 112[$r49] = $r58
+ xord $r58 = $r47, $r9
+;;
+ xord $r58 = $r58, $r51
+;;
+ xord $r58 = $r58, $r33
+;;
+ xord $r58 = $r58, $r40
+;;
+ xord $r58 = $r58, $r10
+;;
+ xord $r58 = $r58, $r7
+;;
+ xord $r58 = $r58, $r37
+;;
+ xord $r58 = $r58, $r39
+;;
+ xord $r63 = $r58, $r3
+;;
+ xord $r58 = $r63, $r54
+;;
+ sd 120[$r49] = $r58
+ xord $r58 = $r57, $r48
+ xord $r57 = $r51, $r57
+;;
+ xord $r60 = $r33, $r58
+ xord $r58 = $r39, $r3
+ xord $r63 = $r57, $r5
+;;
+ xord $r58 = $r60, $r58
+ xord $r63 = $r63, $r56 # last use of the word[56] ^ word[48] value computed at the top of the loop
+ xord $r56 = $r15, $r10
+;;
+ xord $r61 = $r58, $r59
+ xord $r56 = $r56, $r39
+;;
+ xord $r62 = $r56, $r47
+ xord $r56 = $r44, $r17
+;;
+ xord $r56 = $r56, $r8
+;;
+ xord $r56 = $r56, $r61
+;;
+ sd 128[$r49] = $r56
+ xord $r56 = $r52, $r34
+;;
+ xord $r56 = $r56, $r4
+;;
+ xord $r56 = $r56, $r17
+;;
+ xord $r56 = $r56, $r43
+;;
+ xord $r56 = $r56, $r61
+;;
+ xord $r56 = $r56, $r63
+;;
+ sd 136[$r49] = $r56
+ xord $r56 = $r46, $r45
+;;
+ xord $r56 = $r56, $r34
+;;
+ xord $r56 = $r56, $r42
+;;
+ xord $r56 = $r56, $r4
+;;
+ xord $r56 = $r56, $r1
+;;
+ xord $r56 = $r56, $r8
+;;
+ xord $r56 = $r56, $r63
+;;
+ xord $r56 = $r56, $r62
+;;
+ sd 144[$r49] = $r56
+ xord $r56 = $r38, $r44
+;;
+ xord $r56 = $r56, $r2
+;;
+ xord $r56 = $r56, $r45
+;;
+ xord $r56 = $r56, $r34
+;;
+ xord $r56 = $r56, $r53
+;;
+ xord $r56 = $r56, $r42
+;;
+ xord $r56 = $r56, $r17
+;;
+ xord $r56 = $r56, $r36
+;;
+ xord $r59 = $r56, $r43
+;;
+ xord $r56 = $r59, $r8
+;;
+ xord $r59 = $r56, $r61
+;;
+ xord $r56 = $r59, $r62
+ xord $r59 = $r33, $r46
+;;
+ sd 152[$r49] = $r56
+ xord $r56 = $r40, $r52
+;;
+ xord $r56 = $r56, $r41
+;;
+ xord $r56 = $r56, $r2
+;;
+ xord $r56 = $r56, $r45
+;;
+ xord $r56 = $r56, $r54
+;;
+ xord $r56 = $r56, $r53
+;;
+ xord $r56 = $r56, $r4
+;;
+ xord $r56 = $r56, $r9
+;;
+ xord $r56 = $r56, $r1
+;;
+ xord $r56 = $r56, $r43
+;;
+ xord $r56 = $r56, $r61
+;;
+ xord $r56 = $r56, $r63
+;;
+ sd 160[$r49] = $r56
+ xord $r56 = $r59, $r37
+;;
+ xord $r56 = $r56, $r41
+;;
+ xord $r56 = $r56, $r2
+;;
+ xord $r56 = $r56, $r3
+;;
+ xord $r56 = $r56, $r54
+;;
+ xord $r57 = $r56, $r42
+;;
+ xord $r56 = $r57, $r6
+;;
+ xord $r56 = $r56, $r36
+;;
+ xord $r56 = $r56, $r1
+;;
+ xord $r56 = $r56, $r63
+;;
+ xord $r56 = $r56, $r62
+;;
+ sd 168[$r49] = $r56
+ xord $r56 = $r51, $r38
+;;
+ xord $r56 = $r56, $r48
+;;
+ xord $r56 = $r56, $r37
+;;
+ xord $r58 = $r56, $r41
+;;
+ xord $r56 = $r58, $r5
+;;
+ xord $r56 = $r56, $r3
+;;
+ xord $r56 = $r56, $r53
+;;
+ xord $r56 = $r56, $r11
+;;
+ xord $r58 = $r56, $r9
+;;
+ xord $r56 = $r58, $r36
+;;
+ xord $r56 = $r56, $r62
+;;
+ sd 176[$r49] = $r56
+ xord $r56 = $r15, $r40
+;;
+ xord $r56 = $r56, $r7
+;;
+ xord $r56 = $r56, $r48
+;;
+ xord $r56 = $r56, $r37
+;;
+ xord $r60 = $r56, $r39
+ xord $r39 = $r10, $r39 # $r39 is overwritten with word[184] ^ word[248]; the old $r39 is still read by the other instruction of this bundle
+;;
+ xord $r56 = $r60, $r5
+;;
+ xord $r56 = $r56, $r54
+;;
+ xord $r56 = $r56, $r47
+;;
+ xord $r56 = $r56, $r6
+;;
+ xord $r56 = $r56, $r9
+;;
+ sd 184[$r49] = $r56
+ xord $r56 = $r35, $r3
+;;
+ xord $r57 = $r48, $r56
+ xord $r56 = $r47, $r6
+;;
+ xord $r57 = $r57, $r56
+ xord $r56 = $r51, $r33
+;;
+ xord $r56 = $r57, $r56
+ xord $r57 = $r7, $r35
+;;
+ xord $r35 = $r57, $r11
+ xord $r57 = $r39, $r47
+;;
+ xord $r55 = $r35, $r55
+ xord $r35 = $r57, $r15
+ xord $r57 = $r34, $r8
+ xord $r34 = $r41, $r34
+;;
+ xord $r39 = $r57, $r44
+ xord $r57 = $r45, $r17
+ xord $r58 = $r34, $r42
+;;
+ xord $r39 = $r39, $r56
+ xord $r34 = $r58, $r4
+;;
+ sd 192[$r49] = $r39
+ xord $r39 = $r57, $r43
+;;
+ xord $r39 = $r39, $r8
+;;
+ xord $r39 = $r39, $r52
+;;
+ xord $r57 = $r39, $r56
+;;
+ xord $r39 = $r57, $r55
+;;
+ sd 200[$r49] = $r39
+ xord $r39 = $r2, $r4
+;;
+ xord $r39 = $r39, $r17
+ xord $r17 = $r34, $r17
+;;
+ xord $r39 = $r39, $r1
+;;
+ xord $r57 = $r39, $r43
+;;
+ xord $r39 = $r57, $r46
+;;
+ xord $r39 = $r39, $r44
+;;
+ xord $r63 = $r39, $r55
+;;
+ xord $r57 = $r63, $r35
+;;
+ sd 208[$r49] = $r57
+ xord $r57 = $r17, $r36
+;;
+ xord $r57 = $r57, $r1
+;;
+ xord $r57 = $r57, $r8
+ xord $r8 = $r48, $r2
+;;
+ xord $r17 = $r57, $r38
+ xord $r57 = $r37, $r45
+;;
+ xord $r59 = $r17, $r52
+ xord $r62 = $r57, $r53
+;;
+ xord $r34 = $r59, $r44
+ xord $r39 = $r62, $r42
+;;
+ xord $r44 = $r34, $r56
+;;
+ xord $r44 = $r44, $r35
+;;
+ sd 216[$r49] = $r44
+ xord $r44 = $r39, $r4
+;;
+ xord $r4 = $r44, $r9
+;;
+ xord $r4 = $r4, $r36
+;;
+ xord $r57 = $r4, $r43
+ xord $r43 = $r8, $r54
+;;
+ xord $r59 = $r57, $r40
+ xord $r48 = $r43, $r53
+;;
+ xord $r34 = $r59, $r46
+;;
+ xord $r52 = $r34, $r52
+;;
+ xord $r52 = $r52, $r56
+ xord $r56 = $r10, $r37
+;;
+ xord $r39 = $r52, $r55
+ xord $r52 = $r48, $r42
+;;
+ sd 224[$r49] = $r39
+ xord $r42 = $r52, $r6
+;;
+ xord $r17 = $r42, $r9
+;;
+ xord $r39 = $r17, $r1
+;;
+ xord $r48 = $r39, $r33
+;;
+ xord $r42 = $r48, $r38
+;;
+ xord $r17 = $r42, $r46
+;;
+ xord $r52 = $r17, $r55
+ xord $r55 = $r7, $r41
+;;
+ xord $r17 = $r52, $r35
+ xord $r34 = $r55, $r3
+;;
+ sd 232[$r49] = $r17
+ xord $r44 = $r34, $r54
+;;
+ xord $r39 = $r44, $r53
+;;
+ xord $r60 = $r39, $r11
+;;
+ xord $r61 = $r60, $r6
+;;
+ xord $r8 = $r61, $r36
+;;
+ xord $r4 = $r8, $r51
+;;
+ xord $r39 = $r4, $r40
+;;
+ xord $r2 = $r39, $r38
+;;
+ xord $r2 = $r2, $r35
+ xord $r35 = $r56, $r5
+;;
+ sd 240[$r49] = $r2
+ xord $r62 = $r35, $r3
+;;
+ xord $r38 = $r62, $r54
+;;
+ xord $r48 = $r38, $r47
+;;
+ xord $r38 = $r48, $r11
+;;
+ xord $r11 = $r38, $r9
+;;
+ xord $r1 = $r11, $r15
+;;
+ xord $r55 = $r1, $r33
+;;
+ xord $r5 = $r55, $r40
+;;
+ sd 248[$r49] = $r5 # last output word of the group
+ addd $r49 = $r49, 256 # advance the output cursor to the next group (same bundle as the last store)
+ cb.wnez $r32? .L114 # repeat while counter < 32; with a step of 8 that is four groups in total
+;;
+ addd $r0 = $r0, -1024 # rewind $r0 to the start of the caller's data (4 x 256 bytes were consumed)
+ addd $r1 = $r49, -1024 # $r1 = start of the scratch buffer
+ make $r2, 1024
+ call memmove # memmove($r0, $r1, $r2): copy the 1024 result bytes back over the input
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16 # restore the return address clobbered by the call
+;;
+ addd $r12 = $r12, 1040 # release the frame
+;;
+ ret
+;;
+ .type bs_mixcolumns_rev, @function
+ .size bs_mixcolumns_rev, . - bs_mixcolumns_rev
+ .text
+ .balign 2
+ .globl bs_expand_key # bs_expand_key($r0 = output area, $r1 = 16-byte input): fills eleven 1024-byte blocks at $r0; presumably the bitsliced AES round keys - TODO confirm against bs.c
+bs_expand_key:
+ addd $r17 = $r12, 0 # remember the caller's stack pointer
+ addd $r12 = $r12, -224 # frame: 40 bytes of saved registers + 176-byte buffer at 40[$r12], rounded up to 224
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16 # saved return address
+;;
+ sd 16[$r12] = $r18 # $r18, $r19, $r20 are saved here and restored in the epilogue
+ addd $r18 = $r0, 0 # $r18 = output area for the rest of the function
+ addd $r0 = $r12, 40 # destination = local buffer
+ make $r2, 16
+;;
+ sd 24[$r12] = $r19
+;;
+ sd 32[$r12] = $r20
+ call memmove # memmove(local buffer, $r1 as received from the caller, 16)
+;;
+ addd $r0 = $r12, 40
+ call expand_key # expand_key(local buffer); presumably expands the 16 bytes to 176 bytes in place - verify against its definition
+;;
+ make $r20, 0 # $r20 = block index
+ make $r19, 0 # $r19 = byte offset into the local buffer, stepped by 16
+;;
+.L115: # per block: copy 16 bytes from the buffer, replicate them across the 1024-byte block, then transpose the block
+ sxwd $r1 = $r20
+ addd $r11 = $r12, 40
+ sxwd $r15 = $r19
+ make $r2, 16
+;;
+ slld $r34 = $r1, 10 # block index * 1024
+ addd $r1 = $r11, $r15 # source = local buffer + $r19
+;;
+ addd $r0 = $r18, $r34 # destination = start of block $r20
+ call memmove # copy 16 bytes into the first two 64-bit words of the block
+;;
+ make $r1, 2 # $r1 = destination word index, starts at 2 and steps by 2
+;;
+.L116:
+ make $r35, 0 # $r35 = word index within the 16-byte pattern (0 or 1)
+;;
+.L117:
+ addw $r38 = $r1, $r35
+ sxwd $r37 = $r35
+ addw $r35 = $r35, 1
+ make $r32, 2
+;;
+ sxwd $r6 = $r20
+ sxwd $r33 = $r38
+ slld $r10 = $r37, 3 # byte offset of the source word
+ compw.lt $r32 = $r35, $r32 # inner loop condition: $r35 < 2
+;;
+ slld $r3 = $r6, 10
+ slld $r9 = $r33, 3 # byte offset of the destination word
+;;
+ addd $r0 = $r18, $r3 # block base, recomputed every pass; $r0 still holds it at the bs_transpose call below
+;;
+ addd $r39 = $r0, $r9
+ addd $r8 = $r0, $r10
+;;
+ ld $r17 = 0[$r8]
+;;
+ sd 0[$r39] = $r17 # block word[$r1 + k] = block word[k], for k = 0, 1
+;;
+ cb.wnez $r32? .L117
+;;
+ addw $r1 = $r1, 2
+ make $r32, 128
+;;
+ compw.lt $r32 = $r1, $r32 # middle loop condition: $r1 < 128, i.e. all 128 words of the block get filled
+;;
+ cb.wnez $r32? .L116
+;;
+ call bs_transpose # bs_transpose($r0 = block base)
+;;
+ addw $r20 = $r20, 1
+ addw $r19 = $r19, 16
+ make $r32, 176
+;;
+ compw.lt $r32 = $r19, $r32 # outer loop condition: $r19 < 176, i.e. 11 blocks (offsets 0, 16, ..., 160)
+;;
+ cb.wnez $r32? .L115
+;;
+ ld $r16 = 8[$r12]
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r20 = 32[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 224 # release the frame
+;;
+ ret
+;;
+ .type bs_expand_key, @function
+ .size bs_expand_key, . - bs_expand_key
+ .text
+ .balign 2
+ .globl bs_cipher # bs_cipher($r0 = data pointer, $r1 = key-block pointer): runs the forward round sequence on the data in place; presumably AES-128 encryption of a bitsliced state - TODO confirm against bs.c
+bs_cipher:
+ addd $r17 = $r12, 0 # remember the caller's stack pointer
+ addd $r12 = $r12, -48
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16 # saved return address
+;;
+ sd 16[$r12] = $r18 # $r18, $r19, $r20 are saved here and restored in the epilogue
+ addd $r18 = $r0, 0 # $r18 = data pointer, kept across all calls
+;;
+ sd 24[$r12] = $r19
+ addd $r19 = $r1, 0 # $r19 = key-block pointer, kept across all calls
+ addd $r0 = $r18, 0
+;;
+ sd 32[$r12] = $r20
+ call bs_transpose # bs_transpose(data)
+;;
+ addd $r1 = $r19, 0
+ addd $r0 = $r18, 0
+ call bs_addroundkey # bs_addroundkey(data, key block 0)
+;;
+ make $r20, 1 # $r20 = round index, 1..9 in the loop
+;;
+.L118:
+ addd $r0 = $r18, 0
+ call bs_apply_sbox
+;;
+ addd $r0 = $r18, 0
+ call bs_shiftmix
+;;
+ sxwd $r6 = $r20
+ addd $r0 = $r18, 0
+;;
+ slld $r4 = $r6, 10 # round index * 1024 = byte offset of this round's key block
+;;
+ addd $r1 = $r19, $r4
+ call bs_addroundkey # bs_addroundkey(data, key block $r20)
+;;
+ addw $r20 = $r20, 1
+ make $r32, 10
+;;
+ compw.lt $r32 = $r20, $r32 # loop while round index < 10
+;;
+ cb.wnez $r32? .L118
+;;
+ addd $r0 = $r18, 0 # final sequence: bs_apply_sbox, then bs_shiftrows instead of bs_shiftmix
+ call bs_apply_sbox
+;;
+ addd $r0 = $r18, 0
+ call bs_shiftrows
+;;
+ addd $r1 = $r19, 10240 # key block 10 (10 * 1024 bytes)
+ addd $r0 = $r18, 0
+ call bs_addroundkey
+;;
+ addd $r0 = $r18, 0 # set up the argument for the tail call before $r18 is restored
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r20 = 32[$r12]
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 48 # release the frame
+;;
+ goto bs_transpose_rev # tail call: bs_transpose_rev(data) returns directly to our caller
+;;
+ .type bs_cipher, @function
+ .size bs_cipher, . - bs_cipher
+ .text
+ .balign 2
+ .globl bs_cipher_rev # bs_cipher_rev($r0 = data pointer, $r1 = key-block pointer): runs the reverse round sequence on the data in place; presumably AES-128 decryption of a bitsliced state - TODO confirm against bs.c
+bs_cipher_rev:
+ addd $r17 = $r12, 0 # remember the caller's stack pointer
+ addd $r12 = $r12, -48
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16 # saved return address
+;;
+ sd 16[$r12] = $r18 # $r18, $r19, $r20 are saved here and restored in the epilogue
+ addd $r18 = $r0, 0 # $r18 = data pointer, kept across all calls
+;;
+ sd 24[$r12] = $r19
+ addd $r19 = $r1, 0 # $r19 = key-block pointer, kept across all calls
+ addd $r0 = $r18, 0
+;;
+ sd 32[$r12] = $r20
+ call bs_transpose # bs_transpose(data)
+;;
+ addd $r1 = $r19, 10240 # key block 10 (10 * 1024 bytes) is applied first
+ addd $r0 = $r18, 0
+ call bs_addroundkey
+;;
+ make $r20, 9 # $r20 = round index, counts down from 9 to 1 in the loop
+;;
+.L119:
+ addd $r0 = $r18, 0
+ call bs_shiftrows_rev
+;;
+ addd $r0 = $r18, 0
+ call bs_apply_sbox_rev
+;;
+ sxwd $r5 = $r20
+ addd $r0 = $r18, 0
+;;
+ slld $r8 = $r5, 10 # round index * 1024 = byte offset of this round's key block
+;;
+ addd $r1 = $r19, $r8
+ call bs_addroundkey # bs_addroundkey(data, key block $r20)
+;;
+ addd $r0 = $r18, 0
+ call bs_mixcolumns_rev
+;;
+ addw $r20 = $r20, -1
+;;
+ cb.wgtz $r20? .L119 # loop while the round index is still greater than zero
+;;
+ addd $r0 = $r18, 0 # final sequence: same as a loop pass but with key block 0 and no bs_mixcolumns_rev
+ call bs_shiftrows_rev
+;;
+ addd $r0 = $r18, 0
+ call bs_apply_sbox_rev
+;;
+ addd $r1 = $r19, 0
+ addd $r0 = $r18, 0
+ call bs_addroundkey # bs_addroundkey(data, key block 0)
+;;
+ addd $r0 = $r18, 0 # set up the argument for the tail call before $r18 is restored
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r20 = 32[$r12]
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 48 # release the frame
+;;
+ goto bs_transpose_rev # tail call: bs_transpose_rev(data) returns directly to our caller
+;;
+ .type bs_cipher_rev, @function
+ .size bs_cipher_rev, . - bs_cipher_rev
diff --git a/test/monniaux/bitsliced-aes/notes.txt b/test/monniaux/bitsliced-aes/notes.txt
index 815d5931..7ad2ff3c 100644
--- a/test/monniaux/bitsliced-aes/notes.txt
+++ b/test/monniaux/bitsliced-aes/notes.txt
@@ -49,3 +49,6 @@ cycles: 1849125
==> test.gcc.k1c.out <==
cycles: 5255763
+
+* hand optimized loads
+cycles: 6027072