 mppa_k1c/Asm.v                                        |    2
 mppa_k1c/Asmblock.v                                   |   17
 mppa_k1c/Asmblockdeps.v                               |    1
 mppa_k1c/Asmblockgen.v                                |   14
 mppa_k1c/Asmblockgenproof1.v                          |  111
 mppa_k1c/Machregs.v                                   |    2
 mppa_k1c/NeedOp.v                                     |   45
 mppa_k1c/Op.v                                         |   55
 mppa_k1c/PostpassSchedulingOracle.ml                  |   10
 mppa_k1c/SelectLong.vp                                |   22
 mppa_k1c/SelectLongproof.v                            |  103
 mppa_k1c/SelectOp.vp                                  |   20
 mppa_k1c/SelectOpproof.v                              |   89
 mppa_k1c/TargetPrinter.ml                             |    4
 mppa_k1c/ValueAOp.v                                   |   34
 test/monniaux/bitsliced-aes/bs.c                      |   53
 test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized  | 3268
 test/monniaux/bitsliced-aes/notes.org                 |   59
 test/monniaux/ternary_builtin/ternary_builtin.c       |   11
 19 files changed, 3835 insertions(+), 85 deletions(-)
diff --git a/mppa_k1c/Asm.v b/mppa_k1c/Asm.v
index 7afed212..245804f3 100644
--- a/mppa_k1c/Asm.v
+++ b/mppa_k1c/Asm.v
@@ -208,6 +208,7 @@ Inductive instruction : Type :=
| Pandnil (rd rs: ireg) (imm: int64) (**r andn long *)
| Pornil (rd rs: ireg) (imm: int64) (**r orn long *)
| Pmaddil (rd rs: ireg) (imm: int64) (**r multiply add imm long *)
+ | Pcmove (bt: btest) (rcond rd rs : ireg) (** conditional move *)
.
(** Correspondance between Asmblock and Asm *)
@@ -354,6 +355,7 @@ Definition basic_to_instruction (b: basic) :=
(** ARRR *)
| PArithARRR Asmblock.Pmaddw rd rs1 rs2 => Pmaddw rd rs1 rs2
| PArithARRR Asmblock.Pmaddl rd rs1 rs2 => Pmaddl rd rs1 rs2
+ | PArithARRR (Asmblock.Pcmove cond) rd rs1 rs2 => Pcmove cond rd rs1 rs2
(** ARRI32 *)
| PArithARRI32 Asmblock.Pmaddiw rd rs1 imm => Pmaddiw rd rs1 imm
diff --git a/mppa_k1c/Asmblock.v b/mppa_k1c/Asmblock.v
index b341388c..2fe27143 100644
--- a/mppa_k1c/Asmblock.v
+++ b/mppa_k1c/Asmblock.v
@@ -412,6 +412,7 @@ Inductive arith_name_rri64 : Type :=
Inductive arith_name_arrr : Type :=
| Pmaddw (**r multiply add word *)
| Pmaddl (**r multiply add long *)
+ | Pcmove (bt: btest) (**r conditional move *)
.
Inductive arith_name_arri32 : Type :=
@@ -1207,6 +1208,22 @@ Definition arith_eval_arrr n v1 v2 v3 :=
match n with
| Pmaddw => Val.add v1 (Val.mul v2 v3)
| Pmaddl => Val.addl v1 (Val.mull v2 v3)
+ | Pcmove bt =>
+ match cmp_for_btest bt with
+ | (Some c, Int) =>
+ match Val.cmp_bool c v2 (Vint Int.zero) with
+ | None => Vundef
+ | Some true => v3
+ | Some false => v1
+ end
+ | (Some c, Long) =>
+ match Val.cmpl_bool c v2 (Vlong Int64.zero) with
+ | None => Vundef
+ | Some true => v3
+ | Some false => v1
+ end
+ | (None, _) => Vundef
+ end
end.
Definition arith_eval_arri32 n v1 v2 v3 :=
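A minimal C sketch (not part of the patch; the function names are hypothetical) of the semantics that arith_eval_arrr gives to Pcmove above, assuming the three ARRR operands are the old destination value, the condition register and the source:

    #include <stdint.h>

    /* Pcmove BTwnez / BTdnez: the destination keeps its old value unless
       the condition register is non-zero, in which case it takes rs. */
    static int32_t cmove_wnez(int32_t old_rd, int32_t rcond, int32_t rs) {
      return (rcond != 0) ? rs : old_rd;   /* BTwnez: 32-bit test */
    }

    static int64_t cmove_dnez(int64_t old_rd, int64_t rcond, int64_t rs) {
      return (rcond != 0) ? rs : old_rd;   /* BTdnez: 64-bit test */
    }

Because the old destination value is one of the inputs, Oselect and Oselectl are also flagged as two-address operations in Machregs.v below.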
diff --git a/mppa_k1c/Asmblockdeps.v b/mppa_k1c/Asmblockdeps.v
index 500fc504..6d124556 100644
--- a/mppa_k1c/Asmblockdeps.v
+++ b/mppa_k1c/Asmblockdeps.v
@@ -1433,6 +1433,7 @@ Definition string_of_name_arrr (n: arith_name_arrr): pstring :=
match n with
| Pmaddw => "Pmaddw"
| Pmaddl => "Pmaddl"
+ | Pcmove _ => "Pcmove"
end.
Definition string_of_name_arri32 (n: arith_name_arri32): pstring :=
diff --git a/mppa_k1c/Asmblockgen.v b/mppa_k1c/Asmblockgen.v
index c03e319c..cf0b2a0a 100644
--- a/mppa_k1c/Asmblockgen.v
+++ b/mppa_k1c/Asmblockgen.v
@@ -729,6 +729,20 @@ Definition transl_op
do rd <- ireg_of res;
transl_cond_op cmp rd args k
+ | Oselect, a0 :: a1 :: aS :: nil =>
+ assertion (mreg_eq a0 res);
+ do r0 <- ireg_of a0;
+ do r1 <- ireg_of a1;
+ do rS <- ireg_of aS;
+ OK (Pcmove BTwnez r0 rS r1 ::i k)
+
+ | Oselectl, a0 :: a1 :: aS :: nil =>
+ assertion (mreg_eq a0 res);
+ do r0 <- ireg_of a0;
+ do r1 <- ireg_of a1;
+ do rS <- ireg_of aS;
+ OK (Pcmove BTdnez r0 rS r1 ::i k)
+
| _, _ =>
Error(msg "Asmgenblock.transl_op")
end.
diff --git a/mppa_k1c/Asmblockgenproof1.v b/mppa_k1c/Asmblockgenproof1.v
index 5486a497..16663522 100644
--- a/mppa_k1c/Asmblockgenproof1.v
+++ b/mppa_k1c/Asmblockgenproof1.v
@@ -1558,6 +1558,26 @@ Ltac TranslOpSimpl :=
[ apply exec_straight_one; reflexivity
| split; [ apply Val.lessdef_same; simpl; Simpl; fail | intros; simpl; Simpl; fail ] ].
+Lemma int_eq_comm:
+ forall (x y: int),
+ (Int.eq x y) = (Int.eq y x).
+Proof.
+ intros.
+ unfold Int.eq.
+ unfold zeq.
+ destruct (Z.eq_dec _ _); destruct (Z.eq_dec _ _); congruence.
+Qed.
+
+Lemma int64_eq_comm:
+ forall (x y: int64),
+ (Int64.eq x y) = (Int64.eq y x).
+Proof.
+ intros.
+ unfold Int64.eq.
+ unfold zeq.
+ destruct (Z.eq_dec _ _); destruct (Z.eq_dec _ _); congruence.
+Qed.
+
Lemma transl_op_correct:
forall op args res k (rs: regset) m v c,
transl_op op args res k = OK c ->
@@ -1645,69 +1665,34 @@ Opaque Int.eq.
- (* Ocmp *)
exploit transl_cond_op_correct; eauto. intros (rs' & A & B & C).
exists rs'; split. eexact A. eauto with asmgen.
-(*
-- (* intconst *)
- exploit loadimm32_correct; eauto. intros (rs' & A & B & C).
- exists rs'; split; eauto. rewrite B; auto with asmgen.
-- (* longconst *)
- exploit loadimm64_correct; eauto. intros (rs' & A & B & C).
- exists rs'; split; eauto. rewrite B; auto with asmgen.
-- (* floatconst *)
- destruct (Float.eq_dec n Float.zero).
-+ subst n. econstructor; split.
- apply exec_straight_one. simpl; eauto. auto.
- split; intros; Simpl.
-+ econstructor; split.
- apply exec_straight_one. simpl; eauto. auto.
- split; intros; Simpl.
-- (* singleconst *)
- destruct (Float32.eq_dec n Float32.zero).
-+ subst n. econstructor; split.
- apply exec_straight_one. simpl; eauto. auto.
- split; intros; Simpl.
-+ econstructor; split.
- apply exec_straight_one. simpl; eauto. auto.
- split; intros; Simpl.
-- (* stackoffset *)
- exploit addptrofs_correct. instantiate (1 := X2); auto with asmgen. intros (rs' & A & B & C).
- exists rs'; split; eauto. auto with asmgen.
-- (* addimm *)
- exploit (opimm32_correct Paddw Paddiw Val.add); auto. instantiate (1 := x0); eauto with asmgen.
- intros (rs' & A & B & C).
- exists rs'; split; eauto. rewrite B; auto with asmgen.
-- (* andimm *)
- exploit (opimm32_correct Pandw Pandiw Val.and); auto. instantiate (1 := x0); eauto with asmgen.
- intros (rs' & A & B & C).
- exists rs'; split; eauto. rewrite B; auto with asmgen.
-- (* orimm *)
- exploit (opimm32_correct Porw Poriw Val.or); auto. instantiate (1 := x0); eauto with asmgen.
- intros (rs' & A & B & C).
- exists rs'; split; eauto. rewrite B; auto with asmgen.
-- (* xorimm *)
- exploit (opimm32_correct Pxorw Pxoriw Val.xor); auto. instantiate (1 := x0); eauto with asmgen.
- intros (rs' & A & B & C).
- exists rs'; split; eauto. rewrite B; auto with asmgen.
-
-
-
-- (* addlimm *)
- exploit (opimm64_correct Paddl Paddil Val.addl); auto. instantiate (1 := x0); eauto with asmgen.
- intros (rs' & A & B & C).
- exists rs'; split; eauto. rewrite B; auto with asmgen.
-
-- (* andimm *)
- exploit (opimm64_correct Pandl Pandil Val.andl); auto. instantiate (1 := x0); eauto with asmgen.
- intros (rs' & A & B & C).
- exists rs'; split; eauto. rewrite B; auto with asmgen.
-- (* orimm *)
- exploit (opimm64_correct Porl Poril Val.orl); auto. instantiate (1 := x0); eauto with asmgen.
- intros (rs' & A & B & C).
- exists rs'; split; eauto. rewrite B; auto with asmgen.
-- (* xorimm *)
- exploit (opimm64_correct Pxorl Pxoril Val.xorl); auto. instantiate (1 := x0); eauto with asmgen.
- intros (rs' & A & B & C).
- exists rs'; split; eauto. rewrite B; auto with asmgen.
-*)
+- (* Oselect *)
+ econstructor; split.
+ + eapply exec_straight_one.
+ simpl; reflexivity.
+ + split.
+ * unfold select.
+ destruct (rs x1) eqn:eqX1; try constructor.
+ destruct (rs x) eqn:eqX; try constructor.
+ destruct (rs x0) eqn:eqX0; try constructor.
+ simpl.
+ rewrite int_eq_comm.
+ destruct (Int.eq i Int.zero); simpl; rewrite Pregmap.gss; constructor.
+ * intros.
+ rewrite Pregmap.gso; congruence.
+- (* Oselectl *)
+ econstructor; split.
+ + eapply exec_straight_one.
+ simpl; reflexivity.
+ + split.
+ * unfold selectl.
+ destruct (rs x1) eqn:eqX1; try constructor.
+ destruct (rs x) eqn:eqX; try constructor.
+ destruct (rs x0) eqn:eqX0; try constructor.
+ simpl.
+ rewrite int64_eq_comm.
+ destruct (Int64.eq i Int64.zero); simpl; rewrite Pregmap.gss; constructor.
+ * intros.
+ rewrite Pregmap.gso; congruence.
Qed.
(** Memory accesses *)
diff --git a/mppa_k1c/Machregs.v b/mppa_k1c/Machregs.v
index 4de37af4..2b3fb1aa 100644
--- a/mppa_k1c/Machregs.v
+++ b/mppa_k1c/Machregs.v
@@ -210,7 +210,7 @@ Global Opaque
Definition two_address_op (op: operation) : bool :=
match op with
- | Ocast32unsigned | Omadd | Omaddimm _ | Omaddl | Omaddlimm _ => true
+ | Ocast32unsigned | Omadd | Omaddimm _ | Omaddl | Omaddlimm _ | Oselect | Oselectl => true
| _ => false
end.
diff --git a/mppa_k1c/NeedOp.v b/mppa_k1c/NeedOp.v
index 2577370c..a6ecb820 100644
--- a/mppa_k1c/NeedOp.v
+++ b/mppa_k1c/NeedOp.v
@@ -117,6 +117,7 @@ Definition needs_of_operation (op: operation) (nv: nval): list nval :=
| Ointofsingle | Ointuofsingle | Osingleofint | Osingleofintu => op1 (default nv)
| Olongofsingle | Olonguofsingle | Osingleoflong | Osingleoflongu => op1 (default nv)
| Ocmp c => needs_of_condition c
+ | Oselect | Oselectl => op3 (default nv)
end.
Definition operation_is_redundant (op: operation) (nv: nval): bool :=
@@ -186,6 +187,46 @@ Proof.
trivial.
Qed.
+Lemma select_sound:
+ forall v0 w0 v1 w1 v2 w2 x,
+ vagree v0 w0 (default x) ->
+ vagree v1 w1 (default x) ->
+ vagree v2 w2 (default x) ->
+ vagree (select v0 v1 v2) (select w0 w1 w2) x.
+Proof.
+ unfold default; intros.
+ destruct x; trivial.
+ - destruct v2; simpl; trivial.
+ destruct v0; simpl; trivial.
+ destruct v1; simpl; trivial.
+ inv H. inv H0. inv H1. simpl.
+ constructor.
+ - destruct v2; simpl; trivial.
+ destruct v0; simpl; trivial.
+ destruct v1; simpl; trivial.
+ inv H. inv H0. inv H1. simpl.
+ constructor.
+Qed.
+
+Lemma selectl_sound:
+ forall v0 w0 v1 w1 v2 w2 x,
+ vagree v0 w0 (default x) ->
+ vagree v1 w1 (default x) ->
+ vagree v2 w2 (default x) ->
+ vagree (selectl v0 v1 v2) (selectl w0 w1 w2) x.
+Proof.
+ unfold default; intros.
+ destruct x; trivial.
+ - destruct v2; simpl; trivial.
+ destruct v0; simpl; trivial.
+ destruct v1; simpl; trivial.
+ - destruct v2; simpl; trivial.
+ destruct v0; simpl; trivial.
+ destruct v1; simpl; trivial.
+ inv H. inv H0. inv H1. simpl.
+ constructor.
+Qed.
+
Remark default_idem: forall nv, default (default nv) = default nv.
Proof.
destruct nv; simpl; trivial.
@@ -238,6 +279,10 @@ Proof.
apply mull_sound; trivial.
rewrite default_idem; trivial.
rewrite default_idem; trivial.
+ (* select *)
+- apply select_sound; trivial.
+ (* selectl *)
+- apply selectl_sound; trivial.
Qed.
Lemma operation_is_redundant_sound:
diff --git a/mppa_k1c/Op.v b/mppa_k1c/Op.v
index c4338857..ec3f1077 100644
--- a/mppa_k1c/Op.v
+++ b/mppa_k1c/Op.v
@@ -181,7 +181,9 @@ Inductive operation : Type :=
| Osingleoflong (**r [rd = float32_of_signed_long(r1)] *)
| Osingleoflongu (**r [rd = float32_of_unsigned_int(r1)] *)
(*c Boolean tests: *)
- | Ocmp (cond: condition). (**r [rd = 1] if condition holds, [rd = 0] otherwise. *)
+ | Ocmp (cond: condition) (**r [rd = 1] if condition holds, [rd = 0] otherwise. *)
+ | Oselect (**r [rd = if r3 then r2 else r1] *)
+ | Oselectl. (**r [rd = if r3 then r2 else r1] *)
(** Addressing modes. [r1], [r2], etc, are the arguments to the
addressing. *)
@@ -250,6 +252,40 @@ Definition eval_condition (cond: condition) (vl: list val) (m: mem): option bool
| _, _ => None
end.
+Definition select (v0 : val) (v1 : val) (vselect : val) : val :=
+ match vselect with
+ | Vint iselect =>
+ match v0 with
+ | Vint i0 =>
+ match v1 with
+ | Vint i1 =>
+ Vint (if Int.cmp Ceq Int.zero iselect
+ then i0
+ else i1)
+ | _ => Vundef
+ end
+ | _ => Vundef
+ end
+ | _ => Vundef
+ end.
+
+Definition selectl (v0 : val) (v1 : val) (vselect : val) : val :=
+ match vselect with
+ | Vlong iselect =>
+ match v0 with
+ | Vlong i0 =>
+ match v1 with
+ | Vlong i1 =>
+ Vlong (if Int64.cmp Ceq Int64.zero iselect
+ then i0
+ else i1)
+ | _ => Vundef
+ end
+ | _ => Vundef
+ end
+ | _ => Vundef
+ end.
+
Definition eval_operation
(F V: Type) (genv: Genv.t F V) (sp: val)
(op: operation) (vl: list val) (m: mem): option val :=
@@ -378,6 +414,8 @@ Definition eval_operation
| Osingleoflong, v1::nil => Val.singleoflong v1
| Osingleoflongu, v1::nil => Val.singleoflongu v1
| Ocmp c, _ => Some (Val.of_optbool (eval_condition c vl m))
+ | Oselect, v0::v1::vselect::nil => Some (select v0 v1 vselect)
+ | Oselectl, v0::v1::vselect::nil => Some (selectl v0 v1 vselect)
| _, _ => None
end.
@@ -565,6 +603,9 @@ Definition type_of_operation (op: operation) : list typ * typ :=
| Osingleoflong => (Tlong :: nil, Tsingle)
| Osingleoflongu => (Tlong :: nil, Tsingle)
| Ocmp c => (type_of_condition c, Tint)
+
+ | Oselect => (Tint :: Tint :: Tint :: nil, Tint)
+ | Oselectl => (Tlong :: Tlong :: Tlong :: nil, Tlong)
end.
Definition type_of_addressing (addr: addressing) : list typ :=
@@ -799,6 +840,10 @@ Proof with (try exact I; try reflexivity; auto using Val.Vptr_has_type).
- destruct v0; simpl in H0; inv H0...
(* cmp *)
- destruct (eval_condition cond vl m)... destruct b...
+ (* select *)
+ - destruct v0; destruct v1; destruct v2; simpl in *; try discriminate; trivial.
+ (* selectl *)
+ - destruct v0; destruct v1; destruct v2; simpl in *; try discriminate; trivial.
Qed.
End SOUNDNESS.
@@ -1324,6 +1369,14 @@ Proof.
exploit eval_condition_inj; eauto. intros EQ; rewrite EQ.
destruct b; simpl; constructor.
simpl; constructor.
+ (* select *)
+ - inv H3; simpl; try constructor.
+ inv H4; simpl; try constructor.
+ inv H2; simpl; constructor.
+ (* selectl *)
+ - inv H3; simpl; try constructor.
+ inv H4; simpl; try constructor.
+ inv H2; simpl; constructor.
Qed.
Lemma eval_addressing_inj:
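A C-level reading of the new select/selectl value functions (illustrative only; the option type is a stand-in for Vundef, not CompCert code): the selector is compared against zero, zero picks the first operand, anything else picks the second, and an argument of the wrong kind makes the result undefined.

    #include <stdbool.h>
    #include <stdint.h>

    /* Stand-in for "Vint or Vundef": defined == false models Vundef. */
    typedef struct { bool defined; int32_t v; } opt_i32;

    /* Mirrors Op.v's select: Vundef unless all arguments are Vint;
       selector == 0 returns v0, any non-zero selector returns v1. */
    static opt_i32 select_i32(opt_i32 v0, opt_i32 v1, opt_i32 vselect) {
      opt_i32 undef = { false, 0 };
      if (!v0.defined || !v1.defined || !vselect.defined) return undef;
      return (vselect.v == 0) ? v0 : v1;
    }

    /* selectl is the same construction over 64-bit values. */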
diff --git a/mppa_k1c/PostpassSchedulingOracle.ml b/mppa_k1c/PostpassSchedulingOracle.ml
index 56b00c7e..9e6e819c 100644
--- a/mppa_k1c/PostpassSchedulingOracle.ml
+++ b/mppa_k1c/PostpassSchedulingOracle.ml
@@ -128,6 +128,7 @@ let arith_rri64_str = function
let arith_arrr_str = function
| Pmaddw -> "Pmaddw"
| Pmaddl -> "Pmaddl"
+ | Pcmove _ -> "Pcmove"
let arith_ri32_str = "Pmake"
@@ -415,7 +416,7 @@ type real_instruction =
| Addw | Andw | Compw | Mulw | Orw | Sbfw | Sraw | Srlw | Sllw | Rorw | Xorw
| Addd | Andd | Compd | Muld | Ord | Sbfd | Srad | Srld | Slld | Xord
| Nandw | Norw | Nxorw | Nandd | Nord | Nxord | Andnw | Ornw | Andnd | Ornd
- | Maddw | Maddd
+ | Maddw | Maddd | Cmoved
| Make | Nop | Sxwd | Zxwd
(* LSU *)
| Lbs | Lbz | Lhs | Lhz | Lws | Ld
@@ -482,7 +483,8 @@ let ab_inst_to_real = function
| "Pfixedudrzz" -> Fixedudz
| "Pfixeddrzz_i32" -> Fixeddz
| "Pfixedudrzz_i32" -> Fixedudz
-
+ | "Pcmove" -> Cmoved
+
| "Plb" -> Lbs
| "Plbu" -> Lbz
| "Plh" -> Lhs
@@ -531,7 +533,7 @@ let rec_to_usage r =
| Some U27L5 | Some U27L10 -> alu_tiny_x
| _ -> raise InvalidEncoding)
| Addd | Andd | Nandd | Ord | Nord | Sbfd | Xord
- | Nxord | Andnd | Ornd ->
+ | Nxord | Andnd | Ornd | Cmoved ->
(match encoding with None | Some U6 | Some S10 -> alu_tiny
| Some U27L5 | Some U27L10 -> alu_tiny_x
| Some E27U27L10 -> alu_tiny_y)
@@ -584,7 +586,7 @@ let real_inst_to_latency = function
| Rorw | Nandw | Norw | Nxorw | Ornw | Andnw
| Nandd | Nord | Nxord | Ornd | Andnd
| Addd | Andd | Compd | Ord | Sbfd | Srad | Srld | Slld | Xord | Make
- | Sxwd | Zxwd | Fcompw | Fcompd
+ | Sxwd | Zxwd | Fcompw | Fcompd | Cmoved
-> 1
| Floatwz | Floatuwz | Fixeduwz | Fixedwz | Floatdz | Floatudz | Fixeddz | Fixedudz -> 4
| Mulw | Muld | Maddw | Maddd -> 2 (* FIXME - WORST CASE. If it's S10 then it's only 1 *)
diff --git a/mppa_k1c/SelectLong.vp b/mppa_k1c/SelectLong.vp
index 0c3618d7..6c34de19 100644
--- a/mppa_k1c/SelectLong.vp
+++ b/mppa_k1c/SelectLong.vp
@@ -258,9 +258,17 @@ Nondetfunction andl (e1: expr) (e2: expr) :=
| Eop (Olongconst n1) Enil, t2 => andlimm n1 t2
| t1, Eop (Olongconst n2) Enil => andlimm n2 t1
| (Eop Onotl (t1:::Enil)), t2 => Eop Oandnl (t1:::t2:::Enil)
- | t1, (Eop Onotl (t2:::Enil)) => Eop Oandnl (t2:::t1:::Enil)
+ | t1, (Eop Onotl (t2:::Enil)) => Eop Oandnl (t2:::t1:::Enil)
| _, _ => Eop Oandl (e1:::e2:::Enil)
end.
+(*
+ | (Eop Ocast32signed
+ ((Eop Oneg ((Eop (Ocmp (Ccomplimm Cne zero1))
+ (y1:::Enil)):::Enil)):::Enil)), v1 =>
+ if Int64.eq zero1 Int64.zero
+ then Eop Oselectl ((Eop (Olongconst Int64.zero) Enil):::v1:::y1:::Enil)
+ else Eop Oandl (e1:::e2:::Enil)
+*)
Nondetfunction orlimm (n1: int64) (e2: expr) :=
if Int64.eq n1 Int64.zero then e2 else
@@ -278,8 +286,20 @@ Nondetfunction orl (e1: expr) (e2: expr) :=
| t1, Eop (Olongconst n2) Enil => orlimm n2 t1
| (Eop Onotl (t1:::Enil)), t2 => Eop Oornl (t1:::t2:::Enil)
| t1, (Eop Onotl (t2:::Enil)) => Eop Oornl (t2:::t1:::Enil)
+ | (Eop Oandl ((Eop Ocast32signed
+ ((Eop Oneg ((Eop (Ocmp (Ccomplimm Ceq zero0))
+ (y0:::Enil)):::Enil)):::Enil)):::v0:::Enil)),
+ (Eop Oandl ((Eop Ocast32signed
+ ((Eop Oneg ((Eop (Ocmp (Ccomplimm Cne zero1))
+ (y1:::Enil)):::Enil)):::Enil)):::v1:::Enil)) =>
+ if same_expr_pure y0 y1
+ && Int64.eq zero0 Int64.zero
+ && Int64.eq zero1 Int64.zero
+ then Eop Oselectl (v0:::v1:::y0:::Enil)
+ else Eop Oorl (e1:::e2:::Enil)
| _, _ => Eop Oorl (e1:::e2:::Enil)
end.
+
Nondetfunction xorlimm (n1: int64) (e2: expr) :=
if Int64.eq n1 Int64.zero then e2 else
diff --git a/mppa_k1c/SelectLongproof.v b/mppa_k1c/SelectLongproof.v
index 79187338..dd4cfa69 100644
--- a/mppa_k1c/SelectLongproof.v
+++ b/mppa_k1c/SelectLongproof.v
@@ -390,6 +390,15 @@ Proof.
- TrivialExists.
Qed.
+Lemma int64_eq_commut: forall x y : int64,
+ (Int64.eq x y) = (Int64.eq y x).
+Proof.
+ intros.
+ predSpec Int64.eq Int64.eq_spec x y;
+ predSpec Int64.eq Int64.eq_spec y x;
+ congruence.
+Qed.
+
Theorem eval_andl: binary_constructor_sound andl Val.andl.
Proof.
unfold andl; destruct Archi.splitlong. apply SplitLongproof.eval_andl.
@@ -398,6 +407,25 @@ Proof.
- InvEval. apply eval_andlimm; auto.
- (*andn*) InvEval. TrivialExists. simpl. congruence.
- (*andn reverse*) InvEval. rewrite Val.andl_commut. TrivialExists; simpl. congruence.
+ (*
+- (* selectl *)
+ InvEval.
+ predSpec Int64.eq Int64.eq_spec zero1 Int64.zero; simpl; TrivialExists.
+ + constructor. econstructor; constructor.
+ constructor; try constructor; try constructor; try eassumption.
+ + simpl in *. f_equal. inv H6.
+ unfold selectl.
+ simpl.
+ destruct v3; simpl; trivial.
+ rewrite int64_eq_commut.
+ destruct (Int64.eq i Int64.zero); simpl.
+ * replace (Int64.repr (Int.signed (Int.neg Int.zero))) with Int64.zero by Int64.bit_solve.
+ destruct y; simpl; trivial.
+ * replace (Int64.repr (Int.signed (Int.neg Int.one))) with Int64.mone by Int64.bit_solve.
+ destruct y; simpl; trivial.
+ rewrite Int64.and_commut. rewrite Int64.and_mone. reflexivity.
+ + constructor. econstructor. constructor. econstructor. constructor. econstructor. constructor. eassumption. constructor. simpl. f_equal. constructor. simpl. f_equal. constructor. simpl. f_equal. constructor. eassumption. constructor.
+ + simpl in *. congruence. *)
- TrivialExists.
Qed.
@@ -414,6 +442,7 @@ Proof.
- TrivialExists.
Qed.
+
Theorem eval_orl: binary_constructor_sound orl Val.orl.
Proof.
unfold orl; destruct Archi.splitlong. apply SplitLongproof.eval_orl.
@@ -423,6 +452,80 @@ Proof.
- InvEval. apply eval_orlimm; auto.
- (*orn*) InvEval. TrivialExists; simpl; congruence.
- (*orn reversed*) InvEval. rewrite Val.orl_commut. TrivialExists; simpl; congruence.
+ - (* selectl *)
+ destruct (same_expr_pure y0 y1) eqn:PURE; simpl; try TrivialExists.
+ predSpec Int64.eq Int64.eq_spec zero0 Int64.zero; simpl; try TrivialExists.
+ predSpec Int64.eq Int64.eq_spec zero1 Int64.zero; simpl; [ | TrivialExists].
+ inv H.
+ inv H0.
+ inv H6.
+ inv H3.
+ inv H2.
+ inv H7.
+ inv H4.
+ inv H3.
+ inv H6.
+ inv H4.
+ inv H3.
+ inv H14.
+ inv H13.
+ inv H6.
+ inv H4.
+ inv H13.
+ inv H14.
+ inv H9.
+ inv H11.
+ inv H13.
+ inv H3.
+ inv H6.
+ inv H7.
+ inv H3.
+ inv H14.
+ inv H17.
+ simpl in *.
+ inv H8.
+ inv H5.
+ inv H10.
+ inv H12.
+ inv H15.
+ inv H16.
+ inv H11.
+ inv H13.
+ unfold same_expr_pure in PURE.
+ destruct y0; try congruence.
+ destruct y1; try congruence.
+ destruct (ident_eq i i0); try congruence; clear PURE.
+ rewrite <- e0 in *; clear e0.
+ inv H6.
+ inv H7.
+ rename v10 into vtest.
+ replace v11 with vtest in * by congruence.
+ TrivialExists.
+ simpl.
+ f_equal.
+ unfold selectl.
+ destruct vtest; simpl; trivial.
+ rewrite Val.andl_commut.
+ destruct v4; simpl; trivial.
+ rewrite Val.andl_commut.
+ rewrite Val.orl_commut.
+ destruct v9; simpl; trivial.
+ rewrite int64_eq_commut.
+ destruct (Int64.eq i1 Int64.zero); simpl.
+
+ + replace (Int64.repr (Int.signed (Int.neg Int.one))) with Int64.mone by Int64.bit_solve.
+ replace (Int64.repr (Int.signed (Int.neg Int.zero))) with Int64.zero by Int64.bit_solve.
+ rewrite Int64.and_mone.
+ rewrite Int64.and_zero.
+ rewrite Int64.or_commut.
+ rewrite Int64.or_zero.
+ reflexivity.
+ + replace (Int64.repr (Int.signed (Int.neg Int.one))) with Int64.mone by Int64.bit_solve.
+ replace (Int64.repr (Int.signed (Int.neg Int.zero))) with Int64.zero by Int64.bit_solve.
+ rewrite Int64.and_mone.
+ rewrite Int64.and_zero.
+ rewrite Int64.or_zero.
+ reflexivity.
- TrivialExists.
Qed.
diff --git a/mppa_k1c/SelectOp.vp b/mppa_k1c/SelectOp.vp
index f6605c11..13650a2c 100644
--- a/mppa_k1c/SelectOp.vp
+++ b/mppa_k1c/SelectOp.vp
@@ -275,7 +275,25 @@ Nondetfunction or (e1: expr) (e2: expr) :=
then Eop (Ororimm n2) (t1:::Enil)
else Eop Oor (e1:::e2:::Enil)
| (Eop Onot (t1:::Enil)), t2 => Eop Oorn (t1:::t2:::Enil)
- | t1, (Eop Onot (t2:::Enil)) => Eop Oorn (t2:::t1:::Enil)
+ | t1, (Eop Onot (t2:::Enil)) => Eop Oorn (t2:::t1:::Enil)
+ | (Eop Oand ((Eop Oneg ((Eop (Ocmp (Ccompimm Ceq zero0))
+ (y0:::Enil)):::Enil)):::v0:::Enil)),
+ (Eop Oand ((Eop Oneg ((Eop (Ocmp (Ccompimm Cne zero1))
+ (y1:::Enil)):::Enil)):::v1:::Enil)) =>
+ if same_expr_pure y0 y1
+ && Int.eq zero0 Int.zero
+ && Int.eq zero1 Int.zero
+ then Eop Oselect (v0:::v1:::y0:::Enil)
+ else Eop Oor (e1:::e2:::Enil)
+ | (Eop Oand ((Eop Oneg ((Eop (Ocmp (Ccompuimm Ceq zero0))
+ (y0:::Enil)):::Enil)):::v0:::Enil)),
+ (Eop Oand ((Eop Oneg ((Eop (Ocmp (Ccompuimm Cne zero1))
+ (y1:::Enil)):::Enil)):::v1:::Enil)) =>
+ if same_expr_pure y0 y1
+ && Int.eq zero0 Int.zero
+ && Int.eq zero1 Int.zero
+ then Eop Oselect (v0:::v1:::y0:::Enil)
+ else Eop Oor (e1:::e2:::Enil)
| _, _ => Eop Oor (e1:::e2:::Enil)
end.
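The new `or` cases fire on the masked-select idiom: both sides of the disjunction test the same pure expression against zero, once with Ceq and once with Cne, and each masks one arm. A minimal C version of the source-level idiom being recognized (illustrative, with hypothetical names):

    #include <stdint.h>

    /* Branch-free select: equivalent to (y != 0) ? v1 : v0.
       -(y == 0) is an all-ones mask when y is zero and 0 otherwise,
       and symmetrically for -(y != 0). */
    static uint32_t masked_select(uint32_t y, uint32_t v0, uint32_t v1) {
      return ((-(uint32_t)(y == 0)) & v0) | ((-(uint32_t)(y != 0)) & v1);
    }

When both comparisons are against zero and y is pure, the whole expression is rewritten to Eop Oselect (v0:::v1:::y:::Enil); the 64-bit case in SelectLong.vp above is analogous.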
diff --git a/mppa_k1c/SelectOpproof.v b/mppa_k1c/SelectOpproof.v
index 89af39ee..d35c4b6d 100644
--- a/mppa_k1c/SelectOpproof.v
+++ b/mppa_k1c/SelectOpproof.v
@@ -92,7 +92,7 @@ Let ge := Genv.globalenv prog.
Variable sp: val.
Variable e: env.
Variable m: mem.
-
+
(* Helper lemmas - from SplitLongproof.v *)
Ltac UseHelper := decompose [Logic.and] arith_helpers_correct; eauto.
@@ -162,7 +162,7 @@ Definition binary_constructor_sound (cstr: expr -> expr -> expr) (sem: val -> va
eval_expr ge sp e m le a x ->
eval_expr ge sp e m le b y ->
exists v, eval_expr ge sp e m le (cstr a b) v /\ Val.lessdef (sem x y) v.
-
+
Theorem eval_addrsymbol:
forall le id ofs,
exists v, eval_expr ge sp e m le (addrsymbol id ofs) v /\ Val.lessdef (Genv.symbol_address ge id ofs) v.
@@ -526,6 +526,15 @@ Proof.
discriminate.
Qed.
+Lemma int_eq_commut: forall x y : int,
+ (Int.eq x y) = (Int.eq y x).
+Proof.
+ intros.
+ predSpec Int.eq Int.eq_spec x y;
+ predSpec Int.eq Int.eq_spec y x;
+ congruence.
+Qed.
+
Theorem eval_or: binary_constructor_sound or Val.or.
Proof.
unfold or; red; intros.
@@ -553,6 +562,82 @@ Proof.
exists (Val.ror v1 (Vint n2)); split. EvalOp. rewrite Val.or_commut. apply ROR; auto.
- (*orn*) TrivialExists; simpl; congruence.
- (*orn reversed*) rewrite Val.or_commut. TrivialExists; simpl; congruence.
+ - (* select *)
+ destruct (same_expr_pure y0 y1) eqn:PURE; simpl; try exact DEFAULT.
+ predSpec Int.eq Int.eq_spec zero0 Int.zero; simpl; try exact DEFAULT.
+ predSpec Int.eq Int.eq_spec zero1 Int.zero; simpl; try exact DEFAULT.
+ TrivialExists.
+ simpl in *.
+ unfold select.
+ f_equal.
+ inv H6.
+ inv H7.
+ inv H9.
+ inv H11.
+ unfold same_expr_pure in PURE.
+ destruct y0; try congruence.
+ destruct y1; try congruence.
+ destruct (ident_eq i i0); try congruence.
+ rewrite <- e0 in *. clear e0. clear PURE.
+ inv H2. inv H5.
+ replace v8 with v4 in * by congruence.
+ rename v4 into vselect.
+ destruct vselect; simpl; trivial.
+ rewrite (Val.and_commut _ v5).
+ destruct v5; simpl; trivial.
+ rewrite (Val.and_commut _ v9).
+ rewrite Val.or_commut.
+ destruct v9; simpl; trivial.
+ rewrite int_eq_commut.
+ destruct (Int.eq i1 Int.zero); simpl.
+ + rewrite Int.and_zero.
+ rewrite Int.or_commut.
+ rewrite Int.or_zero.
+ rewrite Int.and_mone.
+ reflexivity.
+ + rewrite Int.and_mone.
+ rewrite Int.neg_zero.
+ rewrite Int.and_zero.
+ rewrite Int.or_zero.
+ reflexivity.
+ - (* select unsigned *)
+ destruct (same_expr_pure y0 y1) eqn:PURE; simpl; try exact DEFAULT.
+ predSpec Int.eq Int.eq_spec zero0 Int.zero; simpl; try exact DEFAULT.
+ predSpec Int.eq Int.eq_spec zero1 Int.zero; simpl; try exact DEFAULT.
+ TrivialExists.
+ simpl in *.
+ unfold select.
+ f_equal.
+ inv H6.
+ inv H7.
+ inv H9.
+ inv H11.
+ unfold same_expr_pure in PURE.
+ destruct y0; try congruence.
+ destruct y1; try congruence.
+ destruct (ident_eq i i0); try congruence.
+ rewrite <- e0 in *. clear e0. clear PURE.
+ inv H2. inv H5.
+ replace v8 with v4 in * by congruence.
+ rename v4 into vselect.
+ destruct vselect; simpl; trivial.
+ rewrite (Val.and_commut _ v5).
+ destruct v5; simpl; trivial.
+ rewrite (Val.and_commut _ v9).
+ rewrite Val.or_commut.
+ destruct v9; simpl; trivial.
+ rewrite int_eq_commut.
+ destruct (Int.eq i1 Int.zero); simpl.
+ + rewrite Int.and_zero.
+ rewrite Int.or_commut.
+ rewrite Int.or_zero.
+ rewrite Int.and_mone.
+ reflexivity.
+ + rewrite Int.and_mone.
+ rewrite Int.neg_zero.
+ rewrite Int.and_zero.
+ rewrite Int.or_zero.
+ reflexivity.
- apply DEFAULT.
Qed.
diff --git a/mppa_k1c/TargetPrinter.ml b/mppa_k1c/TargetPrinter.ml
index 41a6622a..4be94390 100644
--- a/mppa_k1c/TargetPrinter.ml
+++ b/mppa_k1c/TargetPrinter.ml
@@ -504,6 +504,10 @@ module Target (*: TARGET*) =
| Pmaddil (rd, rs, imm) ->
fprintf oc " maddd %a = %a, %a\n" ireg rd ireg rs coqint64 imm
+ | Pcmove (bt, rd, rcond, rs) ->
+ fprintf oc " cmoved.%a %a? %a = %a\n"
+ bcond bt ireg rcond ireg rd ireg rs
+
let get_section_names name =
let (text, lit) =
match C2C.atom_sections name with
diff --git a/mppa_k1c/ValueAOp.v b/mppa_k1c/ValueAOp.v
index fb1977ea..a3843301 100644
--- a/mppa_k1c/ValueAOp.v
+++ b/mppa_k1c/ValueAOp.v
@@ -41,6 +41,24 @@ Definition eval_static_addressing (addr: addressing) (vl: list aval): aval :=
| _, _ => Vbot
end.
+Definition select (v0 v1 vselect : aval) : aval :=
+ match vselect with
+ | I iselect =>
+ if Int.eq Int.zero iselect
+ then binop_int (fun x0 x1 => x0) v0 v1
+ else binop_int (fun x0 x1 => x1) v0 v1
+ | _ => Vtop
+ end.
+
+Definition selectl (v0 v1 vselect : aval) : aval :=
+ match vselect with
+ | L iselect =>
+ if Int64.eq Int64.zero iselect
+ then binop_long (fun x0 x1 => x0) v0 v1
+ else binop_long (fun x0 x1 => x1) v0 v1
+ | _ => Vtop
+ end.
+
Definition eval_static_operation (op: operation) (vl: list aval): aval :=
match op, vl with
| Omove, v1::nil => v1
@@ -165,6 +183,8 @@ Definition eval_static_operation (op: operation) (vl: list aval): aval :=
| Osingleoflong, v1::nil => singleoflong v1
| Osingleoflongu, v1::nil => singleoflongu v1
| Ocmp c, _ => of_optbool (eval_static_condition c vl)
+ | Oselect, v0::v1::vselect::nil => select v0 v1 vselect
+ | Oselectl, v0::v1::vselect::nil => selectl v0 v1 vselect
| _, _ => Vbot
end.
@@ -241,6 +261,20 @@ Proof.
destruct (propagate_float_constants tt); constructor.
rewrite Ptrofs.add_zero_l; eauto with va.
apply of_optbool_sound. eapply eval_static_condition_sound; eauto.
+ (* select *)
+ - inv H2; simpl; try constructor.
+ + destruct (Int.eq _ _); apply binop_int_sound; trivial.
+ + destruct (Int.eq _ _);
+ destruct a1; destruct a0; eauto; constructor.
+ + destruct (Int.eq _ _);
+ destruct a1; destruct a0; eauto; constructor.
+ + destruct (Int.eq _ _);
+ destruct a1; destruct a0; eauto; constructor.
+ (* selectl *)
+ - inv H2; simpl; try constructor.
+ + destruct (Int64.eq _ _); apply binop_long_sound; trivial.
+ + destruct (Int64.eq _ _);
+ destruct a1; destruct a0; eauto; constructor.
Qed.
End SOUNDNESS.
diff --git a/test/monniaux/bitsliced-aes/bs.c b/test/monniaux/bitsliced-aes/bs.c
index 4a9df4aa..063f36f5 100644
--- a/test/monniaux/bitsliced-aes/bs.c
+++ b/test/monniaux/bitsliced-aes/bs.c
@@ -2,6 +2,21 @@
#include <string.h>
#include "bs.h"
+
+static inline long compcert_ternary_signedl(long x, long v0, long v1) {
+ return ((-(x==0)) & v0) | ((-(x!=0)) & v1);
+}
+
+static inline word_t compcert_ternary(word_t x, word_t v0, word_t v1) {
+ return compcert_ternary_signedl(x, v0, v1);
+}
+
+#if defined(__K1C__)
+#define TERNARY(x, v0, v1) compcert_ternary((x), (v0), (v1))
+#else
+#define TERNARY(x, v0, v1) ((x) ? (v1) : (v0))
+#endif
+
#if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) ||\
defined(__amd64__) || defined(__amd32__)|| defined(__amd16__)
#define bs2le(x) (x)
@@ -14,12 +29,6 @@
#error "endianness not supported"
#endif
-#if 1
-#define TERNARY_XY0(t, x) ((-((t) != 0)) & (x))
-#else
-#define TERNARY_XY0(t, x) (((t) != 0) ? (x) : (0))
-#endif
-
void bs_addroundkey(word_t * B, word_t * rk)
{
int i;
@@ -393,15 +402,23 @@ void bs_transpose_dst(word_t * transpose, word_t * blocks)
int offset = i << MUL_SHIFT;
#ifndef UNROLL_TRANSPOSE
- /* DM experiments */
- /* The normal ternary operator costs us a lot!
- from 10145951 to 7995063 */
- int j;
+ int j;
+#ifdef __COMPCERT__
+ word_t *transptr = transpose+offset;
+ word_t bitmask = ONE;
+ for(j=0; j < WORD_SIZE; j++)
+ {
+ word_t old = *transptr;
+ *(transptr++) = TERNARY(w & bitmask, old, old|bitpos);
+ bitmask <<= 1;
+ }
+#else
for(j=0; j < WORD_SIZE; j++)
{
// TODO make const time
- transpose[offset + j] |= TERNARY_XY0(w & (ONE << j), bitpos);
+ transpose[offset + j] |= (w & (ONE << j)) ? bitpos : 0;
}
+#endif
#else
transpose[(offset)+ 0 ] |= (w & (ONE << 0 )) ? (bitpos) : 0;
@@ -494,11 +511,23 @@ void bs_transpose_rev(word_t * blocks)
word_t offset = k / WORD_SIZE;
#ifndef UNROLL_TRANSPOSE
int j;
+#ifdef __COMPCERT__
+ word_t *transptr = transpose + offset;
+ word_t bitmask = ONE;
for(j=0; j < WORD_SIZE; j++)
{
- word_t bit = TERNARY_XY0((w & (ONE << j)), (ONE << (k % WORD_SIZE)));
+ word_t old = *transptr;
+ *transptr = TERNARY(w & bitmask, old, old | bitpos);
+ transptr += WORDS_PER_BLOCK;
+ bitmask <<= 1;
+ }
+#else
+ for(j=0; j < WORD_SIZE; j++)
+ {
+ word_t bit = (w & (ONE << j)) ? (ONE << (k % WORD_SIZE)) : 0;
transpose[j * WORDS_PER_BLOCK + (offset)] |= bit;
}
+#endif
#else
transpose[0 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 0 )) ? bitpos : 0;
transpose[1 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 1 )) ? bitpos : 0;
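The bs.c change replaces the data-dependent `?:` in the transpose inner loops with the TERNARY macro, whose expansion is exactly the masked-select idiom recognized by the instruction selection above, so the update can compile to a conditional move rather than a branch. A self-contained sketch of the two shapes (word_t, ONE and the helper names mimic bs.c; the stand-in TERNARY below is illustrative, not the exact macro from the patch):

    #include <stdint.h>

    typedef uint64_t word_t;
    #define ONE ((word_t)1)

    /* Illustrative stand-in for bs.c's TERNARY on K1c:
       (x != 0) ? v1 : v0, written branch-free with masks. */
    #define TERNARY(x, v0, v1) \
      (((-(word_t)((x) == 0)) & (v0)) | ((-(word_t)((x) != 0)) & (v1)))

    /* Original inner-loop step: conditional OR, typically a branch. */
    static void step_branchy(word_t *slot, word_t w, int j, word_t bitpos) {
      *slot |= (w & (ONE << j)) ? bitpos : 0;
    }

    /* Rewritten step: read-modify-write through TERNARY, intended to
       become a single cmoved (see the .s listing below). */
    static void step_branchfree(word_t *slot, word_t w, word_t bitmask,
                                word_t bitpos) {
      word_t old = *slot;
      *slot = TERNARY(w & bitmask, old, old | bitpos);
    }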
diff --git a/test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized b/test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized
new file mode 100644
index 00000000..d939f856
--- /dev/null
+++ b/test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized
@@ -0,0 +1,3268 @@
+# File generated by CompCert 3.5
+# Command line: -O3 -Wall -Wno-c11-extensions -fno-unprototyped -S bs.c -o bs.ccomp.k1c.s
+ .text
+ .balign 2
+ .globl bs_addroundkey
+bs_addroundkey:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -16
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ make $r5, 0
+;;
+.L100:
+ sxwd $r6 = $r5
+ addw $r5 = $r5, 1
+ make $r32, 128
+;;
+ slld $r2 = $r6, 3
+ compw.lt $r32 = $r5, $r32
+;;
+ addd $r3 = $r0, $r2
+ addd $r4 = $r1, $r2
+;;
+ ld $r7 = 0[$r3]
+;;
+ ld $r9 = 0[$r4]
+;;
+ xord $r6 = $r7, $r9
+;;
+ sd 0[$r3] = $r6
+;;
+ cb.wnez $r32? .L100
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 16
+;;
+ ret
+;;
+ .type bs_addroundkey, @function
+ .size bs_addroundkey, . - bs_addroundkey
+ .text
+ .balign 2
+ .globl bs_apply_sbox
+bs_apply_sbox:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -32
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ sd 16[$r12] = $r18
+ addd $r18 = $r0, 0
+;;
+ sd 24[$r12] = $r19
+ make $r19, 0
+;;
+.L101:
+ sxwd $r1 = $r19
+;;
+ slld $r0 = $r1, 3
+;;
+ addd $r0 = $r18, $r0
+ call bs_sbox
+;;
+ addw $r19 = $r19, 8
+ make $r32, 128
+;;
+ compw.lt $r32 = $r19, $r32
+;;
+ cb.wnez $r32? .L101
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 32
+;;
+ ret
+;;
+ .type bs_apply_sbox, @function
+ .size bs_apply_sbox, . - bs_apply_sbox
+ .text
+ .balign 2
+ .globl bs_apply_sbox_rev
+bs_apply_sbox_rev:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -32
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ sd 16[$r12] = $r18
+ addd $r18 = $r0, 0
+;;
+ sd 24[$r12] = $r19
+ make $r19, 0
+;;
+.L102:
+ sxwd $r1 = $r19
+;;
+ slld $r0 = $r1, 3
+;;
+ addd $r0 = $r18, $r0
+ call bs_sbox_rev
+;;
+ addw $r19 = $r19, 8
+ make $r32, 128
+;;
+ compw.lt $r32 = $r19, $r32
+;;
+ cb.wnez $r32? .L102
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 32
+;;
+ ret
+;;
+ .type bs_apply_sbox_rev, @function
+ .size bs_apply_sbox_rev, . - bs_apply_sbox_rev
+ .text
+ .balign 2
+ .globl bs_sbox_rev
+bs_sbox_rev:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -96
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ sd 16[$r12] = $r18
+;;
+ sd 24[$r12] = $r19
+;;
+ ld $r7 = 48[$r0]
+;;
+ ld $r3 = 56[$r0]
+;;
+ ld $r4 = 32[$r0]
+ nxord $r40 = $r3, $r7
+;;
+ xord $r10 = $r3, $r4
+ nxord $r11 = $r7, $r4
+ ld $r2 = 8[$r0]
+;;
+ ld $r5 = 24[$r0]
+ nxord $r45 = $r7, $r10
+ xord $r59 = $r7, $r2
+;;
+ xord $r41 = $r4, $r5
+ ld $r1 = 0[$r0]
+ xord $r60 = $r5, $r45
+ andd $r33 = $r10, $r45
+;;
+ nxord $r35 = $r5, $r1
+ xord $r63 = $r2, $r1
+ nxord $r39 = $r1, $r41
+ ld $r6 = 40[$r0]
+;;
+ xord $r46 = $r11, $r63
+ xord $r54 = $r40, $r35
+ ld $r7 = 16[$r0]
+ nxord $r57 = $r6, $r5
+;;
+ xord $r52 = $r40, $r63
+ xord $r50 = $r41, $r63
+ nxord $r47 = $r6, $r7
+ nxord $r38 = $r7, $r2
+;;
+ nxord $r58 = $r6, $r46
+ xord $r19 = $r11, $r47
+ xord $r63 = $r59, $r57
+ xord $r7 = $r41, $r38
+;;
+ xord $r44 = $r35, $r59
+ xord $r18 = $r3, $r47
+ xord $r3 = $r54, $r7
+ xord $r55 = $r54, $r38
+;;
+ nxord $r34 = $r6, $r41
+ xord $r2 = $r50, $r63
+ andd $r57 = $r52, $r19
+ andd $r17 = $r50, $r63
+;;
+ xord $r36 = $r55, $r57
+ andd $r62 = $r46, $r18
+ andd $r53 = $r11, $r39
+ xord $r6 = $r2, $r17
+;;
+ andd $r42 = $r44, $r58
+ andd $r15 = $r41, $r3
+ andd $r2 = $r60, $r7
+ andd $r37 = $r40, $r54
+;;
+ xord $r59 = $r62, $r57
+ xord $r51 = $r42, $r17
+ xord $r8 = $r2, $r15
+ xord $r4 = $r37, $r15
+;;
+ xord $r5 = $r36, $r33
+ xord $r38 = $r59, $r35
+ xord $r48 = $r6, $r53
+ xord $r47 = $r51, $r4
+;;
+ xord $r53 = $r5, $r8
+ xord $r43 = $r38, $r4
+ xord $r56 = $r48, $r8
+ xord $r57 = $r47, $r34
+;;
+ xord $r49 = $r56, $r57
+ andd $r48 = $r56, $r53
+ xord $r47 = $r53, $r43
+ andd $r9 = $r53, $r57
+;;
+ xord $r36 = $r43, $r48
+ xord $r35 = $r57, $r48
+ andd $r62 = $r47, $r9
+ xord $r17 = $r47, $r48
+;;
+ andd $r15 = $r35, $r47
+ andd $r42 = $r36, $r49
+ andd $r47 = $r43, $r56
+ xord $r59 = $r49, $r48
+;;
+ andd $r37 = $r49, $r47
+ xord $r5 = $r43, $r15
+ xord $r4 = $r62, $r17
+ xord $r55 = $r57, $r42
+;;
+ xord $r1 = $r37, $r59
+ xord $r2 = $r5, $r55
+ xord $r47 = $r5, $r4
+ andd $r35 = $r4, $r39
+;;
+ xord $r61 = $r4, $r1
+ xord $r33 = $r55, $r1
+ andd $r62 = $r1, $r45
+ andd $r45 = $r55, $r18
+;;
+ xord $r48 = $r2, $r61
+ andd $r49 = $r2, $r3
+ andd $r6 = $r1, $r10
+ andd $r3 = $r47, $r50
+;;
+ andd $r56 = $r47, $r63
+ andd $r42 = $r5, $r58
+ andd $r1 = $r4, $r11
+ andd $r57 = $r2, $r41
+;;
+ andd $r9 = $r61, $r54
+ andd $r51 = $r33, $r52
+ andd $r58 = $r55, $r46
+ andd $r53 = $r5, $r44
+;;
+ andd $r41 = $r48, $r60
+ andd $r10 = $r61, $r40
+ xord $r59 = $r49, $r57
+ xord $r61 = $r3, $r1
+;;
+ andd $r34 = $r33, $r19
+ andd $r39 = $r48, $r7
+ xord $r55 = $r9, $r41
+ xord $r60 = $r45, $r6
+;;
+ xord $r48 = $r62, $r35
+ xord $r15 = $r56, $r53
+ xord $r44 = $r59, $r61
+ xord $r49 = $r51, $r10
+;;
+ xord $r54 = $r34, $r42
+ xord $r51 = $r58, $r60
+ xord $r59 = $r59, $r48
+ xord $r8 = $r56, $r42
+;;
+ xord $r47 = $r9, $r1
+ xord $r11 = $r60, $r15
+ xord $r40 = $r55, $r44
+ xord $r60 = $r15, $r51
+;;
+ xord $r52 = $r56, $r41
+ xord $r56 = $r10, $r54
+ xord $r2 = $r49, $r59
+ xord $r5 = $r59, $r60
+;;
+ xord $r7 = $r3, $r55
+ xord $r61 = $r51, $r56
+ xord $r59 = $r47, $r11
+ xord $r47 = $r8, $r40
+;;
+ xord $r63 = $r35, $r39
+ xord $r4 = $r34, $r45
+ sd 88[$r12] = $r47
+ xord $r51 = $r2, $r59
+;;
+ xord $r10 = $r55, $r48
+ xord $r50 = $r44, $r63
+ sd 80[$r12] = $r51
+ xord $r37 = $r7, $r5
+;;
+ xord $r53 = $r54, $r44
+ sd 72[$r12] = $r37
+ xord $r37 = $r4, $r40
+ xord $r40 = $r50, $r61
+;;
+ xord $r1 = $r58, $r57
+ sd 64[$r12] = $r37
+ xord $r46 = $r10, $r53
+ xord $r7 = $r52, $r50
+;;
+ sd 56[$r12] = $r40
+ xord $r49 = $r49, $r1
+ addd $r1 = $r12, 32
+ make $r2, 64
+;;
+ sd 48[$r12] = $r46
+;;
+ sd 40[$r12] = $r7
+;;
+ sd 32[$r12] = $r49
+ call memmove
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 96
+;;
+ ret
+;;
+ .type bs_sbox_rev, @function
+ .size bs_sbox_rev, . - bs_sbox_rev
+ .text
+ .balign 2
+ .globl bs_sbox
+bs_sbox:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -80
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ ld $r5 = 56[$r0]
+;;
+ ld $r6 = 32[$r0]
+;;
+ xord $r41 = $r5, $r6
+ ld $r2 = 16[$r0]
+;;
+ xord $r42 = $r5, $r2
+ ld $r4 = 8[$r0]
+ xord $r49 = $r6, $r2
+;;
+ xord $r48 = $r5, $r4
+ ld $r55 = 24[$r0]
+;;
+ xord $r9 = $r55, $r4
+ ld $r3 = 48[$r0]
+;;
+ xord $r5 = $r41, $r9
+ ld $r7 = 40[$r0]
+ xord $r34 = $r3, $r2
+;;
+ xord $r10 = $r3, $r7
+ ld $r1 = 0[$r0]
+ xord $r11 = $r7, $r2
+ xord $r3 = $r48, $r49
+;;
+ xord $r33 = $r9, $r34
+ xord $r8 = $r9, $r11
+ xord $r44 = $r6, $r1
+ xord $r47 = $r4, $r1
+;;
+ xord $r59 = $r1, $r10
+ xord $r7 = $r5, $r34
+ xord $r9 = $r10, $r44
+ xord $r4 = $r10, $r47
+;;
+ xord $r61 = $r1, $r5
+ xord $r57 = $r5, $r10
+ andd $r50 = $r3, $r5
+ andd $r43 = $r9, $r1
+;;
+ xord $r34 = $r59, $r8
+ xord $r6 = $r41, $r9
+ xord $r35 = $r42, $r4
+ xord $r36 = $r48, $r8
+;;
+ xord $r38 = $r41, $r11
+ xord $r40 = $r7, $r50
+ xord $r11 = $r43, $r50
+ andd $r50 = $r48, $r8
+;;
+ xord $r15 = $r42, $r57
+ andd $r62 = $r35, $r61
+ andd $r37 = $r4, $r59
+ xord $r52 = $r36, $r50
+;;
+ andd $r53 = $r6, $r34
+ andd $r55 = $r41, $r33
+ andd $r46 = $r49, $r38
+ andd $r54 = $r42, $r57
+;;
+ xord $r39 = $r53, $r50
+ xord $r60 = $r46, $r55
+ xord $r55 = $r54, $r55
+ xord $r10 = $r40, $r62
+;;
+ xord $r44 = $r6, $r34
+ xord $r43 = $r11, $r15
+ xord $r15 = $r52, $r37
+ xord $r17 = $r39, $r55
+;;
+ xord $r45 = $r10, $r60
+ xord $r55 = $r43, $r55
+ xord $r50 = $r15, $r60
+ xord $r46 = $r17, $r44
+;;
+ xord $r63 = $r50, $r46
+ andd $r43 = $r50, $r45
+ xord $r56 = $r45, $r55
+ andd $r54 = $r45, $r46
+;;
+ xord $r36 = $r55, $r43
+ xord $r47 = $r46, $r43
+ andd $r40 = $r56, $r54
+ andd $r60 = $r55, $r50
+;;
+ andd $r2 = $r47, $r56
+ andd $r58 = $r36, $r63
+ xord $r36 = $r56, $r43
+ andd $r15 = $r63, $r60
+;;
+ xord $r47 = $r63, $r43
+ xord $r17 = $r55, $r2
+ xord $r50 = $r40, $r36
+ xord $r52 = $r46, $r58
+;;
+ xord $r58 = $r15, $r47
+ xord $r51 = $r17, $r52
+ xord $r7 = $r17, $r50
+ andd $r43 = $r52, $r1
+;;
+ xord $r53 = $r50, $r58
+ xord $r62 = $r52, $r58
+ andd $r44 = $r7, $r8
+ andd $r8 = $r50, $r59
+;;
+ xord $r40 = $r51, $r53
+ andd $r45 = $r58, $r61
+ andd $r54 = $r51, $r33
+ andd $r10 = $r58, $r35
+;;
+ andd $r47 = $r40, $r38
+ andd $r46 = $r62, $r3
+ andd $r2 = $r51, $r41
+ xord $r35 = $r8, $r10
+;;
+ andd $r5 = $r62, $r5
+ andd $r36 = $r17, $r34
+ andd $r39 = $r53, $r57
+ andd $r56 = $r7, $r48
+;;
+ andd $r41 = $r40, $r49
+ xord $r34 = $r45, $r46
+ xord $r51 = $r44, $r2
+ xord $r62 = $r54, $r47
+;;
+ andd $r38 = $r52, $r9
+ andd $r37 = $r50, $r4
+ andd $r9 = $r17, $r6
+ xord $r59 = $r46, $r35
+;;
+ andd $r61 = $r53, $r42
+ xord $r1 = $r2, $r41
+ xord $r63 = $r5, $r43
+ xord $r33 = $r39, $r56
+;;
+ xord $r42 = $r41, $r51
+ xord $r51 = $r5, $r34
+ xord $r52 = $r36, $r37
+ xord $r49 = $r59, $r62
+;;
+ xord $r57 = $r47, $r33
+ xord $r3 = $r9, $r63
+ xord $r11 = $r54, $r2
+ xord $r50 = $r10, $r1
+;;
+ xord $r37 = $r43, $r36
+ xord $r6 = $r56, $r52
+ xord $r9 = $r51, $r62
+ xord $r40 = $r42, $r49
+;;
+ xord $r36 = $r61, $r33
+ xord $r10 = $r42, $r57
+ xord $r56 = $r52, $r57
+ xord $r57 = $r3, $r11
+;;
+ xord $r5 = $r35, $r51
+ sd 72[$r12] = $r40
+ nxord $r43 = $r50, $r9
+ nxord $r17 = $r36, $r57
+;;
+ xord $r39 = $r8, $r1
+ xord $r53 = $r38, $r35
+ xord $r8 = $r1, $r35
+ sd 64[$r12] = $r43
+;;
+ xord $r7 = $r34, $r37
+ xord $r58 = $r3, $r53
+ sd 56[$r12] = $r17
+ xord $r38 = $r42, $r5
+;;
+ xord $r48 = $r6, $r63
+ sd 48[$r12] = $r38
+ xord $r1 = $r8, $r7
+ xord $r43 = $r10, $r58
+;;
+ sd 40[$r12] = $r1
+ nxord $r4 = $r39, $r56
+ nxord $r34 = $r42, $r48
+ addd $r1 = $r12, 16
+;;
+ sd 32[$r12] = $r43
+ make $r2, 64
+;;
+ sd 24[$r12] = $r4
+;;
+ sd 16[$r12] = $r34
+ call memmove
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 80
+;;
+ ret
+;;
+ .type bs_sbox, @function
+ .size bs_sbox, . - bs_sbox
+ .text
+ .balign 2
+ .globl bs_transpose
+bs_transpose:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -1056
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ sd 16[$r12] = $r18
+ addd $r18 = $r0, 0
+ addd $r0 = $r12, 24
+ make $r1, 0
+;;
+ make $r2, 1024
+ call memset
+;;
+ addd $r0 = $r12, 24
+ addd $r1 = $r18, 0
+ call bs_transpose_dst
+;;
+ addd $r1 = $r12, 24
+ make $r2, 1024
+ addd $r0 = $r18, 0
+ call memmove
+;;
+ ld $r16 = 8[$r12]
+;;
+ ld $r18 = 16[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 1056
+;;
+ ret
+;;
+ .type bs_transpose, @function
+ .size bs_transpose, . - bs_transpose
+ .text
+ .balign 2
+ .globl bs_transpose_dst
+bs_transpose_dst:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -16
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ make $r4, 0
+;;
+.L103:
+ make $r35, 1
+ make $r17, 0
+;;
+ slld $r41 = $r35, $r4
+;;
+ addw $r9 = $r41, 0
+;;
+.L104:
+ sllw $r10 = $r4, 1
+ sllw $r42 = $r17, 6
+ make $r6, 0
+;;
+ addw $r36 = $r10, $r17
+;;
+ sxwd $r15 = $r36
+;;
+ slld $r2 = $r15, 3
+;;
+ addd $r8 = $r1, $r2
+;;
+ ld $r11 = 0[$r8]
+;;
+.L105:
+ addw $r40 = $r42, $r6
+ make $r2, 0
+ make $r44, 1
+ make $r32, 64
+;;
+ sxwd $r34 = $r40
+ sxwd $r39 = $r9
+ slld $r37 = $r44, $r6
+ addw $r6 = $r6, 1
+;;
+ ld.xs $r7 = $r34[$r0]
+ andd $r33 = $r11, $r37
+ compw.lt $r32 = $r6, $r32
+;;
+ cmoved.dnez $r33? $r2 = $r39
+;;
+ ord $r38 = $r7, $r2
+;;
+ sd.xs $r34[$r0] = $r38
+ cb.wnez $r32? .L105
+;;
+ addw $r17 = $r17, 1
+ make $r32, 2
+;;
+ compw.lt $r32 = $r17, $r32
+;;
+ cb.wnez $r32? .L104
+;;
+ addw $r4 = $r4, 1
+ make $r32, 64
+;;
+ compw.lt $r32 = $r4, $r32
+;;
+ cb.wnez $r32? .L103
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 16
+;;
+ ret
+;;
+ .type bs_transpose_dst, @function
+ .size bs_transpose_dst, . - bs_transpose_dst
+ .text
+ .balign 2
+ .globl bs_transpose_rev
+bs_transpose_rev:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -1056
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ sd 16[$r12] = $r18
+ addd $r18 = $r0, 0
+ addd $r0 = $r12, 24
+ make $r1, 0
+;;
+ make $r2, 1024
+ call memset
+;;
+ make $r3, 0
+;;
+.L106:
+ sxwd $r8 = $r3
+ sraw $r32 = $r3, 31
+ make $r11, 0
+;;
+ slld $r34 = $r8, 3
+ srlw $r32 = $r32, 26
+;;
+ addd $r6 = $r18, $r34
+ addw $r32 = $r3, $r32
+;;
+ sraw $r2 = $r32, 6
+;;
+ sxwd $r5 = $r2
+;;
+ ld $r36 = 0[$r6]
+;;
+.L107:
+ make $r39, 1
+;;
+ slld $r38 = $r39, $r11
+;;
+ andd $r17 = $r36, $r38
+;;
+ cb.deqz $r17? .L108
+;;
+ make $r44, 1
+ sraw $r32 = $r3, 31
+;;
+ srlw $r32 = $r32, 26
+;;
+ addw $r32 = $r3, $r32
+;;
+ sraw $r40 = $r32, 6
+;;
+ sllw $r9 = $r40, 6
+;;
+ sbfw $r45 = $r9, $r3
+;;
+ slld $r0 = $r44, $r45
+ goto .L109
+;;
+.L108:
+ make $r0, 0
+;;
+.L109:
+ addd $r37 = $r12, 24
+ sllw $r46 = $r11, 1
+ addw $r11 = $r11, 1
+ make $r32, 64
+;;
+ sxwd $r7 = $r46
+ compw.lt $r32 = $r11, $r32
+;;
+ addd $r4 = $r7, $r5
+;;
+ slld $r10 = $r4, 3
+;;
+ addd $r1 = $r37, $r10
+;;
+ ld $r41 = 0[$r1]
+;;
+ ord $r35 = $r41, $r0
+;;
+ sd 0[$r1] = $r35
+;;
+ cb.wnez $r32? .L107
+;;
+ addw $r3 = $r3, 1
+ make $r32, 128
+;;
+ compw.lt $r32 = $r3, $r32
+;;
+ cb.wnez $r32? .L106
+;;
+ addd $r1 = $r12, 24
+ make $r2, 1024
+ addd $r0 = $r18, 0
+ call memmove
+;;
+ ld $r16 = 8[$r12]
+;;
+ ld $r18 = 16[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 1056
+;;
+ ret
+;;
+ .type bs_transpose_rev, @function
+ .size bs_transpose_rev, . - bs_transpose_rev
+ .text
+ .balign 2
+ .globl bs_shiftrows
+bs_shiftrows:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -1040
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ addd $r50 = $r12, 16
+ addd $r1 = $r0, 0
+ addd $r43 = $r0, 256
+ addd $r8 = $r0, 512
+;;
+ addd $r60 = $r0, 768
+ make $r15, 0
+ make $r52, 32
+ make $r3, 64
+;;
+ make $r36, 96
+ make $r7, 0
+;;
+.L110:
+ ld $r5 = 0[$r1]
+ addw $r59 = $r52, 40
+ addw $r7 = $r7, 1
+ make $r32, 4
+;;
+ sd 0[$r50] = $r5
+ andw $r52 = $r59, 127
+ addw $r63 = $r36, 40
+ compw.lt $r32 = $r7, $r32
+;;
+ andw $r36 = $r63, 127
+ sxwd $r62 = $r52
+;;
+ slld $r53 = $r62, 3
+;;
+ ld $r11 = 8[$r1]
+;;
+ sd 8[$r50] = $r11
+;;
+ ld $r61 = 16[$r1]
+;;
+ sd 16[$r50] = $r61
+;;
+ ld $r6 = 24[$r1]
+;;
+ sd 24[$r50] = $r6
+;;
+ ld $r56 = 32[$r1]
+;;
+ sd 32[$r50] = $r56
+;;
+ ld $r2 = 40[$r1]
+;;
+ sd 40[$r50] = $r2
+ addw $r2 = $r3, 40
+;;
+ andw $r3 = $r2, 127
+ sxwd $r2 = $r36
+;;
+ sxwd $r5 = $r3
+ slld $r39 = $r2, 3
+;;
+ ld $r38 = 48[$r1]
+ slld $r46 = $r5, 3
+;;
+ sd 48[$r50] = $r38
+;;
+ ld $r54 = 56[$r1]
+;;
+ sd 56[$r50] = $r54
+;;
+ ld $r4 = 0[$r43]
+;;
+ sd 256[$r50] = $r4
+;;
+ ld $r58 = 8[$r43]
+;;
+ sd 264[$r50] = $r58
+;;
+ ld $r10 = 16[$r43]
+;;
+ sd 272[$r50] = $r10
+;;
+ ld $r34 = 24[$r43]
+;;
+ sd 280[$r50] = $r34
+;;
+ ld $r51 = 32[$r43]
+;;
+ sd 288[$r50] = $r51
+;;
+ ld $r9 = 40[$r43]
+;;
+ sd 296[$r50] = $r9
+;;
+ ld $r1 = 48[$r43]
+;;
+ sd 304[$r50] = $r1
+;;
+ ld $r4 = 56[$r43]
+ addd $r43 = $r0, $r53
+;;
+ sd 312[$r50] = $r4
+;;
+ ld $r41 = 0[$r8]
+;;
+ sd 512[$r50] = $r41
+;;
+ ld $r9 = 8[$r8]
+;;
+ sd 520[$r50] = $r9
+;;
+ ld $r6 = 16[$r8]
+;;
+ sd 528[$r50] = $r6
+;;
+ ld $r9 = 24[$r8]
+;;
+ sd 536[$r50] = $r9
+;;
+ ld $r42 = 32[$r8]
+;;
+ sd 544[$r50] = $r42
+;;
+ ld $r35 = 40[$r8]
+;;
+ sd 552[$r50] = $r35
+;;
+ ld $r10 = 48[$r8]
+;;
+ sd 560[$r50] = $r10
+;;
+ ld $r57 = 56[$r8]
+;;
+ sd 568[$r50] = $r57
+;;
+ ld $r17 = 0[$r60]
+;;
+ sd 768[$r50] = $r17
+;;
+ ld $r8 = 8[$r60]
+;;
+ sd 776[$r50] = $r8
+ addw $r8 = $r15, 40
+;;
+ andw $r15 = $r8, 127
+ addd $r8 = $r0, $r46
+;;
+ sxwd $r37 = $r15
+;;
+ ld $r48 = 16[$r60]
+ slld $r40 = $r37, 3
+;;
+ sd 784[$r50] = $r48
+ addd $r1 = $r0, $r40
+;;
+ ld $r33 = 24[$r60]
+;;
+ sd 792[$r50] = $r33
+;;
+ ld $r47 = 32[$r60]
+;;
+ sd 800[$r50] = $r47
+;;
+ ld $r4 = 40[$r60]
+;;
+ sd 808[$r50] = $r4
+;;
+ ld $r44 = 48[$r60]
+;;
+ sd 816[$r50] = $r44
+;;
+ ld $r49 = 56[$r60]
+ addd $r60 = $r0, $r39
+;;
+ sd 824[$r50] = $r49
+ addd $r50 = $r50, 64
+ cb.wnez $r32? .L110
+;;
+ addd $r1 = $r12, 16
+ make $r2, 1024
+ call memmove
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 1040
+;;
+ ret
+;;
+ .type bs_shiftrows, @function
+ .size bs_shiftrows, . - bs_shiftrows
+ .text
+ .balign 2
+ .globl bs_shiftrows_rev
+bs_shiftrows_rev:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -1040
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ addd $r56 = $r12, 16
+ addd $r34 = $r12, 16
+ addd $r45 = $r12, 272
+ addd $r6 = $r12, 528
+;;
+ addd $r62 = $r12, 784
+ make $r4, 0
+ make $r10, 32
+ make $r55, 64
+;;
+ make $r2, 96
+ make $r59, 0
+;;
+.L111:
+ ld $r43 = 0[$r0]
+ addw $r9 = $r4, 40
+ addw $r59 = $r59, 1
+ make $r32, 4
+;;
+ sd 0[$r34] = $r43
+ andw $r4 = $r9, 127
+ addw $r51 = $r10, 40
+ compw.lt $r32 = $r59, $r32
+;;
+ andw $r10 = $r51, 127
+ sxwd $r39 = $r4
+;;
+ slld $r60 = $r39, 3
+;;
+ ld $r57 = 8[$r0]
+;;
+ sd 8[$r34] = $r57
+;;
+ ld $r63 = 16[$r0]
+;;
+ sd 16[$r34] = $r63
+;;
+ ld $r7 = 24[$r0]
+;;
+ sd 24[$r34] = $r7
+;;
+ ld $r44 = 32[$r0]
+;;
+ sd 32[$r34] = $r44
+;;
+ ld $r42 = 40[$r0]
+;;
+ sd 40[$r34] = $r42
+;;
+ ld $r40 = 48[$r0]
+;;
+ sd 48[$r34] = $r40
+;;
+ ld $r61 = 56[$r0]
+;;
+ sd 56[$r34] = $r61
+ addd $r34 = $r56, $r60
+;;
+ ld $r35 = 256[$r0]
+;;
+ sd 0[$r45] = $r35
+;;
+ ld $r1 = 264[$r0]
+;;
+ sd 8[$r45] = $r1
+ addw $r1 = $r2, 40
+;;
+ andw $r2 = $r1, 127
+;;
+ ld $r49 = 272[$r0]
+;;
+ sd 16[$r45] = $r49
+;;
+ ld $r37 = 280[$r0]
+;;
+ sd 24[$r45] = $r37
+;;
+ ld $r54 = 288[$r0]
+;;
+ sd 32[$r45] = $r54
+;;
+ ld $r15 = 296[$r0]
+;;
+ sd 40[$r45] = $r15
+;;
+ ld $r3 = 304[$r0]
+;;
+ sd 48[$r45] = $r3
+;;
+ ld $r5 = 312[$r0]
+;;
+ sd 56[$r45] = $r5
+ sxwd $r5 = $r2
+;;
+ slld $r38 = $r5, 3
+;;
+ ld $r53 = 512[$r0]
+;;
+ sd 0[$r6] = $r53
+;;
+ ld $r33 = 520[$r0]
+;;
+ sd 8[$r6] = $r33
+;;
+ ld $r8 = 528[$r0]
+;;
+ sd 16[$r6] = $r8
+;;
+ ld $r11 = 536[$r0]
+;;
+ sd 24[$r6] = $r11
+;;
+ ld $r47 = 544[$r0]
+;;
+ sd 32[$r6] = $r47
+;;
+ ld $r3 = 552[$r0]
+;;
+ sd 40[$r6] = $r3
+;;
+ ld $r17 = 560[$r0]
+;;
+ sd 48[$r6] = $r17
+;;
+ ld $r52 = 568[$r0]
+;;
+ sd 56[$r6] = $r52
+ sxwd $r6 = $r10
+;;
+ slld $r1 = $r6, 3
+;;
+ addd $r45 = $r56, $r1
+;;
+ ld $r8 = 768[$r0]
+;;
+ sd 0[$r62] = $r8
+;;
+ ld $r41 = 776[$r0]
+;;
+ sd 8[$r62] = $r41
+;;
+ ld $r3 = 784[$r0]
+;;
+ sd 16[$r62] = $r3
+ addw $r3 = $r55, 40
+;;
+ andw $r55 = $r3, 127
+;;
+ sxwd $r7 = $r55
+;;
+ ld $r36 = 792[$r0]
+ slld $r58 = $r7, 3
+;;
+ sd 24[$r62] = $r36
+ addd $r6 = $r56, $r58
+;;
+ ld $r48 = 800[$r0]
+;;
+ sd 32[$r62] = $r48
+;;
+ ld $r11 = 808[$r0]
+;;
+ sd 40[$r62] = $r11
+;;
+ ld $r46 = 816[$r0]
+;;
+ sd 48[$r62] = $r46
+;;
+ ld $r50 = 824[$r0]
+ addd $r0 = $r0, 64
+;;
+ sd 56[$r62] = $r50
+ addd $r62 = $r56, $r38
+ cb.wnez $r32? .L111
+;;
+ addd $r0 = $r0, -256
+ addd $r1 = $r12, 16
+ make $r2, 1024
+ call memmove
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 1040
+;;
+ ret
+;;
+ .type bs_shiftrows_rev, @function
+ .size bs_shiftrows_rev, . - bs_shiftrows_rev
+ .text
+ .balign 2
+ .globl bs_shiftmix
+bs_shiftmix:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -1088
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ sd 16[$r12] = $r18
+ addd $r4 = $r0, 256
+ addd $r1 = $r0, 512
+ addd $r3 = $r0, 768
+;;
+ sd 24[$r12] = $r19
+ addd $r19 = $r12, 64
+ make $r18, 0
+ addd $r2 = $r0, 0
+;;
+ sd 32[$r12] = $r20
+ make $r20, 64
+;;
+ sd 40[$r12] = $r21
+ make $r21, 96
+;;
+ sd 48[$r12] = $r22
+ make $r22, 32
+;;
+ sd 56[$r12] = $r23
+ make $r23, 0
+;;
+.L112:
+ ld $r46 = 64[$r4]
+ addw $r23 = $r23, 1
+ make $r32, 4
+;;
+ ld $r8 = 128[$r1]
+ compw.lt $r32 = $r23, $r32
+;;
+ ld $r5 = 56[$r2]
+ xord $r57 = $r46, $r8
+;;
+ ld $r59 = 120[$r4]
+;;
+ xord $r7 = $r5, $r59
+ ld $r17 = 192[$r3]
+;;
+ xord $r5 = $r57, $r17
+;;
+ xord $r61 = $r5, $r7
+;;
+ sd 0[$r19] = $r61
+;;
+ ld $r48 = 0[$r2]
+;;
+ ld $r62 = 64[$r4]
+;;
+ xord $r42 = $r48, $r62
+ ld $r60 = 72[$r4]
+;;
+ xord $r5 = $r42, $r60
+ ld $r61 = 136[$r1]
+;;
+ xord $r45 = $r5, $r61
+ ld $r40 = 200[$r3]
+;;
+ xord $r45 = $r45, $r40
+;;
+ xord $r5 = $r45, $r7
+;;
+ sd 8[$r19] = $r5
+;;
+ ld $r11 = 8[$r2]
+;;
+ ld $r51 = 72[$r4]
+;;
+ xord $r45 = $r11, $r51
+ ld $r40 = 80[$r4]
+;;
+ xord $r37 = $r45, $r40
+ ld $r39 = 144[$r1]
+;;
+ xord $r6 = $r37, $r39
+ ld $r42 = 208[$r3]
+;;
+ xord $r59 = $r6, $r42
+;;
+ sd 16[$r19] = $r59
+;;
+ ld $r6 = 16[$r2]
+;;
+ ld $r44 = 80[$r4]
+;;
+ xord $r43 = $r6, $r44
+ ld $r9 = 88[$r4]
+;;
+ xord $r52 = $r43, $r9
+ ld $r46 = 152[$r1]
+;;
+ xord $r42 = $r52, $r46
+ ld $r48 = 216[$r3]
+;;
+ xord $r5 = $r42, $r48
+;;
+ xord $r55 = $r5, $r7
+;;
+ sd 24[$r19] = $r55
+;;
+ ld $r34 = 24[$r2]
+;;
+ ld $r8 = 88[$r4]
+;;
+ xord $r62 = $r34, $r8
+ ld $r47 = 96[$r4]
+;;
+ xord $r38 = $r62, $r47
+ ld $r50 = 160[$r1]
+;;
+ xord $r34 = $r38, $r50
+ ld $r56 = 224[$r3]
+;;
+ xord $r8 = $r34, $r56
+;;
+ xord $r11 = $r8, $r7
+;;
+ sd 32[$r19] = $r11
+;;
+ ld $r5 = 96[$r4]
+;;
+ ld $r53 = 32[$r2]
+;;
+ xord $r44 = $r53, $r5
+ ld $r54 = 168[$r1]
+;;
+ ld $r5 = 104[$r4]
+;;
+ xord $r40 = $r44, $r5
+;;
+ xord $r10 = $r40, $r54
+;;
+ ld $r5 = 232[$r3]
+;;
+ xord $r39 = $r10, $r5
+;;
+ sd 40[$r19] = $r39
+;;
+ ld $r5 = 40[$r2]
+;;
+ ld $r58 = 104[$r4]
+;;
+ xord $r17 = $r5, $r58
+ ld $r15 = 112[$r4]
+;;
+ xord $r37 = $r17, $r15
+ ld $r5 = 176[$r1]
+;;
+ xord $r57 = $r37, $r5
+ ld $r51 = 240[$r3]
+;;
+ xord $r57 = $r57, $r51
+;;
+ sd 48[$r19] = $r57
+;;
+ ld $r40 = 48[$r2]
+;;
+ ld $r52 = 112[$r4]
+;;
+ xord $r35 = $r40, $r52
+ ld $r5 = 120[$r4]
+;;
+ xord $r5 = $r35, $r5
+ ld $r49 = 184[$r1]
+;;
+ xord $r15 = $r5, $r49
+ ld $r34 = 248[$r3]
+;;
+ xord $r46 = $r15, $r34
+;;
+ sd 56[$r19] = $r46
+;;
+ ld $r33 = 0[$r2]
+;;
+ ld $r36 = 128[$r1]
+;;
+ ld $r48 = 120[$r4]
+ xord $r42 = $r33, $r36
+;;
+ ld $r5 = 184[$r1]
+;;
+ xord $r34 = $r48, $r5
+ ld $r47 = 192[$r3]
+;;
+ xord $r60 = $r42, $r47
+;;
+ xord $r60 = $r60, $r34
+;;
+ sd 64[$r19] = $r60
+;;
+ ld $r43 = 8[$r2]
+;;
+ ld $r47 = 64[$r4]
+;;
+ xord $r63 = $r43, $r47
+ ld $r52 = 128[$r1]
+;;
+ xord $r5 = $r63, $r52
+ ld $r7 = 136[$r1]
+;;
+ xord $r60 = $r5, $r7
+ ld $r15 = 200[$r3]
+;;
+ xord $r55 = $r60, $r15
+;;
+ xord $r48 = $r55, $r34
+;;
+ sd 72[$r19] = $r48
+;;
+ ld $r56 = 16[$r2]
+;;
+ ld $r5 = 72[$r4]
+;;
+ xord $r7 = $r56, $r5
+ ld $r46 = 136[$r1]
+;;
+ xord $r41 = $r7, $r46
+ ld $r40 = 144[$r1]
+;;
+ xord $r5 = $r41, $r40
+ ld $r47 = 208[$r3]
+;;
+ xord $r5 = $r5, $r47
+;;
+ sd 80[$r19] = $r5
+;;
+ ld $r52 = 24[$r2]
+;;
+ ld $r54 = 80[$r4]
+;;
+ xord $r35 = $r52, $r54
+ ld $r63 = 144[$r1]
+;;
+ xord $r7 = $r35, $r63
+ ld $r8 = 152[$r1]
+;;
+ xord $r33 = $r7, $r8
+ ld $r37 = 216[$r3]
+;;
+ xord $r56 = $r33, $r37
+;;
+ xord $r54 = $r56, $r34
+;;
+ sd 88[$r19] = $r54
+;;
+ ld $r9 = 32[$r2]
+;;
+ ld $r6 = 88[$r4]
+;;
+ xord $r44 = $r9, $r6
+ ld $r51 = 152[$r1]
+;;
+ xord $r35 = $r44, $r51
+ ld $r52 = 160[$r1]
+;;
+ xord $r38 = $r35, $r52
+ ld $r9 = 224[$r3]
+;;
+ xord $r62 = $r38, $r9
+;;
+ xord $r6 = $r62, $r34
+;;
+ sd 96[$r19] = $r6
+;;
+ ld $r15 = 40[$r2]
+;;
+ ld $r17 = 96[$r4]
+;;
+ xord $r36 = $r15, $r17
+ ld $r5 = 160[$r1]
+;;
+ xord $r50 = $r36, $r5
+ ld $r51 = 168[$r1]
+;;
+ xord $r37 = $r50, $r51
+ ld $r42 = 232[$r3]
+;;
+ xord $r58 = $r37, $r42
+;;
+ sd 104[$r19] = $r58
+;;
+ ld $r56 = 48[$r2]
+;;
+ ld $r41 = 104[$r4]
+;;
+ xord $r11 = $r56, $r41
+ ld $r48 = 168[$r1]
+;;
+ xord $r51 = $r11, $r48
+ ld $r58 = 176[$r1]
+;;
+ xord $r61 = $r51, $r58
+ ld $r5 = 240[$r3]
+;;
+ xord $r61 = $r61, $r5
+;;
+ sd 112[$r19] = $r61
+;;
+ ld $r34 = 56[$r2]
+;;
+ ld $r56 = 112[$r4]
+;;
+ xord $r46 = $r34, $r56
+ ld $r9 = 176[$r1]
+;;
+ xord $r62 = $r46, $r9
+ ld $r33 = 184[$r1]
+;;
+ xord $r46 = $r62, $r33
+ ld $r61 = 248[$r3]
+;;
+ xord $r40 = $r46, $r61
+;;
+ sd 120[$r19] = $r40
+;;
+ ld $r5 = 184[$r1]
+;;
+ ld $r59 = 248[$r3]
+;;
+ xord $r43 = $r5, $r59
+ ld $r55 = 0[$r2]
+;;
+ ld $r5 = 64[$r4]
+;;
+ xord $r42 = $r55, $r5
+ ld $r35 = 192[$r3]
+;;
+ xord $r49 = $r42, $r35
+;;
+ xord $r5 = $r49, $r43
+;;
+ sd 128[$r19] = $r5
+;;
+ ld $r57 = 8[$r2]
+;;
+ ld $r5 = 72[$r4]
+;;
+ xord $r44 = $r57, $r5
+ ld $r45 = 128[$r1]
+;;
+ xord $r17 = $r44, $r45
+ ld $r33 = 192[$r3]
+;;
+ xord $r52 = $r17, $r33
+ ld $r39 = 200[$r3]
+;;
+ xord $r35 = $r52, $r39
+;;
+ xord $r62 = $r35, $r43
+;;
+ sd 136[$r19] = $r62
+;;
+ ld $r5 = 16[$r2]
+;;
+ ld $r39 = 80[$r4]
+;;
+ xord $r36 = $r5, $r39
+ ld $r41 = 136[$r1]
+;;
+ xord $r6 = $r36, $r41
+ ld $r5 = 200[$r3]
+;;
+ xord $r35 = $r6, $r5
+ ld $r11 = 208[$r3]
+;;
+ xord $r37 = $r35, $r11
+;;
+ sd 144[$r19] = $r37
+;;
+ ld $r5 = 24[$r2]
+;;
+ ld $r63 = 88[$r4]
+;;
+ xord $r33 = $r5, $r63
+ ld $r45 = 144[$r1]
+;;
+ xord $r49 = $r33, $r45
+ ld $r36 = 208[$r3]
+;;
+ xord $r55 = $r49, $r36
+ ld $r8 = 216[$r3]
+;;
+ xord $r41 = $r55, $r8
+;;
+ xord $r58 = $r41, $r43
+;;
+ sd 152[$r19] = $r58
+;;
+ ld $r6 = 32[$r2]
+;;
+ ld $r47 = 96[$r4]
+;;
+ xord $r11 = $r6, $r47
+ ld $r61 = 152[$r1]
+;;
+ xord $r44 = $r11, $r61
+ ld $r9 = 216[$r3]
+;;
+ xord $r59 = $r44, $r9
+ ld $r34 = 224[$r3]
+;;
+ xord $r7 = $r59, $r34
+;;
+ xord $r17 = $r7, $r43
+;;
+ sd 160[$r19] = $r17
+;;
+ ld $r54 = 40[$r2]
+;;
+ ld $r53 = 104[$r4]
+;;
+ xord $r7 = $r54, $r53
+ ld $r59 = 160[$r1]
+;;
+ xord $r37 = $r7, $r59
+ ld $r41 = 224[$r3]
+;;
+ xord $r10 = $r37, $r41
+ ld $r46 = 232[$r3]
+;;
+ xord $r10 = $r10, $r46
+;;
+ sd 168[$r19] = $r10
+;;
+ ld $r58 = 48[$r2]
+;;
+ ld $r5 = 112[$r4]
+;;
+ xord $r40 = $r58, $r5
+ ld $r38 = 168[$r1]
+;;
+ xord $r57 = $r40, $r38
+ ld $r51 = 232[$r3]
+;;
+ xord $r60 = $r57, $r51
+ ld $r55 = 240[$r3]
+;;
+ xord $r53 = $r60, $r55
+;;
+ sd 176[$r19] = $r53
+;;
+ ld $r45 = 56[$r2]
+;;
+ ld $r41 = 120[$r4]
+;;
+ xord $r5 = $r45, $r41
+ ld $r53 = 176[$r1]
+;;
+ xord $r38 = $r5, $r53
+ ld $r8 = 240[$r3]
+;;
+ xord $r43 = $r38, $r8
+ ld $r63 = 248[$r3]
+;;
+ xord $r6 = $r43, $r63
+;;
+ sd 184[$r19] = $r6
+;;
+ ld $r8 = 0[$r2]
+;;
+ ld $r58 = 64[$r4]
+;;
+ ld $r35 = 56[$r2]
+ xord $r54 = $r8, $r58
+;;
+ ld $r5 = 248[$r3]
+;;
+ xord $r50 = $r35, $r5
+ ld $r51 = 128[$r1]
+;;
+ xord $r11 = $r54, $r51
+;;
+ xord $r38 = $r11, $r50
+;;
+ sd 192[$r19] = $r38
+;;
+ ld $r63 = 8[$r2]
+;;
+ ld $r54 = 0[$r2]
+;;
+ xord $r54 = $r63, $r54
+ ld $r36 = 72[$r4]
+;;
+ xord $r5 = $r54, $r36
+ ld $r41 = 136[$r1]
+;;
+ xord $r39 = $r5, $r41
+ ld $r58 = 192[$r3]
+;;
+ xord $r44 = $r39, $r58
+;;
+ xord $r33 = $r44, $r50
+;;
+ sd 200[$r19] = $r33
+;;
+ ld $r5 = 8[$r2]
+;;
+ ld $r63 = 16[$r2]
+;;
+ xord $r54 = $r63, $r5
+ ld $r49 = 80[$r4]
+ addw $r63 = $r18, 32
+;;
+ xord $r51 = $r54, $r49
+ ld $r5 = 144[$r1]
+ andw $r18 = $r63, 127
+;;
+ xord $r43 = $r51, $r5
+ ld $r57 = 200[$r3]
+;;
+ xord $r47 = $r43, $r57
+;;
+ sd 208[$r19] = $r47
+ addw $r47 = $r21, 32
+;;
+ andw $r21 = $r47, 127
+;;
+ ld $r7 = 24[$r2]
+;;
+ ld $r15 = 16[$r2]
+;;
+ xord $r56 = $r7, $r15
+ ld $r48 = 88[$r4]
+;;
+ xord $r10 = $r56, $r48
+ ld $r51 = 152[$r1]
+;;
+ xord $r39 = $r10, $r51
+ addw $r10 = $r22, 32
+;;
+ ld $r48 = 208[$r3]
+ andw $r22 = $r10, 127
+;;
+ xord $r53 = $r39, $r48
+;;
+ xord $r37 = $r53, $r50
+;;
+ sd 216[$r19] = $r37
+;;
+ ld $r9 = 32[$r2]
+;;
+ ld $r15 = 24[$r2]
+;;
+ xord $r43 = $r9, $r15
+ ld $r53 = 96[$r4]
+ addw $r15 = $r20, 32
+;;
+ xord $r42 = $r43, $r53
+ ld $r17 = 160[$r1]
+ andw $r20 = $r15, 127
+;;
+ xord $r55 = $r42, $r17
+ ld $r62 = 216[$r3]
+ sxwd $r8 = $r20
+;;
+ xord $r60 = $r55, $r62
+ slld $r43 = $r8, 3
+;;
+ xord $r5 = $r60, $r50
+ sxwd $r50 = $r18
+;;
+ sd 224[$r19] = $r5
+ slld $r39 = $r50, 3
+;;
+ ld $r5 = 40[$r2]
+;;
+ ld $r51 = 32[$r2]
+;;
+ xord $r62 = $r5, $r51
+ ld $r45 = 168[$r1]
+;;
+ ld $r5 = 104[$r4]
+;;
+ xord $r9 = $r62, $r5
+;;
+ xord $r17 = $r9, $r45
+;;
+ ld $r5 = 224[$r3]
+;;
+ xord $r49 = $r17, $r5
+;;
+ sd 232[$r19] = $r49
+;;
+ ld $r33 = 48[$r2]
+;;
+ ld $r57 = 40[$r2]
+;;
+ xord $r49 = $r33, $r57
+ ld $r55 = 112[$r4]
+;;
+ xord $r59 = $r49, $r55
+ ld $r36 = 176[$r1]
+;;
+ xord $r61 = $r59, $r36
+ ld $r52 = 232[$r3]
+;;
+ xord $r6 = $r61, $r52
+;;
+ sd 240[$r19] = $r6
+;;
+ ld $r49 = 56[$r2]
+;;
+ ld $r45 = 48[$r2]
+ addd $r2 = $r0, $r39
+;;
+ xord $r56 = $r49, $r45
+ ld $r59 = 120[$r4]
+;;
+ xord $r11 = $r56, $r59
+ ld $r38 = 184[$r1]
+;;
+ xord $r4 = $r11, $r38
+ ld $r34 = 240[$r3]
+ sxwd $r38 = $r22
+ sxwd $r3 = $r21
+;;
+ xord $r1 = $r4, $r34
+ slld $r10 = $r38, 3
+ slld $r36 = $r3, 3
+;;
+ sd 248[$r19] = $r1
+ addd $r19 = $r19, 256
+ addd $r4 = $r0, $r10
+ addd $r1 = $r0, $r43
+;;
+ addd $r3 = $r0, $r36
+ cb.wnez $r32? .L112
+;;
+ addd $r1 = $r12, 64
+ make $r2, 1024
+ call memmove
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r20 = 32[$r12]
+;;
+ ld $r21 = 40[$r12]
+;;
+ ld $r22 = 48[$r12]
+;;
+ ld $r23 = 56[$r12]
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 1088
+;;
+ ret
+;;
+ .type bs_shiftmix, @function
+ .size bs_shiftmix, . - bs_shiftmix
+ .text
+ .balign 2
+ .globl bs_mixcolumns
+bs_mixcolumns:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -1040
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ addd $r46 = $r12, 16
+ make $r45, 0
+;;
+.L113:
+ ld $r60 = 64[$r0]
+ addw $r45 = $r45, 1
+ make $r32, 4
+;;
+ ld $r54 = 128[$r0]
+ compw.lt $r32 = $r45, $r32
+;;
+ ld $r44 = 56[$r0]
+ xord $r49 = $r60, $r54
+;;
+ ld $r7 = 120[$r0]
+;;
+ xord $r57 = $r44, $r7
+ ld $r5 = 192[$r0]
+;;
+ xord $r1 = $r49, $r5
+;;
+ xord $r40 = $r1, $r57
+;;
+ sd 0[$r46] = $r40
+;;
+ ld $r42 = 0[$r0]
+;;
+ xord $r53 = $r42, $r60
+ ld $r39 = 72[$r0]
+;;
+ xord $r43 = $r53, $r39
+ ld $r55 = 136[$r0]
+;;
+ xord $r6 = $r43, $r55
+ ld $r2 = 200[$r0]
+;;
+ xord $r15 = $r6, $r2
+;;
+ xord $r8 = $r15, $r57
+;;
+ sd 8[$r46] = $r8
+;;
+ ld $r35 = 8[$r0]
+;;
+ xord $r59 = $r35, $r39
+ ld $r37 = 80[$r0]
+ xord $r60 = $r35, $r60
+;;
+ xord $r56 = $r59, $r37
+ ld $r6 = 144[$r0]
+ xord $r59 = $r59, $r54
+;;
+ xord $r43 = $r56, $r6
+ ld $r51 = 208[$r0]
+ xord $r59 = $r59, $r5
+;;
+ xord $r11 = $r43, $r51
+ xord $r59 = $r59, $r2
+;;
+ sd 16[$r46] = $r11
+;;
+ ld $r47 = 16[$r0]
+;;
+ xord $r11 = $r47, $r37
+ ld $r52 = 88[$r0]
+;;
+ xord $r48 = $r11, $r52
+ ld $r17 = 152[$r0]
+;;
+ xord $r4 = $r48, $r17
+ ld $r1 = 216[$r0]
+;;
+ xord $r4 = $r4, $r1
+;;
+ xord $r3 = $r4, $r57
+;;
+ sd 24[$r46] = $r3
+;;
+ ld $r8 = 24[$r0]
+;;
+ xord $r58 = $r8, $r52
+ ld $r36 = 96[$r0]
+;;
+ xord $r9 = $r58, $r36
+ ld $r50 = 160[$r0]
+ xord $r58 = $r58, $r6
+;;
+ xord $r40 = $r9, $r50
+ ld $r4 = 224[$r0]
+;;
+ xord $r61 = $r40, $r4
+;;
+ xord $r48 = $r61, $r57
+;;
+ sd 32[$r46] = $r48
+;;
+ ld $r15 = 32[$r0]
+;;
+ xord $r57 = $r15, $r36
+ ld $r38 = 104[$r0]
+;;
+ xord $r61 = $r57, $r38
+ ld $r3 = 168[$r0]
+;;
+ xord $r9 = $r61, $r3
+ ld $r48 = 232[$r0]
+;;
+ xord $r9 = $r9, $r48
+;;
+ sd 40[$r46] = $r9
+;;
+ ld $r43 = 40[$r0]
+;;
+ xord $r34 = $r43, $r38
+ ld $r33 = 112[$r0]
+;;
+ xord $r40 = $r34, $r33
+ ld $r10 = 176[$r0]
+ xord $r63 = $r44, $r33
+;;
+ xord $r49 = $r40, $r10
+ ld $r41 = 240[$r0]
+;;
+ xord $r62 = $r49, $r41
+;;
+ sd 48[$r46] = $r62
+ xord $r62 = $r42, $r54
+;;
+ xord $r62 = $r62, $r5
+;;
+ ld $r9 = 48[$r0]
+;;
+ xord $r56 = $r9, $r33
+ ld $r40 = 184[$r0]
+;;
+ xord $r49 = $r56, $r7
+ xord $r56 = $r56, $r3
+;;
+ xord $r61 = $r49, $r40
+;;
+ ld $r49 = 248[$r0]
+ addd $r0 = $r0, 256
+;;
+ xord $r61 = $r61, $r49
+;;
+ sd 56[$r46] = $r61
+ xord $r61 = $r7, $r40
+;;
+ xord $r62 = $r62, $r61
+;;
+ sd 64[$r46] = $r62
+ xord $r62 = $r60, $r54
+;;
+ xord $r60 = $r62, $r55
+;;
+ xord $r60 = $r60, $r2
+;;
+ xord $r60 = $r60, $r61
+;;
+ sd 72[$r46] = $r60
+ xord $r60 = $r47, $r39
+;;
+ xord $r60 = $r60, $r55
+;;
+ xord $r60 = $r60, $r6
+;;
+ xord $r60 = $r60, $r51
+;;
+ sd 80[$r46] = $r60
+ xord $r60 = $r8, $r37
+;;
+ xord $r60 = $r60, $r6
+;;
+ xord $r60 = $r60, $r17
+;;
+ xord $r60 = $r60, $r1
+;;
+ xord $r60 = $r60, $r61
+;;
+ sd 88[$r46] = $r60
+ xord $r60 = $r15, $r52
+;;
+ xord $r60 = $r60, $r17
+;;
+ xord $r60 = $r60, $r50
+;;
+ xord $r60 = $r60, $r4
+;;
+ xord $r60 = $r60, $r61
+ xord $r61 = $r53, $r5
+;;
+ sd 96[$r46] = $r60
+ xord $r60 = $r43, $r36
+;;
+ xord $r60 = $r60, $r50
+;;
+ xord $r60 = $r60, $r3
+;;
+ xord $r60 = $r60, $r48
+;;
+ sd 104[$r46] = $r60
+ xord $r60 = $r9, $r38
+;;
+ xord $r60 = $r60, $r3
+;;
+ xord $r60 = $r60, $r10
+;;
+ xord $r60 = $r60, $r41
+;;
+ sd 112[$r46] = $r60
+ xord $r60 = $r63, $r10
+;;
+ xord $r60 = $r60, $r40
+;;
+ xord $r60 = $r60, $r49
+;;
+ sd 120[$r46] = $r60
+ xord $r60 = $r40, $r49
+;;
+ xord $r61 = $r61, $r60
+ xord $r63 = $r59, $r60
+ xord $r59 = $r11, $r55
+;;
+ sd 128[$r46] = $r61
+ xord $r59 = $r59, $r2
+;;
+ sd 136[$r46] = $r63
+ xord $r11 = $r59, $r51
+ xord $r63 = $r58, $r51
+;;
+ sd 144[$r46] = $r11
+ xord $r58 = $r63, $r1
+ xord $r11 = $r57, $r17
+;;
+ xord $r61 = $r58, $r60
+ xord $r57 = $r11, $r1
+;;
+ sd 152[$r46] = $r61
+ xord $r57 = $r57, $r4
+;;
+ xord $r57 = $r57, $r60
+;;
+ sd 160[$r46] = $r57
+ xord $r57 = $r34, $r50
+;;
+ xord $r57 = $r57, $r4
+;;
+ xord $r11 = $r57, $r48
+ xord $r57 = $r53, $r54
+;;
+ sd 168[$r46] = $r11
+ xord $r11 = $r56, $r48
+;;
+ xord $r56 = $r11, $r41
+ xord $r11 = $r44, $r7
+;;
+ sd 176[$r46] = $r56
+ xord $r34 = $r11, $r10
+;;
+ xord $r11 = $r34, $r41
+;;
+ xord $r56 = $r11, $r49
+ xord $r49 = $r44, $r49
+ xord $r11 = $r35, $r42
+ xord $r42 = $r9, $r43
+;;
+ sd 184[$r46] = $r56
+ xord $r53 = $r57, $r49
+ xord $r11 = $r11, $r39
+ xord $r58 = $r42, $r33
+;;
+ sd 192[$r46] = $r53
+ xord $r61 = $r11, $r55
+ xord $r53 = $r47, $r35
+;;
+ xord $r39 = $r61, $r5
+ xord $r62 = $r53, $r37
+;;
+ xord $r34 = $r39, $r49
+ xord $r37 = $r62, $r6
+;;
+ sd 200[$r46] = $r34
+ xord $r57 = $r37, $r2
+ xord $r37 = $r8, $r47
+ xord $r34 = $r15, $r8
+;;
+ sd 208[$r46] = $r57
+ xord $r35 = $r37, $r52
+;;
+ xord $r47 = $r35, $r17
+;;
+ xord $r47 = $r47, $r51
+;;
+ xord $r54 = $r47, $r49
+ xord $r47 = $r34, $r36
+;;
+ sd 216[$r46] = $r54
+ xord $r35 = $r47, $r50
+;;
+ xord $r39 = $r35, $r1
+;;
+ xord $r11 = $r39, $r49
+ xord $r49 = $r43, $r15
+;;
+ sd 224[$r46] = $r11
+ xord $r53 = $r49, $r38
+ xord $r11 = $r58, $r10
+ xord $r38 = $r44, $r9
+;;
+ xord $r6 = $r53, $r3
+ xord $r56 = $r11, $r48
+ xord $r52 = $r38, $r7
+;;
+ xord $r15 = $r6, $r4
+ xord $r34 = $r52, $r40
+;;
+ sd 232[$r46] = $r15
+ xord $r58 = $r34, $r41
+;;
+ sd 240[$r46] = $r56
+;;
+ sd 248[$r46] = $r58
+ addd $r46 = $r46, 256
+ cb.wnez $r32? .L113
+;;
+ addd $r0 = $r0, -1024
+ addd $r1 = $r46, -1024
+ make $r2, 1024
+ call memmove
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 1040
+;;
+ ret
+;;
+ .type bs_mixcolumns, @function
+ .size bs_mixcolumns, . - bs_mixcolumns
+ .text
+ .balign 2
+ .globl bs_mixcolumns_rev
+bs_mixcolumns_rev:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -1040
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ addd $r49 = $r12, 16
+ make $r50, 0
+;;
+.L114:
+ ld $r47 = 56[$r0]
+ addw $r50 = $r50, 8
+ make $r32, 32
+;;
+ ld $r11 = 48[$r0]
+ compw.lt $r32 = $r50, $r32
+;;
+ xord $r56 = $r47, $r11
+ ld $r6 = 40[$r0]
+;;
+ xord $r2 = $r56, $r6
+ ld $r15 = 120[$r0]
+;;
+ ld $r33 = 104[$r0]
+ xord $r1 = $r47, $r15
+;;
+ xord $r60 = $r15, $r33
+ ld $r7 = 176[$r0]
+;;
+ xord $r57 = $r2, $r60
+ ld $r48 = 168[$r0]
+;;
+ xord $r63 = $r7, $r48
+ ld $r44 = 64[$r0]
+;;
+ xord $r62 = $r57, $r63
+ ld $r34 = 128[$r0]
+;;
+ ld $r3 = 232[$r0]
+ xord $r38 = $r44, $r34
+;;
+ xord $r9 = $r62, $r3
+ ld $r17 = 192[$r0]
+;;
+ ld $r51 = 112[$r0]
+ xord $r36 = $r38, $r17
+;;
+ xord $r8 = $r56, $r51
+ ld $r10 = 184[$r0]
+ xord $r40 = $r36, $r9
+;;
+ xord $r57 = $r10, $r7
+ ld $r5 = 240[$r0]
+ xord $r4 = $r1, $r10
+;;
+ xord $r54 = $r8, $r57
+ ld $r39 = 248[$r0]
+;;
+ xord $r35 = $r54, $r5
+ xord $r58 = $r4, $r39
+ sd 0[$r49] = $r40
+;;
+ ld $r8 = 0[$r0]
+;;
+ xord $r36 = $r8, $r44
+ ld $r52 = 72[$r0]
+;;
+ xord $r59 = $r36, $r52
+ ld $r45 = 136[$r0]
+;;
+ xord $r59 = $r59, $r45
+ ld $r4 = 200[$r0]
+;;
+ xord $r59 = $r59, $r4
+;;
+ xord $r36 = $r59, $r9
+;;
+ xord $r61 = $r36, $r35
+;;
+ sd 8[$r49] = $r61
+;;
+ ld $r43 = 8[$r0]
+;;
+ xord $r53 = $r43, $r8
+ ld $r46 = 80[$r0]
+;;
+ xord $r53 = $r53, $r52
+ ld $r2 = 144[$r0]
+;;
+ xord $r36 = $r53, $r46
+ ld $r42 = 208[$r0]
+;;
+ xord $r59 = $r36, $r2
+;;
+ xord $r53 = $r59, $r34
+;;
+ xord $r53 = $r53, $r42
+;;
+ xord $r37 = $r53, $r35
+;;
+ xord $r53 = $r37, $r58
+;;
+ sd 16[$r49] = $r53
+;;
+ ld $r1 = 16[$r0]
+;;
+ xord $r41 = $r1, $r43
+ ld $r38 = 88[$r0]
+;;
+ xord $r36 = $r41, $r8
+;;
+ xord $r53 = $r36, $r44
+;;
+ xord $r36 = $r53, $r46
+ ld $r41 = 152[$r0]
+;;
+ xord $r36 = $r36, $r38
+;;
+ xord $r40 = $r36, $r41
+;;
+ xord $r53 = $r40, $r45
+;;
+ xord $r36 = $r53, $r34
+;;
+ ld $r53 = 216[$r0]
+;;
+ xord $r54 = $r36, $r53
+;;
+ xord $r36 = $r54, $r17
+;;
+ xord $r36 = $r36, $r9
+;;
+ xord $r36 = $r36, $r58
+;;
+ sd 24[$r49] = $r36
+;;
+ ld $r36 = 24[$r0]
+;;
+ xord $r54 = $r36, $r1
+ ld $r40 = 96[$r0]
+;;
+ xord $r61 = $r54, $r43
+ ld $r37 = 160[$r0]
+;;
+ xord $r54 = $r61, $r52
+;;
+ xord $r59 = $r54, $r38
+;;
+ xord $r59 = $r59, $r40
+;;
+ xord $r54 = $r59, $r37
+;;
+ xord $r54 = $r54, $r2
+;;
+ xord $r55 = $r54, $r45
+;;
+ ld $r54 = 224[$r0]
+;;
+ xord $r55 = $r55, $r54
+;;
+ xord $r55 = $r55, $r4
+;;
+ xord $r55 = $r55, $r9
+;;
+ xord $r9 = $r55, $r35
+;;
+ sd 32[$r49] = $r9
+;;
+ ld $r9 = 32[$r0]
+ addd $r0 = $r0, 256
+;;
+ xord $r55 = $r9, $r36
+ xord $r61 = $r6, $r9
+;;
+ xord $r55 = $r55, $r1
+;;
+ xord $r55 = $r55, $r46
+;;
+ xord $r55 = $r55, $r40
+;;
+ xord $r55 = $r55, $r33
+;;
+ xord $r55 = $r55, $r48
+;;
+ xord $r55 = $r55, $r41
+;;
+ xord $r55 = $r55, $r2
+;;
+ xord $r55 = $r55, $r3
+;;
+ xord $r55 = $r55, $r42
+;;
+ xord $r35 = $r55, $r35
+;;
+ xord $r60 = $r35, $r58
+ xord $r35 = $r61, $r36
+;;
+ sd 40[$r49] = $r60
+ xord $r55 = $r35, $r38
+;;
+ xord $r62 = $r55, $r33
+;;
+ xord $r59 = $r62, $r51
+ xord $r62 = $r8, $r34
+;;
+ xord $r35 = $r59, $r7
+ xord $r59 = $r11, $r6
+ xord $r62 = $r62, $r17
+;;
+ xord $r35 = $r35, $r37
+;;
+ xord $r35 = $r35, $r41
+;;
+ xord $r55 = $r35, $r5
+ xord $r35 = $r59, $r9
+;;
+ xord $r63 = $r55, $r53
+ xord $r60 = $r35, $r40
+;;
+ xord $r55 = $r63, $r58
+ xord $r35 = $r60, $r51
+;;
+ sd 48[$r49] = $r55
+ xord $r63 = $r35, $r15
+ xord $r55 = $r15, $r51
+;;
+ xord $r35 = $r63, $r10
+ xord $r63 = $r1, $r52
+;;
+ xord $r35 = $r35, $r48
+;;
+ xord $r35 = $r35, $r37
+;;
+ xord $r60 = $r35, $r39
+ xord $r35 = $r55, $r33
+;;
+ xord $r60 = $r60, $r54
+ xord $r35 = $r6, $r35
+;;
+ sd 56[$r49] = $r60
+ xord $r60 = $r10, $r48
+;;
+ xord $r61 = $r35, $r60
+ xord $r60 = $r5, $r3
+ xord $r35 = $r39, $r5
+;;
+ xord $r60 = $r61, $r60
+ xord $r61 = $r11, $r55
+;;
+ xord $r61 = $r61, $r7
+ xord $r62 = $r62, $r60
+;;
+ xord $r61 = $r61, $r35
+ sd 64[$r49] = $r62
+ xord $r62 = $r43, $r44
+;;
+ xord $r62 = $r62, $r45
+;;
+ xord $r62 = $r62, $r34
+;;
+ xord $r62 = $r62, $r4
+;;
+ xord $r62 = $r62, $r60
+;;
+ xord $r62 = $r62, $r61
+;;
+ sd 72[$r49] = $r62
+ xord $r62 = $r63, $r44
+;;
+ xord $r62 = $r62, $r2
+;;
+ xord $r62 = $r62, $r45
+;;
+ xord $r62 = $r62, $r42
+;;
+ xord $r62 = $r62, $r17
+;;
+ xord $r62 = $r62, $r61
+;;
+ xord $r62 = $r62, $r58
+;;
+ sd 80[$r49] = $r62
+ xord $r62 = $r36, $r8
+;;
+ xord $r62 = $r62, $r46
+;;
+ xord $r62 = $r62, $r52
+;;
+ xord $r62 = $r62, $r44
+;;
+ xord $r62 = $r62, $r41
+;;
+ xord $r62 = $r62, $r2
+;;
+ xord $r62 = $r62, $r34
+;;
+ xord $r62 = $r62, $r53
+;;
+ xord $r62 = $r62, $r4
+;;
+ xord $r62 = $r62, $r17
+;;
+ xord $r62 = $r62, $r60
+;;
+ xord $r62 = $r62, $r58
+;;
+ sd 88[$r49] = $r62
+ xord $r62 = $r9, $r43
+;;
+ xord $r62 = $r62, $r38
+;;
+ xord $r62 = $r62, $r46
+;;
+ xord $r62 = $r62, $r52
+;;
+ xord $r62 = $r62, $r37
+;;
+ xord $r62 = $r62, $r41
+;;
+ xord $r62 = $r62, $r45
+;;
+ xord $r62 = $r62, $r54
+;;
+ xord $r62 = $r62, $r42
+;;
+ xord $r62 = $r62, $r4
+;;
+ xord $r60 = $r62, $r60
+;;
+ xord $r60 = $r60, $r61
+;;
+ sd 96[$r49] = $r60
+ xord $r60 = $r6, $r1
+;;
+ xord $r60 = $r60, $r40
+;;
+ xord $r60 = $r60, $r38
+;;
+ xord $r60 = $r60, $r46
+;;
+ xord $r60 = $r60, $r48
+;;
+ xord $r60 = $r60, $r37
+;;
+ xord $r60 = $r60, $r2
+;;
+ xord $r60 = $r60, $r3
+;;
+ xord $r60 = $r60, $r53
+;;
+ xord $r60 = $r60, $r42
+;;
+ xord $r60 = $r60, $r61
+;;
+ xord $r60 = $r60, $r58
+;;
+ sd 104[$r49] = $r60
+ xord $r60 = $r11, $r36
+;;
+ xord $r60 = $r60, $r33
+;;
+ xord $r60 = $r60, $r40
+;;
+ xord $r60 = $r60, $r38
+;;
+ xord $r62 = $r60, $r7
+;;
+ xord $r60 = $r62, $r48
+;;
+ xord $r60 = $r60, $r41
+;;
+ xord $r60 = $r60, $r5
+;;
+ xord $r60 = $r60, $r54
+;;
+ xord $r60 = $r60, $r53
+;;
+ xord $r58 = $r60, $r58
+;;
+ sd 112[$r49] = $r58
+ xord $r58 = $r47, $r9
+;;
+ xord $r58 = $r58, $r51
+;;
+ xord $r58 = $r58, $r33
+;;
+ xord $r58 = $r58, $r40
+;;
+ xord $r58 = $r58, $r10
+;;
+ xord $r58 = $r58, $r7
+;;
+ xord $r58 = $r58, $r37
+;;
+ xord $r58 = $r58, $r39
+;;
+ xord $r63 = $r58, $r3
+;;
+ xord $r58 = $r63, $r54
+;;
+ sd 120[$r49] = $r58
+ xord $r58 = $r57, $r48
+ xord $r57 = $r51, $r57
+;;
+ xord $r60 = $r33, $r58
+ xord $r58 = $r39, $r3
+ xord $r63 = $r57, $r5
+;;
+ xord $r58 = $r60, $r58
+ xord $r63 = $r63, $r56
+ xord $r56 = $r15, $r10
+;;
+ xord $r61 = $r58, $r59
+ xord $r56 = $r56, $r39
+;;
+ xord $r62 = $r56, $r47
+ xord $r56 = $r44, $r17
+;;
+ xord $r56 = $r56, $r8
+;;
+ xord $r56 = $r56, $r61
+;;
+ sd 128[$r49] = $r56
+ xord $r56 = $r52, $r34
+;;
+ xord $r56 = $r56, $r4
+;;
+ xord $r56 = $r56, $r17
+;;
+ xord $r56 = $r56, $r43
+;;
+ xord $r56 = $r56, $r61
+;;
+ xord $r56 = $r56, $r63
+;;
+ sd 136[$r49] = $r56
+ xord $r56 = $r46, $r45
+;;
+ xord $r56 = $r56, $r34
+;;
+ xord $r56 = $r56, $r42
+;;
+ xord $r56 = $r56, $r4
+;;
+ xord $r56 = $r56, $r1
+;;
+ xord $r56 = $r56, $r8
+;;
+ xord $r56 = $r56, $r63
+;;
+ xord $r56 = $r56, $r62
+;;
+ sd 144[$r49] = $r56
+ xord $r56 = $r38, $r44
+;;
+ xord $r56 = $r56, $r2
+;;
+ xord $r56 = $r56, $r45
+;;
+ xord $r56 = $r56, $r34
+;;
+ xord $r56 = $r56, $r53
+;;
+ xord $r56 = $r56, $r42
+;;
+ xord $r56 = $r56, $r17
+;;
+ xord $r56 = $r56, $r36
+;;
+ xord $r59 = $r56, $r43
+;;
+ xord $r56 = $r59, $r8
+;;
+ xord $r59 = $r56, $r61
+;;
+ xord $r56 = $r59, $r62
+ xord $r59 = $r33, $r46
+;;
+ sd 152[$r49] = $r56
+ xord $r56 = $r40, $r52
+;;
+ xord $r56 = $r56, $r41
+;;
+ xord $r56 = $r56, $r2
+;;
+ xord $r56 = $r56, $r45
+;;
+ xord $r56 = $r56, $r54
+;;
+ xord $r56 = $r56, $r53
+;;
+ xord $r56 = $r56, $r4
+;;
+ xord $r56 = $r56, $r9
+;;
+ xord $r56 = $r56, $r1
+;;
+ xord $r56 = $r56, $r43
+;;
+ xord $r56 = $r56, $r61
+;;
+ xord $r56 = $r56, $r63
+;;
+ sd 160[$r49] = $r56
+ xord $r56 = $r59, $r37
+;;
+ xord $r56 = $r56, $r41
+;;
+ xord $r56 = $r56, $r2
+;;
+ xord $r56 = $r56, $r3
+;;
+ xord $r56 = $r56, $r54
+;;
+ xord $r57 = $r56, $r42
+;;
+ xord $r56 = $r57, $r6
+;;
+ xord $r56 = $r56, $r36
+;;
+ xord $r56 = $r56, $r1
+;;
+ xord $r56 = $r56, $r63
+;;
+ xord $r56 = $r56, $r62
+;;
+ sd 168[$r49] = $r56
+ xord $r56 = $r51, $r38
+;;
+ xord $r56 = $r56, $r48
+;;
+ xord $r56 = $r56, $r37
+;;
+ xord $r58 = $r56, $r41
+;;
+ xord $r56 = $r58, $r5
+;;
+ xord $r56 = $r56, $r3
+;;
+ xord $r56 = $r56, $r53
+;;
+ xord $r56 = $r56, $r11
+;;
+ xord $r58 = $r56, $r9
+;;
+ xord $r56 = $r58, $r36
+;;
+ xord $r56 = $r56, $r62
+;;
+ sd 176[$r49] = $r56
+ xord $r56 = $r15, $r40
+;;
+ xord $r56 = $r56, $r7
+;;
+ xord $r56 = $r56, $r48
+;;
+ xord $r56 = $r56, $r37
+;;
+ xord $r60 = $r56, $r39
+ xord $r39 = $r10, $r39
+;;
+ xord $r56 = $r60, $r5
+;;
+ xord $r56 = $r56, $r54
+;;
+ xord $r56 = $r56, $r47
+;;
+ xord $r56 = $r56, $r6
+;;
+ xord $r56 = $r56, $r9
+;;
+ sd 184[$r49] = $r56
+ xord $r56 = $r35, $r3
+;;
+ xord $r57 = $r48, $r56
+ xord $r56 = $r47, $r6
+;;
+ xord $r57 = $r57, $r56
+ xord $r56 = $r51, $r33
+;;
+ xord $r56 = $r57, $r56
+ xord $r57 = $r7, $r35
+;;
+ xord $r35 = $r57, $r11
+ xord $r57 = $r39, $r47
+;;
+ xord $r55 = $r35, $r55
+ xord $r35 = $r57, $r15
+ xord $r57 = $r34, $r8
+ xord $r34 = $r41, $r34
+;;
+ xord $r39 = $r57, $r44
+ xord $r57 = $r45, $r17
+ xord $r58 = $r34, $r42
+;;
+ xord $r39 = $r39, $r56
+ xord $r34 = $r58, $r4
+;;
+ sd 192[$r49] = $r39
+ xord $r39 = $r57, $r43
+;;
+ xord $r39 = $r39, $r8
+;;
+ xord $r39 = $r39, $r52
+;;
+ xord $r57 = $r39, $r56
+;;
+ xord $r39 = $r57, $r55
+;;
+ sd 200[$r49] = $r39
+ xord $r39 = $r2, $r4
+;;
+ xord $r39 = $r39, $r17
+ xord $r17 = $r34, $r17
+;;
+ xord $r39 = $r39, $r1
+;;
+ xord $r57 = $r39, $r43
+;;
+ xord $r39 = $r57, $r46
+;;
+ xord $r39 = $r39, $r44
+;;
+ xord $r63 = $r39, $r55
+;;
+ xord $r57 = $r63, $r35
+;;
+ sd 208[$r49] = $r57
+ xord $r57 = $r17, $r36
+;;
+ xord $r57 = $r57, $r1
+;;
+ xord $r57 = $r57, $r8
+ xord $r8 = $r48, $r2
+;;
+ xord $r17 = $r57, $r38
+ xord $r57 = $r37, $r45
+;;
+ xord $r59 = $r17, $r52
+ xord $r62 = $r57, $r53
+;;
+ xord $r34 = $r59, $r44
+ xord $r39 = $r62, $r42
+;;
+ xord $r44 = $r34, $r56
+;;
+ xord $r44 = $r44, $r35
+;;
+ sd 216[$r49] = $r44
+ xord $r44 = $r39, $r4
+;;
+ xord $r4 = $r44, $r9
+;;
+ xord $r4 = $r4, $r36
+;;
+ xord $r57 = $r4, $r43
+ xord $r43 = $r8, $r54
+;;
+ xord $r59 = $r57, $r40
+ xord $r48 = $r43, $r53
+;;
+ xord $r34 = $r59, $r46
+;;
+ xord $r52 = $r34, $r52
+;;
+ xord $r52 = $r52, $r56
+ xord $r56 = $r10, $r37
+;;
+ xord $r39 = $r52, $r55
+ xord $r52 = $r48, $r42
+;;
+ sd 224[$r49] = $r39
+ xord $r42 = $r52, $r6
+;;
+ xord $r17 = $r42, $r9
+;;
+ xord $r39 = $r17, $r1
+;;
+ xord $r48 = $r39, $r33
+;;
+ xord $r42 = $r48, $r38
+;;
+ xord $r17 = $r42, $r46
+;;
+ xord $r52 = $r17, $r55
+ xord $r55 = $r7, $r41
+;;
+ xord $r17 = $r52, $r35
+ xord $r34 = $r55, $r3
+;;
+ sd 232[$r49] = $r17
+ xord $r44 = $r34, $r54
+;;
+ xord $r39 = $r44, $r53
+;;
+ xord $r60 = $r39, $r11
+;;
+ xord $r61 = $r60, $r6
+;;
+ xord $r8 = $r61, $r36
+;;
+ xord $r4 = $r8, $r51
+;;
+ xord $r39 = $r4, $r40
+;;
+ xord $r2 = $r39, $r38
+;;
+ xord $r2 = $r2, $r35
+ xord $r35 = $r56, $r5
+;;
+ sd 240[$r49] = $r2
+ xord $r62 = $r35, $r3
+;;
+ xord $r38 = $r62, $r54
+;;
+ xord $r48 = $r38, $r47
+;;
+ xord $r38 = $r48, $r11
+;;
+ xord $r11 = $r38, $r9
+;;
+ xord $r1 = $r11, $r15
+;;
+ xord $r55 = $r1, $r33
+;;
+ xord $r5 = $r55, $r40
+;;
+ sd 248[$r49] = $r5
+ addd $r49 = $r49, 256
+ cb.wnez $r32? .L114
+;;
+ addd $r0 = $r0, -1024
+ addd $r1 = $r49, -1024
+ make $r2, 1024
+ call memmove
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 1040
+;;
+ ret
+;;
+ .type bs_mixcolumns_rev, @function
+ .size bs_mixcolumns_rev, . - bs_mixcolumns_rev
+ .text
+ .balign 2
+ .globl bs_expand_key
+bs_expand_key:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -224
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ sd 16[$r12] = $r18
+ addd $r18 = $r0, 0
+ addd $r0 = $r12, 40
+ make $r2, 16
+;;
+ sd 24[$r12] = $r19
+;;
+ sd 32[$r12] = $r20
+ call memmove
+;;
+ addd $r0 = $r12, 40
+ call expand_key
+;;
+ make $r20, 0
+ make $r19, 0
+;;
+.L115:
+ sxwd $r1 = $r20
+ addd $r11 = $r12, 40
+ sxwd $r15 = $r19
+ make $r2, 16
+;;
+ slld $r34 = $r1, 10
+ addd $r1 = $r11, $r15
+;;
+ addd $r0 = $r18, $r34
+ call memmove
+;;
+ make $r1, 2
+;;
+.L116:
+ make $r35, 0
+;;
+.L117:
+ addw $r38 = $r1, $r35
+ sxwd $r37 = $r35
+ addw $r35 = $r35, 1
+ make $r32, 2
+;;
+ sxwd $r6 = $r20
+ sxwd $r33 = $r38
+ slld $r10 = $r37, 3
+ compw.lt $r32 = $r35, $r32
+;;
+ slld $r3 = $r6, 10
+ slld $r9 = $r33, 3
+;;
+ addd $r0 = $r18, $r3
+;;
+ addd $r39 = $r0, $r9
+ addd $r8 = $r0, $r10
+;;
+ ld $r17 = 0[$r8]
+;;
+ sd 0[$r39] = $r17
+;;
+ cb.wnez $r32? .L117
+;;
+ addw $r1 = $r1, 2
+ make $r32, 128
+;;
+ compw.lt $r32 = $r1, $r32
+;;
+ cb.wnez $r32? .L116
+;;
+ call bs_transpose
+;;
+ addw $r20 = $r20, 1
+ addw $r19 = $r19, 16
+ make $r32, 176
+;;
+ compw.lt $r32 = $r19, $r32
+;;
+ cb.wnez $r32? .L115
+;;
+ ld $r16 = 8[$r12]
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r20 = 32[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 224
+;;
+ ret
+;;
+ .type bs_expand_key, @function
+ .size bs_expand_key, . - bs_expand_key
+ .text
+ .balign 2
+ .globl bs_cipher
+bs_cipher:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -48
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ sd 16[$r12] = $r18
+ addd $r18 = $r0, 0
+;;
+ sd 24[$r12] = $r19
+ addd $r19 = $r1, 0
+ addd $r0 = $r18, 0
+;;
+ sd 32[$r12] = $r20
+ call bs_transpose
+;;
+ addd $r1 = $r19, 0
+ addd $r0 = $r18, 0
+ call bs_addroundkey
+;;
+ make $r20, 1
+;;
+.L118:
+ addd $r0 = $r18, 0
+ call bs_apply_sbox
+;;
+ addd $r0 = $r18, 0
+ call bs_shiftmix
+;;
+ sxwd $r6 = $r20
+ addd $r0 = $r18, 0
+;;
+ slld $r4 = $r6, 10
+;;
+ addd $r1 = $r19, $r4
+ call bs_addroundkey
+;;
+ addw $r20 = $r20, 1
+ make $r32, 10
+;;
+ compw.lt $r32 = $r20, $r32
+;;
+ cb.wnez $r32? .L118
+;;
+ addd $r0 = $r18, 0
+ call bs_apply_sbox
+;;
+ addd $r0 = $r18, 0
+ call bs_shiftrows
+;;
+ addd $r1 = $r19, 10240
+ addd $r0 = $r18, 0
+ call bs_addroundkey
+;;
+ addd $r0 = $r18, 0
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r20 = 32[$r12]
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 48
+;;
+ goto bs_transpose_rev
+;;
+ .type bs_cipher, @function
+ .size bs_cipher, . - bs_cipher
+ .text
+ .balign 2
+ .globl bs_cipher_rev
+bs_cipher_rev:
+ addd $r17 = $r12, 0
+ addd $r12 = $r12, -48
+;;
+ sd 0[$r12] = $r17
+;;
+;;
+ get $r16 = $ra
+;;
+ sd 8[$r12] = $r16
+;;
+ sd 16[$r12] = $r18
+ addd $r18 = $r0, 0
+;;
+ sd 24[$r12] = $r19
+ addd $r19 = $r1, 0
+ addd $r0 = $r18, 0
+;;
+ sd 32[$r12] = $r20
+ call bs_transpose
+;;
+ addd $r1 = $r19, 10240
+ addd $r0 = $r18, 0
+ call bs_addroundkey
+;;
+ make $r20, 9
+;;
+.L119:
+ addd $r0 = $r18, 0
+ call bs_shiftrows_rev
+;;
+ addd $r0 = $r18, 0
+ call bs_apply_sbox_rev
+;;
+ sxwd $r5 = $r20
+ addd $r0 = $r18, 0
+;;
+ slld $r8 = $r5, 10
+;;
+ addd $r1 = $r19, $r8
+ call bs_addroundkey
+;;
+ addd $r0 = $r18, 0
+ call bs_mixcolumns_rev
+;;
+ addw $r20 = $r20, -1
+;;
+ cb.wgtz $r20? .L119
+;;
+ addd $r0 = $r18, 0
+ call bs_shiftrows_rev
+;;
+ addd $r0 = $r18, 0
+ call bs_apply_sbox_rev
+;;
+ addd $r1 = $r19, 0
+ addd $r0 = $r18, 0
+ call bs_addroundkey
+;;
+ addd $r0 = $r18, 0
+;;
+ ld $r18 = 16[$r12]
+;;
+ ld $r19 = 24[$r12]
+;;
+ ld $r20 = 32[$r12]
+;;
+ ld $r16 = 8[$r12]
+;;
+ set $ra = $r16
+;;
+ addd $r12 = $r12, 48
+;;
+ goto bs_transpose_rev
+;;
+ .type bs_cipher_rev, @function
+ .size bs_cipher_rev, . - bs_cipher_rev
diff --git a/test/monniaux/bitsliced-aes/notes.org b/test/monniaux/bitsliced-aes/notes.org
new file mode 100644
index 00000000..6c2e27fa
--- /dev/null
+++ b/test/monniaux/bitsliced-aes/notes.org
@@ -0,0 +1,59 @@
+* bs_transpose_dst only
+** original
+==> test.ccomp.host.out <==
+cycles: 3080223
+
+==> test.ccomp.k1c.out <==
+cycles: 10145951
+
+==> test.gcc.host.out <==
+cycles: 1485887
+
+==> test.gcc.k1c.out <==
+cycles: 4078535
+
+** neg and
+==> test.ccomp.host.out <==
+cycles: 2905049
+
+==> test.ccomp.k1c.out <==
+cycles: 7995063
+
+==> test.gcc.host.out <==
+cycles: 1858263
+
+==> test.gcc.k1c.out <==
+cycles: 5255763
+
+** cmove but poor register scheduling
+==> test.ccomp.host.out <==
+cycles: 4363682
+
+==> test.ccomp.k1c.out <==
+cycles: 7208629
+
+==> test.gcc.host.out <==
+cycles: 2916854
+
+==> test.gcc.k1c.out <==
+cycles: 5646730
+
+** cmove via matching on the and
+==> test.ccomp.host.out <==
+cycles: 2553732
+
+==> test.ccomp.k1c.out <==
+cycles: 7208629
+
+==> test.gcc.host.out <==
+cycles: 1849125
+
+==> test.gcc.k1c.out <==
+cycles: 5255763
+
+** hand-optimized loads
+cycles: 6027072
+
+* both bs_transpose_dst and bs_transpose_rev
+** with cmove in both
+cycles: 6890902
diff --git a/test/monniaux/ternary_builtin/ternary_builtin.c b/test/monniaux/ternary_builtin/ternary_builtin.c
new file mode 100644
index 00000000..caa1c4c7
--- /dev/null
+++ b/test/monniaux/ternary_builtin/ternary_builtin.c
@@ -0,0 +1,11 @@
+int ternary_signed(int x, int v0, int v1) {
+ return ((-(x==0)) & v0) | ((-(x!=0)) & v1);
+}
+
+int ternary_unsigned(unsigned x, int v0, int v1) {
+ return ((-(x==0)) & v0) | ((-(x!=0)) & v1);
+}
+
+long ternary_signedl(long x, long v0, long v1) {
+ return ((-(x==0)) & v0) | ((-(x!=0)) & v1);
+}
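
For context: the file above exercises the branch-free "(-(cond) & v0) | (-(!cond) & v1)" idiom that the cmove selection benchmarked in notes.org is meant to recognize and turn into a single conditional move. A minimal, hypothetical standalone check (not part of the patch; the helper names below are assumptions) comparing the idiom against the plain C conditional could look like:

/* Hypothetical check, not part of the patch: confirms the neg-and idiom
 * from ternary_builtin.c agrees with the plain C conditional expression. */
#include <stdio.h>

static int ternary_ref(int x, int v0, int v1) {
  return x == 0 ? v0 : v1;                      /* reference semantics */
}

static int ternary_bits(int x, int v0, int v1) {
  return ((-(x==0)) & v0) | ((-(x!=0)) & v1);   /* idiom under test */
}

int main(void) {
  int ok = 1;
  for (int x = -2; x <= 2; x++)
    ok &= ternary_bits(x, 10, 20) == ternary_ref(x, 10, 20);
  printf(ok ? "ternary idiom OK\n" : "ternary idiom MISMATCH\n");
  return !ok;
}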