diff options
-rw-r--r-- | mppa_k1c/Asm.v | 2 | ||||
-rw-r--r-- | mppa_k1c/Asmblock.v | 17 | ||||
-rw-r--r-- | mppa_k1c/Asmblockdeps.v | 1 | ||||
-rw-r--r-- | mppa_k1c/Asmblockgen.v | 14 | ||||
-rw-r--r-- | mppa_k1c/Asmblockgenproof1.v | 111 | ||||
-rw-r--r-- | mppa_k1c/Machregs.v | 2 | ||||
-rw-r--r-- | mppa_k1c/NeedOp.v | 45 | ||||
-rw-r--r-- | mppa_k1c/Op.v | 55 | ||||
-rw-r--r-- | mppa_k1c/PostpassSchedulingOracle.ml | 10 | ||||
-rw-r--r-- | mppa_k1c/SelectLong.vp | 22 | ||||
-rw-r--r-- | mppa_k1c/SelectLongproof.v | 103 | ||||
-rw-r--r-- | mppa_k1c/SelectOp.vp | 20 | ||||
-rw-r--r-- | mppa_k1c/SelectOpproof.v | 89 | ||||
-rw-r--r-- | mppa_k1c/TargetPrinter.ml | 4 | ||||
-rw-r--r-- | mppa_k1c/ValueAOp.v | 34 | ||||
-rw-r--r-- | test/monniaux/bitsliced-aes/bs.c | 53 | ||||
-rw-r--r-- | test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized | 3268 | ||||
-rw-r--r-- | test/monniaux/bitsliced-aes/notes.org | 59 | ||||
-rw-r--r-- | test/monniaux/ternary_builtin/ternary_builtin.c | 11 |
19 files changed, 3835 insertions, 85 deletions
diff --git a/mppa_k1c/Asm.v b/mppa_k1c/Asm.v index 7afed212..245804f3 100644 --- a/mppa_k1c/Asm.v +++ b/mppa_k1c/Asm.v @@ -208,6 +208,7 @@ Inductive instruction : Type := | Pandnil (rd rs: ireg) (imm: int64) (**r andn long *)
| Pornil (rd rs: ireg) (imm: int64) (**r orn long *)
| Pmaddil (rd rs: ireg) (imm: int64) (**r multiply add imm long *)
+ | Pcmove (bt: btest) (rcond rd rs : ireg) (** conditional move *)
.
(** Correspondance between Asmblock and Asm *)
@@ -354,6 +355,7 @@ Definition basic_to_instruction (b: basic) := (** ARRR *)
| PArithARRR Asmblock.Pmaddw rd rs1 rs2 => Pmaddw rd rs1 rs2
| PArithARRR Asmblock.Pmaddl rd rs1 rs2 => Pmaddl rd rs1 rs2
+ | PArithARRR (Asmblock.Pcmove cond) rd rs1 rs2=> Pcmove cond rd rs1 rs2
(** ARRI32 *)
| PArithARRI32 Asmblock.Pmaddiw rd rs1 imm => Pmaddiw rd rs1 imm
diff --git a/mppa_k1c/Asmblock.v b/mppa_k1c/Asmblock.v index b341388c..2fe27143 100644 --- a/mppa_k1c/Asmblock.v +++ b/mppa_k1c/Asmblock.v @@ -412,6 +412,7 @@ Inductive arith_name_rri64 : Type := Inductive arith_name_arrr : Type := | Pmaddw (**r multiply add word *) | Pmaddl (**r multiply add long *) + | Pcmove (bt: btest) (**r conditional move *) . Inductive arith_name_arri32 : Type := @@ -1207,6 +1208,22 @@ Definition arith_eval_arrr n v1 v2 v3 := match n with | Pmaddw => Val.add v1 (Val.mul v2 v3) | Pmaddl => Val.addl v1 (Val.mull v2 v3) + | Pcmove bt => + match cmp_for_btest bt with + | (Some c, Int) => + match Val.cmp_bool c v2 (Vint Int.zero) with + | None => Vundef + | Some true => v3 + | Some false => v1 + end + | (Some c, Long) => + match Val.cmpl_bool c v2 (Vlong Int64.zero) with + | None => Vundef + | Some true => v3 + | Some false => v1 + end + | (None, _) => Vundef + end end. Definition arith_eval_arri32 n v1 v2 v3 := diff --git a/mppa_k1c/Asmblockdeps.v b/mppa_k1c/Asmblockdeps.v index 500fc504..6d124556 100644 --- a/mppa_k1c/Asmblockdeps.v +++ b/mppa_k1c/Asmblockdeps.v @@ -1433,6 +1433,7 @@ Definition string_of_name_arrr (n: arith_name_arrr): pstring := match n with | Pmaddw => "Pmaddw" | Pmaddl => "Pmaddl" + | Pcmove _ => "Pcmove" end. Definition string_of_name_arri32 (n: arith_name_arri32): pstring := diff --git a/mppa_k1c/Asmblockgen.v b/mppa_k1c/Asmblockgen.v index c03e319c..cf0b2a0a 100644 --- a/mppa_k1c/Asmblockgen.v +++ b/mppa_k1c/Asmblockgen.v @@ -729,6 +729,20 @@ Definition transl_op do rd <- ireg_of res; transl_cond_op cmp rd args k + | Oselect, a0 :: a1 :: aS :: nil => + assertion (mreg_eq a0 res); + do r0 <- ireg_of a0; + do r1 <- ireg_of a1; + do rS <- ireg_of aS; + OK (Pcmove BTwnez r0 rS r1 ::i k) + + | Oselectl, a0 :: a1 :: aS :: nil => + assertion (mreg_eq a0 res); + do r0 <- ireg_of a0; + do r1 <- ireg_of a1; + do rS <- ireg_of aS; + OK (Pcmove BTdnez r0 rS r1 ::i k) + | _, _ => Error(msg "Asmgenblock.transl_op") end. diff --git a/mppa_k1c/Asmblockgenproof1.v b/mppa_k1c/Asmblockgenproof1.v index 5486a497..16663522 100644 --- a/mppa_k1c/Asmblockgenproof1.v +++ b/mppa_k1c/Asmblockgenproof1.v @@ -1558,6 +1558,26 @@ Ltac TranslOpSimpl := [ apply exec_straight_one; reflexivity | split; [ apply Val.lessdef_same; simpl; Simpl; fail | intros; simpl; Simpl; fail ] ]. +Lemma int_eq_comm: + forall (x y: int), + (Int.eq x y) = (Int.eq y x). +Proof. + intros. + unfold Int.eq. + unfold zeq. + destruct (Z.eq_dec _ _); destruct (Z.eq_dec _ _); congruence. +Qed. + +Lemma int64_eq_comm: + forall (x y: int64), + (Int64.eq x y) = (Int64.eq y x). +Proof. + intros. + unfold Int64.eq. + unfold zeq. + destruct (Z.eq_dec _ _); destruct (Z.eq_dec _ _); congruence. +Qed. + Lemma transl_op_correct: forall op args res k (rs: regset) m v c, transl_op op args res k = OK c -> @@ -1645,69 +1665,34 @@ Opaque Int.eq. - (* Ocmp *) exploit transl_cond_op_correct; eauto. intros (rs' & A & B & C). exists rs'; split. eexact A. eauto with asmgen. -(* -- (* intconst *) - exploit loadimm32_correct; eauto. intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* longconst *) - exploit loadimm64_correct; eauto. intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* floatconst *) - destruct (Float.eq_dec n Float.zero). -+ subst n. econstructor; split. - apply exec_straight_one. simpl; eauto. auto. - split; intros; Simpl. -+ econstructor; split. - apply exec_straight_one. simpl; eauto. auto. - split; intros; Simpl. -- (* singleconst *) - destruct (Float32.eq_dec n Float32.zero). -+ subst n. econstructor; split. - apply exec_straight_one. simpl; eauto. auto. - split; intros; Simpl. -+ econstructor; split. - apply exec_straight_one. simpl; eauto. auto. - split; intros; Simpl. -- (* stackoffset *) - exploit addptrofs_correct. instantiate (1 := X2); auto with asmgen. intros (rs' & A & B & C). - exists rs'; split; eauto. auto with asmgen. -- (* addimm *) - exploit (opimm32_correct Paddw Paddiw Val.add); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* andimm *) - exploit (opimm32_correct Pandw Pandiw Val.and); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* orimm *) - exploit (opimm32_correct Porw Poriw Val.or); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* xorimm *) - exploit (opimm32_correct Pxorw Pxoriw Val.xor); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. - - - -- (* addlimm *) - exploit (opimm64_correct Paddl Paddil Val.addl); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. - -- (* andimm *) - exploit (opimm64_correct Pandl Pandil Val.andl); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* orimm *) - exploit (opimm64_correct Porl Poril Val.orl); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* xorimm *) - exploit (opimm64_correct Pxorl Pxoril Val.xorl); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -*) +- (* Oselect *) + econstructor; split. + + eapply exec_straight_one. + simpl; reflexivity. + + split. + * unfold select. + destruct (rs x1) eqn:eqX1; try constructor. + destruct (rs x) eqn:eqX; try constructor. + destruct (rs x0) eqn:eqX0; try constructor. + simpl. + rewrite int_eq_comm. + destruct (Int.eq i Int.zero); simpl; rewrite Pregmap.gss; constructor. + * intros. + rewrite Pregmap.gso; congruence. +- (* Oselectl *) + econstructor; split. + + eapply exec_straight_one. + simpl; reflexivity. + + split. + * unfold selectl. + destruct (rs x1) eqn:eqX1; try constructor. + destruct (rs x) eqn:eqX; try constructor. + destruct (rs x0) eqn:eqX0; try constructor. + simpl. + rewrite int64_eq_comm. + destruct (Int64.eq i Int64.zero); simpl; rewrite Pregmap.gss; constructor. + * intros. + rewrite Pregmap.gso; congruence. Qed. (** Memory accesses *) diff --git a/mppa_k1c/Machregs.v b/mppa_k1c/Machregs.v index 4de37af4..2b3fb1aa 100644 --- a/mppa_k1c/Machregs.v +++ b/mppa_k1c/Machregs.v @@ -210,7 +210,7 @@ Global Opaque Definition two_address_op (op: operation) : bool := match op with - | Ocast32unsigned | Omadd | Omaddimm _ | Omaddl | Omaddlimm _ => true + | Ocast32unsigned | Omadd | Omaddimm _ | Omaddl | Omaddlimm _ | Oselect | Oselectl => true | _ => false end. diff --git a/mppa_k1c/NeedOp.v b/mppa_k1c/NeedOp.v index 2577370c..a6ecb820 100644 --- a/mppa_k1c/NeedOp.v +++ b/mppa_k1c/NeedOp.v @@ -117,6 +117,7 @@ Definition needs_of_operation (op: operation) (nv: nval): list nval := | Ointofsingle | Ointuofsingle | Osingleofint | Osingleofintu => op1 (default nv) | Olongofsingle | Olonguofsingle | Osingleoflong | Osingleoflongu => op1 (default nv) | Ocmp c => needs_of_condition c + | Oselect | Oselectl => op3 (default nv) end. Definition operation_is_redundant (op: operation) (nv: nval): bool := @@ -186,6 +187,46 @@ Proof. trivial. Qed. +Lemma select_sound: + forall v0 w0 v1 w1 v2 w2 x, + vagree v0 w0 (default x) -> + vagree v1 w1 (default x) -> + vagree v2 w2 (default x) -> + vagree (select v0 v1 v2) (select w0 w1 w2) x. +Proof. + unfold default; intros. + destruct x; trivial. + - destruct v2; simpl; trivial. + destruct v0; simpl; trivial. + destruct v1; simpl; trivial. + inv H. inv H0. inv H1. simpl. + constructor. + - destruct v2; simpl; trivial. + destruct v0; simpl; trivial. + destruct v1; simpl; trivial. + inv H. inv H0. inv H1. simpl. + constructor. +Qed. + +Lemma selectl_sound: + forall v0 w0 v1 w1 v2 w2 x, + vagree v0 w0 (default x) -> + vagree v1 w1 (default x) -> + vagree v2 w2 (default x) -> + vagree (selectl v0 v1 v2) (selectl w0 w1 w2) x. +Proof. + unfold default; intros. + destruct x; trivial. + - destruct v2; simpl; trivial. + destruct v0; simpl; trivial. + destruct v1; simpl; trivial. + - destruct v2; simpl; trivial. + destruct v0; simpl; trivial. + destruct v1; simpl; trivial. + inv H. inv H0. inv H1. simpl. + constructor. +Qed. + Remark default_idem: forall nv, default (default nv) = default nv. Proof. destruct nv; simpl; trivial. @@ -238,6 +279,10 @@ Proof. apply mull_sound; trivial. rewrite default_idem; trivial. rewrite default_idem; trivial. + (* select *) +- apply select_sound; trivial. + (* selectl *) +- apply selectl_sound; trivial. Qed. Lemma operation_is_redundant_sound: diff --git a/mppa_k1c/Op.v b/mppa_k1c/Op.v index c4338857..ec3f1077 100644 --- a/mppa_k1c/Op.v +++ b/mppa_k1c/Op.v @@ -181,7 +181,9 @@ Inductive operation : Type := | Osingleoflong (**r [rd = float32_of_signed_long(r1)] *) | Osingleoflongu (**r [rd = float32_of_unsigned_int(r1)] *) (*c Boolean tests: *) - | Ocmp (cond: condition). (**r [rd = 1] if condition holds, [rd = 0] otherwise. *) + | Ocmp (cond: condition) (**r [rd = 1] if condition holds, [rd = 0] otherwise. *) + | Oselect (**r [rd = if r3 then r2 else r1] *) + | Oselectl. (**r [rd = if r3 then r2 else r1] *) (** Addressing modes. [r1], [r2], etc, are the arguments to the addressing. *) @@ -250,6 +252,40 @@ Definition eval_condition (cond: condition) (vl: list val) (m: mem): option bool | _, _ => None end. +Definition select (v0 : val) (v1 : val) (vselect : val) : val := + match vselect with + | Vint iselect => + match v0 with + | Vint i0 => + match v1 with + | Vint i1 => + Vint (if Int.cmp Ceq Int.zero iselect + then i0 + else i1) + | _ => Vundef + end + | _ => Vundef + end + | _ => Vundef + end. + +Definition selectl (v0 : val) (v1 : val) (vselect : val) : val := + match vselect with + | Vlong iselect => + match v0 with + | Vlong i0 => + match v1 with + | Vlong i1 => + Vlong (if Int64.cmp Ceq Int64.zero iselect + then i0 + else i1) + | _ => Vundef + end + | _ => Vundef + end + | _ => Vundef + end. + Definition eval_operation (F V: Type) (genv: Genv.t F V) (sp: val) (op: operation) (vl: list val) (m: mem): option val := @@ -378,6 +414,8 @@ Definition eval_operation | Osingleoflong, v1::nil => Val.singleoflong v1 | Osingleoflongu, v1::nil => Val.singleoflongu v1 | Ocmp c, _ => Some (Val.of_optbool (eval_condition c vl m)) + | Oselect, v0::v1::vselect::nil => Some (select v0 v1 vselect) + | Oselectl, v0::v1::vselect::nil => Some (selectl v0 v1 vselect) | _, _ => None end. @@ -565,6 +603,9 @@ Definition type_of_operation (op: operation) : list typ * typ := | Osingleoflong => (Tlong :: nil, Tsingle) | Osingleoflongu => (Tlong :: nil, Tsingle) | Ocmp c => (type_of_condition c, Tint) + + | Oselect => (Tint :: Tint :: Tint :: nil, Tint) + | Oselectl => (Tlong :: Tlong :: Tlong :: nil, Tlong) end. Definition type_of_addressing (addr: addressing) : list typ := @@ -799,6 +840,10 @@ Proof with (try exact I; try reflexivity; auto using Val.Vptr_has_type). - destruct v0; simpl in H0; inv H0... (* cmp *) - destruct (eval_condition cond vl m)... destruct b... + (* select *) + - destruct v0; destruct v1; destruct v2; simpl in *; try discriminate; trivial. + (* selectl *) + - destruct v0; destruct v1; destruct v2; simpl in *; try discriminate; trivial. Qed. End SOUNDNESS. @@ -1324,6 +1369,14 @@ Proof. exploit eval_condition_inj; eauto. intros EQ; rewrite EQ. destruct b; simpl; constructor. simpl; constructor. + (* select *) + - inv H3; simpl; try constructor. + inv H4; simpl; try constructor. + inv H2; simpl; constructor. + (* selectl *) + - inv H3; simpl; try constructor. + inv H4; simpl; try constructor. + inv H2; simpl; constructor. Qed. Lemma eval_addressing_inj: diff --git a/mppa_k1c/PostpassSchedulingOracle.ml b/mppa_k1c/PostpassSchedulingOracle.ml index 56b00c7e..9e6e819c 100644 --- a/mppa_k1c/PostpassSchedulingOracle.ml +++ b/mppa_k1c/PostpassSchedulingOracle.ml @@ -128,6 +128,7 @@ let arith_rri64_str = function let arith_arrr_str = function | Pmaddw -> "Pmaddw" | Pmaddl -> "Pmaddl" + | Pcmove _ -> "Pcmove" let arith_ri32_str = "Pmake" @@ -415,7 +416,7 @@ type real_instruction = | Addw | Andw | Compw | Mulw | Orw | Sbfw | Sraw | Srlw | Sllw | Rorw | Xorw | Addd | Andd | Compd | Muld | Ord | Sbfd | Srad | Srld | Slld | Xord | Nandw | Norw | Nxorw | Nandd | Nord | Nxord | Andnw | Ornw | Andnd | Ornd - | Maddw | Maddd + | Maddw | Maddd | Cmoved | Make | Nop | Sxwd | Zxwd (* LSU *) | Lbs | Lbz | Lhs | Lhz | Lws | Ld @@ -482,7 +483,8 @@ let ab_inst_to_real = function | "Pfixedudrzz" -> Fixedudz | "Pfixeddrzz_i32" -> Fixeddz | "Pfixedudrzz_i32" -> Fixedudz - + | "Pcmove" -> Cmoved + | "Plb" -> Lbs | "Plbu" -> Lbz | "Plh" -> Lhs @@ -531,7 +533,7 @@ let rec_to_usage r = | Some U27L5 | Some U27L10 -> alu_tiny_x | _ -> raise InvalidEncoding) | Addd | Andd | Nandd | Ord | Nord | Sbfd | Xord - | Nxord | Andnd | Ornd -> + | Nxord | Andnd | Ornd | Cmoved -> (match encoding with None | Some U6 | Some S10 -> alu_tiny | Some U27L5 | Some U27L10 -> alu_tiny_x | Some E27U27L10 -> alu_tiny_y) @@ -584,7 +586,7 @@ let real_inst_to_latency = function | Rorw | Nandw | Norw | Nxorw | Ornw | Andnw | Nandd | Nord | Nxord | Ornd | Andnd | Addd | Andd | Compd | Ord | Sbfd | Srad | Srld | Slld | Xord | Make - | Sxwd | Zxwd | Fcompw | Fcompd + | Sxwd | Zxwd | Fcompw | Fcompd | Cmoved -> 1 | Floatwz | Floatuwz | Fixeduwz | Fixedwz | Floatdz | Floatudz | Fixeddz | Fixedudz -> 4 | Mulw | Muld | Maddw | Maddd -> 2 (* FIXME - WORST CASE. If it's S10 then it's only 1 *) diff --git a/mppa_k1c/SelectLong.vp b/mppa_k1c/SelectLong.vp index 0c3618d7..6c34de19 100644 --- a/mppa_k1c/SelectLong.vp +++ b/mppa_k1c/SelectLong.vp @@ -258,9 +258,17 @@ Nondetfunction andl (e1: expr) (e2: expr) := | Eop (Olongconst n1) Enil, t2 => andlimm n1 t2 | t1, Eop (Olongconst n2) Enil => andlimm n2 t1 | (Eop Onotl (t1:::Enil)), t2 => Eop Oandnl (t1:::t2:::Enil) - | t1, (Eop Onotl (t2:::Enil)) => Eop Oandnl (t2:::t1:::Enil) + | t1, (Eop Onotl (t2:::Enil)) => Eop Oandnl (t2:::t1:::Enil) | _, _ => Eop Oandl (e1:::e2:::Enil) end. +(* + | (Eop Ocast32signed + ((Eop Oneg ((Eop (Ocmp (Ccomplimm Cne zero1)) + (y1:::Enil)):::Enil)):::Enil)), v1 => + if Int64.eq zero1 Int64.zero + then Eop Oselectl ((Eop (Olongconst Int64.zero) Enil):::v1:::y1:::Enil) + else Eop Oandl (e1:::e2:::Enil) +*) Nondetfunction orlimm (n1: int64) (e2: expr) := if Int64.eq n1 Int64.zero then e2 else @@ -278,8 +286,20 @@ Nondetfunction orl (e1: expr) (e2: expr) := | t1, Eop (Olongconst n2) Enil => orlimm n2 t1 | (Eop Onotl (t1:::Enil)), t2 => Eop Oornl (t1:::t2:::Enil) | t1, (Eop Onotl (t2:::Enil)) => Eop Oornl (t2:::t1:::Enil) + | (Eop Oandl ((Eop Ocast32signed + ((Eop Oneg ((Eop (Ocmp (Ccomplimm Ceq zero0)) + (y0:::Enil)):::Enil)):::Enil)):::v0:::Enil)), + (Eop Oandl ((Eop Ocast32signed + ((Eop Oneg ((Eop (Ocmp (Ccomplimm Cne zero1)) + (y1:::Enil)):::Enil)):::Enil)):::v1:::Enil)) => + if same_expr_pure y0 y1 + && Int64.eq zero0 Int64.zero + && Int64.eq zero1 Int64.zero + then Eop Oselectl (v0:::v1:::y0:::Enil) + else Eop Oorl (e1:::e2:::Enil) | _, _ => Eop Oorl (e1:::e2:::Enil) end. + Nondetfunction xorlimm (n1: int64) (e2: expr) := if Int64.eq n1 Int64.zero then e2 else diff --git a/mppa_k1c/SelectLongproof.v b/mppa_k1c/SelectLongproof.v index 79187338..dd4cfa69 100644 --- a/mppa_k1c/SelectLongproof.v +++ b/mppa_k1c/SelectLongproof.v @@ -390,6 +390,15 @@ Proof. - TrivialExists. Qed. +Lemma int64_eq_commut: forall x y : int64, + (Int64.eq x y) = (Int64.eq y x). +Proof. + intros. + predSpec Int64.eq Int64.eq_spec x y; + predSpec Int64.eq Int64.eq_spec y x; + congruence. +Qed. + Theorem eval_andl: binary_constructor_sound andl Val.andl. Proof. unfold andl; destruct Archi.splitlong. apply SplitLongproof.eval_andl. @@ -398,6 +407,25 @@ Proof. - InvEval. apply eval_andlimm; auto. - (*andn*) InvEval. TrivialExists. simpl. congruence. - (*andn reverse*) InvEval. rewrite Val.andl_commut. TrivialExists; simpl. congruence. + (* +- (* selectl *) + InvEval. + predSpec Int64.eq Int64.eq_spec zero1 Int64.zero; simpl; TrivialExists. + + constructor. econstructor; constructor. + constructor; try constructor; try constructor; try eassumption. + + simpl in *. f_equal. inv H6. + unfold selectl. + simpl. + destruct v3; simpl; trivial. + rewrite int64_eq_commut. + destruct (Int64.eq i Int64.zero); simpl. + * replace (Int64.repr (Int.signed (Int.neg Int.zero))) with Int64.zero by Int64.bit_solve. + destruct y; simpl; trivial. + * replace (Int64.repr (Int.signed (Int.neg Int.one))) with Int64.mone by Int64.bit_solve. + destruct y; simpl; trivial. + rewrite Int64.and_commut. rewrite Int64.and_mone. reflexivity. + + constructor. econstructor. constructor. econstructor. constructor. econstructor. constructor. eassumption. constructor. simpl. f_equal. constructor. simpl. f_equal. constructor. simpl. f_equal. constructor. eassumption. constructor. + + simpl in *. congruence. *) - TrivialExists. Qed. @@ -414,6 +442,7 @@ Proof. - TrivialExists. Qed. + Theorem eval_orl: binary_constructor_sound orl Val.orl. Proof. unfold orl; destruct Archi.splitlong. apply SplitLongproof.eval_orl. @@ -423,6 +452,80 @@ Proof. - InvEval. apply eval_orlimm; auto. - (*orn*) InvEval. TrivialExists; simpl; congruence. - (*orn reversed*) InvEval. rewrite Val.orl_commut. TrivialExists; simpl; congruence. + - (* selectl *) + destruct (same_expr_pure y0 y1) eqn:PURE; simpl; try TrivialExists. + predSpec Int64.eq Int64.eq_spec zero0 Int64.zero; simpl; try TrivialExists. + predSpec Int64.eq Int64.eq_spec zero1 Int64.zero; simpl; [ | TrivialExists]. + inv H. + inv H0. + inv H6. + inv H3. + inv H2. + inv H7. + inv H4. + inv H3. + inv H6. + inv H4. + inv H3. + inv H14. + inv H13. + inv H6. + inv H4. + inv H13. + inv H14. + inv H9. + inv H11. + inv H13. + inv H3. + inv H6. + inv H7. + inv H3. + inv H14. + inv H17. + simpl in *. + inv H8. + inv H5. + inv H10. + inv H12. + inv H15. + inv H16. + inv H11. + inv H13. + unfold same_expr_pure in PURE. + destruct y0; try congruence. + destruct y1; try congruence. + destruct (ident_eq i i0); try congruence; clear PURE. + rewrite <- e0 in *; clear e0. + inv H6. + inv H7. + rename v10 into vtest. + replace v11 with vtest in * by congruence. + TrivialExists. + simpl. + f_equal. + unfold selectl. + destruct vtest; simpl; trivial. + rewrite Val.andl_commut. + destruct v4; simpl; trivial. + rewrite Val.andl_commut. + rewrite Val.orl_commut. + destruct v9; simpl; trivial. + rewrite int64_eq_commut. + destruct (Int64.eq i1 Int64.zero); simpl. + + + replace (Int64.repr (Int.signed (Int.neg Int.one))) with Int64.mone by Int64.bit_solve. + replace (Int64.repr (Int.signed (Int.neg Int.zero))) with Int64.zero by Int64.bit_solve. + rewrite Int64.and_mone. + rewrite Int64.and_zero. + rewrite Int64.or_commut. + rewrite Int64.or_zero. + reflexivity. + + replace (Int64.repr (Int.signed (Int.neg Int.one))) with Int64.mone by Int64.bit_solve. + replace (Int64.repr (Int.signed (Int.neg Int.zero))) with Int64.zero by Int64.bit_solve. + rewrite Int64.and_mone. + rewrite Int64.and_zero. + rewrite Int64.or_zero. + reflexivity. - TrivialExists. Qed. diff --git a/mppa_k1c/SelectOp.vp b/mppa_k1c/SelectOp.vp index f6605c11..13650a2c 100644 --- a/mppa_k1c/SelectOp.vp +++ b/mppa_k1c/SelectOp.vp @@ -275,7 +275,25 @@ Nondetfunction or (e1: expr) (e2: expr) := then Eop (Ororimm n2) (t1:::Enil) else Eop Oor (e1:::e2:::Enil) | (Eop Onot (t1:::Enil)), t2 => Eop Oorn (t1:::t2:::Enil) - | t1, (Eop Onot (t2:::Enil)) => Eop Oorn (t2:::t1:::Enil) + | t1, (Eop Onot (t2:::Enil)) => Eop Oorn (t2:::t1:::Enil) + | (Eop Oand ((Eop Oneg ((Eop (Ocmp (Ccompimm Ceq zero0)) + (y0:::Enil)):::Enil)):::v0:::Enil)), + (Eop Oand ((Eop Oneg ((Eop (Ocmp (Ccompimm Cne zero1)) + (y1:::Enil)):::Enil)):::v1:::Enil)) => + if same_expr_pure y0 y1 + && Int.eq zero0 Int.zero + && Int.eq zero1 Int.zero + then Eop Oselect (v0:::v1:::y0:::Enil) + else Eop Oor (e1:::e2:::Enil) + | (Eop Oand ((Eop Oneg ((Eop (Ocmp (Ccompuimm Ceq zero0)) + (y0:::Enil)):::Enil)):::v0:::Enil)), + (Eop Oand ((Eop Oneg ((Eop (Ocmp (Ccompuimm Cne zero1)) + (y1:::Enil)):::Enil)):::v1:::Enil)) => + if same_expr_pure y0 y1 + && Int.eq zero0 Int.zero + && Int.eq zero1 Int.zero + then Eop Oselect (v0:::v1:::y0:::Enil) + else Eop Oor (e1:::e2:::Enil) | _, _ => Eop Oor (e1:::e2:::Enil) end. diff --git a/mppa_k1c/SelectOpproof.v b/mppa_k1c/SelectOpproof.v index 89af39ee..d35c4b6d 100644 --- a/mppa_k1c/SelectOpproof.v +++ b/mppa_k1c/SelectOpproof.v @@ -92,7 +92,7 @@ Let ge := Genv.globalenv prog. Variable sp: val. Variable e: env. Variable m: mem. - + (* Helper lemmas - from SplitLongproof.v *) Ltac UseHelper := decompose [Logic.and] arith_helpers_correct; eauto. @@ -162,7 +162,7 @@ Definition binary_constructor_sound (cstr: expr -> expr -> expr) (sem: val -> va eval_expr ge sp e m le a x -> eval_expr ge sp e m le b y -> exists v, eval_expr ge sp e m le (cstr a b) v /\ Val.lessdef (sem x y) v. - + Theorem eval_addrsymbol: forall le id ofs, exists v, eval_expr ge sp e m le (addrsymbol id ofs) v /\ Val.lessdef (Genv.symbol_address ge id ofs) v. @@ -526,6 +526,15 @@ Proof. discriminate. Qed. +Lemma int_eq_commut: forall x y : int, + (Int.eq x y) = (Int.eq y x). +Proof. + intros. + predSpec Int.eq Int.eq_spec x y; + predSpec Int.eq Int.eq_spec y x; + congruence. +Qed. + Theorem eval_or: binary_constructor_sound or Val.or. Proof. unfold or; red; intros. @@ -553,6 +562,82 @@ Proof. exists (Val.ror v1 (Vint n2)); split. EvalOp. rewrite Val.or_commut. apply ROR; auto. - (*orn*) TrivialExists; simpl; congruence. - (*orn reversed*) rewrite Val.or_commut. TrivialExists; simpl; congruence. + - (* select *) + destruct (same_expr_pure y0 y1) eqn:PURE; simpl; try exact DEFAULT. + predSpec Int.eq Int.eq_spec zero0 Int.zero; simpl; try exact DEFAULT. + predSpec Int.eq Int.eq_spec zero1 Int.zero; simpl; try exact DEFAULT. + TrivialExists. + simpl in *. + unfold select. + f_equal. + inv H6. + inv H7. + inv H9. + inv H11. + unfold same_expr_pure in PURE. + destruct y0; try congruence. + destruct y1; try congruence. + destruct (ident_eq i i0); try congruence. + rewrite <- e0 in *. clear e0. clear PURE. + inv H2. inv H5. + replace v8 with v4 in * by congruence. + rename v4 into vselect. + destruct vselect; simpl; trivial. + rewrite (Val.and_commut _ v5). + destruct v5; simpl; trivial. + rewrite (Val.and_commut _ v9). + rewrite Val.or_commut. + destruct v9; simpl; trivial. + rewrite int_eq_commut. + destruct (Int.eq i1 Int.zero); simpl. + + rewrite Int.and_zero. + rewrite Int.or_commut. + rewrite Int.or_zero. + rewrite Int.and_mone. + reflexivity. + + rewrite Int.and_mone. + rewrite Int.neg_zero. + rewrite Int.and_zero. + rewrite Int.or_zero. + reflexivity. + - (* select unsigned *) + destruct (same_expr_pure y0 y1) eqn:PURE; simpl; try exact DEFAULT. + predSpec Int.eq Int.eq_spec zero0 Int.zero; simpl; try exact DEFAULT. + predSpec Int.eq Int.eq_spec zero1 Int.zero; simpl; try exact DEFAULT. + TrivialExists. + simpl in *. + unfold select. + f_equal. + inv H6. + inv H7. + inv H9. + inv H11. + unfold same_expr_pure in PURE. + destruct y0; try congruence. + destruct y1; try congruence. + destruct (ident_eq i i0); try congruence. + rewrite <- e0 in *. clear e0. clear PURE. + inv H2. inv H5. + replace v8 with v4 in * by congruence. + rename v4 into vselect. + destruct vselect; simpl; trivial. + rewrite (Val.and_commut _ v5). + destruct v5; simpl; trivial. + rewrite (Val.and_commut _ v9). + rewrite Val.or_commut. + destruct v9; simpl; trivial. + rewrite int_eq_commut. + destruct (Int.eq i1 Int.zero); simpl. + + rewrite Int.and_zero. + rewrite Int.or_commut. + rewrite Int.or_zero. + rewrite Int.and_mone. + reflexivity. + + rewrite Int.and_mone. + rewrite Int.neg_zero. + rewrite Int.and_zero. + rewrite Int.or_zero. + reflexivity. - apply DEFAULT. Qed. diff --git a/mppa_k1c/TargetPrinter.ml b/mppa_k1c/TargetPrinter.ml index 41a6622a..4be94390 100644 --- a/mppa_k1c/TargetPrinter.ml +++ b/mppa_k1c/TargetPrinter.ml @@ -504,6 +504,10 @@ module Target (*: TARGET*) = | Pmaddil (rd, rs, imm) -> fprintf oc " maddd %a = %a, %a\n" ireg rd ireg rs coqint64 imm + | Pcmove (bt, rd, rcond, rs) -> + fprintf oc " cmoved.%a %a? %a = %a\n" + bcond bt ireg rcond ireg rd ireg rs + let get_section_names name = let (text, lit) = match C2C.atom_sections name with diff --git a/mppa_k1c/ValueAOp.v b/mppa_k1c/ValueAOp.v index fb1977ea..a3843301 100644 --- a/mppa_k1c/ValueAOp.v +++ b/mppa_k1c/ValueAOp.v @@ -41,6 +41,24 @@ Definition eval_static_addressing (addr: addressing) (vl: list aval): aval := | _, _ => Vbot end. +Definition select (v0 v1 vselect : aval) : aval := + match vselect with + | I iselect => + if Int.eq Int.zero iselect + then binop_int (fun x0 x1 => x0) v0 v1 + else binop_int (fun x0 x1 => x1) v0 v1 + | _ => Vtop + end. + +Definition selectl (v0 v1 vselect : aval) : aval := + match vselect with + | L iselect => + if Int64.eq Int64.zero iselect + then binop_long (fun x0 x1 => x0) v0 v1 + else binop_long (fun x0 x1 => x1) v0 v1 + | _ => Vtop + end. + Definition eval_static_operation (op: operation) (vl: list aval): aval := match op, vl with | Omove, v1::nil => v1 @@ -165,6 +183,8 @@ Definition eval_static_operation (op: operation) (vl: list aval): aval := | Osingleoflong, v1::nil => singleoflong v1 | Osingleoflongu, v1::nil => singleoflongu v1 | Ocmp c, _ => of_optbool (eval_static_condition c vl) + | Oselect, v0::v1::vselect::nil => select v0 v1 vselect + | Oselectl, v0::v1::vselect::nil => selectl v0 v1 vselect | _, _ => Vbot end. @@ -241,6 +261,20 @@ Proof. destruct (propagate_float_constants tt); constructor. rewrite Ptrofs.add_zero_l; eauto with va. apply of_optbool_sound. eapply eval_static_condition_sound; eauto. + (* select *) + - inv H2; simpl; try constructor. + + destruct (Int.eq _ _); apply binop_int_sound; trivial. + + destruct (Int.eq _ _); + destruct a1; destruct a0; eauto; constructor. + + destruct (Int.eq _ _); + destruct a1; destruct a0; eauto; constructor. + + destruct (Int.eq _ _); + destruct a1; destruct a0; eauto; constructor. + (* selectl *) + - inv H2; simpl; try constructor. + + destruct (Int64.eq _ _); apply binop_long_sound; trivial. + + destruct (Int64.eq _ _); + destruct a1; destruct a0; eauto; constructor. Qed. End SOUNDNESS. diff --git a/test/monniaux/bitsliced-aes/bs.c b/test/monniaux/bitsliced-aes/bs.c index 4a9df4aa..063f36f5 100644 --- a/test/monniaux/bitsliced-aes/bs.c +++ b/test/monniaux/bitsliced-aes/bs.c @@ -2,6 +2,21 @@ #include <string.h> #include "bs.h" + +static inline long compcert_ternary_signedl(long x, long v0, long v1) { + return ((-(x==0)) & v0) | ((-(x!=0)) & v1); +} + +static inline word_t compcert_ternary(word_t x, word_t v0, word_t v1) { + return compcert_ternary_signedl(x, v0, v1); +} + +#if defined(__K1C__) +#define TERNARY(x, v0, v1) compcert_ternary((x), (v0), (v1)) +#else +#define TERNARY(x, v0, v1) ((x) ? (v1) : (v0)) +#endif + #if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) ||\ defined(__amd64__) || defined(__amd32__)|| defined(__amd16__) #define bs2le(x) (x) @@ -14,12 +29,6 @@ #error "endianness not supported" #endif -#if 1 -#define TERNARY_XY0(t, x) ((-((t) != 0)) & (x)) -#else -#define TERNARY_XY0(t, x) (((t) != 0) ? (x) : (0)) -#endif - void bs_addroundkey(word_t * B, word_t * rk) { int i; @@ -393,15 +402,23 @@ void bs_transpose_dst(word_t * transpose, word_t * blocks) int offset = i << MUL_SHIFT; #ifndef UNROLL_TRANSPOSE - /* DM experiments */ - /* The normal ternary operator costs us a lot! - from 10145951 to 7995063 */ - int j; + int j; +#ifdef __COMPCERT__ + word_t *transptr = transpose+offset; + word_t bitmask = ONE; + for(j=0; j < WORD_SIZE; j++) + { + word_t old = *transptr; + *(transptr++) = TERNARY(w & bitmask, old, old|bitpos); + bitmask <<= 1; + } +#else for(j=0; j < WORD_SIZE; j++) { // TODO make const time - transpose[offset + j] |= TERNARY_XY0(w & (ONE << j), bitpos); + transpose[offset + j] |= (w & (ONE << j)) ? bitpos : 0; } +#endif #else transpose[(offset)+ 0 ] |= (w & (ONE << 0 )) ? (bitpos) : 0; @@ -494,11 +511,23 @@ void bs_transpose_rev(word_t * blocks) word_t offset = k / WORD_SIZE; #ifndef UNROLL_TRANSPOSE int j; +#ifdef __COMPCERT__ + word_t *transptr = transpose + offset; + word_t bitmask = ONE; for(j=0; j < WORD_SIZE; j++) { - word_t bit = TERNARY_XY0((w & (ONE << j)), (ONE << (k % WORD_SIZE))); + word_t old = *transptr; + *transptr = TERNARY(w & bitmask, old, old | bitpos); + transptr += WORDS_PER_BLOCK; + bitmask <<= 1; + } +#else + for(j=0; j < WORD_SIZE; j++) + { + word_t bit = (w & (ONE << j)) ? (ONE << (k % WORD_SIZE)) : 0; transpose[j * WORDS_PER_BLOCK + (offset)] |= bit; } +#endif #else transpose[0 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 0 )) ? bitpos : 0; transpose[1 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 1 )) ? bitpos : 0; diff --git a/test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized b/test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized new file mode 100644 index 00000000..d939f856 --- /dev/null +++ b/test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized @@ -0,0 +1,3268 @@ +# File generated by CompCert 3.5 +# Command line: -O3 -Wall -Wno-c11-extensions -fno-unprototyped -S bs.c -o bs.ccomp.k1c.s + .text + .balign 2 + .globl bs_addroundkey +bs_addroundkey: + addd $r17 = $r12, 0 + addd $r12 = $r12, -16 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + make $r5, 0 +;; +.L100: + sxwd $r6 = $r5 + addw $r5 = $r5, 1 + make $r32, 128 +;; + slld $r2 = $r6, 3 + compw.lt $r32 = $r5, $r32 +;; + addd $r3 = $r0, $r2 + addd $r4 = $r1, $r2 +;; + ld $r7 = 0[$r3] +;; + ld $r9 = 0[$r4] +;; + xord $r6 = $r7, $r9 +;; + sd 0[$r3] = $r6 +;; + cb.wnez $r32? .L100 +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 16 +;; + ret +;; + .type bs_addroundkey, @function + .size bs_addroundkey, . - bs_addroundkey + .text + .balign 2 + .globl bs_apply_sbox +bs_apply_sbox: + addd $r17 = $r12, 0 + addd $r12 = $r12, -32 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 +;; + sd 24[$r12] = $r19 + make $r19, 0 +;; +.L101: + sxwd $r1 = $r19 +;; + slld $r0 = $r1, 3 +;; + addd $r0 = $r18, $r0 + call bs_sbox +;; + addw $r19 = $r19, 8 + make $r32, 128 +;; + compw.lt $r32 = $r19, $r32 +;; + cb.wnez $r32? .L101 +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 32 +;; + ret +;; + .type bs_apply_sbox, @function + .size bs_apply_sbox, . - bs_apply_sbox + .text + .balign 2 + .globl bs_apply_sbox_rev +bs_apply_sbox_rev: + addd $r17 = $r12, 0 + addd $r12 = $r12, -32 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 +;; + sd 24[$r12] = $r19 + make $r19, 0 +;; +.L102: + sxwd $r1 = $r19 +;; + slld $r0 = $r1, 3 +;; + addd $r0 = $r18, $r0 + call bs_sbox_rev +;; + addw $r19 = $r19, 8 + make $r32, 128 +;; + compw.lt $r32 = $r19, $r32 +;; + cb.wnez $r32? .L102 +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 32 +;; + ret +;; + .type bs_apply_sbox_rev, @function + .size bs_apply_sbox_rev, . - bs_apply_sbox_rev + .text + .balign 2 + .globl bs_sbox_rev +bs_sbox_rev: + addd $r17 = $r12, 0 + addd $r12 = $r12, -96 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 +;; + sd 24[$r12] = $r19 +;; + ld $r7 = 48[$r0] +;; + ld $r3 = 56[$r0] +;; + ld $r4 = 32[$r0] + nxord $r40 = $r3, $r7 +;; + xord $r10 = $r3, $r4 + nxord $r11 = $r7, $r4 + ld $r2 = 8[$r0] +;; + ld $r5 = 24[$r0] + nxord $r45 = $r7, $r10 + xord $r59 = $r7, $r2 +;; + xord $r41 = $r4, $r5 + ld $r1 = 0[$r0] + xord $r60 = $r5, $r45 + andd $r33 = $r10, $r45 +;; + nxord $r35 = $r5, $r1 + xord $r63 = $r2, $r1 + nxord $r39 = $r1, $r41 + ld $r6 = 40[$r0] +;; + xord $r46 = $r11, $r63 + xord $r54 = $r40, $r35 + ld $r7 = 16[$r0] + nxord $r57 = $r6, $r5 +;; + xord $r52 = $r40, $r63 + xord $r50 = $r41, $r63 + nxord $r47 = $r6, $r7 + nxord $r38 = $r7, $r2 +;; + nxord $r58 = $r6, $r46 + xord $r19 = $r11, $r47 + xord $r63 = $r59, $r57 + xord $r7 = $r41, $r38 +;; + xord $r44 = $r35, $r59 + xord $r18 = $r3, $r47 + xord $r3 = $r54, $r7 + xord $r55 = $r54, $r38 +;; + nxord $r34 = $r6, $r41 + xord $r2 = $r50, $r63 + andd $r57 = $r52, $r19 + andd $r17 = $r50, $r63 +;; + xord $r36 = $r55, $r57 + andd $r62 = $r46, $r18 + andd $r53 = $r11, $r39 + xord $r6 = $r2, $r17 +;; + andd $r42 = $r44, $r58 + andd $r15 = $r41, $r3 + andd $r2 = $r60, $r7 + andd $r37 = $r40, $r54 +;; + xord $r59 = $r62, $r57 + xord $r51 = $r42, $r17 + xord $r8 = $r2, $r15 + xord $r4 = $r37, $r15 +;; + xord $r5 = $r36, $r33 + xord $r38 = $r59, $r35 + xord $r48 = $r6, $r53 + xord $r47 = $r51, $r4 +;; + xord $r53 = $r5, $r8 + xord $r43 = $r38, $r4 + xord $r56 = $r48, $r8 + xord $r57 = $r47, $r34 +;; + xord $r49 = $r56, $r57 + andd $r48 = $r56, $r53 + xord $r47 = $r53, $r43 + andd $r9 = $r53, $r57 +;; + xord $r36 = $r43, $r48 + xord $r35 = $r57, $r48 + andd $r62 = $r47, $r9 + xord $r17 = $r47, $r48 +;; + andd $r15 = $r35, $r47 + andd $r42 = $r36, $r49 + andd $r47 = $r43, $r56 + xord $r59 = $r49, $r48 +;; + andd $r37 = $r49, $r47 + xord $r5 = $r43, $r15 + xord $r4 = $r62, $r17 + xord $r55 = $r57, $r42 +;; + xord $r1 = $r37, $r59 + xord $r2 = $r5, $r55 + xord $r47 = $r5, $r4 + andd $r35 = $r4, $r39 +;; + xord $r61 = $r4, $r1 + xord $r33 = $r55, $r1 + andd $r62 = $r1, $r45 + andd $r45 = $r55, $r18 +;; + xord $r48 = $r2, $r61 + andd $r49 = $r2, $r3 + andd $r6 = $r1, $r10 + andd $r3 = $r47, $r50 +;; + andd $r56 = $r47, $r63 + andd $r42 = $r5, $r58 + andd $r1 = $r4, $r11 + andd $r57 = $r2, $r41 +;; + andd $r9 = $r61, $r54 + andd $r51 = $r33, $r52 + andd $r58 = $r55, $r46 + andd $r53 = $r5, $r44 +;; + andd $r41 = $r48, $r60 + andd $r10 = $r61, $r40 + xord $r59 = $r49, $r57 + xord $r61 = $r3, $r1 +;; + andd $r34 = $r33, $r19 + andd $r39 = $r48, $r7 + xord $r55 = $r9, $r41 + xord $r60 = $r45, $r6 +;; + xord $r48 = $r62, $r35 + xord $r15 = $r56, $r53 + xord $r44 = $r59, $r61 + xord $r49 = $r51, $r10 +;; + xord $r54 = $r34, $r42 + xord $r51 = $r58, $r60 + xord $r59 = $r59, $r48 + xord $r8 = $r56, $r42 +;; + xord $r47 = $r9, $r1 + xord $r11 = $r60, $r15 + xord $r40 = $r55, $r44 + xord $r60 = $r15, $r51 +;; + xord $r52 = $r56, $r41 + xord $r56 = $r10, $r54 + xord $r2 = $r49, $r59 + xord $r5 = $r59, $r60 +;; + xord $r7 = $r3, $r55 + xord $r61 = $r51, $r56 + xord $r59 = $r47, $r11 + xord $r47 = $r8, $r40 +;; + xord $r63 = $r35, $r39 + xord $r4 = $r34, $r45 + sd 88[$r12] = $r47 + xord $r51 = $r2, $r59 +;; + xord $r10 = $r55, $r48 + xord $r50 = $r44, $r63 + sd 80[$r12] = $r51 + xord $r37 = $r7, $r5 +;; + xord $r53 = $r54, $r44 + sd 72[$r12] = $r37 + xord $r37 = $r4, $r40 + xord $r40 = $r50, $r61 +;; + xord $r1 = $r58, $r57 + sd 64[$r12] = $r37 + xord $r46 = $r10, $r53 + xord $r7 = $r52, $r50 +;; + sd 56[$r12] = $r40 + xord $r49 = $r49, $r1 + addd $r1 = $r12, 32 + make $r2, 64 +;; + sd 48[$r12] = $r46 +;; + sd 40[$r12] = $r7 +;; + sd 32[$r12] = $r49 + call memmove +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 96 +;; + ret +;; + .type bs_sbox_rev, @function + .size bs_sbox_rev, . - bs_sbox_rev + .text + .balign 2 + .globl bs_sbox +bs_sbox: + addd $r17 = $r12, 0 + addd $r12 = $r12, -80 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + ld $r5 = 56[$r0] +;; + ld $r6 = 32[$r0] +;; + xord $r41 = $r5, $r6 + ld $r2 = 16[$r0] +;; + xord $r42 = $r5, $r2 + ld $r4 = 8[$r0] + xord $r49 = $r6, $r2 +;; + xord $r48 = $r5, $r4 + ld $r55 = 24[$r0] +;; + xord $r9 = $r55, $r4 + ld $r3 = 48[$r0] +;; + xord $r5 = $r41, $r9 + ld $r7 = 40[$r0] + xord $r34 = $r3, $r2 +;; + xord $r10 = $r3, $r7 + ld $r1 = 0[$r0] + xord $r11 = $r7, $r2 + xord $r3 = $r48, $r49 +;; + xord $r33 = $r9, $r34 + xord $r8 = $r9, $r11 + xord $r44 = $r6, $r1 + xord $r47 = $r4, $r1 +;; + xord $r59 = $r1, $r10 + xord $r7 = $r5, $r34 + xord $r9 = $r10, $r44 + xord $r4 = $r10, $r47 +;; + xord $r61 = $r1, $r5 + xord $r57 = $r5, $r10 + andd $r50 = $r3, $r5 + andd $r43 = $r9, $r1 +;; + xord $r34 = $r59, $r8 + xord $r6 = $r41, $r9 + xord $r35 = $r42, $r4 + xord $r36 = $r48, $r8 +;; + xord $r38 = $r41, $r11 + xord $r40 = $r7, $r50 + xord $r11 = $r43, $r50 + andd $r50 = $r48, $r8 +;; + xord $r15 = $r42, $r57 + andd $r62 = $r35, $r61 + andd $r37 = $r4, $r59 + xord $r52 = $r36, $r50 +;; + andd $r53 = $r6, $r34 + andd $r55 = $r41, $r33 + andd $r46 = $r49, $r38 + andd $r54 = $r42, $r57 +;; + xord $r39 = $r53, $r50 + xord $r60 = $r46, $r55 + xord $r55 = $r54, $r55 + xord $r10 = $r40, $r62 +;; + xord $r44 = $r6, $r34 + xord $r43 = $r11, $r15 + xord $r15 = $r52, $r37 + xord $r17 = $r39, $r55 +;; + xord $r45 = $r10, $r60 + xord $r55 = $r43, $r55 + xord $r50 = $r15, $r60 + xord $r46 = $r17, $r44 +;; + xord $r63 = $r50, $r46 + andd $r43 = $r50, $r45 + xord $r56 = $r45, $r55 + andd $r54 = $r45, $r46 +;; + xord $r36 = $r55, $r43 + xord $r47 = $r46, $r43 + andd $r40 = $r56, $r54 + andd $r60 = $r55, $r50 +;; + andd $r2 = $r47, $r56 + andd $r58 = $r36, $r63 + xord $r36 = $r56, $r43 + andd $r15 = $r63, $r60 +;; + xord $r47 = $r63, $r43 + xord $r17 = $r55, $r2 + xord $r50 = $r40, $r36 + xord $r52 = $r46, $r58 +;; + xord $r58 = $r15, $r47 + xord $r51 = $r17, $r52 + xord $r7 = $r17, $r50 + andd $r43 = $r52, $r1 +;; + xord $r53 = $r50, $r58 + xord $r62 = $r52, $r58 + andd $r44 = $r7, $r8 + andd $r8 = $r50, $r59 +;; + xord $r40 = $r51, $r53 + andd $r45 = $r58, $r61 + andd $r54 = $r51, $r33 + andd $r10 = $r58, $r35 +;; + andd $r47 = $r40, $r38 + andd $r46 = $r62, $r3 + andd $r2 = $r51, $r41 + xord $r35 = $r8, $r10 +;; + andd $r5 = $r62, $r5 + andd $r36 = $r17, $r34 + andd $r39 = $r53, $r57 + andd $r56 = $r7, $r48 +;; + andd $r41 = $r40, $r49 + xord $r34 = $r45, $r46 + xord $r51 = $r44, $r2 + xord $r62 = $r54, $r47 +;; + andd $r38 = $r52, $r9 + andd $r37 = $r50, $r4 + andd $r9 = $r17, $r6 + xord $r59 = $r46, $r35 +;; + andd $r61 = $r53, $r42 + xord $r1 = $r2, $r41 + xord $r63 = $r5, $r43 + xord $r33 = $r39, $r56 +;; + xord $r42 = $r41, $r51 + xord $r51 = $r5, $r34 + xord $r52 = $r36, $r37 + xord $r49 = $r59, $r62 +;; + xord $r57 = $r47, $r33 + xord $r3 = $r9, $r63 + xord $r11 = $r54, $r2 + xord $r50 = $r10, $r1 +;; + xord $r37 = $r43, $r36 + xord $r6 = $r56, $r52 + xord $r9 = $r51, $r62 + xord $r40 = $r42, $r49 +;; + xord $r36 = $r61, $r33 + xord $r10 = $r42, $r57 + xord $r56 = $r52, $r57 + xord $r57 = $r3, $r11 +;; + xord $r5 = $r35, $r51 + sd 72[$r12] = $r40 + nxord $r43 = $r50, $r9 + nxord $r17 = $r36, $r57 +;; + xord $r39 = $r8, $r1 + xord $r53 = $r38, $r35 + xord $r8 = $r1, $r35 + sd 64[$r12] = $r43 +;; + xord $r7 = $r34, $r37 + xord $r58 = $r3, $r53 + sd 56[$r12] = $r17 + xord $r38 = $r42, $r5 +;; + xord $r48 = $r6, $r63 + sd 48[$r12] = $r38 + xord $r1 = $r8, $r7 + xord $r43 = $r10, $r58 +;; + sd 40[$r12] = $r1 + nxord $r4 = $r39, $r56 + nxord $r34 = $r42, $r48 + addd $r1 = $r12, 16 +;; + sd 32[$r12] = $r43 + make $r2, 64 +;; + sd 24[$r12] = $r4 +;; + sd 16[$r12] = $r34 + call memmove +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 80 +;; + ret +;; + .type bs_sbox, @function + .size bs_sbox, . - bs_sbox + .text + .balign 2 + .globl bs_transpose +bs_transpose: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1056 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 + addd $r0 = $r12, 24 + make $r1, 0 +;; + make $r2, 1024 + call memset +;; + addd $r0 = $r12, 24 + addd $r1 = $r18, 0 + call bs_transpose_dst +;; + addd $r1 = $r12, 24 + make $r2, 1024 + addd $r0 = $r18, 0 + call memmove +;; + ld $r16 = 8[$r12] +;; + ld $r18 = 16[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1056 +;; + ret +;; + .type bs_transpose, @function + .size bs_transpose, . - bs_transpose + .text + .balign 2 + .globl bs_transpose_dst +bs_transpose_dst: + addd $r17 = $r12, 0 + addd $r12 = $r12, -16 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + make $r4, 0 +;; +.L103: + make $r35, 1 + make $r17, 0 +;; + slld $r41 = $r35, $r4 +;; + addw $r9 = $r41, 0 +;; +.L104: + sllw $r10 = $r4, 1 + sllw $r42 = $r17, 6 + make $r6, 0 +;; + addw $r36 = $r10, $r17 +;; + sxwd $r15 = $r36 +;; + slld $r2 = $r15, 3 +;; + addd $r8 = $r1, $r2 +;; + ld $r11 = 0[$r8] +;; +.L105: + addw $r40 = $r42, $r6 + make $r2, 0 + make $r44, 1 + make $r32, 64 +;; + sxwd $r34 = $r40 + sxwd $r39 = $r9 + slld $r37 = $r44, $r6 + addw $r6 = $r6, 1 +;; + ld.xs $r7 = $r34[$r0] + andd $r33 = $r11, $r37 + compw.lt $r32 = $r6, $r32 +;; + cmoved.dnez $r33? $r2 = $r39 +;; + ord $r38 = $r7, $r2 +;; + sd.xs $r34[$r0] = $r38 + cb.wnez $r32? .L105 +;; + addw $r17 = $r17, 1 + make $r32, 2 +;; + compw.lt $r32 = $r17, $r32 +;; + cb.wnez $r32? .L104 +;; + addw $r4 = $r4, 1 + make $r32, 64 +;; + compw.lt $r32 = $r4, $r32 +;; + cb.wnez $r32? .L103 +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 16 +;; + ret +;; + .type bs_transpose_dst, @function + .size bs_transpose_dst, . - bs_transpose_dst + .text + .balign 2 + .globl bs_transpose_rev +bs_transpose_rev: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1056 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 + addd $r0 = $r12, 24 + make $r1, 0 +;; + make $r2, 1024 + call memset +;; + make $r3, 0 +;; +.L106: + sxwd $r8 = $r3 + sraw $r32 = $r3, 31 + make $r11, 0 +;; + slld $r34 = $r8, 3 + srlw $r32 = $r32, 26 +;; + addd $r6 = $r18, $r34 + addw $r32 = $r3, $r32 +;; + sraw $r2 = $r32, 6 +;; + sxwd $r5 = $r2 +;; + ld $r36 = 0[$r6] +;; +.L107: + make $r39, 1 +;; + slld $r38 = $r39, $r11 +;; + andd $r17 = $r36, $r38 +;; + cb.deqz $r17? .L108 +;; + make $r44, 1 + sraw $r32 = $r3, 31 +;; + srlw $r32 = $r32, 26 +;; + addw $r32 = $r3, $r32 +;; + sraw $r40 = $r32, 6 +;; + sllw $r9 = $r40, 6 +;; + sbfw $r45 = $r9, $r3 +;; + slld $r0 = $r44, $r45 + goto .L109 +;; +.L108: + make $r0, 0 +;; +.L109: + addd $r37 = $r12, 24 + sllw $r46 = $r11, 1 + addw $r11 = $r11, 1 + make $r32, 64 +;; + sxwd $r7 = $r46 + compw.lt $r32 = $r11, $r32 +;; + addd $r4 = $r7, $r5 +;; + slld $r10 = $r4, 3 +;; + addd $r1 = $r37, $r10 +;; + ld $r41 = 0[$r1] +;; + ord $r35 = $r41, $r0 +;; + sd 0[$r1] = $r35 +;; + cb.wnez $r32? .L107 +;; + addw $r3 = $r3, 1 + make $r32, 128 +;; + compw.lt $r32 = $r3, $r32 +;; + cb.wnez $r32? .L106 +;; + addd $r1 = $r12, 24 + make $r2, 1024 + addd $r0 = $r18, 0 + call memmove +;; + ld $r16 = 8[$r12] +;; + ld $r18 = 16[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1056 +;; + ret +;; + .type bs_transpose_rev, @function + .size bs_transpose_rev, . - bs_transpose_rev + .text + .balign 2 + .globl bs_shiftrows +bs_shiftrows: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1040 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + addd $r50 = $r12, 16 + addd $r1 = $r0, 0 + addd $r43 = $r0, 256 + addd $r8 = $r0, 512 +;; + addd $r60 = $r0, 768 + make $r15, 0 + make $r52, 32 + make $r3, 64 +;; + make $r36, 96 + make $r7, 0 +;; +.L110: + ld $r5 = 0[$r1] + addw $r59 = $r52, 40 + addw $r7 = $r7, 1 + make $r32, 4 +;; + sd 0[$r50] = $r5 + andw $r52 = $r59, 127 + addw $r63 = $r36, 40 + compw.lt $r32 = $r7, $r32 +;; + andw $r36 = $r63, 127 + sxwd $r62 = $r52 +;; + slld $r53 = $r62, 3 +;; + ld $r11 = 8[$r1] +;; + sd 8[$r50] = $r11 +;; + ld $r61 = 16[$r1] +;; + sd 16[$r50] = $r61 +;; + ld $r6 = 24[$r1] +;; + sd 24[$r50] = $r6 +;; + ld $r56 = 32[$r1] +;; + sd 32[$r50] = $r56 +;; + ld $r2 = 40[$r1] +;; + sd 40[$r50] = $r2 + addw $r2 = $r3, 40 +;; + andw $r3 = $r2, 127 + sxwd $r2 = $r36 +;; + sxwd $r5 = $r3 + slld $r39 = $r2, 3 +;; + ld $r38 = 48[$r1] + slld $r46 = $r5, 3 +;; + sd 48[$r50] = $r38 +;; + ld $r54 = 56[$r1] +;; + sd 56[$r50] = $r54 +;; + ld $r4 = 0[$r43] +;; + sd 256[$r50] = $r4 +;; + ld $r58 = 8[$r43] +;; + sd 264[$r50] = $r58 +;; + ld $r10 = 16[$r43] +;; + sd 272[$r50] = $r10 +;; + ld $r34 = 24[$r43] +;; + sd 280[$r50] = $r34 +;; + ld $r51 = 32[$r43] +;; + sd 288[$r50] = $r51 +;; + ld $r9 = 40[$r43] +;; + sd 296[$r50] = $r9 +;; + ld $r1 = 48[$r43] +;; + sd 304[$r50] = $r1 +;; + ld $r4 = 56[$r43] + addd $r43 = $r0, $r53 +;; + sd 312[$r50] = $r4 +;; + ld $r41 = 0[$r8] +;; + sd 512[$r50] = $r41 +;; + ld $r9 = 8[$r8] +;; + sd 520[$r50] = $r9 +;; + ld $r6 = 16[$r8] +;; + sd 528[$r50] = $r6 +;; + ld $r9 = 24[$r8] +;; + sd 536[$r50] = $r9 +;; + ld $r42 = 32[$r8] +;; + sd 544[$r50] = $r42 +;; + ld $r35 = 40[$r8] +;; + sd 552[$r50] = $r35 +;; + ld $r10 = 48[$r8] +;; + sd 560[$r50] = $r10 +;; + ld $r57 = 56[$r8] +;; + sd 568[$r50] = $r57 +;; + ld $r17 = 0[$r60] +;; + sd 768[$r50] = $r17 +;; + ld $r8 = 8[$r60] +;; + sd 776[$r50] = $r8 + addw $r8 = $r15, 40 +;; + andw $r15 = $r8, 127 + addd $r8 = $r0, $r46 +;; + sxwd $r37 = $r15 +;; + ld $r48 = 16[$r60] + slld $r40 = $r37, 3 +;; + sd 784[$r50] = $r48 + addd $r1 = $r0, $r40 +;; + ld $r33 = 24[$r60] +;; + sd 792[$r50] = $r33 +;; + ld $r47 = 32[$r60] +;; + sd 800[$r50] = $r47 +;; + ld $r4 = 40[$r60] +;; + sd 808[$r50] = $r4 +;; + ld $r44 = 48[$r60] +;; + sd 816[$r50] = $r44 +;; + ld $r49 = 56[$r60] + addd $r60 = $r0, $r39 +;; + sd 824[$r50] = $r49 + addd $r50 = $r50, 64 + cb.wnez $r32? .L110 +;; + addd $r1 = $r12, 16 + make $r2, 1024 + call memmove +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1040 +;; + ret +;; + .type bs_shiftrows, @function + .size bs_shiftrows, . - bs_shiftrows + .text + .balign 2 + .globl bs_shiftrows_rev +bs_shiftrows_rev: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1040 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + addd $r56 = $r12, 16 + addd $r34 = $r12, 16 + addd $r45 = $r12, 272 + addd $r6 = $r12, 528 +;; + addd $r62 = $r12, 784 + make $r4, 0 + make $r10, 32 + make $r55, 64 +;; + make $r2, 96 + make $r59, 0 +;; +.L111: + ld $r43 = 0[$r0] + addw $r9 = $r4, 40 + addw $r59 = $r59, 1 + make $r32, 4 +;; + sd 0[$r34] = $r43 + andw $r4 = $r9, 127 + addw $r51 = $r10, 40 + compw.lt $r32 = $r59, $r32 +;; + andw $r10 = $r51, 127 + sxwd $r39 = $r4 +;; + slld $r60 = $r39, 3 +;; + ld $r57 = 8[$r0] +;; + sd 8[$r34] = $r57 +;; + ld $r63 = 16[$r0] +;; + sd 16[$r34] = $r63 +;; + ld $r7 = 24[$r0] +;; + sd 24[$r34] = $r7 +;; + ld $r44 = 32[$r0] +;; + sd 32[$r34] = $r44 +;; + ld $r42 = 40[$r0] +;; + sd 40[$r34] = $r42 +;; + ld $r40 = 48[$r0] +;; + sd 48[$r34] = $r40 +;; + ld $r61 = 56[$r0] +;; + sd 56[$r34] = $r61 + addd $r34 = $r56, $r60 +;; + ld $r35 = 256[$r0] +;; + sd 0[$r45] = $r35 +;; + ld $r1 = 264[$r0] +;; + sd 8[$r45] = $r1 + addw $r1 = $r2, 40 +;; + andw $r2 = $r1, 127 +;; + ld $r49 = 272[$r0] +;; + sd 16[$r45] = $r49 +;; + ld $r37 = 280[$r0] +;; + sd 24[$r45] = $r37 +;; + ld $r54 = 288[$r0] +;; + sd 32[$r45] = $r54 +;; + ld $r15 = 296[$r0] +;; + sd 40[$r45] = $r15 +;; + ld $r3 = 304[$r0] +;; + sd 48[$r45] = $r3 +;; + ld $r5 = 312[$r0] +;; + sd 56[$r45] = $r5 + sxwd $r5 = $r2 +;; + slld $r38 = $r5, 3 +;; + ld $r53 = 512[$r0] +;; + sd 0[$r6] = $r53 +;; + ld $r33 = 520[$r0] +;; + sd 8[$r6] = $r33 +;; + ld $r8 = 528[$r0] +;; + sd 16[$r6] = $r8 +;; + ld $r11 = 536[$r0] +;; + sd 24[$r6] = $r11 +;; + ld $r47 = 544[$r0] +;; + sd 32[$r6] = $r47 +;; + ld $r3 = 552[$r0] +;; + sd 40[$r6] = $r3 +;; + ld $r17 = 560[$r0] +;; + sd 48[$r6] = $r17 +;; + ld $r52 = 568[$r0] +;; + sd 56[$r6] = $r52 + sxwd $r6 = $r10 +;; + slld $r1 = $r6, 3 +;; + addd $r45 = $r56, $r1 +;; + ld $r8 = 768[$r0] +;; + sd 0[$r62] = $r8 +;; + ld $r41 = 776[$r0] +;; + sd 8[$r62] = $r41 +;; + ld $r3 = 784[$r0] +;; + sd 16[$r62] = $r3 + addw $r3 = $r55, 40 +;; + andw $r55 = $r3, 127 +;; + sxwd $r7 = $r55 +;; + ld $r36 = 792[$r0] + slld $r58 = $r7, 3 +;; + sd 24[$r62] = $r36 + addd $r6 = $r56, $r58 +;; + ld $r48 = 800[$r0] +;; + sd 32[$r62] = $r48 +;; + ld $r11 = 808[$r0] +;; + sd 40[$r62] = $r11 +;; + ld $r46 = 816[$r0] +;; + sd 48[$r62] = $r46 +;; + ld $r50 = 824[$r0] + addd $r0 = $r0, 64 +;; + sd 56[$r62] = $r50 + addd $r62 = $r56, $r38 + cb.wnez $r32? .L111 +;; + addd $r0 = $r0, -256 + addd $r1 = $r12, 16 + make $r2, 1024 + call memmove +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1040 +;; + ret +;; + .type bs_shiftrows_rev, @function + .size bs_shiftrows_rev, . - bs_shiftrows_rev + .text + .balign 2 + .globl bs_shiftmix +bs_shiftmix: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1088 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r4 = $r0, 256 + addd $r1 = $r0, 512 + addd $r3 = $r0, 768 +;; + sd 24[$r12] = $r19 + addd $r19 = $r12, 64 + make $r18, 0 + addd $r2 = $r0, 0 +;; + sd 32[$r12] = $r20 + make $r20, 64 +;; + sd 40[$r12] = $r21 + make $r21, 96 +;; + sd 48[$r12] = $r22 + make $r22, 32 +;; + sd 56[$r12] = $r23 + make $r23, 0 +;; +.L112: + ld $r46 = 64[$r4] + addw $r23 = $r23, 1 + make $r32, 4 +;; + ld $r8 = 128[$r1] + compw.lt $r32 = $r23, $r32 +;; + ld $r5 = 56[$r2] + xord $r57 = $r46, $r8 +;; + ld $r59 = 120[$r4] +;; + xord $r7 = $r5, $r59 + ld $r17 = 192[$r3] +;; + xord $r5 = $r57, $r17 +;; + xord $r61 = $r5, $r7 +;; + sd 0[$r19] = $r61 +;; + ld $r48 = 0[$r2] +;; + ld $r62 = 64[$r4] +;; + xord $r42 = $r48, $r62 + ld $r60 = 72[$r4] +;; + xord $r5 = $r42, $r60 + ld $r61 = 136[$r1] +;; + xord $r45 = $r5, $r61 + ld $r40 = 200[$r3] +;; + xord $r45 = $r45, $r40 +;; + xord $r5 = $r45, $r7 +;; + sd 8[$r19] = $r5 +;; + ld $r11 = 8[$r2] +;; + ld $r51 = 72[$r4] +;; + xord $r45 = $r11, $r51 + ld $r40 = 80[$r4] +;; + xord $r37 = $r45, $r40 + ld $r39 = 144[$r1] +;; + xord $r6 = $r37, $r39 + ld $r42 = 208[$r3] +;; + xord $r59 = $r6, $r42 +;; + sd 16[$r19] = $r59 +;; + ld $r6 = 16[$r2] +;; + ld $r44 = 80[$r4] +;; + xord $r43 = $r6, $r44 + ld $r9 = 88[$r4] +;; + xord $r52 = $r43, $r9 + ld $r46 = 152[$r1] +;; + xord $r42 = $r52, $r46 + ld $r48 = 216[$r3] +;; + xord $r5 = $r42, $r48 +;; + xord $r55 = $r5, $r7 +;; + sd 24[$r19] = $r55 +;; + ld $r34 = 24[$r2] +;; + ld $r8 = 88[$r4] +;; + xord $r62 = $r34, $r8 + ld $r47 = 96[$r4] +;; + xord $r38 = $r62, $r47 + ld $r50 = 160[$r1] +;; + xord $r34 = $r38, $r50 + ld $r56 = 224[$r3] +;; + xord $r8 = $r34, $r56 +;; + xord $r11 = $r8, $r7 +;; + sd 32[$r19] = $r11 +;; + ld $r5 = 96[$r4] +;; + ld $r53 = 32[$r2] +;; + xord $r44 = $r53, $r5 + ld $r54 = 168[$r1] +;; + ld $r5 = 104[$r4] +;; + xord $r40 = $r44, $r5 +;; + xord $r10 = $r40, $r54 +;; + ld $r5 = 232[$r3] +;; + xord $r39 = $r10, $r5 +;; + sd 40[$r19] = $r39 +;; + ld $r5 = 40[$r2] +;; + ld $r58 = 104[$r4] +;; + xord $r17 = $r5, $r58 + ld $r15 = 112[$r4] +;; + xord $r37 = $r17, $r15 + ld $r5 = 176[$r1] +;; + xord $r57 = $r37, $r5 + ld $r51 = 240[$r3] +;; + xord $r57 = $r57, $r51 +;; + sd 48[$r19] = $r57 +;; + ld $r40 = 48[$r2] +;; + ld $r52 = 112[$r4] +;; + xord $r35 = $r40, $r52 + ld $r5 = 120[$r4] +;; + xord $r5 = $r35, $r5 + ld $r49 = 184[$r1] +;; + xord $r15 = $r5, $r49 + ld $r34 = 248[$r3] +;; + xord $r46 = $r15, $r34 +;; + sd 56[$r19] = $r46 +;; + ld $r33 = 0[$r2] +;; + ld $r36 = 128[$r1] +;; + ld $r48 = 120[$r4] + xord $r42 = $r33, $r36 +;; + ld $r5 = 184[$r1] +;; + xord $r34 = $r48, $r5 + ld $r47 = 192[$r3] +;; + xord $r60 = $r42, $r47 +;; + xord $r60 = $r60, $r34 +;; + sd 64[$r19] = $r60 +;; + ld $r43 = 8[$r2] +;; + ld $r47 = 64[$r4] +;; + xord $r63 = $r43, $r47 + ld $r52 = 128[$r1] +;; + xord $r5 = $r63, $r52 + ld $r7 = 136[$r1] +;; + xord $r60 = $r5, $r7 + ld $r15 = 200[$r3] +;; + xord $r55 = $r60, $r15 +;; + xord $r48 = $r55, $r34 +;; + sd 72[$r19] = $r48 +;; + ld $r56 = 16[$r2] +;; + ld $r5 = 72[$r4] +;; + xord $r7 = $r56, $r5 + ld $r46 = 136[$r1] +;; + xord $r41 = $r7, $r46 + ld $r40 = 144[$r1] +;; + xord $r5 = $r41, $r40 + ld $r47 = 208[$r3] +;; + xord $r5 = $r5, $r47 +;; + sd 80[$r19] = $r5 +;; + ld $r52 = 24[$r2] +;; + ld $r54 = 80[$r4] +;; + xord $r35 = $r52, $r54 + ld $r63 = 144[$r1] +;; + xord $r7 = $r35, $r63 + ld $r8 = 152[$r1] +;; + xord $r33 = $r7, $r8 + ld $r37 = 216[$r3] +;; + xord $r56 = $r33, $r37 +;; + xord $r54 = $r56, $r34 +;; + sd 88[$r19] = $r54 +;; + ld $r9 = 32[$r2] +;; + ld $r6 = 88[$r4] +;; + xord $r44 = $r9, $r6 + ld $r51 = 152[$r1] +;; + xord $r35 = $r44, $r51 + ld $r52 = 160[$r1] +;; + xord $r38 = $r35, $r52 + ld $r9 = 224[$r3] +;; + xord $r62 = $r38, $r9 +;; + xord $r6 = $r62, $r34 +;; + sd 96[$r19] = $r6 +;; + ld $r15 = 40[$r2] +;; + ld $r17 = 96[$r4] +;; + xord $r36 = $r15, $r17 + ld $r5 = 160[$r1] +;; + xord $r50 = $r36, $r5 + ld $r51 = 168[$r1] +;; + xord $r37 = $r50, $r51 + ld $r42 = 232[$r3] +;; + xord $r58 = $r37, $r42 +;; + sd 104[$r19] = $r58 +;; + ld $r56 = 48[$r2] +;; + ld $r41 = 104[$r4] +;; + xord $r11 = $r56, $r41 + ld $r48 = 168[$r1] +;; + xord $r51 = $r11, $r48 + ld $r58 = 176[$r1] +;; + xord $r61 = $r51, $r58 + ld $r5 = 240[$r3] +;; + xord $r61 = $r61, $r5 +;; + sd 112[$r19] = $r61 +;; + ld $r34 = 56[$r2] +;; + ld $r56 = 112[$r4] +;; + xord $r46 = $r34, $r56 + ld $r9 = 176[$r1] +;; + xord $r62 = $r46, $r9 + ld $r33 = 184[$r1] +;; + xord $r46 = $r62, $r33 + ld $r61 = 248[$r3] +;; + xord $r40 = $r46, $r61 +;; + sd 120[$r19] = $r40 +;; + ld $r5 = 184[$r1] +;; + ld $r59 = 248[$r3] +;; + xord $r43 = $r5, $r59 + ld $r55 = 0[$r2] +;; + ld $r5 = 64[$r4] +;; + xord $r42 = $r55, $r5 + ld $r35 = 192[$r3] +;; + xord $r49 = $r42, $r35 +;; + xord $r5 = $r49, $r43 +;; + sd 128[$r19] = $r5 +;; + ld $r57 = 8[$r2] +;; + ld $r5 = 72[$r4] +;; + xord $r44 = $r57, $r5 + ld $r45 = 128[$r1] +;; + xord $r17 = $r44, $r45 + ld $r33 = 192[$r3] +;; + xord $r52 = $r17, $r33 + ld $r39 = 200[$r3] +;; + xord $r35 = $r52, $r39 +;; + xord $r62 = $r35, $r43 +;; + sd 136[$r19] = $r62 +;; + ld $r5 = 16[$r2] +;; + ld $r39 = 80[$r4] +;; + xord $r36 = $r5, $r39 + ld $r41 = 136[$r1] +;; + xord $r6 = $r36, $r41 + ld $r5 = 200[$r3] +;; + xord $r35 = $r6, $r5 + ld $r11 = 208[$r3] +;; + xord $r37 = $r35, $r11 +;; + sd 144[$r19] = $r37 +;; + ld $r5 = 24[$r2] +;; + ld $r63 = 88[$r4] +;; + xord $r33 = $r5, $r63 + ld $r45 = 144[$r1] +;; + xord $r49 = $r33, $r45 + ld $r36 = 208[$r3] +;; + xord $r55 = $r49, $r36 + ld $r8 = 216[$r3] +;; + xord $r41 = $r55, $r8 +;; + xord $r58 = $r41, $r43 +;; + sd 152[$r19] = $r58 +;; + ld $r6 = 32[$r2] +;; + ld $r47 = 96[$r4] +;; + xord $r11 = $r6, $r47 + ld $r61 = 152[$r1] +;; + xord $r44 = $r11, $r61 + ld $r9 = 216[$r3] +;; + xord $r59 = $r44, $r9 + ld $r34 = 224[$r3] +;; + xord $r7 = $r59, $r34 +;; + xord $r17 = $r7, $r43 +;; + sd 160[$r19] = $r17 +;; + ld $r54 = 40[$r2] +;; + ld $r53 = 104[$r4] +;; + xord $r7 = $r54, $r53 + ld $r59 = 160[$r1] +;; + xord $r37 = $r7, $r59 + ld $r41 = 224[$r3] +;; + xord $r10 = $r37, $r41 + ld $r46 = 232[$r3] +;; + xord $r10 = $r10, $r46 +;; + sd 168[$r19] = $r10 +;; + ld $r58 = 48[$r2] +;; + ld $r5 = 112[$r4] +;; + xord $r40 = $r58, $r5 + ld $r38 = 168[$r1] +;; + xord $r57 = $r40, $r38 + ld $r51 = 232[$r3] +;; + xord $r60 = $r57, $r51 + ld $r55 = 240[$r3] +;; + xord $r53 = $r60, $r55 +;; + sd 176[$r19] = $r53 +;; + ld $r45 = 56[$r2] +;; + ld $r41 = 120[$r4] +;; + xord $r5 = $r45, $r41 + ld $r53 = 176[$r1] +;; + xord $r38 = $r5, $r53 + ld $r8 = 240[$r3] +;; + xord $r43 = $r38, $r8 + ld $r63 = 248[$r3] +;; + xord $r6 = $r43, $r63 +;; + sd 184[$r19] = $r6 +;; + ld $r8 = 0[$r2] +;; + ld $r58 = 64[$r4] +;; + ld $r35 = 56[$r2] + xord $r54 = $r8, $r58 +;; + ld $r5 = 248[$r3] +;; + xord $r50 = $r35, $r5 + ld $r51 = 128[$r1] +;; + xord $r11 = $r54, $r51 +;; + xord $r38 = $r11, $r50 +;; + sd 192[$r19] = $r38 +;; + ld $r63 = 8[$r2] +;; + ld $r54 = 0[$r2] +;; + xord $r54 = $r63, $r54 + ld $r36 = 72[$r4] +;; + xord $r5 = $r54, $r36 + ld $r41 = 136[$r1] +;; + xord $r39 = $r5, $r41 + ld $r58 = 192[$r3] +;; + xord $r44 = $r39, $r58 +;; + xord $r33 = $r44, $r50 +;; + sd 200[$r19] = $r33 +;; + ld $r5 = 8[$r2] +;; + ld $r63 = 16[$r2] +;; + xord $r54 = $r63, $r5 + ld $r49 = 80[$r4] + addw $r63 = $r18, 32 +;; + xord $r51 = $r54, $r49 + ld $r5 = 144[$r1] + andw $r18 = $r63, 127 +;; + xord $r43 = $r51, $r5 + ld $r57 = 200[$r3] +;; + xord $r47 = $r43, $r57 +;; + sd 208[$r19] = $r47 + addw $r47 = $r21, 32 +;; + andw $r21 = $r47, 127 +;; + ld $r7 = 24[$r2] +;; + ld $r15 = 16[$r2] +;; + xord $r56 = $r7, $r15 + ld $r48 = 88[$r4] +;; + xord $r10 = $r56, $r48 + ld $r51 = 152[$r1] +;; + xord $r39 = $r10, $r51 + addw $r10 = $r22, 32 +;; + ld $r48 = 208[$r3] + andw $r22 = $r10, 127 +;; + xord $r53 = $r39, $r48 +;; + xord $r37 = $r53, $r50 +;; + sd 216[$r19] = $r37 +;; + ld $r9 = 32[$r2] +;; + ld $r15 = 24[$r2] +;; + xord $r43 = $r9, $r15 + ld $r53 = 96[$r4] + addw $r15 = $r20, 32 +;; + xord $r42 = $r43, $r53 + ld $r17 = 160[$r1] + andw $r20 = $r15, 127 +;; + xord $r55 = $r42, $r17 + ld $r62 = 216[$r3] + sxwd $r8 = $r20 +;; + xord $r60 = $r55, $r62 + slld $r43 = $r8, 3 +;; + xord $r5 = $r60, $r50 + sxwd $r50 = $r18 +;; + sd 224[$r19] = $r5 + slld $r39 = $r50, 3 +;; + ld $r5 = 40[$r2] +;; + ld $r51 = 32[$r2] +;; + xord $r62 = $r5, $r51 + ld $r45 = 168[$r1] +;; + ld $r5 = 104[$r4] +;; + xord $r9 = $r62, $r5 +;; + xord $r17 = $r9, $r45 +;; + ld $r5 = 224[$r3] +;; + xord $r49 = $r17, $r5 +;; + sd 232[$r19] = $r49 +;; + ld $r33 = 48[$r2] +;; + ld $r57 = 40[$r2] +;; + xord $r49 = $r33, $r57 + ld $r55 = 112[$r4] +;; + xord $r59 = $r49, $r55 + ld $r36 = 176[$r1] +;; + xord $r61 = $r59, $r36 + ld $r52 = 232[$r3] +;; + xord $r6 = $r61, $r52 +;; + sd 240[$r19] = $r6 +;; + ld $r49 = 56[$r2] +;; + ld $r45 = 48[$r2] + addd $r2 = $r0, $r39 +;; + xord $r56 = $r49, $r45 + ld $r59 = 120[$r4] +;; + xord $r11 = $r56, $r59 + ld $r38 = 184[$r1] +;; + xord $r4 = $r11, $r38 + ld $r34 = 240[$r3] + sxwd $r38 = $r22 + sxwd $r3 = $r21 +;; + xord $r1 = $r4, $r34 + slld $r10 = $r38, 3 + slld $r36 = $r3, 3 +;; + sd 248[$r19] = $r1 + addd $r19 = $r19, 256 + addd $r4 = $r0, $r10 + addd $r1 = $r0, $r43 +;; + addd $r3 = $r0, $r36 + cb.wnez $r32? .L112 +;; + addd $r1 = $r12, 64 + make $r2, 1024 + call memmove +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r20 = 32[$r12] +;; + ld $r21 = 40[$r12] +;; + ld $r22 = 48[$r12] +;; + ld $r23 = 56[$r12] +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1088 +;; + ret +;; + .type bs_shiftmix, @function + .size bs_shiftmix, . - bs_shiftmix + .text + .balign 2 + .globl bs_mixcolumns +bs_mixcolumns: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1040 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + addd $r46 = $r12, 16 + make $r45, 0 +;; +.L113: + ld $r60 = 64[$r0] + addw $r45 = $r45, 1 + make $r32, 4 +;; + ld $r54 = 128[$r0] + compw.lt $r32 = $r45, $r32 +;; + ld $r44 = 56[$r0] + xord $r49 = $r60, $r54 +;; + ld $r7 = 120[$r0] +;; + xord $r57 = $r44, $r7 + ld $r5 = 192[$r0] +;; + xord $r1 = $r49, $r5 +;; + xord $r40 = $r1, $r57 +;; + sd 0[$r46] = $r40 +;; + ld $r42 = 0[$r0] +;; + xord $r53 = $r42, $r60 + ld $r39 = 72[$r0] +;; + xord $r43 = $r53, $r39 + ld $r55 = 136[$r0] +;; + xord $r6 = $r43, $r55 + ld $r2 = 200[$r0] +;; + xord $r15 = $r6, $r2 +;; + xord $r8 = $r15, $r57 +;; + sd 8[$r46] = $r8 +;; + ld $r35 = 8[$r0] +;; + xord $r59 = $r35, $r39 + ld $r37 = 80[$r0] + xord $r60 = $r35, $r60 +;; + xord $r56 = $r59, $r37 + ld $r6 = 144[$r0] + xord $r59 = $r59, $r54 +;; + xord $r43 = $r56, $r6 + ld $r51 = 208[$r0] + xord $r59 = $r59, $r5 +;; + xord $r11 = $r43, $r51 + xord $r59 = $r59, $r2 +;; + sd 16[$r46] = $r11 +;; + ld $r47 = 16[$r0] +;; + xord $r11 = $r47, $r37 + ld $r52 = 88[$r0] +;; + xord $r48 = $r11, $r52 + ld $r17 = 152[$r0] +;; + xord $r4 = $r48, $r17 + ld $r1 = 216[$r0] +;; + xord $r4 = $r4, $r1 +;; + xord $r3 = $r4, $r57 +;; + sd 24[$r46] = $r3 +;; + ld $r8 = 24[$r0] +;; + xord $r58 = $r8, $r52 + ld $r36 = 96[$r0] +;; + xord $r9 = $r58, $r36 + ld $r50 = 160[$r0] + xord $r58 = $r58, $r6 +;; + xord $r40 = $r9, $r50 + ld $r4 = 224[$r0] +;; + xord $r61 = $r40, $r4 +;; + xord $r48 = $r61, $r57 +;; + sd 32[$r46] = $r48 +;; + ld $r15 = 32[$r0] +;; + xord $r57 = $r15, $r36 + ld $r38 = 104[$r0] +;; + xord $r61 = $r57, $r38 + ld $r3 = 168[$r0] +;; + xord $r9 = $r61, $r3 + ld $r48 = 232[$r0] +;; + xord $r9 = $r9, $r48 +;; + sd 40[$r46] = $r9 +;; + ld $r43 = 40[$r0] +;; + xord $r34 = $r43, $r38 + ld $r33 = 112[$r0] +;; + xord $r40 = $r34, $r33 + ld $r10 = 176[$r0] + xord $r63 = $r44, $r33 +;; + xord $r49 = $r40, $r10 + ld $r41 = 240[$r0] +;; + xord $r62 = $r49, $r41 +;; + sd 48[$r46] = $r62 + xord $r62 = $r42, $r54 +;; + xord $r62 = $r62, $r5 +;; + ld $r9 = 48[$r0] +;; + xord $r56 = $r9, $r33 + ld $r40 = 184[$r0] +;; + xord $r49 = $r56, $r7 + xord $r56 = $r56, $r3 +;; + xord $r61 = $r49, $r40 +;; + ld $r49 = 248[$r0] + addd $r0 = $r0, 256 +;; + xord $r61 = $r61, $r49 +;; + sd 56[$r46] = $r61 + xord $r61 = $r7, $r40 +;; + xord $r62 = $r62, $r61 +;; + sd 64[$r46] = $r62 + xord $r62 = $r60, $r54 +;; + xord $r60 = $r62, $r55 +;; + xord $r60 = $r60, $r2 +;; + xord $r60 = $r60, $r61 +;; + sd 72[$r46] = $r60 + xord $r60 = $r47, $r39 +;; + xord $r60 = $r60, $r55 +;; + xord $r60 = $r60, $r6 +;; + xord $r60 = $r60, $r51 +;; + sd 80[$r46] = $r60 + xord $r60 = $r8, $r37 +;; + xord $r60 = $r60, $r6 +;; + xord $r60 = $r60, $r17 +;; + xord $r60 = $r60, $r1 +;; + xord $r60 = $r60, $r61 +;; + sd 88[$r46] = $r60 + xord $r60 = $r15, $r52 +;; + xord $r60 = $r60, $r17 +;; + xord $r60 = $r60, $r50 +;; + xord $r60 = $r60, $r4 +;; + xord $r60 = $r60, $r61 + xord $r61 = $r53, $r5 +;; + sd 96[$r46] = $r60 + xord $r60 = $r43, $r36 +;; + xord $r60 = $r60, $r50 +;; + xord $r60 = $r60, $r3 +;; + xord $r60 = $r60, $r48 +;; + sd 104[$r46] = $r60 + xord $r60 = $r9, $r38 +;; + xord $r60 = $r60, $r3 +;; + xord $r60 = $r60, $r10 +;; + xord $r60 = $r60, $r41 +;; + sd 112[$r46] = $r60 + xord $r60 = $r63, $r10 +;; + xord $r60 = $r60, $r40 +;; + xord $r60 = $r60, $r49 +;; + sd 120[$r46] = $r60 + xord $r60 = $r40, $r49 +;; + xord $r61 = $r61, $r60 + xord $r63 = $r59, $r60 + xord $r59 = $r11, $r55 +;; + sd 128[$r46] = $r61 + xord $r59 = $r59, $r2 +;; + sd 136[$r46] = $r63 + xord $r11 = $r59, $r51 + xord $r63 = $r58, $r51 +;; + sd 144[$r46] = $r11 + xord $r58 = $r63, $r1 + xord $r11 = $r57, $r17 +;; + xord $r61 = $r58, $r60 + xord $r57 = $r11, $r1 +;; + sd 152[$r46] = $r61 + xord $r57 = $r57, $r4 +;; + xord $r57 = $r57, $r60 +;; + sd 160[$r46] = $r57 + xord $r57 = $r34, $r50 +;; + xord $r57 = $r57, $r4 +;; + xord $r11 = $r57, $r48 + xord $r57 = $r53, $r54 +;; + sd 168[$r46] = $r11 + xord $r11 = $r56, $r48 +;; + xord $r56 = $r11, $r41 + xord $r11 = $r44, $r7 +;; + sd 176[$r46] = $r56 + xord $r34 = $r11, $r10 +;; + xord $r11 = $r34, $r41 +;; + xord $r56 = $r11, $r49 + xord $r49 = $r44, $r49 + xord $r11 = $r35, $r42 + xord $r42 = $r9, $r43 +;; + sd 184[$r46] = $r56 + xord $r53 = $r57, $r49 + xord $r11 = $r11, $r39 + xord $r58 = $r42, $r33 +;; + sd 192[$r46] = $r53 + xord $r61 = $r11, $r55 + xord $r53 = $r47, $r35 +;; + xord $r39 = $r61, $r5 + xord $r62 = $r53, $r37 +;; + xord $r34 = $r39, $r49 + xord $r37 = $r62, $r6 +;; + sd 200[$r46] = $r34 + xord $r57 = $r37, $r2 + xord $r37 = $r8, $r47 + xord $r34 = $r15, $r8 +;; + sd 208[$r46] = $r57 + xord $r35 = $r37, $r52 +;; + xord $r47 = $r35, $r17 +;; + xord $r47 = $r47, $r51 +;; + xord $r54 = $r47, $r49 + xord $r47 = $r34, $r36 +;; + sd 216[$r46] = $r54 + xord $r35 = $r47, $r50 +;; + xord $r39 = $r35, $r1 +;; + xord $r11 = $r39, $r49 + xord $r49 = $r43, $r15 +;; + sd 224[$r46] = $r11 + xord $r53 = $r49, $r38 + xord $r11 = $r58, $r10 + xord $r38 = $r44, $r9 +;; + xord $r6 = $r53, $r3 + xord $r56 = $r11, $r48 + xord $r52 = $r38, $r7 +;; + xord $r15 = $r6, $r4 + xord $r34 = $r52, $r40 +;; + sd 232[$r46] = $r15 + xord $r58 = $r34, $r41 +;; + sd 240[$r46] = $r56 +;; + sd 248[$r46] = $r58 + addd $r46 = $r46, 256 + cb.wnez $r32? .L113 +;; + addd $r0 = $r0, -1024 + addd $r1 = $r46, -1024 + make $r2, 1024 + call memmove +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1040 +;; + ret +;; + .type bs_mixcolumns, @function + .size bs_mixcolumns, . - bs_mixcolumns + .text + .balign 2 + .globl bs_mixcolumns_rev +bs_mixcolumns_rev: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1040 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + addd $r49 = $r12, 16 + make $r50, 0 +;; +.L114: + ld $r47 = 56[$r0] + addw $r50 = $r50, 8 + make $r32, 32 +;; + ld $r11 = 48[$r0] + compw.lt $r32 = $r50, $r32 +;; + xord $r56 = $r47, $r11 + ld $r6 = 40[$r0] +;; + xord $r2 = $r56, $r6 + ld $r15 = 120[$r0] +;; + ld $r33 = 104[$r0] + xord $r1 = $r47, $r15 +;; + xord $r60 = $r15, $r33 + ld $r7 = 176[$r0] +;; + xord $r57 = $r2, $r60 + ld $r48 = 168[$r0] +;; + xord $r63 = $r7, $r48 + ld $r44 = 64[$r0] +;; + xord $r62 = $r57, $r63 + ld $r34 = 128[$r0] +;; + ld $r3 = 232[$r0] + xord $r38 = $r44, $r34 +;; + xord $r9 = $r62, $r3 + ld $r17 = 192[$r0] +;; + ld $r51 = 112[$r0] + xord $r36 = $r38, $r17 +;; + xord $r8 = $r56, $r51 + ld $r10 = 184[$r0] + xord $r40 = $r36, $r9 +;; + xord $r57 = $r10, $r7 + ld $r5 = 240[$r0] + xord $r4 = $r1, $r10 +;; + xord $r54 = $r8, $r57 + ld $r39 = 248[$r0] +;; + xord $r35 = $r54, $r5 + xord $r58 = $r4, $r39 + sd 0[$r49] = $r40 +;; + ld $r8 = 0[$r0] +;; + xord $r36 = $r8, $r44 + ld $r52 = 72[$r0] +;; + xord $r59 = $r36, $r52 + ld $r45 = 136[$r0] +;; + xord $r59 = $r59, $r45 + ld $r4 = 200[$r0] +;; + xord $r59 = $r59, $r4 +;; + xord $r36 = $r59, $r9 +;; + xord $r61 = $r36, $r35 +;; + sd 8[$r49] = $r61 +;; + ld $r43 = 8[$r0] +;; + xord $r53 = $r43, $r8 + ld $r46 = 80[$r0] +;; + xord $r53 = $r53, $r52 + ld $r2 = 144[$r0] +;; + xord $r36 = $r53, $r46 + ld $r42 = 208[$r0] +;; + xord $r59 = $r36, $r2 +;; + xord $r53 = $r59, $r34 +;; + xord $r53 = $r53, $r42 +;; + xord $r37 = $r53, $r35 +;; + xord $r53 = $r37, $r58 +;; + sd 16[$r49] = $r53 +;; + ld $r1 = 16[$r0] +;; + xord $r41 = $r1, $r43 + ld $r38 = 88[$r0] +;; + xord $r36 = $r41, $r8 +;; + xord $r53 = $r36, $r44 +;; + xord $r36 = $r53, $r46 + ld $r41 = 152[$r0] +;; + xord $r36 = $r36, $r38 +;; + xord $r40 = $r36, $r41 +;; + xord $r53 = $r40, $r45 +;; + xord $r36 = $r53, $r34 +;; + ld $r53 = 216[$r0] +;; + xord $r54 = $r36, $r53 +;; + xord $r36 = $r54, $r17 +;; + xord $r36 = $r36, $r9 +;; + xord $r36 = $r36, $r58 +;; + sd 24[$r49] = $r36 +;; + ld $r36 = 24[$r0] +;; + xord $r54 = $r36, $r1 + ld $r40 = 96[$r0] +;; + xord $r61 = $r54, $r43 + ld $r37 = 160[$r0] +;; + xord $r54 = $r61, $r52 +;; + xord $r59 = $r54, $r38 +;; + xord $r59 = $r59, $r40 +;; + xord $r54 = $r59, $r37 +;; + xord $r54 = $r54, $r2 +;; + xord $r55 = $r54, $r45 +;; + ld $r54 = 224[$r0] +;; + xord $r55 = $r55, $r54 +;; + xord $r55 = $r55, $r4 +;; + xord $r55 = $r55, $r9 +;; + xord $r9 = $r55, $r35 +;; + sd 32[$r49] = $r9 +;; + ld $r9 = 32[$r0] + addd $r0 = $r0, 256 +;; + xord $r55 = $r9, $r36 + xord $r61 = $r6, $r9 +;; + xord $r55 = $r55, $r1 +;; + xord $r55 = $r55, $r46 +;; + xord $r55 = $r55, $r40 +;; + xord $r55 = $r55, $r33 +;; + xord $r55 = $r55, $r48 +;; + xord $r55 = $r55, $r41 +;; + xord $r55 = $r55, $r2 +;; + xord $r55 = $r55, $r3 +;; + xord $r55 = $r55, $r42 +;; + xord $r35 = $r55, $r35 +;; + xord $r60 = $r35, $r58 + xord $r35 = $r61, $r36 +;; + sd 40[$r49] = $r60 + xord $r55 = $r35, $r38 +;; + xord $r62 = $r55, $r33 +;; + xord $r59 = $r62, $r51 + xord $r62 = $r8, $r34 +;; + xord $r35 = $r59, $r7 + xord $r59 = $r11, $r6 + xord $r62 = $r62, $r17 +;; + xord $r35 = $r35, $r37 +;; + xord $r35 = $r35, $r41 +;; + xord $r55 = $r35, $r5 + xord $r35 = $r59, $r9 +;; + xord $r63 = $r55, $r53 + xord $r60 = $r35, $r40 +;; + xord $r55 = $r63, $r58 + xord $r35 = $r60, $r51 +;; + sd 48[$r49] = $r55 + xord $r63 = $r35, $r15 + xord $r55 = $r15, $r51 +;; + xord $r35 = $r63, $r10 + xord $r63 = $r1, $r52 +;; + xord $r35 = $r35, $r48 +;; + xord $r35 = $r35, $r37 +;; + xord $r60 = $r35, $r39 + xord $r35 = $r55, $r33 +;; + xord $r60 = $r60, $r54 + xord $r35 = $r6, $r35 +;; + sd 56[$r49] = $r60 + xord $r60 = $r10, $r48 +;; + xord $r61 = $r35, $r60 + xord $r60 = $r5, $r3 + xord $r35 = $r39, $r5 +;; + xord $r60 = $r61, $r60 + xord $r61 = $r11, $r55 +;; + xord $r61 = $r61, $r7 + xord $r62 = $r62, $r60 +;; + xord $r61 = $r61, $r35 + sd 64[$r49] = $r62 + xord $r62 = $r43, $r44 +;; + xord $r62 = $r62, $r45 +;; + xord $r62 = $r62, $r34 +;; + xord $r62 = $r62, $r4 +;; + xord $r62 = $r62, $r60 +;; + xord $r62 = $r62, $r61 +;; + sd 72[$r49] = $r62 + xord $r62 = $r63, $r44 +;; + xord $r62 = $r62, $r2 +;; + xord $r62 = $r62, $r45 +;; + xord $r62 = $r62, $r42 +;; + xord $r62 = $r62, $r17 +;; + xord $r62 = $r62, $r61 +;; + xord $r62 = $r62, $r58 +;; + sd 80[$r49] = $r62 + xord $r62 = $r36, $r8 +;; + xord $r62 = $r62, $r46 +;; + xord $r62 = $r62, $r52 +;; + xord $r62 = $r62, $r44 +;; + xord $r62 = $r62, $r41 +;; + xord $r62 = $r62, $r2 +;; + xord $r62 = $r62, $r34 +;; + xord $r62 = $r62, $r53 +;; + xord $r62 = $r62, $r4 +;; + xord $r62 = $r62, $r17 +;; + xord $r62 = $r62, $r60 +;; + xord $r62 = $r62, $r58 +;; + sd 88[$r49] = $r62 + xord $r62 = $r9, $r43 +;; + xord $r62 = $r62, $r38 +;; + xord $r62 = $r62, $r46 +;; + xord $r62 = $r62, $r52 +;; + xord $r62 = $r62, $r37 +;; + xord $r62 = $r62, $r41 +;; + xord $r62 = $r62, $r45 +;; + xord $r62 = $r62, $r54 +;; + xord $r62 = $r62, $r42 +;; + xord $r62 = $r62, $r4 +;; + xord $r60 = $r62, $r60 +;; + xord $r60 = $r60, $r61 +;; + sd 96[$r49] = $r60 + xord $r60 = $r6, $r1 +;; + xord $r60 = $r60, $r40 +;; + xord $r60 = $r60, $r38 +;; + xord $r60 = $r60, $r46 +;; + xord $r60 = $r60, $r48 +;; + xord $r60 = $r60, $r37 +;; + xord $r60 = $r60, $r2 +;; + xord $r60 = $r60, $r3 +;; + xord $r60 = $r60, $r53 +;; + xord $r60 = $r60, $r42 +;; + xord $r60 = $r60, $r61 +;; + xord $r60 = $r60, $r58 +;; + sd 104[$r49] = $r60 + xord $r60 = $r11, $r36 +;; + xord $r60 = $r60, $r33 +;; + xord $r60 = $r60, $r40 +;; + xord $r60 = $r60, $r38 +;; + xord $r62 = $r60, $r7 +;; + xord $r60 = $r62, $r48 +;; + xord $r60 = $r60, $r41 +;; + xord $r60 = $r60, $r5 +;; + xord $r60 = $r60, $r54 +;; + xord $r60 = $r60, $r53 +;; + xord $r58 = $r60, $r58 +;; + sd 112[$r49] = $r58 + xord $r58 = $r47, $r9 +;; + xord $r58 = $r58, $r51 +;; + xord $r58 = $r58, $r33 +;; + xord $r58 = $r58, $r40 +;; + xord $r58 = $r58, $r10 +;; + xord $r58 = $r58, $r7 +;; + xord $r58 = $r58, $r37 +;; + xord $r58 = $r58, $r39 +;; + xord $r63 = $r58, $r3 +;; + xord $r58 = $r63, $r54 +;; + sd 120[$r49] = $r58 + xord $r58 = $r57, $r48 + xord $r57 = $r51, $r57 +;; + xord $r60 = $r33, $r58 + xord $r58 = $r39, $r3 + xord $r63 = $r57, $r5 +;; + xord $r58 = $r60, $r58 + xord $r63 = $r63, $r56 + xord $r56 = $r15, $r10 +;; + xord $r61 = $r58, $r59 + xord $r56 = $r56, $r39 +;; + xord $r62 = $r56, $r47 + xord $r56 = $r44, $r17 +;; + xord $r56 = $r56, $r8 +;; + xord $r56 = $r56, $r61 +;; + sd 128[$r49] = $r56 + xord $r56 = $r52, $r34 +;; + xord $r56 = $r56, $r4 +;; + xord $r56 = $r56, $r17 +;; + xord $r56 = $r56, $r43 +;; + xord $r56 = $r56, $r61 +;; + xord $r56 = $r56, $r63 +;; + sd 136[$r49] = $r56 + xord $r56 = $r46, $r45 +;; + xord $r56 = $r56, $r34 +;; + xord $r56 = $r56, $r42 +;; + xord $r56 = $r56, $r4 +;; + xord $r56 = $r56, $r1 +;; + xord $r56 = $r56, $r8 +;; + xord $r56 = $r56, $r63 +;; + xord $r56 = $r56, $r62 +;; + sd 144[$r49] = $r56 + xord $r56 = $r38, $r44 +;; + xord $r56 = $r56, $r2 +;; + xord $r56 = $r56, $r45 +;; + xord $r56 = $r56, $r34 +;; + xord $r56 = $r56, $r53 +;; + xord $r56 = $r56, $r42 +;; + xord $r56 = $r56, $r17 +;; + xord $r56 = $r56, $r36 +;; + xord $r59 = $r56, $r43 +;; + xord $r56 = $r59, $r8 +;; + xord $r59 = $r56, $r61 +;; + xord $r56 = $r59, $r62 + xord $r59 = $r33, $r46 +;; + sd 152[$r49] = $r56 + xord $r56 = $r40, $r52 +;; + xord $r56 = $r56, $r41 +;; + xord $r56 = $r56, $r2 +;; + xord $r56 = $r56, $r45 +;; + xord $r56 = $r56, $r54 +;; + xord $r56 = $r56, $r53 +;; + xord $r56 = $r56, $r4 +;; + xord $r56 = $r56, $r9 +;; + xord $r56 = $r56, $r1 +;; + xord $r56 = $r56, $r43 +;; + xord $r56 = $r56, $r61 +;; + xord $r56 = $r56, $r63 +;; + sd 160[$r49] = $r56 + xord $r56 = $r59, $r37 +;; + xord $r56 = $r56, $r41 +;; + xord $r56 = $r56, $r2 +;; + xord $r56 = $r56, $r3 +;; + xord $r56 = $r56, $r54 +;; + xord $r57 = $r56, $r42 +;; + xord $r56 = $r57, $r6 +;; + xord $r56 = $r56, $r36 +;; + xord $r56 = $r56, $r1 +;; + xord $r56 = $r56, $r63 +;; + xord $r56 = $r56, $r62 +;; + sd 168[$r49] = $r56 + xord $r56 = $r51, $r38 +;; + xord $r56 = $r56, $r48 +;; + xord $r56 = $r56, $r37 +;; + xord $r58 = $r56, $r41 +;; + xord $r56 = $r58, $r5 +;; + xord $r56 = $r56, $r3 +;; + xord $r56 = $r56, $r53 +;; + xord $r56 = $r56, $r11 +;; + xord $r58 = $r56, $r9 +;; + xord $r56 = $r58, $r36 +;; + xord $r56 = $r56, $r62 +;; + sd 176[$r49] = $r56 + xord $r56 = $r15, $r40 +;; + xord $r56 = $r56, $r7 +;; + xord $r56 = $r56, $r48 +;; + xord $r56 = $r56, $r37 +;; + xord $r60 = $r56, $r39 + xord $r39 = $r10, $r39 +;; + xord $r56 = $r60, $r5 +;; + xord $r56 = $r56, $r54 +;; + xord $r56 = $r56, $r47 +;; + xord $r56 = $r56, $r6 +;; + xord $r56 = $r56, $r9 +;; + sd 184[$r49] = $r56 + xord $r56 = $r35, $r3 +;; + xord $r57 = $r48, $r56 + xord $r56 = $r47, $r6 +;; + xord $r57 = $r57, $r56 + xord $r56 = $r51, $r33 +;; + xord $r56 = $r57, $r56 + xord $r57 = $r7, $r35 +;; + xord $r35 = $r57, $r11 + xord $r57 = $r39, $r47 +;; + xord $r55 = $r35, $r55 + xord $r35 = $r57, $r15 + xord $r57 = $r34, $r8 + xord $r34 = $r41, $r34 +;; + xord $r39 = $r57, $r44 + xord $r57 = $r45, $r17 + xord $r58 = $r34, $r42 +;; + xord $r39 = $r39, $r56 + xord $r34 = $r58, $r4 +;; + sd 192[$r49] = $r39 + xord $r39 = $r57, $r43 +;; + xord $r39 = $r39, $r8 +;; + xord $r39 = $r39, $r52 +;; + xord $r57 = $r39, $r56 +;; + xord $r39 = $r57, $r55 +;; + sd 200[$r49] = $r39 + xord $r39 = $r2, $r4 +;; + xord $r39 = $r39, $r17 + xord $r17 = $r34, $r17 +;; + xord $r39 = $r39, $r1 +;; + xord $r57 = $r39, $r43 +;; + xord $r39 = $r57, $r46 +;; + xord $r39 = $r39, $r44 +;; + xord $r63 = $r39, $r55 +;; + xord $r57 = $r63, $r35 +;; + sd 208[$r49] = $r57 + xord $r57 = $r17, $r36 +;; + xord $r57 = $r57, $r1 +;; + xord $r57 = $r57, $r8 + xord $r8 = $r48, $r2 +;; + xord $r17 = $r57, $r38 + xord $r57 = $r37, $r45 +;; + xord $r59 = $r17, $r52 + xord $r62 = $r57, $r53 +;; + xord $r34 = $r59, $r44 + xord $r39 = $r62, $r42 +;; + xord $r44 = $r34, $r56 +;; + xord $r44 = $r44, $r35 +;; + sd 216[$r49] = $r44 + xord $r44 = $r39, $r4 +;; + xord $r4 = $r44, $r9 +;; + xord $r4 = $r4, $r36 +;; + xord $r57 = $r4, $r43 + xord $r43 = $r8, $r54 +;; + xord $r59 = $r57, $r40 + xord $r48 = $r43, $r53 +;; + xord $r34 = $r59, $r46 +;; + xord $r52 = $r34, $r52 +;; + xord $r52 = $r52, $r56 + xord $r56 = $r10, $r37 +;; + xord $r39 = $r52, $r55 + xord $r52 = $r48, $r42 +;; + sd 224[$r49] = $r39 + xord $r42 = $r52, $r6 +;; + xord $r17 = $r42, $r9 +;; + xord $r39 = $r17, $r1 +;; + xord $r48 = $r39, $r33 +;; + xord $r42 = $r48, $r38 +;; + xord $r17 = $r42, $r46 +;; + xord $r52 = $r17, $r55 + xord $r55 = $r7, $r41 +;; + xord $r17 = $r52, $r35 + xord $r34 = $r55, $r3 +;; + sd 232[$r49] = $r17 + xord $r44 = $r34, $r54 +;; + xord $r39 = $r44, $r53 +;; + xord $r60 = $r39, $r11 +;; + xord $r61 = $r60, $r6 +;; + xord $r8 = $r61, $r36 +;; + xord $r4 = $r8, $r51 +;; + xord $r39 = $r4, $r40 +;; + xord $r2 = $r39, $r38 +;; + xord $r2 = $r2, $r35 + xord $r35 = $r56, $r5 +;; + sd 240[$r49] = $r2 + xord $r62 = $r35, $r3 +;; + xord $r38 = $r62, $r54 +;; + xord $r48 = $r38, $r47 +;; + xord $r38 = $r48, $r11 +;; + xord $r11 = $r38, $r9 +;; + xord $r1 = $r11, $r15 +;; + xord $r55 = $r1, $r33 +;; + xord $r5 = $r55, $r40 +;; + sd 248[$r49] = $r5 + addd $r49 = $r49, 256 + cb.wnez $r32? .L114 +;; + addd $r0 = $r0, -1024 + addd $r1 = $r49, -1024 + make $r2, 1024 + call memmove +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1040 +;; + ret +;; + .type bs_mixcolumns_rev, @function + .size bs_mixcolumns_rev, . - bs_mixcolumns_rev + .text + .balign 2 + .globl bs_expand_key +bs_expand_key: + addd $r17 = $r12, 0 + addd $r12 = $r12, -224 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 + addd $r0 = $r12, 40 + make $r2, 16 +;; + sd 24[$r12] = $r19 +;; + sd 32[$r12] = $r20 + call memmove +;; + addd $r0 = $r12, 40 + call expand_key +;; + make $r20, 0 + make $r19, 0 +;; +.L115: + sxwd $r1 = $r20 + addd $r11 = $r12, 40 + sxwd $r15 = $r19 + make $r2, 16 +;; + slld $r34 = $r1, 10 + addd $r1 = $r11, $r15 +;; + addd $r0 = $r18, $r34 + call memmove +;; + make $r1, 2 +;; +.L116: + make $r35, 0 +;; +.L117: + addw $r38 = $r1, $r35 + sxwd $r37 = $r35 + addw $r35 = $r35, 1 + make $r32, 2 +;; + sxwd $r6 = $r20 + sxwd $r33 = $r38 + slld $r10 = $r37, 3 + compw.lt $r32 = $r35, $r32 +;; + slld $r3 = $r6, 10 + slld $r9 = $r33, 3 +;; + addd $r0 = $r18, $r3 +;; + addd $r39 = $r0, $r9 + addd $r8 = $r0, $r10 +;; + ld $r17 = 0[$r8] +;; + sd 0[$r39] = $r17 +;; + cb.wnez $r32? .L117 +;; + addw $r1 = $r1, 2 + make $r32, 128 +;; + compw.lt $r32 = $r1, $r32 +;; + cb.wnez $r32? .L116 +;; + call bs_transpose +;; + addw $r20 = $r20, 1 + addw $r19 = $r19, 16 + make $r32, 176 +;; + compw.lt $r32 = $r19, $r32 +;; + cb.wnez $r32? .L115 +;; + ld $r16 = 8[$r12] +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r20 = 32[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 224 +;; + ret +;; + .type bs_expand_key, @function + .size bs_expand_key, . - bs_expand_key + .text + .balign 2 + .globl bs_cipher +bs_cipher: + addd $r17 = $r12, 0 + addd $r12 = $r12, -48 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 +;; + sd 24[$r12] = $r19 + addd $r19 = $r1, 0 + addd $r0 = $r18, 0 +;; + sd 32[$r12] = $r20 + call bs_transpose +;; + addd $r1 = $r19, 0 + addd $r0 = $r18, 0 + call bs_addroundkey +;; + make $r20, 1 +;; +.L118: + addd $r0 = $r18, 0 + call bs_apply_sbox +;; + addd $r0 = $r18, 0 + call bs_shiftmix +;; + sxwd $r6 = $r20 + addd $r0 = $r18, 0 +;; + slld $r4 = $r6, 10 +;; + addd $r1 = $r19, $r4 + call bs_addroundkey +;; + addw $r20 = $r20, 1 + make $r32, 10 +;; + compw.lt $r32 = $r20, $r32 +;; + cb.wnez $r32? .L118 +;; + addd $r0 = $r18, 0 + call bs_apply_sbox +;; + addd $r0 = $r18, 0 + call bs_shiftrows +;; + addd $r1 = $r19, 10240 + addd $r0 = $r18, 0 + call bs_addroundkey +;; + addd $r0 = $r18, 0 +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r20 = 32[$r12] +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 48 +;; + goto bs_transpose_rev +;; + .type bs_cipher, @function + .size bs_cipher, . - bs_cipher + .text + .balign 2 + .globl bs_cipher_rev +bs_cipher_rev: + addd $r17 = $r12, 0 + addd $r12 = $r12, -48 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 +;; + sd 24[$r12] = $r19 + addd $r19 = $r1, 0 + addd $r0 = $r18, 0 +;; + sd 32[$r12] = $r20 + call bs_transpose +;; + addd $r1 = $r19, 10240 + addd $r0 = $r18, 0 + call bs_addroundkey +;; + make $r20, 9 +;; +.L119: + addd $r0 = $r18, 0 + call bs_shiftrows_rev +;; + addd $r0 = $r18, 0 + call bs_apply_sbox_rev +;; + sxwd $r5 = $r20 + addd $r0 = $r18, 0 +;; + slld $r8 = $r5, 10 +;; + addd $r1 = $r19, $r8 + call bs_addroundkey +;; + addd $r0 = $r18, 0 + call bs_mixcolumns_rev +;; + addw $r20 = $r20, -1 +;; + cb.wgtz $r20? .L119 +;; + addd $r0 = $r18, 0 + call bs_shiftrows_rev +;; + addd $r0 = $r18, 0 + call bs_apply_sbox_rev +;; + addd $r1 = $r19, 0 + addd $r0 = $r18, 0 + call bs_addroundkey +;; + addd $r0 = $r18, 0 +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r20 = 32[$r12] +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 48 +;; + goto bs_transpose_rev +;; + .type bs_cipher_rev, @function + .size bs_cipher_rev, . - bs_cipher_rev diff --git a/test/monniaux/bitsliced-aes/notes.org b/test/monniaux/bitsliced-aes/notes.org new file mode 100644 index 00000000..6c2e27fa --- /dev/null +++ b/test/monniaux/bitsliced-aes/notes.org @@ -0,0 +1,59 @@ +* bs_transpose_dst only +** original +==> test.ccomp.host.out <== +cycles: 3080223 + +==> test.ccomp.k1c.out <== +cycles: 10145951 + +==> test.gcc.host.out <== +cycles: 1485887 + +==> test.gcc.k1c.out <== +cycles: 4078535 + +** neg and +==> test.ccomp.host.out <== +cycles: 2905049 + +==> test.ccomp.k1c.out <== +cycles: 7995063 + +==> test.gcc.host.out <== +cycles: 1858263 + +==> test.gcc.k1c.out <== +cycles: 5255763 + +** cmove mais mauvais scheduling de registres +==> test.ccomp.host.out <== +cycles: 4363682 + +==> test.ccomp.k1c.out <== +cycles: 7208629 + +==> test.gcc.host.out <== +cycles: 2916854 + +==> test.gcc.k1c.out <== +cycles: 5646730 + +** cmove via match du and +==> test.ccomp.host.out <== +cycles: 2553732 + +==> test.ccomp.k1c.out <== +cycles: 7208629 + +==> test.gcc.host.out <== +cycles: 1849125 + +==> test.gcc.k1c.out <== +cycles: 5255763 + +** hand optimized loads +cycles: 6027072 + +* both bs_transpose_dst and bs_transpose_rev +** with both cmove +6890902 diff --git a/test/monniaux/ternary_builtin/ternary_builtin.c b/test/monniaux/ternary_builtin/ternary_builtin.c new file mode 100644 index 00000000..caa1c4c7 --- /dev/null +++ b/test/monniaux/ternary_builtin/ternary_builtin.c @@ -0,0 +1,11 @@ +int ternary_signed(int x, int v0, int v1) { + return ((-(x==0)) & v0) | ((-(x!=0)) & v1); +} + +int ternary_unsigned(unsigned x, int v0, int v1) { + return ((-(x==0)) & v0) | ((-(x!=0)) & v1); +} + +long ternary_signedl(long x, long v0, long v1) { + return ((-(x==0)) & v0) | ((-(x!=0)) & v1); +} |