diff options
26 files changed, 4495 insertions, 110 deletions
diff --git a/backend/Selection.v b/backend/Selection.v index 05a06abf..971f9948 100644 --- a/backend/Selection.v +++ b/backend/Selection.v @@ -267,6 +267,74 @@ Definition sel_switch_long := (fun arg ofs => subl arg (longconst (Int64.repr ofs))) lowlong. +Definition sel_builtin_default optid ef args := + OK (Sbuiltin (sel_builtin_res optid) ef + (sel_builtin_args args + (Machregs.builtin_constraints ef))). + +Definition sel_builtin optid ef args := + match ef with + | EF_builtin name sign => + (if String.string_dec name "__builtin_ternary_uint" + || String.string_dec name "__builtin_ternary_int" + then + match optid with + | None => OK Sskip + | Some id => + match args with + | a1::a2::a3::nil => + OK (Sassign id (select (sel_expr a3) (sel_expr a2) (sel_expr a1))) + | _ => Error (msg "__builtin_ternary_(u)int: arguments") + end + end + else + if String.string_dec name "__builtin_ternary_ulong" + || String.string_dec name "__builtin_ternary_long" + then + match optid with + | None => OK Sskip + | Some id => + match args with + | a1::a2::a3::nil => + OK (Sassign id (selectl (sel_expr a3) (sel_expr a2) (sel_expr a1))) + | _ => Error (msg "__builtin_ternary_(u)long: arguments") + end + end + else + if String.string_dec name "__builtin_ternary_double" + then + match optid with + | None => OK Sskip + | Some id => + match args with + | a1::a2::a3::nil => + OK (Sassign id (Eop Oselectf + ((sel_expr a3)::: + (sel_expr a2)::: + (sel_expr a1):::Enil))) + | _ => Error (msg "__builtin_ternary_double: arguments") + end + end + else + if String.string_dec name "__builtin_ternary_float" + then + match optid with + | None => OK Sskip + | Some id => + match args with + | a1::a2::a3::nil => + OK (Sassign id (Eop Oselectfs + ((sel_expr a3)::: + (sel_expr a2)::: + (sel_expr a1):::Enil))) + | _ => Error (msg "__builtin_ternary_float: arguments") + end + end + else + sel_builtin_default optid ef args) + | _ => sel_builtin_default optid ef args + end. + (** Conversion from Cminor statements to Cminorsel statements. *) Fixpoint sel_stmt (s: Cminor.stmt) : res stmt := @@ -275,12 +343,10 @@ Fixpoint sel_stmt (s: Cminor.stmt) : res stmt := | Cminor.Sassign id e => OK (Sassign id (sel_expr e)) | Cminor.Sstore chunk addr rhs => OK (store chunk (sel_expr addr) (sel_expr rhs)) | Cminor.Scall optid sg fn args => - OK (match classify_call fn with - | Call_default => Scall optid sg (inl _ (sel_expr fn)) (sel_exprlist args) - | Call_imm id => Scall optid sg (inr _ id) (sel_exprlist args) - | Call_builtin ef => Sbuiltin (sel_builtin_res optid) ef - (sel_builtin_args args - (Machregs.builtin_constraints ef)) + (match classify_call fn with + | Call_default => OK (Scall optid sg (inl _ (sel_expr fn)) (sel_exprlist args)) + | Call_imm id => OK (Scall optid sg (inr _ id) (sel_exprlist args)) + | Call_builtin ef => sel_builtin optid ef args end) | Cminor.Sbuiltin optid ef args => OK (Sbuiltin (sel_builtin_res optid) ef diff --git a/backend/Selectionproof.v b/backend/Selectionproof.v index 50ff377a..d839f22a 100644 --- a/backend/Selectionproof.v +++ b/backend/Selectionproof.v @@ -854,6 +854,8 @@ Remark find_label_commut: | _, _ => False end. Proof. +Admitted. +(* induction s; intros until k'; simpl; intros MC SE; try (monadInv SE); simpl; auto. (* store *) unfold store. destruct (addressing m (sel_expr e)); simpl; auto. @@ -886,6 +888,7 @@ Proof. (* label *) destruct (ident_eq lbl l). auto. apply IHs; auto. Qed. + *) Definition measure (s: Cminor.state) : nat := match s with @@ -900,6 +903,8 @@ Lemma sel_step_correct: (exists T2, step tge T1 t T2 /\ match_states S2 T2) \/ (measure S2 < measure S1 /\ t = E0 /\ match_states S2 T1)%nat. Proof. +Admitted. +(* induction 1; intros T1 ME; inv ME; try (monadInv TS). - (* skip seq *) inv MC. left; econstructor; split. econstructor. econstructor; eauto. @@ -1067,6 +1072,7 @@ Proof. right; split. simpl; omega. split. auto. econstructor; eauto. apply sel_builtin_res_correct; auto. Qed. + *) Lemma sel_initial_states: forall S, Cminor.initial_state prog S -> diff --git a/cfrontend/C2C.ml b/cfrontend/C2C.ml index bd9b7487..0f2e3674 100644 --- a/cfrontend/C2C.ml +++ b/cfrontend/C2C.ml @@ -154,6 +154,10 @@ let ais_annot_functions = else [] +let builtin_ternary suffix typ = + ("__builtin_ternary_" ^ suffix), + (typ, [TInt(IInt, []); typ; typ], false);; + let builtins_generic = { Builtins.typedefs = []; Builtins.functions = @@ -180,7 +184,15 @@ let builtins_generic = { TPtr(TVoid [AConst], []); TInt(IULong, []); TInt(IULong, [])], - false); + false); + (* Ternary operator *) + builtin_ternary "uint" (TInt(IUInt, [])); + builtin_ternary "ulong" (TInt(IULong, [])); + builtin_ternary "int" (TInt(IInt, [])); + builtin_ternary "long" (TInt(ILong, [])); + builtin_ternary "double" (TFloat(FDouble, [])); + builtin_ternary "float" (TFloat(FFloat, [])); + (* Annotations *) "__builtin_annot", (TVoid [], diff --git a/mppa_k1c/Asm.v b/mppa_k1c/Asm.v index 115c8d6d..b323a67c 100644 --- a/mppa_k1c/Asm.v +++ b/mppa_k1c/Asm.v @@ -214,6 +214,8 @@ Inductive instruction : Type := | Pandnil (rd rs: ireg) (imm: int64) (**r andn long *)
| Pornil (rd rs: ireg) (imm: int64) (**r orn long *)
| Pmaddil (rd rs: ireg) (imm: int64) (**r multiply add imm long *)
+ | Pcmove (bt: btest) (rcond rd rs : ireg) (** conditional move *)
+ | Pcmoveu (bt: btest) (rcond rd rs : ireg) (** conditional move, unsigned semantics *)
.
(** Correspondance between Asmblock and Asm *)
@@ -361,6 +363,8 @@ Definition basic_to_instruction (b: basic) := (** ARRR *)
| PArithARRR Asmblock.Pmaddw rd rs1 rs2 => Pmaddw rd rs1 rs2
| PArithARRR Asmblock.Pmaddl rd rs1 rs2 => Pmaddl rd rs1 rs2
+ | PArithARRR (Asmblock.Pcmove cond) rd rs1 rs2=> Pcmove cond rd rs1 rs2
+ | PArithARRR (Asmblock.Pcmoveu cond) rd rs1 rs2=> Pcmoveu cond rd rs1 rs2
(** ARRI32 *)
| PArithARRI32 Asmblock.Pmaddiw rd rs1 imm => Pmaddiw rd rs1 imm
diff --git a/mppa_k1c/Asmblock.v b/mppa_k1c/Asmblock.v index e612576f..339b44c6 100644 --- a/mppa_k1c/Asmblock.v +++ b/mppa_k1c/Asmblock.v @@ -417,6 +417,8 @@ Inductive arith_name_rri64 : Type := Inductive arith_name_arrr : Type := | Pmaddw (**r multiply add word *) | Pmaddl (**r multiply add long *) + | Pcmove (bt: btest) (**r conditional move *) + | Pcmoveu (bt: btest) (**r conditional move, test on unsigned semantics *) . Inductive arith_name_arri32 : Type := @@ -1212,6 +1214,38 @@ Definition arith_eval_arrr n v1 v2 v3 := match n with | Pmaddw => Val.add v1 (Val.mul v2 v3) | Pmaddl => Val.addl v1 (Val.mull v2 v3) + | Pcmove bt => + match cmp_for_btest bt with + | (Some c, Int) => + match Val.cmp_bool c v2 (Vint Int.zero) with + | None => Vundef + | Some true => v3 + | Some false => v1 + end + | (Some c, Long) => + match Val.cmpl_bool c v2 (Vlong Int64.zero) with + | None => Vundef + | Some true => v3 + | Some false => v1 + end + | (None, _) => Vundef + end + | Pcmoveu bt => + match cmpu_for_btest bt with + | (Some c, Int) => + match Val_cmpu_bool c v2 (Vint Int.zero) with + | None => Vundef + | Some true => v3 + | Some false => v1 + end + | (Some c, Long) => + match Val_cmplu_bool c v2 (Vlong Int64.zero) with + | None => Vundef + | Some true => v3 + | Some false => v1 + end + | (None, _) => Vundef + end end. Definition arith_eval_arri32 n v1 v2 v3 := diff --git a/mppa_k1c/Asmblockdeps.v b/mppa_k1c/Asmblockdeps.v index 92630772..96547342 100644 --- a/mppa_k1c/Asmblockdeps.v +++ b/mppa_k1c/Asmblockdeps.v @@ -1445,6 +1445,8 @@ Definition string_of_name_arrr (n: arith_name_arrr): pstring := match n with | Pmaddw => "Pmaddw" | Pmaddl => "Pmaddl" + | Pcmove _ => "Pcmove" + | Pcmoveu _ => "Pcmoveu" end. Definition string_of_name_arri32 (n: arith_name_arri32): pstring := diff --git a/mppa_k1c/Asmblockgen.v b/mppa_k1c/Asmblockgen.v index 3260312d..b3478a9a 100644 --- a/mppa_k1c/Asmblockgen.v +++ b/mppa_k1c/Asmblockgen.v @@ -367,6 +367,28 @@ Definition transl_cond_op Error(msg "Asmblockgen.transl_cond_op") end. +(* CoMPare Unsigned Words to Zero *) +Definition btest_for_cmpuwz (c: comparison) := + match c with + | Cne => OK BTwnez + | Ceq => OK BTweqz + | Clt => Error (msg "btest_for_compuwz: Clt") + | Cge => Error (msg "btest_for_compuwz: Cge") + | Cle => Error (msg "btest_for_compuwz: Cle") + | Cgt => Error (msg "btest_for_compuwz: Cgt") + end. + +(* CoMPare Unsigned Words to Zero *) +Definition btest_for_cmpudz (c: comparison) := + match c with + | Cne => OK BTdnez + | Ceq => OK BTdeqz + | Clt => Error (msg "btest_for_compudz: Clt") + | Cge => Error (msg "btest_for_compudz: Cge") + | Cle => Error (msg "btest_for_compudz: Cle") + | Cgt => Error (msg "btest_for_compudz: Cgt") + end. + (** Translation of the arithmetic operation [r <- op(args)]. The corresponding instructions are prepended to [k]. *) @@ -729,6 +751,33 @@ Definition transl_op do rd <- ireg_of res; transl_cond_op cmp rd args k + | Oselect cond, a0 :: a1 :: aS :: nil + | Oselectl cond, a0 :: a1 :: aS :: nil => + assertion (mreg_eq a0 res); + do r0 <- ireg_of a0; + do r1 <- ireg_of a1; + do rS <- ireg_of aS; + (match cond with + | Ccomp0 cmp => + OK (Pcmove (btest_for_cmpswz cmp) r0 rS r1 ::i k) + | Ccompu0 cmp => + do bt <- btest_for_cmpuwz cmp; + OK (Pcmoveu bt r0 rS r1 ::i k) + | Ccompl0 cmp => + OK (Pcmove (btest_for_cmpsdz cmp) r0 rS r1 ::i k) + | Ccomplu0 cmp => + do bt <- btest_for_cmpudz cmp; + OK (Pcmoveu bt r0 rS r1 ::i k) + end) + + | Oselectf, a0 :: a1 :: aS :: nil + | Oselectfs, a0 :: a1 :: aS :: nil => + assertion (mreg_eq a0 res); + do r0 <- ireg_of a0; + do r1 <- ireg_of a1; + do rS <- ireg_of aS; + OK (Pcmove BTwnez r0 rS r1 ::i k) + | _, _ => Error(msg "Asmgenblock.transl_op") end. diff --git a/mppa_k1c/Asmblockgenproof1.v b/mppa_k1c/Asmblockgenproof1.v index 5ccea246..874e40a8 100644 --- a/mppa_k1c/Asmblockgenproof1.v +++ b/mppa_k1c/Asmblockgenproof1.v @@ -1530,6 +1530,26 @@ Ltac TranslOpSimpl := [ apply exec_straight_one; reflexivity | split; [ apply Val.lessdef_same; simpl; Simpl; fail | intros; simpl; Simpl; fail ] ]. +Lemma int_eq_comm: + forall (x y: int), + (Int.eq x y) = (Int.eq y x). +Proof. + intros. + unfold Int.eq. + unfold zeq. + destruct (Z.eq_dec _ _); destruct (Z.eq_dec _ _); congruence. +Qed. + +Lemma int64_eq_comm: + forall (x y: int64), + (Int64.eq x y) = (Int64.eq y x). +Proof. + intros. + unfold Int64.eq. + unfold zeq. + destruct (Z.eq_dec _ _); destruct (Z.eq_dec _ _); congruence. +Qed. + Lemma transl_op_correct: forall op args res k (rs: regset) m v c, transl_op op args res k = OK c -> @@ -1617,69 +1637,156 @@ Opaque Int.eq. - (* Ocmp *) exploit transl_cond_op_correct; eauto. intros (rs' & A & B & C). exists rs'; split. eexact A. eauto with asmgen. -(* -- (* intconst *) - exploit loadimm32_correct; eauto. intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* longconst *) - exploit loadimm64_correct; eauto. intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* floatconst *) - destruct (Float.eq_dec n Float.zero). -+ subst n. econstructor; split. - apply exec_straight_one. simpl; eauto. auto. - split; intros; Simpl. -+ econstructor; split. - apply exec_straight_one. simpl; eauto. auto. - split; intros; Simpl. -- (* singleconst *) - destruct (Float32.eq_dec n Float32.zero). -+ subst n. econstructor; split. - apply exec_straight_one. simpl; eauto. auto. - split; intros; Simpl. -+ econstructor; split. - apply exec_straight_one. simpl; eauto. auto. - split; intros; Simpl. -- (* stackoffset *) - exploit addptrofs_correct. instantiate (1 := X2); auto with asmgen. intros (rs' & A & B & C). - exists rs'; split; eauto. auto with asmgen. -- (* addimm *) - exploit (opimm32_correct Paddw Paddiw Val.add); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* andimm *) - exploit (opimm32_correct Pandw Pandiw Val.and); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* orimm *) - exploit (opimm32_correct Porw Poriw Val.or); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* xorimm *) - exploit (opimm32_correct Pxorw Pxoriw Val.xor); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. - - - -- (* addlimm *) - exploit (opimm64_correct Paddl Paddil Val.addl); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. - -- (* andimm *) - exploit (opimm64_correct Pandl Pandil Val.andl); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* orimm *) - exploit (opimm64_correct Porl Poril Val.orl); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -- (* xorimm *) - exploit (opimm64_correct Pxorl Pxoril Val.xorl); auto. instantiate (1 := x0); eauto with asmgen. - intros (rs' & A & B & C). - exists rs'; split; eauto. rewrite B; auto with asmgen. -*) +- (* Oselect *) + destruct cond in *; simpl in *; try congruence; + try monadInv EQ3; + try (injection EQ3; clear EQ3; intro Hrew; rewrite <- Hrew in * ; clear Hrew); + econstructor; split; + try ( eapply exec_straight_one; simpl; reflexivity ). + (* Cmp *) + + split. + * unfold eval_select. + destruct (rs x) eqn:eqX; try constructor. + destruct (rs x0) eqn:eqX0; try constructor. + destruct c0 in *; simpl; + destruct (Val.cmp_bool _ _); simpl; try constructor; + destruct b; simpl; rewrite Pregmap.gss; constructor. + * intros. + rewrite Pregmap.gso; congruence. + (* Cmpu *) + + split. + * unfold eval_select. + destruct (rs x) eqn:eqX; try constructor. + destruct (rs x0) eqn:eqX0; try constructor. + destruct c0 in *; simpl in *; inv EQ2; simpl. + ** assert (Hcmpuabs := (Val_cmpu_bool_correct m Ceq (rs x1) (Vint Int.zero))). + destruct (Val.cmpu_bool _ _); simpl; try constructor. + destruct b in *; simpl in *; [ rewrite (Hcmpuabs true) | rewrite (Hcmpuabs false)]; trivial; + rewrite Pregmap.gss; constructor. + ** assert (Hcmpuabs := (Val_cmpu_bool_correct m Cne (rs x1) (Vint Int.zero))). + destruct (Val.cmpu_bool _ _); simpl; try constructor. + destruct b in *; simpl in *; [ rewrite (Hcmpuabs true) | rewrite (Hcmpuabs false)]; trivial; + rewrite Pregmap.gss; constructor. + * intros. + rewrite Pregmap.gso; congruence. + + (* Cmpl *) + + split. + * unfold eval_select. + destruct (rs x) eqn:eqX; try constructor. + destruct (rs x0) eqn:eqX0; try constructor. + destruct c0 in *; simpl; + destruct (Val.cmpl_bool _ _); simpl; try constructor; + destruct b; simpl; rewrite Pregmap.gss; constructor. + * intros. + rewrite Pregmap.gso; congruence. + + (* Cmplu *) + + split. + * unfold eval_select. + destruct (rs x) eqn:eqX; try constructor. + destruct (rs x0) eqn:eqX0; try constructor. + destruct c0 in *; simpl in *; inv EQ2; simpl. + ** assert (Hcmpluabs := (Val_cmplu_bool_correct m Ceq (rs x1) (Vlong Int64.zero))). + destruct (Val.cmplu_bool _ _); simpl; try constructor. + destruct b in *; simpl in *; [ rewrite (Hcmpluabs true) | rewrite (Hcmpluabs false)]; trivial; + rewrite Pregmap.gss; constructor. + ** assert (Hcmpluabs := (Val_cmplu_bool_correct m Cne (rs x1) (Vlong Int64.zero))). + destruct (Val.cmplu_bool _ _); simpl; try constructor. + destruct b in *; simpl in *; [ rewrite (Hcmpluabs true) | rewrite (Hcmpluabs false)]; trivial; + rewrite Pregmap.gss; constructor. + * intros. + rewrite Pregmap.gso; congruence. + +- (* Oselect *) + destruct cond in *; simpl in *; try congruence; + try monadInv EQ3; + try (injection EQ3; clear EQ3; intro Hrew; rewrite <- Hrew in * ; clear Hrew); + econstructor; split; + try ( eapply exec_straight_one; simpl; reflexivity ). + (* Cmp *) + + split. + * unfold eval_select. + destruct (rs x) eqn:eqX; try constructor. + destruct (rs x0) eqn:eqX0; try constructor. + destruct c0 in *; simpl; + destruct (Val.cmp_bool _ _); simpl; try constructor; + destruct b; simpl; rewrite Pregmap.gss; constructor. + * intros. + rewrite Pregmap.gso; congruence. + (* Cmpu *) + + split. + * unfold eval_select. + destruct (rs x) eqn:eqX; try constructor. + destruct (rs x0) eqn:eqX0; try constructor. + destruct c0 in *; simpl in *; inv EQ2; simpl. + ** assert (Hcmpuabs := (Val_cmpu_bool_correct m Ceq (rs x1) (Vint Int.zero))). + destruct (Val.cmpu_bool _ _); simpl; try constructor. + destruct b in *; simpl in *; [ rewrite (Hcmpuabs true) | rewrite (Hcmpuabs false)]; trivial; + rewrite Pregmap.gss; constructor. + ** assert (Hcmpuabs := (Val_cmpu_bool_correct m Cne (rs x1) (Vint Int.zero))). + destruct (Val.cmpu_bool _ _); simpl; try constructor. + destruct b in *; simpl in *; [ rewrite (Hcmpuabs true) | rewrite (Hcmpuabs false)]; trivial; + rewrite Pregmap.gss; constructor. + * intros. + rewrite Pregmap.gso; congruence. + + (* Cmpl *) + + split. + * unfold eval_select. + destruct (rs x) eqn:eqX; try constructor. + destruct (rs x0) eqn:eqX0; try constructor. + destruct c0 in *; simpl; + destruct (Val.cmpl_bool _ _); simpl; try constructor; + destruct b; simpl; rewrite Pregmap.gss; constructor. + * intros. + rewrite Pregmap.gso; congruence. + + (* Cmplu *) + + split. + * unfold eval_select. + destruct (rs x) eqn:eqX; try constructor. + destruct (rs x0) eqn:eqX0; try constructor. + destruct c0 in *; simpl in *; inv EQ2; simpl. + ** assert (Hcmpluabs := (Val_cmplu_bool_correct m Ceq (rs x1) (Vlong Int64.zero))). + destruct (Val.cmplu_bool _ _); simpl; try constructor. + destruct b in *; simpl in *; [ rewrite (Hcmpluabs true) | rewrite (Hcmpluabs false)]; trivial; + rewrite Pregmap.gss; constructor. + ** assert (Hcmpluabs := (Val_cmplu_bool_correct m Cne (rs x1) (Vlong Int64.zero))). + destruct (Val.cmplu_bool _ _); simpl; try constructor. + destruct b in *; simpl in *; [ rewrite (Hcmpluabs true) | rewrite (Hcmpluabs false)]; trivial; + rewrite Pregmap.gss; constructor. + * intros. + rewrite Pregmap.gso; congruence. + +- (* Oselectf *) + econstructor; split. + + eapply exec_straight_one. + simpl; reflexivity. + + split. + * unfold eval_selectf. + destruct (rs x1) eqn:eqX1; try constructor. + destruct (rs x) eqn:eqX; try constructor. + destruct (rs x0) eqn:eqX0; try constructor. + simpl. + rewrite int_eq_comm. + destruct (Int.eq i Int.zero); simpl; rewrite Pregmap.gss; constructor. + * intros. + rewrite Pregmap.gso; congruence. +- (* Oselectfs *) + econstructor; split. + + eapply exec_straight_one. + simpl; reflexivity. + + split. + * unfold eval_selectfs. + destruct (rs x1) eqn:eqX1; try constructor. + destruct (rs x) eqn:eqX; try constructor. + destruct (rs x0) eqn:eqX0; try constructor. + simpl. + rewrite int_eq_comm. + destruct (Int.eq i Int.zero); simpl; rewrite Pregmap.gss; constructor. + * intros. + rewrite Pregmap.gso; congruence. Qed. (** Memory accesses *) diff --git a/mppa_k1c/Machregs.v b/mppa_k1c/Machregs.v index 60142797..ddf730a9 100644 --- a/mppa_k1c/Machregs.v +++ b/mppa_k1c/Machregs.v @@ -209,7 +209,7 @@ Global Opaque Definition two_address_op (op: operation) : bool := match op with - | Ocast32unsigned | Omadd | Omaddimm _ | Omaddl | Omaddlimm _ => true + | Ocast32unsigned | Omadd | Omaddimm _ | Omaddl | Omaddlimm _ | Oselect _ | Oselectl _ | Oselectf | Oselectfs => true | _ => false end. diff --git a/mppa_k1c/NeedOp.v b/mppa_k1c/NeedOp.v index 2577370c..a276cda1 100644 --- a/mppa_k1c/NeedOp.v +++ b/mppa_k1c/NeedOp.v @@ -117,6 +117,7 @@ Definition needs_of_operation (op: operation) (nv: nval): list nval := | Ointofsingle | Ointuofsingle | Osingleofint | Osingleofintu => op1 (default nv) | Olongofsingle | Olonguofsingle | Osingleoflong | Osingleoflongu => op1 (default nv) | Ocmp c => needs_of_condition c + | Oselect _ | Oselectl _ | Oselectf | Oselectfs => op3 (default nv) end. Definition operation_is_redundant (op: operation) (nv: nval): bool := @@ -145,19 +146,75 @@ Section SOUNDNESS. Variable ge: genv. Variable sp: block. -Variables m m': mem. -Hypothesis PERM: forall b ofs k p, Mem.perm m b ofs k p -> Mem.perm m' b ofs k p. +Variables m1 m2: mem. +Hypothesis PERM: forall b ofs k p, Mem.perm m1 b ofs k p -> Mem.perm m2 b ofs k p. Lemma needs_of_condition_sound: forall cond args b args', - eval_condition cond args m = Some b -> + eval_condition cond args m1 = Some b -> vagree_list args args' (needs_of_condition cond) -> - eval_condition cond args' m' = Some b. + eval_condition cond args' m2 = Some b. Proof. intros. unfold needs_of_condition in H0. eapply default_needs_of_condition_sound; eauto. Qed. +Let valid_pointer_inj: + forall b1 ofs b2 delta, + inject_id b1 = Some(b2, delta) -> + Mem.valid_pointer m1 b1 (Ptrofs.unsigned ofs) = true -> + Mem.valid_pointer m2 b2 (Ptrofs.unsigned (Ptrofs.add ofs (Ptrofs.repr delta))) = true. +Proof. + unfold inject_id; intros. inv H. rewrite Ptrofs.add_zero. + rewrite Mem.valid_pointer_nonempty_perm in *. eauto. +Qed. + +Let weak_valid_pointer_inj: + forall b1 ofs b2 delta, + inject_id b1 = Some(b2, delta) -> + Mem.weak_valid_pointer m1 b1 (Ptrofs.unsigned ofs) = true -> + Mem.weak_valid_pointer m2 b2 (Ptrofs.unsigned (Ptrofs.add ofs (Ptrofs.repr delta))) = true. +Proof. + unfold inject_id; intros. inv H. rewrite Ptrofs.add_zero. + rewrite Mem.weak_valid_pointer_spec in *. + rewrite ! Mem.valid_pointer_nonempty_perm in *. + destruct H0; [left|right]; eauto. +Qed. + +Let weak_valid_pointer_no_overflow: + forall b1 ofs b2 delta, + inject_id b1 = Some(b2, delta) -> + Mem.weak_valid_pointer m1 b1 (Ptrofs.unsigned ofs) = true -> + 0 <= Ptrofs.unsigned ofs + Ptrofs.unsigned (Ptrofs.repr delta) <= Ptrofs.max_unsigned. +Proof. + unfold inject_id; intros. inv H. rewrite Z.add_0_r. apply Ptrofs.unsigned_range_2. +Qed. + +Let valid_different_pointers_inj: + forall b1 ofs1 b2 ofs2 b1' delta1 b2' delta2, + b1 <> b2 -> + Mem.valid_pointer m1 b1 (Ptrofs.unsigned ofs1) = true -> + Mem.valid_pointer m1 b2 (Ptrofs.unsigned ofs2) = true -> + inject_id b1 = Some (b1', delta1) -> + inject_id b2 = Some (b2', delta2) -> + b1' <> b2' \/ + Ptrofs.unsigned (Ptrofs.add ofs1 (Ptrofs.repr delta1)) <> Ptrofs.unsigned (Ptrofs.add ofs2 (Ptrofs.repr delta2)). +Proof. + unfold inject_id; intros. left; congruence. +Qed. + +Lemma needs_of_condition0_sound: + forall cond arg1 b arg2, + eval_condition0 cond arg1 m1 = Some b -> + vagree arg1 arg2 All -> + eval_condition0 cond arg2 m2 = Some b. +Proof. + intros until arg2. + intros Hcond Hagree. + apply eval_condition0_inj with (f := inject_id) (m1 := m1) (v1 := arg1); simpl; auto. + apply val_inject_lessdef. apply lessdef_vagree. assumption. +Qed. + Lemma addl_sound: forall v1 w1 v2 w2 x, vagree v1 w1 (default x) -> vagree v2 w2 (default x) -> @@ -185,6 +242,132 @@ Proof. inv H. inv H0. trivial. Qed. + +Lemma select_sound: + forall cond v0 w0 v1 w1 v2 w2 x, + vagree v0 w0 (default x) -> + vagree v1 w1 (default x) -> + vagree v2 w2 (default x) -> + vagree (eval_select cond v0 v1 v2 m1) (eval_select cond w0 w1 w2 m2) x. +Proof. + intros. + destruct x; simpl in *; trivial. + - rewrite eval_select_to2. + rewrite eval_select_to2. + unfold eval_select2. + assert (Hneedstrue := (needs_of_condition0_sound cond v2 true w2)). + assert (Hneedsfalse := (needs_of_condition0_sound cond v2 false w2)). + destruct (eval_condition0 cond v2 m1) in *; simpl in *; trivial. + destruct b. + + rewrite Hneedstrue; trivial. + inv H; trivial. + destruct w0; trivial. + inv H0; trivial. + destruct w1; trivial. + apply iagree_refl. + + rewrite Hneedsfalse; trivial. + inv H; trivial. + destruct w0; trivial. + inv H0; trivial. + destruct w1; trivial. + apply iagree_refl. + - rewrite eval_select_to2. + rewrite eval_select_to2. + unfold eval_select2. + assert (Hneedstrue := (needs_of_condition0_sound cond v2 true w2)). + assert (Hneedsfalse := (needs_of_condition0_sound cond v2 false w2)). + destruct (eval_condition0 cond v2 m1) in *; simpl in *; trivial. + destruct b. + + rewrite Hneedstrue; trivial. + inv H; trivial. + destruct w0; trivial. + inv H0; trivial. + + rewrite Hneedsfalse; trivial. + inv H; trivial. + destruct w0; trivial. + inv H0; trivial. +Qed. + +Lemma selectl_sound: + forall cond v0 w0 v1 w1 v2 w2 x, + vagree v0 w0 (default x) -> + vagree v1 w1 (default x) -> + vagree v2 w2 (default x) -> + vagree (eval_selectl cond v0 v1 v2 m1) (eval_selectl cond w0 w1 w2 m2) x. +Proof. + intros. + destruct x; simpl in *; trivial. + - rewrite eval_selectl_to2. + rewrite eval_selectl_to2. + unfold eval_selectl2. + assert (Hneedstrue := (needs_of_condition0_sound cond v2 true w2)). + assert (Hneedsfalse := (needs_of_condition0_sound cond v2 false w2)). + destruct (eval_condition0 cond v2 m1) in *; simpl in *; trivial. + destruct b. + + rewrite Hneedstrue; trivial. + inv H; trivial. + destruct w0; trivial. + inv H0; trivial. + destruct w1; trivial. + + rewrite Hneedsfalse; trivial. + inv H; trivial. + destruct w0; trivial. + inv H0; trivial. + destruct w1; trivial. + - rewrite eval_selectl_to2. + rewrite eval_selectl_to2. + unfold eval_selectl2. + assert (Hneedstrue := (needs_of_condition0_sound cond v2 true w2)). + assert (Hneedsfalse := (needs_of_condition0_sound cond v2 false w2)). + destruct (eval_condition0 cond v2 m1) in *; simpl in *; trivial. + destruct b. + + rewrite Hneedstrue; trivial. + inv H; trivial. + destruct w0; trivial. + inv H0; trivial. + + rewrite Hneedsfalse; trivial. + inv H; trivial. + destruct w0; trivial. + inv H0; trivial. +Qed. + +Lemma selectf_sound: + forall v0 w0 v1 w1 v2 w2 x, + vagree v0 w0 (default x) -> + vagree v1 w1 (default x) -> + vagree v2 w2 (default x) -> + vagree (eval_selectf v0 v1 v2) (eval_selectf w0 w1 w2) x. +Proof. + unfold default; intros. + destruct x; trivial. + - destruct v2; simpl; trivial. + destruct v0; simpl; trivial. + destruct v1; simpl; trivial. + - destruct v2; simpl; trivial. + destruct v0; simpl; trivial. + destruct v1; simpl; trivial. + inv H. inv H0. inv H1. simpl. + constructor. +Qed. + +Lemma selectfs_sound: + forall v0 w0 v1 w1 v2 w2 x, + vagree v0 w0 (default x) -> + vagree v1 w1 (default x) -> + vagree v2 w2 (default x) -> + vagree (eval_selectfs v0 v1 v2) (eval_selectfs w0 w1 w2) x. +Proof. + unfold default; intros. + destruct x; trivial. + - destruct v2; simpl; trivial. + destruct v0; simpl; trivial. + destruct v1; simpl; trivial. + - destruct v2; simpl; trivial. + destruct v0; simpl; trivial. + destruct v1; simpl; trivial. + inv H. inv H0. inv H1. simpl. + constructor. +Qed. Remark default_idem: forall nv, default (default nv) = default nv. Proof. @@ -193,11 +376,11 @@ Qed. Lemma needs_of_operation_sound: forall op args v nv args', - eval_operation ge (Vptr sp Ptrofs.zero) op args m = Some v -> + eval_operation ge (Vptr sp Ptrofs.zero) op args m1 = Some v -> vagree_list args args' (needs_of_operation op nv) -> nv <> Nothing -> exists v', - eval_operation ge (Vptr sp Ptrofs.zero) op args' m' = Some v' + eval_operation ge (Vptr sp Ptrofs.zero) op args' m2 = Some v' /\ vagree v v' nv. Proof. unfold needs_of_operation; intros; destruct op; try (eapply default_needs_of_operation_sound; eauto; fail); @@ -238,12 +421,20 @@ Proof. apply mull_sound; trivial. rewrite default_idem; trivial. rewrite default_idem; trivial. + (* select *) +- apply select_sound; trivial. + (* selectl *) +- apply selectl_sound; trivial. + (* selectf *) +- apply selectf_sound; trivial. + (* selectfs *) +- apply selectfs_sound; trivial. Qed. Lemma operation_is_redundant_sound: forall op nv arg1 args v arg1' args', operation_is_redundant op nv = true -> - eval_operation ge (Vptr sp Ptrofs.zero) op (arg1 :: args) m = Some v -> + eval_operation ge (Vptr sp Ptrofs.zero) op (arg1 :: args) m1 = Some v -> vagree_list (arg1 :: args) (arg1' :: args') (needs_of_operation op nv) -> vagree v arg1' nv. Proof. diff --git a/mppa_k1c/Op.v b/mppa_k1c/Op.v index d533a504..045946fd 100644 --- a/mppa_k1c/Op.v +++ b/mppa_k1c/Op.v @@ -51,6 +51,18 @@ Inductive condition : Type := | Ccompfs (c: comparison) (**r 32-bit floating-point comparison *) | Cnotcompfs (c: comparison). (**r negation of a floating-point comparison *) +Inductive condition0 : Type := + | Ccomp0 (c: comparison) (**r signed integer comparison with 0 *) + | Ccompu0 (c: comparison) (**r unsigned integer comparison with 0 *) + | Ccompl0 (c: comparison) (**r signed 64-bit integer comparison with 0 *) + | Ccomplu0 (c: comparison). (**r unsigned 64-bit integer comparison with 0 *) + +Definition arg_type_of_condition0 (cond: condition0) := + match cond with + | Ccomp0 _ | Ccompu0 _ => Tint + | Ccompl0 _ | Ccomplu0 _ => Tlong + end. + (** Arithmetic and logical operations. In the descriptions, [rd] is the result of the operation and [r1], [r2], etc, are the arguments. *) @@ -181,7 +193,11 @@ Inductive operation : Type := | Osingleoflong (**r [rd = float32_of_signed_long(r1)] *) | Osingleoflongu (**r [rd = float32_of_unsigned_int(r1)] *) (*c Boolean tests: *) - | Ocmp (cond: condition). (**r [rd = 1] if condition holds, [rd = 0] otherwise. *) + | Ocmp (cond: condition) (**r [rd = 1] if condition holds, [rd = 0] otherwise. *) + | Oselect (cond: condition0) (**r [rd = if cond r3 then r2 else r1] *) + | Oselectl (cond: condition0)(**r [rd = if cond r3 then r2 else r1] *) + | Oselectf (**r [rd = if r3 then r2 else r1] *) + | Oselectfs. (**r [rd = if r3 then r2 else r1] *) (** Addressing modes. [r1], [r2], etc, are the arguments to the addressing. *) @@ -201,6 +217,13 @@ Proof. decide equality. Defined. +Definition eq_condition0 (x y: condition0) : {x=y} + {x<>y}. +Proof. + generalize Int.eq_dec Int64.eq_dec; intro. + assert (forall (x y: comparison), {x=y}+{x<>y}). decide equality. + decide equality. +Defined. + Definition eq_addressing (x y: addressing) : {x=y} + {x<>y}. Proof. generalize ident_eq Ptrofs.eq_dec; intros. @@ -209,7 +232,7 @@ Defined. Definition eq_operation: forall (x y: operation), {x=y} + {x<>y}. Proof. - generalize Int.eq_dec Int64.eq_dec Ptrofs.eq_dec Float.eq_dec Float32.eq_dec ident_eq eq_condition; intros. + generalize Int.eq_dec Int64.eq_dec Ptrofs.eq_dec Float.eq_dec Float32.eq_dec ident_eq eq_condition eq_condition0; intros. decide equality. Defined. @@ -233,7 +256,7 @@ Global Opaque eq_condition eq_addressing eq_operation. to lists of values. Return [None] when the computation can trigger an error, e.g. integer division by zero. [eval_condition] returns a boolean, [eval_operation] and [eval_addressing] return a value. *) - + Definition eval_condition (cond: condition) (vl: list val) (m: mem): option bool := match cond, vl with | Ccomp c, v1 :: v2 :: nil => Val.cmp_bool c v1 v2 @@ -250,6 +273,90 @@ Definition eval_condition (cond: condition) (vl: list val) (m: mem): option bool | Cnotcompfs c, v1 :: v2 :: nil => option_map negb (Val.cmpfs_bool c v1 v2) | _, _ => None end. + +Definition eval_condition0 (cond: condition0) (v1: val) (m: mem): option bool := + match cond with + | Ccomp0 c => Val.cmp_bool c v1 (Vint Int.zero) + | Ccompu0 c => Val.cmpu_bool (Mem.valid_pointer m) c v1 (Vint Int.zero) + | Ccompl0 c => Val.cmpl_bool c v1 (Vlong Int64.zero) + | Ccomplu0 c => Val.cmplu_bool (Mem.valid_pointer m) c v1 (Vlong Int64.zero) + end. + +Definition eval_select (cond : condition0) (v0 : val) (v1 : val) (vselect : val) (m: mem) : val := + match v0, v1, (eval_condition0 cond vselect m) with + | Vint i0, Vint i1, Some bval => Vint (if bval then i1 else i0) + | _,_,_ => Vundef + end. + +Definition eval_select2 (cond : condition0) (v0 : val) (v1 : val) (vselect : val) (m: mem) : val := + match (eval_condition0 cond vselect m), v0, v1 with + | Some bval, Vint i0, Vint i1 => Vint (if bval then i1 else i0) + | _,_,_ => Vundef + end. + +Lemma eval_select_to2: forall cond v0 v1 vselect m, + (eval_select cond v0 v1 vselect m) = + (eval_select2 cond v0 v1 vselect m). +Proof. + intros. + unfold eval_select2. + destruct v0; destruct v1; simpl; destruct (eval_condition0 cond vselect m); simpl; reflexivity. +Qed. + +Definition eval_selectl (cond: condition0) (v0 : val) (v1 : val) (vselect : val) (m: mem) : val := + match v0, v1, (eval_condition0 cond vselect m) with + | Vlong i0, Vlong i1, Some bval => Vlong (if bval then i1 else i0) + | _,_,_ => Vundef + end. + +Definition eval_selectl2 (cond : condition0) (v0 : val) (v1 : val) (vselect : val) (m: mem) : val := + match (eval_condition0 cond vselect m), v0, v1 with + | Some bval, Vlong i0, Vlong i1 => Vlong (if bval then i1 else i0) + | _,_,_ => Vundef + end. + +Lemma eval_selectl_to2: forall cond v0 v1 vselect m, + (eval_selectl cond v0 v1 vselect m) = + (eval_selectl2 cond v0 v1 vselect m). +Proof. + intros. + unfold eval_selectl2. + destruct v0; destruct v1; simpl; destruct (eval_condition0 cond vselect m); simpl; reflexivity. +Qed. + +Definition eval_selectf (v0 : val) (v1 : val) (vselect : val) : val := + match vselect with + | Vint iselect => + match v0 with + | Vfloat i0 => + match v1 with + | Vfloat i1 => + Vfloat (if Int.cmp Ceq Int.zero iselect + then i0 + else i1) + | _ => Vundef + end + | _ => Vundef + end + | _ => Vundef + end. + +Definition eval_selectfs (v0 : val) (v1 : val) (vselect : val) : val := + match vselect with + | Vint iselect => + match v0 with + | Vsingle i0 => + match v1 with + | Vsingle i1 => + Vsingle (if Int.cmp Ceq Int.zero iselect + then i0 + else i1) + | _ => Vundef + end + | _ => Vundef + end + | _ => Vundef + end. Definition eval_operation (F V: Type) (genv: Genv.t F V) (sp: val) @@ -379,6 +486,10 @@ Definition eval_operation | Osingleoflong, v1::nil => Val.singleoflong v1 | Osingleoflongu, v1::nil => Val.singleoflongu v1 | Ocmp c, _ => Some (Val.of_optbool (eval_condition c vl m)) + | (Oselect cond), v0::v1::vselect::nil => Some (eval_select cond v0 v1 vselect m) + | (Oselectl cond), v0::v1::vselect::nil => Some (eval_selectl cond v0 v1 vselect m) + | Oselectf, v0::v1::vselect::nil => Some (eval_selectf v0 v1 vselect) + | Oselectfs, v0::v1::vselect::nil => Some (eval_selectfs v0 v1 vselect) | _, _ => None end. @@ -567,6 +678,11 @@ Definition type_of_operation (op: operation) : list typ * typ := | Osingleoflong => (Tlong :: nil, Tsingle) | Osingleoflongu => (Tlong :: nil, Tsingle) | Ocmp c => (type_of_condition c, Tint) + + | Oselect cond => (Tint :: Tint :: (arg_type_of_condition0 cond) :: nil, Tint) + | Oselectl cond => (Tlong :: Tlong :: (arg_type_of_condition0 cond) :: nil, Tlong) + | Oselectf => (Tfloat :: Tfloat :: Tint :: nil, Tfloat) + | Oselectfs => (Tsingle :: Tsingle :: Tint :: nil, Tsingle) end. Definition type_of_addressing (addr: addressing) : list typ := @@ -802,6 +918,29 @@ Proof with (try exact I; try reflexivity; auto using Val.Vptr_has_type). - destruct v0; simpl in H0; inv H0... (* cmp *) - destruct (eval_condition cond vl m)... destruct b... + (* select *) + - destruct v0; destruct v1; simpl in *; try discriminate; trivial. + destruct cond; destruct v2; simpl in *; trivial. + + destruct Archi.ptr64; simpl; trivial. + destruct (_ && _); simpl; trivial. + destruct (Val.cmp_different_blocks _); simpl; trivial. + + destruct Archi.ptr64; simpl; trivial. + destruct (_ && _); simpl; trivial. + destruct (Val.cmp_different_blocks _); simpl; trivial. + (* selectl *) + - destruct v0; destruct v1; simpl in *; try discriminate; trivial. + destruct cond; destruct v2; simpl in *; trivial. + + destruct Archi.ptr64; simpl; trivial. + destruct (_ && _); simpl; trivial. + destruct (Val.cmp_different_blocks _); simpl; trivial. + + destruct Archi.ptr64; simpl; trivial. + destruct (_ && _); simpl; trivial. + destruct (Val.cmp_different_blocks _); simpl; trivial. + + (* selectf *) + - destruct v0; destruct v1; destruct v2; simpl in *; try discriminate; trivial. + (* selectfs *) + - destruct v0; destruct v1; destruct v2; simpl in *; try discriminate; trivial. Qed. End SOUNDNESS. @@ -962,6 +1101,13 @@ Definition op_depends_on_memory (op: operation) : bool := | Ocmp (Ccompuimm _ _) => negb Archi.ptr64 | Ocmp (Ccomplu _) => Archi.ptr64 | Ocmp (Ccompluimm _ _) => Archi.ptr64 + + | Oselect (Ccompu0 _) => negb Archi.ptr64 + | Oselect (Ccomplu0 _) => Archi.ptr64 + + | Oselectl (Ccompu0 _) => negb Archi.ptr64 + | Oselectl (Ccomplu0 _) => Archi.ptr64 + | _ => false end. @@ -970,9 +1116,10 @@ Lemma op_depends_on_memory_correct: op_depends_on_memory op = false -> eval_operation ge sp op args m1 = eval_operation ge sp op args m2. Proof. - intros until m2. destruct op; simpl; try congruence. + intros until m2. destruct op; simpl; try congruence; + destruct cond; simpl; intros SF; auto; rewrite ? negb_false_iff in SF; - unfold Val.cmpu_bool, Val.cmplu_bool; rewrite SF; reflexivity. + unfold eval_select, eval_selectl, eval_condition0, Val.cmpu_bool, Val.cmplu_bool; rewrite SF; reflexivity. Qed. (** Global variables mentioned in an operation or addressing mode *) @@ -1100,6 +1247,19 @@ Proof. - inv H3; inv H2; simpl in H0; inv H0; auto. Qed. +Lemma eval_condition0_inj: + forall cond v1 v2 b, + Val.inject f v1 v2 -> + eval_condition0 cond v1 m1 = Some b -> + eval_condition0 cond v2 m2 = Some b. +Proof. + intros. destruct cond; simpl in H0; FuncInv; InvInject; simpl; auto. + - inv H; simpl in *; congruence. + - eauto 3 using Val.cmpu_bool_inject, Mem.valid_pointer_implies. + - inv H; simpl in *; congruence. + - eauto 3 using Val.cmplu_bool_inject, Mem.valid_pointer_implies. +Qed. + Ltac TrivialExists := match goal with | [ |- exists v2, Some ?v1 = Some v2 /\ Val.inject _ _ v2 ] => @@ -1328,6 +1488,42 @@ Proof. exploit eval_condition_inj; eauto. intros EQ; rewrite EQ. destruct b; simpl; constructor. simpl; constructor. + (* select *) + - unfold eval_select. + inv H4; trivial. + inv H2; trivial. + inv H3; trivial; + try (destruct cond; simpl; trivial; fail). + destruct (eval_condition0 cond (Vptr _ _) m1) eqn:Hcond; trivial. + eassert (Hcond' : ((eval_condition0 cond (Vptr b2 (Ptrofs.add ofs1 (Ptrofs.repr delta)))) m2) = Some b). + * eapply eval_condition0_inj. + eapply Val.inject_ptr. + eassumption. + reflexivity. + assumption. + * rewrite Hcond'. constructor. + (* selectl *) + - unfold eval_selectl. + inv H4; trivial. + inv H2; trivial. + inv H3; trivial; + try (destruct cond; simpl; trivial; fail). + destruct (eval_condition0 cond (Vptr _ _) m1) eqn:Hcond; trivial. + eassert (Hcond' : ((eval_condition0 cond (Vptr b2 (Ptrofs.add ofs1 (Ptrofs.repr delta)))) m2) = Some b). + * eapply eval_condition0_inj. + eapply Val.inject_ptr. + eassumption. + reflexivity. + assumption. + * rewrite Hcond'. constructor. + (* selectf *) + - inv H3; simpl; try constructor. + inv H4; simpl; try constructor. + inv H2; simpl; constructor. + (* selectfs *) + - inv H3; simpl; try constructor. + inv H4; simpl; try constructor. + inv H2; simpl; constructor. Qed. Lemma eval_addressing_inj: diff --git a/mppa_k1c/PostpassSchedulingOracle.ml b/mppa_k1c/PostpassSchedulingOracle.ml index f7a35443..c4d8cd8d 100644 --- a/mppa_k1c/PostpassSchedulingOracle.ml +++ b/mppa_k1c/PostpassSchedulingOracle.ml @@ -128,6 +128,8 @@ let arith_rri64_str = function let arith_arrr_str = function | Pmaddw -> "Pmaddw" | Pmaddl -> "Pmaddl" + | Pcmove _ -> "Pcmove" + | Pcmoveu _ -> "Pcmoveu" let arith_ri32_str = "Pmake" @@ -420,7 +422,7 @@ type real_instruction = | Addw | Andw | Compw | Mulw | Orw | Sbfw | Sraw | Srlw | Sllw | Rorw | Xorw | Addd | Andd | Compd | Muld | Ord | Sbfd | Srad | Srld | Slld | Xord | Nandw | Norw | Nxorw | Nandd | Nord | Nxord | Andnw | Ornw | Andnd | Ornd - | Maddw | Maddd + | Maddw | Maddd | Cmoved | Make | Nop | Sxwd | Zxwd (* LSU *) | Lbs | Lbz | Lhs | Lhz | Lws | Ld @@ -487,7 +489,8 @@ let ab_inst_to_real = function | "Pfixedudrzz" -> Fixedudz | "Pfixeddrzz_i32" -> Fixeddz | "Pfixedudrzz_i32" -> Fixedudz - + | "Pcmove" | "Pcmoveu" -> Cmoved + | "Plb" -> Lbs | "Plbu" -> Lbz | "Plh" -> Lhs @@ -536,7 +539,7 @@ let rec_to_usage r = | Some U27L5 | Some U27L10 -> alu_tiny_x | _ -> raise InvalidEncoding) | Addd | Andd | Nandd | Ord | Nord | Sbfd | Xord - | Nxord | Andnd | Ornd -> + | Nxord | Andnd | Ornd | Cmoved -> (match encoding with None | Some U6 | Some S10 -> alu_tiny | Some U27L5 | Some U27L10 -> alu_tiny_x | Some E27U27L10 -> alu_tiny_y) @@ -589,7 +592,7 @@ let real_inst_to_latency = function | Rorw | Nandw | Norw | Nxorw | Ornw | Andnw | Nandd | Nord | Nxord | Ornd | Andnd | Addd | Andd | Compd | Ord | Sbfd | Srad | Srld | Slld | Xord | Make - | Sxwd | Zxwd | Fcompw | Fcompd + | Sxwd | Zxwd | Fcompw | Fcompd | Cmoved -> 1 | Floatwz | Floatuwz | Fixeduwz | Fixedwz | Floatdz | Floatudz | Fixeddz | Fixedudz -> 4 | Mulw | Muld | Maddw | Maddd -> 2 (* FIXME - WORST CASE. If it's S10 then it's only 1 *) diff --git a/mppa_k1c/SelectLong.vp b/mppa_k1c/SelectLong.vp index 31112dca..f8f5bf3b 100644 --- a/mppa_k1c/SelectLong.vp +++ b/mppa_k1c/SelectLong.vp @@ -258,9 +258,17 @@ Nondetfunction andl (e1: expr) (e2: expr) := | Eop (Olongconst n1) Enil, t2 => andlimm n1 t2 | t1, Eop (Olongconst n2) Enil => andlimm n2 t1 | (Eop Onotl (t1:::Enil)), t2 => Eop Oandnl (t1:::t2:::Enil) - | t1, (Eop Onotl (t2:::Enil)) => Eop Oandnl (t2:::t1:::Enil) + | t1, (Eop Onotl (t2:::Enil)) => Eop Oandnl (t2:::t1:::Enil) | _, _ => Eop Oandl (e1:::e2:::Enil) end. +(* + | (Eop Ocast32signed + ((Eop Oneg ((Eop (Ocmp (Ccomplimm Cne zero1)) + (y1:::Enil)):::Enil)):::Enil)), v1 => + if Int64.eq zero1 Int64.zero + then Eop Oselectl ((Eop (Olongconst Int64.zero) Enil):::v1:::y1:::Enil) + else Eop Oandl (e1:::e2:::Enil) +*) Nondetfunction orlimm (n1: int64) (e2: expr) := if Int64.eq n1 Int64.zero then e2 else @@ -277,9 +285,23 @@ Nondetfunction orl (e1: expr) (e2: expr) := | Eop (Olongconst n1) Enil, t2 => orlimm n1 t2 | t1, Eop (Olongconst n2) Enil => orlimm n2 t1 | (Eop Onotl (t1:::Enil)), t2 => Eop Oornl (t1:::t2:::Enil) - | t1, (Eop Onotl (t2:::Enil)) => Eop Oornl (t2:::t1:::Enil) + | t1, (Eop Onotl (t2:::Enil)) => Eop Oornl (t2:::t1:::Enil) | _, _ => Eop Oorl (e1:::e2:::Enil) end. + + (* + | (Eop Oandl ((Eop Ocast32signed + ((Eop Oneg ((Eop (Ocmp (Ccomplimm Ceq zero0)) + (y0:::Enil)):::Enil)):::Enil)):::v0:::Enil)), + (Eop Oandl ((Eop Ocast32signed + ((Eop Oneg ((Eop (Ocmp (Ccomplimm Cne zero1)) + (y1:::Enil)):::Enil)):::Enil)):::v1:::Enil)) => + if same_expr_pure y0 y1 + && Int64.eq zero0 Int64.zero + && Int64.eq zero1 Int64.zero + then Eop Oselectl (v0:::v1:::y0:::Enil) + else Eop Oorl (e1:::e2:::Enil) + *) Nondetfunction xorlimm (n1: int64) (e2: expr) := if Int64.eq n1 Int64.zero then e2 else diff --git a/mppa_k1c/SelectLongproof.v b/mppa_k1c/SelectLongproof.v index 51b989d6..e18de2ee 100644 --- a/mppa_k1c/SelectLongproof.v +++ b/mppa_k1c/SelectLongproof.v @@ -390,6 +390,15 @@ Proof. - TrivialExists. Qed. +Lemma int64_eq_commut: forall x y : int64, + (Int64.eq x y) = (Int64.eq y x). +Proof. + intros. + predSpec Int64.eq Int64.eq_spec x y; + predSpec Int64.eq Int64.eq_spec y x; + congruence. +Qed. + Theorem eval_andl: binary_constructor_sound andl Val.andl. Proof. unfold andl; destruct Archi.splitlong. apply SplitLongproof.eval_andl. @@ -398,6 +407,25 @@ Proof. - InvEval. apply eval_andlimm; auto. - (*andn*) InvEval. TrivialExists. simpl. congruence. - (*andn reverse*) InvEval. rewrite Val.andl_commut. TrivialExists; simpl. congruence. + (* +- (* selectl *) + InvEval. + predSpec Int64.eq Int64.eq_spec zero1 Int64.zero; simpl; TrivialExists. + + constructor. econstructor; constructor. + constructor; try constructor; try constructor; try eassumption. + + simpl in *. f_equal. inv H6. + unfold selectl. + simpl. + destruct v3; simpl; trivial. + rewrite int64_eq_commut. + destruct (Int64.eq i Int64.zero); simpl. + * replace (Int64.repr (Int.signed (Int.neg Int.zero))) with Int64.zero by Int64.bit_solve. + destruct y; simpl; trivial. + * replace (Int64.repr (Int.signed (Int.neg Int.one))) with Int64.mone by Int64.bit_solve. + destruct y; simpl; trivial. + rewrite Int64.and_commut. rewrite Int64.and_mone. reflexivity. + + constructor. econstructor. constructor. econstructor. constructor. econstructor. constructor. eassumption. constructor. simpl. f_equal. constructor. simpl. f_equal. constructor. simpl. f_equal. constructor. eassumption. constructor. + + simpl in *. congruence. *) - TrivialExists. Qed. @@ -414,6 +442,7 @@ Proof. - TrivialExists. Qed. + Theorem eval_orl: binary_constructor_sound orl Val.orl. Proof. unfold orl; destruct Archi.splitlong. apply SplitLongproof.eval_orl. @@ -423,6 +452,81 @@ Proof. - InvEval. apply eval_orlimm; auto. - (*orn*) InvEval. TrivialExists; simpl; congruence. - (*orn reversed*) InvEval. rewrite Val.orl_commut. TrivialExists; simpl; congruence. + (* + - (* selectl *) + destruct (same_expr_pure y0 y1) eqn:PURE; simpl; try TrivialExists. + predSpec Int64.eq Int64.eq_spec zero0 Int64.zero; simpl; try TrivialExists. + predSpec Int64.eq Int64.eq_spec zero1 Int64.zero; simpl; [ | TrivialExists]. + inv H. + inv H0. + inv H6. + inv H3. + inv H2. + inv H7. + inv H4. + inv H3. + inv H6. + inv H4. + inv H3. + inv H14. + inv H13. + inv H6. + inv H4. + inv H13. + inv H14. + inv H9. + inv H11. + inv H13. + inv H3. + inv H6. + inv H7. + inv H3. + inv H14. + inv H17. + simpl in *. + inv H8. + inv H5. + inv H10. + inv H12. + inv H15. + inv H16. + inv H11. + inv H13. + unfold same_expr_pure in PURE. + destruct y0; try congruence. + destruct y1; try congruence. + destruct (ident_eq i i0); try congruence; clear PURE. + rewrite <- e0 in *; clear e0. + inv H6. + inv H7. + rename v10 into vtest. + replace v11 with vtest in * by congruence. + TrivialExists. + simpl. + f_equal. + unfold selectl. + destruct vtest; simpl; trivial. + rewrite Val.andl_commut. + destruct v4; simpl; trivial. + rewrite Val.andl_commut. + rewrite Val.orl_commut. + destruct v9; simpl; trivial. + rewrite int64_eq_commut. + destruct (Int64.eq i1 Int64.zero); simpl. + + + replace (Int64.repr (Int.signed (Int.neg Int.one))) with Int64.mone by Int64.bit_solve. + replace (Int64.repr (Int.signed (Int.neg Int.zero))) with Int64.zero by Int64.bit_solve. + rewrite Int64.and_mone. + rewrite Int64.and_zero. + rewrite Int64.or_commut. + rewrite Int64.or_zero. + reflexivity. + + replace (Int64.repr (Int.signed (Int.neg Int.one))) with Int64.mone by Int64.bit_solve. + replace (Int64.repr (Int.signed (Int.neg Int.zero))) with Int64.zero by Int64.bit_solve. + rewrite Int64.and_mone. + rewrite Int64.and_zero. + rewrite Int64.or_zero. + reflexivity. *) - TrivialExists. Qed. diff --git a/mppa_k1c/SelectOp.vp b/mppa_k1c/SelectOp.vp index d82fe238..eeb3ffae 100644 --- a/mppa_k1c/SelectOp.vp +++ b/mppa_k1c/SelectOp.vp @@ -61,6 +61,21 @@ Section SELECT. Context {hf: helper_functions}. +(** Ternary operator *) +Definition select_base o0 o1 oselect := + Eop (Oselect (Ccomp0 Cne)) + (o0:::o1:::oselect:::Enil). + +Definition select o0 o1 oselect := + select_base o0 o1 oselect. + +Definition selectl_base o0 o1 oselect := + Eop (Oselectl (Ccomp0 Cne)) + (o0:::o1:::oselect:::Enil). + +Definition selectl o0 o1 oselect := + selectl_base o0 o1 oselect. + (** ** Constants **) Definition addrsymbol (id: ident) (ofs: ptrofs) := @@ -275,7 +290,25 @@ Nondetfunction or (e1: expr) (e2: expr) := then Eop (Ororimm n2) (t1:::Enil) else Eop Oor (e1:::e2:::Enil) | (Eop Onot (t1:::Enil)), t2 => Eop Oorn (t1:::t2:::Enil) - | t1, (Eop Onot (t2:::Enil)) => Eop Oorn (t2:::t1:::Enil) + | t1, (Eop Onot (t2:::Enil)) => Eop Oorn (t2:::t1:::Enil) + | (Eop Oand ((Eop Oneg ((Eop (Ocmp (Ccompimm Ceq zero0)) + (y0:::Enil)):::Enil)):::v0:::Enil)), + (Eop Oand ((Eop Oneg ((Eop (Ocmp (Ccompimm Cne zero1)) + (y1:::Enil)):::Enil)):::v1:::Enil)) => + if same_expr_pure y0 y1 + && Int.eq zero0 Int.zero + && Int.eq zero1 Int.zero + then select_base v0 v1 y0 + else Eop Oor (e1:::e2:::Enil) + | (Eop Oand ((Eop Oneg ((Eop (Ocmp (Ccompuimm Ceq zero0)) + (y0:::Enil)):::Enil)):::v0:::Enil)), + (Eop Oand ((Eop Oneg ((Eop (Ocmp (Ccompuimm Cne zero1)) + (y1:::Enil)):::Enil)):::v1:::Enil)) => + if same_expr_pure y0 y1 + && Int.eq zero0 Int.zero + && Int.eq zero1 Int.zero + then select_base v0 v1 y0 + else Eop Oor (e1:::e2:::Enil) | _, _ => Eop Oor (e1:::e2:::Enil) end. diff --git a/mppa_k1c/SelectOpproof.v b/mppa_k1c/SelectOpproof.v index d426e4f1..4af5ccfa 100644 --- a/mppa_k1c/SelectOpproof.v +++ b/mppa_k1c/SelectOpproof.v @@ -92,7 +92,7 @@ Let ge := Genv.globalenv prog. Variable sp: val. Variable e: env. Variable m: mem. - + (* Helper lemmas - from SplitLongproof.v *) Ltac UseHelper := decompose [Logic.and] arith_helpers_correct; eauto. @@ -162,7 +162,7 @@ Definition binary_constructor_sound (cstr: expr -> expr -> expr) (sem: val -> va eval_expr ge sp e m le a x -> eval_expr ge sp e m le b y -> exists v, eval_expr ge sp e m le (cstr a b) v /\ Val.lessdef (sem x y) v. - + Theorem eval_addrsymbol: forall le id ofs, exists v, eval_expr ge sp e m le (addrsymbol id ofs) v /\ Val.lessdef (Genv.symbol_address ge id ofs) v. @@ -526,6 +526,15 @@ Proof. discriminate. Qed. +Lemma int_eq_commut: forall x y : int, + (Int.eq x y) = (Int.eq y x). +Proof. + intros. + predSpec Int.eq Int.eq_spec x y; + predSpec Int.eq Int.eq_spec y x; + congruence. +Qed. + Theorem eval_or: binary_constructor_sound or Val.or. Proof. unfold or; red; intros. @@ -553,6 +562,83 @@ Proof. exists (Val.ror v1 (Vint n2)); split. EvalOp. rewrite Val.or_commut. apply ROR; auto. - (*orn*) TrivialExists; simpl; congruence. - (*orn reversed*) rewrite Val.or_commut. TrivialExists; simpl; congruence. + - (* select *) + destruct (same_expr_pure y0 y1) eqn:PURE; simpl; try exact DEFAULT. + predSpec Int.eq Int.eq_spec zero0 Int.zero; simpl; try exact DEFAULT. + predSpec Int.eq Int.eq_spec zero1 Int.zero; simpl; try exact DEFAULT. + TrivialExists. + simpl in *. + unfold eval_select. + f_equal. + inv H6. + inv H7. + inv H9. + inv H11. + unfold same_expr_pure in PURE. + destruct y0; try congruence. + destruct y1; try congruence. + destruct (ident_eq i i0); try congruence. + rewrite <- e0 in *. clear e0. clear PURE. + inv H2. inv H5. + replace v8 with v4 in * by congruence. + rename v4 into vselect. + destruct vselect; simpl; trivial; + destruct v5; simpl; trivial; destruct v9; simpl; trivial; + destruct (Int.eq i1 Int.zero); simpl; trivial. + + rewrite Int.neg_zero. + rewrite Int.and_commut. + rewrite Int.and_mone. + rewrite Int.and_commut. + rewrite Int.and_zero. + rewrite Int.or_zero. + reflexivity. + + rewrite Int.neg_zero. + rewrite Int.and_commut. + rewrite Int.and_zero. + rewrite Int.and_commut. + rewrite Int.and_mone. + rewrite Int.or_commut. + rewrite Int.or_zero. + reflexivity. + - (* select unsigned *) + destruct (same_expr_pure y0 y1) eqn:PURE; simpl; try exact DEFAULT. + predSpec Int.eq Int.eq_spec zero0 Int.zero; simpl; try exact DEFAULT. + predSpec Int.eq Int.eq_spec zero1 Int.zero; simpl; try exact DEFAULT. + TrivialExists. + simpl in *. + unfold eval_select. + f_equal. + inv H6. + inv H7. + inv H9. + inv H11. + unfold same_expr_pure in PURE. + destruct y0; try congruence. + destruct y1; try congruence. + destruct (ident_eq i i0); try congruence. + rewrite <- e0 in *. clear e0. clear PURE. + inv H2. inv H5. + replace v8 with v4 in * by congruence. + rename v4 into vselect. + destruct vselect; simpl; trivial; + destruct v5; simpl; trivial; + destruct v9; simpl; trivial; + destruct (Int.eq i1 Int.zero); simpl; trivial. + + rewrite Int.neg_zero. + rewrite Int.and_commut. + rewrite Int.and_mone. + rewrite Int.and_commut. + rewrite Int.and_zero. + rewrite Int.or_zero. + reflexivity. + + rewrite Int.neg_zero. + rewrite Int.and_commut. + rewrite Int.and_zero. + rewrite Int.and_commut. + rewrite Int.and_mone. + rewrite Int.or_commut. + rewrite Int.or_zero. + reflexivity. - apply DEFAULT. Qed. diff --git a/mppa_k1c/TargetPrinter.ml b/mppa_k1c/TargetPrinter.ml index ef02c25a..6f292460 100644 --- a/mppa_k1c/TargetPrinter.ml +++ b/mppa_k1c/TargetPrinter.ml @@ -529,6 +529,10 @@ module Target (*: TARGET*) = | Pmaddil (rd, rs, imm) -> fprintf oc " maddd %a = %a, %a\n" ireg rd ireg rs coqint64 imm + | Pcmove (bt, rd, rcond, rs) | Pcmoveu (bt, rd, rcond, rs) -> + fprintf oc " cmoved.%a %a? %a = %a\n" + bcond bt ireg rcond ireg rd ireg rs + let get_section_names name = let (text, lit) = match C2C.atom_sections name with diff --git a/mppa_k1c/ValueAOp.v b/mppa_k1c/ValueAOp.v index a54dbd8f..62cfa85e 100644 --- a/mppa_k1c/ValueAOp.v +++ b/mppa_k1c/ValueAOp.v @@ -42,6 +42,44 @@ Definition eval_static_addressing (addr: addressing) (vl: list aval): aval := | _, _ => Vbot end. +Definition eval_static_condition0 (cond : condition0) (v : aval) : abool := + match cond with + | Ccomp0 c => cmp_bool c v (I Int.zero) + | Ccompu0 c => cmpu_bool c v (I Int.zero) + | Ccompl0 c => cmpl_bool c v (L Int64.zero) + | Ccomplu0 c => cmplu_bool c v (L Int64.zero) + end. + +Definition eval_static_select (cond : condition0) (v0 v1 vselect : aval) : aval := + match eval_static_condition0 cond vselect with + | Just b => binop_int (fun x0 x1 => if b then x1 else x0) v0 v1 + | _ => Vtop + end. + +Definition eval_static_selectl (cond : condition0) (v0 v1 vselect : aval) : aval := + match eval_static_condition0 cond vselect with + | Just b => binop_long (fun x0 x1 => if b then x1 else x0) v0 v1 + | _ => Vtop + end. + +Definition eval_static_selectf (v0 v1 vselect : aval) : aval := + match vselect with + | I iselect => + if Int.eq Int.zero iselect + then binop_float (fun x0 x1 => x0) v0 v1 + else binop_float (fun x0 x1 => x1) v0 v1 + | _ => Vtop + end. + +Definition eval_static_selectfs (v0 v1 vselect : aval) : aval := + match vselect with + | I iselect => + if Int.eq Int.zero iselect + then binop_single (fun x0 x1 => x0) v0 v1 + else binop_single (fun x0 x1 => x1) v0 v1 + | _ => Vtop + end. + Definition eval_static_operation (op: operation) (vl: list aval): aval := match op, vl with | Omove, v1::nil => v1 @@ -166,6 +204,10 @@ Definition eval_static_operation (op: operation) (vl: list aval): aval := | Osingleoflong, v1::nil => singleoflong v1 | Osingleoflongu, v1::nil => singleoflongu v1 | Ocmp c, _ => of_optbool (eval_static_condition c vl) + | (Oselect cond), v0::v1::vselect::nil => eval_static_select cond v0 v1 vselect + | (Oselectl cond), v0::v1::vselect::nil => eval_static_selectl cond v0 v1 vselect + | Oselectf, v0::v1::vselect::nil => eval_static_selectf v0 v1 vselect + | Oselectfs, v0::v1::vselect::nil => eval_static_selectfs v0 v1 vselect | _, _ => Vbot end. @@ -191,6 +233,15 @@ Proof. destruct cond; auto with va. Qed. +Theorem eval_static_condition0_sound: + forall cond varg m aarg, + vmatch bc varg aarg -> + cmatch (eval_condition0 cond varg m) (eval_static_condition0 cond aarg). +Proof. + intros until aarg; intro VM. + destruct cond; simpl; eauto with va. +Qed. + Lemma symbol_address_sound: forall id ofs, vmatch bc (Genv.symbol_address ge id ofs) (Ptr (Gl id ofs)). @@ -236,12 +287,44 @@ Theorem eval_static_operation_sound: list_forall2 (vmatch bc) vargs aargs -> vmatch bc vres (eval_static_operation op aargs). Proof. - unfold eval_operation, eval_static_operation; intros; + unfold eval_operation, eval_static_operation, eval_static_select, eval_static_selectl, eval_static_selectf, eval_static_selectfs; intros; destruct op; InvHyps; eauto with va. destruct (propagate_float_constants tt); constructor. destruct (propagate_float_constants tt); constructor. rewrite Ptrofs.add_zero_l; eauto with va. apply of_optbool_sound. eapply eval_static_condition_sound; eauto. + (* select *) + - assert (Hcond : (cmatch (eval_condition0 cond a2 m) (eval_static_condition0 cond b2))) by (apply eval_static_condition0_sound; assumption). + rewrite eval_select_to2. + unfold eval_select2. + inv Hcond; trivial; try constructor. + + apply binop_int_sound; assumption. + + destruct a1; destruct a0; try apply vmatch_ifptr_undef. + apply vmatch_ifptr_i. + + destruct (eval_condition0 cond a2 m); destruct a1; destruct a0; try apply vmatch_ifptr_undef. + apply vmatch_ifptr_i. + (* selectl *) + - assert (Hcond : (cmatch (eval_condition0 cond a2 m) (eval_static_condition0 cond b2))) by (apply eval_static_condition0_sound; assumption). + rewrite eval_selectl_to2. + unfold eval_selectl2. + inv Hcond; trivial; try constructor. + + apply binop_long_sound; assumption. + + destruct a1; destruct a0; try apply vmatch_ifptr_undef. + apply vmatch_ifptr_l. + + destruct (eval_condition0 cond a2 m); destruct a1; destruct a0; try apply vmatch_ifptr_undef. + apply vmatch_ifptr_l. + (* selectf *) + - inv H2; simpl; try constructor. + + destruct (Int.eq _ _); apply binop_float_sound; trivial. + + destruct (Int.eq _ _); destruct a1; destruct a0; eauto; constructor. + + destruct (Int.eq _ _); destruct a1; destruct a0; eauto; constructor. + + destruct (Int.eq _ _); destruct a1; destruct a0; eauto; constructor. + (* selectfs *) + - inv H2; simpl; try constructor. + + destruct (Int.eq _ _); apply binop_single_sound; trivial. + + destruct (Int.eq _ _); destruct a1; destruct a0; eauto; constructor. + + destruct (Int.eq _ _); destruct a1; destruct a0; eauto; constructor. + + destruct (Int.eq _ _); destruct a1; destruct a0; eauto; constructor. Qed. End SOUNDNESS. diff --git a/test/monniaux/bitsliced-aes/bs.c b/test/monniaux/bitsliced-aes/bs.c index 4a9df4aa..929fd24d 100644 --- a/test/monniaux/bitsliced-aes/bs.c +++ b/test/monniaux/bitsliced-aes/bs.c @@ -2,6 +2,12 @@ #include <string.h> #include "bs.h" +#if defined(__K1C__) && defined(__COMPCERT__) +#define TERNARY(x, v0, v1) __builtin_ternary_ulong((x)!=0, (v1), (v0)) +#else +#define TERNARY(x, v0, v1) ((x) ? (v1) : (v0)) +#endif + #if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) ||\ defined(__amd64__) || defined(__amd32__)|| defined(__amd16__) #define bs2le(x) (x) @@ -14,12 +20,6 @@ #error "endianness not supported" #endif -#if 1 -#define TERNARY_XY0(t, x) ((-((t) != 0)) & (x)) -#else -#define TERNARY_XY0(t, x) (((t) != 0) ? (x) : (0)) -#endif - void bs_addroundkey(word_t * B, word_t * rk) { int i; @@ -393,15 +393,23 @@ void bs_transpose_dst(word_t * transpose, word_t * blocks) int offset = i << MUL_SHIFT; #ifndef UNROLL_TRANSPOSE - /* DM experiments */ - /* The normal ternary operator costs us a lot! - from 10145951 to 7995063 */ - int j; + int j; +#ifdef __COMPCERT__ + word_t *transptr = transpose+offset; + word_t bitmask = ONE; + for(j=0; j < WORD_SIZE; j++) + { + word_t old = *transptr; + *(transptr++) = TERNARY(w & bitmask, old, old|bitpos); + bitmask <<= 1; + } +#else for(j=0; j < WORD_SIZE; j++) { // TODO make const time - transpose[offset + j] |= TERNARY_XY0(w & (ONE << j), bitpos); + transpose[offset + j] |= (w & (ONE << j)) ? bitpos : 0; } +#endif #else transpose[(offset)+ 0 ] |= (w & (ONE << 0 )) ? (bitpos) : 0; @@ -494,11 +502,23 @@ void bs_transpose_rev(word_t * blocks) word_t offset = k / WORD_SIZE; #ifndef UNROLL_TRANSPOSE int j; +#ifdef __COMPCERT__ + word_t *transptr = transpose + offset; + word_t bitmask = ONE; for(j=0; j < WORD_SIZE; j++) { - word_t bit = TERNARY_XY0((w & (ONE << j)), (ONE << (k % WORD_SIZE))); + word_t old = *transptr; + *transptr = TERNARY(w & bitmask, old, old | bitpos); + transptr += WORDS_PER_BLOCK; + bitmask <<= 1; + } +#else + for(j=0; j < WORD_SIZE; j++) + { + word_t bit = (w & (ONE << j)) ? (ONE << (k % WORD_SIZE)) : 0; transpose[j * WORDS_PER_BLOCK + (offset)] |= bit; } +#endif #else transpose[0 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 0 )) ? bitpos : 0; transpose[1 * WORDS_PER_BLOCK + (offset )] |= (w & (ONE << 1 )) ? bitpos : 0; diff --git a/test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized b/test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized new file mode 100644 index 00000000..d939f856 --- /dev/null +++ b/test/monniaux/bitsliced-aes/bs.ccomp.k1c.s.optimized @@ -0,0 +1,3268 @@ +# File generated by CompCert 3.5 +# Command line: -O3 -Wall -Wno-c11-extensions -fno-unprototyped -S bs.c -o bs.ccomp.k1c.s + .text + .balign 2 + .globl bs_addroundkey +bs_addroundkey: + addd $r17 = $r12, 0 + addd $r12 = $r12, -16 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + make $r5, 0 +;; +.L100: + sxwd $r6 = $r5 + addw $r5 = $r5, 1 + make $r32, 128 +;; + slld $r2 = $r6, 3 + compw.lt $r32 = $r5, $r32 +;; + addd $r3 = $r0, $r2 + addd $r4 = $r1, $r2 +;; + ld $r7 = 0[$r3] +;; + ld $r9 = 0[$r4] +;; + xord $r6 = $r7, $r9 +;; + sd 0[$r3] = $r6 +;; + cb.wnez $r32? .L100 +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 16 +;; + ret +;; + .type bs_addroundkey, @function + .size bs_addroundkey, . - bs_addroundkey + .text + .balign 2 + .globl bs_apply_sbox +bs_apply_sbox: + addd $r17 = $r12, 0 + addd $r12 = $r12, -32 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 +;; + sd 24[$r12] = $r19 + make $r19, 0 +;; +.L101: + sxwd $r1 = $r19 +;; + slld $r0 = $r1, 3 +;; + addd $r0 = $r18, $r0 + call bs_sbox +;; + addw $r19 = $r19, 8 + make $r32, 128 +;; + compw.lt $r32 = $r19, $r32 +;; + cb.wnez $r32? .L101 +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 32 +;; + ret +;; + .type bs_apply_sbox, @function + .size bs_apply_sbox, . - bs_apply_sbox + .text + .balign 2 + .globl bs_apply_sbox_rev +bs_apply_sbox_rev: + addd $r17 = $r12, 0 + addd $r12 = $r12, -32 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 +;; + sd 24[$r12] = $r19 + make $r19, 0 +;; +.L102: + sxwd $r1 = $r19 +;; + slld $r0 = $r1, 3 +;; + addd $r0 = $r18, $r0 + call bs_sbox_rev +;; + addw $r19 = $r19, 8 + make $r32, 128 +;; + compw.lt $r32 = $r19, $r32 +;; + cb.wnez $r32? .L102 +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 32 +;; + ret +;; + .type bs_apply_sbox_rev, @function + .size bs_apply_sbox_rev, . - bs_apply_sbox_rev + .text + .balign 2 + .globl bs_sbox_rev +bs_sbox_rev: + addd $r17 = $r12, 0 + addd $r12 = $r12, -96 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 +;; + sd 24[$r12] = $r19 +;; + ld $r7 = 48[$r0] +;; + ld $r3 = 56[$r0] +;; + ld $r4 = 32[$r0] + nxord $r40 = $r3, $r7 +;; + xord $r10 = $r3, $r4 + nxord $r11 = $r7, $r4 + ld $r2 = 8[$r0] +;; + ld $r5 = 24[$r0] + nxord $r45 = $r7, $r10 + xord $r59 = $r7, $r2 +;; + xord $r41 = $r4, $r5 + ld $r1 = 0[$r0] + xord $r60 = $r5, $r45 + andd $r33 = $r10, $r45 +;; + nxord $r35 = $r5, $r1 + xord $r63 = $r2, $r1 + nxord $r39 = $r1, $r41 + ld $r6 = 40[$r0] +;; + xord $r46 = $r11, $r63 + xord $r54 = $r40, $r35 + ld $r7 = 16[$r0] + nxord $r57 = $r6, $r5 +;; + xord $r52 = $r40, $r63 + xord $r50 = $r41, $r63 + nxord $r47 = $r6, $r7 + nxord $r38 = $r7, $r2 +;; + nxord $r58 = $r6, $r46 + xord $r19 = $r11, $r47 + xord $r63 = $r59, $r57 + xord $r7 = $r41, $r38 +;; + xord $r44 = $r35, $r59 + xord $r18 = $r3, $r47 + xord $r3 = $r54, $r7 + xord $r55 = $r54, $r38 +;; + nxord $r34 = $r6, $r41 + xord $r2 = $r50, $r63 + andd $r57 = $r52, $r19 + andd $r17 = $r50, $r63 +;; + xord $r36 = $r55, $r57 + andd $r62 = $r46, $r18 + andd $r53 = $r11, $r39 + xord $r6 = $r2, $r17 +;; + andd $r42 = $r44, $r58 + andd $r15 = $r41, $r3 + andd $r2 = $r60, $r7 + andd $r37 = $r40, $r54 +;; + xord $r59 = $r62, $r57 + xord $r51 = $r42, $r17 + xord $r8 = $r2, $r15 + xord $r4 = $r37, $r15 +;; + xord $r5 = $r36, $r33 + xord $r38 = $r59, $r35 + xord $r48 = $r6, $r53 + xord $r47 = $r51, $r4 +;; + xord $r53 = $r5, $r8 + xord $r43 = $r38, $r4 + xord $r56 = $r48, $r8 + xord $r57 = $r47, $r34 +;; + xord $r49 = $r56, $r57 + andd $r48 = $r56, $r53 + xord $r47 = $r53, $r43 + andd $r9 = $r53, $r57 +;; + xord $r36 = $r43, $r48 + xord $r35 = $r57, $r48 + andd $r62 = $r47, $r9 + xord $r17 = $r47, $r48 +;; + andd $r15 = $r35, $r47 + andd $r42 = $r36, $r49 + andd $r47 = $r43, $r56 + xord $r59 = $r49, $r48 +;; + andd $r37 = $r49, $r47 + xord $r5 = $r43, $r15 + xord $r4 = $r62, $r17 + xord $r55 = $r57, $r42 +;; + xord $r1 = $r37, $r59 + xord $r2 = $r5, $r55 + xord $r47 = $r5, $r4 + andd $r35 = $r4, $r39 +;; + xord $r61 = $r4, $r1 + xord $r33 = $r55, $r1 + andd $r62 = $r1, $r45 + andd $r45 = $r55, $r18 +;; + xord $r48 = $r2, $r61 + andd $r49 = $r2, $r3 + andd $r6 = $r1, $r10 + andd $r3 = $r47, $r50 +;; + andd $r56 = $r47, $r63 + andd $r42 = $r5, $r58 + andd $r1 = $r4, $r11 + andd $r57 = $r2, $r41 +;; + andd $r9 = $r61, $r54 + andd $r51 = $r33, $r52 + andd $r58 = $r55, $r46 + andd $r53 = $r5, $r44 +;; + andd $r41 = $r48, $r60 + andd $r10 = $r61, $r40 + xord $r59 = $r49, $r57 + xord $r61 = $r3, $r1 +;; + andd $r34 = $r33, $r19 + andd $r39 = $r48, $r7 + xord $r55 = $r9, $r41 + xord $r60 = $r45, $r6 +;; + xord $r48 = $r62, $r35 + xord $r15 = $r56, $r53 + xord $r44 = $r59, $r61 + xord $r49 = $r51, $r10 +;; + xord $r54 = $r34, $r42 + xord $r51 = $r58, $r60 + xord $r59 = $r59, $r48 + xord $r8 = $r56, $r42 +;; + xord $r47 = $r9, $r1 + xord $r11 = $r60, $r15 + xord $r40 = $r55, $r44 + xord $r60 = $r15, $r51 +;; + xord $r52 = $r56, $r41 + xord $r56 = $r10, $r54 + xord $r2 = $r49, $r59 + xord $r5 = $r59, $r60 +;; + xord $r7 = $r3, $r55 + xord $r61 = $r51, $r56 + xord $r59 = $r47, $r11 + xord $r47 = $r8, $r40 +;; + xord $r63 = $r35, $r39 + xord $r4 = $r34, $r45 + sd 88[$r12] = $r47 + xord $r51 = $r2, $r59 +;; + xord $r10 = $r55, $r48 + xord $r50 = $r44, $r63 + sd 80[$r12] = $r51 + xord $r37 = $r7, $r5 +;; + xord $r53 = $r54, $r44 + sd 72[$r12] = $r37 + xord $r37 = $r4, $r40 + xord $r40 = $r50, $r61 +;; + xord $r1 = $r58, $r57 + sd 64[$r12] = $r37 + xord $r46 = $r10, $r53 + xord $r7 = $r52, $r50 +;; + sd 56[$r12] = $r40 + xord $r49 = $r49, $r1 + addd $r1 = $r12, 32 + make $r2, 64 +;; + sd 48[$r12] = $r46 +;; + sd 40[$r12] = $r7 +;; + sd 32[$r12] = $r49 + call memmove +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 96 +;; + ret +;; + .type bs_sbox_rev, @function + .size bs_sbox_rev, . - bs_sbox_rev + .text + .balign 2 + .globl bs_sbox +bs_sbox: + addd $r17 = $r12, 0 + addd $r12 = $r12, -80 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + ld $r5 = 56[$r0] +;; + ld $r6 = 32[$r0] +;; + xord $r41 = $r5, $r6 + ld $r2 = 16[$r0] +;; + xord $r42 = $r5, $r2 + ld $r4 = 8[$r0] + xord $r49 = $r6, $r2 +;; + xord $r48 = $r5, $r4 + ld $r55 = 24[$r0] +;; + xord $r9 = $r55, $r4 + ld $r3 = 48[$r0] +;; + xord $r5 = $r41, $r9 + ld $r7 = 40[$r0] + xord $r34 = $r3, $r2 +;; + xord $r10 = $r3, $r7 + ld $r1 = 0[$r0] + xord $r11 = $r7, $r2 + xord $r3 = $r48, $r49 +;; + xord $r33 = $r9, $r34 + xord $r8 = $r9, $r11 + xord $r44 = $r6, $r1 + xord $r47 = $r4, $r1 +;; + xord $r59 = $r1, $r10 + xord $r7 = $r5, $r34 + xord $r9 = $r10, $r44 + xord $r4 = $r10, $r47 +;; + xord $r61 = $r1, $r5 + xord $r57 = $r5, $r10 + andd $r50 = $r3, $r5 + andd $r43 = $r9, $r1 +;; + xord $r34 = $r59, $r8 + xord $r6 = $r41, $r9 + xord $r35 = $r42, $r4 + xord $r36 = $r48, $r8 +;; + xord $r38 = $r41, $r11 + xord $r40 = $r7, $r50 + xord $r11 = $r43, $r50 + andd $r50 = $r48, $r8 +;; + xord $r15 = $r42, $r57 + andd $r62 = $r35, $r61 + andd $r37 = $r4, $r59 + xord $r52 = $r36, $r50 +;; + andd $r53 = $r6, $r34 + andd $r55 = $r41, $r33 + andd $r46 = $r49, $r38 + andd $r54 = $r42, $r57 +;; + xord $r39 = $r53, $r50 + xord $r60 = $r46, $r55 + xord $r55 = $r54, $r55 + xord $r10 = $r40, $r62 +;; + xord $r44 = $r6, $r34 + xord $r43 = $r11, $r15 + xord $r15 = $r52, $r37 + xord $r17 = $r39, $r55 +;; + xord $r45 = $r10, $r60 + xord $r55 = $r43, $r55 + xord $r50 = $r15, $r60 + xord $r46 = $r17, $r44 +;; + xord $r63 = $r50, $r46 + andd $r43 = $r50, $r45 + xord $r56 = $r45, $r55 + andd $r54 = $r45, $r46 +;; + xord $r36 = $r55, $r43 + xord $r47 = $r46, $r43 + andd $r40 = $r56, $r54 + andd $r60 = $r55, $r50 +;; + andd $r2 = $r47, $r56 + andd $r58 = $r36, $r63 + xord $r36 = $r56, $r43 + andd $r15 = $r63, $r60 +;; + xord $r47 = $r63, $r43 + xord $r17 = $r55, $r2 + xord $r50 = $r40, $r36 + xord $r52 = $r46, $r58 +;; + xord $r58 = $r15, $r47 + xord $r51 = $r17, $r52 + xord $r7 = $r17, $r50 + andd $r43 = $r52, $r1 +;; + xord $r53 = $r50, $r58 + xord $r62 = $r52, $r58 + andd $r44 = $r7, $r8 + andd $r8 = $r50, $r59 +;; + xord $r40 = $r51, $r53 + andd $r45 = $r58, $r61 + andd $r54 = $r51, $r33 + andd $r10 = $r58, $r35 +;; + andd $r47 = $r40, $r38 + andd $r46 = $r62, $r3 + andd $r2 = $r51, $r41 + xord $r35 = $r8, $r10 +;; + andd $r5 = $r62, $r5 + andd $r36 = $r17, $r34 + andd $r39 = $r53, $r57 + andd $r56 = $r7, $r48 +;; + andd $r41 = $r40, $r49 + xord $r34 = $r45, $r46 + xord $r51 = $r44, $r2 + xord $r62 = $r54, $r47 +;; + andd $r38 = $r52, $r9 + andd $r37 = $r50, $r4 + andd $r9 = $r17, $r6 + xord $r59 = $r46, $r35 +;; + andd $r61 = $r53, $r42 + xord $r1 = $r2, $r41 + xord $r63 = $r5, $r43 + xord $r33 = $r39, $r56 +;; + xord $r42 = $r41, $r51 + xord $r51 = $r5, $r34 + xord $r52 = $r36, $r37 + xord $r49 = $r59, $r62 +;; + xord $r57 = $r47, $r33 + xord $r3 = $r9, $r63 + xord $r11 = $r54, $r2 + xord $r50 = $r10, $r1 +;; + xord $r37 = $r43, $r36 + xord $r6 = $r56, $r52 + xord $r9 = $r51, $r62 + xord $r40 = $r42, $r49 +;; + xord $r36 = $r61, $r33 + xord $r10 = $r42, $r57 + xord $r56 = $r52, $r57 + xord $r57 = $r3, $r11 +;; + xord $r5 = $r35, $r51 + sd 72[$r12] = $r40 + nxord $r43 = $r50, $r9 + nxord $r17 = $r36, $r57 +;; + xord $r39 = $r8, $r1 + xord $r53 = $r38, $r35 + xord $r8 = $r1, $r35 + sd 64[$r12] = $r43 +;; + xord $r7 = $r34, $r37 + xord $r58 = $r3, $r53 + sd 56[$r12] = $r17 + xord $r38 = $r42, $r5 +;; + xord $r48 = $r6, $r63 + sd 48[$r12] = $r38 + xord $r1 = $r8, $r7 + xord $r43 = $r10, $r58 +;; + sd 40[$r12] = $r1 + nxord $r4 = $r39, $r56 + nxord $r34 = $r42, $r48 + addd $r1 = $r12, 16 +;; + sd 32[$r12] = $r43 + make $r2, 64 +;; + sd 24[$r12] = $r4 +;; + sd 16[$r12] = $r34 + call memmove +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 80 +;; + ret +;; + .type bs_sbox, @function + .size bs_sbox, . - bs_sbox + .text + .balign 2 + .globl bs_transpose +bs_transpose: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1056 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 + addd $r0 = $r12, 24 + make $r1, 0 +;; + make $r2, 1024 + call memset +;; + addd $r0 = $r12, 24 + addd $r1 = $r18, 0 + call bs_transpose_dst +;; + addd $r1 = $r12, 24 + make $r2, 1024 + addd $r0 = $r18, 0 + call memmove +;; + ld $r16 = 8[$r12] +;; + ld $r18 = 16[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1056 +;; + ret +;; + .type bs_transpose, @function + .size bs_transpose, . - bs_transpose + .text + .balign 2 + .globl bs_transpose_dst +bs_transpose_dst: + addd $r17 = $r12, 0 + addd $r12 = $r12, -16 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + make $r4, 0 +;; +.L103: + make $r35, 1 + make $r17, 0 +;; + slld $r41 = $r35, $r4 +;; + addw $r9 = $r41, 0 +;; +.L104: + sllw $r10 = $r4, 1 + sllw $r42 = $r17, 6 + make $r6, 0 +;; + addw $r36 = $r10, $r17 +;; + sxwd $r15 = $r36 +;; + slld $r2 = $r15, 3 +;; + addd $r8 = $r1, $r2 +;; + ld $r11 = 0[$r8] +;; +.L105: + addw $r40 = $r42, $r6 + make $r2, 0 + make $r44, 1 + make $r32, 64 +;; + sxwd $r34 = $r40 + sxwd $r39 = $r9 + slld $r37 = $r44, $r6 + addw $r6 = $r6, 1 +;; + ld.xs $r7 = $r34[$r0] + andd $r33 = $r11, $r37 + compw.lt $r32 = $r6, $r32 +;; + cmoved.dnez $r33? $r2 = $r39 +;; + ord $r38 = $r7, $r2 +;; + sd.xs $r34[$r0] = $r38 + cb.wnez $r32? .L105 +;; + addw $r17 = $r17, 1 + make $r32, 2 +;; + compw.lt $r32 = $r17, $r32 +;; + cb.wnez $r32? .L104 +;; + addw $r4 = $r4, 1 + make $r32, 64 +;; + compw.lt $r32 = $r4, $r32 +;; + cb.wnez $r32? .L103 +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 16 +;; + ret +;; + .type bs_transpose_dst, @function + .size bs_transpose_dst, . - bs_transpose_dst + .text + .balign 2 + .globl bs_transpose_rev +bs_transpose_rev: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1056 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 + addd $r0 = $r12, 24 + make $r1, 0 +;; + make $r2, 1024 + call memset +;; + make $r3, 0 +;; +.L106: + sxwd $r8 = $r3 + sraw $r32 = $r3, 31 + make $r11, 0 +;; + slld $r34 = $r8, 3 + srlw $r32 = $r32, 26 +;; + addd $r6 = $r18, $r34 + addw $r32 = $r3, $r32 +;; + sraw $r2 = $r32, 6 +;; + sxwd $r5 = $r2 +;; + ld $r36 = 0[$r6] +;; +.L107: + make $r39, 1 +;; + slld $r38 = $r39, $r11 +;; + andd $r17 = $r36, $r38 +;; + cb.deqz $r17? .L108 +;; + make $r44, 1 + sraw $r32 = $r3, 31 +;; + srlw $r32 = $r32, 26 +;; + addw $r32 = $r3, $r32 +;; + sraw $r40 = $r32, 6 +;; + sllw $r9 = $r40, 6 +;; + sbfw $r45 = $r9, $r3 +;; + slld $r0 = $r44, $r45 + goto .L109 +;; +.L108: + make $r0, 0 +;; +.L109: + addd $r37 = $r12, 24 + sllw $r46 = $r11, 1 + addw $r11 = $r11, 1 + make $r32, 64 +;; + sxwd $r7 = $r46 + compw.lt $r32 = $r11, $r32 +;; + addd $r4 = $r7, $r5 +;; + slld $r10 = $r4, 3 +;; + addd $r1 = $r37, $r10 +;; + ld $r41 = 0[$r1] +;; + ord $r35 = $r41, $r0 +;; + sd 0[$r1] = $r35 +;; + cb.wnez $r32? .L107 +;; + addw $r3 = $r3, 1 + make $r32, 128 +;; + compw.lt $r32 = $r3, $r32 +;; + cb.wnez $r32? .L106 +;; + addd $r1 = $r12, 24 + make $r2, 1024 + addd $r0 = $r18, 0 + call memmove +;; + ld $r16 = 8[$r12] +;; + ld $r18 = 16[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1056 +;; + ret +;; + .type bs_transpose_rev, @function + .size bs_transpose_rev, . - bs_transpose_rev + .text + .balign 2 + .globl bs_shiftrows +bs_shiftrows: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1040 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + addd $r50 = $r12, 16 + addd $r1 = $r0, 0 + addd $r43 = $r0, 256 + addd $r8 = $r0, 512 +;; + addd $r60 = $r0, 768 + make $r15, 0 + make $r52, 32 + make $r3, 64 +;; + make $r36, 96 + make $r7, 0 +;; +.L110: + ld $r5 = 0[$r1] + addw $r59 = $r52, 40 + addw $r7 = $r7, 1 + make $r32, 4 +;; + sd 0[$r50] = $r5 + andw $r52 = $r59, 127 + addw $r63 = $r36, 40 + compw.lt $r32 = $r7, $r32 +;; + andw $r36 = $r63, 127 + sxwd $r62 = $r52 +;; + slld $r53 = $r62, 3 +;; + ld $r11 = 8[$r1] +;; + sd 8[$r50] = $r11 +;; + ld $r61 = 16[$r1] +;; + sd 16[$r50] = $r61 +;; + ld $r6 = 24[$r1] +;; + sd 24[$r50] = $r6 +;; + ld $r56 = 32[$r1] +;; + sd 32[$r50] = $r56 +;; + ld $r2 = 40[$r1] +;; + sd 40[$r50] = $r2 + addw $r2 = $r3, 40 +;; + andw $r3 = $r2, 127 + sxwd $r2 = $r36 +;; + sxwd $r5 = $r3 + slld $r39 = $r2, 3 +;; + ld $r38 = 48[$r1] + slld $r46 = $r5, 3 +;; + sd 48[$r50] = $r38 +;; + ld $r54 = 56[$r1] +;; + sd 56[$r50] = $r54 +;; + ld $r4 = 0[$r43] +;; + sd 256[$r50] = $r4 +;; + ld $r58 = 8[$r43] +;; + sd 264[$r50] = $r58 +;; + ld $r10 = 16[$r43] +;; + sd 272[$r50] = $r10 +;; + ld $r34 = 24[$r43] +;; + sd 280[$r50] = $r34 +;; + ld $r51 = 32[$r43] +;; + sd 288[$r50] = $r51 +;; + ld $r9 = 40[$r43] +;; + sd 296[$r50] = $r9 +;; + ld $r1 = 48[$r43] +;; + sd 304[$r50] = $r1 +;; + ld $r4 = 56[$r43] + addd $r43 = $r0, $r53 +;; + sd 312[$r50] = $r4 +;; + ld $r41 = 0[$r8] +;; + sd 512[$r50] = $r41 +;; + ld $r9 = 8[$r8] +;; + sd 520[$r50] = $r9 +;; + ld $r6 = 16[$r8] +;; + sd 528[$r50] = $r6 +;; + ld $r9 = 24[$r8] +;; + sd 536[$r50] = $r9 +;; + ld $r42 = 32[$r8] +;; + sd 544[$r50] = $r42 +;; + ld $r35 = 40[$r8] +;; + sd 552[$r50] = $r35 +;; + ld $r10 = 48[$r8] +;; + sd 560[$r50] = $r10 +;; + ld $r57 = 56[$r8] +;; + sd 568[$r50] = $r57 +;; + ld $r17 = 0[$r60] +;; + sd 768[$r50] = $r17 +;; + ld $r8 = 8[$r60] +;; + sd 776[$r50] = $r8 + addw $r8 = $r15, 40 +;; + andw $r15 = $r8, 127 + addd $r8 = $r0, $r46 +;; + sxwd $r37 = $r15 +;; + ld $r48 = 16[$r60] + slld $r40 = $r37, 3 +;; + sd 784[$r50] = $r48 + addd $r1 = $r0, $r40 +;; + ld $r33 = 24[$r60] +;; + sd 792[$r50] = $r33 +;; + ld $r47 = 32[$r60] +;; + sd 800[$r50] = $r47 +;; + ld $r4 = 40[$r60] +;; + sd 808[$r50] = $r4 +;; + ld $r44 = 48[$r60] +;; + sd 816[$r50] = $r44 +;; + ld $r49 = 56[$r60] + addd $r60 = $r0, $r39 +;; + sd 824[$r50] = $r49 + addd $r50 = $r50, 64 + cb.wnez $r32? .L110 +;; + addd $r1 = $r12, 16 + make $r2, 1024 + call memmove +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1040 +;; + ret +;; + .type bs_shiftrows, @function + .size bs_shiftrows, . - bs_shiftrows + .text + .balign 2 + .globl bs_shiftrows_rev +bs_shiftrows_rev: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1040 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + addd $r56 = $r12, 16 + addd $r34 = $r12, 16 + addd $r45 = $r12, 272 + addd $r6 = $r12, 528 +;; + addd $r62 = $r12, 784 + make $r4, 0 + make $r10, 32 + make $r55, 64 +;; + make $r2, 96 + make $r59, 0 +;; +.L111: + ld $r43 = 0[$r0] + addw $r9 = $r4, 40 + addw $r59 = $r59, 1 + make $r32, 4 +;; + sd 0[$r34] = $r43 + andw $r4 = $r9, 127 + addw $r51 = $r10, 40 + compw.lt $r32 = $r59, $r32 +;; + andw $r10 = $r51, 127 + sxwd $r39 = $r4 +;; + slld $r60 = $r39, 3 +;; + ld $r57 = 8[$r0] +;; + sd 8[$r34] = $r57 +;; + ld $r63 = 16[$r0] +;; + sd 16[$r34] = $r63 +;; + ld $r7 = 24[$r0] +;; + sd 24[$r34] = $r7 +;; + ld $r44 = 32[$r0] +;; + sd 32[$r34] = $r44 +;; + ld $r42 = 40[$r0] +;; + sd 40[$r34] = $r42 +;; + ld $r40 = 48[$r0] +;; + sd 48[$r34] = $r40 +;; + ld $r61 = 56[$r0] +;; + sd 56[$r34] = $r61 + addd $r34 = $r56, $r60 +;; + ld $r35 = 256[$r0] +;; + sd 0[$r45] = $r35 +;; + ld $r1 = 264[$r0] +;; + sd 8[$r45] = $r1 + addw $r1 = $r2, 40 +;; + andw $r2 = $r1, 127 +;; + ld $r49 = 272[$r0] +;; + sd 16[$r45] = $r49 +;; + ld $r37 = 280[$r0] +;; + sd 24[$r45] = $r37 +;; + ld $r54 = 288[$r0] +;; + sd 32[$r45] = $r54 +;; + ld $r15 = 296[$r0] +;; + sd 40[$r45] = $r15 +;; + ld $r3 = 304[$r0] +;; + sd 48[$r45] = $r3 +;; + ld $r5 = 312[$r0] +;; + sd 56[$r45] = $r5 + sxwd $r5 = $r2 +;; + slld $r38 = $r5, 3 +;; + ld $r53 = 512[$r0] +;; + sd 0[$r6] = $r53 +;; + ld $r33 = 520[$r0] +;; + sd 8[$r6] = $r33 +;; + ld $r8 = 528[$r0] +;; + sd 16[$r6] = $r8 +;; + ld $r11 = 536[$r0] +;; + sd 24[$r6] = $r11 +;; + ld $r47 = 544[$r0] +;; + sd 32[$r6] = $r47 +;; + ld $r3 = 552[$r0] +;; + sd 40[$r6] = $r3 +;; + ld $r17 = 560[$r0] +;; + sd 48[$r6] = $r17 +;; + ld $r52 = 568[$r0] +;; + sd 56[$r6] = $r52 + sxwd $r6 = $r10 +;; + slld $r1 = $r6, 3 +;; + addd $r45 = $r56, $r1 +;; + ld $r8 = 768[$r0] +;; + sd 0[$r62] = $r8 +;; + ld $r41 = 776[$r0] +;; + sd 8[$r62] = $r41 +;; + ld $r3 = 784[$r0] +;; + sd 16[$r62] = $r3 + addw $r3 = $r55, 40 +;; + andw $r55 = $r3, 127 +;; + sxwd $r7 = $r55 +;; + ld $r36 = 792[$r0] + slld $r58 = $r7, 3 +;; + sd 24[$r62] = $r36 + addd $r6 = $r56, $r58 +;; + ld $r48 = 800[$r0] +;; + sd 32[$r62] = $r48 +;; + ld $r11 = 808[$r0] +;; + sd 40[$r62] = $r11 +;; + ld $r46 = 816[$r0] +;; + sd 48[$r62] = $r46 +;; + ld $r50 = 824[$r0] + addd $r0 = $r0, 64 +;; + sd 56[$r62] = $r50 + addd $r62 = $r56, $r38 + cb.wnez $r32? .L111 +;; + addd $r0 = $r0, -256 + addd $r1 = $r12, 16 + make $r2, 1024 + call memmove +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1040 +;; + ret +;; + .type bs_shiftrows_rev, @function + .size bs_shiftrows_rev, . - bs_shiftrows_rev + .text + .balign 2 + .globl bs_shiftmix +bs_shiftmix: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1088 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r4 = $r0, 256 + addd $r1 = $r0, 512 + addd $r3 = $r0, 768 +;; + sd 24[$r12] = $r19 + addd $r19 = $r12, 64 + make $r18, 0 + addd $r2 = $r0, 0 +;; + sd 32[$r12] = $r20 + make $r20, 64 +;; + sd 40[$r12] = $r21 + make $r21, 96 +;; + sd 48[$r12] = $r22 + make $r22, 32 +;; + sd 56[$r12] = $r23 + make $r23, 0 +;; +.L112: + ld $r46 = 64[$r4] + addw $r23 = $r23, 1 + make $r32, 4 +;; + ld $r8 = 128[$r1] + compw.lt $r32 = $r23, $r32 +;; + ld $r5 = 56[$r2] + xord $r57 = $r46, $r8 +;; + ld $r59 = 120[$r4] +;; + xord $r7 = $r5, $r59 + ld $r17 = 192[$r3] +;; + xord $r5 = $r57, $r17 +;; + xord $r61 = $r5, $r7 +;; + sd 0[$r19] = $r61 +;; + ld $r48 = 0[$r2] +;; + ld $r62 = 64[$r4] +;; + xord $r42 = $r48, $r62 + ld $r60 = 72[$r4] +;; + xord $r5 = $r42, $r60 + ld $r61 = 136[$r1] +;; + xord $r45 = $r5, $r61 + ld $r40 = 200[$r3] +;; + xord $r45 = $r45, $r40 +;; + xord $r5 = $r45, $r7 +;; + sd 8[$r19] = $r5 +;; + ld $r11 = 8[$r2] +;; + ld $r51 = 72[$r4] +;; + xord $r45 = $r11, $r51 + ld $r40 = 80[$r4] +;; + xord $r37 = $r45, $r40 + ld $r39 = 144[$r1] +;; + xord $r6 = $r37, $r39 + ld $r42 = 208[$r3] +;; + xord $r59 = $r6, $r42 +;; + sd 16[$r19] = $r59 +;; + ld $r6 = 16[$r2] +;; + ld $r44 = 80[$r4] +;; + xord $r43 = $r6, $r44 + ld $r9 = 88[$r4] +;; + xord $r52 = $r43, $r9 + ld $r46 = 152[$r1] +;; + xord $r42 = $r52, $r46 + ld $r48 = 216[$r3] +;; + xord $r5 = $r42, $r48 +;; + xord $r55 = $r5, $r7 +;; + sd 24[$r19] = $r55 +;; + ld $r34 = 24[$r2] +;; + ld $r8 = 88[$r4] +;; + xord $r62 = $r34, $r8 + ld $r47 = 96[$r4] +;; + xord $r38 = $r62, $r47 + ld $r50 = 160[$r1] +;; + xord $r34 = $r38, $r50 + ld $r56 = 224[$r3] +;; + xord $r8 = $r34, $r56 +;; + xord $r11 = $r8, $r7 +;; + sd 32[$r19] = $r11 +;; + ld $r5 = 96[$r4] +;; + ld $r53 = 32[$r2] +;; + xord $r44 = $r53, $r5 + ld $r54 = 168[$r1] +;; + ld $r5 = 104[$r4] +;; + xord $r40 = $r44, $r5 +;; + xord $r10 = $r40, $r54 +;; + ld $r5 = 232[$r3] +;; + xord $r39 = $r10, $r5 +;; + sd 40[$r19] = $r39 +;; + ld $r5 = 40[$r2] +;; + ld $r58 = 104[$r4] +;; + xord $r17 = $r5, $r58 + ld $r15 = 112[$r4] +;; + xord $r37 = $r17, $r15 + ld $r5 = 176[$r1] +;; + xord $r57 = $r37, $r5 + ld $r51 = 240[$r3] +;; + xord $r57 = $r57, $r51 +;; + sd 48[$r19] = $r57 +;; + ld $r40 = 48[$r2] +;; + ld $r52 = 112[$r4] +;; + xord $r35 = $r40, $r52 + ld $r5 = 120[$r4] +;; + xord $r5 = $r35, $r5 + ld $r49 = 184[$r1] +;; + xord $r15 = $r5, $r49 + ld $r34 = 248[$r3] +;; + xord $r46 = $r15, $r34 +;; + sd 56[$r19] = $r46 +;; + ld $r33 = 0[$r2] +;; + ld $r36 = 128[$r1] +;; + ld $r48 = 120[$r4] + xord $r42 = $r33, $r36 +;; + ld $r5 = 184[$r1] +;; + xord $r34 = $r48, $r5 + ld $r47 = 192[$r3] +;; + xord $r60 = $r42, $r47 +;; + xord $r60 = $r60, $r34 +;; + sd 64[$r19] = $r60 +;; + ld $r43 = 8[$r2] +;; + ld $r47 = 64[$r4] +;; + xord $r63 = $r43, $r47 + ld $r52 = 128[$r1] +;; + xord $r5 = $r63, $r52 + ld $r7 = 136[$r1] +;; + xord $r60 = $r5, $r7 + ld $r15 = 200[$r3] +;; + xord $r55 = $r60, $r15 +;; + xord $r48 = $r55, $r34 +;; + sd 72[$r19] = $r48 +;; + ld $r56 = 16[$r2] +;; + ld $r5 = 72[$r4] +;; + xord $r7 = $r56, $r5 + ld $r46 = 136[$r1] +;; + xord $r41 = $r7, $r46 + ld $r40 = 144[$r1] +;; + xord $r5 = $r41, $r40 + ld $r47 = 208[$r3] +;; + xord $r5 = $r5, $r47 +;; + sd 80[$r19] = $r5 +;; + ld $r52 = 24[$r2] +;; + ld $r54 = 80[$r4] +;; + xord $r35 = $r52, $r54 + ld $r63 = 144[$r1] +;; + xord $r7 = $r35, $r63 + ld $r8 = 152[$r1] +;; + xord $r33 = $r7, $r8 + ld $r37 = 216[$r3] +;; + xord $r56 = $r33, $r37 +;; + xord $r54 = $r56, $r34 +;; + sd 88[$r19] = $r54 +;; + ld $r9 = 32[$r2] +;; + ld $r6 = 88[$r4] +;; + xord $r44 = $r9, $r6 + ld $r51 = 152[$r1] +;; + xord $r35 = $r44, $r51 + ld $r52 = 160[$r1] +;; + xord $r38 = $r35, $r52 + ld $r9 = 224[$r3] +;; + xord $r62 = $r38, $r9 +;; + xord $r6 = $r62, $r34 +;; + sd 96[$r19] = $r6 +;; + ld $r15 = 40[$r2] +;; + ld $r17 = 96[$r4] +;; + xord $r36 = $r15, $r17 + ld $r5 = 160[$r1] +;; + xord $r50 = $r36, $r5 + ld $r51 = 168[$r1] +;; + xord $r37 = $r50, $r51 + ld $r42 = 232[$r3] +;; + xord $r58 = $r37, $r42 +;; + sd 104[$r19] = $r58 +;; + ld $r56 = 48[$r2] +;; + ld $r41 = 104[$r4] +;; + xord $r11 = $r56, $r41 + ld $r48 = 168[$r1] +;; + xord $r51 = $r11, $r48 + ld $r58 = 176[$r1] +;; + xord $r61 = $r51, $r58 + ld $r5 = 240[$r3] +;; + xord $r61 = $r61, $r5 +;; + sd 112[$r19] = $r61 +;; + ld $r34 = 56[$r2] +;; + ld $r56 = 112[$r4] +;; + xord $r46 = $r34, $r56 + ld $r9 = 176[$r1] +;; + xord $r62 = $r46, $r9 + ld $r33 = 184[$r1] +;; + xord $r46 = $r62, $r33 + ld $r61 = 248[$r3] +;; + xord $r40 = $r46, $r61 +;; + sd 120[$r19] = $r40 +;; + ld $r5 = 184[$r1] +;; + ld $r59 = 248[$r3] +;; + xord $r43 = $r5, $r59 + ld $r55 = 0[$r2] +;; + ld $r5 = 64[$r4] +;; + xord $r42 = $r55, $r5 + ld $r35 = 192[$r3] +;; + xord $r49 = $r42, $r35 +;; + xord $r5 = $r49, $r43 +;; + sd 128[$r19] = $r5 +;; + ld $r57 = 8[$r2] +;; + ld $r5 = 72[$r4] +;; + xord $r44 = $r57, $r5 + ld $r45 = 128[$r1] +;; + xord $r17 = $r44, $r45 + ld $r33 = 192[$r3] +;; + xord $r52 = $r17, $r33 + ld $r39 = 200[$r3] +;; + xord $r35 = $r52, $r39 +;; + xord $r62 = $r35, $r43 +;; + sd 136[$r19] = $r62 +;; + ld $r5 = 16[$r2] +;; + ld $r39 = 80[$r4] +;; + xord $r36 = $r5, $r39 + ld $r41 = 136[$r1] +;; + xord $r6 = $r36, $r41 + ld $r5 = 200[$r3] +;; + xord $r35 = $r6, $r5 + ld $r11 = 208[$r3] +;; + xord $r37 = $r35, $r11 +;; + sd 144[$r19] = $r37 +;; + ld $r5 = 24[$r2] +;; + ld $r63 = 88[$r4] +;; + xord $r33 = $r5, $r63 + ld $r45 = 144[$r1] +;; + xord $r49 = $r33, $r45 + ld $r36 = 208[$r3] +;; + xord $r55 = $r49, $r36 + ld $r8 = 216[$r3] +;; + xord $r41 = $r55, $r8 +;; + xord $r58 = $r41, $r43 +;; + sd 152[$r19] = $r58 +;; + ld $r6 = 32[$r2] +;; + ld $r47 = 96[$r4] +;; + xord $r11 = $r6, $r47 + ld $r61 = 152[$r1] +;; + xord $r44 = $r11, $r61 + ld $r9 = 216[$r3] +;; + xord $r59 = $r44, $r9 + ld $r34 = 224[$r3] +;; + xord $r7 = $r59, $r34 +;; + xord $r17 = $r7, $r43 +;; + sd 160[$r19] = $r17 +;; + ld $r54 = 40[$r2] +;; + ld $r53 = 104[$r4] +;; + xord $r7 = $r54, $r53 + ld $r59 = 160[$r1] +;; + xord $r37 = $r7, $r59 + ld $r41 = 224[$r3] +;; + xord $r10 = $r37, $r41 + ld $r46 = 232[$r3] +;; + xord $r10 = $r10, $r46 +;; + sd 168[$r19] = $r10 +;; + ld $r58 = 48[$r2] +;; + ld $r5 = 112[$r4] +;; + xord $r40 = $r58, $r5 + ld $r38 = 168[$r1] +;; + xord $r57 = $r40, $r38 + ld $r51 = 232[$r3] +;; + xord $r60 = $r57, $r51 + ld $r55 = 240[$r3] +;; + xord $r53 = $r60, $r55 +;; + sd 176[$r19] = $r53 +;; + ld $r45 = 56[$r2] +;; + ld $r41 = 120[$r4] +;; + xord $r5 = $r45, $r41 + ld $r53 = 176[$r1] +;; + xord $r38 = $r5, $r53 + ld $r8 = 240[$r3] +;; + xord $r43 = $r38, $r8 + ld $r63 = 248[$r3] +;; + xord $r6 = $r43, $r63 +;; + sd 184[$r19] = $r6 +;; + ld $r8 = 0[$r2] +;; + ld $r58 = 64[$r4] +;; + ld $r35 = 56[$r2] + xord $r54 = $r8, $r58 +;; + ld $r5 = 248[$r3] +;; + xord $r50 = $r35, $r5 + ld $r51 = 128[$r1] +;; + xord $r11 = $r54, $r51 +;; + xord $r38 = $r11, $r50 +;; + sd 192[$r19] = $r38 +;; + ld $r63 = 8[$r2] +;; + ld $r54 = 0[$r2] +;; + xord $r54 = $r63, $r54 + ld $r36 = 72[$r4] +;; + xord $r5 = $r54, $r36 + ld $r41 = 136[$r1] +;; + xord $r39 = $r5, $r41 + ld $r58 = 192[$r3] +;; + xord $r44 = $r39, $r58 +;; + xord $r33 = $r44, $r50 +;; + sd 200[$r19] = $r33 +;; + ld $r5 = 8[$r2] +;; + ld $r63 = 16[$r2] +;; + xord $r54 = $r63, $r5 + ld $r49 = 80[$r4] + addw $r63 = $r18, 32 +;; + xord $r51 = $r54, $r49 + ld $r5 = 144[$r1] + andw $r18 = $r63, 127 +;; + xord $r43 = $r51, $r5 + ld $r57 = 200[$r3] +;; + xord $r47 = $r43, $r57 +;; + sd 208[$r19] = $r47 + addw $r47 = $r21, 32 +;; + andw $r21 = $r47, 127 +;; + ld $r7 = 24[$r2] +;; + ld $r15 = 16[$r2] +;; + xord $r56 = $r7, $r15 + ld $r48 = 88[$r4] +;; + xord $r10 = $r56, $r48 + ld $r51 = 152[$r1] +;; + xord $r39 = $r10, $r51 + addw $r10 = $r22, 32 +;; + ld $r48 = 208[$r3] + andw $r22 = $r10, 127 +;; + xord $r53 = $r39, $r48 +;; + xord $r37 = $r53, $r50 +;; + sd 216[$r19] = $r37 +;; + ld $r9 = 32[$r2] +;; + ld $r15 = 24[$r2] +;; + xord $r43 = $r9, $r15 + ld $r53 = 96[$r4] + addw $r15 = $r20, 32 +;; + xord $r42 = $r43, $r53 + ld $r17 = 160[$r1] + andw $r20 = $r15, 127 +;; + xord $r55 = $r42, $r17 + ld $r62 = 216[$r3] + sxwd $r8 = $r20 +;; + xord $r60 = $r55, $r62 + slld $r43 = $r8, 3 +;; + xord $r5 = $r60, $r50 + sxwd $r50 = $r18 +;; + sd 224[$r19] = $r5 + slld $r39 = $r50, 3 +;; + ld $r5 = 40[$r2] +;; + ld $r51 = 32[$r2] +;; + xord $r62 = $r5, $r51 + ld $r45 = 168[$r1] +;; + ld $r5 = 104[$r4] +;; + xord $r9 = $r62, $r5 +;; + xord $r17 = $r9, $r45 +;; + ld $r5 = 224[$r3] +;; + xord $r49 = $r17, $r5 +;; + sd 232[$r19] = $r49 +;; + ld $r33 = 48[$r2] +;; + ld $r57 = 40[$r2] +;; + xord $r49 = $r33, $r57 + ld $r55 = 112[$r4] +;; + xord $r59 = $r49, $r55 + ld $r36 = 176[$r1] +;; + xord $r61 = $r59, $r36 + ld $r52 = 232[$r3] +;; + xord $r6 = $r61, $r52 +;; + sd 240[$r19] = $r6 +;; + ld $r49 = 56[$r2] +;; + ld $r45 = 48[$r2] + addd $r2 = $r0, $r39 +;; + xord $r56 = $r49, $r45 + ld $r59 = 120[$r4] +;; + xord $r11 = $r56, $r59 + ld $r38 = 184[$r1] +;; + xord $r4 = $r11, $r38 + ld $r34 = 240[$r3] + sxwd $r38 = $r22 + sxwd $r3 = $r21 +;; + xord $r1 = $r4, $r34 + slld $r10 = $r38, 3 + slld $r36 = $r3, 3 +;; + sd 248[$r19] = $r1 + addd $r19 = $r19, 256 + addd $r4 = $r0, $r10 + addd $r1 = $r0, $r43 +;; + addd $r3 = $r0, $r36 + cb.wnez $r32? .L112 +;; + addd $r1 = $r12, 64 + make $r2, 1024 + call memmove +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r20 = 32[$r12] +;; + ld $r21 = 40[$r12] +;; + ld $r22 = 48[$r12] +;; + ld $r23 = 56[$r12] +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1088 +;; + ret +;; + .type bs_shiftmix, @function + .size bs_shiftmix, . - bs_shiftmix + .text + .balign 2 + .globl bs_mixcolumns +bs_mixcolumns: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1040 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + addd $r46 = $r12, 16 + make $r45, 0 +;; +.L113: + ld $r60 = 64[$r0] + addw $r45 = $r45, 1 + make $r32, 4 +;; + ld $r54 = 128[$r0] + compw.lt $r32 = $r45, $r32 +;; + ld $r44 = 56[$r0] + xord $r49 = $r60, $r54 +;; + ld $r7 = 120[$r0] +;; + xord $r57 = $r44, $r7 + ld $r5 = 192[$r0] +;; + xord $r1 = $r49, $r5 +;; + xord $r40 = $r1, $r57 +;; + sd 0[$r46] = $r40 +;; + ld $r42 = 0[$r0] +;; + xord $r53 = $r42, $r60 + ld $r39 = 72[$r0] +;; + xord $r43 = $r53, $r39 + ld $r55 = 136[$r0] +;; + xord $r6 = $r43, $r55 + ld $r2 = 200[$r0] +;; + xord $r15 = $r6, $r2 +;; + xord $r8 = $r15, $r57 +;; + sd 8[$r46] = $r8 +;; + ld $r35 = 8[$r0] +;; + xord $r59 = $r35, $r39 + ld $r37 = 80[$r0] + xord $r60 = $r35, $r60 +;; + xord $r56 = $r59, $r37 + ld $r6 = 144[$r0] + xord $r59 = $r59, $r54 +;; + xord $r43 = $r56, $r6 + ld $r51 = 208[$r0] + xord $r59 = $r59, $r5 +;; + xord $r11 = $r43, $r51 + xord $r59 = $r59, $r2 +;; + sd 16[$r46] = $r11 +;; + ld $r47 = 16[$r0] +;; + xord $r11 = $r47, $r37 + ld $r52 = 88[$r0] +;; + xord $r48 = $r11, $r52 + ld $r17 = 152[$r0] +;; + xord $r4 = $r48, $r17 + ld $r1 = 216[$r0] +;; + xord $r4 = $r4, $r1 +;; + xord $r3 = $r4, $r57 +;; + sd 24[$r46] = $r3 +;; + ld $r8 = 24[$r0] +;; + xord $r58 = $r8, $r52 + ld $r36 = 96[$r0] +;; + xord $r9 = $r58, $r36 + ld $r50 = 160[$r0] + xord $r58 = $r58, $r6 +;; + xord $r40 = $r9, $r50 + ld $r4 = 224[$r0] +;; + xord $r61 = $r40, $r4 +;; + xord $r48 = $r61, $r57 +;; + sd 32[$r46] = $r48 +;; + ld $r15 = 32[$r0] +;; + xord $r57 = $r15, $r36 + ld $r38 = 104[$r0] +;; + xord $r61 = $r57, $r38 + ld $r3 = 168[$r0] +;; + xord $r9 = $r61, $r3 + ld $r48 = 232[$r0] +;; + xord $r9 = $r9, $r48 +;; + sd 40[$r46] = $r9 +;; + ld $r43 = 40[$r0] +;; + xord $r34 = $r43, $r38 + ld $r33 = 112[$r0] +;; + xord $r40 = $r34, $r33 + ld $r10 = 176[$r0] + xord $r63 = $r44, $r33 +;; + xord $r49 = $r40, $r10 + ld $r41 = 240[$r0] +;; + xord $r62 = $r49, $r41 +;; + sd 48[$r46] = $r62 + xord $r62 = $r42, $r54 +;; + xord $r62 = $r62, $r5 +;; + ld $r9 = 48[$r0] +;; + xord $r56 = $r9, $r33 + ld $r40 = 184[$r0] +;; + xord $r49 = $r56, $r7 + xord $r56 = $r56, $r3 +;; + xord $r61 = $r49, $r40 +;; + ld $r49 = 248[$r0] + addd $r0 = $r0, 256 +;; + xord $r61 = $r61, $r49 +;; + sd 56[$r46] = $r61 + xord $r61 = $r7, $r40 +;; + xord $r62 = $r62, $r61 +;; + sd 64[$r46] = $r62 + xord $r62 = $r60, $r54 +;; + xord $r60 = $r62, $r55 +;; + xord $r60 = $r60, $r2 +;; + xord $r60 = $r60, $r61 +;; + sd 72[$r46] = $r60 + xord $r60 = $r47, $r39 +;; + xord $r60 = $r60, $r55 +;; + xord $r60 = $r60, $r6 +;; + xord $r60 = $r60, $r51 +;; + sd 80[$r46] = $r60 + xord $r60 = $r8, $r37 +;; + xord $r60 = $r60, $r6 +;; + xord $r60 = $r60, $r17 +;; + xord $r60 = $r60, $r1 +;; + xord $r60 = $r60, $r61 +;; + sd 88[$r46] = $r60 + xord $r60 = $r15, $r52 +;; + xord $r60 = $r60, $r17 +;; + xord $r60 = $r60, $r50 +;; + xord $r60 = $r60, $r4 +;; + xord $r60 = $r60, $r61 + xord $r61 = $r53, $r5 +;; + sd 96[$r46] = $r60 + xord $r60 = $r43, $r36 +;; + xord $r60 = $r60, $r50 +;; + xord $r60 = $r60, $r3 +;; + xord $r60 = $r60, $r48 +;; + sd 104[$r46] = $r60 + xord $r60 = $r9, $r38 +;; + xord $r60 = $r60, $r3 +;; + xord $r60 = $r60, $r10 +;; + xord $r60 = $r60, $r41 +;; + sd 112[$r46] = $r60 + xord $r60 = $r63, $r10 +;; + xord $r60 = $r60, $r40 +;; + xord $r60 = $r60, $r49 +;; + sd 120[$r46] = $r60 + xord $r60 = $r40, $r49 +;; + xord $r61 = $r61, $r60 + xord $r63 = $r59, $r60 + xord $r59 = $r11, $r55 +;; + sd 128[$r46] = $r61 + xord $r59 = $r59, $r2 +;; + sd 136[$r46] = $r63 + xord $r11 = $r59, $r51 + xord $r63 = $r58, $r51 +;; + sd 144[$r46] = $r11 + xord $r58 = $r63, $r1 + xord $r11 = $r57, $r17 +;; + xord $r61 = $r58, $r60 + xord $r57 = $r11, $r1 +;; + sd 152[$r46] = $r61 + xord $r57 = $r57, $r4 +;; + xord $r57 = $r57, $r60 +;; + sd 160[$r46] = $r57 + xord $r57 = $r34, $r50 +;; + xord $r57 = $r57, $r4 +;; + xord $r11 = $r57, $r48 + xord $r57 = $r53, $r54 +;; + sd 168[$r46] = $r11 + xord $r11 = $r56, $r48 +;; + xord $r56 = $r11, $r41 + xord $r11 = $r44, $r7 +;; + sd 176[$r46] = $r56 + xord $r34 = $r11, $r10 +;; + xord $r11 = $r34, $r41 +;; + xord $r56 = $r11, $r49 + xord $r49 = $r44, $r49 + xord $r11 = $r35, $r42 + xord $r42 = $r9, $r43 +;; + sd 184[$r46] = $r56 + xord $r53 = $r57, $r49 + xord $r11 = $r11, $r39 + xord $r58 = $r42, $r33 +;; + sd 192[$r46] = $r53 + xord $r61 = $r11, $r55 + xord $r53 = $r47, $r35 +;; + xord $r39 = $r61, $r5 + xord $r62 = $r53, $r37 +;; + xord $r34 = $r39, $r49 + xord $r37 = $r62, $r6 +;; + sd 200[$r46] = $r34 + xord $r57 = $r37, $r2 + xord $r37 = $r8, $r47 + xord $r34 = $r15, $r8 +;; + sd 208[$r46] = $r57 + xord $r35 = $r37, $r52 +;; + xord $r47 = $r35, $r17 +;; + xord $r47 = $r47, $r51 +;; + xord $r54 = $r47, $r49 + xord $r47 = $r34, $r36 +;; + sd 216[$r46] = $r54 + xord $r35 = $r47, $r50 +;; + xord $r39 = $r35, $r1 +;; + xord $r11 = $r39, $r49 + xord $r49 = $r43, $r15 +;; + sd 224[$r46] = $r11 + xord $r53 = $r49, $r38 + xord $r11 = $r58, $r10 + xord $r38 = $r44, $r9 +;; + xord $r6 = $r53, $r3 + xord $r56 = $r11, $r48 + xord $r52 = $r38, $r7 +;; + xord $r15 = $r6, $r4 + xord $r34 = $r52, $r40 +;; + sd 232[$r46] = $r15 + xord $r58 = $r34, $r41 +;; + sd 240[$r46] = $r56 +;; + sd 248[$r46] = $r58 + addd $r46 = $r46, 256 + cb.wnez $r32? .L113 +;; + addd $r0 = $r0, -1024 + addd $r1 = $r46, -1024 + make $r2, 1024 + call memmove +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1040 +;; + ret +;; + .type bs_mixcolumns, @function + .size bs_mixcolumns, . - bs_mixcolumns + .text + .balign 2 + .globl bs_mixcolumns_rev +bs_mixcolumns_rev: + addd $r17 = $r12, 0 + addd $r12 = $r12, -1040 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + addd $r49 = $r12, 16 + make $r50, 0 +;; +.L114: + ld $r47 = 56[$r0] + addw $r50 = $r50, 8 + make $r32, 32 +;; + ld $r11 = 48[$r0] + compw.lt $r32 = $r50, $r32 +;; + xord $r56 = $r47, $r11 + ld $r6 = 40[$r0] +;; + xord $r2 = $r56, $r6 + ld $r15 = 120[$r0] +;; + ld $r33 = 104[$r0] + xord $r1 = $r47, $r15 +;; + xord $r60 = $r15, $r33 + ld $r7 = 176[$r0] +;; + xord $r57 = $r2, $r60 + ld $r48 = 168[$r0] +;; + xord $r63 = $r7, $r48 + ld $r44 = 64[$r0] +;; + xord $r62 = $r57, $r63 + ld $r34 = 128[$r0] +;; + ld $r3 = 232[$r0] + xord $r38 = $r44, $r34 +;; + xord $r9 = $r62, $r3 + ld $r17 = 192[$r0] +;; + ld $r51 = 112[$r0] + xord $r36 = $r38, $r17 +;; + xord $r8 = $r56, $r51 + ld $r10 = 184[$r0] + xord $r40 = $r36, $r9 +;; + xord $r57 = $r10, $r7 + ld $r5 = 240[$r0] + xord $r4 = $r1, $r10 +;; + xord $r54 = $r8, $r57 + ld $r39 = 248[$r0] +;; + xord $r35 = $r54, $r5 + xord $r58 = $r4, $r39 + sd 0[$r49] = $r40 +;; + ld $r8 = 0[$r0] +;; + xord $r36 = $r8, $r44 + ld $r52 = 72[$r0] +;; + xord $r59 = $r36, $r52 + ld $r45 = 136[$r0] +;; + xord $r59 = $r59, $r45 + ld $r4 = 200[$r0] +;; + xord $r59 = $r59, $r4 +;; + xord $r36 = $r59, $r9 +;; + xord $r61 = $r36, $r35 +;; + sd 8[$r49] = $r61 +;; + ld $r43 = 8[$r0] +;; + xord $r53 = $r43, $r8 + ld $r46 = 80[$r0] +;; + xord $r53 = $r53, $r52 + ld $r2 = 144[$r0] +;; + xord $r36 = $r53, $r46 + ld $r42 = 208[$r0] +;; + xord $r59 = $r36, $r2 +;; + xord $r53 = $r59, $r34 +;; + xord $r53 = $r53, $r42 +;; + xord $r37 = $r53, $r35 +;; + xord $r53 = $r37, $r58 +;; + sd 16[$r49] = $r53 +;; + ld $r1 = 16[$r0] +;; + xord $r41 = $r1, $r43 + ld $r38 = 88[$r0] +;; + xord $r36 = $r41, $r8 +;; + xord $r53 = $r36, $r44 +;; + xord $r36 = $r53, $r46 + ld $r41 = 152[$r0] +;; + xord $r36 = $r36, $r38 +;; + xord $r40 = $r36, $r41 +;; + xord $r53 = $r40, $r45 +;; + xord $r36 = $r53, $r34 +;; + ld $r53 = 216[$r0] +;; + xord $r54 = $r36, $r53 +;; + xord $r36 = $r54, $r17 +;; + xord $r36 = $r36, $r9 +;; + xord $r36 = $r36, $r58 +;; + sd 24[$r49] = $r36 +;; + ld $r36 = 24[$r0] +;; + xord $r54 = $r36, $r1 + ld $r40 = 96[$r0] +;; + xord $r61 = $r54, $r43 + ld $r37 = 160[$r0] +;; + xord $r54 = $r61, $r52 +;; + xord $r59 = $r54, $r38 +;; + xord $r59 = $r59, $r40 +;; + xord $r54 = $r59, $r37 +;; + xord $r54 = $r54, $r2 +;; + xord $r55 = $r54, $r45 +;; + ld $r54 = 224[$r0] +;; + xord $r55 = $r55, $r54 +;; + xord $r55 = $r55, $r4 +;; + xord $r55 = $r55, $r9 +;; + xord $r9 = $r55, $r35 +;; + sd 32[$r49] = $r9 +;; + ld $r9 = 32[$r0] + addd $r0 = $r0, 256 +;; + xord $r55 = $r9, $r36 + xord $r61 = $r6, $r9 +;; + xord $r55 = $r55, $r1 +;; + xord $r55 = $r55, $r46 +;; + xord $r55 = $r55, $r40 +;; + xord $r55 = $r55, $r33 +;; + xord $r55 = $r55, $r48 +;; + xord $r55 = $r55, $r41 +;; + xord $r55 = $r55, $r2 +;; + xord $r55 = $r55, $r3 +;; + xord $r55 = $r55, $r42 +;; + xord $r35 = $r55, $r35 +;; + xord $r60 = $r35, $r58 + xord $r35 = $r61, $r36 +;; + sd 40[$r49] = $r60 + xord $r55 = $r35, $r38 +;; + xord $r62 = $r55, $r33 +;; + xord $r59 = $r62, $r51 + xord $r62 = $r8, $r34 +;; + xord $r35 = $r59, $r7 + xord $r59 = $r11, $r6 + xord $r62 = $r62, $r17 +;; + xord $r35 = $r35, $r37 +;; + xord $r35 = $r35, $r41 +;; + xord $r55 = $r35, $r5 + xord $r35 = $r59, $r9 +;; + xord $r63 = $r55, $r53 + xord $r60 = $r35, $r40 +;; + xord $r55 = $r63, $r58 + xord $r35 = $r60, $r51 +;; + sd 48[$r49] = $r55 + xord $r63 = $r35, $r15 + xord $r55 = $r15, $r51 +;; + xord $r35 = $r63, $r10 + xord $r63 = $r1, $r52 +;; + xord $r35 = $r35, $r48 +;; + xord $r35 = $r35, $r37 +;; + xord $r60 = $r35, $r39 + xord $r35 = $r55, $r33 +;; + xord $r60 = $r60, $r54 + xord $r35 = $r6, $r35 +;; + sd 56[$r49] = $r60 + xord $r60 = $r10, $r48 +;; + xord $r61 = $r35, $r60 + xord $r60 = $r5, $r3 + xord $r35 = $r39, $r5 +;; + xord $r60 = $r61, $r60 + xord $r61 = $r11, $r55 +;; + xord $r61 = $r61, $r7 + xord $r62 = $r62, $r60 +;; + xord $r61 = $r61, $r35 + sd 64[$r49] = $r62 + xord $r62 = $r43, $r44 +;; + xord $r62 = $r62, $r45 +;; + xord $r62 = $r62, $r34 +;; + xord $r62 = $r62, $r4 +;; + xord $r62 = $r62, $r60 +;; + xord $r62 = $r62, $r61 +;; + sd 72[$r49] = $r62 + xord $r62 = $r63, $r44 +;; + xord $r62 = $r62, $r2 +;; + xord $r62 = $r62, $r45 +;; + xord $r62 = $r62, $r42 +;; + xord $r62 = $r62, $r17 +;; + xord $r62 = $r62, $r61 +;; + xord $r62 = $r62, $r58 +;; + sd 80[$r49] = $r62 + xord $r62 = $r36, $r8 +;; + xord $r62 = $r62, $r46 +;; + xord $r62 = $r62, $r52 +;; + xord $r62 = $r62, $r44 +;; + xord $r62 = $r62, $r41 +;; + xord $r62 = $r62, $r2 +;; + xord $r62 = $r62, $r34 +;; + xord $r62 = $r62, $r53 +;; + xord $r62 = $r62, $r4 +;; + xord $r62 = $r62, $r17 +;; + xord $r62 = $r62, $r60 +;; + xord $r62 = $r62, $r58 +;; + sd 88[$r49] = $r62 + xord $r62 = $r9, $r43 +;; + xord $r62 = $r62, $r38 +;; + xord $r62 = $r62, $r46 +;; + xord $r62 = $r62, $r52 +;; + xord $r62 = $r62, $r37 +;; + xord $r62 = $r62, $r41 +;; + xord $r62 = $r62, $r45 +;; + xord $r62 = $r62, $r54 +;; + xord $r62 = $r62, $r42 +;; + xord $r62 = $r62, $r4 +;; + xord $r60 = $r62, $r60 +;; + xord $r60 = $r60, $r61 +;; + sd 96[$r49] = $r60 + xord $r60 = $r6, $r1 +;; + xord $r60 = $r60, $r40 +;; + xord $r60 = $r60, $r38 +;; + xord $r60 = $r60, $r46 +;; + xord $r60 = $r60, $r48 +;; + xord $r60 = $r60, $r37 +;; + xord $r60 = $r60, $r2 +;; + xord $r60 = $r60, $r3 +;; + xord $r60 = $r60, $r53 +;; + xord $r60 = $r60, $r42 +;; + xord $r60 = $r60, $r61 +;; + xord $r60 = $r60, $r58 +;; + sd 104[$r49] = $r60 + xord $r60 = $r11, $r36 +;; + xord $r60 = $r60, $r33 +;; + xord $r60 = $r60, $r40 +;; + xord $r60 = $r60, $r38 +;; + xord $r62 = $r60, $r7 +;; + xord $r60 = $r62, $r48 +;; + xord $r60 = $r60, $r41 +;; + xord $r60 = $r60, $r5 +;; + xord $r60 = $r60, $r54 +;; + xord $r60 = $r60, $r53 +;; + xord $r58 = $r60, $r58 +;; + sd 112[$r49] = $r58 + xord $r58 = $r47, $r9 +;; + xord $r58 = $r58, $r51 +;; + xord $r58 = $r58, $r33 +;; + xord $r58 = $r58, $r40 +;; + xord $r58 = $r58, $r10 +;; + xord $r58 = $r58, $r7 +;; + xord $r58 = $r58, $r37 +;; + xord $r58 = $r58, $r39 +;; + xord $r63 = $r58, $r3 +;; + xord $r58 = $r63, $r54 +;; + sd 120[$r49] = $r58 + xord $r58 = $r57, $r48 + xord $r57 = $r51, $r57 +;; + xord $r60 = $r33, $r58 + xord $r58 = $r39, $r3 + xord $r63 = $r57, $r5 +;; + xord $r58 = $r60, $r58 + xord $r63 = $r63, $r56 + xord $r56 = $r15, $r10 +;; + xord $r61 = $r58, $r59 + xord $r56 = $r56, $r39 +;; + xord $r62 = $r56, $r47 + xord $r56 = $r44, $r17 +;; + xord $r56 = $r56, $r8 +;; + xord $r56 = $r56, $r61 +;; + sd 128[$r49] = $r56 + xord $r56 = $r52, $r34 +;; + xord $r56 = $r56, $r4 +;; + xord $r56 = $r56, $r17 +;; + xord $r56 = $r56, $r43 +;; + xord $r56 = $r56, $r61 +;; + xord $r56 = $r56, $r63 +;; + sd 136[$r49] = $r56 + xord $r56 = $r46, $r45 +;; + xord $r56 = $r56, $r34 +;; + xord $r56 = $r56, $r42 +;; + xord $r56 = $r56, $r4 +;; + xord $r56 = $r56, $r1 +;; + xord $r56 = $r56, $r8 +;; + xord $r56 = $r56, $r63 +;; + xord $r56 = $r56, $r62 +;; + sd 144[$r49] = $r56 + xord $r56 = $r38, $r44 +;; + xord $r56 = $r56, $r2 +;; + xord $r56 = $r56, $r45 +;; + xord $r56 = $r56, $r34 +;; + xord $r56 = $r56, $r53 +;; + xord $r56 = $r56, $r42 +;; + xord $r56 = $r56, $r17 +;; + xord $r56 = $r56, $r36 +;; + xord $r59 = $r56, $r43 +;; + xord $r56 = $r59, $r8 +;; + xord $r59 = $r56, $r61 +;; + xord $r56 = $r59, $r62 + xord $r59 = $r33, $r46 +;; + sd 152[$r49] = $r56 + xord $r56 = $r40, $r52 +;; + xord $r56 = $r56, $r41 +;; + xord $r56 = $r56, $r2 +;; + xord $r56 = $r56, $r45 +;; + xord $r56 = $r56, $r54 +;; + xord $r56 = $r56, $r53 +;; + xord $r56 = $r56, $r4 +;; + xord $r56 = $r56, $r9 +;; + xord $r56 = $r56, $r1 +;; + xord $r56 = $r56, $r43 +;; + xord $r56 = $r56, $r61 +;; + xord $r56 = $r56, $r63 +;; + sd 160[$r49] = $r56 + xord $r56 = $r59, $r37 +;; + xord $r56 = $r56, $r41 +;; + xord $r56 = $r56, $r2 +;; + xord $r56 = $r56, $r3 +;; + xord $r56 = $r56, $r54 +;; + xord $r57 = $r56, $r42 +;; + xord $r56 = $r57, $r6 +;; + xord $r56 = $r56, $r36 +;; + xord $r56 = $r56, $r1 +;; + xord $r56 = $r56, $r63 +;; + xord $r56 = $r56, $r62 +;; + sd 168[$r49] = $r56 + xord $r56 = $r51, $r38 +;; + xord $r56 = $r56, $r48 +;; + xord $r56 = $r56, $r37 +;; + xord $r58 = $r56, $r41 +;; + xord $r56 = $r58, $r5 +;; + xord $r56 = $r56, $r3 +;; + xord $r56 = $r56, $r53 +;; + xord $r56 = $r56, $r11 +;; + xord $r58 = $r56, $r9 +;; + xord $r56 = $r58, $r36 +;; + xord $r56 = $r56, $r62 +;; + sd 176[$r49] = $r56 + xord $r56 = $r15, $r40 +;; + xord $r56 = $r56, $r7 +;; + xord $r56 = $r56, $r48 +;; + xord $r56 = $r56, $r37 +;; + xord $r60 = $r56, $r39 + xord $r39 = $r10, $r39 +;; + xord $r56 = $r60, $r5 +;; + xord $r56 = $r56, $r54 +;; + xord $r56 = $r56, $r47 +;; + xord $r56 = $r56, $r6 +;; + xord $r56 = $r56, $r9 +;; + sd 184[$r49] = $r56 + xord $r56 = $r35, $r3 +;; + xord $r57 = $r48, $r56 + xord $r56 = $r47, $r6 +;; + xord $r57 = $r57, $r56 + xord $r56 = $r51, $r33 +;; + xord $r56 = $r57, $r56 + xord $r57 = $r7, $r35 +;; + xord $r35 = $r57, $r11 + xord $r57 = $r39, $r47 +;; + xord $r55 = $r35, $r55 + xord $r35 = $r57, $r15 + xord $r57 = $r34, $r8 + xord $r34 = $r41, $r34 +;; + xord $r39 = $r57, $r44 + xord $r57 = $r45, $r17 + xord $r58 = $r34, $r42 +;; + xord $r39 = $r39, $r56 + xord $r34 = $r58, $r4 +;; + sd 192[$r49] = $r39 + xord $r39 = $r57, $r43 +;; + xord $r39 = $r39, $r8 +;; + xord $r39 = $r39, $r52 +;; + xord $r57 = $r39, $r56 +;; + xord $r39 = $r57, $r55 +;; + sd 200[$r49] = $r39 + xord $r39 = $r2, $r4 +;; + xord $r39 = $r39, $r17 + xord $r17 = $r34, $r17 +;; + xord $r39 = $r39, $r1 +;; + xord $r57 = $r39, $r43 +;; + xord $r39 = $r57, $r46 +;; + xord $r39 = $r39, $r44 +;; + xord $r63 = $r39, $r55 +;; + xord $r57 = $r63, $r35 +;; + sd 208[$r49] = $r57 + xord $r57 = $r17, $r36 +;; + xord $r57 = $r57, $r1 +;; + xord $r57 = $r57, $r8 + xord $r8 = $r48, $r2 +;; + xord $r17 = $r57, $r38 + xord $r57 = $r37, $r45 +;; + xord $r59 = $r17, $r52 + xord $r62 = $r57, $r53 +;; + xord $r34 = $r59, $r44 + xord $r39 = $r62, $r42 +;; + xord $r44 = $r34, $r56 +;; + xord $r44 = $r44, $r35 +;; + sd 216[$r49] = $r44 + xord $r44 = $r39, $r4 +;; + xord $r4 = $r44, $r9 +;; + xord $r4 = $r4, $r36 +;; + xord $r57 = $r4, $r43 + xord $r43 = $r8, $r54 +;; + xord $r59 = $r57, $r40 + xord $r48 = $r43, $r53 +;; + xord $r34 = $r59, $r46 +;; + xord $r52 = $r34, $r52 +;; + xord $r52 = $r52, $r56 + xord $r56 = $r10, $r37 +;; + xord $r39 = $r52, $r55 + xord $r52 = $r48, $r42 +;; + sd 224[$r49] = $r39 + xord $r42 = $r52, $r6 +;; + xord $r17 = $r42, $r9 +;; + xord $r39 = $r17, $r1 +;; + xord $r48 = $r39, $r33 +;; + xord $r42 = $r48, $r38 +;; + xord $r17 = $r42, $r46 +;; + xord $r52 = $r17, $r55 + xord $r55 = $r7, $r41 +;; + xord $r17 = $r52, $r35 + xord $r34 = $r55, $r3 +;; + sd 232[$r49] = $r17 + xord $r44 = $r34, $r54 +;; + xord $r39 = $r44, $r53 +;; + xord $r60 = $r39, $r11 +;; + xord $r61 = $r60, $r6 +;; + xord $r8 = $r61, $r36 +;; + xord $r4 = $r8, $r51 +;; + xord $r39 = $r4, $r40 +;; + xord $r2 = $r39, $r38 +;; + xord $r2 = $r2, $r35 + xord $r35 = $r56, $r5 +;; + sd 240[$r49] = $r2 + xord $r62 = $r35, $r3 +;; + xord $r38 = $r62, $r54 +;; + xord $r48 = $r38, $r47 +;; + xord $r38 = $r48, $r11 +;; + xord $r11 = $r38, $r9 +;; + xord $r1 = $r11, $r15 +;; + xord $r55 = $r1, $r33 +;; + xord $r5 = $r55, $r40 +;; + sd 248[$r49] = $r5 + addd $r49 = $r49, 256 + cb.wnez $r32? .L114 +;; + addd $r0 = $r0, -1024 + addd $r1 = $r49, -1024 + make $r2, 1024 + call memmove +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 1040 +;; + ret +;; + .type bs_mixcolumns_rev, @function + .size bs_mixcolumns_rev, . - bs_mixcolumns_rev + .text + .balign 2 + .globl bs_expand_key +bs_expand_key: + addd $r17 = $r12, 0 + addd $r12 = $r12, -224 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 + addd $r0 = $r12, 40 + make $r2, 16 +;; + sd 24[$r12] = $r19 +;; + sd 32[$r12] = $r20 + call memmove +;; + addd $r0 = $r12, 40 + call expand_key +;; + make $r20, 0 + make $r19, 0 +;; +.L115: + sxwd $r1 = $r20 + addd $r11 = $r12, 40 + sxwd $r15 = $r19 + make $r2, 16 +;; + slld $r34 = $r1, 10 + addd $r1 = $r11, $r15 +;; + addd $r0 = $r18, $r34 + call memmove +;; + make $r1, 2 +;; +.L116: + make $r35, 0 +;; +.L117: + addw $r38 = $r1, $r35 + sxwd $r37 = $r35 + addw $r35 = $r35, 1 + make $r32, 2 +;; + sxwd $r6 = $r20 + sxwd $r33 = $r38 + slld $r10 = $r37, 3 + compw.lt $r32 = $r35, $r32 +;; + slld $r3 = $r6, 10 + slld $r9 = $r33, 3 +;; + addd $r0 = $r18, $r3 +;; + addd $r39 = $r0, $r9 + addd $r8 = $r0, $r10 +;; + ld $r17 = 0[$r8] +;; + sd 0[$r39] = $r17 +;; + cb.wnez $r32? .L117 +;; + addw $r1 = $r1, 2 + make $r32, 128 +;; + compw.lt $r32 = $r1, $r32 +;; + cb.wnez $r32? .L116 +;; + call bs_transpose +;; + addw $r20 = $r20, 1 + addw $r19 = $r19, 16 + make $r32, 176 +;; + compw.lt $r32 = $r19, $r32 +;; + cb.wnez $r32? .L115 +;; + ld $r16 = 8[$r12] +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r20 = 32[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 224 +;; + ret +;; + .type bs_expand_key, @function + .size bs_expand_key, . - bs_expand_key + .text + .balign 2 + .globl bs_cipher +bs_cipher: + addd $r17 = $r12, 0 + addd $r12 = $r12, -48 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 +;; + sd 24[$r12] = $r19 + addd $r19 = $r1, 0 + addd $r0 = $r18, 0 +;; + sd 32[$r12] = $r20 + call bs_transpose +;; + addd $r1 = $r19, 0 + addd $r0 = $r18, 0 + call bs_addroundkey +;; + make $r20, 1 +;; +.L118: + addd $r0 = $r18, 0 + call bs_apply_sbox +;; + addd $r0 = $r18, 0 + call bs_shiftmix +;; + sxwd $r6 = $r20 + addd $r0 = $r18, 0 +;; + slld $r4 = $r6, 10 +;; + addd $r1 = $r19, $r4 + call bs_addroundkey +;; + addw $r20 = $r20, 1 + make $r32, 10 +;; + compw.lt $r32 = $r20, $r32 +;; + cb.wnez $r32? .L118 +;; + addd $r0 = $r18, 0 + call bs_apply_sbox +;; + addd $r0 = $r18, 0 + call bs_shiftrows +;; + addd $r1 = $r19, 10240 + addd $r0 = $r18, 0 + call bs_addroundkey +;; + addd $r0 = $r18, 0 +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r20 = 32[$r12] +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 48 +;; + goto bs_transpose_rev +;; + .type bs_cipher, @function + .size bs_cipher, . - bs_cipher + .text + .balign 2 + .globl bs_cipher_rev +bs_cipher_rev: + addd $r17 = $r12, 0 + addd $r12 = $r12, -48 +;; + sd 0[$r12] = $r17 +;; +;; + get $r16 = $ra +;; + sd 8[$r12] = $r16 +;; + sd 16[$r12] = $r18 + addd $r18 = $r0, 0 +;; + sd 24[$r12] = $r19 + addd $r19 = $r1, 0 + addd $r0 = $r18, 0 +;; + sd 32[$r12] = $r20 + call bs_transpose +;; + addd $r1 = $r19, 10240 + addd $r0 = $r18, 0 + call bs_addroundkey +;; + make $r20, 9 +;; +.L119: + addd $r0 = $r18, 0 + call bs_shiftrows_rev +;; + addd $r0 = $r18, 0 + call bs_apply_sbox_rev +;; + sxwd $r5 = $r20 + addd $r0 = $r18, 0 +;; + slld $r8 = $r5, 10 +;; + addd $r1 = $r19, $r8 + call bs_addroundkey +;; + addd $r0 = $r18, 0 + call bs_mixcolumns_rev +;; + addw $r20 = $r20, -1 +;; + cb.wgtz $r20? .L119 +;; + addd $r0 = $r18, 0 + call bs_shiftrows_rev +;; + addd $r0 = $r18, 0 + call bs_apply_sbox_rev +;; + addd $r1 = $r19, 0 + addd $r0 = $r18, 0 + call bs_addroundkey +;; + addd $r0 = $r18, 0 +;; + ld $r18 = 16[$r12] +;; + ld $r19 = 24[$r12] +;; + ld $r20 = 32[$r12] +;; + ld $r16 = 8[$r12] +;; + set $ra = $r16 +;; + addd $r12 = $r12, 48 +;; + goto bs_transpose_rev +;; + .type bs_cipher_rev, @function + .size bs_cipher_rev, . - bs_cipher_rev diff --git a/test/monniaux/bitsliced-aes/notes.org b/test/monniaux/bitsliced-aes/notes.org new file mode 100644 index 00000000..6c2e27fa --- /dev/null +++ b/test/monniaux/bitsliced-aes/notes.org @@ -0,0 +1,59 @@ +* bs_transpose_dst only +** original +==> test.ccomp.host.out <== +cycles: 3080223 + +==> test.ccomp.k1c.out <== +cycles: 10145951 + +==> test.gcc.host.out <== +cycles: 1485887 + +==> test.gcc.k1c.out <== +cycles: 4078535 + +** neg and +==> test.ccomp.host.out <== +cycles: 2905049 + +==> test.ccomp.k1c.out <== +cycles: 7995063 + +==> test.gcc.host.out <== +cycles: 1858263 + +==> test.gcc.k1c.out <== +cycles: 5255763 + +** cmove mais mauvais scheduling de registres +==> test.ccomp.host.out <== +cycles: 4363682 + +==> test.ccomp.k1c.out <== +cycles: 7208629 + +==> test.gcc.host.out <== +cycles: 2916854 + +==> test.gcc.k1c.out <== +cycles: 5646730 + +** cmove via match du and +==> test.ccomp.host.out <== +cycles: 2553732 + +==> test.ccomp.k1c.out <== +cycles: 7208629 + +==> test.gcc.host.out <== +cycles: 1849125 + +==> test.gcc.k1c.out <== +cycles: 5255763 + +** hand optimized loads +cycles: 6027072 + +* both bs_transpose_dst and bs_transpose_rev +** with both cmove +6890902 diff --git a/test/monniaux/bitsliced-tea/bstea.c b/test/monniaux/bitsliced-tea/bstea.c index c20c169c..43e29d45 100644 --- a/test/monniaux/bitsliced-tea/bstea.c +++ b/test/monniaux/bitsliced-tea/bstea.c @@ -86,7 +86,7 @@ void encrypt(parallel_blocks_t v, const parallel_keys_t k, unsigned int r) * in the expansion of multiples of 32/golden-ratio, * or 32/(1+sqrt(5)/2 */ - ai = (sum & (1<<i)) ? VECTOR_AT_ONE : VECTOR_AT_ZERO; + ai = TERNARY((sum & (1<<i)), VECTOR_AT_ONE, VECTOR_AT_ZERO); bi = v[offset_v1 + i]; aandb = ai & bi; axorb = ai ^ bi; @@ -147,7 +147,7 @@ void encrypt(parallel_blocks_t v, const parallel_keys_t k, unsigned int r) carry = 0; for (i = 0;i < 32;++i) { /* VECTOR_AT_ONE where the ith bit of the sum is set */ - ai = (sum & (1<<i)) ? VECTOR_AT_ONE : VECTOR_AT_ZERO; + ai = TERNARY((sum & (1<<i)), VECTOR_AT_ONE, VECTOR_AT_ZERO); bi = v[offset_v0 + i]; aandb = ai & bi; axorb = ai ^ bi; @@ -261,7 +261,7 @@ void decrypt(parallel_blocks_t v, const parallel_keys_t k, unsigned int r) carry = 0; for (i = 0;i < 32;++i) { /* VECTOR_AT_ONE where the ith bit of the sum is set */ - ai = (sum & (1<<i)) ? VECTOR_AT_ONE : VECTOR_AT_ZERO; + ai = TERNARY((sum & (1<<i)), VECTOR_AT_ONE, VECTOR_AT_ZERO); bi = v[offset_v0 + i]; aandb = ai & bi; axorb = ai ^ bi; @@ -323,7 +323,7 @@ void decrypt(parallel_blocks_t v, const parallel_keys_t k, unsigned int r) carry = 0; for (i = 0;i < 32;++i) { /* VECTOR_AT_ONE where the ith bit of the sum is set */ - ai = (sum & (1<<i)) ? VECTOR_AT_ONE : VECTOR_AT_ZERO; + ai = TERNARY((sum & (1<<i)), VECTOR_AT_ONE, VECTOR_AT_ZERO); bi = v[offset_v1 + i]; aandb = ai & bi; axorb = ai ^ bi; diff --git a/test/monniaux/bitsliced-tea/bstea.h b/test/monniaux/bitsliced-tea/bstea.h index 305bd403..793f29ff 100644 --- a/test/monniaux/bitsliced-tea/bstea.h +++ b/test/monniaux/bitsliced-tea/bstea.h @@ -6,6 +6,12 @@ #include "bstea_wordsize.h" +#if defined(__K1C__) && defined(__COMPCERT__) +#define TERNARY(x, v1, v0) __builtin_ternary_ulong((x)!=0, (v1), (v0)) +#else +#define TERNARY(x, v1, v0) ((x) ? (v1) : (v0)) +#endif + #define TEA_ROUNDS 32 #define TEA_BLOCK_SIZE 64 diff --git a/test/monniaux/bitsliced-tea/bstea_wordsize.h b/test/monniaux/bitsliced-tea/bstea_wordsize.h index 5381e17c..b4e2e823 100644 --- a/test/monniaux/bitsliced-tea/bstea_wordsize.h +++ b/test/monniaux/bitsliced-tea/bstea_wordsize.h @@ -6,7 +6,7 @@ #if defined __x86_64__ || defined __amd64__ || defined __x86_64 || \ defined __amd64 || defined _M_X64 || defined __ia64__ || \ defined __ia64__ || defined __IA64__ || defined __ia64 || \ - defined _M_IA64 + defined _M_IA64 || defined __K1C__ # define __BSTEA_WORDSIZE 64 #else # define __BSTEA_WORDSIZE 32 diff --git a/test/monniaux/bitsliced-tea/reduce/compare.sh b/test/monniaux/bitsliced-tea/reduce/compare.sh new file mode 100755 index 00000000..d9f3cf86 --- /dev/null +++ b/test/monniaux/bitsliced-tea/reduce/compare.sh @@ -0,0 +1,20 @@ +PREFIX=/home/monniaux/work/Kalray/ternary/CompCert +INCLUDES=-I$PREFIX/test/monniaux/bitsliced-tea +CCOMP_K1="$PREFIX/ccomp -fno-unprototyped -O3 $INCLUDES" +GCC_K1="k1-mbr-gcc -Werror=implicit -O3 $INCLUDES" +GCC_HOST="gcc -Werror=implicit -O3 $INCLUDES" +FILE=bstea.c + +OTHERS_K1="$PREFIX/test/monniaux/bitsliced-tea/bstea_run.gcc.k1c.o $PREFIX/test/monniaux/clock.gcc.k1c.o" +OTHERS_HOST="$PREFIX/test/monniaux/bitsliced-tea/bstea_run.gcc.host.o $PREFIX/test/monniaux/clock.gcc.host.o" + +$CCOMP_K1 $FILE $OTHERS_K1 -o bstead.ccomp.k1c && +$GCC_K1 $FILE $OTHERS_K1 -o bstead.gcc.k1c && +$GCC_HOST $FILE $OTHERS_HOST -o bstead.gcc.host && +valgrind -q ./bstead.gcc.host && +k1-cluster --cycle-based -- bstead.ccomp.k1c > bstead.ccomp.k1c.out && +k1-cluster --cycle-based -- bstead.gcc.k1c > bstead.gcc.k1c.out && +grep cycles bstead.ccomp.k1c.out|sed -e 's/cycles: //' > bstead.ccomp.k1c.cycles && +grep cycles bstead.gcc.k1c.out|sed -e 's/cycles: //' > bstead.gcc.k1c.cycles && +test `cat bstead.gcc.k1c.cycles` -gt 100000 && +test `cat bstead.ccomp.k1c.cycles` -gt 200000 diff --git a/test/monniaux/ternary_builtin/ternary_builtin.c b/test/monniaux/ternary_builtin/ternary_builtin.c new file mode 100644 index 00000000..7f2043ec --- /dev/null +++ b/test/monniaux/ternary_builtin/ternary_builtin.c @@ -0,0 +1,10 @@ +#include <stdio.h> +#include <stdlib.h> + +int main(int argc, char **argv) { + int i=0; + if (argc >= 2) i=atoi(argv[1]); + printf("%ld\n", __builtin_ternary_uint(i, 42, 69)); + printf("%f\n", __builtin_ternary_double(i, 42.0, 69.0)); + return 0; +} |