Merge remote-tracking branch 'origin/kvx-work' into kvx-test-prepass

author: David Monniaux <david.monniaux@univ-grenoble-alpes.fr> 2020-10-18 22:49:10 +0200
committer: David Monniaux <david.monniaux@univ-grenoble-alpes.fr> 2020-10-18 22:49:10 +0200
commit: 996a2e5bbc4826d95144b62f5218b6e3e1e7d881 (patch)
tree: 55a5136c68e7d0f7a304913f28d0a761d9facd9a
parent: 8c3a2bdb56eba8d8bc5e359b01a320916eac85f0 (diff)
parent: a2f31f2b886ccb9656a019db1780aabc1789368a (diff)
download: compcert-kvx-996a2e5bbc4826d95144b62f5218b6e3e1e7d881.tar.gz
compcert-kvx-996a2e5bbc4826d95144b62f5218b6e3e1e7d881.zip
22 files changed, 939 insertions, 176 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3b1a86fd..10008017 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -237,7 +237,7 @@ build_kvx:
     - source /opt/kalray/accesscore/kalray.sh && ./config_kvx.sh
     - source /opt/kalray/accesscore/kalray.sh && make -j "$NJOBS"
     - source /opt/kalray/accesscore/kalray.sh && make -C test CCOMPOPTS=-static SIMU='kvx-cluster -- ' EXECUTE='kvx-cluster -- ' all test
-    - source /opt/kalray/accesscore/kalray.sh && make -C test/monniaux/yarpgen TARGET_CC='kvx-cos-gcc' EXECUTE='kvx-cluster -- ' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
+    - source /opt/kalray/accesscore/kalray.sh && ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='kvx-cos-gcc' EXECUTE='kvx-cluster -- ' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
   rules:
     - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
diff --git a/Makefile b/Makefile
index 73b3a446..6560a64b 100644
--- a/Makefile
+++ b/Makefile
@@ -95,6 +95,7 @@ BACKEND=\
   CSE2deps.v CSE2depsproof.v \
   CSE2.v CSE2proof.v \
   CSE3analysis.v CSE3analysisproof.v CSE3.v CSE3proof.v \
+  KillUselessMoves.v KillUselessMovesproof.v \
   LICM.v LICMproof.v \
   NeedDomain.v NeedOp.v Deadcode.v Deadcodeproof.v \
   Unusedglob.v Unusedglobproof.v \
diff --git a/backend/CSE3analysis.v b/backend/CSE3analysis.v
index ade79c28..7316c9a9 100644
--- a/backend/CSE3analysis.v
+++ b/backend/CSE3analysis.v
@@ -298,13 +298,22 @@ Section OPERATIONS.
 
   
   Definition move (src dst : reg) (rel : RELATION.t) : RELATION.t :=
-    match eq_find {| eq_lhs := dst;
+    if peq src dst
+    then rel
+    else
+      match eq_find {| eq_lhs := dst;
                      eq_op  := SOp Omove;
                      eq_args:= src::nil |} with
-    | Some eq_id => PSet.add eq_id (kill_reg dst rel)
-    | None => kill_reg dst rel
-    end.
+      | Some eq_id => PSet.add eq_id (kill_reg dst rel)
+      | None => kill_reg dst rel
+      end.
 
+  Definition is_trivial_sym_op sop :=
+    match sop with
+    | SOp op => is_trivial_op op
+    | SLoad _ _ => false
+    end.
+  
   Definition oper (dst : reg) (op: sym_op) (args : list reg)
              (rel : RELATION.t) : RELATION.t :=
     if is_smove op
@@ -315,15 +324,18 @@ Section OPERATIONS.
       | _ => kill_reg dst rel
       end
     else
-      let args' := forward_move_l rel args in
-      match rhs_find op args' rel with
-      | Some r =>
-        if Compopts.optim_CSE3_glb tt
-        then RELATION.glb (move r dst rel)
-                          (oper1 dst op args' rel)
-        else oper1 dst op args' rel
-      | None => oper1 dst op args' rel
-      end.
+      if is_trivial_sym_op op
+      then kill_reg dst rel
+      else
+        let args' := forward_move_l rel args in
+        match rhs_find op args' rel with
+        | Some r =>
+          if Compopts.optim_CSE3_glb tt
+          then RELATION.glb (move r dst rel)
+                            (oper1 dst op args' rel)
+          else oper1 dst op args' rel
+        | None => oper1 dst op args' rel
+        end.
   
   Definition clever_kill_store
              (chunk : memory_chunk) (addr: addressing) (args : list reg)
diff --git a/backend/CSE3analysisaux.ml b/backend/CSE3analysisaux.ml
index 3e4a6b9e..3990b765 100644
--- a/backend/CSE3analysisaux.ml
+++ b/backend/CSE3analysisaux.ml
@@ -67,6 +67,9 @@ let pp_option pp oc = function
   | None -> output_string oc "none"
   | Some x -> pp oc x;;
 
+let is_trivial eq =
+  (eq.eq_op = SOp Op.Omove) && (eq.eq_args = [eq.eq_lhs]);;
+
 let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
   let cur_eq_id = ref 0
   and cur_catalog = ref PTree.empty
@@ -76,6 +79,7 @@ let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
   and cur_kill_mem = ref PSet.empty
   and cur_moves = ref (PMap.init PSet.empty) in
   let eq_find_oracle node eq =
+    assert (not (is_trivial eq));
     let o = Hashtbl.find_opt eq_table (flatten_eq eq) in
     (if !Clflags.option_debug_compcert > 1
      then Printf.printf "@%d: eq_find %a -> %a\n" (P.to_int node)
diff --git a/backend/CSE3analysisproof.v b/backend/CSE3analysisproof.v
index f4e3672d..66b199cc 100644
--- a/backend/CSE3analysisproof.v
+++ b/backend/CSE3analysisproof.v
@@ -745,6 +745,25 @@ Section SOUNDNESS.
 
   Hint Resolve oper1_sound : cse3.
 
+  Lemma rel_idem_replace:
+    forall rel rs r m,
+      sem_rel rel rs m ->
+      sem_rel rel rs # r <- (rs # r) m.
+  Proof.
+    intros until m.
+    intro REL.
+    unfold sem_rel, sem_eq, sem_rhs in *.
+    intros.
+    specialize REL with (i:=i) (eq0:=eq).
+    rewrite Regmap.gsident.
+    replace ((rs # r <- (rs # r)) ## (eq_args eq)) with
+        (rs ## (eq_args eq)).
+    { apply REL; auto. }
+    apply list_map_exten.
+    intros.
+    apply Regmap.gsident.
+  Qed.
+      
   Lemma move_sound :
     forall no : node,
     forall rel : RELATION.t,
@@ -756,6 +775,10 @@ Section SOUNDNESS.
     unfold move.
     intros until m.
     intro REL.
+    destruct (peq src dst).
+    { subst dst.
+      apply rel_idem_replace; auto.
+    }
     pose proof (eq_find_sound no  {| eq_lhs := dst; eq_op := SOp Omove; eq_args := src :: nil |}) as EQ_FIND_SOUND.
     destruct eq_find.
     - intros i eq CONTAINS.
@@ -798,7 +821,11 @@ Section SOUNDNESS.
       subst.
       rewrite <- (forward_move_sound rel rs m r) by auto.
       apply move_sound; auto.
-    - destruct rhs_find as [src |] eqn:RHS_FIND.
+    - destruct (is_trivial_sym_op sop).
+      {
+        apply kill_reg_sound; auto.
+      }
+      destruct rhs_find as [src |] eqn:RHS_FIND.
       + destruct (Compopts.optim_CSE3_glb tt).
         * apply sem_rel_glb; split.
           ** pose proof (rhs_find_sound no sop (forward_move_l (ctx:=ctx) rel args) rel src rs m REL RHS_FIND) as SOUND.
diff --git a/backend/CSE3proof.v b/backend/CSE3proof.v
index 6e489066..3fbc9912 100644
--- a/backend/CSE3proof.v
+++ b/backend/CSE3proof.v
@@ -443,12 +443,6 @@ Ltac IND_STEP :=
         idtac mpc mpc' fn minstr *)
       end.
 
-Lemma if_same : forall {T : Type} (b : bool) (x : T),
-    (if b then x else x) = x.
-Proof.
-  destruct b; trivial.
-Qed.
-
 Lemma step_simulation:
   forall S1 t S2, RTL.step ge S1 t S2 -> 
   forall S1', match_states S1 S1' ->
diff --git a/backend/Duplicateaux.ml b/backend/Duplicateaux.ml
index 1297ec90..eb9f42e0 100644
--- a/backend/Duplicateaux.ml
+++ b/backend/Duplicateaux.ml
@@ -15,6 +15,7 @@
 (* Oracle for Duplicate pass.
  * - Add static prediction information to Icond nodes
  * - Performs tail duplication on interesting traces to form superblocks
+ * - Unrolls a single iteration of innermost loops
  * - (TODO: perform partial loop unrolling inside innermost loops)
  *)
 
@@ -22,23 +23,13 @@ open RTL
 open Maps
 open Camlcoq
 
-let debug_flag = ref false
-
-let debug fmt =
-  if !debug_flag then Printf.eprintf fmt
-  else Printf.ifprintf stderr fmt
-
-let get_some = function
-| None -> failwith "Did not get some"
-| Some thing -> thing
-
-let rtl_successors = function
-| Itailcall _ | Ireturn _ -> []
-| Icall(_,_,_,_,n) | Ibuiltin(_,_,_,n) | Inop n | Iop (_,_,_,n)
-| Iload (_,_,_,_,_,n) | Istore (_,_,_,_,n) -> [n]
-| Icond (_,_,n1,n2,_) -> [n1; n2]
-| Ijumptable (_,ln) -> ln
+let debug_flag = LICMaux.debug_flag
+let debug = LICMaux.debug
+let get_loop_headers = LICMaux.get_loop_headers
+let get_some = LICMaux.get_some
+let rtl_successors = LICMaux.rtl_successors
 
+(* Get list of nodes following a BFS of the code *)
 let bfs code entrypoint = begin
   debug "bfs\n";
   let visited = ref (PTree.map (fun n i -> false) code)
@@ -67,6 +58,7 @@ let optbool o = match o with Some _ -> true | None -> false
 
 let ptree_get_some n ptree = get_some @@ PTree.get n ptree
 
+(* Returns a PTree: node -> list of the predecessors of that node *)
 let get_predecessors_rtl code = begin
   debug "get_predecessors_rtl\n";
   let preds = ref (PTree.map (fun n i -> []) code) in
@@ -89,15 +81,13 @@ end
 
 module PSet = Set.Make(PInt)
 
-let print_intlist l =
-  let rec f = function
+let print_intlist oc l =
+  let rec f oc = function
   | [] -> ()
-  | n::ln -> (Printf.printf "%d " (P.to_int n); f ln)
+  | n::ln -> (Printf.fprintf oc "%d %a" (P.to_int n) f ln)
   in begin
     if !debug_flag then begin
-      Printf.printf "[";
-      f l;
-      Printf.printf "]"
+      Printf.fprintf oc "[%a]" f l
     end
   end
 
@@ -113,43 +103,6 @@ let print_intset s =
     end
   end
 
-type vstate = Unvisited | Processed | Visited
-
-(** Getting loop branches with a DFS visit :
-  * Each node is either Unvisited, Visited, or Processed
-  * pre-order: node becomes Processed
-  * post-order: node becomes Visited
-  *
-  * If we come accross an edge to a Processed node, it's a loop!
-  *)
-let get_loop_headers code entrypoint = begin
-  debug "get_loop_headers\n";
-  let visited = ref (PTree.map (fun n i -> Unvisited) code)
-  and is_loop_header = ref (PTree.map (fun n i -> false) code)
-  in let rec dfs_visit code = function
-  | [] -> ()
-  | node :: ln ->
-      match (get_some @@ PTree.get node !visited) with
-      | Visited -> ()
-      | Processed -> begin
-          debug "Node %d is a loop header\n" (P.to_int node);
-          is_loop_header := PTree.set node true !is_loop_header;
-          visited := PTree.set node Visited !visited
-        end
-      | Unvisited -> begin
-          visited := PTree.set node Processed !visited;
-          match PTree.get node code with
-          | None -> failwith "No such node"
-          | Some i -> let next_visits = rtl_successors i in dfs_visit code next_visits;
-          visited := PTree.set node Visited !visited;
-          dfs_visit code ln
-        end
-  in begin
-    dfs_visit code [entrypoint];
-    !is_loop_header
-  end
-end
-
 let ptree_printbool pt =
   let elements = PTree.elements pt
   in begin
@@ -174,6 +127,10 @@ let rec look_ahead code node is_loop_header predicate =
         )
     | _ -> false
 
+(** 
+ * Heuristics mostly based on the paper Branch Prediction for Free 
+ *)
+
 let do_call_heuristic code cond ifso ifnot is_loop_header =
   begin
     debug "\tCall heuristic..\n";
@@ -302,7 +259,7 @@ let get_loop_info is_loop_header bfs_order code =
     !loop_info
   end
 
-(* Remark - compared to the original paper, we don't use the store heuristic *)
+(* Remark - compared to the original Branch Prediction for Free paper, we don't use the store heuristic *)
 let get_directions code entrypoint = begin
   debug "get_directions\n";
   let bfs_order = bfs code entrypoint in
@@ -435,18 +392,29 @@ let best_predecessor_of node predecessors code order is_visited =
         ) order)
       with Not_found -> None
 
-let print_trace t = print_intlist t
+let print_trace = print_intlist
 
-let print_traces traces =
-  let rec f = function
+let print_traces oc traces =
+  let rec f oc = function
   | [] -> ()
-  | t::lt -> Printf.printf "\n\t"; print_trace t; Printf.printf ",\n"; f lt
+  | t::lt -> Printf.fprintf oc "\n\t%a,\n%a" print_trace t f lt
   in begin
-    if !debug_flag then begin
-      Printf.printf "Traces: {";
-      f traces;
-      Printf.printf "}\n";
-    end
+    if !debug_flag then
+      Printf.fprintf oc "Traces: {%a}\n" f traces
+  end
+
+(* Adapted from backend/PrintRTL.ml: print_function *)
+let print_code code = let open PrintRTL in let open Printf in
+  if (!debug_flag) then begin
+    fprintf stdout "{\n";
+    let instrs =
+      List.sort
+        (fun (pc1, _) (pc2, _) -> compare pc2 pc1)
+        (List.rev_map
+          (fun (pc, i) -> (P.to_int pc, i))
+          (PTree.elements code)) in
+    List.iter (print_instruction stdout) instrs;
+    fprintf stdout "}"
   end
 
 (* Dumb (but linear) trace selection *)
@@ -521,7 +489,7 @@ let select_traces_chang code entrypoint = begin
       end
     done;
     (* debug "DFS: \t"; print_intlist order; debug "\n"; *)
-    debug "Traces: "; print_traces !traces;
+    debug "Traces: %a" print_traces !traces;
     !traces
   end
 end
@@ -537,26 +505,26 @@ let rec make_identity_ptree_rec = function
 
 let make_identity_ptree code = make_identity_ptree_rec (PTree.elements code)
 
-(* Change the pointers of preds nodes to point to n' instead of n *)
+(* Change the pointers of nodes to point to n' instead of n *)
 let rec change_pointers code n n' = function
   | [] -> code
-  | pred :: preds ->
-      let new_pred_inst = match ptree_get_some pred code with
-        | Icall(a, b, c, d, n0) -> assert (n0 == n); Icall(a, b, c, d, n')
-        | Ibuiltin(a, b, c, n0) -> assert (n0 == n); Ibuiltin(a, b, c, n')
-        | Ijumptable(a, ln) -> assert (optbool @@ List.find_opt (fun e -> e == n) ln);
-                               Ijumptable(a, List.map (fun e -> if (e == n) then n' else e) ln)
-        | Icond(a, b, n1, n2, i) -> assert (n1 == n || n2 == n);
-                                 let n1' = if (n1 == n) then n' else n1
-                                 in let n2' = if (n2 == n) then n' else n2
+  | node :: nodes ->
+      let new_pred_inst = match ptree_get_some node code with
+        | Icall(a, b, c, d, n0) -> assert (n0 = n); Icall(a, b, c, d, n')
+        | Ibuiltin(a, b, c, n0) -> assert (n0 = n); Ibuiltin(a, b, c, n')
+        | Ijumptable(a, ln) -> assert (optbool @@ List.find_opt (fun e -> e = n) ln);
+                               Ijumptable(a, List.map (fun e -> if (e = n) then n' else e) ln)
+        | Icond(a, b, n1, n2, i) -> assert (n1 = n || n2 = n);
+                                 let n1' = if (n1 = n) then n' else n1
+                                 in let n2' = if (n2 = n) then n' else n2
                                  in Icond(a, b, n1', n2', i)
-        | Inop n0 -> assert (n0 == n); Inop n'
-        | Iop (a, b, c, n0) -> assert (n0 == n); Iop (a, b, c, n')
-        | Iload (a, b, c, d, e, n0) -> assert (n0 == n); Iload (a, b, c, d, e, n')
-        | Istore (a, b, c, d, n0) -> assert (n0 == n); Istore (a, b, c, d, n')
+        | Inop n0 -> assert (n0 = n); Inop n'
+        | Iop (a, b, c, n0) -> assert (n0 = n); Iop (a, b, c, n')
+        | Iload (a, b, c, d, e, n0) -> assert (n0 = n); Iload (a, b, c, d, e, n')
+        | Istore (a, b, c, d, n0) -> assert (n0 = n); Istore (a, b, c, d, n')
         | Itailcall _ | Ireturn _ -> failwith "That instruction cannot be a predecessor"
-      in let new_code = PTree.set pred new_pred_inst code
-      in change_pointers new_code n n' preds
+      in let new_code = PTree.set node new_pred_inst code
+      in change_pointers new_code n n' nodes
 
 (* parent: parent of n to keep as parent
  * preds: all the other parents of n
@@ -580,13 +548,20 @@ let is_empty = function
   | [] -> true
   | _ -> false
 
+let next_free_pc code = maxint (List.map (fun e -> let (n, _) = e in P.to_int n) (PTree.elements code)) + 1
+
+let is_a_nop code n =
+  match get_some @@ PTree.get n code with
+  | Inop _ -> true
+  | _ -> false
+
 (* code: RTL code
  * preds: mapping node -> predecessors
  * ptree: the revmap
  * trace: the trace to follow tail duplication on *)
 let tail_duplicate code preds ptree trace =
   (* next_int: unused integer that can be used for the next duplication *)
-  let next_int = ref (maxint (List.map (fun e -> let (n, _) = e in P.to_int n) (PTree.elements code)) + 1)
+  let next_int = ref (next_free_pc code)
   (* last_node and last_duplicate store resp. the last processed node of the trace, and its duplication *)
   in let last_node = ref None
   in let last_duplicate = ref None
@@ -608,7 +583,7 @@ let tail_duplicate code preds ptree trace =
               in let (newc, newp) = duplicate code ptree !last_node n final_node_preds (P.of_int n')
               in begin
                 next_int := !next_int + 1;
-                nb_duplicated := !nb_duplicated + 1;
+                (if not @@ is_a_nop code n then nb_duplicated := !nb_duplicated + 1);
                 last_duplicate := Some (P.of_int n');
                 (newc, newp)
               end
@@ -620,9 +595,8 @@ let tail_duplicate code preds ptree trace =
   in let new_code, new_ptree = f code ptree true trace
   in (new_code, new_ptree, !nb_duplicated)
 
-let superblockify_traces code preds traces =
-  let max_nb_duplicated = !Clflags.option_fduplicate (* FIXME - should be architecture dependent *)
-  in let ptree = make_identity_ptree code
+let superblockify_traces code preds traces ptree =
+  let max_nb_duplicated = !Clflags.option_ftailduplicate (* FIXME - should be architecture dependent *)
   in let rec f code ptree = function
     | [] -> (code, ptree, 0)
     | trace :: traces ->
@@ -633,37 +607,271 @@ let superblockify_traces code preds traces =
   in let new_code, new_ptree, _ = f code ptree traces
   in (new_code, new_ptree)
 
-let rec invert_iconds_trace code = function
-  | [] -> code
-  | n :: ln ->
-      let code' = match ptree_get_some n code with
-        | Icond (c, lr, ifso, ifnot, info) -> (match info with
-            | Some true -> begin
-                (* debug "Reversing ifso/ifnot for node %d\n" (P.to_int n); *)
-                PTree.set n (Icond (Op.negate_condition c, lr, ifnot, ifso, Some false)) code
-              end
-            | _ -> code)
-        | _ -> code
-      in invert_iconds_trace code' ln
+let invert_iconds code =
+  PTree.map1 (fun i -> match i with
+    | Icond (c, lr, ifso, ifnot, info) -> (match info with
+        | Some true -> begin
+            (* debug "Reversing ifso/ifnot for node %d\n" (P.to_int n); *)
+            Icond (Op.negate_condition c, lr, ifnot, ifso, Some false)
+          end
+        | _ -> i)
+    | _ -> i
+  ) code
+
+(** Partial loop unrolling
+ *
+ * The following code seeks innermost loops, and unfolds the first iteration
+ * Most of the code has been moved from LICMaux.ml to Duplicateaux.ml to solve
+ * cyclic dependencies between LICMaux and Duplicateaux
+ *)
 
-let rec invert_iconds code = function
-  | [] -> code
-  | t :: ts ->
-      let code' = if !Clflags.option_finvertcond then invert_iconds_trace code t
-                  else code
-      in invert_iconds code' ts
+type innerLoop = {
+  preds: P.t list;
+  body: HashedSet.PSet.t;
+  head: P.t; (* head of the loop *)
+  final: P.t (* the final instruction, which loops back to the head *)
+}
+
+let print_pset = LICMaux.pp_pset
+
+let print_inner_loop iloop =
+  debug "{preds: %a, body: %a}" print_intlist iloop.preds print_pset iloop.body
+
+let rec print_inner_loops = function
+| [] -> ()
+| iloop :: iloops -> begin
+    print_inner_loop iloop;
+    debug "\n";
+    print_inner_loops iloops
+  end
+
+let print_ptree printer pt =
+  let elements = PTree.elements pt in
+  begin
+    debug "[\n";
+    List.iter (fun (n, elt) ->
+      debug "\t%d: %a\n" (P.to_int n) printer elt
+    ) elements;
+    debug "]\n"
+  end
+
+let print_pint oc i = if !debug_flag then Printf.fprintf oc "%d" (P.to_int i) else ()
+
+let get_inner_loops f code is_loop_header =
+  let fake_f = { fn_sig = f.fn_sig; fn_params = f.fn_params; 
+    fn_stacksize = f.fn_stacksize; fn_code = code; fn_entrypoint = f.fn_entrypoint } in
+  let (_, predmap, loopmap) = LICMaux.inner_loops fake_f in
+  begin
+    debug "PREDMAP: "; print_ptree print_intlist predmap;
+    debug "LOOPMAP: "; print_ptree print_pset loopmap;
+    List.map (fun (n, body) ->
+      let preds = List.filter (fun p -> not @@ HashedSet.PSet.contains body p) 
+        @@ get_some @@ PTree.get n predmap in
+      let head = (* the instruction from body which is a loop header *)
+        let heads = HashedSet.PSet.elements @@ HashedSet.PSet.filter 
+          (fun n -> ptree_get_some n is_loop_header) body in
+        begin
+          assert (List.length heads == 1);
+          List.hd heads
+        end in
+      let final = (* the predecessors from head that are in the body *)
+        let head_preds = ptree_get_some head predmap in
+        let filtered = List.filter (fun n -> HashedSet.PSet.contains body n) head_preds in
+        begin
+          debug "HEAD: %d\n" (P.to_int head);
+          debug "BODY: %a\n" print_pset body;
+          debug "HEADPREDS: %a\n" print_intlist head_preds;
+          assert (List.length filtered == 1);
+          List.hd filtered
+        end in
+      { preds = preds; body = body; head = head; final = final }
+    ) 
+    (* LICMaux.inner_loops also returns non-inner loops, but with a body of 1 instruction
+     * We remove those to get just the inner loops *)
+    @@ List.filter (fun (n, body) ->
+      let count = List.length @@ HashedSet.PSet.elements body in count != 1
+    ) (PTree.elements loopmap)
+  end
+
+let rec generate_fwmap ln ln' fwmap =
+  match ln with
+  | [] -> begin
+      match ln' with
+      | [] -> fwmap
+      | _ -> failwith "ln and ln' have different lengths"
+    end
+  | n :: ln -> begin
+      match ln' with
+      | n' :: ln' -> generate_fwmap ln ln' (PTree.set n n' fwmap)
+      | _ -> failwith "ln and ln' have different lengths"
+    end
+
+let generate_revmap ln ln' revmap = generate_fwmap ln' ln revmap
+
+let apply_map fw n = P.of_int @@ ptree_get_some n fw
+
+let apply_map_opt fw n =
+  match PTree.get n fw with
+  | Some n' -> P.of_int n'
+  | None -> n
+
+let change_nexts fwmap = function
+  | Icall (a, b, c, d, n) -> Icall (a, b, c, d, apply_map fwmap n)
+  | Ibuiltin (a, b, c, n) -> Ibuiltin (a, b, c, apply_map fwmap n)
+  | Ijumptable (a, ln) -> Ijumptable (a, List.map (apply_map_opt fwmap) ln)
+  | Icond (a, b, n1, n2, i) -> Icond (a, b, apply_map_opt fwmap n1, apply_map_opt fwmap n2, i)
+  | Inop n -> Inop (apply_map fwmap n)
+  | Iop (a, b, c, n) -> Iop (a, b, c, apply_map fwmap n)
+  | Iload (a, b, c, d, e, n) -> Iload (a, b, c, d, e, apply_map fwmap n)
+  | Istore (a, b, c, d, n) -> Istore (a, b, c, d, apply_map fwmap n)
+  | Itailcall (a, b, c) -> Itailcall (a, b, c)
+  | Ireturn o -> Ireturn o
+
+(** Clone a list of instructions into free pc indexes
+ *
+ * The list of instructions should be contiguous, and not include any loop.
+ * It is assumed that the first instruction of the list is the head.
+ * Also, the last instruction of the list should be the loop backedge.
+ *
+ * Returns: (code', revmap', ln', fwmap)
+ *  code' is the updated code, after cloning
+ *  revmap' is the updated revmap
+ *  ln' is the list of the new indexes used to reference the cloned instructions
+ *  fwmap is a map from ln to ln'
+ *)
+let clone code revmap ln = begin
+  assert (List.length ln > 0);
+  let head' = next_free_pc code in
+  (* +head' to ensure we never overlap with the existing code *)
+  let ln' = List.map (fun n -> n + head') @@ List.map P.to_int ln in
+  let fwmap = generate_fwmap ln ln' PTree.empty in
+  let revmap' = generate_revmap ln (List.map P.of_int ln') revmap in
+  let code' = ref code in
+  List.iter (fun n ->
+    let instr = get_some @@ PTree.get n code in
+    let instr' = change_nexts fwmap instr in
+    code' := PTree.set (apply_map fwmap n) instr' !code'
+  ) ln;
+  (!code', revmap', ln', fwmap)
+end
+
+let rec count_ignore_nops code = function
+  | [] -> 0
+  | n::ln ->
+      let inst = get_some @@ PTree.get n code in
+      match inst with
+      | Inop _ -> count_ignore_nops code ln
+      | _ -> 1 + count_ignore_nops code ln
+
+(* Unrolls a single interation of the inner loop
+ * 1) Clones the body into body'
+ * 2) Links the preds to the first instruction of body'
+ * 3) Links the last instruction of body' into the first instruction of body
+ *)
+let unroll_inner_loop_single code revmap iloop =
+  let body = HashedSet.PSet.elements (iloop.body) in
+  if count_ignore_nops code body > !Clflags.option_funrollsingle then begin
+    debug "Too many nodes in the loop body (%d > %d)" (List.length body) !Clflags.option_funrollsingle;
+    (code, revmap)
+  end else
+    let (code2, revmap2, dupbody, fwmap) = clone code revmap body in
+    let code' = ref code2 in
+    let head' = apply_map fwmap (iloop.head) in
+    let final' = apply_map fwmap (iloop.final) in
+    begin
+      debug "PREDS: %a\n" print_intlist iloop.preds;
+      debug "IHEAD: %d\n" (P.to_int iloop.head);
+      code' := change_pointers !code' (iloop.head) head' (iloop.preds);
+      code' := change_pointers !code' head' (iloop.head) [final'];
+      (!code', revmap2)
+    end
+
+let unroll_inner_loops_single f code revmap =
+  let is_loop_header = get_loop_headers code (f.fn_entrypoint) in
+  let inner_loops = get_inner_loops f code is_loop_header in
+  let code' = ref code in
+  let revmap' = ref revmap in
+  begin
+    print_inner_loops inner_loops;
+    List.iter (fun iloop ->
+      let (new_code, new_revmap) = unroll_inner_loop_single !code' !revmap' iloop in
+      code' := new_code; revmap' := new_revmap
+    ) inner_loops;
+    (!code', !revmap')
+  end
+
+(* Unrolls the body of the inner loop once - duplicating the exit condition as well 
+ * 1) Clones body into body'
+ * 2) Links the last instruction of body into the first of body' 
+ * 3) Links the last instruction of body' into the first of body
+ *)
+let unroll_inner_loop_body code revmap iloop =
+  let body = HashedSet.PSet.elements (iloop.body) in
+  let limit = !Clflags.option_funrollbody in
+  if count_ignore_nops code body > limit then begin
+    debug "Too many nodes in the loop body (%d > %d)" (List.length body) limit;
+    (code, revmap)
+  end else
+    let (code2, revmap2, dupbody, fwmap) = clone code revmap body in
+    let code' = ref code2 in
+    let head' = apply_map fwmap (iloop.head) in
+    let final' = apply_map fwmap (iloop.final) in
+    begin
+      code' := change_pointers !code' iloop.head head' [iloop.final];
+      code' := change_pointers !code' head' iloop.head [final']; 
+      (!code', revmap2)
+    end
+
+let unroll_inner_loops_body f code revmap =
+  let is_loop_header = get_loop_headers code (f.fn_entrypoint) in
+  let inner_loops = get_inner_loops f code is_loop_header in
+  let code' = ref code in
+  let revmap' = ref revmap in
+  begin
+    print_inner_loops inner_loops;
+    List.iter (fun iloop ->
+      let (new_code, new_revmap) = unroll_inner_loop_body !code' !revmap' iloop in
+      code' := new_code; revmap' := new_revmap
+    ) inner_loops;
+    (!code', !revmap')
+  end
 
 let duplicate_aux f =
+  (* initializing *)
   let entrypoint = f.fn_entrypoint in
-  if !Clflags.option_fduplicate < 0 then
-    ((f.fn_code, entrypoint), make_identity_ptree f.fn_code)
-  else
-    let code = update_directions (f.fn_code) entrypoint in
-    let traces = select_traces code entrypoint in
-    let icond_code = invert_iconds code traces in
-    let preds = get_predecessors_rtl icond_code in
-    if !Clflags.option_fduplicate >= 1 then
-      let (new_code, pTreeId) = ((* print_traces traces; *) superblockify_traces icond_code preds traces) in
-      ((new_code, f.fn_entrypoint), pTreeId)
-    else
-      ((icond_code, entrypoint), make_identity_ptree code)
+  let code = f.fn_code in
+  let revmap = make_identity_ptree code in
+
+  (* static prediction *)
+  let code =
+    if !Clflags.option_fpredict then
+      update_directions code entrypoint
+    else code in
+
+  (* unroll single *)
+  let (code, revmap) =
+    if !Clflags.option_funrollsingle > 0 then
+      unroll_inner_loops_single f code revmap
+    else (code, revmap) in
+
+  (* unroll body *)
+  let (code, revmap) =
+    if !Clflags.option_funrollbody > 0 then
+      unroll_inner_loops_body f code revmap
+    else (code, revmap) in
+
+  (* static prediction bis *)
+  let code =
+    if !Clflags.option_fpredict then
+      invert_iconds code
+    else code in
+
+  (* tail duplication *)
+  let (code, revmap) =
+    if !Clflags.option_ftailduplicate > 0 then
+      let traces = select_traces code entrypoint in
+      let preds = get_predecessors_rtl code in
+      superblockify_traces code preds traces revmap
+    else (code, revmap) in
+
+  ((code, entrypoint), revmap)
diff --git a/backend/KillUselessMoves.v b/backend/KillUselessMoves.v
new file mode 100644
index 00000000..bdd7ec60
--- /dev/null
+++ b/backend/KillUselessMoves.v
@@ -0,0 +1,40 @@
+(* *************************************************************)
+(*                                                             *)
+(*             The Compcert verified compiler                  *)
+(*                                                             *)
+(*           David Monniaux     CNRS, VERIMAG                  *)
+(*                                                             *)
+(*  Copyright VERIMAG. All rights reserved.                    *)
+(*  This file is distributed under the terms of the INRIA      *)
+(*  Non-Commercial License Agreement.                          *)
+(*                                                             *)
+(* *************************************************************)
+
+Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
+Require Import AST Linking.
+Require Import Memory Registers Op RTL.
+Require List.
+
+Definition transf_ros (ros: reg + ident) : reg + ident := ros.
+
+Definition transf_instr (pc: node) (instr: instruction) :=
+  match instr with
+  | Iop op args res s =>
+    if (eq_operation op Omove) && (List.list_eq_dec peq args (res :: nil))
+    then Inop s
+    else instr
+  | _ => instr
+  end.
+
+Definition transf_function (f: function) : function :=
+  {| fn_sig := f.(fn_sig);
+     fn_params := f.(fn_params);
+     fn_stacksize := f.(fn_stacksize);
+     fn_code := PTree.map transf_instr f.(fn_code);
+     fn_entrypoint := f.(fn_entrypoint) |}.
+
+Definition transf_fundef (fd: fundef) : fundef :=
+  AST.transf_fundef transf_function fd.
+
+Definition transf_program (p: program) : program :=
+  transform_program transf_fundef p.
diff --git a/backend/KillUselessMovesproof.v b/backend/KillUselessMovesproof.v
new file mode 100644
index 00000000..629aa6aa
--- /dev/null
+++ b/backend/KillUselessMovesproof.v
@@ -0,0 +1,361 @@
+(* *************************************************************)
+(*                                                             *)
+(*             The Compcert verified compiler                  *)
+(*                                                             *)
+(*           David Monniaux     CNRS, VERIMAG                  *)
+(*                                                             *)
+(*  Copyright VERIMAG. All rights reserved.                    *)
+(*  This file is distributed under the terms of the INRIA      *)
+(*  Non-Commercial License Agreement.                          *)
+(*                                                             *)
+(* *************************************************************)
+
+Require Import Axioms.
+Require Import FunInd.
+Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
+Require Import AST Linking.
+Require Import Values Memory Globalenvs Events Smallstep.
+Require Import Registers Op RTL.
+Require Import KillUselessMoves.
+
+
+Definition match_prog (p tp: RTL.program) :=
+  match_program (fun ctx f tf => tf = transf_fundef f) eq p tp.
+
+Lemma transf_program_match:
+  forall p, match_prog p (transf_program p).
+Proof.
+  intros. eapply match_transform_program; eauto.
+Qed.
+
+Section PRESERVATION.
+
+Variables prog tprog: program.
+Hypothesis TRANSL: match_prog prog tprog.
+Let ge := Genv.globalenv prog.
+Let tge := Genv.globalenv tprog.
+
+Lemma functions_translated:
+  forall v f,
+  Genv.find_funct ge v = Some f ->
+  Genv.find_funct tge v = Some (transf_fundef f).
+Proof (Genv.find_funct_transf TRANSL).
+
+Lemma function_ptr_translated:
+  forall v f,
+  Genv.find_funct_ptr ge v = Some f ->
+  Genv.find_funct_ptr tge v = Some (transf_fundef f).
+Proof (Genv.find_funct_ptr_transf TRANSL).
+
+Lemma symbols_preserved:
+  forall id,
+  Genv.find_symbol tge id = Genv.find_symbol ge id.
+Proof (Genv.find_symbol_transf TRANSL).
+
+Lemma senv_preserved:
+  Senv.equiv ge tge.
+Proof (Genv.senv_transf TRANSL).
+
+Lemma sig_preserved:
+  forall f, funsig (transf_fundef f) = funsig f.
+Proof.
+  destruct f; reflexivity.
+Qed.
+
+Lemma find_function_translated:
+  forall ros rs fd,
+  find_function ge ros rs = Some fd ->
+  find_function tge ros rs = Some (transf_fundef fd).
+Proof.
+  unfold find_function; intros. destruct ros as [r|id].
+  eapply functions_translated; eauto.
+  rewrite symbols_preserved. destruct (Genv.find_symbol ge id); try congruence.
+  eapply function_ptr_translated; eauto.
+Qed.
+
+Lemma transf_function_at:
+  forall f pc i,
+  f.(fn_code)!pc = Some i ->
+  (transf_function f).(fn_code)!pc = Some(transf_instr pc i).
+Proof.
+  intros until i. intro Hcode.
+  unfold transf_function; simpl.
+  rewrite PTree.gmap.
+  unfold option_map.
+  rewrite Hcode.
+  reflexivity.
+Qed.
+
+Ltac TR_AT :=
+  match goal with
+  | [ A: (fn_code _)!_ = Some _ |- _ ] =>
+        generalize (transf_function_at _ _ _ A); intros
+  end.
+
+Section SAME_RS.
+  Context {A : Type}.
+  
+  Definition same_rs (rs rs' : Regmap.t A) :=
+    forall x, rs # x = rs' # x.
+
+  Lemma same_rs_refl : forall rs, same_rs rs rs.
+  Proof.
+    unfold same_rs.
+    reflexivity.
+  Qed.
+
+  Lemma same_rs_comm : forall rs rs', (same_rs rs rs') -> (same_rs rs' rs).
+  Proof.
+    unfold same_rs.
+    congruence.
+  Qed.
+
+  Lemma same_rs_trans : forall rs1 rs2 rs3,
+      (same_rs rs1 rs2) -> (same_rs rs2 rs3) -> (same_rs rs1 rs3).
+  Proof.
+    unfold same_rs.
+    congruence.
+  Qed.
+
+  Lemma same_rs_idem_write : forall rs r,
+      (same_rs rs (rs # r <- (rs # r))).
+  Proof.
+    unfold same_rs.
+    intros.
+    rewrite Regmap.gsident.
+    reflexivity.
+  Qed.
+
+  Lemma same_rs_read:
+    forall rs rs' r, (same_rs rs rs') -> rs # r = rs' # r.
+  Proof.
+    unfold same_rs.
+    auto.
+  Qed.
+  
+  Lemma same_rs_subst:
+    forall rs rs' l, (same_rs rs rs') -> rs ## l = rs' ## l.
+  Proof.
+    induction l; cbn; intuition congruence.
+  Qed.
+
+  Lemma same_rs_write: forall rs rs' r x,
+      (same_rs rs rs') -> (same_rs (rs # r <- x) (rs' # r <- x)).
+  Proof.
+    unfold same_rs.
+    intros.
+    destruct (peq r x0).
+    { subst x0.
+      rewrite Regmap.gss. rewrite Regmap.gss.
+      reflexivity.
+    }
+    rewrite Regmap.gso by congruence.
+    rewrite Regmap.gso by congruence.
+    auto.
+  Qed.
+
+  Lemma same_rs_setres:
+    forall rs rs' (SAME: same_rs rs rs') res vres,
+      same_rs (regmap_setres res vres rs) (regmap_setres res vres rs').
+  Proof.
+    induction res; cbn; auto using same_rs_write.
+  Qed.
+End SAME_RS.
+
+Lemma same_find_function: forall tge rs rs' (SAME: same_rs rs rs') ros,
+  find_function tge ros rs = find_function tge ros rs'.
+Proof.
+  destruct ros; cbn.
+  { rewrite (same_rs_read rs rs' r SAME).
+    reflexivity. }
+  reflexivity.
+Qed.
+
+Inductive match_frames: RTL.stackframe -> RTL.stackframe -> Prop :=
+| match_frames_intro: forall res f sp pc rs rs' (SAME : same_rs rs rs'),
+    match_frames (Stackframe res f sp pc rs)
+                 (Stackframe res (transf_function f) sp pc rs').
+
+Inductive match_states: RTL.state -> RTL.state -> Prop :=
+  | match_regular_states: forall stk f sp pc rs rs' m stk'
+        (SAME: same_rs rs rs')
+        (STACKS: list_forall2 match_frames stk stk'),
+      match_states (State stk f sp pc rs m)
+                   (State stk' (transf_function f) sp pc rs' m)
+  | match_callstates: forall stk f args m stk'
+        (STACKS: list_forall2 match_frames stk stk'),
+      match_states (Callstate stk f args m)
+                   (Callstate stk' (transf_fundef f) args m)
+  | match_returnstates: forall stk v m stk'
+        (STACKS: list_forall2 match_frames stk stk'),
+      match_states (Returnstate stk v m)
+                   (Returnstate stk' v m).
+
+Lemma step_simulation:
+  forall S1 t S2, RTL.step ge S1 t S2 ->
+  forall S1', match_states S1 S1' ->
+  exists S2', RTL.step tge S1' t S2' /\ match_states S2 S2'.
+Proof.
+  induction 1; intros S1' MS; inv MS; try TR_AT.
+- (* nop *)
+  econstructor; split. eapply exec_Inop; eauto.
+  constructor; auto.
+- (* op *)
+  cbn in H1.
+  destruct (_ && _) eqn:IS_MOVE in H1.
+  {
+    destruct eq_operation in IS_MOVE. 2: discriminate.
+    destruct list_eq_dec in IS_MOVE. 2: discriminate.
+    subst op. subst args.
+    clear IS_MOVE.
+    cbn in H0.
+    inv H0.
+    econstructor; split.
+    { eapply exec_Inop; eauto. }
+    constructor.
+    2: assumption.
+    eapply same_rs_trans.
+    { apply same_rs_comm.
+      apply same_rs_idem_write.
+    }
+    assumption.
+  }
+  econstructor; split.
+  eapply exec_Iop with (v := v); eauto.
+  rewrite <- H0.
+  rewrite (same_rs_subst rs rs' args SAME).
+  apply eval_operation_preserved. exact symbols_preserved.
+  constructor; auto using same_rs_write.
+(* load *)
+- econstructor; split.
+  assert (eval_addressing tge sp addr rs' ## args = Some a).
+  { rewrite <- H0.
+    rewrite (same_rs_subst rs rs' args SAME).
+    apply eval_addressing_preserved. exact symbols_preserved.
+  }
+  eapply exec_Iload; eauto.
+  constructor; auto using same_rs_write.
+- (* load notrap1 *)
+  econstructor; split.
+  assert (eval_addressing tge sp addr rs' ## args = None).
+  { rewrite <- H0.
+    rewrite (same_rs_subst rs rs' args SAME).
+    apply eval_addressing_preserved. exact symbols_preserved.
+  }
+  eapply exec_Iload_notrap1; eauto.
+  constructor; auto using same_rs_write.
+- (* load notrap2 *)
+  econstructor; split.
+  assert (eval_addressing tge sp addr rs' ## args = Some a).
+  { rewrite <- H0.
+    rewrite (same_rs_subst rs rs' args SAME).
+    apply eval_addressing_preserved. exact symbols_preserved.
+  }
+  eapply exec_Iload_notrap2; eauto.
+  constructor; auto using same_rs_write. 
+- (* store *)
+  econstructor; split.
+  assert (eval_addressing tge sp addr rs' ## args = Some a).
+  { rewrite <- H0.
+    rewrite (same_rs_subst rs rs' args SAME).
+    apply eval_addressing_preserved. exact symbols_preserved.
+  }
+  rewrite (same_rs_read rs rs' src SAME) in H1.
+  eapply exec_Istore; eauto.
+  constructor; auto.
+(* call *)
+- econstructor; split.
+  eapply exec_Icall with (fd := transf_fundef fd); eauto.
+  eapply find_function_translated; eauto.
+  { rewrite <- (same_find_function ge rs rs') by assumption.
+    assumption. }
+  apply sig_preserved.
+  rewrite (same_rs_subst rs rs' args SAME).  
+  constructor. constructor; auto. constructor; auto.
+(* tailcall *)
+- econstructor; split.
+  eapply exec_Itailcall with (fd := transf_fundef fd); eauto.
+    eapply find_function_translated; eauto.
+  { rewrite <- (same_find_function ge rs rs') by assumption.
+    assumption. }
+    apply sig_preserved.
+  rewrite (same_rs_subst rs rs' args SAME).  
+  constructor. auto.
+(* builtin *)
+- econstructor; split.
+  eapply exec_Ibuiltin; eauto.
+  eapply eval_builtin_args_preserved with (ge1 := ge); eauto. exact symbols_preserved.
+  {
+    replace (fun r : positive => rs' # r) with (fun r : positive => rs # r).
+    eassumption.
+    apply functional_extensionality.
+    auto using same_rs_read.
+  }
+  eapply external_call_symbols_preserved; eauto. apply senv_preserved.
+  constructor; auto.
+  auto using same_rs_setres.
+(* cond *)
+- econstructor; split.
+  eapply exec_Icond; eauto.
+  rewrite <- (same_rs_subst rs rs' args SAME); eassumption.
+  constructor; auto.
+(* jumptbl *)
+- econstructor; split.
+  eapply exec_Ijumptable; eauto.
+  rewrite <- (same_rs_read rs rs' arg SAME); eassumption.
+  constructor; auto.
+(* return *)
+- econstructor; split.
+  eapply exec_Ireturn; eauto.
+  destruct or; cbn.
+  + rewrite <- (same_rs_read rs rs' r SAME) by auto.
+    constructor; auto.
+  + constructor; auto.
+(* internal function *)
+-  simpl. econstructor; split.
+  eapply exec_function_internal; eauto.
+  constructor; auto.
+  cbn.
+  apply same_rs_refl.
+(* external function *)
+- econstructor; split.
+  eapply exec_function_external; eauto.
+    eapply external_call_symbols_preserved; eauto. apply senv_preserved.
+  constructor; auto.
+(* return *)
+- inv STACKS. inv H1.
+  econstructor; split.
+  eapply exec_return; eauto.
+  constructor; auto using same_rs_write.
+Qed.
+
+Lemma transf_initial_states:
+  forall S1, RTL.initial_state prog S1 ->
+  exists S2, RTL.initial_state tprog S2 /\ match_states S1 S2.
+Proof.
+  intros. inv H. econstructor; split.
+  econstructor.
+    eapply (Genv.init_mem_transf TRANSL); eauto.
+    rewrite symbols_preserved. rewrite (match_program_main TRANSL). eauto.
+    eapply function_ptr_translated; eauto.
+    rewrite <- H3; apply sig_preserved.
+  constructor. constructor.
+Qed.
+
+Lemma transf_final_states:
+  forall S1 S2 r, match_states S1 S2 -> RTL.final_state S1 r -> RTL.final_state S2 r.
+Proof.
+  intros. inv H0. inv H. inv STACKS. constructor.
+Qed.
+
+Theorem transf_program_correct:
+  forward_simulation (RTL.semantics prog) (RTL.semantics tprog).
+Proof.
+  eapply forward_simulation_step.
+  apply senv_preserved.
+  eexact transf_initial_states.
+  eexact transf_final_states.
+  exact step_simulation.
+Qed.
+
+End PRESERVATION.
diff --git a/backend/LICMaux.ml b/backend/LICMaux.ml
index c3907809..0ca4418b 100644
--- a/backend/LICMaux.ml
+++ b/backend/LICMaux.ml
@@ -19,6 +19,62 @@ open Inject;;
 
 type reg = P.t;;
 
+(** get_loop_headers moved from Duplicateaux.ml to LICMaux.ml to prevent cycle dependencies *)
+let debug_flag = ref false
+
+let debug fmt =
+  if !debug_flag then Printf.eprintf fmt
+  else Printf.ifprintf stderr fmt
+
+type vstate = Unvisited | Processed | Visited
+
+let get_some = function
+| None -> failwith "Did not get some"
+| Some thing -> thing
+
+let rtl_successors = function
+| Itailcall _ | Ireturn _ -> []
+| Icall(_,_,_,_,n) | Ibuiltin(_,_,_,n) | Inop n | Iop (_,_,_,n)
+| Iload (_,_,_,_,_,n) | Istore (_,_,_,_,n) -> [n]
+| Icond (_,_,n1,n2,_) -> [n1; n2]
+| Ijumptable (_,ln) -> ln
+
+(** Getting loop branches with a DFS visit :
+  * Each node is either Unvisited, Visited, or Processed
+  * pre-order: node becomes Processed
+  * post-order: node becomes Visited
+  *
+  * If we come accross an edge to a Processed node, it's a loop!
+  *)
+let get_loop_headers code entrypoint = begin
+  debug "get_loop_headers\n";
+  let visited = ref (PTree.map (fun n i -> Unvisited) code)
+  and is_loop_header = ref (PTree.map (fun n i -> false) code)
+  in let rec dfs_visit code = function
+  | [] -> ()
+  | node :: ln ->
+      match (get_some @@ PTree.get node !visited) with
+      | Visited -> ()
+      | Processed -> begin
+          debug "Node %d is a loop header\n" (P.to_int node);
+          is_loop_header := PTree.set node true !is_loop_header;
+          visited := PTree.set node Visited !visited
+        end
+      | Unvisited -> begin
+          visited := PTree.set node Processed !visited;
+          match PTree.get node code with
+          | None -> failwith "No such node"
+          | Some i -> let next_visits = rtl_successors i in dfs_visit code next_visits;
+          visited := PTree.set node Visited !visited;
+          dfs_visit code ln
+        end
+  in begin
+    dfs_visit code [entrypoint];
+    !is_loop_header
+  end
+end
+
+
 module Dominator =
   struct
     type t = Unreachable | Dominated of int | Multiple
@@ -57,7 +113,7 @@ let apply_dominator (is_marked : node -> bool) (pc : node)
 
 let dominated_parts1 (f : coq_function) :
       (bool PTree.t) * (Dominator.t PMap.t option) =
-  let headers = Duplicateaux.get_loop_headers f.fn_code f.fn_entrypoint in
+  let headers = get_loop_headers f.fn_code f.fn_entrypoint in
   let dominated = Dominator_Solver.fixpoint f.fn_code RTL.successors_instr
     (apply_dominator (fun pc -> match PTree.get pc headers with
                                 | Some x -> x
@@ -248,7 +304,7 @@ let print_dominated_parts1 oc f =
        (PTree.elements f.fn_code);;
   
 let loop_headers (f : coq_function) : RTL.node list =
-  List.map fst (List.filter snd (PTree.elements (Duplicateaux.get_loop_headers f.fn_code f.fn_entrypoint)));;
+  List.map fst (List.filter snd (PTree.elements (get_loop_headers f.fn_code f.fn_entrypoint)));;
 
 let print_loop_headers f =
   print_endline "Loop headers";
diff --git a/doc/index-kvx.html b/doc/index-kvx.html
index 97eefc24..6906c212 100644
--- a/doc/index-kvx.html
+++ b/doc/index-kvx.html
@@ -34,8 +34,8 @@ a:active {color : Red; text-decoration : underline; }
   The unmodified parts of this table appear in <font color=gray>gray</font>.
   <br>
   <br>
-  A high-level view of this backend of CompCert is provided by this HAL preprint of Six, Boulm&eacute; and Monniaux (2019):
-  <div><a href=https://hal.archives-ouvertes.fr/hal-02185883>Certified Compiler Backends for VLIW Processors (Highly Modular Postpass-Scheduling in the CompCert Certified Compiler)</a></div>
+  A high-level view of this CompCert backend is provided by this OOSPLA'20 paper (of Six, Boulm&eacute; and Monniaux):
+  <div><a href=https://hal.archives-ouvertes.fr/hal-02185883>Certified and Efficient Instruction Scheduling. Application to Interlocked VLIW Processors.</a></div>
   <br>
   Our source code is available on our <a href=https://gricad-gitlab.univ-grenoble-alpes.fr/certicompil/compcert-kvx>GitLab public repository</a> (see conditions in the LICENSE file).
 </p>
diff --git a/driver/Clflags.ml b/driver/Clflags.ml
index 829af76a..d5f3aca5 100644
--- a/driver/Clflags.ml
+++ b/driver/Clflags.ml
@@ -33,13 +33,21 @@ let option_fcse3_across_calls = ref false
 let option_fcse3_across_merges = ref true
 let option_fcse3_glb = ref true
 let option_fredundancy = ref true
-let option_fduplicate = ref (-1)
-let option_finvertcond = ref true
-let option_ftracelinearize = ref false
+
+(** Options relative to superblock scheduling *)
+let option_fpredict = ref true (* insert static branch prediction information, and swaps ifso/ifnot branches accordingly *)
+let option_ftailduplicate = ref 0 (* perform tail duplication for blocks of size n *)
+let option_ftracelinearize = ref true (* uses branch prediction information to improve the linearization *)
+let option_funrollsingle = ref 0 (* unroll a single iteration of innermost loops of size n *)
+let option_funrollbody = ref 0 (* unroll the body of innermost loops of size n *)
+
+(* Scheduling *)
 let option_fprepass = ref false
 let option_fprepass_sched = ref "list"
+
 let option_fpostpass = ref true
 let option_fpostpass_sched = ref "list"
+
 let option_fifconversion = ref true
 let option_Obranchless = ref false
 let option_falignfunctions = ref (None: int option)
diff --git a/driver/Compopts.v b/driver/Compopts.v
index d576ede6..540e8922 100644
--- a/driver/Compopts.v
+++ b/driver/Compopts.v
@@ -27,9 +27,6 @@ Parameter generate_float_constants: unit -> bool.
 (** For value analysis.  Currently always false. *)
 Parameter va_strict: unit -> bool.
 
-(** Flag -fduplicate. Branch prediction annotation + tail duplication *)
-Parameter optim_duplicate: unit -> bool.
-
 (** Flag -ftailcalls.  For tail call optimization. *)
 Parameter optim_tailcalls: unit -> bool.
 
diff --git a/driver/Driver.ml b/driver/Driver.ml
index fef9c166..e5fc78f8 100644
--- a/driver/Driver.ml
+++ b/driver/Driver.ml
@@ -210,15 +210,12 @@ Processing options:
   -fpostpass     Perform postpass scheduling (only for K1 architecture) [on] 
   -fpostpass= <optim> Perform postpass scheduling with the specified optimization [list]
                    (<optim>=list: list scheduling, <optim>=ilp: ILP, <optim>=greedy: just packing bundles)
-  -fduplicate <nb_nodes> Perform tail duplication to form superblocks on predicted traces
-    nb_nodes control the heuristic deciding to duplicate or not
-    A value of -1 desactivates the entire pass (including branch prediction)
-    A value of 0 desactivates the duplication (but activates the branch prediction)
-    FIXME : this is desactivated by default for now
-    -finvertcond    Invert conditions based on predicted paths (to prefer fallthrough).
-                    Requires -fduplicate to be also activated [on]
-    -ftracelinearize Linearizes based on the traces identified by duplicate phase
-                    It is heavily recommended to activate -finvertcond with this pass [off]
+  -fpredict      Insert static branch prediction information [on]
+                 Also swaps ifso/ifnot branches accordingly at RTL level
+  -ftailduplicate n Perform tail duplication for RTL code blocks of size n (not counting Inops) [0]
+  -ftracelinearize  Uses branch prediction information to improve the Linearize [on]
+  -funrollsingle n  Unrolls a single iteration of innermost loops of size n (not counting Inops) [0]
+  -funrollbody n  Unrolls once the body of innermost loops of size n (not counting Inops) [0]
   -fforward-moves   Forward moves after CSE
   -finline       Perform inlining of functions [on]
   -finline-functions-called-once Integrate functions only required by their
@@ -283,6 +280,7 @@ let dump_mnemonics destfile =
 let optimization_options = [
     option_ftailcalls; option_fifconversion; option_fconstprop;
     option_fcse; option_fcse2; option_fcse3;
+    option_fpredict; option_ftracelinearize;
     option_fpostpass;
     option_fredundancy; option_finline; option_finline_functions_called_once;
 ]
@@ -424,8 +422,10 @@ let cmdline_actions =
   @ f_opt "redundancy" option_fredundancy
   @ f_opt "prepass" option_fprepass
   @ f_opt "postpass" option_fpostpass
-  @ [ Exact "-fduplicate", Integer (fun n -> option_fduplicate := n) ]
-  @ f_opt "invertcond" option_finvertcond
+  @ [ Exact "-ftailduplicate", Integer (fun n -> option_ftailduplicate := n) ]
+  @ f_opt "predict" option_fpredict
+  @ [ Exact "-funrollsingle", Integer (fun n -> option_funrollsingle := n) ]
+  @ [ Exact "-funrollbody", Integer (fun n -> option_funrollbody := n) ]
   @ f_opt "tracelinearize" option_ftracelinearize
   @ f_opt_str "prepass" option_fprepass option_fprepass_sched
   @ f_opt_str "postpass" option_fpostpass option_fpostpass_sched
diff --git a/extraction/extraction.v b/extraction/extraction.v
index e43594fc..bd396cd8 100644
--- a/extraction/extraction.v
+++ b/extraction/extraction.v
@@ -111,8 +111,6 @@ Extract Constant Compopts.generate_float_constants =>
   "fun _ -> !Clflags.option_ffloatconstprop >= 2".
 Extract Constant Compopts.optim_tailcalls =>
   "fun _ -> !Clflags.option_ftailcalls".
-Extract Constant Compopts.optim_duplicate =>
-  "fun _ -> (if !Clflags.option_fduplicate = -1 then false else true)".
 Extract Constant Compopts.optim_constprop =>
   "fun _ -> !Clflags.option_fconstprop".
 Extract Constant Compopts.optim_CSE =>
diff --git a/kvx/SelectOp.vp b/kvx/SelectOp.vp
index 9e5d45a0..65dba3ac 100644
--- a/kvx/SelectOp.vp
+++ b/kvx/SelectOp.vp
@@ -103,8 +103,14 @@ Nondetfunction select0 (ty : typ) (cond0 : condition0) (e1 e2 e3: expr) :=
   | _, _, _ => (Eop (Osel cond0 ty) (e1 ::: e2 ::: e3 ::: Enil))
   end.
 
+Definition same_expr_pure (e1 e2: expr) :=
+  match e1, e2 with
+  | Evar v1, Evar v2 => if ident_eq v1 v2 then true else false
+  | _, _ => false
+  end.
+
 Definition select (ty : typ) (cond : condition) (args : exprlist) (e1 e2: expr)  : option expr :=
-  Some(
+  Some (if same_expr_pure e1 e2 then e1 else
   match cond_to_condition0 cond args with
   | None => select0 ty (Ccomp0 Cne) e1 e2 (Eop (Ocmp cond) args)
   | Some(cond0, ec) => select0 ty cond0 e1 e2 ec
@@ -356,12 +362,6 @@ Nondetfunction orimm (n1: int) (e2: expr) :=
        | _ => Eop (Oorimm n1) (e2:::Enil)
   end.
 
-Definition same_expr_pure (e1 e2: expr) :=
-  match e1, e2 with
-  | Evar v1, Evar v2 => if ident_eq v1 v2 then true else false
-  | _, _ => false
-  end.
-
 Nondetfunction or (e1: expr) (e2: expr) :=
   match e1, e2 with
   | Eop (Ointconst n1) Enil, t2 => orimm n1 t2
diff --git a/kvx/SelectOpproof.v b/kvx/SelectOpproof.v
index c23ed601..7a301929 100644
--- a/kvx/SelectOpproof.v
+++ b/kvx/SelectOpproof.v
@@ -1547,6 +1547,15 @@ Proof.
   intros until b.
   intro Hop; injection Hop; clear Hop; intro; subst a.
   intros HeL He1 He2 HeC.
+  destruct same_expr_pure eqn:SAME.
+  {
+    destruct (eval_same_expr a1 a2 le v1 v2 SAME He1 He2) as [EQ1 EQ2].
+    subst a2. subst v2.
+    exists v1; split; trivial.
+    cbn.
+    rewrite if_same.
+    apply Val.lessdef_normalize.
+  }
   unfold cond_to_condition0.
   destruct (cond_to_condition0_match cond al).
   {
diff --git a/lib/Coqlib.v b/lib/Coqlib.v
index 02c5d07f..16d880fa 100644
--- a/lib/Coqlib.v
+++ b/lib/Coqlib.v
@@ -1325,3 +1325,9 @@ Lemma nlist_forall2_imply:
 Proof.
   induction 1; simpl; intros; constructor; auto.
 Qed.
+
+Lemma if_same : forall {T : Type} (b : bool) (x : T),
+    (if b then x else x) = x.
+Proof.
+  destruct b; trivial.
+Qed.
diff --git a/runtime/include/ccomp_kvx_fixes.h b/runtime/include/ccomp_kvx_fixes.h
index 65d65e7b..a518a069 100644
--- a/runtime/include/ccomp_kvx_fixes.h
+++ b/runtime/include/ccomp_kvx_fixes.h
@@ -33,13 +33,26 @@ extern __int128 __compcert_acswapd(void *address, unsigned long long new_value,
 #define __builtin_kvx_acswapw __compcert_acswapw
 extern __int128 __compcert_acswapw(void *address, unsigned long long new_value, unsigned long long old_value);
 
+#define __builtin_kvx_aladdd __compcert_aladdd
+extern long long __compcert_aladdd(void *address, unsigned long long incr);
+
+#define __builtin_kvx_aladdw __compcert_aladdw
+extern int __compcert_aladdw(void *address, unsigned int incr);
+
 #define __builtin_kvx_afaddd __compcert_afaddd
 extern long long __compcert_afaddd(void *address, unsigned long long incr);
 
 #define __builtin_kvx_afaddw __compcert_afaddw
 extern int __compcert_afaddw(void *address, unsigned int incr);
-#endif
+
+#define __builtin_kvx_ld __compcert_ld
+extern int __compcert_ld(void *address, const char *str, const int b);
+
+#define __builtin_kvx_lwz __compcert_lwz
+extern int __compcert_lwz(void *address, const char *str, const int b);
 
 /* #define __builtin_expect(x, y) (x) */
 #define __builtin_ctz(x) __builtin_kvx_ctzw(x)
 #define __builtin_clz(x) __builtin_kvx_clzw(x)
+
+#endif
diff --git a/test/kvx/sort/Makefile b/test/kvx/sort/Makefile
index 1afab6e9..46a8f025 100644
--- a/test/kvx/sort/Makefile
+++ b/test/kvx/sort/Makefile
@@ -3,7 +3,7 @@ CC ?= gcc
 CCOMP ?= ccomp
 CFLAGS ?= -O2
 SIMU ?= kvx-mppa
-TIMEOUT ?= 10s
+TIMEOUT ?= 20s
 
 KVXCPATH=$(shell which $(KVXC))
 CCPATH=$(shell which $(CC))
diff --git a/test/monniaux/loop_nest/syrk.c b/test/monniaux/loop_nest/syrk.c
new file mode 100644
index 00000000..490d0a01
--- /dev/null
+++ b/test/monniaux/loop_nest/syrk.c
@@ -0,0 +1,28 @@
+/* Include polybench common header. */
+#include "polybench.h"
+
+/* Include benchmark-specific header. */
+/* Default data type is double, default size is 4000. */
+#include "syrk.h"
+
+/* Main computational kernel. The whole function will be timed,
+   including the call and return. */
+void kernel_syrk(int ni, int nj,
+		 DATA_TYPE alpha,
+		 DATA_TYPE beta,
+		 DATA_TYPE POLYBENCH_2D(C,NI,NI,ni,ni),
+		 DATA_TYPE POLYBENCH_2D(A,NI,NJ,ni,nj))
+{
+  int i, j, k;
+
+  /*  C := alpha*A*A' + beta*C */
+#if 0
+  for (i = 0; i < _PB_NI; i++)
+    for (j = 0; j < _PB_NI; j++)
+      C[i][j] *= beta;
+#endif
+  for (i = 0; i < _PB_NI; i++)
+    for (j = 0; j < _PB_NI; j++)
+      for (k = 0; k < _PB_NJ; k++)
+	  C[i][j] += alpha * A[i][k] * A[j][k];
+}
diff --git a/tools/compiler_expand.ml b/tools/compiler_expand.ml
index c37880d5..7e1e0181 100644
--- a/tools/compiler_expand.ml
+++ b/tools/compiler_expand.ml
@@ -20,13 +20,14 @@ TOTAL, (Option "profile_arcs"), (Some "Profiling insertion"), "Profiling";
 TOTAL, (Option "branch_probabilities"), (Some "Profiling use"), "ProfilingExploit";
 TOTAL, (Option "optim_move_loop_invariants"), (Some "Inserting initial nop"), "FirstNop";
 TOTAL, Always, (Some "Renumbering"), "Renumber";
-PARTIAL, (Option "optim_duplicate"),  (Some "Tail-duplicating"), "Duplicate";
+PARTIAL, (Option "optim_CSE"), (Some "CSE"), "CSE";
+PARTIAL, Always, (Some "Duplicating blocks"), "Duplicate";
 TOTAL, Always, (Some "Renumbering pre constprop"), "Renumber";
 TOTAL, (Option "optim_constprop"), (Some "Constant propagation"), "Constprop";
 TOTAL, Always, (Some "Renumbering pre CSE"), "Renumber";
-PARTIAL, (Option "optim_CSE"), (Some "CSE"), "CSE";
 TOTAL, (Option "optim_CSE2"), (Some "CSE2"), "CSE2";
 PARTIAL, (Option "optim_CSE3"), (Some "CSE3"), "CSE3";
+TOTAL, (Option "optim_CSE3"), (Some "Kill useless moves after CSE3"), "KillUselessMoves";
 TOTAL, (Option "optim_forward_moves"), (Some "Forwarding moves"), "ForwardMoves";
 PARTIAL, (Option "optim_redundancy"), (Some "Redundancy elimination"), "Deadcode";
 PARTIAL, (Option "optim_move_loop_invariants"), (Some "LICM"), "LICM";
author	David Monniaux <david.monniaux@univ-grenoble-alpes.fr>	2020-10-18 22:49:10 +0200
committer	David Monniaux <david.monniaux@univ-grenoble-alpes.fr>	2020-10-18 22:49:10 +0200
commit	996a2e5bbc4826d95144b62f5218b6e3e1e7d881 (patch)
tree	55a5136c68e7d0f7a304913f28d0a761d9facd9a
parent	8c3a2bdb56eba8d8bc5e359b01a320916eac85f0 (diff)
parent	a2f31f2b886ccb9656a019db1780aabc1789368a (diff)
download	compcert-kvx-996a2e5bbc4826d95144b62f5218b6e3e1e7d881.tar.gz compcert-kvx-996a2e5bbc4826d95144b62f5218b6e3e1e7d881.zip