24 files changed, 1565 insertions, 440 deletions
diff --git a/Makefile b/Makefile
index c66395fa..521c3297 100644
--- a/Makefile
+++ b/Makefile
@@ -83,7 +83,7 @@ BACKEND=\
   Profiling.v Profilingproof.v \
   ProfilingExploit.v ProfilingExploitproof.v \
   Renumber.v Renumberproof.v \
-  Duplicate.v Duplicateproof.v \
+  Duplicate.v Duplicateproof.v Duplicatepasses.v \
   RTLtyping.v \
   Kildall.v Liveness.v \
   ValueDomain.v ValueAOp.v ValueAnalysis.v \
diff --git a/README.md b/README.md
index 377776ca..58a7d052 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ This is a special version with additions from Verimag and Kalray :
 
 * A backend for the KVX processor: see [`README_Kalray.md`](README_Kalray.md) for details.
 * Some general-purpose optimization phases (e.g. profiling).
- - see [`PROFILING.md`](PROFILING.md) for details on the profiling system
+  * see [`PROFILING.md`](PROFILING.md) for details on the profiling system
 	
 The people responsible for this version are
 
@@ -29,10 +29,12 @@ The people responsible for this version are
 * David Monniaux (CNRS, Verimag)
 * Cyril Six (Kalray)
 
-## Papers on this CompCert version
+## Papers, docs, etc on this CompCert version
 
-* [a 5-minutes video](http://www-verimag.imag.fr/~boulme/videos/poster-oopsla20.mp4) by C. Six, presenting the postpass scheduling and the KVX backend.
+* [a 5-minutes video](http://www-verimag.imag.fr/~boulme/videos/poster-oopsla20.mp4) by C. Six, presenting the postpass scheduling and the KVX backend
+(also on [YouTube if you need subtitles](https://www.youtube.com/watch?v=RAzMDS9OVSw)).
 * [Certified and Efficient Instruction Scheduling](https://hal.archives-ouvertes.fr/hal-02185883), an OOPSLA'20 paper, by Six, Boulmé and Monniaux.
+* [the documentation of the KVX backend Coq sources](https://certicompil.gricad-pages.univ-grenoble-alpes.fr/compcert-kvx)
 
 ## License
 CompCert is not free software.  This non-commercial release can only
diff --git a/aarch64/Asm.v b/aarch64/Asm.v
index 5dafb01f..8d3d4ff7 100644
--- a/aarch64/Asm.v
+++ b/aarch64/Asm.v
@@ -1093,9 +1093,9 @@ Definition exec_instr (f: function) (i: instruction) (rs: regset) (m: mem) : out
   | Pfmov rd r1 =>
       Next (nextinstr (rs#rd <- (rs#r1))) m
   | Pfmovimms rd f =>
-      Next (nextinstr (rs#rd <- (Vsingle f))) m
+      Next (nextinstr (rs#rd <- (Vsingle f))#X16 <- Vundef) m
   | Pfmovimmd rd f =>
-      Next (nextinstr (rs#rd <- (Vfloat f))) m
+      Next (nextinstr (rs#rd <- (Vfloat f))#X16 <- Vundef) m
   | Pfmovi S rd r1 =>
       Next (nextinstr (rs#rd <- (float32_of_bits rs##r1))) m
   | Pfmovi D rd r1 =>
diff --git a/backend/CSE3analysis.v b/backend/CSE3analysis.v
index 7316c9a9..5ed04bc4 100644
--- a/backend/CSE3analysis.v
+++ b/backend/CSE3analysis.v
@@ -282,12 +282,14 @@ Section OPERATIONS.
 
   Definition oper2 (dst : reg) (op: sym_op)(args : list reg)
            (rel : RELATION.t) : RELATION.t :=
-    let rel' := kill_reg dst rel in
     match eq_find {| eq_lhs := dst;
                      eq_op  := op;
                      eq_args:= args |} with
-    | Some id => PSet.add id rel'
-    | None => rel'
+    | Some id =>
+      if PSet.contains rel id
+      then rel
+      else PSet.add id (kill_reg dst rel)
+    | None => kill_reg dst rel
     end.
 
   Definition oper1 (dst : reg) (op: sym_op) (args : list reg)
@@ -324,18 +326,21 @@ Section OPERATIONS.
       | _ => kill_reg dst rel
       end
     else
-      if is_trivial_sym_op op
-      then kill_reg dst rel
-      else
-        let args' := forward_move_l rel args in
-        match rhs_find op args' rel with
-        | Some r =>
-          if Compopts.optim_CSE3_glb tt
-          then RELATION.glb (move r dst rel)
-                            (oper1 dst op args' rel)
-          else oper1 dst op args' rel
-        | None => oper1 dst op args' rel
-        end.
+      let args' := forward_move_l rel args in
+      match rhs_find op args rel with
+      | Some r =>
+        if Compopts.optim_CSE3_glb tt
+        then RELATION.glb (move r dst rel)
+                          (RELATION.glb
+                             (oper1 dst op args rel)
+                             (oper1 dst op args' rel))
+        else RELATION.glb
+               (oper1 dst op args rel)
+               (oper1 dst op args' rel)
+      | None => RELATION.glb
+                  (oper1 dst op args rel)
+                  (oper1 dst op args' rel)
+      end.
   
   Definition clever_kill_store
              (chunk : memory_chunk) (addr: addressing) (args : list reg)
@@ -378,11 +383,21 @@ Section OPERATIONS.
       end
     else rel'.
     
-  Definition store
+  Definition store (tenv : typing_env)
              (chunk : memory_chunk) (addr: addressing) (args : list reg)
-             (src : reg) (ty: typ)
+             (src : reg)
              (rel : RELATION.t) : RELATION.t :=
-    store1 chunk addr (forward_move_l rel args) (forward_move rel src) ty rel.
+    let args' := forward_move_l rel args in
+    let src' := forward_move rel src in
+    let tsrc := tenv src in
+    let tsrc' := tenv src' in
+    RELATION.glb
+      (RELATION.glb
+         (store1 chunk addr args src tsrc rel)
+         (store1 chunk addr args' src tsrc rel))
+      (RELATION.glb
+         (store1 chunk addr args src' tsrc' rel)
+         (store1 chunk addr args' src' tsrc' rel)).
 
   Definition kill_builtin_res res rel :=
     match res with
@@ -425,7 +440,7 @@ Section OPERATIONS.
   | Icond _ _ _ _ _
   | Ijumptable _ _ => Some rel
   | Istore chunk addr args src _ =>
-    Some (store chunk addr args src (tenv (forward_move rel src)) rel)
+    Some (store tenv chunk addr args src rel)
   | Iop op args dst _ => Some (oper dst (SOp op) args rel)
   | Iload trap chunk addr args dst _ => Some (oper dst (SLoad chunk addr) args rel)
   | Icall _ _ _ dst _ => Some (kill_reg dst (kill_mem rel))
diff --git a/backend/CSE3analysisaux.ml b/backend/CSE3analysisaux.ml
index 3990b765..e37ef61f 100644
--- a/backend/CSE3analysisaux.ml
+++ b/backend/CSE3analysisaux.ml
@@ -14,7 +14,8 @@ open CSE3analysis
 open Maps
 open HashedSet
 open Camlcoq
-
+open Coqlib
+   
 let flatten_eq eq =
   ((P.to_int eq.eq_lhs), eq.eq_op, List.map P.to_int eq.eq_args);;
 
@@ -39,12 +40,12 @@ let print_reg channel i =
 let print_eq channel (lhs, sop, args) =
   match sop with
   | SOp op ->
-     Printf.printf "%a = %a\n" print_reg lhs (PrintOp.print_operation print_reg) (op, args)
+     Printf.printf "%a = %a" print_reg lhs (PrintOp.print_operation print_reg) (op, args)
   | SLoad(chunk, addr) ->
-     Printf.printf "%a = %s @ %a\n" print_reg lhs (string_of_chunk chunk)
+     Printf.printf "%a = %s @ %a" print_reg lhs (string_of_chunk chunk)
        (PrintOp.print_addressing print_reg) (addr, args);;
 
-let pp_set oc s =
+let pp_intset oc s =
   Printf.fprintf oc "{ ";
   List.iter (fun i -> Printf.fprintf oc "%d; " (P.to_int i)) (PSet.elements s);
   Printf.fprintf oc "}";;
@@ -70,6 +71,102 @@ let pp_option pp oc = function
 let is_trivial eq =
   (eq.eq_op = SOp Op.Omove) && (eq.eq_args = [eq.eq_lhs]);;
 
+let rec pp_list separator pp_item chan = function
+  | [] -> ()
+  | [h] -> pp_item chan h
+  | h::t ->
+     pp_item chan h;
+     output_string chan separator;
+     pp_list separator pp_item chan t;;
+
+let pp_set separator pp_item chan s =
+  pp_list separator pp_item chan (PSet.elements s);;
+
+let pp_equation hints chan x =
+  match PTree.get x hints.hint_eq_catalog with
+  | None -> output_string chan "???"
+  | Some eq ->
+     print_eq chan  (P.to_int eq.eq_lhs, eq.eq_op, List.map P.to_int eq.eq_args);;
+
+let pp_relation hints chan rel =
+  pp_set "; " (pp_equation hints) chan rel;;
+
+let pp_relation_b hints chan = function
+  | None -> output_string chan "bot"
+  | Some rel -> pp_relation hints chan rel;;
+
+let pp_results f (invariants : RB.t PMap.t) hints chan =
+  let max_pc = P.to_int (RTL.max_pc_function f) in
+  for pc=max_pc downto 1
+  do
+    Printf.fprintf chan "%d: %a\n\n" pc
+      (pp_relation_b hints) (PMap.get (P.of_int pc) invariants)
+  done
+
+module IntSet=Set.Make(struct type t=int let compare = ( - ) end);;
+
+let rec union_list prev = function
+  | [] -> prev
+  | h::t -> union_list (RB.lub prev h) t;;
+
+let rb_glb (x : RB.t) (y : RB.t) : RB.t =
+  match x, y with
+  | None, _ | _, None -> None
+  | (Some x'), (Some y') -> Some (RELATION.glb x' y');;
+
+let refine_invariants
+      (nodes : RTL.node list)
+      (entrypoint : RTL.node)
+      (successors : RTL.node -> RTL.node list)
+      (predecessors : RTL.node -> RTL.node list)
+      (tfr : RTL.node -> RB.t -> RB.t) (invariants0 : RB.t PMap.t) =
+  let todo = ref IntSet.empty
+  and invariants = ref invariants0 in  
+  let add_todo (pc : RTL.node) =
+    todo := IntSet.add (P.to_int pc) !todo in 
+  let update_node (pc : RTL.node) =
+    (if !Clflags.option_debug_compcert > 9
+     then Printf.printf "updating node %d\n" (P.to_int pc));
+    if not (peq pc entrypoint)
+    then
+      let cur = PMap.get pc !invariants in
+      let nxt = union_list RB.bot
+                  (List.map
+                     (fun pred_pc->
+                       rb_glb cur
+                         (tfr pred_pc (PMap.get pred_pc !invariants)))
+                     (predecessors pc)) in
+      if not (RB.beq cur nxt)
+      then
+        begin
+          (if !Clflags.option_debug_compcert > 4
+           then Printf.printf "refining CSE3 node %d\n" (P.to_int pc));
+          List.iter add_todo (successors pc)
+        end in
+  (List.iter add_todo nodes);
+  while not (IntSet.is_empty !todo) do
+    let nxt = IntSet.max_elt !todo in
+    todo := IntSet.remove nxt !todo;
+    update_node (P.of_int nxt)
+  done;
+  !invariants;;
+
+let get_default default x ptree =
+  match PTree.get x ptree with
+  | None -> default
+  | Some y -> y;;
+
+let refine_analysis ctx tenv
+      (f : RTL.coq_function) (invariants0 : RB.t PMap.t) =
+  let succ_map = RTL.successors_map f in
+  let succ_f x = get_default [] x succ_map in
+  let pred_map = Kildall.make_predecessors f.RTL.fn_code RTL.successors_instr in
+  let pred_f x = get_default [] x pred_map in
+  let tfr = apply_instr' ctx tenv f.RTL.fn_code in
+  refine_invariants
+    (List.map fst (PTree.elements f.RTL.fn_code))
+    f.RTL.fn_entrypoint succ_f pred_f tfr invariants0;;
+  
 let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
   let cur_eq_id = ref 0
   and cur_catalog = ref PTree.empty
@@ -81,7 +178,8 @@ let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
   let eq_find_oracle node eq =
     assert (not (is_trivial eq));
     let o = Hashtbl.find_opt eq_table (flatten_eq eq) in
-    (if !Clflags.option_debug_compcert > 1
+    (if o = None then failwith "eq_find_oracle");
+    (if !Clflags.option_debug_compcert > 5
      then Printf.printf "@%d: eq_find %a -> %a\n" (P.to_int node)
             pp_eq eq (pp_option pp_P) o);
     o
@@ -90,9 +188,9 @@ let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
       match Hashtbl.find_opt rhs_table (sop, List.map P.to_int args) with
       | None -> PSet.empty
       | Some s -> s in
-    (if !Clflags.option_debug_compcert > 1
+    (if !Clflags.option_debug_compcert > 5
      then Printf.printf "@%d: rhs_find %a = %a\n"
-            (P.to_int node) pp_rhs (sop, args) pp_set o);
+            (P.to_int node) pp_rhs (sop, args) pp_intset o);
     o in
   let mutating_eq_find_oracle node eq : P.t option =
     let (flat_eq_lhs, flat_eq_op, flat_eq_args) as flat_eq = flatten_eq eq in
@@ -124,23 +222,29 @@ let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
          Some coq_id
        end
     in
-    (if !Clflags.option_debug_compcert > 1
+    (if !Clflags.option_debug_compcert > 5
      then Printf.printf "@%d: mutating_eq_find %a -> %a\n" (P.to_int node)
       pp_eq eq (pp_option pp_P) o);    
     o
   in
-  match
-    internal_analysis
-      { eq_catalog     = (fun eq_id -> PTree.get eq_id !cur_catalog);
-        eq_find_oracle = mutating_eq_find_oracle;
-        eq_rhs_oracle  = rhs_find_oracle ;
-        eq_kill_reg    = (fun reg -> PMap.get reg !cur_kill_reg);
-        eq_kill_mem    = (fun () -> !cur_kill_mem);
-        eq_moves       = (fun reg -> PMap.get reg !cur_moves)
-      } tenv f
+  let ctx = { eq_catalog     = (fun eq_id -> PTree.get eq_id !cur_catalog);
+              eq_find_oracle = mutating_eq_find_oracle;
+              eq_rhs_oracle  = rhs_find_oracle ;
+              eq_kill_reg    = (fun reg -> PMap.get reg !cur_kill_reg);
+              eq_kill_mem    = (fun () -> !cur_kill_mem);
+              eq_moves       = (fun reg -> PMap.get reg !cur_moves)
+            } in
+  match internal_analysis ctx tenv f
   with None -> failwith "CSE3analysisaux analysis failed, try re-running with -fno-cse3"
      | Some invariants ->
-        invariants,
-        { hint_eq_catalog    = !cur_catalog;
-          hint_eq_find_oracle= eq_find_oracle;
-          hint_eq_rhs_oracle = rhs_find_oracle };;
+        let invariants' =
+          if ! Clflags.option_fcse3_refine
+          then refine_analysis ctx tenv f invariants
+          else invariants
+        and hints = { hint_eq_catalog    = !cur_catalog;
+                      hint_eq_find_oracle= eq_find_oracle;
+                      hint_eq_rhs_oracle = rhs_find_oracle } in
+        (if !Clflags.option_debug_compcert > 1
+         then pp_results f invariants' hints stdout);
+        invariants', hints
+;;
diff --git a/backend/CSE3analysisproof.v b/backend/CSE3analysisproof.v
index 66b199cc..1e5b88c3 100644
--- a/backend/CSE3analysisproof.v
+++ b/backend/CSE3analysisproof.v
@@ -699,6 +699,27 @@ Section SOUNDNESS.
       + congruence.
   Qed.
 
+  Lemma arglist_idem_write:
+    forall { A : Type} args (rs : Regmap.t A) dst,
+      (rs # dst <- (rs # dst)) ## args = rs ## args.
+  Proof.
+    induction args; trivial.
+    intros. cbn.
+    f_equal; trivial.
+    apply Regmap.gsident.
+  Qed.
+  
+  Lemma sem_rhs_idem_write:
+    forall sop args rs dst m v,
+      sem_rhs sop args rs m v ->
+      sem_rhs sop args (rs # dst <- (rs # dst)) m v.
+  Proof.
+    intros.
+    unfold sem_rhs in *.
+    rewrite arglist_idem_write.
+    assumption.
+  Qed.
+  
   Theorem oper2_sound:
     forall no dst sop args rel rs m v,
       sem_rel rel rs m ->
@@ -726,6 +747,17 @@ Section SOUNDNESS.
       rewrite Regmap.gss.
       apply sem_rhs_depends_on_args_only; auto.
     }
+    intros INi.
+    destruct (PSet.contains rel e) eqn:CONTAINSe.
+    { pose proof (REL e {| eq_lhs := dst; eq_op := sop; eq_args := args |} CONTAINSe H) as RELe.
+      pose proof (REL i eq CONTAINS INi) as RELi.
+      unfold sem_eq in *.
+      cbn in RELe.
+      replace v with (rs # dst) by (eapply sem_rhs_det; eassumption).
+      rewrite Regmap.gsident.
+      apply sem_rhs_idem_write.
+      assumption.
+    }
     rewrite PSet.gaddo in CONTAINS by congruence.
     apply (kill_reg_sound rel rs m dst v REL i eq); auto.
   Qed.
@@ -821,24 +853,24 @@ Section SOUNDNESS.
       subst.
       rewrite <- (forward_move_sound rel rs m r) by auto.
       apply move_sound; auto.
-    - destruct (is_trivial_sym_op sop).
-      {
-        apply kill_reg_sound; auto.
-      }
-      destruct rhs_find as [src |] eqn:RHS_FIND.
+    - destruct rhs_find as [src |] eqn:RHS_FIND.
       + destruct (Compopts.optim_CSE3_glb tt).
         * apply sem_rel_glb; split.
-          ** pose proof (rhs_find_sound no sop (forward_move_l (ctx:=ctx) rel args) rel src rs m REL RHS_FIND) as SOUND.
-             eapply forward_move_rhs_sound in RHS.
-             2: eassumption.
+          ** pose proof (rhs_find_sound no sop args rel src rs m REL RHS_FIND) as SOUND.                                                                    
              rewrite <- (sem_rhs_det SOUND RHS).
              apply move_sound; auto.
+          ** apply sem_rel_glb; split.
+             *** apply oper1_sound; auto.
+             *** apply oper1_sound; auto.
+                 apply forward_move_rhs_sound; auto.
+        * apply sem_rel_glb; split.
+          ** apply oper1_sound; auto.
           ** apply oper1_sound; auto.
              apply forward_move_rhs_sound; auto.
-        * ** apply oper1_sound; auto.
-             apply forward_move_rhs_sound; auto.
-      + apply oper1_sound; auto.
-        apply forward_move_rhs_sound; auto.
+      + apply sem_rel_glb; split.
+        * apply oper1_sound; auto.
+        * apply oper1_sound; auto.
+          apply forward_move_rhs_sound; auto.
   Qed.
 
   Hint Resolve oper_sound : cse3.
@@ -927,22 +959,28 @@ Section SOUNDNESS.
   
   Hint Resolve store1_sound : cse3.
     
+    
   Theorem store_sound:
     forall no chunk addr args a src rel tenv rs m m',
       sem_rel rel rs m ->
       wt_regset tenv rs ->
       eval_addressing genv sp addr (rs ## args) = Some a ->
       Mem.storev chunk m a (rs#src) = Some m' ->
-      sem_rel (store (ctx:=ctx) no chunk addr args src (tenv (forward_move (ctx:=ctx) rel src)) rel) rs m'.
+      sem_rel (store (ctx:=ctx) no tenv chunk addr args src rel) rs m'.
   Proof.
     unfold store.
     intros until m'.
     intros REL WT ADDR STORE.
-    rewrite <- forward_move_l_sound with (rel:=rel) (m:=m) in ADDR by trivial.
-    rewrite <- forward_move_sound with (rel:=rel) (m:=m) in STORE by trivial.
-    apply store1_sound with (a := a) (m := m); trivial.
-    (* rewrite forward_move_sound with (rel:=rel) (m:=m) in STORE by trivial.
-    assumption. *)
+    apply sem_rel_glb; split.
+    - apply sem_rel_glb; split.
+      * apply store1_sound with (a := a) (m := m); trivial.
+      * rewrite <- forward_move_l_sound with (rel:=rel) (m:=m) in ADDR by trivial.
+        apply store1_sound with (a := a) (m := m); trivial.
+    - rewrite <- forward_move_sound with (rel:=rel) (m:=m) in STORE by trivial.
+      apply sem_rel_glb; split.
+      * apply store1_sound with (a := a) (m := m); trivial.
+      * rewrite <- forward_move_l_sound with (rel:=rel) (m:=m) in ADDR by trivial.
+        apply store1_sound with (a := a) (m := m); trivial.
   Qed.
 
   Hint Resolve store_sound : cse3.
diff --git a/backend/Duplicate.v b/backend/Duplicate.v
index 0e04b07d..7dea752b 100644
--- a/backend/Duplicate.v
+++ b/backend/Duplicate.v
@@ -18,14 +18,28 @@
 Require Import AST RTL Maps Globalenvs.
 Require Import Coqlib Errors Op.
 
-Local Open Scope error_monad_scope.
-Local Open Scope positive_scope.
 
-(** External oracle returning the new RTL code (entry point unchanged),
+
+Module Type DuplicateOracle.
+
+  (** External oracle returning the new RTL code (entry point unchanged),
     along with the new entrypoint, and a mapping of new nodes to old nodes *)
-Axiom duplicate_aux: function -> code * node * (PTree.t node).
+  Parameter duplicate_aux: function -> code * node * (PTree.t node).
+
+End DuplicateOracle.
+
+
 
-Extract Constant duplicate_aux => "Duplicateaux.duplicate_aux".
+Module Duplicate (D: DuplicateOracle).
+
+Export D.
+
+Definition duplicate_aux := duplicate_aux.
+
+(* Extract Constant duplicate_aux => "Duplicateaux.duplicate_aux". *)
+
+Local Open Scope error_monad_scope.
+Local Open Scope positive_scope.
 
 (** * Verification of node duplications *)
 
@@ -215,3 +229,5 @@ Definition transf_fundef (f: fundef) : res fundef :=
 
 Definition transf_program (p: program) : res program :=
   transform_partial_program transf_fundef p.
+
+End Duplicate.
diff --git a/backend/Duplicateaux.ml b/backend/Duplicateaux.ml
index eb9f42e0..76b5616b 100644
--- a/backend/Duplicateaux.ml
+++ b/backend/Duplicateaux.ml
@@ -30,7 +30,9 @@ let get_some = LICMaux.get_some
 let rtl_successors = LICMaux.rtl_successors
 
 (* Get list of nodes following a BFS of the code *)
-let bfs code entrypoint = begin
+(* Stops when predicate is reached
+ * Excludes any node given in excluded function *)
+let bfs_until code entrypoint (predicate: node->bool) (excluded: node->bool) = begin
   debug "bfs\n";
   let visited = ref (PTree.map (fun n i -> false) code)
   and bfs_list = ref []
@@ -40,20 +42,24 @@ let bfs code entrypoint = begin
     Queue.add entrypoint to_visit;
     while not (Queue.is_empty to_visit) do
       node := Queue.pop to_visit;
-      if not (get_some @@ PTree.get !node !visited) then begin
+      if (not (get_some @@ PTree.get !node !visited)) then begin
         visited := PTree.set !node true !visited;
-        match PTree.get !node code with
-        | None -> failwith "No such node"
-        | Some i ->
-            bfs_list := !node :: !bfs_list;
-            let succ = rtl_successors i in
-            List.iter (fun n -> Queue.add n to_visit) succ
+        if not (excluded !node) then begin
+          match PTree.get !node code with
+          | None -> failwith "No such node"
+          | Some i ->
+              bfs_list := !node :: !bfs_list;
+              if not (predicate !node) then
+                let succ = rtl_successors i in List.iter (fun n -> Queue.add n to_visit) succ
+        end
       end
     done;
     List.rev !bfs_list
   end
 end
 
+let bfs code entrypoint = bfs_until code entrypoint (fun _ -> false) (fun _ -> false)
+
 let optbool o = match o with Some _ -> true | None -> false
 
 let ptree_get_some n ptree = get_some @@ PTree.get n ptree
@@ -81,15 +87,7 @@ end
 
 module PSet = Set.Make(PInt)
 
-let print_intlist oc l =
-  let rec f oc = function
-  | [] -> ()
-  | n::ln -> (Printf.fprintf oc "%d %a" (P.to_int n) f ln)
-  in begin
-    if !debug_flag then begin
-      Printf.fprintf oc "[%a]" f l
-    end
-  end
+let print_intlist = LICMaux.print_intlist
 
 let print_intset s =
   let seq = PSet.to_seq s
@@ -403,19 +401,7 @@ let print_traces oc traces =
       Printf.fprintf oc "Traces: {%a}\n" f traces
   end
 
-(* Adapted from backend/PrintRTL.ml: print_function *)
-let print_code code = let open PrintRTL in let open Printf in
-  if (!debug_flag) then begin
-    fprintf stdout "{\n";
-    let instrs =
-      List.sort
-        (fun (pc1, _) (pc2, _) -> compare pc2 pc1)
-        (List.rev_map
-          (fun (pc, i) -> (P.to_int pc, i))
-          (PTree.elements code)) in
-    List.iter (print_instruction stdout) instrs;
-    fprintf stdout "}"
-  end
+let print_code code = LICMaux.print_code code
 
 (* Dumb (but linear) trace selection *)
 let select_traces_linear code entrypoint =
@@ -627,15 +613,17 @@ let invert_iconds code =
 
 type innerLoop = {
   preds: P.t list;
-  body: HashedSet.PSet.t;
+  body: P.t list;
   head: P.t; (* head of the loop *)
-  final: P.t (* the final instruction, which loops back to the head *)
+  finals: P.t list (* the final instructions, which loops back to the head *)
+  (* There may be more than one ; for instance if there is an if inside the loop with both
+   * branches leading to a goto backedge *)
 }
 
 let print_pset = LICMaux.pp_pset
 
 let print_inner_loop iloop =
-  debug "{preds: %a, body: %a}" print_intlist iloop.preds print_pset iloop.body
+  debug "{preds: %a, body: %a}" print_intlist iloop.preds print_intlist iloop.body
 
 let rec print_inner_loops = function
 | [] -> ()
@@ -657,6 +645,53 @@ let print_ptree printer pt =
 
 let print_pint oc i = if !debug_flag then Printf.fprintf oc "%d" (P.to_int i) else ()
 
+let cb_exit_node = function
+  | Icond (_,_,n1,n2,p) -> begin match p with
+      | Some true -> Some n2
+      | Some false -> Some n1
+      | None -> None
+    end
+  | _ -> None
+
+      (*
+(* Alternative code to get inner_loops - use it if we suspect the other function to be bugged *)
+let get_natural_loop code predmap n =
+  let is_final_node m =
+    let successors = rtl_successors @@ get_some @@ PTree.get m code in
+    List.exists (fun s -> (P.to_int s) == (P.to_int n)) successors
+  in 
+  let excluded_node = cb_exit_node @@ get_some @@ PTree.get n code in
+  let is_excluded m = match excluded_node with
+    | None -> false
+    | Some ex -> P.to_int ex == P.to_int m
+  in
+  debug "get_natural_loop for %d\n" (P.to_int n);
+  let body = bfs_until code n is_final_node is_excluded in
+  debug "BODY: %a\n" print_intlist body;
+  let final = List.find is_final_node body in
+  debug "FINAL: %d\n" (P.to_int final);
+  let preds = List.filter (fun pred -> List.mem pred body) @@ get_some @@ PTree.get n predmap in
+  debug "PREDS: %a\n" print_intlist preds;
+  { preds = preds; body = body; head = n; final = final }
+
+let rec count_loop_headers is_loop_header = function
+  | [] -> 0
+  | n :: ln ->
+      let rem = count_loop_headers is_loop_header ln in
+      if (get_some @@ PTree.get n is_loop_header) then rem + 1 else rem
+
+let get_inner_loops f code is_loop_header =
+  let predmap = get_predecessors_rtl code in
+  let iloops = ref [] in
+  List.iter (fun (n, ilh) -> if ilh then begin
+    let iloop = get_natural_loop code predmap n in
+    let nb_headers = count_loop_headers is_loop_header iloop.body in
+    if nb_headers == 1 then (* innermost loop *)
+      iloops := iloop :: !iloops end
+  ) (PTree.elements is_loop_header);
+  !iloops
+  *)
+
 let get_inner_loops f code is_loop_header =
   let fake_f = { fn_sig = f.fn_sig; fn_params = f.fn_params; 
     fn_stacksize = f.fn_stacksize; fn_code = code; fn_entrypoint = f.fn_entrypoint } in
@@ -674,17 +709,16 @@ let get_inner_loops f code is_loop_header =
           assert (List.length heads == 1);
           List.hd heads
         end in
-      let final = (* the predecessors from head that are in the body *)
+      let finals = (* the predecessors from head that are in the body *)
         let head_preds = ptree_get_some head predmap in
         let filtered = List.filter (fun n -> HashedSet.PSet.contains body n) head_preds in
         begin
           debug "HEAD: %d\n" (P.to_int head);
           debug "BODY: %a\n" print_pset body;
           debug "HEADPREDS: %a\n" print_intlist head_preds;
-          assert (List.length filtered == 1);
-          List.hd filtered
+          filtered
         end in
-      { preds = preds; body = body; head = head; final = final }
+      { preds = preds; body = (HashedSet.PSet.elements body); head = head; finals = finals }
     ) 
     (* LICMaux.inner_loops also returns non-inner loops, but with a body of 1 instruction
      * We remove those to get just the inner loops *)
@@ -710,6 +744,8 @@ let generate_revmap ln ln' revmap = generate_fwmap ln' ln revmap
 
 let apply_map fw n = P.of_int @@ ptree_get_some n fw
 
+let apply_map_list fw ln = List.map (apply_map fw) ln
+
 let apply_map_opt fw n =
   match PTree.get n fw with
   | Some n' -> P.of_int n'
@@ -769,7 +805,7 @@ let rec count_ignore_nops code = function
  * 3) Links the last instruction of body' into the first instruction of body
  *)
 let unroll_inner_loop_single code revmap iloop =
-  let body = HashedSet.PSet.elements (iloop.body) in
+  let body = iloop.body in
   if count_ignore_nops code body > !Clflags.option_funrollsingle then begin
     debug "Too many nodes in the loop body (%d > %d)" (List.length body) !Clflags.option_funrollsingle;
     (code, revmap)
@@ -777,12 +813,12 @@ let unroll_inner_loop_single code revmap iloop =
     let (code2, revmap2, dupbody, fwmap) = clone code revmap body in
     let code' = ref code2 in
     let head' = apply_map fwmap (iloop.head) in
-    let final' = apply_map fwmap (iloop.final) in
+    let finals' = apply_map_list fwmap (iloop.finals) in
     begin
       debug "PREDS: %a\n" print_intlist iloop.preds;
       debug "IHEAD: %d\n" (P.to_int iloop.head);
       code' := change_pointers !code' (iloop.head) head' (iloop.preds);
-      code' := change_pointers !code' head' (iloop.head) [final'];
+      code' := change_pointers !code' head' (iloop.head) finals';
       (!code', revmap2)
     end
 
@@ -806,7 +842,7 @@ let unroll_inner_loops_single f code revmap =
  * 3) Links the last instruction of body' into the first of body
  *)
 let unroll_inner_loop_body code revmap iloop =
-  let body = HashedSet.PSet.elements (iloop.body) in
+  let body = iloop.body in
   let limit = !Clflags.option_funrollbody in
   if count_ignore_nops code body > limit then begin
     debug "Too many nodes in the loop body (%d > %d)" (List.length body) limit;
@@ -815,16 +851,18 @@ let unroll_inner_loop_body code revmap iloop =
     let (code2, revmap2, dupbody, fwmap) = clone code revmap body in
     let code' = ref code2 in
     let head' = apply_map fwmap (iloop.head) in
-    let final' = apply_map fwmap (iloop.final) in
+    let finals' = apply_map_list fwmap (iloop.finals) in
     begin
-      code' := change_pointers !code' iloop.head head' [iloop.final];
-      code' := change_pointers !code' head' iloop.head [final']; 
+      code' := change_pointers !code' iloop.head head' iloop.finals;
+      code' := change_pointers !code' head' iloop.head finals'; 
       (!code', revmap2)
     end
 
 let unroll_inner_loops_body f code revmap =
   let is_loop_header = get_loop_headers code (f.fn_entrypoint) in
+  (* debug_flag := true; *)
   let inner_loops = get_inner_loops f code is_loop_header in
+  debug "Number of loops found: %d\n" (List.length inner_loops);
   let code' = ref code in
   let revmap' = ref revmap in
   begin
@@ -832,46 +870,102 @@ let unroll_inner_loops_body f code revmap =
     List.iter (fun iloop ->
       let (new_code, new_revmap) = unroll_inner_loop_body !code' !revmap' iloop in
       code' := new_code; revmap' := new_revmap
+    ) inner_loops; (* debug_flag := false; *)
+    (!code', !revmap')
+  end
+
+let extract_upto_icond f code head =
+  let rec extract h =
+    let inst = get_some @@ PTree.get h code in
+    match inst with
+    | Icond _ -> [h]
+    | _ -> ( match rtl_successors inst with
+        | [n] -> h :: (extract n)
+        | _ -> failwith "Found a node with more than one successor??"
+      )
+  in List.rev @@ extract head
+
+let rotate_inner_loop f code revmap iloop =
+  let header = extract_upto_icond f code iloop.head in
+  let limit = !Clflags.option_flooprotate in
+  if count_ignore_nops code header > limit then begin
+    debug "Loop Rotate: too many nodes to duplicate (%d > %d)" (List.length header) limit;
+    (code, revmap)
+  end else
+    let (code2, revmap2, dupheader, fwmap) = clone code revmap header in
+    let code' = ref code2 in
+    let head' = apply_map fwmap iloop.head in
+    begin
+      code' := change_pointers !code' iloop.head head' iloop.preds;
+      (!code', revmap2)
+    end
+
+let rotate_inner_loops f code revmap =
+  let is_loop_header = get_loop_headers code (f.fn_entrypoint) in
+  let inner_loops = get_inner_loops f code is_loop_header in
+  let code' = ref code in
+  let revmap' = ref revmap in
+  begin
+    print_inner_loops inner_loops;
+    List.iter (fun iloop ->
+      let (new_code, new_revmap) = rotate_inner_loop f !code' !revmap' iloop in
+      code' := new_code; revmap' := new_revmap
     ) inner_loops;
     (!code', !revmap')
   end
 
-let duplicate_aux f =
-  (* initializing *)
+let loop_rotate f =
   let entrypoint = f.fn_entrypoint in
   let code = f.fn_code in
   let revmap = make_identity_ptree code in
+  let (code, revmap) =
+    if !Clflags.option_flooprotate > 0 then
+      rotate_inner_loops f code revmap
+    else (code, revmap) in
+  ((code, entrypoint), revmap)
 
-  (* static prediction *)
+let static_predict f =
+  let entrypoint = f.fn_entrypoint in
+  let code = f.fn_code in
+  let revmap = make_identity_ptree code in
   let code =
     if !Clflags.option_fpredict then
       update_directions code entrypoint
     else code in
+  let code =
+    if !Clflags.option_fpredict then
+      invert_iconds code
+    else code in
+  ((code, entrypoint), revmap)
 
-  (* unroll single *)
+let unroll_single f =
+  let entrypoint = f.fn_entrypoint in
+  let code = f.fn_code in
+  let revmap = make_identity_ptree code in
   let (code, revmap) =
     if !Clflags.option_funrollsingle > 0 then
       unroll_inner_loops_single f code revmap
     else (code, revmap) in
+  ((code, entrypoint), revmap)
 
-  (* unroll body *)
+let unroll_body f =
+  let entrypoint = f.fn_entrypoint in
+  let code = f.fn_code in
+  let revmap = make_identity_ptree code in
   let (code, revmap) =
     if !Clflags.option_funrollbody > 0 then
       unroll_inner_loops_body f code revmap
     else (code, revmap) in
+  ((code, entrypoint), revmap)
 
-  (* static prediction bis *)
-  let code =
-    if !Clflags.option_fpredict then
-      invert_iconds code
-    else code in
-
-  (* tail duplication *)
+let tail_duplicate f =
+  let entrypoint = f.fn_entrypoint in
+  let code = f.fn_code in
+  let revmap = make_identity_ptree code in
   let (code, revmap) =
     if !Clflags.option_ftailduplicate > 0 then
       let traces = select_traces code entrypoint in
       let preds = get_predecessors_rtl code in
       superblockify_traces code preds traces revmap
     else (code, revmap) in
-
   ((code, entrypoint), revmap)
diff --git a/backend/Duplicatepasses.v b/backend/Duplicatepasses.v
new file mode 100644
index 00000000..7e58eedf
--- /dev/null
+++ b/backend/Duplicatepasses.v
@@ -0,0 +1,58 @@
+Require Import RTL.
+Require Import Maps.
+Require Import Duplicate.
+Require Import Duplicateproof.
+
+(** Static Prediction *)
+
+Module StaticPredictOracle <: DuplicateOracle.
+  Axiom duplicate_aux : function -> code * node * (PTree.t node).
+  Extract Constant duplicate_aux => "Duplicateaux.static_predict".
+End StaticPredictOracle.
+
+Module Staticpredictproof := DuplicateProof StaticPredictOracle.
+
+Module Staticpredict := Staticpredictproof.
+
+(** Unrolling one iteration out of the body *)
+
+Module UnrollSingleOracle <: DuplicateOracle.
+  Axiom duplicate_aux : function -> code * node * (PTree.t node).
+  Extract Constant duplicate_aux => "Duplicateaux.unroll_single".
+End UnrollSingleOracle.
+
+Module Unrollsingleproof := DuplicateProof UnrollSingleOracle.
+
+Module Unrollsingle := Unrollsingleproof.
+
+(** Unrolling the body of innermost loops *)
+
+Module UnrollBodyOracle <: DuplicateOracle.
+  Axiom duplicate_aux : function -> code * node * (PTree.t node).
+  Extract Constant duplicate_aux => "Duplicateaux.unroll_body".
+End UnrollBodyOracle.
+
+Module Unrollbodyproof := DuplicateProof UnrollBodyOracle.
+
+Module Unrollbody := Unrollbodyproof.
+
+(** Tail Duplication *)
+
+Module TailDuplicateOracle <: DuplicateOracle.
+ Axiom duplicate_aux : function -> code * node * (PTree.t node).
+ Extract Constant duplicate_aux =>  "Duplicateaux.tail_duplicate".
+End TailDuplicateOracle.
+
+Module Tailduplicateproof := DuplicateProof TailDuplicateOracle.
+
+Module Tailduplicate := Tailduplicateproof.
+
+(** Loop Rotate *)
+
+Module LoopRotateOracle <: DuplicateOracle.
+  Axiom duplicate_aux : function -> code * node * (PTree.t node).
+  Extract Constant duplicate_aux =>  "Duplicateaux.loop_rotate".
+End LoopRotateOracle.
+
+Module Looprotateproof := DuplicateProof LoopRotateOracle.
+Module Looprotate := Looprotateproof.
diff --git a/backend/Duplicateproof.v b/backend/Duplicateproof.v
index 62455076..2480ccf0 100644
--- a/backend/Duplicateproof.v
+++ b/backend/Duplicateproof.v
@@ -17,6 +17,9 @@ Require Import AST Linking Errors Globalenvs Smallstep.
 Require Import Coqlib Maps Events Values.
 Require Import Op RTL Duplicate.
 
+Module DuplicateProof (D: DuplicateOracle).
+Include Duplicate D.
+
 Local Open Scope positive_scope.
 
 (** * Definition of [match_states] (independently of the translation) *)
@@ -535,3 +538,5 @@ Proof.
 Qed.
 
 End PRESERVATION.
+
+End DuplicateProof.
diff --git a/backend/LICMaux.ml b/backend/LICMaux.ml
index 0ca4418b..6283e129 100644
--- a/backend/LICMaux.ml
+++ b/backend/LICMaux.ml
@@ -39,6 +39,42 @@ let rtl_successors = function
 | Icond (_,_,n1,n2,_) -> [n1; n2]
 | Ijumptable (_,ln) -> ln
 
+let print_ptree_bool oc pt =
+  if !debug_flag then
+    let elements = PTree.elements pt in
+    begin
+      Printf.fprintf oc "[";
+      List.iter (fun (n, b) ->
+        if b then Printf.fprintf oc "%d, " (P.to_int n)
+      ) elements;
+      Printf.fprintf oc "]\n"
+    end
+  else ()
+
+let print_intlist oc l =
+  let rec f oc = function
+  | [] -> ()
+  | n::ln -> (Printf.fprintf oc "%d %a" (P.to_int n) f ln)
+  in begin
+    if !debug_flag then begin
+      Printf.fprintf oc "[%a]" f l
+    end
+  end
+
+(* Adapted from backend/PrintRTL.ml: print_function *)
+let print_code code = let open PrintRTL in let open Printf in
+  if (!debug_flag) then begin
+    fprintf stdout "{\n";
+    let instrs =
+      List.sort
+        (fun (pc1, _) (pc2, _) -> compare pc2 pc1)
+        (List.rev_map
+          (fun (pc, i) -> (P.to_int pc, i))
+          (PTree.elements code)) in
+    List.iter (print_instruction stdout) instrs;
+    fprintf stdout "}"
+  end
+
 (** Getting loop branches with a DFS visit :
   * Each node is either Unvisited, Visited, or Processed
   * pre-order: node becomes Processed
@@ -53,23 +89,34 @@ let get_loop_headers code entrypoint = begin
   in let rec dfs_visit code = function
   | [] -> ()
   | node :: ln ->
+      debug "ENTERING node %d, REM are %a\n" (P.to_int node) print_intlist ln;
       match (get_some @@ PTree.get node !visited) with
-      | Visited -> ()
+      | Visited -> begin
+          debug "\tNode %d is already Visited, skipping\n" (P.to_int node);
+          dfs_visit code ln
+        end
       | Processed -> begin
           debug "Node %d is a loop header\n" (P.to_int node);
           is_loop_header := PTree.set node true !is_loop_header;
-          visited := PTree.set node Visited !visited
+          visited := PTree.set node Visited !visited;
+          dfs_visit code ln
         end
       | Unvisited -> begin
           visited := PTree.set node Processed !visited;
-          match PTree.get node code with
+          debug "Node %d is Processed\n" (P.to_int node);
+          (match PTree.get node code with
           | None -> failwith "No such node"
-          | Some i -> let next_visits = rtl_successors i in dfs_visit code next_visits;
+          | Some i -> let next_visits = rtl_successors i in begin
+              debug "About to visit: %a\n" print_intlist next_visits;
+              dfs_visit code next_visits
+            end);
+          debug "Node %d is Visited!\n" (P.to_int node);
           visited := PTree.set node Visited !visited;
           dfs_visit code ln
         end
   in begin
     dfs_visit code [entrypoint];
+    debug "LOOP HEADERS: %a\n" print_ptree_bool !is_loop_header;
     !is_loop_header
   end
 end
@@ -208,7 +255,8 @@ let rewrite_loop_body (last_alloc : reg ref)
                                (List.map (map_reg mapper) args),
                                new_res));
                 PTree.set res new_res mapper
-             | Iload(trap, chunk, addr, args, res, pc')
+             | Iload(_, chunk, addr, args, res, pc')
+             | Istore(chunk, addr, args, res, pc')
                   when Archi.has_notrap_loads &&
                        !Clflags.option_fnontrap_loads ->
                 let new_res = P.succ !last_alloc in
diff --git a/backend/PrintLTL.ml b/backend/PrintLTL.ml
index d8f2ac12..8259297b 100644
--- a/backend/PrintLTL.ml
+++ b/backend/PrintLTL.ml
@@ -133,10 +133,10 @@ let print_program pp (prog: LTL.program) =
 
 let destination : string option ref = ref None
 
-let print_if prog =
+let print_if passno prog =
   match !destination with
   | None -> ()
   | Some f ->
-      let oc = open_out f in
+      let oc = open_out (f ^ "." ^ Z.to_string passno) in
       print_program oc prog;
       close_out oc
diff --git a/backend/Tunneling.v b/backend/Tunneling.v
index 78458582..269ebb6f 100644
--- a/backend/Tunneling.v
+++ b/backend/Tunneling.v
@@ -3,6 +3,7 @@
 (*              The Compcert verified compiler                         *)
 (*                                                                     *)
 (*          Xavier Leroy, INRIA Paris-Rocquencourt                     *)
+(*          Sylvain Boulmé  Grenoble-INP, VERIMAG                      *)
 (*                                                                     *)
 (*  Copyright Institut National de Recherche en Informatique et en     *)
 (*  Automatique.  All rights reserved.  This file is distributed       *)
@@ -12,7 +13,7 @@
 
 (** Branch tunneling (optimization of branches to branches). *)
 
-Require Import Coqlib Maps UnionFind.
+Require Import Coqlib Maps Errors.
 Require Import AST.
 Require Import LTL.
 
@@ -21,10 +22,10 @@ Require Import LTL.
   so that they jump directly to the end of the branch sequence.
   For example:
 <<
-     L1: nop L2;                          L1: nop L3;
-     L2; nop L3;               becomes    L2: nop L3;
+     L1: if (cond) nop L2;                L1: nop L3;
+     L2: nop L3;               becomes    L2: nop L3;
      L3: instr;                           L3: instr;
-     L4: if (cond) goto L1;               L4: if (cond) goto L3;
+     L4: if (cond) goto L1;               L4: if (cond) nop L1;
 >>
   This optimization can be applied to several of our intermediate
   languages.  We choose to perform it on the [LTL] language,
@@ -37,11 +38,14 @@ Require Import LTL.
   dead code (as the "nop L3" in the example above).
 *)
 
-(** The naive implementation of branch tunneling would replace
-  any branch to a node [pc] by a branch to the node
-  [branch_target f pc], defined as follows:
+(** The implementation consists in two passes: the first pass
+    records the branch t of each "nop"
+    and the second pass replace any "nop" node to [pc]
+    by a branch to a "nop" at [branch_t f pc]
+
+Naively, we may define [branch_t f pc] as follows:
 <<
-  branch_target f pc = branch_target f pc'  if f(pc) = nop pc'
+  branch_t f pc = branch_t f pc'  if f(pc) = nop pc'
                      = pc                   otherwise
 >>
   However, this definition can fail to terminate if
@@ -50,56 +54,114 @@ Require Import LTL.
      L1: nop L1;
 >>
   or
-<<   L1: nop L2;
+<<
+     L1: nop L2;
      L2: nop L1;
 >>
   Coq warns us of this fact by not accepting the definition
-  of [branch_target] above.
+  of [branch_t] above.
+
+  To handle this problem, we use a union-find data structure, adding equalities [pc = pc']
+  for every instruction [pc: nop pc'] in the function.
+
+  Moreover, because the elimination of "useless" [Lcond] depends on the current [uf] datastructure,
+  we need to iterate until we reach a fixpoint.
+
+  Actually, it is simpler and more efficient to perform this in an external oracle, that also returns a measure
+  in order to help the proof.
+
+  A verifier checks that this data-structure is correct.
+*)
+
+Definition UF := PTree.t (node * Z).
 
-  To handle this problem, we proceed in two passes.  The first pass
-  populates a union-find data structure, adding equalities [pc = pc']
-  for every instruction [pc: nop pc'] in the function. *)
+(* The oracle returns a map of "nop" node to their target with a distance (ie the number of the "nop" node on the path) to the target. *)
+Axiom branch_target: LTL.function -> UF. 
+Extract Constant branch_target => "Tunnelingaux.branch_target".
 
-Module U := UnionFind.UF(PTree).
+Local Open Scope error_monad_scope.
 
-Definition record_goto (uf: U.t) (pc: node) (b: bblock) : U.t :=
-  match b with
-  | Lbranch s :: _ => U.union uf pc s
-  | _ => uf
+Definition get (td: UF) pc:node*Z :=
+  match td!pc with
+  | Some (t,d) => (t,Z.abs d)
+  | _ => (pc,0)
   end.
 
-Definition record_gotos (f: LTL.function) : U.t :=
-  PTree.fold record_goto f.(fn_code) U.empty.
+Definition target (td: UF) (pc:node): node := fst (get td pc).
+Coercion target: UF >-> Funclass.
+
+(* we check that the domain of [td] is included in the domain of [c] *)
+Definition check_included (td: UF) (c: code): option bblock
+  := PTree.fold (fun (ok:option bblock) pc _ => if ok then c!pc else None) td (Some nil).
+
+(* we check the validity of targets and their bound:
+   the distance of a "nop" node (w.r.t to the target) must be greater than the one of its parents.
+*)
+Definition check_bblock (td: UF) (pc:node) (bb: bblock): res unit
+ := match td!pc with
+    | None => OK tt
+    | Some (tpc, dpc) =>
+       let dpc := Z.abs dpc in
+       match bb with
+       | Lbranch s ::_ =>
+         let (ts, ds) := get td s in 
+         if peq tpc ts then
+            if zlt ds dpc then OK tt
+            else Error (msg "bad distance in Lbranch")
+         else Error (msg "invalid skip of Lbranch")
+       | Lcond _ _ s1 s2 _ :: _ =>
+          let (ts1, ds1) := get td s1 in
+          let (ts2, ds2) := get td s2 in
+          if peq tpc ts1 then
+            if peq tpc ts2 then
+              if zlt ds1 dpc then
+                if zlt ds2 dpc then OK tt
+                else Error (msg "bad distance on else branch")
+              else Error (msg "bad distance on then branch")
+            else Error (msg "invalid skip of else branch")
+          else Error (msg "invalid skip of then branch")
+      | _ => Error (msg "cannot skip this block")
+      end
+   end.
+
+Definition check_code (td: UF) (c:code): res unit
+  := PTree.fold (fun ok pc bb => do _ <- ok; check_bblock td pc bb) c (OK tt).
 
 (** The second pass rewrites all LTL instructions, replacing every
-  successor [s] of every instruction by the canonical representative
+  successor [s] of every instruction by [t s], the canonical representative
   of its equivalence class in the union-find data structure. *)
 
-Definition tunnel_instr (uf: U.t) (i: instruction) : instruction :=
+Definition tunnel_instr (t: node -> node) (i: instruction) : instruction :=
   match i with
-  | Lbranch s => Lbranch (U.repr uf s)
+  | Lbranch s => Lbranch (t s)
   | Lcond cond args s1 s2 info =>
-      let s1' := U.repr uf s1 in let s2' := U.repr uf s2 in
+      let s1' := t s1 in let s2' := t s2 in
       if peq s1' s2'
       then Lbranch s1'
       else Lcond cond args s1' s2' info
-  | Ljumptable arg tbl => Ljumptable arg (List.map (U.repr uf) tbl)
+  | Ljumptable arg tbl => Ljumptable arg (List.map t tbl)
   | _ => i
   end.
 
-Definition tunnel_block (uf: U.t) (b: bblock) : bblock :=
-  List.map (tunnel_instr uf) b.
+Definition tunnel_block (t: node -> node) (b: bblock) : bblock :=
+  List.map (tunnel_instr t) b.
 
-Definition tunnel_function (f: LTL.function) : LTL.function :=
-  let uf := record_gotos f in
-  mkfunction
-    (fn_sig f)
-    (fn_stacksize f)
-    (PTree.map1 (tunnel_block uf) (fn_code f))
-    (U.repr uf (fn_entrypoint f)).
+Definition tunnel_function (f: LTL.function) : res LTL.function :=
+  let td := branch_target f in
+  let c := (fn_code f) in
+  if check_included td c then 
+    do _ <- check_code td c ; OK
+    (mkfunction
+      (fn_sig f)
+      (fn_stacksize f)
+      (PTree.map1 (tunnel_block td) c)
+      (td (fn_entrypoint f)))
+  else 
+   Error (msg "Some node of the union-find is not in the CFG")
+   .
 
-Definition tunnel_fundef (f: LTL.fundef) : LTL.fundef :=
-  transf_fundef tunnel_function f.
+Definition tunnel_fundef (f: fundef) : res fundef :=
+  transf_partial_fundef tunnel_function f.
 
-Definition transf_program (p: LTL.program) : LTL.program :=
-  transform_program tunnel_fundef p.
+Definition transf_program (p: program) : res program :=
+  transform_partial_program tunnel_fundef p.
diff --git a/backend/Tunnelingaux.ml b/backend/Tunnelingaux.ml
new file mode 100644
index 00000000..af89adea
--- /dev/null
+++ b/backend/Tunnelingaux.ml
@@ -0,0 +1,283 @@
+(* *************************************************************)
+(*                                                             *)
+(*             The Compcert verified compiler                  *)
+(*                                                             *)
+(*           Sylvain Boulmé  Grenoble-INP, VERIMAG             *)
+(*                                                             *)
+(*  Copyright VERIMAG. All rights reserved.                    *)
+(*  This file is distributed under the terms of the INRIA      *)
+(*  Non-Commercial License Agreement.                          *)
+(*                                                             *)
+(* *************************************************************)
+
+(*
+
+This file implements the [branch_target] oracle that identifies "nop" branches in a LTL function,
+and computes their target node with the distance (ie the number of cummulated nops) toward this target.
+
+See [Tunneling.v]
+
+*)
+
+open Coqlib
+open LTL
+open Maps
+open Camlcoq
+
+let limit_tunneling = None (* for debugging: [Some x] limit the number of iterations *)
+let debug_flag = ref false
+let final_dump = false   (* set to true to have a more verbose debugging *)
+
+let debug fmt =
+  if !debug_flag then Printf.eprintf fmt
+  else Printf.ifprintf stderr fmt
+
+exception BugOnPC of int
+
+(* type of labels in the cfg *)
+type label = int * P.t
+
+(* instructions under analyzis *)
+type simple_inst = (* a simplified view of LTL instructions *)
+  LBRANCH of node
+| LCOND of node * node
+| OTHER
+and node = {
+    lab : label;
+    mutable inst: simple_inst;
+    mutable link: node; (* link in the union-find: itself for non "nop"-nodes, target of the "nop" otherwise *)
+    mutable dist: int;
+    mutable tag: int
+  }
+
+(* type of the (simplified) CFG *)
+type cfg = {
+    nodes: (int, node) Hashtbl.t;
+    mutable rems: node list; (* remaining conditions that may become lbranch or not *)
+    mutable num_rems: int;
+    mutable iter_num: int (* number of iterations in elimination of conditions *)
+  }
+
+let lab_i (n: node): int = fst n.lab
+let lab_p (n: node): P.t = snd n.lab
+
+let rec target c n = (* inspired from the "find" of union-find algorithm *)
+  match n.inst with
+  | LCOND(s1,s2) ->
+     if n.link != n
+     then update c n
+     else if n.tag < c.iter_num then (
+       (* we try to change the condition ... *)
+       n.tag <- c.iter_num; (* ... but at most once by iteration *)
+       let ts1 = target c s1 in
+       let ts2 = target c s2 in
+       if ts1 == ts2 then (n.link <- ts1; ts1) else n
+     ) else n
+  | _ ->
+     if n.link != n
+     then update c n
+     else n
+and update c n =
+  let t = target c n.link in
+  n.link <- t; t
+
+let get_node c p =
+  let li = P.to_int p in
+  try
+    Hashtbl.find c.nodes li
+  with
+    Not_found ->
+      let rec n = { lab = (li, p); inst = OTHER; link = n ; dist = 0; tag = 0 }  in
+      Hashtbl.add c.nodes li n;
+      n
+
+let set_branch c p s =
+  let li = P.to_int p in
+  try
+    let n = Hashtbl.find c.nodes li in
+    n.inst <- LBRANCH s;
+    n.link <- target c s
+  with
+    Not_found ->
+      let n = { lab = (li,p); inst = LBRANCH s; link = target c s; dist = 0; tag = 0 } in
+      Hashtbl.add c.nodes li n
+
+
+(* build [c.nodes] and accumulate in [acc] conditions at beginning of LTL basic-blocks *)
+let build_simplified_cfg c acc pc bb =
+  match bb with
+  | Lbranch s :: _ ->
+     let ns = get_node c s in
+     set_branch c pc ns;
+     acc
+  | Lcond (_, _, s1, s2, _) :: _ ->
+     c.num_rems <- c.num_rems + 1;
+     let ns1 = get_node c s1 in
+     let ns2 = get_node c s2 in
+     let npc = get_node c pc in
+     npc.inst <- LCOND(ns1, ns2);
+     npc::acc
+  | _ -> acc
+
+(* try to change a condition into a branch
+[acc] is the current accumulator of conditions to consider in the next iteration of repeat_change_cond
+*)
+let try_change_cond c acc pc =
+  match pc.inst with
+  | LCOND(s1,s2) ->
+     let ts1 = target c s1 in
+     let ts2 = target c s2 in
+     if ts1 == ts2 then (
+       pc.link <- ts1;
+       c.num_rems <- c.num_rems - 1;
+       acc
+     ) else
+       pc::acc
+  | _  -> raise (BugOnPC (lab_i pc)) (* LCOND expected *)
+
+(* repeat [try_change_cond] until no condition is changed into a branch *)
+let rec repeat_change_cond c =
+  c.iter_num <- c.iter_num + 1;
+  debug "++ Tunneling.branch_target %d: remaining number of conds to consider = %d\n" (c.iter_num) (c.num_rems);
+  let old =  c.num_rems in
+  c.rems <- List.fold_left (try_change_cond c) [] c.rems;
+  let curr = c.num_rems in
+  let continue =
+    match limit_tunneling with
+    | Some n -> curr < old && c.iter_num < n
+    | None -> curr < old
+  in
+  if continue
+  then repeat_change_cond c
+
+
+(* compute the final distance of each nop nodes to its target *)
+let undef_dist = -1
+let self_dist = undef_dist-1
+let rec dist n =
+  if n.dist = undef_dist
+  then (
+    n.dist <- self_dist; (* protection against an unexpected loop in the data-structure *)
+    n.dist <-
+      (match n.inst with
+       | OTHER -> 0
+       | LBRANCH p -> 1 + dist p
+       | LCOND (p1,p2) -> 1 + (max (dist p1) (dist p2)));
+    n.dist
+  ) else if n.dist=self_dist then raise (BugOnPC (lab_i n))
+    else n.dist
+
+let final_export f c =
+  let count = ref 0 in
+  let filter_nops_init_dist _ n acc =
+    let tn = target c n in
+    if tn == n
+    then (
+      n.dist <- 0; (* force [n] to be a base case in the recursion of [dist] *)
+      acc
+    ) else (
+      n.dist <- undef_dist; (* force [dist] to compute the actual [n.dist] *)
+      count := !count+1;
+      (tn, n)::acc
+    )
+  in
+  let nops = Hashtbl.fold filter_nops_init_dist c.nodes [] in
+  let res = List.fold_left (fun acc (tn,n) -> PTree.set (lab_p n) (lab_p tn, Z.of_uint (dist n)) acc) PTree.empty nops in
+  debug "* Tunneling.branch_target: final number of eliminated nops = %d\n" !count;
+  res
+
+(*********************************************)
+(*** START: printing and debugging functions *)
+
+let string_of_labeli nodes ipc =
+  try
+    let pc = Hashtbl.find nodes ipc in
+    if pc.link == pc
+    then Printf.sprintf "(Target@%d)" (dist pc)
+    else Printf.sprintf "(Nop %d @%d)" (lab_i pc.link) (dist pc)
+  with
+    Not_found -> ""
+
+let print_bblock c println (pc, bb) =
+  match bb with
+  | Lbranch s::_ -> (if println then debug "\n"); debug "%d:Lbranch %d %s\n" pc (P.to_int s) (string_of_labeli c.nodes pc); false
+  | Lcond (_, _, s1, s2, _)::_ -> (if println then debug "\n"); debug "%d:Lcond (%d,%d) %s\n" pc (P.to_int s1) (P.to_int s2) (string_of_labeli c.nodes pc); false
+  | _ -> debug "%d " pc; true
+
+
+let print_cfg f c  =
+  let a = Array.of_list (PTree.fold (fun acc pc bb -> (P.to_int pc,bb)::acc) f.fn_code []) in
+  Array.fast_sort (fun (i1,_) (i2,_) -> i2 - i1) a;
+  let ep = P.to_int f.fn_entrypoint in
+  debug "entrypoint: %d %s\n" ep (string_of_labeli c.nodes ep);
+  let println = Array.fold_left (print_bblock c) false a in
+  (if println then debug "\n");debug "remaining cond:";
+  List.iter (fun n -> debug "%d " (lab_i n)) c.rems;
+  debug "\n"
+
+(*************************************************************)
+(* Copy-paste of the extracted code of the verifier          *)
+(* with [raise (BugOnPC (P.to_int pc))] instead of [Error.*] *)
+
+let get td pc =
+  match PTree.get pc td with
+  | Some p -> let (t0, d) = p in (t0, d)
+  | None -> (pc, Z.of_uint 0)
+
+let check_bblock td pc bb =
+  match PTree.get pc td with
+  | Some p ->
+    let (tpc, dpc) = p in
+    let dpc0 = dpc in
+    (match bb with
+     | [] ->
+       raise (BugOnPC (P.to_int pc))
+     | i :: _ ->
+       (match i with
+        | Lbranch s ->
+          let (ts, ds) = get td s in
+          if peq tpc ts
+          then if zlt ds dpc0
+               then ()
+               else raise (BugOnPC (P.to_int pc))
+          else raise (BugOnPC (P.to_int pc))
+        | Lcond (_, _, s1, s2, _) ->
+          let (ts1, ds1) = get td s1 in
+          let (ts2, ds2) = get td s2 in
+          if peq tpc ts1
+          then if peq tpc ts2
+               then if zlt ds1 dpc0
+                    then if zlt ds2 dpc0
+                         then ()
+                         else raise (BugOnPC (P.to_int pc))
+                    else raise (BugOnPC (P.to_int pc))
+               else raise (BugOnPC (P.to_int pc))
+          else raise (BugOnPC (P.to_int pc))
+        | _ ->
+          raise (BugOnPC (P.to_int pc))))
+  | None -> ()
+
+(** val check_code : coq_UF -> code -> unit res **)
+
+let check_code td c =
+  PTree.fold (fun _ pc bb -> check_bblock td pc bb) c (())
+
+(*** END: copy-paste & debugging functions *******)
+
+let branch_target f =
+  debug "* Tunneling.branch_target: starting on a new function\n";
+  if limit_tunneling <> None then debug "* WARNING: limit_tunneling <> None\n";
+  let c = { nodes = Hashtbl.create 100; rems = []; num_rems = 0; iter_num = 0 } in
+  c.rems <- PTree.fold (build_simplified_cfg c) f.fn_code [];
+  repeat_change_cond c;
+  let res = final_export f c in
+  if !debug_flag then (
+    try
+      check_code res f.fn_code;
+      if final_dump then print_cfg f c;
+    with e -> (
+      print_cfg f c;
+      check_code res f.fn_code
+    )
+  );
+  res
diff --git a/backend/Tunnelingproof.v b/backend/Tunnelingproof.v
index cdf6c800..126b7b87 100644
--- a/backend/Tunnelingproof.v
+++ b/backend/Tunnelingproof.v
@@ -3,6 +3,7 @@
 (*              The Compcert verified compiler                         *)
 (*                                                                     *)
 (*          Xavier Leroy, INRIA Paris-Rocquencourt                     *)
+(*          Sylvain Boulmé  Grenoble-INP, VERIMAG                      *)
 (*                                                                     *)
 (*  Copyright Institut National de Recherche en Informatique et en     *)
 (*  Automatique.  All rights reserved.  This file is distributed       *)
@@ -12,131 +13,163 @@
 
 (** Correctness proof for the branch tunneling optimization. *)
 
-Require Import Coqlib Maps UnionFind.
+Require Import Coqlib Maps Errors.
 Require Import AST Linking.
 Require Import Values Memory Events Globalenvs Smallstep.
 Require Import Op Locations LTL.
 Require Import Tunneling.
 
-Definition match_prog (p tp: program) :=
-  match_program (fun ctx f tf => tf = tunnel_fundef f) eq p tp.
+Local Open Scope nat.
 
-Lemma transf_program_match:
-  forall p, match_prog p (transf_program p).
+
+(** * Properties of the branch_target, when the verifier succeeds *)
+
+Definition check_included_spec (c:code) (td:UF) (ok: option bblock) :=
+   ok <> None -> forall pc, c!pc = None -> td!pc = None.
+
+Lemma check_included_correct (td: UF) (c: code): 
+  check_included_spec c td (check_included td c).
+Proof. 
+  apply PTree_Properties.fold_rec with (P := check_included_spec c).
+- (* extensionality *)
+  unfold check_included_spec. intros m m' a EQ IND X pc. rewrite <- EQ; auto.
+- (* base case *)
+  intros _ pc.  rewrite PTree.gempty; try congruence.
+- (* inductive case *)
+  unfold check_included_spec.
+  intros m [|] pc bb NEW ATPC IND; simpl; try congruence.
+  intros H pc0. rewrite PTree.gsspec; destruct (peq _ _); subst; simpl; try congruence.
+  intros; eapply IND; try congruence.
+Qed.
+
+Inductive target_bounds (target: node -> node) (bound: node -> nat) (pc: node): (option bblock) -> Prop :=
+ | TB_default (TB: target pc = pc) ob
+     : target_bounds target bound pc ob
+ | TB_branch s bb
+     (EQ: target pc = target s)
+     (DECREASE: bound s < bound pc)
+     : target_bounds target bound pc (Some (Lbranch s::bb))
+ | TB_cond cond args s1 s2 info bb
+     (EQ1: target pc = target s1)
+     (EQ2: target pc = target s2)
+     (DEC1: bound s1 < bound pc)
+     (DEC2: bound s2 < bound pc)
+     : target_bounds target bound pc (Some (Lcond cond args s1 s2 info::bb))
+ .
+Local Hint Resolve TB_default: core.
+
+Lemma target_None (td:UF) (pc: node): td!pc = None -> td pc = pc.
 Proof.
-  intros. eapply match_transform_program; eauto.
+  unfold target, get. intros H; rewrite H; auto.
 Qed.
+Local Hint Resolve target_None Z.abs_nonneg: core.
 
-(** * Properties of the branch map computed using union-find. *)
+Lemma get_nonneg td pc t d: get td pc = (t, d) -> (0 <= d)%Z.
+Proof.
+  unfold get. destruct (td!_) as [(t0&d0)|]; intros H; inversion H; subst; simpl; omega || auto.
+Qed.
+Local Hint Resolve get_nonneg: core.
 
-(** A variant of [record_goto] that also incrementally computes a measure [f: node -> nat]
-  counting the number of [Lnop] instructions starting at a given [pc] that were eliminated. *)
+Definition bound (td: UF) (pc: node) := Z.to_nat (snd (get td pc)).
 
-Definition measure_edge (u: U.t) (pc s: node) (f: node -> nat) : node -> nat :=
-  fun x => if peq (U.repr u s) pc then f x
-           else if peq (U.repr u x) pc then (f x + f s + 1)%nat
-           else f x.
+Lemma check_bblock_correct (td:UF) (pc:node) (bb: bblock):
+  check_bblock td pc bb = OK tt -> 
+  target_bounds (target td) (bound td) pc (Some bb).
+Proof.
+  unfold check_bblock, bound.
+  destruct (td!pc) as [(tpc&dpc)|] eqn:Hpc; auto.
+  assert (Tpc: td pc = tpc). { unfold target, get; rewrite Hpc; simpl; auto. }
+  assert (Dpc: snd (get td pc) = Z.abs dpc). { unfold get; rewrite Hpc; simpl; auto. }
+  destruct bb as [|[ ] bb]; simpl; try congruence.
+  + destruct (get td s) as (ts, ds) eqn:Hs.
+    repeat (destruct (peq _ _) || destruct (zlt _ _)); simpl; try congruence.
+    intros; apply TB_branch.
+    * rewrite Tpc. unfold target; rewrite Hs; simpl; auto.
+    * rewrite Dpc, Hs; simpl. apply Z2Nat.inj_lt; eauto.
+  + destruct (get td s1) as (ts1, ds1) eqn:Hs1.
+    destruct (get td s2) as (ts2, ds2) eqn:Hs2.
+    repeat (destruct (peq _ _) || destruct (zlt _ _)); simpl; try congruence.
+    intros; apply TB_cond.
+    * rewrite Tpc. unfold target; rewrite Hs1; simpl; auto.
+    * rewrite Tpc. unfold target; rewrite Hs2; simpl; auto.
+    * rewrite Dpc, Hs1; simpl. apply Z2Nat.inj_lt; eauto.
+    * rewrite Dpc, Hs2; simpl. apply Z2Nat.inj_lt; eauto.
+Qed.
 
-Definition record_goto' (uf: U.t * (node -> nat)) (pc: node) (b: bblock) : U.t * (node -> nat) :=
-  match b with
-  | Lbranch s :: b' => let (u, f) := uf in (U.union u pc s, measure_edge u pc s f)
-  | _ => uf
-  end.
+Definition check_code_spec (td:UF) (c:code) (ok: res unit) :=
+   ok = OK tt -> forall pc bb, c!pc = Some bb -> target_bounds (target td) (bound td) pc (Some bb).
 
-Definition branch_map_correct (c: code) (uf: U.t * (node -> nat)): Prop :=
-  forall pc,
-  match c!pc with
-  | Some(Lbranch s :: b) =>
-      U.repr (fst uf) pc = pc \/ (U.repr (fst uf) pc = U.repr (fst uf) s /\ snd uf s < snd uf pc)%nat
-  | _ =>
-      U.repr (fst uf) pc = pc
-  end.
+Lemma check_code_correct (td:UF) c:
+   check_code_spec td c (check_code td c).
+Proof.
+  apply PTree_Properties.fold_rec with (P := check_code_spec td).
+- (* extensionality *)
+  unfold check_code_spec. intros m m' a EQ IND X pc bb; subst. rewrite  <- ! EQ; eauto.
+- (* base case *)
+  intros _ pc.  rewrite PTree.gempty; try congruence.
+- (* inductive case *)
+  unfold check_code_spec.
+  intros m [[]|] pc bb NEW ATPC IND; simpl; try congruence.
+  intros H pc0 bb0. rewrite PTree.gsspec; destruct (peq _ _); subst; simpl; auto.
+  intros X; inversion X; subst.
+  apply check_bblock_correct; auto.
+Qed.
 
-Lemma record_gotos'_correct:
-  forall c,
-  branch_map_correct c (PTree.fold record_goto' c (U.empty, fun (x: node) => O)).
+Theorem branch_target_bounds:
+  forall f tf pc, 
+  tunnel_function f = OK tf ->
+  target_bounds (branch_target f) (bound (branch_target f)) pc (f.(fn_code)!pc).
 Proof.
-  intros.
-  apply PTree_Properties.fold_rec with (P := fun c uf => branch_map_correct c uf).
+  unfold tunnel_function; intros f f' pc.
+  destruct (check_included _ _) eqn:H1; try congruence.
+  destruct (check_code _ _) as [[]|] eqn:H2; simpl; try congruence.
+  intros _.
+  destruct ((fn_code f)!pc) eqn:X.
+  - exploit check_code_correct; eauto.
+  - exploit check_included_correct; eauto.
+    congruence.
+Qed.
 
-- (* extensionality *)
-  intros. red; intros. rewrite <- H. apply H0.
+Lemma tunnel_function_unfold:
+  forall f tf pc,
+  tunnel_function f = OK tf -> 
+  (fn_code tf)!pc = option_map (tunnel_block (branch_target f)) (fn_code f)!pc.
+Proof.
+  unfold tunnel_function; intros f f' pc.
+  destruct (check_included _ _) eqn:H1; try congruence.
+  destruct (check_code _ _) as [[]|] eqn:H2; simpl; try congruence.
+  intros X; inversion X; clear X; subst.
+  simpl. rewrite PTree.gmap1. auto.
+Qed.
 
-- (* base case *)
-  red; intros; simpl. rewrite PTree.gempty. apply U.repr_empty.
+Lemma tunnel_fundef_Internal:
+  forall f tf, tunnel_fundef (Internal f) = OK tf
+  -> exists tf', tunnel_function f = OK tf' /\ tf = Internal tf'.
+Proof.
+  intros f tf; simpl.
+  destruct (tunnel_function f) eqn:X; simpl; try congruence.
+  intros EQ; inversion EQ.
+  eexists; split; eauto.
+Qed.
 
-- (* inductive case *)
-  intros m uf pc bb; intros. destruct uf as [u f].
-  assert (PC: U.repr u pc = pc).
-    generalize (H1 pc). rewrite H. auto.
-  assert (record_goto' (u, f) pc bb = (u, f)
-          \/ exists s, exists bb', bb = Lbranch s :: bb' /\ record_goto' (u, f) pc bb = (U.union u pc s, measure_edge u pc s f)).
-    unfold record_goto'; simpl. destruct bb; auto. destruct i; auto. right. exists s; exists bb; auto.
-  destruct H2 as [B | [s [bb' [EQ B]]]].
-
-+ (* u and f are unchanged *)
-  rewrite B.
-  red. intro pc'. simpl. rewrite PTree.gsspec. destruct (peq pc' pc). subst pc'.
-  destruct bb; auto. destruct i; auto.
-  apply H1.
-
-+ (* b is Lbranch s, u becomes union u pc s, f becomes measure_edge u pc s f *)
-  rewrite B.
-  red. intro pc'. simpl. rewrite PTree.gsspec. destruct (peq pc' pc). subst pc'. rewrite EQ.
-
-* (* The new instruction *)
-  rewrite (U.repr_union_2 u pc s); auto. rewrite U.repr_union_3.
-  unfold measure_edge. destruct (peq (U.repr u s) pc). auto. right. split. auto.
-  rewrite PC. rewrite peq_true. omega.
-
-* (* An old instruction *)
-  assert (U.repr u pc' = pc' -> U.repr (U.union u pc s) pc' = pc').
-  { intro. rewrite <- H2 at 2. apply U.repr_union_1. congruence. }
-  generalize (H1 pc'). simpl. destruct (m!pc'); auto. destruct b; auto. destruct i; auto.
-  intros [P | [P Q]]. left; auto. right.
-  split. apply U.sameclass_union_2. auto.
-  unfold measure_edge. destruct (peq (U.repr u s) pc). auto.
-  rewrite P. destruct (peq (U.repr u s0) pc). omega. auto.
-Qed.
-
-Definition record_gotos' (f: function) :=
-  PTree.fold record_goto' f.(fn_code) (U.empty, fun (x: node) => O).
-
-Lemma record_gotos_gotos':
-  forall f, fst (record_gotos' f) = record_gotos f.
-Proof.
-  intros. unfold record_gotos', record_gotos.
-  repeat rewrite PTree.fold_spec.
-  generalize (PTree.elements (fn_code f)) (U.empty) (fun _ : node => O).
-  induction l; intros; simpl.
-  auto.
-  unfold record_goto' at 2. unfold record_goto at 2.
-  destruct (snd a). apply IHl. destruct i; apply IHl.
-Qed.
-
-Definition branch_target (f: function) (pc: node) : node :=
-  U.repr (record_gotos f) pc.
-
-Definition count_gotos (f: function) (pc: node) : nat :=
-  snd (record_gotos' f) pc.
-
-Theorem record_gotos_correct:
-  forall f pc,
-  match f.(fn_code)!pc with
-  | Some(Lbranch s :: b) =>
-       branch_target f pc = pc \/
-       (branch_target f pc = branch_target f s /\ count_gotos f s < count_gotos f pc)%nat
-  | _ => branch_target f pc = pc
-  end.
+Lemma tunnel_fundef_External:
+  forall tf ef, tunnel_fundef (External ef) = OK tf
+  -> tf = External ef.
 Proof.
-  intros.
-  generalize (record_gotos'_correct f.(fn_code) pc). simpl.
-  fold (record_gotos' f). unfold branch_map_correct, branch_target, count_gotos.
-  rewrite record_gotos_gotos'. auto.
+  intros tf ef; simpl. intros H; inversion H; auto.
 Qed.
 
 (** * Preservation of semantics *)
 
+Definition match_prog (p tp: program) :=
+  match_program (fun _ f tf => tunnel_fundef f = OK tf) eq p tp.
+
+Lemma transf_program_match:
+  forall prog tprog, transf_program prog = OK tprog -> match_prog prog tprog.
+Proof.
+  intros. eapply match_transform_partial_program_contextual; eauto.
+Qed.
+
 Section PRESERVATION.
 
 Variables prog tprog: program.
@@ -145,32 +178,65 @@ Let ge := Genv.globalenv prog.
 Let tge := Genv.globalenv tprog.
 
 Lemma functions_translated:
-  forall v f,
+  forall (v: val) (f: fundef),
   Genv.find_funct ge v = Some f ->
-  Genv.find_funct tge v = Some (tunnel_fundef f).
-Proof (Genv.find_funct_transf TRANSL).
+  exists tf, tunnel_fundef f = OK tf /\ Genv.find_funct tge v = Some tf.
+Proof.
+  intros. exploit (Genv.find_funct_match TRANSL); eauto.
+  intros (cu & tf & A & B & C).
+  repeat eexists; intuition eauto.
+Qed.
 
 Lemma function_ptr_translated:
   forall v f,
   Genv.find_funct_ptr ge v = Some f ->
-  Genv.find_funct_ptr tge v = Some (tunnel_fundef f).
-Proof (Genv.find_funct_ptr_transf TRANSL).
+  exists tf,
+  Genv.find_funct_ptr tge v = Some tf /\ tunnel_fundef f = OK tf.
+Proof.
+  intros.
+  exploit (Genv.find_funct_ptr_transf_partial TRANSL); eauto.
+Qed.
 
-Lemma symbols_preserved:
-  forall id,
-  Genv.find_symbol tge id = Genv.find_symbol ge id.
-Proof (Genv.find_symbol_transf TRANSL).
+Lemma symbols_preserved s: Genv.find_symbol tge s = Genv.find_symbol ge s.
+Proof.
+  rewrite <- (Genv.find_symbol_match TRANSL). reflexivity.
+Qed.
 
 Lemma senv_preserved:
   Senv.equiv ge tge.
-Proof (Genv.senv_transf TRANSL).
+Proof.
+  eapply (Genv.senv_match TRANSL).
+Qed.
 
 Lemma sig_preserved:
-  forall f, funsig (tunnel_fundef f) = funsig f.
+  forall f tf, tunnel_fundef f = OK tf -> funsig tf = funsig f.
 Proof.
-  destruct f; reflexivity.
+  intros. destruct f.
+  - simpl in H. monadInv H. unfold tunnel_function in EQ. 
+    destruct (check_included _ _); try congruence.
+    monadInv EQ. simpl; auto.
+  - simpl in H. monadInv H. reflexivity.
 Qed.
 
+Lemma fn_stacksize_preserved:
+  forall f tf, tunnel_function f = OK tf -> fn_stacksize tf = fn_stacksize f.
+Proof.
+  intros f tf; unfold tunnel_function.
+  destruct (check_included _ _); try congruence.
+  destruct (check_code _ _); simpl; try congruence.
+  intros H; inversion H; simpl; auto.
+Qed.
+
+Lemma fn_entrypoint_preserved:
+  forall f tf, tunnel_function f = OK tf -> fn_entrypoint tf = branch_target f (fn_entrypoint f).
+Proof.
+  intros f tf; unfold tunnel_function.
+  destruct (check_included _ _); try congruence.
+  destruct (check_code _ _); simpl; try congruence.
+  intros H; inversion H; simpl; auto.
+Qed.
+
+
 (** The proof of semantic preservation is a simulation argument
   based on diagrams of the following form:
 <<
@@ -185,7 +251,7 @@ Qed.
   between states [st1] and [st2], as well as the postcondition between
   [st1'] and [st2'].  One transition in the source code (left) can correspond
   to zero or one transition in the transformed code (right).  The
-  "zero transition" case occurs when executing a [Lgoto] instruction
+  "zero transition" case occurs when executing a [Lnop] instruction
   in the source code that has been removed by tunneling.
 
   In the definition of [match_states], what changes between the original and
@@ -194,52 +260,52 @@ Qed.
   and memory states, since some [Vundef] values can become more defined
   as a consequence of eliminating useless [Lcond] instructions. *)
 
-Definition tunneled_block (f: function) (b: bblock) :=
-  tunnel_block (record_gotos f) b.
-
-Definition tunneled_code (f: function) :=
-  PTree.map1 (tunneled_block f) (fn_code f).
-
 Definition locmap_lessdef (ls1 ls2: locset) : Prop :=
   forall l, Val.lessdef (ls1 l) (ls2 l).
 
 Inductive match_stackframes: stackframe -> stackframe -> Prop :=
   | match_stackframes_intro:
-      forall f sp ls0 bb tls0,
+      forall f tf sp ls0 bb tls0,
       locmap_lessdef ls0 tls0 ->
+      tunnel_function f = OK tf ->
       match_stackframes
          (Stackframe f sp ls0 bb)
-         (Stackframe (tunnel_function f) sp tls0 (tunneled_block f bb)).
+         (Stackframe tf sp tls0 (tunnel_block (branch_target f) bb)).
 
 Inductive match_states: state -> state -> Prop :=
   | match_states_intro:
-      forall s f sp pc ls m ts tls tm
+      forall s f tf sp pc ls m ts tls tm
         (STK: list_forall2 match_stackframes s ts)
         (LS: locmap_lessdef ls tls)
-        (MEM: Mem.extends m tm),
+        (MEM: Mem.extends m tm)
+        (TF: tunnel_function f = OK tf),
       match_states (State s f sp pc ls m)
-                   (State ts (tunnel_function f) sp (branch_target f pc) tls tm)
+                   (State ts tf sp (branch_target f pc) tls tm)
   | match_states_block:
-      forall s f sp bb ls m ts tls tm
+      forall s f tf sp bb ls m ts tls tm
         (STK: list_forall2 match_stackframes s ts)
         (LS: locmap_lessdef ls tls)
-        (MEM: Mem.extends m tm),
+        (MEM: Mem.extends m tm)
+        (TF: tunnel_function f = OK tf),
       match_states (Block s f sp bb ls m)
-                   (Block ts (tunnel_function f) sp (tunneled_block f bb) tls tm)
+                   (Block ts tf sp (tunnel_block (branch_target f) bb) tls tm)
   | match_states_interm:
-      forall s f sp pc bb ls m ts tls tm
+      forall s f tf sp pc i bb ls m ts tls tm
         (STK: list_forall2 match_stackframes s ts)
         (LS: locmap_lessdef ls tls)
-        (MEM: Mem.extends m tm),
-      match_states (Block s f sp (Lbranch pc :: bb) ls m)
-                   (State ts (tunnel_function f) sp (branch_target f pc) tls tm)
+        (MEM: Mem.extends m tm)
+        (IBRANCH: tunnel_instr (branch_target f) i = Lbranch pc)
+        (TF: tunnel_function f = OK tf),
+      match_states (Block s f sp (i :: bb) ls m)
+                   (State ts tf sp pc tls tm)
   | match_states_call:
-      forall s f ls m ts tls tm
+      forall s f tf ls m ts tls tm
         (STK: list_forall2 match_stackframes s ts)
         (LS: locmap_lessdef ls tls)
-        (MEM: Mem.extends m tm),
+        (MEM: Mem.extends m tm)
+        (TF: tunnel_fundef f = OK tf),
       match_states (Callstate s f ls m)
-                   (Callstate ts (tunnel_fundef f) tls tm)
+                   (Callstate ts tf tls tm)
   | match_states_return:
       forall s ls m ts tls tm
         (STK: list_forall2 match_stackframes s ts)
@@ -289,22 +355,6 @@ Proof.
   induction rl as [ | r rl]; intros; simpl. auto. apply locmap_set_undef_lessdef; auto. 
 Qed.
 
-(*
-Lemma locmap_undef_lessdef:
-  forall ll ls1 ls2,
-  locmap_lessdef ls1 ls2 -> locmap_lessdef (Locmap.undef ll ls1) (Locmap.undef ll ls2).
-Proof.
-  induction ll as [ | l ll]; intros; simpl. auto. apply IHll. apply locmap_set_lessdef; auto. 
-Qed.
-
-Lemma locmap_undef_lessdef_1:
-  forall ll ls1 ls2,
-  locmap_lessdef ls1 ls2 -> locmap_lessdef (Locmap.undef ll ls1) ls2.
-Proof.
-  induction ll as [ | l ll]; intros; simpl. auto. apply IHll. apply locmap_set_undef_lessdef; auto. 
-Qed.
-*)
-
 Lemma locmap_getpair_lessdef:
   forall p ls1 ls2,
   locmap_lessdef ls1 ls2 -> Val.lessdef (Locmap.getpair p ls1) (Locmap.getpair p ls2).
@@ -348,15 +398,16 @@ Lemma find_function_translated:
   forall ros ls tls fd,
   locmap_lessdef ls tls ->
   find_function ge ros ls = Some fd ->
-  find_function tge ros tls = Some (tunnel_fundef fd).
+  exists tfd, tunnel_fundef fd = OK tfd /\ find_function tge ros tls = Some tfd.
 Proof.
   intros. destruct ros; simpl in *.
 - assert (E: tls (R m) = ls (R m)).
   { exploit Genv.find_funct_inv; eauto. intros (b & EQ). 
     generalize (H (R m)). rewrite EQ. intros LD; inv LD. auto. }
-  rewrite E. apply functions_translated; auto.
+  rewrite E. exploit functions_translated; eauto.
 - rewrite symbols_preserved. destruct (Genv.find_symbol ge i); inv H0. 
-  apply function_ptr_translated; auto.
+  exploit function_ptr_translated; eauto.
+  intros (tf & X1 & X2). exists tf; intuition.
 Qed.
 
 Lemma call_regs_lessdef:
@@ -383,11 +434,12 @@ Qed.
 
 Definition measure (st: state) : nat :=
   match st with
-  | State s f sp pc ls m => (count_gotos f pc * 2)%nat
-  | Block s f sp (Lbranch pc :: _) ls m => (count_gotos f pc * 2 + 1)%nat
-  | Block s f sp bb ls m => 0%nat
-  | Callstate s f ls m => 0%nat
-  | Returnstate s ls m => 0%nat
+  | State s f sp pc ls m => (bound (branch_target f) pc) * 2
+  | Block s f sp (Lbranch pc :: _) ls m => (bound (branch_target f) pc) * 2 + 1
+  | Block s f sp (Lcond _ _ pc1 pc2 _ :: _) ls m => (max (bound (branch_target f) pc1) (bound (branch_target f) pc2)) * 2 + 1
+  | Block s f sp bb ls m => 0
+  | Callstate s f ls m => 0
+  | Returnstate s ls m => 0
   end.
 
 Lemma match_parent_locset:
@@ -406,24 +458,23 @@ Lemma tunnel_step_correct:
   (exists st2', step tge st1' t st2' /\ match_states st2 st2')
   \/ (measure st2 < measure st1 /\ t = E0 /\ match_states st2 st1')%nat.
 Proof.
-  induction 1; intros; try inv MS.
+  induction 1; intros; try inv MS; try (simpl in IBRANCH; inv IBRANCH).
 
 - (* entering a block *)
-  assert (DEFAULT: branch_target f pc = pc ->
-    (exists st2' : state,
-     step tge (State ts (tunnel_function f) sp (branch_target f pc) tls tm) E0 st2'
-     /\ match_states (Block s f sp bb rs m) st2')).
-  { intros. rewrite H0. econstructor; split.
-    econstructor. simpl. rewrite PTree.gmap1. rewrite H. simpl. eauto.
-    econstructor; eauto. }
-
-  generalize (record_gotos_correct f pc). rewrite H.
-  destruct bb; auto. destruct i; auto.
-  intros [A | [B C]]. auto.
-  right. split. simpl. omega.
-  split. auto.
-  rewrite B. econstructor; eauto.
-
+  exploit (branch_target_bounds f tf pc); eauto.
+  rewrite H. intros X; inversion X.
+  + (* TB_default *) 
+    rewrite TB; left. econstructor; split.
+    * econstructor. simpl. erewrite tunnel_function_unfold, H ; simpl; eauto.
+    * econstructor; eauto.
+  + (* FT_branch *)
+    simpl; right.
+    rewrite EQ; repeat (econstructor; omega || eauto).
+  + (* FT_cond *)
+    simpl; right.
+    repeat (econstructor; omega || eauto); simpl.
+    apply Nat.max_case; omega.
+    destruct (peq _ _); try congruence.
 - (* Lop *)
   exploit eval_operation_lessdef. apply reglist_lessdef; eauto. eauto. eauto. 
   intros (tv & EV & LD).
@@ -485,20 +536,25 @@ Proof.
   eauto. eauto.
   econstructor; eauto using locmap_undef_regs_lessdef.
 - (* Lcall *)
-  left; simpl; econstructor; split.
-  eapply exec_Lcall with (fd := tunnel_fundef fd); eauto.
-  eapply find_function_translated; eauto.
-  rewrite sig_preserved. auto.
-  econstructor; eauto.
-  constructor; auto.
-  constructor; auto.
+  left; simpl.
+  exploit find_function_translated; eauto.
+  intros (tfd & Htfd & FIND).
+  econstructor; split.
+  + eapply exec_Lcall; eauto.
+    erewrite sig_preserved; eauto.
+  + econstructor; eauto.
+    constructor; auto.
+    constructor; auto.
 - (* Ltailcall *)
-  exploit Mem.free_parallel_extends. eauto. eauto. intros (tm' & FREE & MEM'). 
+  exploit find_function_translated. 2: eauto.
+  { eauto using return_regs_lessdef, match_parent_locset. }
+  intros (tfd & Htfd & FIND).
+  exploit Mem.free_parallel_extends. eauto. eauto. intros (tm' & FREE & MEM').
   left; simpl; econstructor; split.
-  eapply exec_Ltailcall with (fd := tunnel_fundef fd); eauto.
-  eapply find_function_translated; eauto using return_regs_lessdef, match_parent_locset.
-  apply sig_preserved.
-  econstructor; eauto using return_regs_lessdef, match_parent_locset.
+  + eapply exec_Ltailcall; eauto.
+    * eapply sig_preserved; eauto.
+    * erewrite fn_stacksize_preserved; eauto.
+  + econstructor; eauto using return_regs_lessdef, match_parent_locset.
 - (* Lbuiltin *)
   exploit eval_builtin_args_lessdef. eexact LS. eauto. eauto. intros (tvargs & EVA & LDA).
   exploit external_call_mem_extends; eauto. intros (tvres & tm' & A & B & C & D).
@@ -513,45 +569,58 @@ Proof.
   fold (branch_target f pc). econstructor; eauto.
 - (* Lbranch (eliminated) *)
   right; split. simpl. omega. split. auto. constructor; auto.
-
-- (* Lcond *)
-  simpl tunneled_block.
-  set (s1 := U.repr (record_gotos f) pc1). set (s2 := U.repr (record_gotos f) pc2).
-  destruct (peq s1 s2).
-+ left; econstructor; split.
-  eapply exec_Lbranch. 
-  destruct b.
-* constructor; eauto using locmap_undef_regs_lessdef_1.
-* rewrite e. constructor; eauto using locmap_undef_regs_lessdef_1.
-+ left; econstructor; split.
-  eapply exec_Lcond; eauto. eapply eval_condition_lessdef; eauto using reglist_lessdef.
-  destruct b; econstructor; eauto using locmap_undef_regs_lessdef.
-
+- (* Lcond (preserved) *)
+  simpl; left; destruct (peq _ _) eqn: EQ.
+  + econstructor; split.
+    eapply exec_Lbranch. 
+    destruct b.
+    * constructor; eauto using locmap_undef_regs_lessdef_1.
+    * rewrite e. constructor; eauto using locmap_undef_regs_lessdef_1.
+  + econstructor; split.
+    eapply exec_Lcond; eauto. eapply eval_condition_lessdef; eauto using reglist_lessdef.
+    destruct b; econstructor; eauto using locmap_undef_regs_lessdef.
+- (* Lcond (eliminated) *)
+  destruct (peq _ _) eqn: EQ; try inv H1.
+  right; split; simpl. 
+  + destruct b.
+    generalize (Nat.le_max_l (bound (branch_target f) pc1) (bound (branch_target f) pc2)); omega.
+    generalize (Nat.le_max_r (bound (branch_target f) pc1) (bound (branch_target f) pc2)); omega.
+  + destruct b.
+    -- repeat (constructor; auto).
+    -- rewrite e; repeat (constructor; auto).
 - (* Ljumptable *)
   assert (tls (R arg) = Vint n).
   { generalize (LS (R arg)); rewrite H; intros LD; inv LD; auto. }
   left; simpl; econstructor; split.
   eapply exec_Ljumptable.
-  eauto. rewrite list_nth_z_map. change U.elt with node. rewrite H0. reflexivity. eauto.
+  eauto. rewrite list_nth_z_map, H0; simpl; eauto. eauto.
   econstructor; eauto using locmap_undef_regs_lessdef.
 - (* Lreturn *)
   exploit Mem.free_parallel_extends. eauto. eauto. intros (tm' & FREE & MEM'). 
   left; simpl; econstructor; split.
-  eapply exec_Lreturn; eauto.
-  constructor; eauto using return_regs_lessdef, match_parent_locset.
+  + eapply exec_Lreturn; eauto.
+    erewrite fn_stacksize_preserved; eauto.
+  + constructor; eauto using return_regs_lessdef, match_parent_locset.
 - (* internal function *)
+  exploit tunnel_fundef_Internal; eauto.
+  intros (tf' & TF' & ITF). subst.
   exploit Mem.alloc_extends. eauto. eauto. apply Z.le_refl. apply Z.le_refl.
-  intros (tm' & ALLOC & MEM'). 
-  left; simpl; econstructor; split.
-  eapply exec_function_internal; eauto.
-  simpl. econstructor; eauto using locmap_undef_regs_lessdef, call_regs_lessdef.
+  intros (tm' & ALLOC & MEM').
+  left; simpl.
+  econstructor; split.
+  + eapply exec_function_internal; eauto.
+    erewrite fn_stacksize_preserved; eauto.
+  + simpl.
+    erewrite (fn_entrypoint_preserved f tf'); auto.
+    econstructor; eauto using locmap_undef_regs_lessdef, call_regs_lessdef.
 - (* external function *)
   exploit external_call_mem_extends; eauto using locmap_getpairs_lessdef.
   intros (tvres & tm' & A & B & C & D).
   left; simpl; econstructor; split.
-  eapply exec_function_external; eauto.
-  eapply external_call_symbols_preserved; eauto. apply senv_preserved.
-  simpl. econstructor; eauto using locmap_setpair_lessdef, locmap_undef_caller_save_regs_lessdef.
+  + erewrite (tunnel_fundef_External tf ef); eauto.
+    eapply exec_function_external; eauto.
+    eapply external_call_symbols_preserved; eauto. apply senv_preserved.
+  + simpl. econstructor; eauto using locmap_setpair_lessdef, locmap_undef_caller_save_regs_lessdef.
 - (* return *)
   inv STK. inv H1.
   left; econstructor; split.
@@ -564,14 +633,15 @@ Lemma transf_initial_states:
   exists st2, initial_state tprog st2 /\ match_states st1 st2.
 Proof.
   intros. inversion H.
-  exists (Callstate nil (tunnel_fundef f) (Locmap.init Vundef) m0); split.
+  exploit function_ptr_translated; eauto.
+  intros (tf & Htf & Hf).
+  exists (Callstate nil tf (Locmap.init Vundef) m0); split.
   econstructor; eauto.
-  apply (Genv.init_mem_transf TRANSL); auto.
+  apply (Genv.init_mem_transf_partial TRANSL); auto.
   rewrite (match_program_main TRANSL).
   rewrite symbols_preserved. eauto.
-  apply function_ptr_translated; auto.
-  rewrite <- H3. apply sig_preserved.
-  constructor. constructor. red; simpl; auto. apply Mem.extends_refl.
+  rewrite <- H3. apply sig_preserved. auto.
+  constructor. constructor. red; simpl; auto. apply Mem.extends_refl. auto.
 Qed.
 
 Lemma transf_final_states:
diff --git a/driver/Clflags.ml b/driver/Clflags.ml
index 9df58903..d1e7dd7f 100644
--- a/driver/Clflags.ml
+++ b/driver/Clflags.ml
@@ -27,11 +27,14 @@ let option_ftailcalls = ref true
 let option_fconstprop = ref true
 let option_fcse = ref true
 let option_fcse2 = ref false
+
 let option_fcse3 = ref true
 let option_fcse3_alias_analysis = ref true
 let option_fcse3_across_calls = ref false
 let option_fcse3_across_merges = ref true
 let option_fcse3_glb = ref true
+let option_fcse3_trivial_ops = ref false
+let option_fcse3_refine = ref true                             
 let option_fredundancy = ref true
 
 (** Options relative to superblock scheduling *)
@@ -40,6 +43,7 @@ let option_ftailduplicate = ref 0 (* perform tail duplication for blocks of size
 let option_ftracelinearize = ref true (* uses branch prediction information to improve the linearization *)
 let option_funrollsingle = ref 0 (* unroll a single iteration of innermost loops of size n *)
 let option_funrollbody = ref 0 (* unroll the body of innermost loops of size n *)
+let option_flooprotate = ref 0 (* rotate the innermost loops to have the condition inside the loop body *)
 
 let option_fpostpass = ref true
 let option_fpostpass_sched = ref "list"
@@ -93,7 +97,7 @@ let option_div_i32 = ref "stsud"
 let option_div_i64 = ref "stsud" 
 let option_fcoalesce_mem = ref true
 let option_fforward_moves = ref false
-let option_fmove_loop_invariants = ref true
+let option_fmove_loop_invariants = ref false
 let option_fnontrap_loads = ref true
 let option_all_loads_nontrap = ref false
 let option_inline_auto_threshold = ref 0
diff --git a/driver/Compiler.vexpand b/driver/Compiler.vexpand
index 0f59aab7..3acec956 100644
--- a/driver/Compiler.vexpand
+++ b/driver/Compiler.vexpand
@@ -35,6 +35,7 @@ Require Cshmgen.
 Require Cminorgen.
 Require Selection.
 Require RTLgen.
+Require Import Duplicatepasses.
 EXPAND_RTL_REQUIRE
 Require Asmgen.
 (** Proofs of semantic preservation. *)
@@ -53,7 +54,7 @@ Require Import Compopts.
 Parameter print_Clight: Clight.program -> unit.
 Parameter print_Cminor: Cminor.program -> unit.
 Parameter print_RTL: Z -> RTL.program -> unit.
-Parameter print_LTL: LTL.program -> unit.
+Parameter print_LTL: Z -> LTL.program -> unit.
 Parameter print_Mach: Mach.program -> unit.
 
 Local Open Scope string_scope.
diff --git a/driver/Compopts.v b/driver/Compopts.v
index 540e8922..0c90ee52 100644
--- a/driver/Compopts.v
+++ b/driver/Compopts.v
@@ -54,6 +54,9 @@ Parameter optim_CSE3_across_merges: unit -> bool.
 (** Flag -fcse3-glb *)
 Parameter optim_CSE3_glb: unit -> bool.
 
+(** Flag -fcse3-trivial-ops. For DMonniaux's common subexpression elimination, simplify trivial operations as well. *)
+Parameter optim_CSE3_trivial_ops: unit -> bool.
+
 (** Flag -fmove-loop-invariants. *)
 Parameter optim_move_loop_invariants: unit -> bool.
 
diff --git a/driver/Driver.ml b/driver/Driver.ml
index 12f50762..d93578b6 100644
--- a/driver/Driver.ml
+++ b/driver/Driver.ml
@@ -202,6 +202,8 @@ Processing options:
   -fcse3-across-calls   Propagate CSE3 information across function calls [off]
   -fcse3-across-merges  Propagate CSE3 information across control-flow merges [on]
   -fcse3-glb            Refine CSE3 information using greatest lower bounds [on]
+  -fcse3-trivial-ops    Replace trivial operations as well using CSE3 [off]
+  -fcse3-refine         Refine CSE3 invariants by descending iteration [on]
   -fmove-loop-invariants Perform loop-invariant code motion [off]
   -fredundancy   Perform redundancy elimination [on]
   -fpostpass     Perform postpass scheduling (only for K1 architecture) [on]
@@ -213,6 +215,8 @@ Processing options:
   -ftracelinearize  Uses branch prediction information to improve the Linearize [on]
   -funrollsingle n  Unrolls a single iteration of innermost loops of size n (not counting Inops) [0]
   -funrollbody n  Unrolls once the body of innermost loops of size n (not counting Inops) [0]
+  -flooprotate n  Duplicates the header (condition computation part) of innermost loops to perform a loop rotate [0]
+                  Doesn't duplicate if the size of that header is strictly greater than n
   -fforward-moves   Forward moves after CSE
   -finline       Perform inlining of functions [on]
   -finline-functions-called-once Integrate functions only required by their
@@ -415,6 +419,8 @@ let cmdline_actions =
   @ f_opt "cse3-across-calls" option_fcse3_across_calls
   @ f_opt "cse3-across-merges" option_fcse3_across_merges
   @ f_opt "cse3-glb" option_fcse3_glb
+  @ f_opt "cse3-trivial-ops" option_fcse3_trivial_ops
+  @ f_opt "cse3-refine" option_fcse3_refine
   @ f_opt "move-loop-invariants" option_fmove_loop_invariants
   @ f_opt "redundancy" option_fredundancy
   @ f_opt "postpass" option_fpostpass
@@ -422,6 +428,7 @@ let cmdline_actions =
   @ f_opt "predict" option_fpredict
   @ [ Exact "-funrollsingle", Integer (fun n -> option_funrollsingle := n) ]
   @ [ Exact "-funrollbody", Integer (fun n -> option_funrollbody := n) ]
+  @ [ Exact "-flooprotate", Integer (fun n -> option_flooprotate := n) ]
   @ f_opt "tracelinearize" option_ftracelinearize
   @ f_opt_str "postpass" option_fpostpass option_fpostpass_sched
   @ f_opt "inline" option_finline
diff --git a/extraction/extraction.v b/extraction/extraction.v
index 5de66797..f5b8291b 100644
--- a/extraction/extraction.v
+++ b/extraction/extraction.v
@@ -128,6 +128,8 @@ Extract Constant Compopts.optim_CSE3_across_merges =>
   "fun _ -> !Clflags.option_fcse3_across_merges".
 Extract Constant Compopts.optim_CSE3_glb =>
   "fun _ -> !Clflags.option_fcse3_glb".
+Extract Constant Compopts.optim_CSE3_trivial_ops =>
+  "fun _ -> !Clflags.option_fcse3_trivial_ops".
 Extract Constant Compopts.optim_move_loop_invariants =>
   "fun _ -> !Clflags.option_fmove_loop_invariants".
 
diff --git a/lib/UnionFind.v b/lib/UnionFind.v
index 20bb91cd..bd1b763b 100644
--- a/lib/UnionFind.v
+++ b/lib/UnionFind.v
@@ -124,6 +124,15 @@ Module Type UNIONFIND.
         pathlen uf x + pathlen uf b + 1
       else
         pathlen uf x.
+   Axiom pathlen_union:
+    forall uf a b x,
+    pathlen (union uf a b) x =
+      if elt_eq (repr uf a) (repr uf b) then
+        pathlen uf x
+     else if elt_eq (repr uf x) (repr uf a) then
+        (pathlen uf x)+1
+      else
+        (pathlen uf x).
   Axiom pathlen_gt_merge:
     forall uf a b x y,
     repr uf x = repr uf y ->
@@ -531,6 +540,7 @@ Qed.
 
 End PATHLEN.
 
+
 (* Path length and merge *)
 
 Lemma pathlen_merge:
@@ -549,16 +559,49 @@ Proof.
   set (uf' := identify uf (repr uf a) b (repr_res_none uf a) (not_eq_sym n)).
   pattern x. apply (well_founded_ind (mwf uf')); intros.
   rewrite (pathlen_unroll uf'). destruct (M.get x0 (m uf')) as [x'|] eqn:G.
-  rewrite H; auto. simpl in G. rewrite M.gsspec in G.
-  destruct (M.elt_eq x0 (repr uf a)). rewrite e. rewrite repr_canonical. rewrite dec_eq_true.
-  inversion G. subst x'. rewrite dec_eq_false; auto.
-  replace (pathlen uf (repr uf a)) with 0. omega.
-  symmetry. apply pathlen_none. apply repr_res_none.
-  rewrite (repr_unroll uf x0), (pathlen_unroll uf x0); rewrite G.
-  destruct (M.elt_eq (repr uf x') (repr uf a)); omega.
-  simpl in G. rewrite M.gsspec in G. destruct (M.elt_eq x0 (repr uf a)); try discriminate.
-  rewrite (repr_none uf x0) by auto. rewrite dec_eq_false; auto.
-  symmetry. apply pathlen_zero; auto. apply repr_none; auto.
+  + rewrite H; auto. clear H. simpl in G. rewrite M.gsspec in G.
+    destruct (M.elt_eq x0 (repr uf a)).
+    - rewrite e, repr_canonical, dec_eq_true.
+      inversion G. subst x'. rewrite dec_eq_false; auto.
+      replace (pathlen uf (repr uf a)) with 0; try omega.
+      symmetry. apply pathlen_none. apply repr_res_none.
+    - rewrite (repr_unroll uf x0), (pathlen_unroll uf x0), G.
+      destruct (M.elt_eq (repr uf x') (repr uf a)); omega.
+  + clear H; simpl in G. rewrite M.gsspec in G. destruct (M.elt_eq x0 (repr uf a)); try discriminate.
+    rewrite (repr_none uf x0) by auto. rewrite dec_eq_false; auto.
+    symmetry. apply pathlen_zero; auto. apply repr_none; auto.
+Qed.
+
+Lemma pathlen_union:
+  forall uf a b x,
+  pathlen (union uf a b) x =
+    if M.elt_eq (repr uf a) (repr uf b) then
+      pathlen uf x
+    else if M.elt_eq (repr uf x) (repr uf a) then
+      (pathlen uf x)+1
+    else
+      (pathlen uf x).
+Proof.
+  intros. unfold union.
+  destruct (M.elt_eq (repr uf a) (repr uf b)).
+  auto.
+  set (uf' := identify uf _ _ _ _).
+  assert (LENa: pathlen uf (repr uf a) = 0).
+  { apply pathlen_none. apply repr_res_none. }
+  pattern x. apply (well_founded_ind (mwf uf')); intros.
+  rewrite (pathlen_unroll uf'). destruct (M.get x0 (m uf')) as [x'|] eqn:G.
+  + rewrite H; auto. clear H. simpl in G. rewrite M.gsspec in G.
+    destruct (M.elt_eq x0 (repr uf a)).
+    - inversion G; clear G. subst.
+      rewrite !repr_canonical, dec_eq_true.
+      rewrite dec_eq_false; auto.
+      rewrite LENa. rewrite (pathlen_none uf (repr uf b)); try omega.
+      apply repr_res_none.
+    - rewrite (repr_unroll uf x0), G, ! (pathlen_some _ _ _ G).
+      destruct (M.elt_eq _ _); auto.
+  + clear H. simpl in G. rewrite M.gsspec in G.
+    destruct (M.elt_eq _ (repr uf a)); try discriminate.
+    rewrite (repr_none _ _ G), !(pathlen_none _ _ G), dec_eq_false; auto.
 Qed.
 
 Lemma pathlen_gt_merge:
diff --git a/test/monniaux/loop_nest/polybench.h b/test/monniaux/loop_nest/polybench.h
new file mode 100644
index 00000000..7d092e45
--- /dev/null
+++ b/test/monniaux/loop_nest/polybench.h
@@ -0,0 +1,202 @@
+/**
+ * polybench.h: This file is part of the PolyBench/C 3.2 test suite.
+ *
+ *
+ * Contact: Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
+ * Web address: http://polybench.sourceforge.net
+ */
+/*
+ * Polybench header for instrumentation.
+ *
+ * Programs must be compiled with `-I utilities utilities/polybench.c'
+ *
+ * Optionally, one can define:
+ *
+ * -DPOLYBENCH_TIME, to report the execution time,
+ *   OR (exclusive):
+ * -DPOLYBENCH_PAPI, to use PAPI H/W counters (defined in polybench.c)
+ *
+ *
+ * See README or utilities/polybench.c for additional options.
+ *
+ */
+#ifndef POLYBENCH_H
+# define POLYBENCH_H
+
+# include <stdlib.h>
+
+/* Array padding. By default, none is used. */
+# ifndef POLYBENCH_PADDING_FACTOR
+/* default: */
+#  define POLYBENCH_PADDING_FACTOR 0
+# endif
+
+
+/* C99 arrays in function prototype. By default, do not use. */
+# ifdef POLYBENCH_USE_C99_PROTO
+#  define POLYBENCH_C99_SELECT(x,y) y
+# else
+/* default: */
+#  define POLYBENCH_C99_SELECT(x,y) x
+# endif
+
+
+/* Scalar loop bounds in SCoPs. By default, use parametric loop bounds. */
+# ifdef POLYBENCH_USE_SCALAR_LB
+#  define POLYBENCH_LOOP_BOUND(x,y) x
+# else
+/* default: */
+#  define POLYBENCH_LOOP_BOUND(x,y) y
+# endif
+
+
+/* Macros to reference an array. Generic for heap and stack arrays
+   (C99).  Each array dimensionality has his own macro, to be used at
+   declaration or as a function argument.
+   Example:
+   int b[x] => POLYBENCH_1D_ARRAY(b, x)
+   int A[N][N] => POLYBENCH_2D_ARRAY(A, N, N)
+*/
+# ifndef POLYBENCH_STACK_ARRAYS
+#  define POLYBENCH_ARRAY(x) *x
+#  define POLYBENCH_FREE_ARRAY(x) free((void*)x);
+#  define POLYBENCH_DECL_VAR(x) (*x)
+# else
+#  define POLYBENCH_ARRAY(x) x
+#  define POLYBENCH_FREE_ARRAY(x)
+#  define POLYBENCH_DECL_VAR(x) x
+# endif
+/* Macros for using arrays in the function prototypes. */
+# define POLYBENCH_1D(var, dim1,ddim1) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR]
+# define POLYBENCH_2D(var, dim1, dim2, ddim1, ddim2) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR]
+# define POLYBENCH_3D(var, dim1, dim2, dim3, ddim1, ddim2, ddim3) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR]
+# define POLYBENCH_4D(var, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR]
+# define POLYBENCH_5D(var, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) var[POLYBENCH_C99_SELECT(dim1,ddim1) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim2,ddim2) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim3,ddim3) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim4,ddim4) + POLYBENCH_PADDING_FACTOR][POLYBENCH_C99_SELECT(dim5,ddim5) + POLYBENCH_PADDING_FACTOR]
+
+
+/* Macros to allocate heap arrays.
+   Example:
+   polybench_alloc_2d_array(N, M, double) => allocates N x M x sizeof(double)
+					  and returns a pointer to the 2d array
+ */
+# define POLYBENCH_ALLOC_1D_ARRAY(n1, type)	\
+  (type(*)[n1 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data (n1 + POLYBENCH_PADDING_FACTOR, sizeof(type))
+# define POLYBENCH_ALLOC_2D_ARRAY(n1, n2, type)		\
+  (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR), sizeof(type))
+# define POLYBENCH_ALLOC_3D_ARRAY(n1, n2, n3, type)		\
+  (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR), sizeof(type))
+# define POLYBENCH_ALLOC_4D_ARRAY(n1, n2, n3, n4, type)			\
+  (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR][n4 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR) * (n4 + POLYBENCH_PADDING_FACTOR), sizeof(type))
+# define POLYBENCH_ALLOC_5D_ARRAY(n1, n2, n3, n4, n5, type)		\
+  (type(*)[n1 + POLYBENCH_PADDING_FACTOR][n2 + POLYBENCH_PADDING_FACTOR][n3 + POLYBENCH_PADDING_FACTOR][n4 + POLYBENCH_PADDING_FACTOR][n5 + POLYBENCH_PADDING_FACTOR])polybench_alloc_data ((n1 + POLYBENCH_PADDING_FACTOR) * (n2 + POLYBENCH_PADDING_FACTOR) * (n3 + POLYBENCH_PADDING_FACTOR) * (n4 + POLYBENCH_PADDING_FACTOR) * (n5 + POLYBENCH_PADDING_FACTOR), sizeof(type))
+
+/* Macros for array declaration. */
+# ifndef POLYBENCH_STACK_ARRAYS
+#  define POLYBENCH_1D_ARRAY_DECL(var, type, dim1, ddim1)		\
+  type POLYBENCH_1D(POLYBENCH_DECL_VAR(var), dim1, ddim1); \
+  var = POLYBENCH_ALLOC_1D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), type);
+#  define POLYBENCH_2D_ARRAY_DECL(var, type, dim1, dim2, ddim1, ddim2)	\
+  type POLYBENCH_2D(POLYBENCH_DECL_VAR(var), dim1, dim2, ddim1, ddim2); \
+  var = POLYBENCH_ALLOC_2D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), type);
+#  define POLYBENCH_3D_ARRAY_DECL(var, type, dim1, dim2, dim3, ddim1, ddim2, ddim3) \
+  type POLYBENCH_3D(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, ddim1, ddim2, ddim3); \
+  var = POLYBENCH_ALLOC_3D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), type);
+#  define POLYBENCH_4D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) \
+  type POLYBENCH_4D(POLYBENCH_DECL_VAR(var), dim1, dim2, ,dim3, dim4, ddim1, ddim2, ddim3, ddim4); \
+  var = POLYBENCH_ALLOC_4D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), POLYBENCH_C99_SELECT(dim4, ddim4), type);
+#  define POLYBENCH_5D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) \
+  type POLYBENCH_5D(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5); \
+  var = POLYBENCH_ALLOC_5D_ARRAY(POLYBENCH_C99_SELECT(dim1, ddim1), POLYBENCH_C99_SELECT(dim2, ddim2), POLYBENCH_C99_SELECT(dim3, ddim3), POLYBENCH_C99_SELECT(dim4, ddim4), POLYBENCH_C99_SELECT(dim5, ddim5), type);
+# else
+#  define POLYBENCH_1D_ARRAY_DECL(var, type, dim1, ddim1)		\
+  type POLYBENCH_1D(POLYBENCH_DECL_VAR(var), dim1, ddim1);
+#  define POLYBENCH_2D_ARRAY_DECL(var, type, dim1, dim2, ddim1, ddim2)	\
+  type POLYBENCH_2D(POLYBENCH_DECL_VAR(var), dim1, dim2, ddim1, ddim2);
+#  define POLYBENCH_3D_ARRAY_DECL(var, type, dim1, dim2, dim3, ddim1, ddim2, ddim3) \
+  type POLYBENCH_3D(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, ddim1, ddim2, ddim3);
+#  define POLYBENCH_4D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4) \
+  type POLYBENCH_4D(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, ddim1, ddim2, ddim3, ddim4);
+#  define POLYBENCH_5D_ARRAY_DECL(var, type, dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5) \
+  type POLYBENCH_5D(POLYBENCH_DECL_VAR(var), dim1, dim2, dim3, dim4, dim5, ddim1, ddim2, ddim3, ddim4, ddim5);
+# endif
+
+
+/* Dead-code elimination macros. Use argc/argv for the run-time check. */
+# ifndef POLYBENCH_DUMP_ARRAYS
+#  define POLYBENCH_DCE_ONLY_CODE    if (argc > 42 && ! strcmp(argv[0], ""))
+# else
+#  define POLYBENCH_DCE_ONLY_CODE
+# endif
+
+# define polybench_prevent_dce(func)		\
+  POLYBENCH_DCE_ONLY_CODE			\
+  func
+
+
+/* Performance-related instrumentation. See polybench.c */
+# define polybench_start_instruments
+# define polybench_stop_instruments
+# define polybench_print_instruments
+
+
+/* PAPI support. */
+# ifdef POLYBENCH_PAPI
+extern const unsigned int polybench_papi_eventlist[];
+#  undef polybench_start_instruments
+#  undef polybench_stop_instruments
+#  undef polybench_print_instruments
+#  define polybench_set_papi_thread_report(x)	\
+   polybench_papi_counters_threadid = x;
+#  define polybench_start_instruments				\
+  polybench_prepare_instruments();				\
+  polybench_papi_init();					\
+  int evid;							\
+  for (evid = 0; polybench_papi_eventlist[evid] != 0; evid++)	\
+    {								\
+      if (polybench_papi_start_counter(evid))			\
+	continue;						\
+
+#  define polybench_stop_instruments		\
+      polybench_papi_stop_counter(evid);	\
+    }						\
+  polybench_papi_close();			\
+
+#  define polybench_print_instruments polybench_papi_print();
+# endif
+
+
+/* Timing support. */
+# if defined(POLYBENCH_TIME) || defined(POLYBENCH_GFLOPS)
+#  undef polybench_start_instruments
+#  undef polybench_stop_instruments
+#  undef polybench_print_instruments
+#  define polybench_start_instruments polybench_timer_start();
+#  define polybench_stop_instruments polybench_timer_stop();
+#  define polybench_print_instruments polybench_timer_print();
+extern double polybench_program_total_flops;
+extern void polybench_timer_start();
+extern void polybench_timer_stop();
+extern void polybench_timer_print();
+# endif
+
+/* Function declaration. */
+# ifdef POLYBENCH_TIME
+extern void polybench_timer_start();
+extern void polybench_timer_stop();
+extern void polybench_timer_print();
+# endif
+
+# ifdef POLYBENCH_PAPI
+extern void polybench_prepare_instruments();
+extern int polybench_papi_start_counter(int evid);
+extern void polybench_papi_stop_counter(int evid);
+extern void polybench_papi_init();
+extern void polybench_papi_close();
+extern void polybench_papi_print();
+# endif
+
+/* Function prototypes. */
+extern void* polybench_alloc_data(unsigned long long int n, int elt_size);
+
+
+#endif /* !POLYBENCH_H */
diff --git a/test/monniaux/loop_nest/syrk.h b/test/monniaux/loop_nest/syrk.h
new file mode 100644
index 00000000..c753ff3b
--- /dev/null
+++ b/test/monniaux/loop_nest/syrk.h
@@ -0,0 +1,54 @@
+/**
+ * syrk.h: This file is part of the PolyBench/C 3.2 test suite.
+ *
+ *
+ * Contact: Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
+ * Web address: http://polybench.sourceforge.net
+ */
+#ifndef SYRK_H
+# define SYRK_H
+
+/* Default to STANDARD_DATASET. */
+# if !defined(MINI_DATASET) && !defined(SMALL_DATASET) && !defined(LARGE_DATASET) && !defined(EXTRALARGE_DATASET)
+#  define STANDARD_DATASET
+# endif
+
+/* Do not define anything if the user manually defines the size. */
+# if !defined(NI) && !defined(NJ)
+/* Define the possible dataset sizes. */
+#  ifdef MINI_DATASET
+#   define NI 32
+#   define NJ 32
+#  endif
+
+#  ifdef SMALL_DATASET
+#   define NI 128
+#   define NJ 128
+#  endif
+
+#  ifdef STANDARD_DATASET /* Default if unspecified. */
+#   define NI 1024
+#   define NJ 1024
+#  endif
+
+#  ifdef LARGE_DATASET
+#   define NI 2000
+#   define NJ 2000
+#  endif
+
+#  ifdef EXTRALARGE_DATASET
+#   define NI 4000
+#   define NJ 4000
+#  endif
+# endif /* !N */
+
+# define _PB_NI POLYBENCH_LOOP_BOUND(NI,ni)
+# define _PB_NJ POLYBENCH_LOOP_BOUND(NJ,nj)
+
+# ifndef DATA_TYPE
+#  define DATA_TYPE double
+#  define DATA_PRINTF_MODIFIER "%0.2lf "
+# endif
+
+
+#endif /* !SYRK */
diff --git a/tools/compiler_expand.ml b/tools/compiler_expand.ml
index e5cab30c..febb0282 100644
--- a/tools/compiler_expand.ml
+++ b/tools/compiler_expand.ml
@@ -11,65 +11,79 @@ David Monniaux, CNRS, VERIMAG
 type is_partial = TOTAL | PARTIAL;;
 type print_result = Noprint | Print of string;;
 type when_triggered = Always | Option of string;;
+type needs_require = Require | NoRequire;;
 
+(* FIXME - The gestion of NoRequire is a bit ugly right now. *)
 let rtl_passes =
 [|
-TOTAL, (Option "optim_tailcalls"), (Some "Tail calls"), "Tailcall";
-PARTIAL, Always, (Some "Inlining"), "Inlining";
-TOTAL, (Option "profile_arcs"), (Some "Profiling insertion"), "Profiling";
-TOTAL, (Option "branch_probabilities"), (Some "Profiling use"), "ProfilingExploit";
-TOTAL, (Option "optim_move_loop_invariants"), (Some "Inserting initial nop"), "FirstNop";
-TOTAL, Always, (Some "Renumbering"), "Renumber";
-PARTIAL, (Option "optim_CSE"), (Some "CSE"), "CSE";
-PARTIAL, Always, (Some "Duplicating blocks"), "Duplicate";
-TOTAL, Always, (Some "Renumbering pre constprop"), "Renumber";
-TOTAL, (Option "optim_constprop"), (Some "Constant propagation"), "Constprop";
-TOTAL, Always, (Some "Renumbering pre CSE"), "Renumber";
-TOTAL, (Option "optim_CSE2"), (Some "CSE2"), "CSE2";
-PARTIAL, (Option "optim_CSE3"), (Some "CSE3"), "CSE3";
-TOTAL, (Option "optim_CSE3"), (Some "Kill useless moves after CSE3"), "KillUselessMoves";
-TOTAL, (Option "optim_forward_moves"), (Some "Forwarding moves"), "ForwardMoves";
-PARTIAL, (Option "optim_redundancy"), (Some "Redundancy elimination"), "Deadcode";
-PARTIAL, (Option "optim_move_loop_invariants"), (Some "LICM"), "LICM";
-TOTAL, (Option "optim_move_loop_invariants"), (Some "Renumbering for LICM"), "Renumber";
-PARTIAL, (Option "optim_move_loop_invariants"), (Some "CSE3 for LICM"), "CSE3";
-PARTIAL, (Option "optim_move_loop_invariants"), (Some "Redundancy elimination for LICM"), "Deadcode";
-TOTAL, (Option "all_loads_nontrap"), None, "Allnontrap";
-PARTIAL, Always, (Some "Unused globals"), "Unusedglob"
+TOTAL, (Option "optim_tailcalls"), Require, (Some "Tail calls"), "Tailcall";
+PARTIAL, Always, Require, (Some "Inlining"), "Inlining";
+TOTAL, (Option "profile_arcs"), Require, (Some "Profiling insertion"), "Profiling";
+TOTAL, (Option "branch_probabilities"), Require, (Some "Profiling use"), "ProfilingExploit";
+TOTAL, (Option "optim_move_loop_invariants"), Require, (Some "Inserting initial nop"), "FirstNop";
+TOTAL, Always, Require, (Some "Renumbering"), "Renumber";
+PARTIAL, (Option "optim_CSE"), Require, (Some "CSE"), "CSE";
+PARTIAL, Always, NoRequire, (Some "Static Prediction + inverting conditions"), "Staticpredict";
+PARTIAL, Always, NoRequire, (Some "Unrolling one iteration out of innermost loops"), "Unrollsingle";
+TOTAL, Always, NoRequire, (Some "Renumbering pre unroll"), "Renumber";
+PARTIAL, Always, NoRequire, (Some "Unrolling the body of innermost loops"), "Unrollbody";
+TOTAL, Always, NoRequire, (Some "Renumbering pre tail duplication"), "Renumber";
+PARTIAL, Always, NoRequire, (Some "Performing tail duplication"), "Tailduplicate";
+TOTAL, Always, NoRequire, (Some "Renumbering pre constprop"), "Renumber";
+TOTAL, (Option "optim_constprop"), Require, (Some "Constant propagation"), "Constprop";
+TOTAL, Always, NoRequire, (Some "Renumbering pre CSE"), "Renumber";
+TOTAL, (Option "optim_CSE2"), Require, (Some "CSE2"), "CSE2";
+PARTIAL, (Option "optim_CSE3"), Require, (Some "CSE3"), "CSE3";
+TOTAL, (Option "optim_CSE3"), Require, (Some "Kill useless moves after CSE3"), "KillUselessMoves";
+TOTAL, (Option "optim_forward_moves"), Require, (Some "Forwarding moves"), "ForwardMoves";
+PARTIAL, (Option "optim_redundancy"), Require, (Some "Redundancy elimination"), "Deadcode";
+TOTAL, Always, Require, (Some "Renumbering pre rotate"), "Renumber";
+PARTIAL, Always, NoRequire, (Some "Loop Rotate"), "Looprotate";
+TOTAL, (Option "optim_move_loop_invariants"), NoRequire, (Some "Renumbering for LICM"), "Renumber";
+PARTIAL, (Option "optim_move_loop_invariants"), Require, (Some "LICM"), "LICM";
+TOTAL, (Option "optim_move_loop_invariants"), NoRequire, (Some "Renumbering for LICM"), "Renumber";
+PARTIAL, (Option "optim_move_loop_invariants"), NoRequire, (Some "CSE3 for LICM"), "CSE3";
+PARTIAL, (Option "optim_move_loop_invariants"), NoRequire, (Some "Redundancy elimination for LICM"), "Deadcode";
+TOTAL, (Option "all_loads_nontrap"), Require, None, "Allnontrap";
+PARTIAL, Always, Require, (Some "Unused globals"), "Unusedglob"
 |];;
 
 let post_rtl_passes =
 [|
-  PARTIAL, Always, (Some "Register allocation"), "Allocation", (Print "LTL");
-  TOTAL, Always, (Some "Branch tunneling"), "Tunneling", Noprint;
-  PARTIAL, Always, (Some "CFG linearization"), "Linearize", Noprint;
-  TOTAL, Always, (Some "Label cleanup"), "CleanupLabels", Noprint;
-  PARTIAL, (Option "debug"), (Some "Debugging info for local variables"), "Debugvar", Noprint;
-  PARTIAL, Always, (Some "Mach generation"), "Stacking", (Print "Mach")
+  PARTIAL, Always, Require, (Some "Register allocation"), "Allocation", (Print "LTL 1");
+  PARTIAL, Always, Require, (Some "Branch tunneling"), "Tunneling", (Print "LTL 2");
+  PARTIAL, Always, Require, (Some "CFG linearization"), "Linearize", Noprint;
+  TOTAL, Always, Require, (Some "Label cleanup"), "CleanupLabels", Noprint;
+  PARTIAL, (Option "debug"), Require, (Some "Debugging info for local variables"), "Debugvar", Noprint;
+  PARTIAL, Always, Require, (Some "Mach generation"), "Stacking", (Print "Mach")
 |];;
 
 let all_passes =
   Array.concat
     [Array.mapi
-       (fun i (a,b,c,d) -> (a,b,c,d, Print (Printf.sprintf "RTL %d" (i+1))))
+       (fun i (a,b,r,c,d) -> (a,b,r,c,d, Print (Printf.sprintf "RTL %d" (i+1))))
        rtl_passes;
      post_rtl_passes];;
 
 let totality = function TOTAL -> "total" | PARTIAL -> "partial";;
 
 let print_rtl_require oc =
-  Array.iter (fun (partial, trigger, time_label, pass_name, printing) ->
-      Printf.fprintf oc "Require %s.\n" pass_name)
-    all_passes;;
+  Array.iter (fun (partial, trigger, require, time_label, pass_name, printing) ->
+      match require with Require -> 
+        Printf.fprintf oc "Require %s.\n" pass_name
+      | _ -> ()
+    ) all_passes;;
 
 let print_rtl_require_proof oc =
-  Array.iter (fun (partial, trigger, time_label, pass_name, printing) ->
-      Printf.fprintf oc "Require %sproof.\n" pass_name)
-    all_passes;;
+  Array.iter (fun (partial, trigger, require, time_label, pass_name, printing) ->
+      match require with Require -> 
+      Printf.fprintf oc "Require %sproof.\n" pass_name
+      | _ -> ()
+    ) all_passes;;
 
 let print_rtl_transf oc =
   Array.iteri
-    (fun i (partial, trigger, time_label, pass_name, printing) ->
+    (fun i (partial, trigger, require, time_label, pass_name, printing) ->
       output_string oc (match partial with
                         | TOTAL -> "   @@ "
                         | PARTIAL -> "  @@@ ");
@@ -90,7 +104,7 @@ let print_rtl_transf oc =
     ) all_passes;;
 
 let print_rtl_mkpass oc =
-  Array.iter (fun (partial, trigger, time_label, pass_name, printing) ->
+  Array.iter (fun (partial, trigger, require, time_label, pass_name, printing) ->
       output_string oc "  ::: mkpass (";
       (match trigger with
        | Always -> ()
@@ -106,7 +120,7 @@ let print_if kind oc = function
 let numbering_base = 7
                    
 let print_rtl_proof oc =
-  Array.iteri (fun i (partial, trigger, time_label, pass_name, printing) ->
+  Array.iteri (fun i (partial, trigger, require, time_label, pass_name, printing) ->
       let j = i+numbering_base in
       match partial with
       | TOTAL ->
@@ -118,7 +132,7 @@ let print_rtl_proof oc =
     all_passes;;
 
 let print_rtl_proof2 oc =
-  Array.iteri (fun i (partial, trigger, time_label, pass_name, printing) ->
+  Array.iteri (fun i (partial, trigger, require, time_label, pass_name, printing) ->
       let j = i+numbering_base in
       Printf.fprintf oc "  exists p%d; split. " j;
       (match trigger with
@@ -131,7 +145,7 @@ let print_rtl_proof2 oc =
     all_passes;;
 
 let print_rtl_forward_simulations oc =
-  Array.iter (fun (partial, trigger, time_label, pass_name) ->
+  Array.iter (fun (partial, trigger, require, time_label, pass_name) ->
       output_string oc "  eapply compose_forward_simulations.\n    ";
       (match trigger with
        | Always -> ()