63 files changed, 2096 insertions, 1293 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 0499abc2..10008017 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -3,9 +3,8 @@ stages:
 
 check-admitted:
   stage: build
-  image: "coqorg/coq"
+  image: coqorg/coq:8.11.2-ocaml-4.09.1-flambda
   before_script:
-    - opam switch 4.07.1+flambda
     - eval `opam config env`
     - opam update
     - opam install -y menhir
@@ -13,7 +12,7 @@ check-admitted:
     - ./config_x86_64.sh
     - make check-admitted
   rules:
-    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "mppa-kvx"'
       when: always
@@ -23,9 +22,8 @@ check-admitted:
     
 build_x86_64:
   stage: build
-  image: "coqorg/coq"
+  image: coqorg/coq:8.11.2-ocaml-4.09.1-flambda
   before_script:
-    - opam switch 4.07.1+flambda
     - eval `opam config env`
     - opam update
     - opam install -y menhir
@@ -35,7 +33,7 @@ build_x86_64:
     - make -C test all test
     - ulimit -s65536 && make -C test/monniaux/yarpgen
   rules:
-    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "mppa-kvx"'
       when: always
@@ -45,11 +43,10 @@ build_x86_64:
 
 build_ia32:
   stage: build
-  image: "coqorg/coq"
+  image: coqorg/coq:8.11.2-ocaml-4.09.1-flambda
   before_script:
     - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
     - sudo apt-get -y install gcc-multilib
-    - opam switch 4.07.1+flambda
     - eval `opam config env`
     - opam update
     - opam install -y menhir
@@ -59,7 +56,7 @@ build_ia32:
     - make -C test all test
     - ulimit -s65536 && make -C test/monniaux/yarpgen BITS=32 TARGET_CC='gcc -m32'
   rules:
-    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "mppa-kvx"'
       when: always
@@ -69,11 +66,10 @@ build_ia32:
 
 build_aarch64:
   stage: build
-  image: "coqorg/coq"
+  image: coqorg/coq:8.11.2-ocaml-4.09.1-flambda
   before_script:
     - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
     - sudo apt-get -y install gcc-aarch64-linux-gnu qemu-user
-    - opam switch 4.07.1+flambda
     - eval `opam config env`
     - opam update
     - opam install -y menhir
@@ -83,7 +79,7 @@ build_aarch64:
     - make -C test CCOMPOPTS='-static' SIMU='qemu-aarch64' EXECUTE='qemu-aarch64' all test
     - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='aarch64-linux-gnu-gcc' EXECUTE='qemu-aarch64' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
   rules:
-    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "mppa-kvx"'
       when: always
@@ -93,11 +89,10 @@ build_aarch64:
 
 build_arm:
   stage: build
-  image: "coqorg/coq"
+  image: coqorg/coq:8.11.2-ocaml-4.09.1-flambda
   before_script:
     - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
     - sudo apt-get -y install gcc-arm-linux-gnueabi qemu-user
-    - opam switch 4.07.1+flambda
     - eval `opam config env`
     - opam update
     - opam install -y menhir
@@ -107,7 +102,7 @@ build_arm:
     - make -C test CCOMPOPTS=-static SIMU='qemu-arm' EXECUTE='qemu-arm' all test
     - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='arm-linux-gnueabi-gcc' EXECUTE='qemu-arm' CCOMPOPTS='-static' TARGET_CFLAGS='-static' BITS=32
   rules:
-    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "mppa-kvx"'
       when: always
@@ -118,11 +113,10 @@ build_arm:
 
 build_armhf:
   stage: build
-  image: "coqorg/coq"
+  image: coqorg/coq:8.11.2-ocaml-4.09.1-flambda
   before_script:
     - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
     - sudo apt-get -y install gcc-arm-linux-gnueabihf qemu-user
-    - opam switch 4.07.1+flambda
     - eval `opam config env`
     - opam update
     - opam install -y menhir
@@ -132,7 +126,7 @@ build_armhf:
     - make -C test CCOMPOPTS=-static SIMU='qemu-arm' EXECUTE='qemu-arm' all test
     - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='arm-linux-gnueabihf-gcc' EXECUTE='qemu-arm' CCOMPOPTS='-static' TARGET_CFLAGS='-static' BITS=32
   rules:
-    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "mppa-kvx"'
       when: always
@@ -142,11 +136,10 @@ build_armhf:
 
 build_ppc:
   stage: build
-  image: "coqorg/coq"
+  image: coqorg/coq:8.11.2-ocaml-4.09.1-flambda
   before_script:
     - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
     - sudo apt-get -y install gcc-powerpc-linux-gnu qemu-user
-    - opam switch 4.07.1+flambda
     - eval `opam config env`
     - opam update
     - opam install -y menhir
@@ -154,7 +147,7 @@ build_ppc:
     - ./config_ppc.sh
     - make -j "$NJOBS"
   rules:
-    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "mppa-kvx"'
       when: always
@@ -164,11 +157,10 @@ build_ppc:
 
 build_ppc64:
   stage: build
-  image: "coqorg/coq"
+  image: coqorg/coq:8.11.2-ocaml-4.09.1-flambda
   before_script:
     - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
     - sudo apt-get -y install gcc-powerpc64-linux-gnu
-    - opam switch 4.07.1+flambda
     - eval `opam config env`
     - opam update
     - opam install -y menhir
@@ -176,7 +168,7 @@ build_ppc64:
     - ./config_ppc64.sh
     - make -j "$NJOBS"
   rules:
-    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "mppa-kvx"'
       when: always
@@ -186,11 +178,10 @@ build_ppc64:
 
 build_rv64:
   stage: build
-  image: "coqorg/coq"
+  image: coqorg/coq:8.11.2-ocaml-4.09.1-flambda
   before_script:
     - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
     - sudo apt-get -y install gcc-riscv64-linux-gnu qemu-user
-    - opam switch 4.07.1+flambda
     - eval `opam config env`
     - opam update
     - opam install -y menhir
@@ -200,7 +191,7 @@ build_rv64:
     - make -C test CCOMPOPTS=-static SIMU='qemu-riscv64' EXECUTE='qemu-riscv64' all test
     - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='riscv64-linux-gnu-gcc' EXECUTE='qemu-riscv64' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
   rules:
-    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "mppa-kvx"'
       when: always
@@ -210,11 +201,10 @@ build_rv64:
 
 build_rv32:
   stage: build
-  image: "coqorg/coq"
+  image: coqorg/coq:8.11.2-ocaml-4.09.1-flambda
   before_script:
     - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
     - sudo apt-get -y install gcc-riscv64-linux-gnu qemu-user
-    - opam switch 4.07.1+flambda
     - eval `opam config env`
     - opam update
     - opam install -y menhir
@@ -222,7 +212,7 @@ build_rv32:
     - ./config_rv32.sh -no-runtime-lib
     - make -j "$NJOBS"
   rules:
-    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "mppa-kvx"'
       when: always
@@ -232,7 +222,7 @@ build_rv32:
 
 build_kvx:
   stage: build
-  image: "coqorg/coq"
+  image: coqorg/coq:8.11.2-ocaml-4.09.1-flambda
   before_script:
     - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
     - sudo apt-get -y install sshpass openssh-client libzip4 lttng-tools liblttng-ctl-dev liblttng-ust-dev babeltrace
@@ -240,7 +230,6 @@ build_kvx:
     - rm -f download/*dkms*.deb download/*eclipse*.deb download/*llvm*.deb download/*board-mgmt* download/*oce-host* download/*pocl*
     - sudo dpkg -i download/*.deb
     - rm -rf download
-    - opam switch 4.07.1+flambda
     - eval `opam config env`
     - opam update
     - opam install -y menhir
@@ -248,9 +237,9 @@ build_kvx:
     - source /opt/kalray/accesscore/kalray.sh && ./config_kvx.sh
     - source /opt/kalray/accesscore/kalray.sh && make -j "$NJOBS"
     - source /opt/kalray/accesscore/kalray.sh && make -C test CCOMPOPTS=-static SIMU='kvx-cluster -- ' EXECUTE='kvx-cluster -- ' all test
-    - source /opt/kalray/accesscore/kalray.sh && make -C test/monniaux/yarpgen TARGET_CC='kvx-cos-gcc' EXECUTE='kvx-cluster -- ' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
+    - source /opt/kalray/accesscore/kalray.sh && ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='kvx-cos-gcc' EXECUTE='kvx-cluster -- ' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
   rules:
-    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "mppa-kvx"'
       when: always
@@ -260,7 +249,7 @@ build_kvx:
 
 pages: # TODO: change to "deploy" when "build" succeeds (or integrate with "build_kvx" above ?)
   stage: build
-  image: "coqorg/coq"
+  image: coqorg/coq:8.11.2-ocaml-4.09.1-flambda
   before_script:
     - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
     - sudo apt-get -y install sshpass openssh-client libzip4 lttng-tools liblttng-ctl-dev liblttng-ust-dev babeltrace
@@ -268,7 +257,6 @@ pages: # TODO: change to "deploy" when "build" succeeds (or integrate with "buil
     - rm -f download/*dkms*.deb download/*eclipse*.deb download/*llvm*.deb download/*board-mgmt* download/*oce-host* download/*pocl*
     - sudo dpkg -i download/*.deb
     - rm -rf download
-    - opam switch 4.07.1+flambda
     - eval `opam config env`
     - opam update
     - opam install -y menhir
diff --git a/Makefile b/Makefile
index ba8add27..c66395fa 100644
--- a/Makefile
+++ b/Makefile
@@ -93,6 +93,7 @@ BACKEND=\
   CSE2deps.v CSE2depsproof.v \
   CSE2.v CSE2proof.v \
   CSE3analysis.v CSE3analysisproof.v CSE3.v CSE3proof.v \
+  KillUselessMoves.v KillUselessMovesproof.v \
   LICM.v LICMproof.v \
   NeedDomain.v NeedOp.v Deadcode.v Deadcodeproof.v \
   Unusedglob.v Unusedglobproof.v \
diff --git a/README.md b/README.md
index 59ff7447..377776ca 100644
--- a/README.md
+++ b/README.md
@@ -16,19 +16,23 @@ features, installation instructions, using the compiler, etc), please
 refer to the [Web site](http://compcert.inria.fr/) and especially
 the [user's manual](http://compcert.inria.fr/man/).
 
-## VERIMAG version
+## Verimag-Kalray version
 This is a special version with additions from Verimag and Kalray :
 
-	* Some general-purpose optimization phases (e.g. profiling).
-	* A backend for the KVX processor.
+* A backend for the KVX processor: see [`README_Kalray.md`](README_Kalray.md) for details.
+* Some general-purpose optimization phases (e.g. profiling).
+ - see [`PROFILING.md`](PROFILING.md) for details on the profiling system
 	
 The people responsible for this version are
 
-	* Sylvain Boulmé (Grenoble-INP, Verimag)
-	* David Monniaux (CNRS, Verimag)
-	* Cyril Six (Kalray)
-	
-See also `README_Kalray.md` and `PROFILING.md` and [the online documentation](https://certicompil.gricad-pages.univ-grenoble-alpes.fr/compcert-kvx).
+* Sylvain Boulmé (Grenoble-INP, Verimag)
+* David Monniaux (CNRS, Verimag)
+* Cyril Six (Kalray)
+
+## Papers on this CompCert version
+
+* [a 5-minutes video](http://www-verimag.imag.fr/~boulme/videos/poster-oopsla20.mp4) by C. Six, presenting the postpass scheduling and the KVX backend.
+* [Certified and Efficient Instruction Scheduling](https://hal.archives-ouvertes.fr/hal-02185883), an OOPSLA'20 paper, by Six, Boulmé and Monniaux.
 
 ## License
 CompCert is not free software.  This non-commercial release can only
diff --git a/backend/CSE3analysis.v b/backend/CSE3analysis.v
index ade79c28..7316c9a9 100644
--- a/backend/CSE3analysis.v
+++ b/backend/CSE3analysis.v
@@ -298,13 +298,22 @@ Section OPERATIONS.
 
   
   Definition move (src dst : reg) (rel : RELATION.t) : RELATION.t :=
-    match eq_find {| eq_lhs := dst;
+    if peq src dst
+    then rel
+    else
+      match eq_find {| eq_lhs := dst;
                      eq_op  := SOp Omove;
                      eq_args:= src::nil |} with
-    | Some eq_id => PSet.add eq_id (kill_reg dst rel)
-    | None => kill_reg dst rel
-    end.
+      | Some eq_id => PSet.add eq_id (kill_reg dst rel)
+      | None => kill_reg dst rel
+      end.
 
+  Definition is_trivial_sym_op sop :=
+    match sop with
+    | SOp op => is_trivial_op op
+    | SLoad _ _ => false
+    end.
+  
   Definition oper (dst : reg) (op: sym_op) (args : list reg)
              (rel : RELATION.t) : RELATION.t :=
     if is_smove op
@@ -315,15 +324,18 @@ Section OPERATIONS.
       | _ => kill_reg dst rel
       end
     else
-      let args' := forward_move_l rel args in
-      match rhs_find op args' rel with
-      | Some r =>
-        if Compopts.optim_CSE3_glb tt
-        then RELATION.glb (move r dst rel)
-                          (oper1 dst op args' rel)
-        else oper1 dst op args' rel
-      | None => oper1 dst op args' rel
-      end.
+      if is_trivial_sym_op op
+      then kill_reg dst rel
+      else
+        let args' := forward_move_l rel args in
+        match rhs_find op args' rel with
+        | Some r =>
+          if Compopts.optim_CSE3_glb tt
+          then RELATION.glb (move r dst rel)
+                            (oper1 dst op args' rel)
+          else oper1 dst op args' rel
+        | None => oper1 dst op args' rel
+        end.
   
   Definition clever_kill_store
              (chunk : memory_chunk) (addr: addressing) (args : list reg)
diff --git a/backend/CSE3analysisaux.ml b/backend/CSE3analysisaux.ml
index 3e4a6b9e..3990b765 100644
--- a/backend/CSE3analysisaux.ml
+++ b/backend/CSE3analysisaux.ml
@@ -67,6 +67,9 @@ let pp_option pp oc = function
   | None -> output_string oc "none"
   | Some x -> pp oc x;;
 
+let is_trivial eq =
+  (eq.eq_op = SOp Op.Omove) && (eq.eq_args = [eq.eq_lhs]);;
+
 let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
   let cur_eq_id = ref 0
   and cur_catalog = ref PTree.empty
@@ -76,6 +79,7 @@ let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
   and cur_kill_mem = ref PSet.empty
   and cur_moves = ref (PMap.init PSet.empty) in
   let eq_find_oracle node eq =
+    assert (not (is_trivial eq));
     let o = Hashtbl.find_opt eq_table (flatten_eq eq) in
     (if !Clflags.option_debug_compcert > 1
      then Printf.printf "@%d: eq_find %a -> %a\n" (P.to_int node)
diff --git a/backend/CSE3analysisproof.v b/backend/CSE3analysisproof.v
index f4e3672d..66b199cc 100644
--- a/backend/CSE3analysisproof.v
+++ b/backend/CSE3analysisproof.v
@@ -745,6 +745,25 @@ Section SOUNDNESS.
 
   Hint Resolve oper1_sound : cse3.
 
+  Lemma rel_idem_replace:
+    forall rel rs r m,
+      sem_rel rel rs m ->
+      sem_rel rel rs # r <- (rs # r) m.
+  Proof.
+    intros until m.
+    intro REL.
+    unfold sem_rel, sem_eq, sem_rhs in *.
+    intros.
+    specialize REL with (i:=i) (eq0:=eq).
+    rewrite Regmap.gsident.
+    replace ((rs # r <- (rs # r)) ## (eq_args eq)) with
+        (rs ## (eq_args eq)).
+    { apply REL; auto. }
+    apply list_map_exten.
+    intros.
+    apply Regmap.gsident.
+  Qed.
+      
   Lemma move_sound :
     forall no : node,
     forall rel : RELATION.t,
@@ -756,6 +775,10 @@ Section SOUNDNESS.
     unfold move.
     intros until m.
     intro REL.
+    destruct (peq src dst).
+    { subst dst.
+      apply rel_idem_replace; auto.
+    }
     pose proof (eq_find_sound no  {| eq_lhs := dst; eq_op := SOp Omove; eq_args := src :: nil |}) as EQ_FIND_SOUND.
     destruct eq_find.
     - intros i eq CONTAINS.
@@ -798,7 +821,11 @@ Section SOUNDNESS.
       subst.
       rewrite <- (forward_move_sound rel rs m r) by auto.
       apply move_sound; auto.
-    - destruct rhs_find as [src |] eqn:RHS_FIND.
+    - destruct (is_trivial_sym_op sop).
+      {
+        apply kill_reg_sound; auto.
+      }
+      destruct rhs_find as [src |] eqn:RHS_FIND.
       + destruct (Compopts.optim_CSE3_glb tt).
         * apply sem_rel_glb; split.
           ** pose proof (rhs_find_sound no sop (forward_move_l (ctx:=ctx) rel args) rel src rs m REL RHS_FIND) as SOUND.
diff --git a/backend/CSE3proof.v b/backend/CSE3proof.v
index 6e489066..3fbc9912 100644
--- a/backend/CSE3proof.v
+++ b/backend/CSE3proof.v
@@ -443,12 +443,6 @@ Ltac IND_STEP :=
         idtac mpc mpc' fn minstr *)
       end.
 
-Lemma if_same : forall {T : Type} (b : bool) (x : T),
-    (if b then x else x) = x.
-Proof.
-  destruct b; trivial.
-Qed.
-
 Lemma step_simulation:
   forall S1 t S2, RTL.step ge S1 t S2 -> 
   forall S1', match_states S1 S1' ->
diff --git a/backend/Duplicateaux.ml b/backend/Duplicateaux.ml
index 00819834..eb9f42e0 100644
--- a/backend/Duplicateaux.ml
+++ b/backend/Duplicateaux.ml
@@ -15,6 +15,7 @@
 (* Oracle for Duplicate pass.
  * - Add static prediction information to Icond nodes
  * - Performs tail duplication on interesting traces to form superblocks
+ * - Unrolls a single iteration of innermost loops
  * - (TODO: perform partial loop unrolling inside innermost loops)
  *)
 
@@ -22,23 +23,13 @@ open RTL
 open Maps
 open Camlcoq
 
-let debug_flag = ref false
-
-let debug fmt =
-  if !debug_flag then Printf.eprintf fmt
-  else Printf.ifprintf stderr fmt
-
-let get_some = function
-| None -> failwith "Did not get some"
-| Some thing -> thing
-
-let rtl_successors = function
-| Itailcall _ | Ireturn _ -> []
-| Icall(_,_,_,_,n) | Ibuiltin(_,_,_,n) | Inop n | Iop (_,_,_,n)
-| Iload (_,_,_,_,_,n) | Istore (_,_,_,_,n) -> [n]
-| Icond (_,_,n1,n2,_) -> [n1; n2]
-| Ijumptable (_,ln) -> ln
+let debug_flag = LICMaux.debug_flag
+let debug = LICMaux.debug
+let get_loop_headers = LICMaux.get_loop_headers
+let get_some = LICMaux.get_some
+let rtl_successors = LICMaux.rtl_successors
 
+(* Get list of nodes following a BFS of the code *)
 let bfs code entrypoint = begin
   debug "bfs\n";
   let visited = ref (PTree.map (fun n i -> false) code)
@@ -67,6 +58,7 @@ let optbool o = match o with Some _ -> true | None -> false
 
 let ptree_get_some n ptree = get_some @@ PTree.get n ptree
 
+(* Returns a PTree: node -> list of the predecessors of that node *)
 let get_predecessors_rtl code = begin
   debug "get_predecessors_rtl\n";
   let preds = ref (PTree.map (fun n i -> []) code) in
@@ -89,15 +81,13 @@ end
 
 module PSet = Set.Make(PInt)
 
-let print_intlist l =
-  let rec f = function
+let print_intlist oc l =
+  let rec f oc = function
   | [] -> ()
-  | n::ln -> (Printf.printf "%d " (P.to_int n); f ln)
+  | n::ln -> (Printf.fprintf oc "%d %a" (P.to_int n) f ln)
   in begin
     if !debug_flag then begin
-      Printf.printf "[";
-      f l;
-      Printf.printf "]"
+      Printf.fprintf oc "[%a]" f l
     end
   end
 
@@ -113,43 +103,6 @@ let print_intset s =
     end
   end
 
-type vstate = Unvisited | Processed | Visited
-
-(** Getting loop branches with a DFS visit :
-  * Each node is either Unvisited, Visited, or Processed
-  * pre-order: node becomes Processed
-  * post-order: node becomes Visited
-  *
-  * If we come accross an edge to a Processed node, it's a loop!
-  *)
-let get_loop_headers code entrypoint = begin
-  debug "get_loop_headers\n";
-  let visited = ref (PTree.map (fun n i -> Unvisited) code)
-  and is_loop_header = ref (PTree.map (fun n i -> false) code)
-  in let rec dfs_visit code = function
-  | [] -> ()
-  | node :: ln ->
-      match (get_some @@ PTree.get node !visited) with
-      | Visited -> ()
-      | Processed -> begin
-          debug "Node %d is a loop header\n" (P.to_int node);
-          is_loop_header := PTree.set node true !is_loop_header;
-          visited := PTree.set node Visited !visited
-        end
-      | Unvisited -> begin
-          visited := PTree.set node Processed !visited;
-          match PTree.get node code with
-          | None -> failwith "No such node"
-          | Some i -> let next_visits = rtl_successors i in dfs_visit code next_visits;
-          visited := PTree.set node Visited !visited;
-          dfs_visit code ln
-        end
-  in begin
-    dfs_visit code [entrypoint];
-    !is_loop_header
-  end
-end
-
 let ptree_printbool pt =
   let elements = PTree.elements pt
   in begin
@@ -174,6 +127,10 @@ let rec look_ahead code node is_loop_header predicate =
         )
     | _ -> false
 
+(** 
+ * Heuristics mostly based on the paper Branch Prediction for Free 
+ *)
+
 let do_call_heuristic code cond ifso ifnot is_loop_header =
   begin
     debug "\tCall heuristic..\n";
@@ -302,7 +259,7 @@ let get_loop_info is_loop_header bfs_order code =
     !loop_info
   end
 
-(* Remark - compared to the original paper, we don't use the store heuristic *)
+(* Remark - compared to the original Branch Prediction for Free paper, we don't use the store heuristic *)
 let get_directions code entrypoint = begin
   debug "get_directions\n";
   let bfs_order = bfs code entrypoint in
@@ -314,7 +271,9 @@ let get_directions code entrypoint = begin
     (* debug "\n"; *)
     List.iter (fun n ->
       match (get_some @@ PTree.get n code) with
-      | Icond (cond, lr, ifso, ifnot, _) ->
+      | Icond (cond, lr, ifso, ifnot, pred) ->
+          (match pred with Some _ -> debug "RTL node %d already has prediction information\n" (P.to_int n)
+          | None ->
           (* debug "Analyzing %d.." (P.to_int n); *)
           let heuristics = [ do_opcode_heuristic;
             do_return_heuristic; do_loop2_heuristic loop_info n; do_loop_heuristic; do_call_heuristic;
@@ -333,6 +292,7 @@ let get_directions code entrypoint = begin
                                    | None -> debug "\tUNSURE\n");
             debug "---------------------------------------\n"
           end
+          )
       | _ -> ()
     ) bfs_order;
     !directions
@@ -340,7 +300,11 @@ let get_directions code entrypoint = begin
 end
 
 let update_direction direction = function
-| Icond (cond, lr, n, n', _) -> Icond (cond, lr, n, n', direction)
+| Icond (cond, lr, n, n', pred) ->
+    (* only update if there is no prior existing branch prediction *)
+    (match pred with
+    | None -> Icond (cond, lr, n, n', direction)
+    | Some _ -> Icond (cond, lr, n, n', pred) )
 | i -> i
 
 let rec update_direction_rec directions = function
@@ -428,18 +392,29 @@ let best_predecessor_of node predecessors code order is_visited =
         ) order)
       with Not_found -> None
 
-let print_trace t = print_intlist t
+let print_trace = print_intlist
 
-let print_traces traces =
-  let rec f = function
+let print_traces oc traces =
+  let rec f oc = function
   | [] -> ()
-  | t::lt -> Printf.printf "\n\t"; print_trace t; Printf.printf ",\n"; f lt
+  | t::lt -> Printf.fprintf oc "\n\t%a,\n%a" print_trace t f lt
   in begin
-    if !debug_flag then begin
-      Printf.printf "Traces: {";
-      f traces;
-      Printf.printf "}\n";
-    end
+    if !debug_flag then
+      Printf.fprintf oc "Traces: {%a}\n" f traces
+  end
+
+(* Adapted from backend/PrintRTL.ml: print_function *)
+let print_code code = let open PrintRTL in let open Printf in
+  if (!debug_flag) then begin
+    fprintf stdout "{\n";
+    let instrs =
+      List.sort
+        (fun (pc1, _) (pc2, _) -> compare pc2 pc1)
+        (List.rev_map
+          (fun (pc, i) -> (P.to_int pc, i))
+          (PTree.elements code)) in
+    List.iter (print_instruction stdout) instrs;
+    fprintf stdout "}"
   end
 
 (* Dumb (but linear) trace selection *)
@@ -514,7 +489,7 @@ let select_traces_chang code entrypoint = begin
       end
     done;
     (* debug "DFS: \t"; print_intlist order; debug "\n"; *)
-    debug "Traces: "; print_traces !traces;
+    debug "Traces: %a" print_traces !traces;
     !traces
   end
 end
@@ -530,26 +505,26 @@ let rec make_identity_ptree_rec = function
 
 let make_identity_ptree code = make_identity_ptree_rec (PTree.elements code)
 
-(* Change the pointers of preds nodes to point to n' instead of n *)
+(* Change the pointers of nodes to point to n' instead of n *)
 let rec change_pointers code n n' = function
   | [] -> code
-  | pred :: preds ->
-      let new_pred_inst = match ptree_get_some pred code with
-        | Icall(a, b, c, d, n0) -> assert (n0 == n); Icall(a, b, c, d, n')
-        | Ibuiltin(a, b, c, n0) -> assert (n0 == n); Ibuiltin(a, b, c, n')
-        | Ijumptable(a, ln) -> assert (optbool @@ List.find_opt (fun e -> e == n) ln);
-                               Ijumptable(a, List.map (fun e -> if (e == n) then n' else e) ln)
-        | Icond(a, b, n1, n2, i) -> assert (n1 == n || n2 == n);
-                                 let n1' = if (n1 == n) then n' else n1
-                                 in let n2' = if (n2 == n) then n' else n2
+  | node :: nodes ->
+      let new_pred_inst = match ptree_get_some node code with
+        | Icall(a, b, c, d, n0) -> assert (n0 = n); Icall(a, b, c, d, n')
+        | Ibuiltin(a, b, c, n0) -> assert (n0 = n); Ibuiltin(a, b, c, n')
+        | Ijumptable(a, ln) -> assert (optbool @@ List.find_opt (fun e -> e = n) ln);
+                               Ijumptable(a, List.map (fun e -> if (e = n) then n' else e) ln)
+        | Icond(a, b, n1, n2, i) -> assert (n1 = n || n2 = n);
+                                 let n1' = if (n1 = n) then n' else n1
+                                 in let n2' = if (n2 = n) then n' else n2
                                  in Icond(a, b, n1', n2', i)
-        | Inop n0 -> assert (n0 == n); Inop n'
-        | Iop (a, b, c, n0) -> assert (n0 == n); Iop (a, b, c, n')
-        | Iload (a, b, c, d, e, n0) -> assert (n0 == n); Iload (a, b, c, d, e, n')
-        | Istore (a, b, c, d, n0) -> assert (n0 == n); Istore (a, b, c, d, n')
+        | Inop n0 -> assert (n0 = n); Inop n'
+        | Iop (a, b, c, n0) -> assert (n0 = n); Iop (a, b, c, n')
+        | Iload (a, b, c, d, e, n0) -> assert (n0 = n); Iload (a, b, c, d, e, n')
+        | Istore (a, b, c, d, n0) -> assert (n0 = n); Istore (a, b, c, d, n')
         | Itailcall _ | Ireturn _ -> failwith "That instruction cannot be a predecessor"
-      in let new_code = PTree.set pred new_pred_inst code
-      in change_pointers new_code n n' preds
+      in let new_code = PTree.set node new_pred_inst code
+      in change_pointers new_code n n' nodes
 
 (* parent: parent of n to keep as parent
  * preds: all the other parents of n
@@ -573,13 +548,20 @@ let is_empty = function
   | [] -> true
   | _ -> false
 
+let next_free_pc code = maxint (List.map (fun e -> let (n, _) = e in P.to_int n) (PTree.elements code)) + 1
+
+let is_a_nop code n =
+  match get_some @@ PTree.get n code with
+  | Inop _ -> true
+  | _ -> false
+
 (* code: RTL code
  * preds: mapping node -> predecessors
  * ptree: the revmap
  * trace: the trace to follow tail duplication on *)
 let tail_duplicate code preds ptree trace =
   (* next_int: unused integer that can be used for the next duplication *)
-  let next_int = ref (maxint (List.map (fun e -> let (n, _) = e in P.to_int n) (PTree.elements code)) + 1)
+  let next_int = ref (next_free_pc code)
   (* last_node and last_duplicate store resp. the last processed node of the trace, and its duplication *)
   in let last_node = ref None
   in let last_duplicate = ref None
@@ -601,7 +583,7 @@ let tail_duplicate code preds ptree trace =
               in let (newc, newp) = duplicate code ptree !last_node n final_node_preds (P.of_int n')
               in begin
                 next_int := !next_int + 1;
-                nb_duplicated := !nb_duplicated + 1;
+                (if not @@ is_a_nop code n then nb_duplicated := !nb_duplicated + 1);
                 last_duplicate := Some (P.of_int n');
                 (newc, newp)
               end
@@ -613,9 +595,8 @@ let tail_duplicate code preds ptree trace =
   in let new_code, new_ptree = f code ptree true trace
   in (new_code, new_ptree, !nb_duplicated)
 
-let superblockify_traces code preds traces =
-  let max_nb_duplicated = !Clflags.option_fduplicate (* FIXME - should be architecture dependent *)
-  in let ptree = make_identity_ptree code
+let superblockify_traces code preds traces ptree =
+  let max_nb_duplicated = !Clflags.option_ftailduplicate (* FIXME - should be architecture dependent *)
   in let rec f code ptree = function
     | [] -> (code, ptree, 0)
     | trace :: traces ->
@@ -626,37 +607,271 @@ let superblockify_traces code preds traces =
   in let new_code, new_ptree, _ = f code ptree traces
   in (new_code, new_ptree)
 
-let rec invert_iconds_trace code = function
-  | [] -> code
-  | n :: ln ->
-      let code' = match ptree_get_some n code with
-        | Icond (c, lr, ifso, ifnot, info) -> (match info with
-            | Some true -> begin
-                (* debug "Reversing ifso/ifnot for node %d\n" (P.to_int n); *)
-                PTree.set n (Icond (Op.negate_condition c, lr, ifnot, ifso, Some false)) code
-              end
-            | _ -> code)
-        | _ -> code
-      in invert_iconds_trace code' ln
+let invert_iconds code =
+  PTree.map1 (fun i -> match i with
+    | Icond (c, lr, ifso, ifnot, info) -> (match info with
+        | Some true -> begin
+            (* debug "Reversing ifso/ifnot for node %d\n" (P.to_int n); *)
+            Icond (Op.negate_condition c, lr, ifnot, ifso, Some false)
+          end
+        | _ -> i)
+    | _ -> i
+  ) code
+
+(** Partial loop unrolling
+ *
+ * The following code seeks innermost loops, and unfolds the first iteration
+ * Most of the code has been moved from LICMaux.ml to Duplicateaux.ml to solve
+ * cyclic dependencies between LICMaux and Duplicateaux
+ *)
 
-let rec invert_iconds code = function
-  | [] -> code
-  | t :: ts ->
-      let code' = if !Clflags.option_finvertcond then invert_iconds_trace code t
-                  else code
-      in invert_iconds code' ts
+type innerLoop = {
+  preds: P.t list;
+  body: HashedSet.PSet.t;
+  head: P.t; (* head of the loop *)
+  final: P.t (* the final instruction, which loops back to the head *)
+}
+
+let print_pset = LICMaux.pp_pset
+
+let print_inner_loop iloop =
+  debug "{preds: %a, body: %a}" print_intlist iloop.preds print_pset iloop.body
+
+let rec print_inner_loops = function
+| [] -> ()
+| iloop :: iloops -> begin
+    print_inner_loop iloop;
+    debug "\n";
+    print_inner_loops iloops
+  end
+
+let print_ptree printer pt =
+  let elements = PTree.elements pt in
+  begin
+    debug "[\n";
+    List.iter (fun (n, elt) ->
+      debug "\t%d: %a\n" (P.to_int n) printer elt
+    ) elements;
+    debug "]\n"
+  end
+
+let print_pint oc i = if !debug_flag then Printf.fprintf oc "%d" (P.to_int i) else ()
+
+let get_inner_loops f code is_loop_header =
+  let fake_f = { fn_sig = f.fn_sig; fn_params = f.fn_params; 
+    fn_stacksize = f.fn_stacksize; fn_code = code; fn_entrypoint = f.fn_entrypoint } in
+  let (_, predmap, loopmap) = LICMaux.inner_loops fake_f in
+  begin
+    debug "PREDMAP: "; print_ptree print_intlist predmap;
+    debug "LOOPMAP: "; print_ptree print_pset loopmap;
+    List.map (fun (n, body) ->
+      let preds = List.filter (fun p -> not @@ HashedSet.PSet.contains body p) 
+        @@ get_some @@ PTree.get n predmap in
+      let head = (* the instruction from body which is a loop header *)
+        let heads = HashedSet.PSet.elements @@ HashedSet.PSet.filter 
+          (fun n -> ptree_get_some n is_loop_header) body in
+        begin
+          assert (List.length heads == 1);
+          List.hd heads
+        end in
+      let final = (* the predecessors from head that are in the body *)
+        let head_preds = ptree_get_some head predmap in
+        let filtered = List.filter (fun n -> HashedSet.PSet.contains body n) head_preds in
+        begin
+          debug "HEAD: %d\n" (P.to_int head);
+          debug "BODY: %a\n" print_pset body;
+          debug "HEADPREDS: %a\n" print_intlist head_preds;
+          assert (List.length filtered == 1);
+          List.hd filtered
+        end in
+      { preds = preds; body = body; head = head; final = final }
+    ) 
+    (* LICMaux.inner_loops also returns non-inner loops, but with a body of 1 instruction
+     * We remove those to get just the inner loops *)
+    @@ List.filter (fun (n, body) ->
+      let count = List.length @@ HashedSet.PSet.elements body in count != 1
+    ) (PTree.elements loopmap)
+  end
+
+let rec generate_fwmap ln ln' fwmap =
+  match ln with
+  | [] -> begin
+      match ln' with
+      | [] -> fwmap
+      | _ -> failwith "ln and ln' have different lengths"
+    end
+  | n :: ln -> begin
+      match ln' with
+      | n' :: ln' -> generate_fwmap ln ln' (PTree.set n n' fwmap)
+      | _ -> failwith "ln and ln' have different lengths"
+    end
+
+let generate_revmap ln ln' revmap = generate_fwmap ln' ln revmap
+
+let apply_map fw n = P.of_int @@ ptree_get_some n fw
+
+let apply_map_opt fw n =
+  match PTree.get n fw with
+  | Some n' -> P.of_int n'
+  | None -> n
+
+let change_nexts fwmap = function
+  | Icall (a, b, c, d, n) -> Icall (a, b, c, d, apply_map fwmap n)
+  | Ibuiltin (a, b, c, n) -> Ibuiltin (a, b, c, apply_map fwmap n)
+  | Ijumptable (a, ln) -> Ijumptable (a, List.map (apply_map_opt fwmap) ln)
+  | Icond (a, b, n1, n2, i) -> Icond (a, b, apply_map_opt fwmap n1, apply_map_opt fwmap n2, i)
+  | Inop n -> Inop (apply_map fwmap n)
+  | Iop (a, b, c, n) -> Iop (a, b, c, apply_map fwmap n)
+  | Iload (a, b, c, d, e, n) -> Iload (a, b, c, d, e, apply_map fwmap n)
+  | Istore (a, b, c, d, n) -> Istore (a, b, c, d, apply_map fwmap n)
+  | Itailcall (a, b, c) -> Itailcall (a, b, c)
+  | Ireturn o -> Ireturn o
+
+(** Clone a list of instructions into free pc indexes
+ *
+ * The list of instructions should be contiguous, and not include any loop.
+ * It is assumed that the first instruction of the list is the head.
+ * Also, the last instruction of the list should be the loop backedge.
+ *
+ * Returns: (code', revmap', ln', fwmap)
+ *  code' is the updated code, after cloning
+ *  revmap' is the updated revmap
+ *  ln' is the list of the new indexes used to reference the cloned instructions
+ *  fwmap is a map from ln to ln'
+ *)
+let clone code revmap ln = begin
+  assert (List.length ln > 0);
+  let head' = next_free_pc code in
+  (* +head' to ensure we never overlap with the existing code *)
+  let ln' = List.map (fun n -> n + head') @@ List.map P.to_int ln in
+  let fwmap = generate_fwmap ln ln' PTree.empty in
+  let revmap' = generate_revmap ln (List.map P.of_int ln') revmap in
+  let code' = ref code in
+  List.iter (fun n ->
+    let instr = get_some @@ PTree.get n code in
+    let instr' = change_nexts fwmap instr in
+    code' := PTree.set (apply_map fwmap n) instr' !code'
+  ) ln;
+  (!code', revmap', ln', fwmap)
+end
+
+let rec count_ignore_nops code = function
+  | [] -> 0
+  | n::ln ->
+      let inst = get_some @@ PTree.get n code in
+      match inst with
+      | Inop _ -> count_ignore_nops code ln
+      | _ -> 1 + count_ignore_nops code ln
+
+(* Unrolls a single interation of the inner loop
+ * 1) Clones the body into body'
+ * 2) Links the preds to the first instruction of body'
+ * 3) Links the last instruction of body' into the first instruction of body
+ *)
+let unroll_inner_loop_single code revmap iloop =
+  let body = HashedSet.PSet.elements (iloop.body) in
+  if count_ignore_nops code body > !Clflags.option_funrollsingle then begin
+    debug "Too many nodes in the loop body (%d > %d)" (List.length body) !Clflags.option_funrollsingle;
+    (code, revmap)
+  end else
+    let (code2, revmap2, dupbody, fwmap) = clone code revmap body in
+    let code' = ref code2 in
+    let head' = apply_map fwmap (iloop.head) in
+    let final' = apply_map fwmap (iloop.final) in
+    begin
+      debug "PREDS: %a\n" print_intlist iloop.preds;
+      debug "IHEAD: %d\n" (P.to_int iloop.head);
+      code' := change_pointers !code' (iloop.head) head' (iloop.preds);
+      code' := change_pointers !code' head' (iloop.head) [final'];
+      (!code', revmap2)
+    end
+
+let unroll_inner_loops_single f code revmap =
+  let is_loop_header = get_loop_headers code (f.fn_entrypoint) in
+  let inner_loops = get_inner_loops f code is_loop_header in
+  let code' = ref code in
+  let revmap' = ref revmap in
+  begin
+    print_inner_loops inner_loops;
+    List.iter (fun iloop ->
+      let (new_code, new_revmap) = unroll_inner_loop_single !code' !revmap' iloop in
+      code' := new_code; revmap' := new_revmap
+    ) inner_loops;
+    (!code', !revmap')
+  end
+
+(* Unrolls the body of the inner loop once - duplicating the exit condition as well 
+ * 1) Clones body into body'
+ * 2) Links the last instruction of body into the first of body' 
+ * 3) Links the last instruction of body' into the first of body
+ *)
+let unroll_inner_loop_body code revmap iloop =
+  let body = HashedSet.PSet.elements (iloop.body) in
+  let limit = !Clflags.option_funrollbody in
+  if count_ignore_nops code body > limit then begin
+    debug "Too many nodes in the loop body (%d > %d)" (List.length body) limit;
+    (code, revmap)
+  end else
+    let (code2, revmap2, dupbody, fwmap) = clone code revmap body in
+    let code' = ref code2 in
+    let head' = apply_map fwmap (iloop.head) in
+    let final' = apply_map fwmap (iloop.final) in
+    begin
+      code' := change_pointers !code' iloop.head head' [iloop.final];
+      code' := change_pointers !code' head' iloop.head [final']; 
+      (!code', revmap2)
+    end
+
+let unroll_inner_loops_body f code revmap =
+  let is_loop_header = get_loop_headers code (f.fn_entrypoint) in
+  let inner_loops = get_inner_loops f code is_loop_header in
+  let code' = ref code in
+  let revmap' = ref revmap in
+  begin
+    print_inner_loops inner_loops;
+    List.iter (fun iloop ->
+      let (new_code, new_revmap) = unroll_inner_loop_body !code' !revmap' iloop in
+      code' := new_code; revmap' := new_revmap
+    ) inner_loops;
+    (!code', !revmap')
+  end
 
 let duplicate_aux f =
+  (* initializing *)
   let entrypoint = f.fn_entrypoint in
-  if !Clflags.option_fduplicate < 0 then
-    ((f.fn_code, entrypoint), make_identity_ptree f.fn_code)
-  else
-    let code = update_directions (f.fn_code) entrypoint in
-    let traces = select_traces code entrypoint in
-    let icond_code = invert_iconds code traces in
-    let preds = get_predecessors_rtl icond_code in
-    if !Clflags.option_fduplicate >= 1 then
-      let (new_code, pTreeId) = ((* print_traces traces; *) superblockify_traces icond_code preds traces) in
-      ((new_code, f.fn_entrypoint), pTreeId)
-    else
-      ((icond_code, entrypoint), make_identity_ptree code)
+  let code = f.fn_code in
+  let revmap = make_identity_ptree code in
+
+  (* static prediction *)
+  let code =
+    if !Clflags.option_fpredict then
+      update_directions code entrypoint
+    else code in
+
+  (* unroll single *)
+  let (code, revmap) =
+    if !Clflags.option_funrollsingle > 0 then
+      unroll_inner_loops_single f code revmap
+    else (code, revmap) in
+
+  (* unroll body *)
+  let (code, revmap) =
+    if !Clflags.option_funrollbody > 0 then
+      unroll_inner_loops_body f code revmap
+    else (code, revmap) in
+
+  (* static prediction bis *)
+  let code =
+    if !Clflags.option_fpredict then
+      invert_iconds code
+    else code in
+
+  (* tail duplication *)
+  let (code, revmap) =
+    if !Clflags.option_ftailduplicate > 0 then
+      let traces = select_traces code entrypoint in
+      let preds = get_predecessors_rtl code in
+      superblockify_traces code preds traces revmap
+    else (code, revmap) in
+
+  ((code, entrypoint), revmap)
diff --git a/backend/KillUselessMoves.v b/backend/KillUselessMoves.v
new file mode 100644
index 00000000..bdd7ec60
--- /dev/null
+++ b/backend/KillUselessMoves.v
@@ -0,0 +1,40 @@
+(* *************************************************************)
+(*                                                             *)
+(*             The Compcert verified compiler                  *)
+(*                                                             *)
+(*           David Monniaux     CNRS, VERIMAG                  *)
+(*                                                             *)
+(*  Copyright VERIMAG. All rights reserved.                    *)
+(*  This file is distributed under the terms of the INRIA      *)
+(*  Non-Commercial License Agreement.                          *)
+(*                                                             *)
+(* *************************************************************)
+
+Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
+Require Import AST Linking.
+Require Import Memory Registers Op RTL.
+Require List.
+
+Definition transf_ros (ros: reg + ident) : reg + ident := ros.
+
+Definition transf_instr (pc: node) (instr: instruction) :=
+  match instr with
+  | Iop op args res s =>
+    if (eq_operation op Omove) && (List.list_eq_dec peq args (res :: nil))
+    then Inop s
+    else instr
+  | _ => instr
+  end.
+
+Definition transf_function (f: function) : function :=
+  {| fn_sig := f.(fn_sig);
+     fn_params := f.(fn_params);
+     fn_stacksize := f.(fn_stacksize);
+     fn_code := PTree.map transf_instr f.(fn_code);
+     fn_entrypoint := f.(fn_entrypoint) |}.
+
+Definition transf_fundef (fd: fundef) : fundef :=
+  AST.transf_fundef transf_function fd.
+
+Definition transf_program (p: program) : program :=
+  transform_program transf_fundef p.
diff --git a/backend/KillUselessMovesproof.v b/backend/KillUselessMovesproof.v
new file mode 100644
index 00000000..629aa6aa
--- /dev/null
+++ b/backend/KillUselessMovesproof.v
@@ -0,0 +1,361 @@
+(* *************************************************************)
+(*                                                             *)
+(*             The Compcert verified compiler                  *)
+(*                                                             *)
+(*           David Monniaux     CNRS, VERIMAG                  *)
+(*                                                             *)
+(*  Copyright VERIMAG. All rights reserved.                    *)
+(*  This file is distributed under the terms of the INRIA      *)
+(*  Non-Commercial License Agreement.                          *)
+(*                                                             *)
+(* *************************************************************)
+
+Require Import Axioms.
+Require Import FunInd.
+Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
+Require Import AST Linking.
+Require Import Values Memory Globalenvs Events Smallstep.
+Require Import Registers Op RTL.
+Require Import KillUselessMoves.
+
+
+Definition match_prog (p tp: RTL.program) :=
+  match_program (fun ctx f tf => tf = transf_fundef f) eq p tp.
+
+Lemma transf_program_match:
+  forall p, match_prog p (transf_program p).
+Proof.
+  intros. eapply match_transform_program; eauto.
+Qed.
+
+Section PRESERVATION.
+
+Variables prog tprog: program.
+Hypothesis TRANSL: match_prog prog tprog.
+Let ge := Genv.globalenv prog.
+Let tge := Genv.globalenv tprog.
+
+Lemma functions_translated:
+  forall v f,
+  Genv.find_funct ge v = Some f ->
+  Genv.find_funct tge v = Some (transf_fundef f).
+Proof (Genv.find_funct_transf TRANSL).
+
+Lemma function_ptr_translated:
+  forall v f,
+  Genv.find_funct_ptr ge v = Some f ->
+  Genv.find_funct_ptr tge v = Some (transf_fundef f).
+Proof (Genv.find_funct_ptr_transf TRANSL).
+
+Lemma symbols_preserved:
+  forall id,
+  Genv.find_symbol tge id = Genv.find_symbol ge id.
+Proof (Genv.find_symbol_transf TRANSL).
+
+Lemma senv_preserved:
+  Senv.equiv ge tge.
+Proof (Genv.senv_transf TRANSL).
+
+Lemma sig_preserved:
+  forall f, funsig (transf_fundef f) = funsig f.
+Proof.
+  destruct f; reflexivity.
+Qed.
+
+Lemma find_function_translated:
+  forall ros rs fd,
+  find_function ge ros rs = Some fd ->
+  find_function tge ros rs = Some (transf_fundef fd).
+Proof.
+  unfold find_function; intros. destruct ros as [r|id].
+  eapply functions_translated; eauto.
+  rewrite symbols_preserved. destruct (Genv.find_symbol ge id); try congruence.
+  eapply function_ptr_translated; eauto.
+Qed.
+
+Lemma transf_function_at:
+  forall f pc i,
+  f.(fn_code)!pc = Some i ->
+  (transf_function f).(fn_code)!pc = Some(transf_instr pc i).
+Proof.
+  intros until i. intro Hcode.
+  unfold transf_function; simpl.
+  rewrite PTree.gmap.
+  unfold option_map.
+  rewrite Hcode.
+  reflexivity.
+Qed.
+
+Ltac TR_AT :=
+  match goal with
+  | [ A: (fn_code _)!_ = Some _ |- _ ] =>
+        generalize (transf_function_at _ _ _ A); intros
+  end.
+
+Section SAME_RS.
+  Context {A : Type}.
+  
+  Definition same_rs (rs rs' : Regmap.t A) :=
+    forall x, rs # x = rs' # x.
+
+  Lemma same_rs_refl : forall rs, same_rs rs rs.
+  Proof.
+    unfold same_rs.
+    reflexivity.
+  Qed.
+
+  Lemma same_rs_comm : forall rs rs', (same_rs rs rs') -> (same_rs rs' rs).
+  Proof.
+    unfold same_rs.
+    congruence.
+  Qed.
+
+  Lemma same_rs_trans : forall rs1 rs2 rs3,
+      (same_rs rs1 rs2) -> (same_rs rs2 rs3) -> (same_rs rs1 rs3).
+  Proof.
+    unfold same_rs.
+    congruence.
+  Qed.
+
+  Lemma same_rs_idem_write : forall rs r,
+      (same_rs rs (rs # r <- (rs # r))).
+  Proof.
+    unfold same_rs.
+    intros.
+    rewrite Regmap.gsident.
+    reflexivity.
+  Qed.
+
+  Lemma same_rs_read:
+    forall rs rs' r, (same_rs rs rs') -> rs # r = rs' # r.
+  Proof.
+    unfold same_rs.
+    auto.
+  Qed.
+  
+  Lemma same_rs_subst:
+    forall rs rs' l, (same_rs rs rs') -> rs ## l = rs' ## l.
+  Proof.
+    induction l; cbn; intuition congruence.
+  Qed.
+
+  Lemma same_rs_write: forall rs rs' r x,
+      (same_rs rs rs') -> (same_rs (rs # r <- x) (rs' # r <- x)).
+  Proof.
+    unfold same_rs.
+    intros.
+    destruct (peq r x0).
+    { subst x0.
+      rewrite Regmap.gss. rewrite Regmap.gss.
+      reflexivity.
+    }
+    rewrite Regmap.gso by congruence.
+    rewrite Regmap.gso by congruence.
+    auto.
+  Qed.
+
+  Lemma same_rs_setres:
+    forall rs rs' (SAME: same_rs rs rs') res vres,
+      same_rs (regmap_setres res vres rs) (regmap_setres res vres rs').
+  Proof.
+    induction res; cbn; auto using same_rs_write.
+  Qed.
+End SAME_RS.
+
+Lemma same_find_function: forall tge rs rs' (SAME: same_rs rs rs') ros,
+  find_function tge ros rs = find_function tge ros rs'.
+Proof.
+  destruct ros; cbn.
+  { rewrite (same_rs_read rs rs' r SAME).
+    reflexivity. }
+  reflexivity.
+Qed.
+
+Inductive match_frames: RTL.stackframe -> RTL.stackframe -> Prop :=
+| match_frames_intro: forall res f sp pc rs rs' (SAME : same_rs rs rs'),
+    match_frames (Stackframe res f sp pc rs)
+                 (Stackframe res (transf_function f) sp pc rs').
+
+Inductive match_states: RTL.state -> RTL.state -> Prop :=
+  | match_regular_states: forall stk f sp pc rs rs' m stk'
+        (SAME: same_rs rs rs')
+        (STACKS: list_forall2 match_frames stk stk'),
+      match_states (State stk f sp pc rs m)
+                   (State stk' (transf_function f) sp pc rs' m)
+  | match_callstates: forall stk f args m stk'
+        (STACKS: list_forall2 match_frames stk stk'),
+      match_states (Callstate stk f args m)
+                   (Callstate stk' (transf_fundef f) args m)
+  | match_returnstates: forall stk v m stk'
+        (STACKS: list_forall2 match_frames stk stk'),
+      match_states (Returnstate stk v m)
+                   (Returnstate stk' v m).
+
+Lemma step_simulation:
+  forall S1 t S2, RTL.step ge S1 t S2 ->
+  forall S1', match_states S1 S1' ->
+  exists S2', RTL.step tge S1' t S2' /\ match_states S2 S2'.
+Proof.
+  induction 1; intros S1' MS; inv MS; try TR_AT.
+- (* nop *)
+  econstructor; split. eapply exec_Inop; eauto.
+  constructor; auto.
+- (* op *)
+  cbn in H1.
+  destruct (_ && _) eqn:IS_MOVE in H1.
+  {
+    destruct eq_operation in IS_MOVE. 2: discriminate.
+    destruct list_eq_dec in IS_MOVE. 2: discriminate.
+    subst op. subst args.
+    clear IS_MOVE.
+    cbn in H0.
+    inv H0.
+    econstructor; split.
+    { eapply exec_Inop; eauto. }
+    constructor.
+    2: assumption.
+    eapply same_rs_trans.
+    { apply same_rs_comm.
+      apply same_rs_idem_write.
+    }
+    assumption.
+  }
+  econstructor; split.
+  eapply exec_Iop with (v := v); eauto.
+  rewrite <- H0.
+  rewrite (same_rs_subst rs rs' args SAME).
+  apply eval_operation_preserved. exact symbols_preserved.
+  constructor; auto using same_rs_write.
+(* load *)
+- econstructor; split.
+  assert (eval_addressing tge sp addr rs' ## args = Some a).
+  { rewrite <- H0.
+    rewrite (same_rs_subst rs rs' args SAME).
+    apply eval_addressing_preserved. exact symbols_preserved.
+  }
+  eapply exec_Iload; eauto.
+  constructor; auto using same_rs_write.
+- (* load notrap1 *)
+  econstructor; split.
+  assert (eval_addressing tge sp addr rs' ## args = None).
+  { rewrite <- H0.
+    rewrite (same_rs_subst rs rs' args SAME).
+    apply eval_addressing_preserved. exact symbols_preserved.
+  }
+  eapply exec_Iload_notrap1; eauto.
+  constructor; auto using same_rs_write.
+- (* load notrap2 *)
+  econstructor; split.
+  assert (eval_addressing tge sp addr rs' ## args = Some a).
+  { rewrite <- H0.
+    rewrite (same_rs_subst rs rs' args SAME).
+    apply eval_addressing_preserved. exact symbols_preserved.
+  }
+  eapply exec_Iload_notrap2; eauto.
+  constructor; auto using same_rs_write. 
+- (* store *)
+  econstructor; split.
+  assert (eval_addressing tge sp addr rs' ## args = Some a).
+  { rewrite <- H0.
+    rewrite (same_rs_subst rs rs' args SAME).
+    apply eval_addressing_preserved. exact symbols_preserved.
+  }
+  rewrite (same_rs_read rs rs' src SAME) in H1.
+  eapply exec_Istore; eauto.
+  constructor; auto.
+(* call *)
+- econstructor; split.
+  eapply exec_Icall with (fd := transf_fundef fd); eauto.
+  eapply find_function_translated; eauto.
+  { rewrite <- (same_find_function ge rs rs') by assumption.
+    assumption. }
+  apply sig_preserved.
+  rewrite (same_rs_subst rs rs' args SAME).  
+  constructor. constructor; auto. constructor; auto.
+(* tailcall *)
+- econstructor; split.
+  eapply exec_Itailcall with (fd := transf_fundef fd); eauto.
+    eapply find_function_translated; eauto.
+  { rewrite <- (same_find_function ge rs rs') by assumption.
+    assumption. }
+    apply sig_preserved.
+  rewrite (same_rs_subst rs rs' args SAME).  
+  constructor. auto.
+(* builtin *)
+- econstructor; split.
+  eapply exec_Ibuiltin; eauto.
+  eapply eval_builtin_args_preserved with (ge1 := ge); eauto. exact symbols_preserved.
+  {
+    replace (fun r : positive => rs' # r) with (fun r : positive => rs # r).
+    eassumption.
+    apply functional_extensionality.
+    auto using same_rs_read.
+  }
+  eapply external_call_symbols_preserved; eauto. apply senv_preserved.
+  constructor; auto.
+  auto using same_rs_setres.
+(* cond *)
+- econstructor; split.
+  eapply exec_Icond; eauto.
+  rewrite <- (same_rs_subst rs rs' args SAME); eassumption.
+  constructor; auto.
+(* jumptbl *)
+- econstructor; split.
+  eapply exec_Ijumptable; eauto.
+  rewrite <- (same_rs_read rs rs' arg SAME); eassumption.
+  constructor; auto.
+(* return *)
+- econstructor; split.
+  eapply exec_Ireturn; eauto.
+  destruct or; cbn.
+  + rewrite <- (same_rs_read rs rs' r SAME) by auto.
+    constructor; auto.
+  + constructor; auto.
+(* internal function *)
+-  simpl. econstructor; split.
+  eapply exec_function_internal; eauto.
+  constructor; auto.
+  cbn.
+  apply same_rs_refl.
+(* external function *)
+- econstructor; split.
+  eapply exec_function_external; eauto.
+    eapply external_call_symbols_preserved; eauto. apply senv_preserved.
+  constructor; auto.
+(* return *)
+- inv STACKS. inv H1.
+  econstructor; split.
+  eapply exec_return; eauto.
+  constructor; auto using same_rs_write.
+Qed.
+
+Lemma transf_initial_states:
+  forall S1, RTL.initial_state prog S1 ->
+  exists S2, RTL.initial_state tprog S2 /\ match_states S1 S2.
+Proof.
+  intros. inv H. econstructor; split.
+  econstructor.
+    eapply (Genv.init_mem_transf TRANSL); eauto.
+    rewrite symbols_preserved. rewrite (match_program_main TRANSL). eauto.
+    eapply function_ptr_translated; eauto.
+    rewrite <- H3; apply sig_preserved.
+  constructor. constructor.
+Qed.
+
+Lemma transf_final_states:
+  forall S1 S2 r, match_states S1 S2 -> RTL.final_state S1 r -> RTL.final_state S2 r.
+Proof.
+  intros. inv H0. inv H. inv STACKS. constructor.
+Qed.
+
+Theorem transf_program_correct:
+  forward_simulation (RTL.semantics prog) (RTL.semantics tprog).
+Proof.
+  eapply forward_simulation_step.
+  apply senv_preserved.
+  eexact transf_initial_states.
+  eexact transf_final_states.
+  exact step_simulation.
+Qed.
+
+End PRESERVATION.
diff --git a/backend/LICMaux.ml b/backend/LICMaux.ml
index c3907809..0ca4418b 100644
--- a/backend/LICMaux.ml
+++ b/backend/LICMaux.ml
@@ -19,6 +19,62 @@ open Inject;;
 
 type reg = P.t;;
 
+(** get_loop_headers moved from Duplicateaux.ml to LICMaux.ml to prevent cycle dependencies *)
+let debug_flag = ref false
+
+let debug fmt =
+  if !debug_flag then Printf.eprintf fmt
+  else Printf.ifprintf stderr fmt
+
+type vstate = Unvisited | Processed | Visited
+
+let get_some = function
+| None -> failwith "Did not get some"
+| Some thing -> thing
+
+let rtl_successors = function
+| Itailcall _ | Ireturn _ -> []
+| Icall(_,_,_,_,n) | Ibuiltin(_,_,_,n) | Inop n | Iop (_,_,_,n)
+| Iload (_,_,_,_,_,n) | Istore (_,_,_,_,n) -> [n]
+| Icond (_,_,n1,n2,_) -> [n1; n2]
+| Ijumptable (_,ln) -> ln
+
+(** Getting loop branches with a DFS visit :
+  * Each node is either Unvisited, Visited, or Processed
+  * pre-order: node becomes Processed
+  * post-order: node becomes Visited
+  *
+  * If we come accross an edge to a Processed node, it's a loop!
+  *)
+let get_loop_headers code entrypoint = begin
+  debug "get_loop_headers\n";
+  let visited = ref (PTree.map (fun n i -> Unvisited) code)
+  and is_loop_header = ref (PTree.map (fun n i -> false) code)
+  in let rec dfs_visit code = function
+  | [] -> ()
+  | node :: ln ->
+      match (get_some @@ PTree.get node !visited) with
+      | Visited -> ()
+      | Processed -> begin
+          debug "Node %d is a loop header\n" (P.to_int node);
+          is_loop_header := PTree.set node true !is_loop_header;
+          visited := PTree.set node Visited !visited
+        end
+      | Unvisited -> begin
+          visited := PTree.set node Processed !visited;
+          match PTree.get node code with
+          | None -> failwith "No such node"
+          | Some i -> let next_visits = rtl_successors i in dfs_visit code next_visits;
+          visited := PTree.set node Visited !visited;
+          dfs_visit code ln
+        end
+  in begin
+    dfs_visit code [entrypoint];
+    !is_loop_header
+  end
+end
+
+
 module Dominator =
   struct
     type t = Unreachable | Dominated of int | Multiple
@@ -57,7 +113,7 @@ let apply_dominator (is_marked : node -> bool) (pc : node)
 
 let dominated_parts1 (f : coq_function) :
       (bool PTree.t) * (Dominator.t PMap.t option) =
-  let headers = Duplicateaux.get_loop_headers f.fn_code f.fn_entrypoint in
+  let headers = get_loop_headers f.fn_code f.fn_entrypoint in
   let dominated = Dominator_Solver.fixpoint f.fn_code RTL.successors_instr
     (apply_dominator (fun pc -> match PTree.get pc headers with
                                 | Some x -> x
@@ -248,7 +304,7 @@ let print_dominated_parts1 oc f =
        (PTree.elements f.fn_code);;
   
 let loop_headers (f : coq_function) : RTL.node list =
-  List.map fst (List.filter snd (PTree.elements (Duplicateaux.get_loop_headers f.fn_code f.fn_entrypoint)));;
+  List.map fst (List.filter snd (PTree.elements (get_loop_headers f.fn_code f.fn_entrypoint)));;
 
 let print_loop_headers f =
   print_endline "Loop headers";
diff --git a/config_kvx_elf.sh b/config_kvx_elf.sh
new file mode 100755
index 00000000..f1430417
--- /dev/null
+++ b/config_kvx_elf.sh
@@ -0,0 +1 @@
+exec ./config_simple.sh kvx-elf "$@"
diff --git a/configure b/configure
index f2fd1762..a73e7879 100755
--- a/configure
+++ b/configure
@@ -457,7 +457,7 @@ if test "$arch" = "kvx"; then
   fi
   osupper=`echo $os|tr a-z A-Z`
   k1base="kvx-$os"
-  casm="kvx-elf-as"
+  casm="$k1base-as"
   casm_options="$model_options"
   cc="$k1base-gcc $model_options"
   clinker="$k1base-gcc"
diff --git a/doc/index-kvx.html b/doc/index-kvx.html
index 95fdb6de..b8850727 100644
--- a/doc/index-kvx.html
+++ b/doc/index-kvx.html
@@ -34,10 +34,10 @@ a:active {color : Red; text-decoration : underline; }
   The unmodified parts of this table appear in <font color=gray>gray</font>.
   <br>
   <br>
-  A high-level view of this backend of CompCert is provided by this HAL preprint of Six, Boulm&eacute; and Monniaux (2019):
-  <div><a href=https://hal.archives-ouvertes.fr/hal-02185883>Certified Compiler Backends for VLIW Processors (Highly Modular Postpass-Scheduling in the CompCert Certified Compiler)</a></div>
+  A high-level view of this CompCert backend is provided by this OOPSLA'20 paper (of Six, Boulm&eacute; and Monniaux):
+  <div><a href=https://hal.archives-ouvertes.fr/hal-02185883>Certified and Efficient Instruction Scheduling. Application to Interlocked VLIW Processors.</a></div>
   <br>
-  Our source code is available on our <a href=https://gricad-gitlab.univ-grenoble-alpes.fr/certicompil/compcert-kvx>GitLab public repository</a> (see conditions in the LICENSE file).
+  See also the <tt>README.md</tt> of our <a href=https://gricad-gitlab.univ-grenoble-alpes.fr/certicompil/compcert-kvx>GitLab public repository</a>.
 </p>
 
 <font color=gray><H2>Table of contents</H2>
@@ -60,11 +60,11 @@ inequations by fixpoint iteration.
 <LI> <A HREF="html/compcert.lib.Postorder.html">Postorder</A>: postorder numbering of a directed graph.
 </UL></font>
 
-<H4>The <tt>abstractbb</tt> library, introduced for MPPA-KVX</H4>
+<H4>The <tt>abstractbb</tt> library, introduced for KVX core</H4>
 <UL>
 <LI> <A HREF="html/compcert.kvx.abstractbb.AbstractBasicBlocksDef.html">AbstractBasicBlocksDef</A>: an IR for verifying some semantic properties on basic-blocks.
 <LI> <A HREF="html/compcert.kvx.abstractbb.Parallelizability.html">Parallelizability</A>: verifying that sequential and parallel semantics are equivalent for a given abstract basic-block.
-<LI> <A HREF="html/compcert.kvx.abstractbb.ImpSimuTest.html">ImpSimuTest</A>: verifying that a given abstract basic-block is simulated by another one for sequential semantics. This module refines <A HREF="html/compcert.kvx.abstractbb.SeqSimuTheory.html">SeqSimuTheory</A> with hash-consing. 
+<LI> <A HREF="html/compcert.kvx.abstractbb.ImpSimuTest.html">ImpSimuTest</A>: verifying that a given abstract basic-block is simulated by another one for sequential semantics. This module refines <A HREF="html/compcert.kvx.abstractbb.SeqSimuTheory.html">SeqSimuTheory</A> with hash-consing and uses <A HREF=https://github.com/boulme/ImpureDemo>the Impure library</A> to reason on physical equality and handling of imperative code in Coq.
 </UL>
 
 <font color=gray>
@@ -121,11 +121,11 @@ replaced by a linear list of instructions with explicit branches and labels.
 view of the activation record.  
 </UL>
 </font>
-<H4>Languages introduced for MPPA-KVX</H4>
+<H4>Languages introduced for KVX core</H4>
 <UL>
 <LI> <A HREF="html/compcert.kvx.lib.Machblock.html">Machblock</A>: a variant of Mach, with a syntax for basic-blocks, and a block-step semantics (execute one basic-block in one step).
-This IR is generic over the processor, even if currently, only used for MPPA_KVX.
-<LI> <A HREF="html/compcert.kvx.Asmvliw.html"><I>Asmvliw</I></A>: abstract syntax and semantics for Mppa_KVX VLIW assembly: atomic instructions are grouped by "bundles". These bundles are executed sequentially, but execution is parallel within bundles.
+This IR is generic over the processor, even if currently, only used for KVX.
+<LI> <A HREF="html/compcert.kvx.Asmvliw.html"><I>Asmvliw</I></A>: abstract syntax and semantics for KVX VLIW assembly: atomic instructions are grouped by "bundles". These bundles are executed sequentially, but execution is parallel within bundles.
 <LI> <A HREF="html/compcert.kvx.Asmblock.html"><I>Asmblock</I></A>: a variant of Asmvliw, with a sequential semantics within bundles, which make them corresponds here to usual basic-blocks.
   This IR is an intermediate step between Machblock and Asmvliw.
 <LI> <A HREF="html/compcert.kvx.Asm.html"><I>Asm</I></A>: a variant of Asmvliw with a flat syntax for bundles, instead of a structured one (bundle termination is encoded as a pseudo-instruction). This IR is mainly a wrapper of <I>Asmvliw</I> for a smooth integration in CompCert (and an easier pretty-printing of the abstract syntax).
@@ -294,7 +294,7 @@ This IR is generic over the processor, even if currently, only used for MPPA_KVX
 </TR>
 </TABLE>
 
-<H4>Compilation passes introduced for MPPA-KVX</H4>
+<H4>Compilation passes introduced for KVX VLIW</H4>
 <TABLE cellpadding="5%">
 <TR valign="top">
   <TD>Reconstruction of basic-blocks at Mach level</TD>
@@ -325,7 +325,7 @@ This IR is generic over the processor, even if currently, only used for MPPA_KVX
   <TD>Flattening bundles (only a bureaucratic operation)</TD>
   <TD>Asmvliw to Asm</TD>
   <TD><A HREF="html/compcert.kvx.Asmgen.html"><I>Asmgen</I></A></TD>
-  <TD><A HREF="html/compcert.kvx.Asmgenproof.html"><I>Asmgenproof</I></A></TD>
+  <TD><A HREF="html/compcert.kvx.Asmgenproof.html"><I>Asmgenproof</I></A> (whole simulation proof from <tt>Mach</tt> to <tt>Asm</tt>)</TD>
 </TR>
 </TABLE>
 
diff --git a/driver/Clflags.ml b/driver/Clflags.ml
index eb21b3f8..9df58903 100644
--- a/driver/Clflags.ml
+++ b/driver/Clflags.ml
@@ -33,9 +33,14 @@ let option_fcse3_across_calls = ref false
 let option_fcse3_across_merges = ref true
 let option_fcse3_glb = ref true
 let option_fredundancy = ref true
-let option_fduplicate = ref (-1)
-let option_finvertcond = ref true
-let option_ftracelinearize = ref false
+
+(** Options relative to superblock scheduling *)
+let option_fpredict = ref true (* insert static branch prediction information, and swaps ifso/ifnot branches accordingly *)
+let option_ftailduplicate = ref 0 (* perform tail duplication for blocks of size n *)
+let option_ftracelinearize = ref true (* uses branch prediction information to improve the linearization *)
+let option_funrollsingle = ref 0 (* unroll a single iteration of innermost loops of size n *)
+let option_funrollbody = ref 0 (* unroll the body of innermost loops of size n *)
+
 let option_fpostpass = ref true
 let option_fpostpass_sched = ref "list"
 let option_fifconversion = ref true
diff --git a/driver/Compopts.v b/driver/Compopts.v
index d576ede6..540e8922 100644
--- a/driver/Compopts.v
+++ b/driver/Compopts.v
@@ -27,9 +27,6 @@ Parameter generate_float_constants: unit -> bool.
 (** For value analysis.  Currently always false. *)
 Parameter va_strict: unit -> bool.
 
-(** Flag -fduplicate. Branch prediction annotation + tail duplication *)
-Parameter optim_duplicate: unit -> bool.
-
 (** Flag -ftailcalls.  For tail call optimization. *)
 Parameter optim_tailcalls: unit -> bool.
 
diff --git a/driver/Driver.ml b/driver/Driver.ml
index 90afb812..12f50762 100644
--- a/driver/Driver.ml
+++ b/driver/Driver.ml
@@ -207,15 +207,12 @@ Processing options:
   -fpostpass     Perform postpass scheduling (only for K1 architecture) [on]
   -fpostpass= <optim> Perform postpass scheduling with the specified optimization [list]
                    (<optim>=list: list scheduling, <optim>=ilp: ILP, <optim>=greedy: just packing bundles)
-  -fduplicate <nb_nodes> Perform tail duplication to form superblocks on predicted traces
-    nb_nodes control the heuristic deciding to duplicate or not
-    A value of -1 desactivates the entire pass (including branch prediction)
-    A value of 0 desactivates the duplication (but activates the branch prediction)
-    FIXME : this is desactivated by default for now
-    -finvertcond    Invert conditions based on predicted paths (to prefer fallthrough).
-                    Requires -fduplicate to be also activated [on]
-    -ftracelinearize Linearizes based on the traces identified by duplicate phase
-                    It is heavily recommended to activate -finvertcond with this pass [off]
+  -fpredict      Insert static branch prediction information [on]
+                 Also swaps ifso/ifnot branches accordingly at RTL level
+  -ftailduplicate n Perform tail duplication for RTL code blocks of size n (not counting Inops) [0]
+  -ftracelinearize  Uses branch prediction information to improve the Linearize [on]
+  -funrollsingle n  Unrolls a single iteration of innermost loops of size n (not counting Inops) [0]
+  -funrollbody n  Unrolls once the body of innermost loops of size n (not counting Inops) [0]
   -fforward-moves   Forward moves after CSE
   -finline       Perform inlining of functions [on]
   -finline-functions-called-once Integrate functions only required by their
@@ -280,6 +277,7 @@ let dump_mnemonics destfile =
 let optimization_options = [
     option_ftailcalls; option_fifconversion; option_fconstprop;
     option_fcse; option_fcse2; option_fcse3;
+    option_fpredict; option_ftracelinearize;
     option_fpostpass;
     option_fredundancy; option_finline; option_finline_functions_called_once;
 ]
@@ -420,8 +418,10 @@ let cmdline_actions =
   @ f_opt "move-loop-invariants" option_fmove_loop_invariants
   @ f_opt "redundancy" option_fredundancy
   @ f_opt "postpass" option_fpostpass
-  @ [ Exact "-fduplicate", Integer (fun n -> option_fduplicate := n) ]
-  @ f_opt "invertcond" option_finvertcond
+  @ [ Exact "-ftailduplicate", Integer (fun n -> option_ftailduplicate := n) ]
+  @ f_opt "predict" option_fpredict
+  @ [ Exact "-funrollsingle", Integer (fun n -> option_funrollsingle := n) ]
+  @ [ Exact "-funrollbody", Integer (fun n -> option_funrollbody := n) ]
   @ f_opt "tracelinearize" option_ftracelinearize
   @ f_opt_str "postpass" option_fpostpass option_fpostpass_sched
   @ f_opt "inline" option_finline
diff --git a/extraction/extraction.v b/extraction/extraction.v
index e43594fc..bd396cd8 100644
--- a/extraction/extraction.v
+++ b/extraction/extraction.v
@@ -111,8 +111,6 @@ Extract Constant Compopts.generate_float_constants =>
   "fun _ -> !Clflags.option_ffloatconstprop >= 2".
 Extract Constant Compopts.optim_tailcalls =>
   "fun _ -> !Clflags.option_ftailcalls".
-Extract Constant Compopts.optim_duplicate =>
-  "fun _ -> (if !Clflags.option_fduplicate = -1 then false else true)".
 Extract Constant Compopts.optim_constprop =>
   "fun _ -> !Clflags.option_fconstprop".
 Extract Constant Compopts.optim_CSE =>
diff --git a/kvx/Asm.v b/kvx/Asm.v
index 69d0ecf6..515e13e0 100644
--- a/kvx/Asm.v
+++ b/kvx/Asm.v
@@ -13,7 +13,7 @@
 (*                                                             *)
 (* *************************************************************)
 
-(** * Abstract syntax for KVX textual assembly language.
+(** Abstract syntax for KVX textual assembly language.
 
     Each emittable instruction is defined here. ';;' is also defined as an instruction.
     The goal of this representation is to stay compatible with the rest of the generic backend of CompCert
@@ -49,7 +49,7 @@ Inductive addressing : Type :=
   | ARegXS (ro: ireg)
 .
 
-(** Syntax *)
+(** * Syntax *)
 Inductive instruction : Type :=
   (** pseudo instructions *)
   | Pallocframe (sz: Z) (pos: ptrofs)               (**r allocate new stack frame *)
@@ -104,7 +104,7 @@ Inductive instruction : Type :=
   | Pclzll (rd rs: ireg)
   | Pstsud (rd rs1 rs2: ireg)
             
-  (** Loads **)
+  (** Loads *)
   | Plb     (trap: trapping_mode) (rd: ireg) (ra: ireg) (ofs: addressing)     (**r load byte *)
   | Plbu    (trap: trapping_mode) (rd: ireg) (ra: ireg) (ofs: addressing)     (**r load byte unsigned *)
   | Plh     (trap: trapping_mode) (rd: ireg) (ra: ireg) (ofs: addressing)     (**r load half word *)
@@ -118,7 +118,7 @@ Inductive instruction : Type :=
   | Plq     (rs: gpreg_q) (ra: ireg) (ofs: addressing)  (**r load 2*64-bit *)
   | Plo     (rs: gpreg_o) (ra: ireg) (ofs: addressing)  (**r load 4*64-bit *)
 
-  (** Stores **)
+  (** Stores *)
   | Psb     (rs: ireg) (ra: ireg) (ofs: addressing)     (**r store byte *)
   | Psh     (rs: ireg) (ra: ireg) (ofs: addressing)     (**r store half byte *)
   | Psw     (rs: ireg) (ra: ireg) (ofs: addressing)     (**r store int32 *)
@@ -547,6 +547,8 @@ Definition basic_to_instruction (b: basic) :=
   | PStoreORRO qrs ra ofs => Pso qrs ra (AOff ofs)
   end.
 
+(** * Semantics (given through the existence of well-formed VLIW program) *)
+
 Section RELSEM.
 
 Definition code := list instruction.
@@ -609,15 +611,15 @@ Program Definition genv_trans (ge: genv) : Asmvliw.genv :=
       Genv.genv_defs := PTree.map1 globdef_proj (Genv.genv_defs ge);
       Genv.genv_next := Genv.genv_next ge |}.
 Next Obligation.
-  destruct ge. simpl in *. eauto.
+  destruct ge. cbn in *. eauto.
 Qed. Next Obligation.
-  destruct ge; simpl in *.
+  destruct ge; cbn in *.
   rewrite PTree.gmap1 in H.
   destruct (genv_defs ! b) eqn:GEN.
   - eauto.
   - discriminate.
 Qed. Next Obligation.
-  destruct ge; simpl in *.
+  destruct ge; cbn in *.
   eauto.
 Qed.
 
@@ -653,14 +655,14 @@ Program Definition transf_function (f: Asmvliw.function) : function :=
 
 Lemma transf_function_proj: forall f, function_proj (transf_function f) = f.
 Proof.
-  intros f. destruct f as [sig blks]. unfold function_proj. simpl. auto.
+  intros f. destruct f as [sig blks]. unfold function_proj. cbn. auto.
 Qed.
 
 Definition transf_fundef : Asmvliw.fundef -> fundef := AST.transf_fundef transf_function.
 
 Lemma transf_fundef_proj: forall f, fundef_proj (transf_fundef f) = f.
 Proof.
-  intros f. destruct f as [f|e]; simpl; auto.
+  intros f. destruct f as [f|e]; cbn; auto.
   rewrite transf_function_proj. auto.
 Qed.
 
@@ -672,18 +674,18 @@ Lemma program_equals {A B: Type} : forall (p1 p2: AST.program A B),
   prog_main p1 = prog_main p2 ->
   p1 = p2.
 Proof.
-  intros. destruct p1. destruct p2. simpl in *. subst. auto.
+  intros. destruct p1. destruct p2. cbn in *. subst. auto.
 Qed.
 
 Lemma transf_program_proj: forall p, program_proj (transf_program p) = p.
 Proof.
-  intros p. destruct p as [defs pub main]. unfold program_proj. simpl.
-  apply program_equals; simpl; auto.
+  intros p. destruct p as [defs pub main]. unfold program_proj. cbn.
+  apply program_equals; cbn; auto.
   induction defs.
-  - simpl; auto.
-  - simpl. rewrite IHdefs. 
-    destruct a as [id gd]; simpl.
-    destruct gd as [f|v]; simpl; auto.
+  - cbn; auto.
+  - cbn. rewrite IHdefs. 
+    destruct a as [id gd]; cbn.
+    destruct gd as [f|v]; cbn; auto.
     rewrite transf_fundef_proj. auto.
 Qed.
 
@@ -705,16 +707,16 @@ Lemma match_program_transf:
   forall p tp, match_prog p tp -> transf_program p = tp.
 Proof.
   intros p tp H. inversion_clear H. inv H1.
-  destruct p as [defs pub main]. destruct tp as [tdefs tpub tmain]. simpl in *.
-  subst. unfold transf_program. unfold transform_program. simpl.
-  apply program_equals; simpl; auto.
-  induction H0; simpl; auto.
+  destruct p as [defs pub main]. destruct tp as [tdefs tpub tmain]. cbn in *.
+  subst. unfold transf_program. unfold transform_program. cbn.
+  apply program_equals; cbn; auto.
+  induction H0; cbn; auto.
   rewrite IHlist_forall2. apply cons_extract.
   destruct a1 as [ida gda]. destruct b1 as [idb gdb].
-  simpl in *.
+  cbn in *.
   inv H. inv H2.
-  - simpl in *. subst. auto.
-  - simpl in *. subst. inv H. auto.
+  - cbn in *. subst. auto.
+  - cbn in *. subst. inv H. auto.
 Qed.
 
 Section PRESERVATION.
@@ -742,7 +744,7 @@ Proof.
   pose proof (match_program_transf prog tprog TRANSF) as TR.
   subst. unfold semantics. rewrite transf_program_proj.
 
-  eapply forward_simulation_step with (match_states := match_states); simpl; auto.
+  eapply forward_simulation_step with (match_states := match_states); cbn; auto.
   - intros. exists s1. split; auto. congruence.
   - intros. inv H. auto.
   - intros. exists s1'. inv H0. split; auto. congruence.
diff --git a/kvx/Asmblock.v b/kvx/Asmblock.v
index 9c8e4cc3..64b2c535 100644
--- a/kvx/Asmblock.v
+++ b/kvx/Asmblock.v
@@ -78,7 +78,7 @@ Fixpoint code_to_basics (c: code) :=
 
 Lemma code_to_basics_id: forall c, code_to_basics (basics_to_code c) = Some c.
 Proof.
-  intros. induction c as [|i c]; simpl; auto.
+  intros. induction c as [|i c]; cbn; auto.
   rewrite IHc. auto.
 Qed.
 
@@ -88,8 +88,8 @@ Lemma code_to_basics_dist:
   code_to_basics c' = Some l' ->
   code_to_basics (c ++ c') = Some (l ++ l').
 Proof.
-  induction c as [|i c]; simpl; auto.
-  - intros. inv H. simpl. auto.
+  induction c as [|i c]; cbn; auto.
+  - intros. inv H. cbn. auto.
   - intros. destruct i; try discriminate. destruct (code_to_basics c) eqn:CTB; try discriminate.
     inv H. erewrite IHc; eauto. auto.
 Qed.
@@ -138,9 +138,9 @@ Lemma non_empty_bblock_refl:
 Proof.
   intros. split.
   - destruct body; destruct exit.
-    all: simpl; auto. intros. inversion H; contradiction.
+    all: cbn; auto. intros. inversion H; contradiction.
   - destruct body; destruct exit.
-    all: simpl; auto.
+    all: cbn; auto.
     all: intros; try (right; discriminate); try (left; discriminate).
     contradiction.
 Qed.
@@ -155,14 +155,14 @@ Lemma builtin_alone_refl:
 Proof.
   intros. split.
   - destruct body; destruct exit.
-    all: simpl; auto.
-    all: exploreInst; simpl; auto.
+    all: cbn; auto.
+    all: exploreInst; cbn; auto.
     unfold builtin_alone. intros. assert (Some (Pbuiltin e l b0) = Some (Pbuiltin e l b0)); auto.
     assert (b :: body = nil). eapply H; eauto. discriminate.
   - destruct body; destruct exit.
-    all: simpl; auto; try constructor.
+    all: cbn; auto; try constructor.
     + exploreInst; try discriminate.
-        simpl. contradiction.
+        cbn. contradiction.
     + intros. discriminate.
 Qed.
 
@@ -185,14 +185,14 @@ Ltac bblock_auto_correct := (apply non_empty_bblock_refl; try discriminate; try
 
 Lemma Istrue_proof_irrelevant (b: bool): forall (p1 p2:Is_true b), p1=p2.
 Proof.
-  destruct b; simpl; auto.
+  destruct b; cbn; auto.
   - destruct p1, p2; auto.
   - destruct p1.
 Qed.
 
 Lemma bblock_equality bb1 bb2: header bb1=header bb2 -> body bb1 = body bb2 -> exit bb1 = exit bb2 -> bb1 = bb2.
 Proof.
-  destruct bb1 as [h1 b1 e1 c1], bb2 as [h2 b2 e2 c2]; simpl.
+  destruct bb1 as [h1 b1 e1 c1], bb2 as [h2 b2 e2 c2]; cbn.
   intros; subst.
   rewrite (Istrue_proof_irrelevant _ c1 c2).
   auto.
@@ -212,51 +212,51 @@ Qed.
 Lemma length_nonil {A: Type} : forall l:(list A), l <> nil -> (length l > 0)%nat.
 Proof.
   intros. destruct l; try (contradict H; auto; fail).
-  simpl. omega.
+  cbn. omega.
 Qed.
 
 Lemma to_nat_pos : forall z:Z, (Z.to_nat z > 0)%nat -> z > 0.
 Proof.
   intros. destruct z; auto.
-  - contradict H. simpl. apply gt_irrefl.
+  - contradict H. cbn. apply gt_irrefl.
   - apply Zgt_pos_0.
-  - contradict H. simpl. apply gt_irrefl.
+  - contradict H. cbn. apply gt_irrefl.
 Qed.
 
 Lemma size_positive (b:bblock): size b > 0.
 Proof.
-  unfold size. destruct b as [hd bdy ex cor]. simpl.
-  destruct ex; destruct bdy; try (apply to_nat_pos; rewrite Nat2Z.id; simpl; omega).
-  inversion cor; contradict H; simpl; auto.
+  unfold size. destruct b as [hd bdy ex cor]. cbn.
+  destruct ex; destruct bdy; try (apply to_nat_pos; rewrite Nat2Z.id; cbn; omega).
+  inversion cor; contradict H; cbn; auto.
 Qed.
 
 
 Program Definition no_header (bb : bblock) := {| header := nil; body := body bb; exit := exit bb |}.
 Next Obligation.
-  destruct bb; simpl. assumption.
+  destruct bb; cbn. assumption.
 Defined.
 
 Lemma no_header_size:
   forall bb, size (no_header bb) = size bb.
 Proof.
-  intros. destruct bb as [hd bdy ex COR]. unfold no_header. simpl. reflexivity.
+  intros. destruct bb as [hd bdy ex COR]. unfold no_header. cbn. reflexivity.
 Qed.
 
 Program Definition stick_header (h : list label) (bb : bblock) := {| header := h; body := body bb; exit := exit bb |}.
 Next Obligation.
-  destruct bb; simpl. assumption.
+  destruct bb; cbn. assumption.
 Defined.
 
 Lemma stick_header_size:
   forall h bb, size (stick_header h bb) = size bb.
 Proof.
-  intros. destruct bb. unfold stick_header. simpl. reflexivity.
+  intros. destruct bb. unfold stick_header. cbn. reflexivity.
 Qed.
 
 Lemma stick_header_no_header:
   forall bb, stick_header (header bb) (no_header bb) = bb.
 Proof.
-  intros. destruct bb as [hd bdy ex COR]. simpl. unfold no_header; unfold stick_header; simpl. reflexivity.
+  intros. destruct bb as [hd bdy ex COR]. cbn. unfold no_header; unfold stick_header; cbn. reflexivity.
 Qed.
 
 (** * Sequential Semantics of basic blocks *)
@@ -308,7 +308,7 @@ Fixpoint exec_body (body: list basic) (rs: regset) (m: mem): outcome :=
 Theorem builtin_body_nil:
   forall bb ef args res, exit bb = Some (PExpand (Pbuiltin ef args res)) -> body bb = nil.
 Proof.
-  intros. destruct bb as [hd bdy ex WF]. simpl in *.
+  intros. destruct bb as [hd bdy ex WF]. cbn in *.
   apply wf_bblock_refl in WF. inv WF. unfold builtin_alone in H1.
   eapply H1; eauto.
 Qed.
@@ -321,11 +321,11 @@ Theorem exec_body_app:
     /\ exec_body l' rs' m' = Next rs'' m''.
 Proof.
   induction l.
-  - intros. simpl in H. repeat eexists. auto.
-  - intros. rewrite <- app_comm_cons in H. simpl in H.
+  - intros. cbn in H. repeat eexists. auto.
+  - intros. rewrite <- app_comm_cons in H. cbn in H.
     destruct (exec_basic_instr a rs m) eqn:EXEBI.
     + apply IHl in H. destruct H as (rs1 & m1 & EXEB1 & EXEB2).
-      repeat eexists. simpl. rewrite EXEBI. eauto. auto.
+      repeat eexists. cbn. rewrite EXEBI. eauto. auto.
     + discriminate.
 Qed.
 
diff --git a/kvx/Asmblockdeps.v b/kvx/Asmblockdeps.v
index 1881e7e9..3d981100 100644
--- a/kvx/Asmblockdeps.v
+++ b/kvx/Asmblockdeps.v
@@ -12,12 +12,14 @@
 (*                                                             *)
 (* *************************************************************)
 
-(** * Translation from Asmblock to AbstractBB 
+(** * Translation from [Asmvliw] to [AbstractBB] *)
 
-    We define a specific instance of AbstractBB, named L, translate bblocks from Asmblock into this instance
-    AbstractBB will then define two semantics for L : a sequential, and a semantic one
-    We prove a bisimulation between the parallel semantics of L and AsmVLIW
-    From this, we also deduce a bisimulation between the sequential semantics of L and Asmblock *)
+(** We define a specific instance [L] of [AbstractBB] and translate [bblocks] from [Asmvliw] into [L].
+    [AbstractBB] will then define two semantics for [L]: a sequential and a parallel one.
+    We prove a bisimulation between the parallel semantics of [L] and [AsmVLIW].
+    We also prove a bisimulation between the sequential semantics of [L] and [Asmblock].
+    Then, the checkers on [Asmblock] and [Asmvliw] are deduced from those of [L].
+ *)
 
 Require Import AST.
 Require Import Asmblock.
@@ -40,7 +42,7 @@ Require Import Lia.
 
 Open Scope impure.
 
-(** Definition of L *)
+(** Definition of [L] *)
 
 Module P<: ImpParam.
 Module R := Pos.
@@ -660,7 +662,7 @@ Module IST := ImpSimu L ImpPosDict.
 Import L.
 Import P.
 
-(** Compilation from Asmblock to L *)
+(** Compilation from [Asmvliw] to [L] *)
 
 Local Open Scope positive_scope.
 
@@ -748,6 +750,8 @@ Definition inv_ppos (p: R.t) : option preg :=
 
 Notation "a @ b" := (Econs a b) (at level 102, right associativity).
 
+(** Translations of instructions *)
+
 Definition trans_control (ctl: control) : inst :=
   match ctl with
   | Pret => [(#PC, PReg(#RA))]
@@ -859,6 +863,8 @@ Proof.
   intros. destruct bb as [hdr bdy ex COR]; unfold no_header; simpl. unfold trans_block. simpl. reflexivity.
 Qed.
 
+(** Lemmas on the translation *)
+
 Definition state := L.mem.
 Definition exec := L.run.
 
@@ -1800,6 +1806,7 @@ Definition bblock_simu_test (verb: bool) (p1 p2: Asmvliw.bblock) : ?? bool :=
 
 Local Hint Resolve IST.bblock_simu_test_correct bblock_simu_reduce IST.verb_bblock_simu_test_correct: wlp.
 
+(** Main simulation (Impure) theorem *) 
 Theorem bblock_simu_test_correct verb p1 p2 :
   WHEN bblock_simu_test verb p1 p2 ~> b THEN b=true -> forall ge fn, Asmblockprops.bblock_simu ge fn p1 p2.
 Proof.
@@ -1807,7 +1814,7 @@ Proof.
 Qed.
 Hint Resolve bblock_simu_test_correct: wlp.
 
-(* Coerce bblock_simu_test into a pure function (this is a little unsafe like all oracles in CompCert). *)
+(** ** Coerce bblock_simu_test into a pure function (this is a little unsafe like all oracles in CompCert). *)
 
 Import UnsafeImpure.
 
diff --git a/kvx/Asmblockgenproof.v b/kvx/Asmblockgenproof.v
index 5cb498bc..df1a070f 100644
--- a/kvx/Asmblockgenproof.v
+++ b/kvx/Asmblockgenproof.v
@@ -13,7 +13,7 @@
 (*                                                             *)
 (* *************************************************************)
 
-(** Correctness proof for RISC-V generation: main proof. *)
+(** Correctness proof for kvx/Asmblock generation: main proof. *)
 
 Require Import Coqlib Errors.
 Require Import Integers Floats AST Linking.
diff --git a/kvx/Asmblockprops.v b/kvx/Asmblockprops.v
index bc14b231..c3929be5 100644
--- a/kvx/Asmblockprops.v
+++ b/kvx/Asmblockprops.v
@@ -53,7 +53,7 @@ Qed.
 Lemma preg_of_not_SP:
   forall r, preg_of r <> SP.
 Proof.
-  intros. unfold preg_of; destruct r; simpl; congruence.
+  intros. unfold preg_of; destruct r; cbn; congruence.
 Qed.
 
 Hint Resolve preg_of_not_SP preg_of_not_PC: asmgen.
@@ -233,7 +233,7 @@ Proof.
   destruct (ireg_eq rd2 ra); try discriminate.
 *)
   rewrite Pregmap.gso; try discriminate.
-  simpl in *.
+  cbn in *.
   destruct (Mem.loadv _ _ _); try discriminate.
   destruct (Mem.loadv _ _ _); try discriminate.
   destruct (Mem.loadv _ _ _); try discriminate.
@@ -264,7 +264,7 @@ Lemma exec_store_q_offset_pc_var:
   exec_store_q_offset rs # PC <- v m rd ra ofs = Next rs' # PC <- v m'.
 Proof.
   intros. unfold exec_store_q_offset in *. unfold parexec_store_q_offset in *. rewrite Pregmap.gso; try discriminate.
-  simpl in *.
+  cbn in *.
   destruct (gpreg_q_expand _) as [s0 s1].
   destruct (Mem.storev _ _ _); try discriminate.
   destruct (Mem.storev _ _ _); try discriminate.
diff --git a/kvx/Asmgenproof.v b/kvx/Asmgenproof.v
index f43acd37..636c105f 100644
--- a/kvx/Asmgenproof.v
+++ b/kvx/Asmgenproof.v
@@ -13,7 +13,7 @@
 (*                                                             *)
 (* *************************************************************)
 
-(** Correctness proof for Asmgen *)
+(** Composing all passes from Mach to KVX Asm *)
 
 Require Import Coqlib Errors.
 Require Import Integers Floats AST Linking.
@@ -39,14 +39,14 @@ Proof.
   unfold Asmgen.transf_program in H. apply bind_inversion in H. destruct H.
   inversion_clear H. apply bind_inversion in H1. destruct H1.
   inversion_clear H. inversion H2. unfold time, Compopts.time in *. remember (Machblockgen.transf_program p) as mbp.
-  unfold match_prog; simpl.
+  unfold match_prog; cbn.
   exists mbp; split. apply Machblockgenproof.transf_program_match; auto.
   exists x; split. apply Asmblockgenproof.transf_program_match; auto.
   exists x0; split. apply PostpassSchedulingproof.transf_program_match; auto.
   exists tp; split. apply Asm.transf_program_match; auto. auto.
 Qed.
 
-(** Return Address Offset *)
+(** Return Address Offset for Mach *)
 
 Definition return_address_offset: Mach.function -> Mach.code -> ptrofs -> Prop :=
   Mach_return_address_offset Asmblockgenproof.return_address_offset.
@@ -59,6 +59,7 @@ Proof.
   intros; eapply Asmblockgenproof.return_address_exists; eauto.
 Qed.
 
+(** Main preservation theorem: from Mach to KVX Asm *)
 
 Section PRESERVATION.
 
@@ -71,7 +72,7 @@ Let tge := Genv.globalenv tprog.
 Theorem transf_program_correct:
   forward_simulation (Mach.semantics return_address_offset prog) (Asm.semantics tprog).
 Proof.
-  unfold match_prog in TRANSF. simpl in TRANSF.
+  unfold match_prog in TRANSF. cbn in TRANSF.
   inv TRANSF. inv H. inv H1. inv H. inv H2. inv H. inv H3. inv H.
   eapply compose_forward_simulations. 
   exploit Machblockgenproof.transf_program_correct; eauto.
@@ -86,7 +87,7 @@ End PRESERVATION.
 Instance TransfAsm: TransfLink match_prog := pass_match_link (compose_passes block_passes).
 
 (*******************************************)
-(* Stub actually needed by driver/Compiler *)
+(** Stub actually needed by driver/Compiler *)
 
 Module Asmgenproof0.
 
diff --git a/kvx/Asmvliw.v b/kvx/Asmvliw.v
index 301ee69a..66b468d7 100644
--- a/kvx/Asmvliw.v
+++ b/kvx/Asmvliw.v
@@ -41,7 +41,7 @@ Require Import Chunks.
     this view induces our sequential semantics of bundles defined in [Asmblock].
 *)
 
-(** General Purpose registers. *)
+(** ** General Purpose registers. *)
 
 Inductive gpreg: Type :=
   | GPR0:  gpreg | GPR1:  gpreg | GPR2:  gpreg | GPR3:  gpreg | GPR4:  gpreg
@@ -165,7 +165,7 @@ End PregEq.
 
 Module Pregmap := EMap(PregEq).
 
-(** Conventional names for stack pointer ([SP]), return address ([RA]), frame pointer ([FP]) and other temporaries used *)
+(** ** Conventional names for stack pointer ([SP]), return address ([RA]), frame pointer ([FP]) and other temporaries used *)
 
 Notation "'SP'" := GPR12 (only parsing) : asm.
 Notation "'FP'" := GPR17 (only parsing) : asm.
@@ -173,6 +173,8 @@ Notation "'MFP'" := R17 (only parsing) : asm.
 Notation "'GPRA'" := GPR16 (only parsing) : asm.
 Notation "'RTMP'" := GPR32 (only parsing) : asm.
 
+(** ** Names of tests in comparisons *)
+
 Inductive btest: Type :=
   | BTdnez                              (**r Double Not Equal to Zero *)
   | BTdeqz                              (**r Double Equal to Zero *)
@@ -214,55 +216,47 @@ Inductive ftest: Type :=
   | FTult                               (**r Unordered or Less Than *)
   .
 
-(** Offsets for load and store instructions.  An offset is either an
-  immediate integer or the low part of a symbol. *)
+(** *** Offsets for load and store instructions. *)
 
 Definition offset : Type := ptrofs.
 
-(** We model a subset of the KVX instruction set. In particular, we do not
-  support floats yet.
+(** *** Labels for goto (in the current function) *)
 
-  Although it is possible to use the 32-bits mode, for now we don't support it.
+Definition label := positive.
 
-  We follow a design close to the one used for the Risc-V port: one set of
-  pseudo-instructions for 32-bit integer arithmetic, with suffix W, another
-  set for 64-bit integer arithmetic, with suffix L.
+(** ** Instructions *)
+
+(** We model a subset of the KVX instruction set.
 
-  When mapping to actual instructions, the OCaml code in TargetPrinter.ml
+-  Although it is possible to use the 32-bits mode, for now we don't support it. When mapping to actual instructions, the OCaml code in TargetPrinter.ml
   throws an error if we are not in 64-bits mode.
-*)
 
-(** * Instructions *)
+- We follow a design close to the one used for the Risc-V port: one set of
+  pseudo-instructions for 32-bit integer arithmetic, with suffix W, another
+  set for 64-bit integer arithmetic, with suffix L.
 
-Definition label := positive.
+- With respect to other CompCert assemblies, we define a type hierarchy of instructions (instead of a flat type).
+  This helps us to factorize similar cases for the scheduling verifier.
+
+*)
 
-(** Instructions to be expanded in control-flow *)
+(** *** Instructions to be expanded in control-flow *)
 Inductive ex_instruction : Type :=
   (* Pseudo-instructions *)
   | Pbuiltin: external_function -> list (builtin_arg preg)
               -> builtin_res preg -> ex_instruction   (**r built-in function (pseudo) *)
 .
 
-(** FIXME: comment not up to date !
-
-
- The pseudo-instructions are the following:
+(** Similarly to other CompCert assembly languages, the pseudo-instructions are the following:
 
 - [Ploadsymbol]: load the address of a symbol in an integer register.
-  Expands to the [la] assembler pseudo-instruction, which does the right
-  thing even if we are in PIC mode.
 
 - [Pallocframe sz pos]: in the formal semantics, this
   pseudo-instruction allocates a memory block with bounds [0] and
   [sz], stores the value of the stack pointer at offset [pos] in this
   block, and sets the stack pointer to the address of the bottom of
   this block.
-  In the printed ASM assembly code, this allocation is:
-<<
-        mv      x30, sp
-        sub     sp,  sp, #sz
-        sw      x30, #pos(sp)
->>
+
   This cannot be expressed in our memory model, which does not reflect
   the fact that stack frames are adjacent and allocated/freed
   following a stack discipline.
@@ -270,25 +264,13 @@ Inductive ex_instruction : Type :=
 - [Pfreeframe sz pos]: in the formal semantics, this pseudo-instruction
   reads the word at [pos] of the block pointed by the stack pointer,
   frees this block, and sets the stack pointer to the value read.
-  In the printed ASM assembly code, this freeing is just an increment of [sp]:
-<<
-        add     sp,  sp, #sz
->>
   Again, our memory model cannot comprehend that this operation
   frees (logically) the current stack frame.
 
 - [Pbtbl reg table]: this is a N-way branch, implemented via a jump table
-  as follows:
-<<
-        la      x31, table
-        add     x31, x31, reg
-        jr      x31
-table:  .long   table[0], table[1], ...
->>
-  Note that [reg] contains 4 times the index of the desired table entry.
 *)
 
-(** Control Flow instructions *)
+(** *** Control Flow instructions *)
 Inductive cf_instruction : Type :=
   | Pret                                            (**r return *)
   | Pcall   (l: label)                              (**r function call *)
@@ -305,7 +287,7 @@ Inductive cf_instruction : Type :=
   | Pcbu    (bt: btest) (r: ireg) (l: label)        (**r branch based on btest with unsigned semantics *)
 .
 
-(** Loads **)
+(** *** Loads *)
 Definition concrete_default_notrap_load_value (chunk : memory_chunk) :=
   match chunk with
   | Mint8signed | Mint8unsigned | Mint16signed | Mint16unsigned
@@ -337,7 +319,7 @@ Inductive ld_instruction : Type :=
   | PLoadORRO  (rd: gpreg_o) (ra: ireg) (ofs: offset)
 .
 
-(** Stores **)
+(** *** Stores *)
 Inductive store_name : Type :=
   | Psb                                             (**r store byte *)
   | Psh                                             (**r store half byte *)
@@ -357,7 +339,7 @@ Inductive st_instruction : Type :=
   | PStoreORRO (rs: gpreg_o) (ra: ireg) (ofs: offset)
 .
 
-(** Arithmetic instructions **)
+(** *** Arithmetic instructions *)
 Inductive arith_name_r : Type :=
   | Ploadsymbol (id: ident) (ofs: ptrofs)           (**r load the address of a symbol *)
 .
@@ -571,6 +553,8 @@ Coercion PArithARRI64:   arith_name_arri64    >-> Funclass.
 
 End PArithCoercions.
 
+(** ** Basic instructions *)
+
 Inductive basic : Type :=
   | PArith          (i: ar_instruction)
   | PLoad           (i: ld_instruction)
@@ -586,6 +570,7 @@ Coercion PLoad:         ld_instruction >-> basic.
 Coercion PStore:        st_instruction >-> basic.
 Coercion PArith:        ar_instruction >-> basic.
 
+(** ** Control-flow instructions *)
 
 Inductive control : Type :=
   | PExpand         (i: ex_instruction)
@@ -596,9 +581,9 @@ Coercion PExpand:   ex_instruction >-> control.
 Coercion PCtlFlow:  cf_instruction >-> control.
 
 
-(** * Definition of a bblock (ie a bundle)
+(** * Definition of a bblock (ie a bundle) *)
 
-A bundle/bblock must contain at least one instruction.
+(** A bundle/bblock must contain at least one instruction.
 
 This choice simplifies the definition of [find_bblock] below:
 indeed, each address of a code block identifies at most one bundle
@@ -621,9 +606,8 @@ Definition non_empty_exit (exit: option control): bool :=
 Definition non_empty_bblockb (body: list basic) (exit: option control): bool := non_empty_body body || non_empty_exit exit.
 
 
-(** TODO
- * For now, we consider a builtin is alone in a bundle (and a basic block).
- * Is there a way to avoid that ?
+(** For now, we consider a builtin is alone in a bundle (and a basic block).
+    Is there a way to avoid that ? (TODO)
  *)
 Definition builtin_aloneb (body: list basic) (exit: option control) :=
   match exit with
@@ -655,12 +639,12 @@ Definition length_opt {A} (o: option A) : nat :=
   | None => 0
   end.
 
-(* WARNING: the notion of size is not the same than in Machblock !
-   We ignore labels here...
+(** The notion of size induces the notion of "valid" code address given by [find_bblock]
+    The result is in Z to be compatible with operations on PC. 
 
-   This notion of size induces the notion of "valid" code address given by [find_bblock]
+    WARNING: this notion of size is not the same than in Machblock !
+    We ignore labels here...
 
-   The result is in Z to be compatible with operations on PC.
 *)
 Definition size (b:bblock): Z := Z.of_nat (length (body b) + length_opt (exit b)).
 
@@ -678,7 +662,7 @@ Record function : Type := mkfunction { fn_sig: signature; fn_blocks: bblocks }.
 Definition fundef := AST.fundef function.
 Definition program := AST.program fundef unit.
 
-(** * Operational semantics *)
+(** * Parallel Semantics of bundles *)
 
 (** The semantics operates over a single mapping from registers
   (type [preg]) to values.  We maintain
@@ -695,7 +679,7 @@ Notation "a # b <- c" := (Pregmap.set b c a) (at level 1, b at next level) : asm
 
 Open Scope asm.
 
-(** Undefining some registers *)
+(** *** Undefining some registers *)
 
 Fixpoint undef_regs (l: list preg) (rs: regset) : regset :=
   match l with
@@ -704,7 +688,7 @@ Fixpoint undef_regs (l: list preg) (rs: regset) : regset :=
   end.
 
 
-(** Assigning a register pair *)
+(** *** Assigning a register pair *)
 Definition set_pair (p: rpair preg) (v: val) (rs: regset) : regset :=
   match p with
   | One r => rs#r <- v
@@ -712,7 +696,7 @@ Definition set_pair (p: rpair preg) (v: val) (rs: regset) : regset :=
   end.
 
 
-(** Assigning the result of a builtin *)
+(** *** Assigning the result of a builtin *)
 
 Fixpoint set_res (res: builtin_res preg) (v: val) (rs: regset) : regset :=
   match res with
@@ -723,12 +707,8 @@ Fixpoint set_res (res: builtin_res preg) (v: val) (rs: regset) : regset :=
 
 Local Open Scope asm.
 
-(** * Parallel Semantics of bundles *)
-
 Section RELSEM.
 
-(** Execution of arith instructions *)
-
 Variable ge: genv.
 
 (** The parallel semantics on bundles is purely small-step and defined as a relation
@@ -753,7 +733,7 @@ Inductive outcome: Type :=
   | Stuck
 .
 
-(** ** Arithmetic Expressions (including comparisons) *)
+(** *** Arithmetic Expressions (including comparisons) *)
 
 Inductive signedness: Type := Signed | Unsigned.
 
@@ -800,7 +780,7 @@ Definition notftest_for_cmp (c: comparison) :=
   | Cge => Normal FTult
   end.
 
-(* CoMPare Signed Words to Zero *)
+(* **** CoMPare Signed Words to Zero *)
 Definition btest_for_cmpswz (c: comparison) :=
   match c with
   | Cne => BTwnez
@@ -811,7 +791,7 @@ Definition btest_for_cmpswz (c: comparison) :=
   | Cgt => BTwgtz
   end.
 
-(* CoMPare Signed Doubles to Zero *)
+(* **** CoMPare Signed Doubles to Zero *)
 Definition btest_for_cmpsdz (c: comparison) :=
   match c with
   | Cne => BTdnez
@@ -849,7 +829,7 @@ Definition cmpu_for_btest (bt: btest) :=
   end.
 
 
-(* a few lemma on comparisons of unsigned (e.g. pointers) *)
+(* **** a few lemma on comparisons of unsigned (e.g. pointers) *)
 
 Definition Val_cmpu_bool cmp v1 v2: option bool :=
   Val.cmpu_bool (fun _ _ => true) cmp v1 v2.
@@ -869,7 +849,7 @@ Lemma Val_cmpu_correct (m:mem) (cmp: comparison) (v1 v2: val):
 Proof.
   unfold Val.cmpu, Val_cmpu.
   remember (Val.cmpu_bool (Mem.valid_pointer m) cmp v1 v2) as ob.
-  destruct ob; simpl.
+  destruct ob; cbn.
   - erewrite Val_cmpu_bool_correct; eauto.
     econstructor.
   - econstructor.
@@ -893,15 +873,15 @@ Lemma Val_cmplu_correct (m:mem) (cmp: comparison) (v1 v2: val):
 Proof.
   unfold Val.cmplu, Val_cmplu.
   remember (Val.cmplu_bool (Mem.valid_pointer m) cmp v1 v2) as ob.
-  destruct ob as [b|]; simpl.
+  destruct ob as [b|]; cbn.
   - erewrite Val_cmplu_bool_correct; eauto.
-    simpl. econstructor.
+    cbn. econstructor.
   - econstructor.
 Qed.
 
 
 
-(** Comparing integers *)
+(** **** Comparing integers *)
 Definition compare_int (t: itest) (v1 v2: val): val :=
   match t with
   | ITne  => Val.cmp Cne v1 v2
@@ -961,6 +941,8 @@ Definition compare_float (t: ftest) (v1 v2: val): val :=
   | FTult => Val.notbool (Val.cmpf Cge v1 v2)
   end.
 
+(** **** Arithmetic evaluators *)
+
 Definition arith_eval_r n :=
   match n with
   | Ploadsymbol s ofs => Genv.symbol_address ge s ofs
@@ -1212,7 +1194,7 @@ Definition parexec_arith_instr (ai: ar_instruction) (rsr rsw: regset): regset :=
 
 Definition eval_offset (ofs: offset) : res ptrofs := OK ofs.
 
-(** * load/store *)
+(** *** load/store instructions *)
 
 Definition parexec_incorrect_load trap chunk d rsw mw :=
   match trap with
@@ -1361,7 +1343,7 @@ Definition store_chunk n :=
   | Pfsd => Mfloat64
   end.
 
-(** * basic instructions *)
+(** ** Basic (instruction) step *)
 
 Definition bstep (bi: basic) (rsr rsw: regset) (mr mw: mem) :=
   match bi with
@@ -1417,7 +1399,7 @@ Definition bstep (bi: basic) (rsr rsw: regset) (mr mw: mem) :=
   | Pnop => Next rsw mw
 end.
 
-(* parexec with writes-in-order *)
+(** *** parexec with writes-in-order *)
 Fixpoint parexec_wio_body (body: list basic) (rsr rsw: regset) (mr mw: mem) :=
   match body with
   | nil => Next rsw mw
@@ -1428,7 +1410,7 @@ Fixpoint parexec_wio_body (body: list basic) (rsr rsw: regset) (mr mw: mem) :=
      end
   end.
 
-(** TODO: redundant w.r.t Machblock ?? *)
+(* TODO: redundant w.r.t Machblock ?? *)
 Lemma in_dec (lbl: label) (l: list label):  { List.In lbl l } + { ~(List.In lbl l) }.
 Proof.
   apply List.in_dec.
@@ -1437,25 +1419,25 @@ Qed.
 
 
 
-(** Note: copy-paste from Machblock *)
+(* Note: copy-paste from Machblock *)
 Definition is_label (lbl: label) (bb: bblock) : bool :=
   if in_dec lbl (header bb) then true else false.
 
 Lemma is_label_correct_true lbl bb:
   List.In lbl (header bb) <-> is_label lbl bb = true. 
 Proof.
-  unfold is_label; destruct (in_dec lbl (header bb)); simpl; intuition.
+  unfold is_label; destruct (in_dec lbl (header bb)); cbn; intuition.
 Qed.
 
 Lemma is_label_correct_false lbl bb:
   ~(List.In lbl (header bb)) <-> is_label lbl bb = false. 
 Proof.
-  unfold is_label; destruct (in_dec lbl (header bb)); simpl; intuition.
+  unfold is_label; destruct (in_dec lbl (header bb)); cbn; intuition.
 Qed.
 
 
 
-(** convert a label into a position in the code *)
+(** **** convert a label into a position in the code *)
 Fixpoint label_pos (lbl: label) (pos: Z) (lb: bblocks) {struct lb} : option Z :=
   match lb with
   | nil => None
@@ -1472,11 +1454,9 @@ Definition par_goto_label (f: function) (lbl: label) (rsr rsw: regset) (mw: mem)
       end
   end.
 
-(** Evaluating a branch 
+(** **** Parallel Evaluation of a branch *)
 
-Warning: in m PC is assumed to be already pointing on the next instruction !
-
-*)
+(** Warning: PC is assumed to be already pointing on the next bundle ! *)
 
 Definition par_eval_branch (f: function) (l: label) (rsr rsw: regset) (mw: mem) (res: option bool) :=
   match res with
@@ -1486,72 +1466,54 @@ Definition par_eval_branch (f: function) (l: label) (rsr rsw: regset) (mw: mem)
   end.
 
 
-(* FIXME: comment not up-to-date for parallel semantics *)
-
-(** Execution of a single control-flow instruction [i] in initial state [rs] and
-    [m].  Return updated state.
-
-    As above: PC is assumed to be incremented on the next block before the control-flow instruction
-
-    For instructions that correspond tobuiltin
-    actual RISC-V instructions, the cases are straightforward
-    transliterations of the informal descriptions given in the RISC-V
-    user-mode specification.  For pseudo-instructions, refer to the
-    informal descriptions given above.
+(** **** Parallel execution of a control-flow instruction  *)
 
-    Note that we set to [Vundef] the registers used as temporaries by
-    the expansions of the pseudo-instructions, so that the RISC-V code
-    we generate cannot use those registers to hold values that must
-    survive the execution of the pseudo-instruction. *)
+(**  As above: PC is assumed to be incremented on the next block before the control-flow instruction
+*)
 
 Definition parexec_control (f: function) (oc: option control) (rsr rsw: regset) (mw: mem) :=
   match oc with
-  | Some ic =>
-(** Get/Set system registers *)
-  match ic with
-
-
-(** Branch Control Unit instructions *)
-  | Pret =>
+  | None => Next (rsw#PC <- (rsr#PC)) mw
+  | Some ic => (**r Branch Control Unit instructions *)
+    match ic with
+    | Pret =>
       Next (rsw#PC <- (rsr#RA)) mw
-  | Pcall s =>
+    | Pcall s =>
       Next (rsw#RA <- (rsr#PC) #PC <- (Genv.symbol_address ge s Ptrofs.zero)) mw
-  | Picall r =>
+    | Picall r =>
       Next (rsw#RA <- (rsr#PC) #PC <- (rsr#r)) mw
-  | Pjumptable r tbl =>
+    | Pjumptable r tbl =>
       match rsr#r with
       | Vint n =>
-          match list_nth_z tbl (Int.unsigned n) with
-          | None => Stuck
-          | Some lbl => par_goto_label f lbl rsr (rsw #GPR62 <- Vundef #GPR63 <- Vundef) mw
-          end
+        match list_nth_z tbl (Int.unsigned n) with
+        | None => Stuck
+        | Some lbl => par_goto_label f lbl rsr (rsw #GPR62 <- Vundef #GPR63 <- Vundef) mw
+        end
       | _ => Stuck
       end
-  | Pgoto s =>
+    | Pgoto s =>
       Next (rsw#PC <- (Genv.symbol_address ge s Ptrofs.zero)) mw
-  | Pigoto r =>
+    | Pigoto r =>
       Next (rsw#PC <- (rsr#r)) mw
-  | Pj_l l =>
+    | Pj_l l =>
       par_goto_label f l rsr rsw mw
-  | Pcb bt r l =>
+    | Pcb bt r l =>
       match cmp_for_btest bt with
       | (Some c, Int)  => par_eval_branch f l rsr rsw mw (Val.cmp_bool c rsr#r (Vint (Int.repr 0)))
       | (Some c, Long) => par_eval_branch f l rsr rsw mw (Val.cmpl_bool c rsr#r (Vlong (Int64.repr 0)))
       | (None, _) => Stuck
       end
-  | Pcbu bt r l => 
+    | Pcbu bt r l => 
       match cmpu_for_btest bt with
       | (Some c, Int) => par_eval_branch f l rsr rsw mw (Val_cmpu_bool c rsr#r (Vint (Int.repr 0)))
       | (Some c, Long) => par_eval_branch f l rsr rsw mw (Val_cmplu_bool c rsr#r (Vlong (Int64.repr 0)))
       | (None, _) => Stuck
       end
-
-(** Pseudo-instructions *)
-  | Pbuiltin ef args res =>
+    (**r Pseudo-instructions *)
+    | Pbuiltin ef args res =>
       Stuck (**r treated specially below *)
-  end
-  | None => Next (rsw#PC <- (rsr#PC)) mw
-end.
+    end
+  end.
 
 
 Definition incrPC size_b (rs: regset) :=
@@ -1567,7 +1529,7 @@ Definition parexec_wio f bdy ext size_b (rs: regset) (m: mem): outcome :=
   | Stuck => Stuck
   end.
 
-(** non-deterministic (out-of-order writes) parallel execution of bundles *)
+(** *** non-deterministic (out-of-order writes) parallel execution of bundles *)
 Definition parexec_bblock (f: function) (bundle: bblock) (rs: regset) (m: mem) (o: outcome): Prop :=
    exists bdy1 bdy2, Permutation (bdy1++bdy2) (body bundle) /\ 
       o=match parexec_wio f bdy1 (exit bundle) (Ptrofs.repr (size bundle)) rs m with
@@ -1575,14 +1537,13 @@ Definition parexec_bblock (f: function) (bundle: bblock) (rs: regset) (m: mem) (
       | Stuck => Stuck
       end.
 
-(** deterministic parallel (out-of-order writes) execution of bundles *)
+(** *** deterministic parallel (out-of-order writes) execution of bundles *)
 Definition det_parexec (f: function) (bundle: bblock) (rs: regset) (m: mem) rs' m': Prop :=
    forall o, parexec_bblock f bundle rs m o -> o = Next rs' m'.
 
 
-(* FIXME: comment not up-to-date *)
-(** Translation of the LTL/Linear/Mach view of machine registers to
-  the RISC-V view.  Note that no LTL register maps to [X31].  This
+(** *** Translation of the LTL/Linear/Mach view of machine registers to
+  the assembly view.  Note that no LTL register maps to [X31].  This
   register is reserved as temporary, to be used by the generated RV32G
   code.  *)
 
@@ -1605,7 +1566,7 @@ Definition preg_of (r: mreg) : preg :=
   | R60 => GPR60 | R61 => GPR61 | R62 => GPR62 | R63 => GPR63
   end.
 
-(** Undefine all registers except SP and callee-save registers *)
+(** **** Undefine all registers except SP and callee-save registers *)
 
 Definition undef_caller_save_regs (rs: regset) : regset :=
   fun r =>
@@ -1614,10 +1575,9 @@ Definition undef_caller_save_regs (rs: regset) : regset :=
     then rs r
     else Vundef.
 
-(* FIXME: comment not up-to-date *)
-(** Extract the values of the arguments of an external call.
+(** **** Extract the values of the arguments of an external call.
     We exploit the calling conventions from module [Conventions], except that
-    we use RISC-V registers instead of locations. *)
+    we use assembly registers instead of locations. *)
 
 Inductive extcall_arg (rs: regset) (m: mem): loc -> val -> Prop :=
   | extcall_arg_reg: forall r,
@@ -1646,12 +1606,12 @@ Definition loc_external_result (sg: signature) : rpair preg :=
   map_rpair preg_of (loc_result sg).
 
 
-(** Looking up bblocks in a code sequence by position. *)
+(** ** Looking up bblocks in a code sequence by position. *)
 Fixpoint find_bblock (pos: Z) (lb: bblocks) {struct lb} : option bblock :=
   match lb with
   | nil => None
   | b :: il => 
-    if zlt pos 0 then None  (* NOTE: It is impossible to branch inside a block *)
+    if zlt pos 0 then None  (*r NOTE: It is impossible to branch inside a block *)
     else if zeq pos 0 then Some b
     else find_bblock (pos - (size b)) il
   end.
@@ -1707,7 +1667,7 @@ Proof.
    constructor 1.
    - rewrite app_nil_r; auto.
    - unfold parexec_wio_bblock.
-     destruct (parexec_wio f _ _ _); simpl; auto.
+     destruct (parexec_wio f _ _ _); cbn; auto.
 Qed.
 
 
@@ -1721,9 +1681,7 @@ Qed.
 
 End RELSEM.
 
-(** Execution of whole programs. *)
-
-(** Execution of whole programs. *)
+(** ** Execution of whole programs. *)
 
 Inductive initial_state (p: program): state -> Prop :=
   | initial_state_intro: forall m0,
@@ -1781,7 +1739,7 @@ Ltac Det_WIO X :=
       exploit det_parexec_write_in_order; [ eapply H | idtac]; clear H; intro X
   | _ => idtac
   end.
-  intros; constructor; simpl.
+  intros; constructor; cbn.
 - (* determ *) intros s t1 s1 t2 s2 H H0. inv H; Det_WIO X1;
   inv H0; Det_WIO X2; Equalities.
   + split. constructor. auto. 
@@ -1796,7 +1754,7 @@ Ltac Det_WIO X :=
     exploit external_call_determ. eexact H3. eexact H8. intros [A B].
     split. auto. intros. destruct B; auto. subst. auto.
 - (* trace length *)
-  red; intros. inv H; simpl.
+  red; intros. inv H; cbn.
   omega.
   eapply external_call_trace_length; eauto.
   eapply external_call_trace_length; eauto.
diff --git a/kvx/CSE2depsproof.v b/kvx/CSE2depsproof.v
index f283c8ac..6c584450 100644
--- a/kvx/CSE2depsproof.v
+++ b/kvx/CSE2depsproof.v
@@ -71,7 +71,7 @@ Section MEMORY_WRITE.
     unfold largest_size_chunk in *.
 
     rewrite ptrofs_modulus in *.
-    simpl in *.
+    cbn in *.
     inv ADDRR.
     inv ADDRW.
     destruct base; try discriminate.
@@ -126,12 +126,12 @@ Proof.
   { (* Aindexed / Aindexed *)
   destruct args as [ | base [ | ]]. 1,3: discriminate.
   destruct args' as [ | base' [ | ]]. 1,3: discriminate.
-  simpl in OVERLAP.
+  cbn in OVERLAP.
   destruct (peq base base'). 2: discriminate.
   subst base'.
   destruct (can_swap_accesses_ofs (Ptrofs.unsigned i0) chunk' (Ptrofs.unsigned i) chunk) eqn:SWAP.
   2: discriminate.
-  simpl in *.
+  cbn in *.
   eapply load_store_away with (F:=F) (V:=V) (genv:=genv) (sp:=sp); eassumption.
   }
 Qed.
diff --git a/kvx/CombineOpproof.v b/kvx/CombineOpproof.v
index dafc90df..5dffc565 100644
--- a/kvx/CombineOpproof.v
+++ b/kvx/CombineOpproof.v
@@ -46,7 +46,7 @@ Qed.
 Ltac UseGetSound :=
   match goal with
   | [ H: get _ = Some _ |- _ ] =>
-      let x := fresh "EQ" in (generalize (get_op_sound _ _ _ H); intros x; simpl in x; FuncInv)
+      let x := fresh "EQ" in (generalize (get_op_sound _ _ _ H); intros x; cbn in x; FuncInv)
   end.
 
 Lemma combine_compimm_ne_0_sound:
@@ -58,7 +58,7 @@ Proof.
   intros until args. functional induction (combine_compimm_ne_0 get x); intros EQ; inv EQ.
   (* of cmp *)
   UseGetSound. rewrite <- H.
-  destruct (eval_condition cond (map valu args) m); simpl; auto. destruct b; auto.
+  destruct (eval_condition cond (map valu args) m); cbn; auto. destruct b; auto.
 Qed.
 
 Lemma combine_compimm_eq_0_sound:
@@ -71,7 +71,7 @@ Proof.
   (* of cmp *)
   UseGetSound. rewrite <- H.
   rewrite eval_negate_condition.
-  destruct (eval_condition c (map valu args) m); simpl; auto. destruct b; auto.
+  destruct (eval_condition c (map valu args) m); cbn; auto. destruct b; auto.
 Qed.
 
 Lemma combine_compimm_eq_1_sound:
@@ -83,7 +83,7 @@ Proof.
   intros until args. functional induction (combine_compimm_eq_1 get x); intros EQ; inv EQ.
   (* of cmp *)
   UseGetSound. rewrite <- H.
-  destruct (eval_condition cond (map valu args) m); simpl; auto. destruct b; auto.
+  destruct (eval_condition cond (map valu args) m); cbn; auto. destruct b; auto.
 Qed.
 
 Lemma combine_compimm_ne_1_sound:
@@ -96,7 +96,7 @@ Proof.
   (* of cmp *)
   UseGetSound. rewrite <- H.
   rewrite eval_negate_condition.
-  destruct (eval_condition c (map valu args) m); simpl; auto. destruct b; auto.
+  destruct (eval_condition c (map valu args) m); cbn; auto. destruct b; auto.
 Qed.
 
 Theorem combine_cond_sound:
@@ -106,21 +106,21 @@ Theorem combine_cond_sound:
 Proof.
   intros. functional inversion H; subst.
   (* compimm ne zero *)
-  - simpl; eapply combine_compimm_ne_0_sound; eauto.
+  - cbn; eapply combine_compimm_ne_0_sound; eauto.
   (* compimm ne one *)
-  - simpl; eapply combine_compimm_ne_1_sound; eauto.
+  - cbn; eapply combine_compimm_ne_1_sound; eauto.
   (* compimm eq zero *)
-  - simpl; eapply combine_compimm_eq_0_sound; eauto.
+  - cbn; eapply combine_compimm_eq_0_sound; eauto.
   (* compimm eq one *)
-  - simpl; eapply combine_compimm_eq_1_sound; eauto.
+  - cbn; eapply combine_compimm_eq_1_sound; eauto.
   (* compuimm ne zero *)
-  - simpl; eapply combine_compimm_ne_0_sound; eauto.
+  - cbn; eapply combine_compimm_ne_0_sound; eauto.
   (* compuimm ne one *)
-  - simpl; eapply combine_compimm_ne_1_sound; eauto.
+  - cbn; eapply combine_compimm_ne_1_sound; eauto.
   (* compuimm eq zero *)
-  - simpl; eapply combine_compimm_eq_0_sound; eauto.
+  - cbn; eapply combine_compimm_eq_0_sound; eauto.
   (* compuimm eq one *)
-  - simpl; eapply combine_compimm_eq_1_sound; eauto.
+  - cbn; eapply combine_compimm_eq_1_sound; eauto.
 Qed.
 
 Theorem combine_addr_sound:
@@ -130,10 +130,10 @@ Theorem combine_addr_sound:
 Proof.
   intros. functional inversion H; subst.
 - (* indexed - addimm *)
-  UseGetSound. simpl. rewrite <- H0. destruct v; auto. simpl; rewrite H7; simpl.
+  UseGetSound. cbn. rewrite <- H0. destruct v; auto. cbn; rewrite H7; cbn.
   rewrite Ptrofs.add_assoc. auto.
 - (* indexed - addimml *)
-  UseGetSound. simpl. rewrite <- H0. destruct v; auto. simpl; rewrite H7; simpl.
+  UseGetSound. cbn. rewrite <- H0. destruct v; auto. cbn; rewrite H7; cbn.
   rewrite Ptrofs.add_assoc. auto.
 Qed.
 
@@ -144,33 +144,33 @@ Theorem combine_op_sound:
 Proof.
   intros. functional inversion H; subst.
   (* addimm - addimm *)
-  - UseGetSound. FuncInv. simpl.
+  - UseGetSound. FuncInv. cbn.
     rewrite <- H0. rewrite Val.add_assoc. auto.
   (* andimm - andimm *)
-  - UseGetSound; simpl.
+  - UseGetSound; cbn.
     generalize (Int.eq_spec p m0); rewrite H7; intros.
-    rewrite <- H0. rewrite Val.and_assoc. simpl. fold p. rewrite H1. auto.
-  - UseGetSound; simpl.
+    rewrite <- H0. rewrite Val.and_assoc. cbn. fold p. rewrite H1. auto.
+  - UseGetSound; cbn.
     rewrite <- H0. rewrite Val.and_assoc. auto.
   (* orimm - orimm *)
-  - UseGetSound. simpl. rewrite <- H0. rewrite Val.or_assoc. auto.
+  - UseGetSound. cbn. rewrite <- H0. rewrite Val.or_assoc. auto.
   (* xorimm - xorimm *)
-  - UseGetSound. simpl. rewrite <- H0. rewrite Val.xor_assoc. auto.
+  - UseGetSound. cbn. rewrite <- H0. rewrite Val.xor_assoc. auto.
   (* addlimm - addlimm *)
-  - UseGetSound. FuncInv. simpl.
+  - UseGetSound. FuncInv. cbn.
     rewrite <- H0. rewrite Val.addl_assoc. auto.
   (* andlimm - andlimm *)
-  - UseGetSound; simpl.
+  - UseGetSound; cbn.
     generalize (Int64.eq_spec p m0); rewrite H7; intros.
-    rewrite <- H0. rewrite Val.andl_assoc. simpl. fold p. rewrite H1. auto.
-  - UseGetSound; simpl.
+    rewrite <- H0. rewrite Val.andl_assoc. cbn. fold p. rewrite H1. auto.
+  - UseGetSound; cbn.
     rewrite <- H0. rewrite Val.andl_assoc. auto.
   (* orlimm - orlimm *)
-  - UseGetSound. simpl. rewrite <- H0. rewrite Val.orl_assoc. auto.
+  - UseGetSound. cbn. rewrite <- H0. rewrite Val.orl_assoc. auto.
   (* xorlimm - xorlimm *)
-  - UseGetSound. simpl. rewrite <- H0. rewrite Val.xorl_assoc. auto.
+  - UseGetSound. cbn. rewrite <- H0. rewrite Val.xorl_assoc. auto.
   (* cmp *)
-  - simpl. decEq; decEq. eapply combine_cond_sound; eauto.
+  - cbn. decEq; decEq. eapply combine_cond_sound; eauto.
 Qed.
 
 End COMBINE.
diff --git a/kvx/ConstpropOpproof.v b/kvx/ConstpropOpproof.v
index 05bbdde1..ffd35bcc 100644
--- a/kvx/ConstpropOpproof.v
+++ b/kvx/ConstpropOpproof.v
@@ -105,7 +105,7 @@ Proof.
   + (* global *)
     inv H2. exists (Genv.symbol_address ge id ofs); auto.
   + (* stack *)
-    inv H2. exists (Vptr sp ofs); split; auto. simpl. rewrite Ptrofs.add_zero_l; auto.
+    inv H2. exists (Vptr sp ofs); split; auto. cbn. rewrite Ptrofs.add_zero_l; auto.
 Qed.
 
 Lemma cond_strength_reduction_correct:
@@ -115,7 +115,7 @@ Lemma cond_strength_reduction_correct:
   eval_condition cond' e##args' m = eval_condition cond e##args m.
 Proof.
   intros until vl. unfold cond_strength_reduction.
-  case (cond_strength_reduction_match cond args vl); simpl; intros; InvApproxRegs; SimplVM.
+  case (cond_strength_reduction_match cond args vl); cbn; intros; InvApproxRegs; SimplVM.
 - apply Val.swap_cmp_bool.
 - auto.
 - apply Val.swap_cmpu_bool.
@@ -137,7 +137,7 @@ Proof.
   intros. unfold make_cmp_base.
   generalize (cond_strength_reduction_correct c args vl H).
   destruct (cond_strength_reduction c args vl) as [c' args']. intros EQ.
-  econstructor; split. simpl; eauto. rewrite EQ. auto.
+  econstructor; split. cbn; eauto. rewrite EQ. auto.
 Qed.
 
 Lemma make_cmp_correct:
@@ -154,43 +154,43 @@ Proof.
   unfold make_cmp. case (make_cmp_match c args vl); intros.
 - unfold make_cmp_imm_eq.
   destruct (Int.eq_dec n Int.one && vincl v1 (Uns Ptop 1)) eqn:E1.
-+ simpl in H; inv H. InvBooleans. subst n.
-  exists (e#r1); split; auto. simpl.
-  exploit Y; eauto. intros [A | [A | A]]; rewrite A; simpl; auto.
++ cbn in H; inv H. InvBooleans. subst n.
+  exists (e#r1); split; auto. cbn.
+  exploit Y; eauto. intros [A | [A | A]]; rewrite A; cbn; auto.
 + destruct (Int.eq_dec n Int.zero && vincl v1 (Uns Ptop 1)) eqn:E0.
-* simpl in H; inv H. InvBooleans. subst n.
-  exists (Val.xor e#r1 (Vint Int.one)); split; auto. simpl.
-  exploit Y; eauto. intros [A | [A | A]]; rewrite A; simpl; auto.
+* cbn in H; inv H. InvBooleans. subst n.
+  exists (Val.xor e#r1 (Vint Int.one)); split; auto. cbn.
+  exploit Y; eauto. intros [A | [A | A]]; rewrite A; cbn; auto.
 * apply make_cmp_base_correct; auto.
 - unfold make_cmp_imm_ne.
   destruct (Int.eq_dec n Int.zero && vincl v1 (Uns Ptop 1)) eqn:E0.
-+ simpl in H; inv H. InvBooleans. subst n.
-  exists (e#r1); split; auto. simpl.
-  exploit Y; eauto. intros [A | [A | A]]; rewrite A; simpl; auto.
++ cbn in H; inv H. InvBooleans. subst n.
+  exists (e#r1); split; auto. cbn.
+  exploit Y; eauto. intros [A | [A | A]]; rewrite A; cbn; auto.
 + destruct (Int.eq_dec n Int.one && vincl v1 (Uns Ptop 1)) eqn:E1.
-* simpl in H; inv H. InvBooleans. subst n.
-  exists (Val.xor e#r1 (Vint Int.one)); split; auto. simpl.
-  exploit Y; eauto. intros [A | [A | A]]; rewrite A; simpl; auto.
+* cbn in H; inv H. InvBooleans. subst n.
+  exists (Val.xor e#r1 (Vint Int.one)); split; auto. cbn.
+  exploit Y; eauto. intros [A | [A | A]]; rewrite A; cbn; auto.
 * apply make_cmp_base_correct; auto.
 - unfold make_cmp_imm_eq.
   destruct (Int.eq_dec n Int.one && vincl v1 (Uns Ptop 1)) eqn:E1.
-+ simpl in H; inv H. InvBooleans. subst n.
-  exists (e#r1); split; auto. simpl.
-  exploit Y; eauto. intros [A | [A | A]]; rewrite A; simpl; auto.
++ cbn in H; inv H. InvBooleans. subst n.
+  exists (e#r1); split; auto. cbn.
+  exploit Y; eauto. intros [A | [A | A]]; rewrite A; cbn; auto.
 + destruct (Int.eq_dec n Int.zero && vincl v1 (Uns Ptop 1)) eqn:E0.
-* simpl in H; inv H. InvBooleans. subst n.
-  exists (Val.xor e#r1 (Vint Int.one)); split; auto. simpl.
-  exploit Y; eauto. intros [A | [A | A]]; rewrite A; simpl; auto.
+* cbn in H; inv H. InvBooleans. subst n.
+  exists (Val.xor e#r1 (Vint Int.one)); split; auto. cbn.
+  exploit Y; eauto. intros [A | [A | A]]; rewrite A; cbn; auto.
 * apply make_cmp_base_correct; auto.
 - unfold make_cmp_imm_ne.
   destruct (Int.eq_dec n Int.zero && vincl v1 (Uns Ptop 1)) eqn:E0.
-+ simpl in H; inv H. InvBooleans. subst n.
-  exists (e#r1); split; auto. simpl.
-  exploit Y; eauto. intros [A | [A | A]]; rewrite A; simpl; auto.
++ cbn in H; inv H. InvBooleans. subst n.
+  exists (e#r1); split; auto. cbn.
+  exploit Y; eauto. intros [A | [A | A]]; rewrite A; cbn; auto.
 + destruct (Int.eq_dec n Int.one && vincl v1 (Uns Ptop 1)) eqn:E1.
-* simpl in H; inv H. InvBooleans. subst n.
-  exists (Val.xor e#r1 (Vint Int.one)); split; auto. simpl.
-  exploit Y; eauto. intros [A | [A | A]]; rewrite A; simpl; auto.
+* cbn in H; inv H. InvBooleans. subst n.
+  exists (Val.xor e#r1 (Vint Int.one)); split; auto. cbn.
+  exploit Y; eauto. intros [A | [A | A]]; rewrite A; cbn; auto.
 * apply make_cmp_base_correct; auto.
 - apply make_cmp_base_correct; auto.
 Qed.
@@ -203,7 +203,7 @@ Proof.
   intros. unfold make_addimm.
   predSpec Int.eq Int.eq_spec n Int.zero; intros.
   subst. exists (e#r); split; auto.
-  destruct (e#r); simpl; auto; rewrite ?Int.add_zero, ?Ptrofs.add_zero; auto.
+  destruct (e#r); cbn; auto; rewrite ?Int.add_zero, ?Ptrofs.add_zero; auto.
   exists (Val.add e#r (Vint n)); split; auto.
 Qed.
 
@@ -215,10 +215,10 @@ Lemma make_shlimm_correct:
 Proof.
   intros; unfold make_shlimm.
   predSpec Int.eq Int.eq_spec n Int.zero; intros. subst.
-  exists (e#r1); split; auto. destruct (e#r1); simpl; auto. rewrite Int.shl_zero. auto.
+  exists (e#r1); split; auto. destruct (e#r1); cbn; auto. rewrite Int.shl_zero. auto.
   destruct (Int.ltu n Int.iwordsize).
-  econstructor; split. simpl. eauto. auto.
-  econstructor; split. simpl. eauto. rewrite H; auto.
+  econstructor; split. cbn. eauto. auto.
+  econstructor; split. cbn. eauto. rewrite H; auto.
 Qed.
 
 Lemma make_shrimm_correct:
@@ -229,10 +229,10 @@ Lemma make_shrimm_correct:
 Proof.
   intros; unfold make_shrimm.
   predSpec Int.eq Int.eq_spec n Int.zero; intros. subst.
-  exists (e#r1); split; auto. destruct (e#r1); simpl; auto. rewrite Int.shr_zero. auto.
+  exists (e#r1); split; auto. destruct (e#r1); cbn; auto. rewrite Int.shr_zero. auto.
   destruct (Int.ltu n Int.iwordsize).
-  econstructor; split. simpl. eauto. auto.
-  econstructor; split. simpl. eauto. rewrite H; auto.
+  econstructor; split. cbn. eauto. auto.
+  econstructor; split. cbn. eauto. rewrite H; auto.
 Qed.
 
 Lemma make_shruimm_correct:
@@ -243,10 +243,10 @@ Lemma make_shruimm_correct:
 Proof.
   intros; unfold make_shruimm.
   predSpec Int.eq Int.eq_spec n Int.zero; intros. subst.
-  exists (e#r1); split; auto. destruct (e#r1); simpl; auto. rewrite Int.shru_zero. auto.
+  exists (e#r1); split; auto. destruct (e#r1); cbn; auto. rewrite Int.shru_zero. auto.
   destruct (Int.ltu n Int.iwordsize).
-  econstructor; split. simpl. eauto. auto.
-  econstructor; split. simpl. eauto. rewrite H; auto.
+  econstructor; split. cbn. eauto. auto.
+  econstructor; split. cbn. eauto. rewrite H; auto.
 Qed.
 
 Lemma make_mulimm_correct:
@@ -257,12 +257,12 @@ Lemma make_mulimm_correct:
 Proof.
   intros; unfold make_mulimm.
   predSpec Int.eq Int.eq_spec n Int.zero; intros. subst.
-  exists (Vint Int.zero); split; auto. destruct (e#r1); simpl; auto. rewrite Int.mul_zero; auto.
+  exists (Vint Int.zero); split; auto. destruct (e#r1); cbn; auto. rewrite Int.mul_zero; auto.
   predSpec Int.eq Int.eq_spec n Int.one; intros. subst.
-  exists (e#r1); split; auto. destruct (e#r1); simpl; auto. rewrite Int.mul_one; auto.
+  exists (e#r1); split; auto. destruct (e#r1); cbn; auto. rewrite Int.mul_one; auto.
   destruct (Int.is_power2 n) eqn:?; intros.
-  rewrite (Val.mul_pow2 e#r1 _ _ Heqo). econstructor; split. simpl; eauto. auto.
-  econstructor; split; eauto. simpl. rewrite H; auto.
+  rewrite (Val.mul_pow2 e#r1 _ _ Heqo). econstructor; split. cbn; eauto. auto.
+  econstructor; split; eauto. cbn. rewrite H; auto.
 Qed.
 
 Lemma make_divimm_correct:
@@ -275,11 +275,11 @@ Proof.
   intros; unfold make_divimm.
   predSpec Int.eq Int.eq_spec n Int.one; intros. subst. rewrite H0 in H.
   destruct (e#r1) eqn:?;
-    try (rewrite Val.divs_one in H; exists (Vint i); split; simpl; try rewrite Heqv0; auto);
+    try (rewrite Val.divs_one in H; exists (Vint i); split; cbn; try rewrite Heqv0; auto);
     inv H; auto.
   destruct (Int.is_power2 n) eqn:?.
   destruct (Int.ltu i (Int.repr 31)) eqn:?.
-  exists v; split; auto. simpl.
+  exists v; split; auto. cbn.
   erewrite Val.divs_pow2; eauto. reflexivity. congruence.
   exists v; auto.
   exists v; auto.
@@ -295,10 +295,10 @@ Proof.
   intros; unfold make_divuimm.
   predSpec Int.eq Int.eq_spec n Int.one; intros. subst. rewrite H0 in H.
   destruct (e#r1) eqn:?;
-    try (rewrite Val.divu_one in H; exists (Vint i); split; simpl; try rewrite Heqv0; auto);
+    try (rewrite Val.divu_one in H; exists (Vint i); split; cbn; try rewrite Heqv0; auto);
     inv H; auto.
   destruct (Int.is_power2 n) eqn:?.
-  econstructor; split. simpl; eauto.
+  econstructor; split. cbn; eauto.
   rewrite H0 in H. erewrite Val.divu_pow2 by eauto. auto.
   exists v; auto.
 Qed.
@@ -312,7 +312,7 @@ Lemma make_moduimm_correct:
 Proof.
   intros; unfold make_moduimm.
   destruct (Int.is_power2 n) eqn:?.
-  exists v; split; auto. simpl. decEq. eapply Val.modu_pow2; eauto. congruence.
+  exists v; split; auto. cbn. decEq. eapply Val.modu_pow2; eauto. congruence.
   exists v; auto.
 Qed.
 
@@ -324,18 +324,18 @@ Lemma make_andimm_correct:
 Proof.
   intros; unfold make_andimm.
   predSpec Int.eq Int.eq_spec n Int.zero; intros.
-  subst n. exists (Vint Int.zero); split; auto. destruct (e#r); simpl; auto. rewrite Int.and_zero; auto.
+  subst n. exists (Vint Int.zero); split; auto. destruct (e#r); cbn; auto. rewrite Int.and_zero; auto.
   predSpec Int.eq Int.eq_spec n Int.mone; intros.
-  subst n. exists (e#r); split; auto. destruct (e#r); simpl; auto. rewrite Int.and_mone; auto.
+  subst n. exists (e#r); split; auto. destruct (e#r); cbn; auto. rewrite Int.and_mone; auto.
   destruct (match x with Uns _ k => Int.eq (Int.zero_ext k (Int.not n)) Int.zero
                        | _ => false end) eqn:UNS.
   destruct x; try congruence.
   exists (e#r); split; auto.
-  inv H; auto. simpl. replace (Int.and i n) with i; auto.
+  inv H; auto. cbn. replace (Int.and i n) with i; auto.
   generalize (Int.eq_spec (Int.zero_ext n0 (Int.not n)) Int.zero); rewrite UNS; intro EQ.
   Int.bit_solve. destruct (zlt i0 n0).
   replace (Int.testbit n i0) with (negb (Int.testbit Int.zero i0)).
-  rewrite Int.bits_zero. simpl. rewrite andb_true_r. auto.
+  rewrite Int.bits_zero. cbn. rewrite andb_true_r. auto.
   rewrite <- EQ. rewrite Int.bits_zero_ext by omega. rewrite zlt_true by auto.
   rewrite Int.bits_not by auto. apply negb_involutive.
   rewrite H6 by auto. auto.
@@ -349,9 +349,9 @@ Lemma make_orimm_correct:
 Proof.
   intros; unfold make_orimm.
   predSpec Int.eq Int.eq_spec n Int.zero; intros.
-  subst n. exists (e#r); split; auto. destruct (e#r); simpl; auto. rewrite Int.or_zero; auto.
+  subst n. exists (e#r); split; auto. destruct (e#r); cbn; auto. rewrite Int.or_zero; auto.
   predSpec Int.eq Int.eq_spec n Int.mone; intros.
-  subst n. exists (Vint Int.mone); split; auto. destruct (e#r); simpl; auto. rewrite Int.or_mone; auto.
+  subst n. exists (Vint Int.mone); split; auto. destruct (e#r); cbn; auto. rewrite Int.or_mone; auto.
   econstructor; split; eauto. auto.
 Qed.
 
@@ -362,7 +362,7 @@ Lemma make_xorimm_correct:
 Proof.
   intros; unfold make_xorimm.
   predSpec Int.eq Int.eq_spec n Int.zero; intros.
-  subst n. exists (e#r); split; auto. destruct (e#r); simpl; auto. rewrite Int.xor_zero; auto.
+  subst n. exists (e#r); split; auto. destruct (e#r); cbn; auto. rewrite Int.xor_zero; auto.
   predSpec Int.eq Int.eq_spec n Int.mone; intros.
   subst n. exists (Val.notint e#r); split; auto.
   econstructor; split; eauto. auto.
@@ -376,7 +376,7 @@ Proof.
   intros. unfold make_addlimm.
   predSpec Int64.eq Int64.eq_spec n Int64.zero; intros.
   subst. exists (e#r); split; auto.
-  destruct (e#r); simpl; auto; rewrite ? Int64.add_zero, ? Ptrofs.add_zero; auto.
+  destruct (e#r); cbn; auto; rewrite ? Int64.add_zero, ? Ptrofs.add_zero; auto.
   exists (Val.addl e#r (Vlong n)); split; auto.
 Qed.
 
@@ -388,11 +388,11 @@ Lemma make_shllimm_correct:
 Proof.
   intros; unfold make_shllimm.
   predSpec Int.eq Int.eq_spec n Int.zero; intros. subst.
-  exists (e#r1); split; auto. destruct (e#r1); simpl; auto.
+  exists (e#r1); split; auto. destruct (e#r1); cbn; auto.
   unfold Int64.shl'. rewrite Z.shiftl_0_r, Int64.repr_unsigned. auto.
   destruct (Int.ltu n Int64.iwordsize').
-  econstructor; split. simpl. eauto. auto.
-  econstructor; split. simpl. eauto. rewrite H; auto.
+  econstructor; split. cbn. eauto. auto.
+  econstructor; split. cbn. eauto. rewrite H; auto.
 Qed.
 
 Lemma make_shrlimm_correct:
@@ -403,11 +403,11 @@ Lemma make_shrlimm_correct:
 Proof.
   intros; unfold make_shrlimm.
   predSpec Int.eq Int.eq_spec n Int.zero; intros. subst.
-  exists (e#r1); split; auto. destruct (e#r1); simpl; auto.
+  exists (e#r1); split; auto. destruct (e#r1); cbn; auto.
   unfold Int64.shr'. rewrite Z.shiftr_0_r, Int64.repr_signed. auto.
   destruct (Int.ltu n Int64.iwordsize').
-  econstructor; split. simpl. eauto. auto.
-  econstructor; split. simpl. eauto. rewrite H; auto.
+  econstructor; split. cbn. eauto. auto.
+  econstructor; split. cbn. eauto. rewrite H; auto.
 Qed.
 
 Lemma make_shrluimm_correct:
@@ -418,11 +418,11 @@ Lemma make_shrluimm_correct:
 Proof.
   intros; unfold make_shrluimm.
   predSpec Int.eq Int.eq_spec n Int.zero; intros. subst.
-  exists (e#r1); split; auto. destruct (e#r1); simpl; auto.
+  exists (e#r1); split; auto. destruct (e#r1); cbn; auto.
   unfold Int64.shru'. rewrite Z.shiftr_0_r, Int64.repr_unsigned. auto.
   destruct (Int.ltu n Int64.iwordsize').
-  econstructor; split. simpl. eauto. auto.
-  econstructor; split. simpl. eauto. rewrite H; auto.
+  econstructor; split. cbn. eauto. auto.
+  econstructor; split. cbn. eauto. rewrite H; auto.
 Qed.
 
 Lemma make_mullimm_correct:
@@ -433,15 +433,15 @@ Lemma make_mullimm_correct:
 Proof.
   intros; unfold make_mullimm.
   predSpec Int64.eq Int64.eq_spec n Int64.zero; intros. subst.
-  exists (Vlong Int64.zero); split; auto. destruct (e#r1); simpl; auto. rewrite Int64.mul_zero; auto.
+  exists (Vlong Int64.zero); split; auto. destruct (e#r1); cbn; auto. rewrite Int64.mul_zero; auto.
   predSpec Int64.eq Int64.eq_spec n Int64.one; intros. subst.
-  exists (e#r1); split; auto. destruct (e#r1); simpl; auto. rewrite Int64.mul_one; auto.
+  exists (e#r1); split; auto. destruct (e#r1); cbn; auto. rewrite Int64.mul_one; auto.
   destruct (Int64.is_power2' n) eqn:?; intros.
   exists (Val.shll e#r1 (Vint i)); split; auto.
-  destruct (e#r1); simpl; auto.
+  destruct (e#r1); cbn; auto.
   erewrite Int64.is_power2'_range by eauto.
   erewrite Int64.mul_pow2' by eauto. auto.
-  econstructor; split; eauto. simpl; rewrite H; auto.
+  econstructor; split; eauto. cbn; rewrite H; auto.
 Qed.
 
 Lemma make_divlimm_correct:
@@ -453,7 +453,7 @@ Lemma make_divlimm_correct:
 Proof.
   intros; unfold make_divlimm.
   destruct (Int64.is_power2' n) eqn:?. destruct (Int.ltu i (Int.repr 63)) eqn:?.
-  rewrite H0 in H. econstructor; split. simpl; eauto.
+  rewrite H0 in H. econstructor; split. cbn; eauto.
   erewrite Val.divls_pow2; eauto. auto.
   exists v; auto.
   exists v; auto.
@@ -468,9 +468,9 @@ Lemma make_divluimm_correct:
 Proof.
   intros; unfold make_divluimm.
   destruct (Int64.is_power2' n) eqn:?.
-  econstructor; split. simpl; eauto.
+  econstructor; split. cbn; eauto.
   rewrite H0 in H. destruct (e#r1); inv H. destruct (Int64.eq n Int64.zero); inv H2.
-  simpl.
+  cbn.
   erewrite Int64.is_power2'_range by eauto.    
   erewrite Int64.divu_pow2' by eauto.  auto. 
   exists v; auto.
@@ -485,9 +485,9 @@ Lemma make_modluimm_correct:
 Proof.
   intros; unfold make_modluimm.
   destruct (Int64.is_power2 n) eqn:?.
-  exists v; split; auto. simpl. decEq.
+  exists v; split; auto. cbn. decEq.
   rewrite H0 in H. destruct (e#r1); inv H. destruct (Int64.eq n Int64.zero); inv H2. 
-  simpl. erewrite Int64.modu_and by eauto. auto.
+  cbn. erewrite Int64.modu_and by eauto. auto.
   exists v; auto.
 Qed.
 
@@ -498,9 +498,9 @@ Lemma make_andlimm_correct:
 Proof.
   intros; unfold make_andlimm.
   predSpec Int64.eq Int64.eq_spec n Int64.zero; intros.
-  subst n. exists (Vlong Int64.zero); split; auto. destruct (e#r); simpl; auto. rewrite Int64.and_zero; auto.
+  subst n. exists (Vlong Int64.zero); split; auto. destruct (e#r); cbn; auto. rewrite Int64.and_zero; auto.
   predSpec Int64.eq Int64.eq_spec n Int64.mone; intros.
-  subst n. exists (e#r); split; auto. destruct (e#r); simpl; auto. rewrite Int64.and_mone; auto.
+  subst n. exists (e#r); split; auto. destruct (e#r); cbn; auto. rewrite Int64.and_mone; auto.
   econstructor; split; eauto. auto.
 Qed.
 
@@ -511,9 +511,9 @@ Lemma make_orlimm_correct:
 Proof.
   intros; unfold make_orlimm.
   predSpec Int64.eq Int64.eq_spec n Int64.zero; intros.
-  subst n. exists (e#r); split; auto. destruct (e#r); simpl; auto. rewrite Int64.or_zero; auto.
+  subst n. exists (e#r); split; auto. destruct (e#r); cbn; auto. rewrite Int64.or_zero; auto.
   predSpec Int64.eq Int64.eq_spec n Int64.mone; intros.
-  subst n. exists (Vlong Int64.mone); split; auto. destruct (e#r); simpl; auto. rewrite Int64.or_mone; auto.
+  subst n. exists (Vlong Int64.mone); split; auto. destruct (e#r); cbn; auto. rewrite Int64.or_mone; auto.
   econstructor; split; eauto. auto.
 Qed.
 
@@ -524,7 +524,7 @@ Lemma make_xorlimm_correct:
 Proof.
   intros; unfold make_xorlimm.
   predSpec Int64.eq Int64.eq_spec n Int64.zero; intros.
-  subst n. exists (e#r); split; auto. destruct (e#r); simpl; auto. rewrite Int64.xor_zero; auto.
+  subst n. exists (e#r); split; auto. destruct (e#r); cbn; auto. rewrite Int64.xor_zero; auto.
   predSpec Int64.eq Int64.eq_spec n Int64.mone; intros.
   subst n. exists (Val.notl e#r); split; auto.
   econstructor; split; eauto. auto.
@@ -538,9 +538,9 @@ Lemma make_mulfimm_correct:
 Proof.
   intros; unfold make_mulfimm.
   destruct (Float.eq_dec n (Float.of_int (Int.repr 2))); intros.
-  simpl. econstructor; split. eauto. rewrite H; subst n.
-  destruct (e#r1); simpl; auto. rewrite Float.mul2_add; auto.
-  simpl. econstructor; split; eauto.
+  cbn. econstructor; split. eauto. rewrite H; subst n.
+  destruct (e#r1); cbn; auto. rewrite Float.mul2_add; auto.
+  cbn. econstructor; split; eauto.
 Qed.
 
 Lemma make_mulfimm_correct_2:
@@ -551,10 +551,10 @@ Lemma make_mulfimm_correct_2:
 Proof.
   intros; unfold make_mulfimm.
   destruct (Float.eq_dec n (Float.of_int (Int.repr 2))); intros.
-  simpl. econstructor; split. eauto. rewrite H; subst n.
-  destruct (e#r2); simpl; auto. rewrite Float.mul2_add; auto.
+  cbn. econstructor; split. eauto. rewrite H; subst n.
+  destruct (e#r2); cbn; auto. rewrite Float.mul2_add; auto.
   rewrite Float.mul_commut; auto.
-  simpl. econstructor; split; eauto.
+  cbn. econstructor; split; eauto.
 Qed.
 
 Lemma make_mulfsimm_correct:
@@ -565,9 +565,9 @@ Lemma make_mulfsimm_correct:
 Proof.
   intros; unfold make_mulfsimm.
   destruct (Float32.eq_dec n (Float32.of_int (Int.repr 2))); intros.
-  simpl. econstructor; split. eauto. rewrite H; subst n.
-  destruct (e#r1); simpl; auto. rewrite Float32.mul2_add; auto.
-  simpl. econstructor; split; eauto.
+  cbn. econstructor; split. eauto. rewrite H; subst n.
+  destruct (e#r1); cbn; auto. rewrite Float32.mul2_add; auto.
+  cbn. econstructor; split; eauto.
 Qed.
 
 Lemma make_mulfsimm_correct_2:
@@ -578,10 +578,10 @@ Lemma make_mulfsimm_correct_2:
 Proof.
   intros; unfold make_mulfsimm.
   destruct (Float32.eq_dec n (Float32.of_int (Int.repr 2))); intros.
-  simpl. econstructor; split. eauto. rewrite H; subst n.
-  destruct (e#r2); simpl; auto. rewrite Float32.mul2_add; auto.
+  cbn. econstructor; split. eauto. rewrite H; subst n.
+  destruct (e#r2); cbn; auto. rewrite Float32.mul2_add; auto.
   rewrite Float32.mul_commut; auto.
-  simpl. econstructor; split; eauto.
+  cbn. econstructor; split; eauto.
 Qed.
 
 Lemma make_cast8signed_correct:
@@ -594,8 +594,8 @@ Proof.
   exists e#r; split; auto.
   assert (V: vmatch bc e#r (Sgn Ptop 8)).
   { eapply vmatch_ge; eauto. apply vincl_ge; auto. }
-  inv V; simpl; auto. rewrite is_sgn_sign_ext in H4 by auto. rewrite H4; auto.
-  econstructor; split; simpl; eauto.
+  inv V; cbn; auto. rewrite is_sgn_sign_ext in H4 by auto. rewrite H4; auto.
+  econstructor; split; cbn; eauto.
 Qed.
 
 Lemma make_cast16signed_correct:
@@ -608,8 +608,8 @@ Proof.
   exists e#r; split; auto.
   assert (V: vmatch bc e#r (Sgn Ptop 16)).
   { eapply vmatch_ge; eauto. apply vincl_ge; auto. }
-  inv V; simpl; auto. rewrite is_sgn_sign_ext in H4 by auto. rewrite H4; auto.
-  econstructor; split; simpl; eauto.
+  inv V; cbn; auto. rewrite is_sgn_sign_ext in H4 by auto. rewrite H4; auto.
+  econstructor; split; cbn; eauto.
 Qed.
 
 Lemma op_strength_reduction_correct:
@@ -620,7 +620,7 @@ Lemma op_strength_reduction_correct:
   exists w, eval_operation ge (Vptr sp Ptrofs.zero) op' e##args' m = Some w /\ Val.lessdef v w.
 Proof.
   intros until v; unfold op_strength_reduction;
-  case (op_strength_reduction_match op args vl); simpl; intros.
+  case (op_strength_reduction_match op args vl); cbn; intros.
 - (* cast8signed *)
   InvApproxRegs; SimplVM; inv H0. apply make_cast8signed_correct; auto.
 - (* cast16signed *)
@@ -733,15 +733,15 @@ Lemma addr_strength_reduction_correct:
   exists res', eval_addressing ge (Vptr sp Ptrofs.zero) addr' e##args' = Some res' /\ Val.lessdef res res'.
 Proof.
   intros until res. unfold addr_strength_reduction.
-  destruct (addr_strength_reduction_match addr args vl); simpl;
+  destruct (addr_strength_reduction_match addr args vl); cbn;
   intros VL EA; InvApproxRegs; SimplVM; try (inv EA).
 - destruct (orb _ _).
 + exists (Val.offset_ptr e#r1 n); auto.
-+ simpl. rewrite Genv.shift_symbol_address. econstructor; split; eauto. 
-  inv H0; simpl; auto.
++ cbn. rewrite Genv.shift_symbol_address. econstructor; split; eauto. 
+  inv H0; cbn; auto.
 - rewrite Ptrofs.add_zero_l. econstructor; split; eauto.
   change (Vptr sp (Ptrofs.add n1 n)) with (Val.offset_ptr (Vptr sp n1) n). 
-  inv H0; simpl; auto.
+  inv H0; cbn; auto.
 - exists res; auto.
 Qed.
 
diff --git a/kvx/Conventions1.v b/kvx/Conventions1.v
index ab30ded9..0b2cf406 100644
--- a/kvx/Conventions1.v
+++ b/kvx/Conventions1.v
@@ -108,7 +108,7 @@ Lemma loc_result_type:
   subtype (proj_sig_res sig) (typ_rpair mreg_type (loc_result sig)) = true.
 Proof.
   intros. unfold proj_sig_res, loc_result, mreg_type.
-  destruct (sig_res sig); try destruct Archi.ptr64; simpl; trivial; destruct t; trivial.
+  destruct (sig_res sig); try destruct Archi.ptr64; cbn; trivial; destruct t; trivial.
 Qed.
 
 (** The result locations are caller-save registers *)
@@ -118,7 +118,7 @@ Lemma loc_result_caller_save:
   forall_rpair (fun r => is_callee_save r = false) (loc_result s).
 Proof.
   intros. unfold loc_result, is_callee_save;
-            destruct (sig_res s); simpl; auto; try destruct Archi.ptr64; simpl; auto; try destruct t; simpl; auto.
+            destruct (sig_res s); cbn; auto; try destruct Archi.ptr64; cbn; auto; try destruct t; cbn; auto.
 Qed.
 
 (** If the result is in a pair of registers, those registers are distinct and have type [Tint] at least. *)
@@ -296,9 +296,9 @@ Proof.
              OKREGS regs -> OKF f -> ofs >= 0 -> OK (one_arg regs rn ofs ty f)).
   { intros until f; intros OR OF OO; red; unfold one_arg; intros.
     destruct (list_nth_z regs rn) as [r|] eqn:NTH; destruct H.
-  - subst p; simpl. apply OR. eapply list_nth_z_in; eauto. 
+  - subst p; cbn. apply OR. eapply list_nth_z_in; eauto. 
   - eapply OF; eauto. 
-  - subst p; simpl. auto using align_divides, typealign_pos.
+  - subst p; cbn. auto using align_divides, typealign_pos.
   - eapply OF; [idtac|eauto].
     generalize (AL ofs ty OO) (SKK ty); omega.
   }
@@ -310,16 +310,16 @@ Proof.
     assert (OO': ofs' >= 0) by (apply (AL ofs Tlong); auto).
     assert (DFL: OK (Twolong (S Outgoing (ofs' + 1) Tint) (S Outgoing ofs' Tint)
                      :: f rn' (ofs' + 2))).
-    { red; simpl; intros. destruct H.
-    - subst p; simpl. 
+    { red; cbn; intros. destruct H.
+    - subst p; cbn. 
       repeat split; auto using Z.divide_1_l. omega.
     - eapply OF; [idtac|eauto]. omega.
     }
     destruct (list_nth_z regs rn') as [r1|] eqn:NTH1;
     destruct (list_nth_z regs (rn' + 1)) as [r2|] eqn:NTH2;
     try apply DFL.
-    red; simpl; intros; destruct H.
-  - subst p; simpl. split; apply OR; eauto using list_nth_z_in.  
+    red; cbn; intros; destruct H.
+  - subst p; cbn. split; apply OR; eauto using list_nth_z_in.  
   - eapply OF; [idtac|eauto]. auto.
   }
   assert (C: forall regs rn ofs ty f,
@@ -327,10 +327,10 @@ Proof.
   { intros until f; intros OR OF OO OTY; unfold hybrid_arg; red; intros.
     set (rn' := align rn 2) in *.
     destruct (list_nth_z regs rn') as [r|] eqn:NTH; destruct H.
-  - subst p; simpl. apply OR. eapply list_nth_z_in; eauto. 
+  - subst p; cbn. apply OR. eapply list_nth_z_in; eauto. 
   - eapply OF; eauto. 
-  - subst p; simpl. rewrite OTY. split. apply (AL ofs Tlong OO). apply Z.divide_1_l. 
-  - eapply OF; [idtac|eauto]. generalize (AL ofs Tlong OO); simpl; omega.
+  - subst p; cbn. rewrite OTY. split. apply (AL ofs Tlong OO). apply Z.divide_1_l. 
+  - eapply OF; [idtac|eauto]. generalize (AL ofs Tlong OO); cbn; omega.
   }
   assert (D: OKREGS param_regs).
   { red. decide_goal. }
@@ -339,8 +339,8 @@ Proof.
 
   cut (forall va tyl rn ofs, ofs >= 0 -> OK (loc_arguments_rec va tyl rn ofs)).
   unfold OK. eauto.
-  induction tyl as [ | ty1 tyl]; intros until ofs; intros OO; simpl.
-  - red; simpl; tauto.
+  induction tyl as [ | ty1 tyl]; intros until ofs; intros OO; cbn.
+  - red; cbn; tauto.
   - destruct ty1.  
 + (* int *) apply A; auto.
 + (* float *) 
@@ -369,10 +369,10 @@ Remark fold_max_outgoing_above:
 Proof.
   assert (A: forall n l, max_outgoing_1 n l >= n).
   { intros; unfold max_outgoing_1. destruct l as [_ | []]; xomega. }
-  induction l; simpl; intros. 
+  induction l; cbn; intros. 
   - omega.
   - eapply Zge_trans. eauto.
-    destruct a; simpl. apply A. eapply Zge_trans; eauto.
+    destruct a; cbn. apply A. eapply Zge_trans; eauto.
 Qed.
 
 Lemma size_arguments_above:
@@ -392,14 +392,14 @@ Proof.
   assert (B: forall p n,
              In (S Outgoing ofs ty) (regs_of_rpair p) ->
              ofs + typesize ty <= max_outgoing_2 n p).
-  { intros. destruct p; simpl in H; intuition; subst; simpl.
+  { intros. destruct p; cbn in H; intuition; subst; cbn.
   - xomega.
   - eapply Z.le_trans. 2: apply A. xomega.
   - xomega. }
   assert (C: forall l n,
              In (S Outgoing ofs ty) (regs_of_rpairs l) ->
              ofs + typesize ty <= fold_left max_outgoing_2 l n).
-  { induction l; simpl; intros.
+  { induction l; cbn; intros.
   - contradiction.
   - rewrite in_app_iff in H. destruct H.
   + eapply Z.le_trans. eapply B; eauto. apply Z.ge_le. apply fold_max_outgoing_above.
diff --git a/kvx/ExtValues.v b/kvx/ExtValues.v
index 3664c00a..a0c10ddd 100644
--- a/kvx/ExtValues.v
+++ b/kvx/ExtValues.v
@@ -62,10 +62,10 @@ Lemma shift1_4_of_z_correct :
     end.
 Proof.
   intro. unfold shift1_4_of_z.
-  destruct (Z.eq_dec _ _); simpl; try congruence.
-  destruct (Z.eq_dec _ _); simpl; try congruence.
-  destruct (Z.eq_dec _ _); simpl; try congruence.
-  destruct (Z.eq_dec _ _); simpl; try congruence.
+  destruct (Z.eq_dec _ _); cbn; try congruence.
+  destruct (Z.eq_dec _ _); cbn; try congruence.
+  destruct (Z.eq_dec _ _); cbn; try congruence.
+  destruct (Z.eq_dec _ _); cbn; try congruence.
   trivial.
 Qed.
 
@@ -215,19 +215,19 @@ Theorem divu_is_divlu: forall v1 v2 : val,
     end.
 Proof.
   intros.
-  destruct v1; simpl; trivial.
-  destruct v2; simpl; trivial.
+  destruct v1; cbn; trivial.
+  destruct v2; cbn; trivial.
   destruct i as [i_val i_range].
   destruct i0 as [i0_val i0_range].
-  simpl.
+  cbn.
   unfold Int.eq, Int64.eq, Int.zero, Int64.zero.
-  simpl.
+  cbn.
   rewrite Int.unsigned_repr by (compute; split; discriminate).
   rewrite (Int64.unsigned_repr 0) by (compute; split; discriminate).
   rewrite (unsigned64_repr i0_val) by assumption.
-  destruct (zeq i0_val 0) as [ | Hnot0]; simpl; trivial.
+  destruct (zeq i0_val 0) as [ | Hnot0]; cbn; trivial.
   f_equal. f_equal.
-  unfold Int.divu, Int64.divu. simpl.
+  unfold Int.divu, Int64.divu. cbn.
   rewrite (unsigned64_repr i_val) by assumption.
   rewrite (unsigned64_repr i0_val) by assumption.
   unfold Int64.loword.
@@ -260,19 +260,19 @@ Theorem modu_is_modlu: forall v1 v2 : val,
     end.
 Proof.
   intros.
-  destruct v1; simpl; trivial.
-  destruct v2; simpl; trivial.
+  destruct v1; cbn; trivial.
+  destruct v2; cbn; trivial.
   destruct i as [i_val i_range].
   destruct i0 as [i0_val i0_range].
-  simpl.
+  cbn.
   unfold Int.eq, Int64.eq, Int.zero, Int64.zero.
-  simpl.
+  cbn.
   rewrite Int.unsigned_repr by (compute; split; discriminate).
   rewrite (Int64.unsigned_repr 0) by (compute; split; discriminate).
   rewrite (unsigned64_repr i0_val) by assumption.
-  destruct (zeq i0_val 0) as [ | Hnot0]; simpl; trivial.
+  destruct (zeq i0_val 0) as [ | Hnot0]; cbn; trivial.
   f_equal. f_equal.
-  unfold Int.modu, Int64.modu. simpl.
+  unfold Int.modu, Int64.modu. cbn.
   rewrite (unsigned64_repr i_val) by assumption.
   rewrite (unsigned64_repr i0_val) by assumption.
   unfold Int64.loword.
@@ -347,19 +347,19 @@ Theorem divs_is_divls: forall v1 v2 : val,
     end.
 Proof.
   intros.
-  destruct v1; simpl; trivial.
-  destruct v2; simpl; trivial.
+  destruct v1; cbn; trivial.
+  destruct v2; cbn; trivial.
   destruct i as [i_val i_range].
   destruct i0 as [i0_val i0_range].
-  simpl.
+  cbn.
   unfold Int.eq, Int64.eq, Int.zero, Int64.zero.
-  simpl.
+  cbn.
   replace (Int.unsigned (Int.repr 0)) with 0 in * by reflexivity.
-  destruct (zeq _ _) as [H0' | Hnot0]; simpl; trivial.
-  destruct (zeq i_val (Int.unsigned (Int.repr Int.min_signed))) as [Hmin | Hnotmin]; simpl.
+  destruct (zeq _ _) as [H0' | Hnot0]; cbn; trivial.
+  destruct (zeq i_val (Int.unsigned (Int.repr Int.min_signed))) as [Hmin | Hnotmin]; cbn.
   { subst.
     destruct (zeq i0_val (Int.unsigned Int.mone)) as [Hmone | Hnotmone]; trivial.
-    unfold Int.signed. simpl.
+    unfold Int.signed. cbn.
     replace (Int64.unsigned (Int64.repr 0)) with 0 in * by reflexivity.
     rewrite if_zlt_min_signed_half_modulus.
     replace (if
@@ -370,7 +370,7 @@ Proof.
              (Int64.unsigned (Int64.repr Int64.min_signed))
           then true
               else false) with false by reflexivity.
-    simpl.
+    cbn.
     rewrite orb_false_r.
     destruct (zlt i0_val Int.half_modulus) as [Hlt_half | Hge_half].
     {
@@ -380,7 +380,7 @@ Proof.
       unfold Val.loword.
       f_equal.
       unfold Int64.divs, Int.divs, Int64.loword.
-      unfold Int.signed, Int64.signed. simpl.      
+      unfold Int.signed, Int64.signed. cbn.      
       rewrite if_zlt_min_signed_half_modulus.
       change Int.half_modulus with 2147483648 in *.
       destruct (zlt _ _) as [discard|]; try omega. clear discard.
@@ -390,7 +390,7 @@ Proof.
       with 18446744071562067968.
       change Int64.half_modulus with 9223372036854775808.
       change Int64.modulus with 18446744073709551616.
-      simpl.
+      cbn.
       rewrite (Int64.unsigned_repr i0_val) by (change Int64.max_unsigned with 18446744073709551615; omega).
       destruct (zlt i0_val 9223372036854775808) as [discard |]; try omega.
       clear discard.
@@ -449,7 +449,7 @@ Lemma big_unsigned_signed:
 Proof.
   destruct x as [xval xrange].
   intro BIG.
-  unfold Int.signed, Int.unsigned in *. simpl in *.
+  unfold Int.signed, Int.unsigned in *. cbn in *.
   destruct (zlt _ _).
   omega.
   trivial.
@@ -499,10 +499,10 @@ Lemma divs_is_quot: forall v1 v2 : val,
     end.
 
 Proof.
-  destruct v1; destruct v2; simpl; trivial.
+  destruct v1; destruct v2; cbn; trivial.
   unfold Int.divs.
   rewrite signed_0_eqb.
-  destruct (Int.eq i0 Int.zero) eqn:Eeq0; simpl; trivial.
+  destruct (Int.eq i0 Int.zero) eqn:Eeq0; cbn; trivial.
   destruct (Int.eq i (Int.repr Int.min_signed) && Int.eq i0 Int.mone) eqn:EXCEPTION.
   { replace (Int.signed i0) with (-1).
     replace (Int.signed i) with Int.min_signed.
@@ -523,7 +523,7 @@ Proof.
     unfold Int.eq in EXCEPTION.
     destruct (zeq _ _) in EXCEPTION; try discriminate.
     destruct (zeq _ _) as [Hmone | ]  in EXCEPTION; try discriminate.
-    destruct i0 as [i0val i0range]; unfold Int.signed in *; simpl in *.
+    destruct i0 as [i0val i0range]; unfold Int.signed in *; cbn in *.
     rewrite Hmone.
     reflexivity.
   }
@@ -651,7 +651,7 @@ Qed.
 Lemma sub_add_neg :
   forall x y, Val.sub x y = Val.add x (Val.neg y).
 Proof.
-  destruct x; destruct y; simpl; trivial.
+  destruct x; destruct y; cbn; trivial.
   f_equal.
   apply Int.sub_add_opp.
 Qed.
@@ -659,7 +659,7 @@ Qed.
 Lemma neg_mul_distr_r :
   forall x y, Val.neg (Val.mul x y) = Val.mul x (Val.neg y).
 Proof.
-  destruct x; destruct y; simpl; trivial.
+  destruct x; destruct y; cbn; trivial.
   f_equal.
   apply Int.neg_mul_distr_r.
 Qed.
@@ -668,7 +668,7 @@ Qed.
 Lemma sub_addl_negl :
   forall x y, Val.subl x y = Val.addl x (Val.negl y).
 Proof.
-  destruct x; destruct y; simpl; trivial.
+  destruct x; destruct y; cbn; trivial.
   + f_equal. apply Int64.sub_add_opp.
   + destruct (Archi.ptr64) eqn:ARCHI64; trivial.
     f_equal. rewrite Ptrofs.sub_add_opp.
@@ -681,15 +681,15 @@ Proof.
     rewrite Hagree2.
     reflexivity.
     exact (Ptrofs.agree64_of_int ARCHI64 i0).
-  +  destruct (Archi.ptr64) eqn:ARCHI64; simpl; trivial.
-     destruct (eq_block _ _); simpl; trivial.
+  +  destruct (Archi.ptr64) eqn:ARCHI64; cbn; trivial.
+     destruct (eq_block _ _); cbn; trivial.
 Qed.
  *)
 
 Lemma negl_mull_distr_r :
   forall x y, Val.negl (Val.mull x y) = Val.mull x (Val.negl y).
 Proof.
-  destruct x; destruct y; simpl; trivial.
+  destruct x; destruct y; cbn; trivial.
   f_equal.
   apply Int64.neg_mul_distr_r.
 Qed.
diff --git a/kvx/InstructionScheduler.ml b/kvx/InstructionScheduler.ml
index e4dc3f97..eab0b21a 100644
--- a/kvx/InstructionScheduler.ml
+++ b/kvx/InstructionScheduler.ml
@@ -12,6 +12,16 @@
 (*                                                             *)
 (* *************************************************************)
 
+let with_destructor dtor stuff f =
+  try let ret = f stuff in
+      dtor stuff;
+      ret
+  with exn -> dtor stuff;
+              raise exn;;
+
+let with_out_channel chan f = with_destructor close_out chan f;;
+let with_in_channel chan f = with_destructor close_in chan f;;
+
 (** Schedule instructions on a synchronized pipeline
 @author David Monniaux, CNRS, VERIMAG *)
 
@@ -844,16 +854,15 @@ let pseudo_boolean_solver = ref "pb_solver"
 
 let pseudo_boolean_scheduler pb_type problem =
   try
-    let filename_in = "problem.opb"
-    (* needed only if not using stdout and filename_out = "problem.sol" *) in
-    let opb_problem = open_out filename_in in
-    let mapper = pseudo_boolean_print_problem opb_problem problem pb_type in
-    close_out opb_problem;
-    
-    let opb_solution = Unix.open_process_in (!pseudo_boolean_solver ^ " " ^ filename_in) in
-    let ret = adjust_check_solution mapper (pseudo_boolean_read_solution mapper opb_solution) in
-    close_in opb_solution;
-    Some ret
+    let filename_in = "problem.opb" in
+    (* needed only if not using stdout and filename_out = "problem.sol" *)
+    let mapper =
+      with_out_channel (open_out filename_in)
+        (fun opb_problem ->
+          pseudo_boolean_print_problem opb_problem problem pb_type) in
+    Some (with_in_channel
+      (Unix.open_process_in (!pseudo_boolean_solver ^ " " ^ filename_in))
+      (fun opb_solution -> adjust_check_solution mapper (pseudo_boolean_read_solution mapper opb_solution)))
   with
   | Unschedulable -> None;;
 				 
@@ -1193,23 +1202,23 @@ let ilp_read_solution mapper channel =
 let ilp_solver = ref "ilp_solver"
 
 let problem_nr = ref 0
-                             
+
 let ilp_scheduler pb_type problem =
   try
     let filename_in = Printf.sprintf  "problem%05d.lp" !problem_nr
     and filename_out = Printf.sprintf "problem%05d.sol" !problem_nr in
     incr problem_nr;
-    let opb_problem = open_out filename_in in
-    let mapper = ilp_print_problem opb_problem problem pb_type in
-    close_out opb_problem;
+    let mapper = with_out_channel (open_out filename_in)
+      (fun opb_problem -> ilp_print_problem opb_problem problem pb_type) in
 
     begin
       match Unix.system (!ilp_solver ^ " " ^ filename_in ^ " " ^ filename_out) with
       | Unix.WEXITED 0 ->
-         let opb_solution = open_in filename_out in
-         let ret = adjust_check_solution mapper (ilp_read_solution mapper opb_solution) in
-         close_in opb_solution;
-         Some ret
+         Some (with_in_channel
+                 (open_in filename_out)
+                 (fun opb_solution ->
+                   adjust_check_solution mapper
+                     (ilp_read_solution mapper opb_solution)))
       | Unix.WEXITED _ -> failwith "failed to start ilp solver"
       | _ -> None
     end
@@ -1245,3 +1254,10 @@ let cascaded_scheduler (problem : problem) =
      end;
      Some solution;;
 
+let scheduler_by_name name =
+  match name with
+  | "ilp" -> validated_scheduler cascaded_scheduler
+  | "list" -> validated_scheduler list_scheduler
+  | "revlist" -> validated_scheduler reverse_list_scheduler
+  | "greedy" -> greedy_scheduler
+  | s -> failwith ("unknown scheduler: " ^ s);;
diff --git a/kvx/InstructionScheduler.mli b/kvx/InstructionScheduler.mli
index f91c2d06..85e2a5c6 100644
--- a/kvx/InstructionScheduler.mli
+++ b/kvx/InstructionScheduler.mli
@@ -108,3 +108,6 @@ val smt_print_problem : out_channel -> problem -> unit;;
 val ilp_print_problem : out_channel -> problem -> pseudo_boolean_problem_type -> pseudo_boolean_mapper;;
 
 val ilp_scheduler : pseudo_boolean_problem_type -> problem -> solution option;;
+
+(** Schedule a problem using a scheduler given by a string name *)
+val scheduler_by_name : string -> problem -> int array option;;
diff --git a/kvx/NeedOp.v b/kvx/NeedOp.v
index 4c354d5a..f636336d 100644
--- a/kvx/NeedOp.v
+++ b/kvx/NeedOp.v
@@ -229,7 +229,7 @@ Lemma needs_of_condition0_sound:
 Proof.
   intros until arg2.
   intros Hcond Hagree.
-  apply eval_condition0_inj with (f := inject_id) (m1 := m1) (v1 := arg1); simpl; auto.
+  apply eval_condition0_inj with (f := inject_id) (m1 := m1) (v1 := arg1); cbn; auto.
   apply val_inject_lessdef. apply lessdef_vagree. assumption.
 Qed.
 
@@ -239,7 +239,7 @@ Lemma addl_sound:
   vagree (Val.addl v1 v2) (Val.addl w1 w2) x.
 Proof.
   unfold default; intros.
-  destruct x; simpl in *; trivial.
+  destruct x; cbn in *; trivial.
   - unfold Val.addl.
     destruct v1; destruct v2; trivial; destruct Archi.ptr64; trivial.
   - apply Val.addl_lessdef; trivial.
@@ -249,7 +249,7 @@ Lemma subl_lessdef:
   forall v1 v1' v2 v2',
   Val.lessdef v1 v1' -> Val.lessdef v2 v2' -> Val.lessdef (Val.subl v1 v2) (Val.subl v1' v2').
 Proof.
-  intros. inv H. inv H0. auto. destruct v1'; simpl; auto. simpl; auto.
+  intros. inv H. inv H0. auto. destruct v1'; cbn; auto. cbn; auto.
 Qed.
 
 Lemma subl_sound:
@@ -258,10 +258,10 @@ Lemma subl_sound:
   vagree (Val.subl v1 v2) (Val.subl w1 w2) x.
 Proof.
   unfold default; intros.
-  destruct x; simpl in *; trivial.
+  destruct x; cbn in *; trivial.
   - unfold Val.subl.
-    destruct v1; destruct v2; trivial; destruct Archi.ptr64; simpl; trivial.
-    destruct (eq_block _ _) ; simpl; trivial.
+    destruct v1; destruct v2; trivial; destruct Archi.ptr64; cbn; trivial.
+    destruct (eq_block _ _) ; cbn; trivial.
   - apply subl_lessdef; trivial.
 Qed.
 
@@ -272,7 +272,7 @@ Lemma mull_sound:
   vagree (Val.mull v1 v2) (Val.mull w1 w2) x.
 Proof.
   unfold default; intros.
-  destruct x; simpl in *; trivial.
+  destruct x; cbn in *; trivial.
   - unfold Val.mull.
     destruct v1; destruct v2; trivial.
   - unfold Val.mull.
@@ -284,7 +284,7 @@ Qed.
 
 Remark default_idem: forall nv, default (default nv) = default nv.
 Proof.
-  destruct nv; simpl; trivial.
+  destruct nv; cbn; trivial.
 Qed.
 
 Lemma vagree_triple_op_float :
@@ -298,14 +298,14 @@ Proof.
   induction nv;
   intros Hax Hby Hcz.
   - trivial.
-  - simpl in *. destruct a; simpl; trivial.
-    destruct b; simpl; trivial.
-    destruct c; simpl; trivial.
-  - simpl in *. destruct a; simpl; trivial.
-    destruct b; simpl; trivial.
-    destruct c; simpl; trivial.
+  - cbn in *. destruct a; cbn; trivial.
+    destruct b; cbn; trivial.
+    destruct c; cbn; trivial.
+  - cbn in *. destruct a; cbn; trivial.
+    destruct b; cbn; trivial.
+    destruct c; cbn; trivial.
     inv Hax. inv Hby. inv Hcz.
-    simpl.
+    cbn.
     constructor.
 Qed.
 
@@ -320,14 +320,14 @@ Proof.
   induction nv;
   intros Hax Hby Hcz.
   - trivial.
-  - simpl in *. destruct a; simpl; trivial.
-    destruct b; simpl; trivial.
-    destruct c; simpl; trivial.
-  - simpl in *. destruct a; simpl; trivial.
-    destruct b; simpl; trivial.
-    destruct c; simpl; trivial.
+  - cbn in *. destruct a; cbn; trivial.
+    destruct b; cbn; trivial.
+    destruct c; cbn; trivial.
+  - cbn in *. destruct a; cbn; trivial.
+    destruct b; cbn; trivial.
+    destruct c; cbn; trivial.
     inv Hax. inv Hby. inv Hcz.
-    simpl.
+    cbn.
     constructor.
 Qed.
 
@@ -343,7 +343,7 @@ Lemma needs_of_operation_sound:
   /\ vagree v v' nv.
 Proof.
   unfold needs_of_operation; intros; destruct op; try (eapply default_needs_of_operation_sound; eauto; fail);
-  simpl in *; FuncInv; InvAgree; TrivialExists.
+  cbn in *; FuncInv; InvAgree; TrivialExists.
 - apply sign_ext_sound; auto. compute; auto. 
 - apply sign_ext_sound; auto. compute; auto. 
 - apply add_sound; auto.
@@ -384,17 +384,17 @@ Proof.
 - destruct (eval_condition0 _ _ _) as [b|] eqn:EC.
   erewrite needs_of_condition0_sound by eauto.
   apply select_sound; auto.
-  simpl; auto with na.
+  cbn; auto with na.
   (* select imm *)
 - destruct (eval_condition0 _ _ _) as [b|] eqn:EC.
   { erewrite needs_of_condition0_sound by eauto.
   apply select_sound; auto with na. }
-  simpl; auto with na.
+  cbn; auto with na.
   (* select long imm *)
 - destruct (eval_condition0 _ _ _) as [b|] eqn:EC.
   { erewrite needs_of_condition0_sound by eauto.
   apply select_sound; auto with na. }
-  simpl; auto with na.
+  cbn; auto with na.
 Qed.
 
 Lemma operation_is_redundant_sound:
@@ -404,7 +404,7 @@ Lemma operation_is_redundant_sound:
   vagree_list (arg1 :: args) (arg1' :: args') (needs_of_operation op nv) ->
   vagree v arg1' nv.
 Proof.
-  intros. destruct op; simpl in *; try discriminate; inv H1; FuncInv; subst.
+  intros. destruct op; cbn in *; try discriminate; inv H1; FuncInv; subst.
 - apply sign_ext_redundant_sound; auto. omega.
 - apply sign_ext_redundant_sound; auto. omega.
 - apply andimm_redundant_sound; auto.
diff --git a/kvx/Op.v b/kvx/Op.v
index 544bb081..e2ffa3e5 100644
--- a/kvx/Op.v
+++ b/kvx/Op.v
@@ -508,9 +508,9 @@ Qed.
 Ltac FuncInv :=
   match goal with
   | H: (match ?x with nil => _ | _ :: _ => _ end = Some _) |- _ =>
-      destruct x; simpl in H; FuncInv
+      destruct x; cbn in H; FuncInv
   | H: (match ?v with Vundef => _ | Vint _ => _ | Vfloat _ => _ | Vptr _ _ => _ end = Some _) |- _ =>
-      destruct v; simpl in H; FuncInv
+      destruct v; cbn in H; FuncInv
   | H: (if Archi.ptr64 then _ else _) = Some _ |- _ =>
       destruct Archi.ptr64 eqn:?; FuncInv
   | H: (Some _ = Some _) |- _ =>
@@ -727,27 +727,27 @@ Qed.
 Remark type_sub:
   forall v1 v2, Val.has_type (Val.sub v1 v2) Tint.
 Proof.
-  intros. unfold Val.has_type, Val.sub. destruct Archi.ptr64, v1, v2; simpl; auto.
+  intros. unfold Val.has_type, Val.sub. destruct Archi.ptr64, v1, v2; cbn; auto.
   destruct (eq_block _ _); auto.
 Qed.
 
 Remark type_subl:
   forall v1 v2, Val.has_type (Val.subl v1 v2) Tlong.
 Proof.
-  intros. unfold Val.has_type, Val.subl. destruct Archi.ptr64, v1, v2; simpl; auto.
+  intros. unfold Val.has_type, Val.subl. destruct Archi.ptr64, v1, v2; cbn; auto.
   destruct (eq_block _ _); auto.
 Qed.
 
 Remark type_shl:
   forall v1 v2, Val.has_type (Val.shl v1 v2) Tint.
 Proof.
-  destruct v1, v2; simpl; trivial; destruct (Int.ltu _ _); simpl; trivial.
+  destruct v1, v2; cbn; trivial; destruct (Int.ltu _ _); cbn; trivial.
 Qed.
 
 Remark type_shll:
   forall v1 v2, Val.has_type (Val.shll v1 v2) Tlong.
 Proof.
-  destruct v1, v2; simpl; trivial; destruct (Int.ltu _ _); simpl; trivial.
+  destruct v1, v2; cbn; trivial; destruct (Int.ltu _ _); cbn; trivial.
 Qed.
 
 Lemma type_of_operation_sound:
@@ -757,7 +757,7 @@ Lemma type_of_operation_sound:
   Val.has_type v (snd (type_of_operation op)).
 Proof with (try exact I; try reflexivity; auto using Val.Vptr_has_type).
   intros.
-  destruct op; simpl; simpl in H0; FuncInv; subst; simpl.
+  destruct op; cbn; cbn in H0; FuncInv; subst; cbn.
   (* move *)
   - congruence.
   (* intconst, longconst, floatconst, singleconst *)
@@ -777,30 +777,30 @@ Proof with (try exact I; try reflexivity; auto using Val.Vptr_has_type).
   - apply type_add.
   (* addx, addximm *)
   - apply type_add.
-  - destruct v0; simpl; trivial.
-    destruct (Int.ltu _ _); simpl; trivial.
+  - destruct v0; cbn; trivial.
+    destruct (Int.ltu _ _); cbn; trivial.
   (* neg, sub *)
   - destruct v0...
   - apply type_sub.
   (* revsubimm, revsubx, revsubximm *)
   - destruct v0...
   - apply type_sub.
-  - destruct v0; simpl; trivial.
-    destruct (Int.ltu _ _); simpl; trivial.
+  - destruct v0; cbn; trivial.
+    destruct (Int.ltu _ _); cbn; trivial.
   (* mul, mulimm, mulhs, mulhu *)
   - destruct v0; destruct v1...
   - destruct v0...
   - destruct v0; destruct v1...
   - destruct v0; destruct v1...
   (* div, divu *)
-  - destruct v0; destruct v1; simpl in *; inv H0.
-    destruct (Int.eq i0 Int.zero || Int.eq i (Int.repr Int.min_signed) && Int.eq i0 Int.mone); inv H2...
-  - destruct v0; destruct v1; simpl in *; inv H0.
+  - destruct v0; destruct v1; cbn in *; inv H0.
+    destruct (_ || _); inv H2...
+  - destruct v0; destruct v1; cbn in *; inv H0.
     destruct (Int.eq i0 Int.zero); inv H2...
   (* mod, modu *)
-  - destruct v0; destruct v1; simpl in *; inv H0.
-    destruct (Int.eq i0 Int.zero || Int.eq i (Int.repr Int.min_signed) && Int.eq i0 Int.mone); inv H2...
-  - destruct v0; destruct v1; simpl in *; inv H0.
+  - destruct v0; destruct v1; cbn in *; inv H0.
+    destruct (_ || _); inv H2...
+  - destruct v0; destruct v1; cbn in *; inv H0.
     destruct (Int.eq i0 Int.zero); inv H2...
   (* and, andimm *)
   - destruct v0; destruct v1...
@@ -829,18 +829,18 @@ Proof with (try exact I; try reflexivity; auto using Val.Vptr_has_type).
   - destruct v0; destruct v1...
   - destruct v0...
   (* shl, shlimm *)
-  - destruct v0; destruct v1; simpl... destruct (Int.ltu i0 Int.iwordsize)...
-  - destruct v0; simpl... destruct (Int.ltu n Int.iwordsize)...
+  - destruct v0; destruct v1; cbn... destruct (Int.ltu i0 Int.iwordsize)...
+  - destruct v0; cbn... destruct (Int.ltu n Int.iwordsize)...
   (* shr, shrimm *)
-  - destruct v0; destruct v1; simpl... destruct (Int.ltu i0 Int.iwordsize)...
-  - destruct v0; simpl... destruct (Int.ltu n Int.iwordsize)...
+  - destruct v0; destruct v1; cbn... destruct (Int.ltu i0 Int.iwordsize)...
+  - destruct v0; cbn... destruct (Int.ltu n Int.iwordsize)...
   (* shru, shruimm *)
-  - destruct v0; destruct v1; simpl... destruct (Int.ltu i0 Int.iwordsize)...
-  - destruct v0; simpl... destruct (Int.ltu n Int.iwordsize)...
+  - destruct v0; destruct v1; cbn... destruct (Int.ltu i0 Int.iwordsize)...
+  - destruct v0; cbn... destruct (Int.ltu n Int.iwordsize)...
   (* shrx *)
-  - destruct v0; simpl... destruct (Int.ltu n (Int.repr 31)); simpl; trivial.
+  - destruct v0; cbn... destruct (Int.ltu n (Int.repr 31)); cbn; trivial.
   (* shrimm *)
-  - destruct v0; simpl...   
+  - destruct v0; cbn...   
   (* madd *)
   - apply type_add.
   - apply type_add.
@@ -858,13 +858,13 @@ Proof with (try exact I; try reflexivity; auto using Val.Vptr_has_type).
   - apply type_addl.
   (* addxl addxlimm *)
   - apply type_addl.
-  - destruct v0; simpl; trivial.
-    destruct (Int.ltu _ _); simpl; trivial.
+  - destruct v0; cbn; trivial.
+    destruct (Int.ltu _ _); cbn; trivial.
   (* negl, subl *)
   - destruct v0...
   - apply type_subl.
-  - destruct v0; simpl; trivial.
-    destruct (Int.ltu _ _); simpl; trivial.
+  - destruct v0; cbn; trivial.
+    destruct (Int.ltu _ _); cbn; trivial.
   - destruct v0...
   - apply type_subl.
   (* mull, mullhs, mullhu *)
@@ -873,14 +873,14 @@ Proof with (try exact I; try reflexivity; auto using Val.Vptr_has_type).
   - destruct v0; destruct v1...
   - destruct v0; destruct v1...
   (* divl, divlu *)
-  - destruct v0; destruct v1; simpl in *; inv H0.
-    destruct (Int64.eq i0 Int64.zero || Int64.eq i (Int64.repr Int64.min_signed) && Int64.eq i0 Int64.mone); inv H2...
-  - destruct v0; destruct v1; simpl in *; inv H0.
+  - destruct v0; destruct v1; cbn in *; inv H0.
+    destruct (_ || _); inv H2...
+  - destruct v0; destruct v1; cbn in *; inv H0.
     destruct (Int64.eq i0 Int64.zero); inv H2...
   (* modl, modlu *)
-  - destruct v0; destruct v1; simpl in *; inv H0.
-    destruct (Int64.eq i0 Int64.zero || Int64.eq i (Int64.repr Int64.min_signed) && Int64.eq i0 Int64.mone); inv H2...
-  - destruct v0; destruct v1; simpl in *; inv H0.
+  - destruct v0; destruct v1; cbn in *; inv H0.
+    destruct (_ || _); inv H2...
+  - destruct v0; destruct v1; cbn in *; inv H0.
     destruct (Int64.eq i0 Int64.zero); inv H2...
   (* andl, andlimm *)
   - destruct v0; destruct v1...
@@ -909,16 +909,16 @@ Proof with (try exact I; try reflexivity; auto using Val.Vptr_has_type).
   - destruct v0; destruct v1...
   - destruct v0...
   (* shll, shllimm *)
-  - destruct v0; destruct v1; simpl... destruct (Int.ltu i0 Int64.iwordsize')...
-  - destruct v0; simpl... destruct (Int.ltu n Int64.iwordsize')...
+  - destruct v0; destruct v1; cbn... destruct (Int.ltu i0 Int64.iwordsize')...
+  - destruct v0; cbn... destruct (Int.ltu n Int64.iwordsize')...
   (* shr, shrimm *)
-  - destruct v0; destruct v1; simpl... destruct (Int.ltu i0 Int64.iwordsize')...
-  - destruct v0; simpl... destruct (Int.ltu n Int64.iwordsize')...
+  - destruct v0; destruct v1; cbn... destruct (Int.ltu i0 Int64.iwordsize')...
+  - destruct v0; cbn... destruct (Int.ltu n Int64.iwordsize')...
   (* shru, shruimm *)
-  - destruct v0; destruct v1; simpl... destruct (Int.ltu i0 Int64.iwordsize')...
-  - destruct v0; simpl... destruct (Int.ltu n Int64.iwordsize')...
+  - destruct v0; destruct v1; cbn... destruct (Int.ltu i0 Int64.iwordsize')...
+  - destruct v0; cbn... destruct (Int.ltu n Int64.iwordsize')...
   (* shrxl *)
-  - destruct v0; simpl... destruct (Int.ltu n (Int.repr 63)); simpl; trivial.
+  - destruct v0; cbn... destruct (Int.ltu n (Int.repr 63)); cbn; trivial.
   (* maddl, maddlim *)
   - apply type_addl.
   - apply type_addl.
@@ -960,59 +960,59 @@ Proof with (try exact I; try reflexivity; auto using Val.Vptr_has_type).
   - destruct v0...
   - destruct v0...
   (* intoffloat, intuoffloat *)
-  - destruct v0; simpl... destruct (Float.to_int f); simpl; trivial.
-  - destruct v0; simpl... destruct (Float.to_intu f); simpl; trivial.
+  - destruct v0; cbn... destruct (Float.to_int f); cbn; trivial.
+  - destruct v0; cbn... destruct (Float.to_intu f); cbn; trivial.
   (* intofsingle, intuofsingle *)
-  - destruct v0; simpl...  destruct (Float32.to_int f); simpl; trivial.
-  - destruct v0; simpl...  destruct (Float32.to_intu f); simpl; trivial.
+  - destruct v0; cbn...  destruct (Float32.to_int f); cbn; trivial.
+  - destruct v0; cbn...  destruct (Float32.to_intu f); cbn; trivial.
   (* singleofint, singleofintu *)
-  - destruct v0; simpl...
-  - destruct v0; simpl...
+  - destruct v0; cbn...
+  - destruct v0; cbn...
   (* longoffloat, longuoffloat *)
-  - destruct v0; simpl... destruct (Float.to_long f); simpl; trivial.
-  - destruct v0; simpl... destruct (Float.to_longu f); simpl; trivial.
+  - destruct v0; cbn... destruct (Float.to_long f); cbn; trivial.
+  - destruct v0; cbn... destruct (Float.to_longu f); cbn; trivial.
   (* floatoflong, floatoflongu *)
-  - destruct v0; simpl...
-  - destruct v0; simpl...
+  - destruct v0; cbn...
+  - destruct v0; cbn...
   (* longofsingle, longuofsingle *)
-  - destruct v0; simpl... destruct (Float32.to_long f); simpl; trivial.
-  - destruct v0; simpl... destruct (Float32.to_longu f); simpl; trivial.
+  - destruct v0; cbn... destruct (Float32.to_long f); cbn; trivial.
+  - destruct v0; cbn... destruct (Float32.to_longu f); cbn; trivial.
   (* singleoflong, singleoflongu *)
-  - destruct v0; simpl...
-  - destruct v0; simpl...
+  - destruct v0; cbn...
+  - destruct v0; cbn...
   (* cmp *)
   - destruct (eval_condition cond vl m)... destruct b...
  (* extfz *)
   - unfold extfz.
     destruct (is_bitfield _ _).
-    + destruct v0; simpl; trivial.
+    + destruct v0; cbn; trivial.
     + constructor.
  (* extfs *)
   - unfold extfs.
     destruct (is_bitfield _ _).
-    + destruct v0; simpl; trivial.
+    + destruct v0; cbn; trivial.
     + constructor.
  (* extfzl *)
   - unfold extfzl.
     destruct (is_bitfieldl _ _).
-    + destruct v0; simpl; trivial.
+    + destruct v0; cbn; trivial.
     + constructor.
  (* extfsl *)
   - unfold extfsl.
     destruct (is_bitfieldl _ _).
-    + destruct v0; simpl; trivial.
+    + destruct v0; cbn; trivial.
     + constructor.
  (* insf *)
   - unfold insf, bitfield_mask.
     destruct (is_bitfield _ _).
-    + destruct v0; destruct v1; simpl; trivial.
-      destruct (Int.ltu _ _); simpl; trivial.
+    + destruct v0; destruct v1; cbn; trivial.
+      destruct (Int.ltu _ _); cbn; trivial.
     + constructor.
  (* insf *)
   - unfold insfl, bitfield_mask.
     destruct (is_bitfieldl _ _).
-    + destruct v0; destruct v1; simpl; trivial.
-      destruct (Int.ltu _ _); simpl; trivial.
+    + destruct v0; destruct v1; cbn; trivial.
+      destruct (Int.ltu _ _); cbn; trivial.
     + constructor.
  (* Osel *)
   - unfold Val.select. destruct (eval_condition0 _ _ m).
@@ -1047,7 +1047,7 @@ Lemma is_trapping_op_sound:
     eval_operation genv sp op vl m <> None.
 Proof.
   unfold args_of_operation.
-  destruct op; destruct eq_operation; intros; simpl in *; try congruence.
+  destruct op; destruct eq_operation; intros; cbn in *; try congruence.
   all: try (destruct vl as [ | vh1 vl1]; try discriminate).
   all: try (destruct vl1 as [ | vh2 vl2]; try discriminate).
   all: try (destruct vl2 as [ | vh3 vl3]; try discriminate).
@@ -1101,7 +1101,7 @@ Lemma eval_negate_condition:
   forall cond vl m,
   eval_condition (negate_condition cond) vl m = option_map negb (eval_condition cond vl m).
 Proof.
-  intros. destruct cond; simpl.
+  intros. destruct cond; cbn.
   repeat (destruct vl; auto). apply Val.negate_cmp_bool.
   repeat (destruct vl; auto). apply Val.negate_cmpu_bool.
   repeat (destruct vl; auto). apply Val.negate_cmp_bool.
@@ -1147,7 +1147,7 @@ Lemma eval_shift_stack_addressing:
   eval_addressing ge (Vptr sp Ptrofs.zero) (shift_stack_addressing delta addr) vl =
   eval_addressing ge (Vptr sp (Ptrofs.repr delta)) addr vl.
 Proof.
-  intros. destruct addr; simpl; auto. destruct vl; auto.
+  intros. destruct addr; cbn; auto. destruct vl; auto.
   rewrite Ptrofs.add_zero_l, Ptrofs.add_commut; auto.
 Qed.
 
@@ -1156,7 +1156,7 @@ Lemma eval_shift_stack_operation:
   eval_operation ge (Vptr sp Ptrofs.zero) (shift_stack_operation delta op) vl m =
   eval_operation ge (Vptr sp (Ptrofs.repr delta)) op vl m.
 Proof.
-  intros. destruct op; simpl; auto. destruct vl; auto.
+  intros. destruct op; cbn; auto. destruct vl; auto.
   rewrite Ptrofs.add_zero_l, Ptrofs.add_commut; auto.
 Qed.
 
@@ -1183,12 +1183,12 @@ Proof.
   assert (A: forall x n,
              Val.offset_ptr x (Ptrofs.add n (Ptrofs.repr delta)) =
              Val.add (Val.offset_ptr x n) (Vint (Int.repr delta))).
-  { intros; destruct x; simpl; auto. rewrite H1. 
+  { intros; destruct x; cbn; auto. rewrite H1. 
     rewrite Ptrofs.add_assoc. f_equal; f_equal; f_equal. symmetry; auto with ptrofs. }
-  destruct addr; simpl in H; inv H; simpl in *; FuncInv; subst.
+  destruct addr; cbn in H; inv H; cbn in *; FuncInv; subst.
 - rewrite A; auto.
 - unfold Genv.symbol_address. destruct (Genv.find_symbol ge i); auto. 
-  simpl. rewrite H1. f_equal; f_equal; f_equal. symmetry; auto with ptrofs.
+  cbn. rewrite H1. f_equal; f_equal; f_equal. symmetry; auto with ptrofs.
 - rewrite A; auto.
 Qed.
 
@@ -1223,17 +1223,17 @@ Lemma op_depends_on_memory_correct:
   op_depends_on_memory op = false ->
   eval_operation ge sp op args m1 = eval_operation ge sp op args m2.
 Proof.
-  intros until m2. destruct op; simpl; try congruence.
-  - destruct cond; simpl; try congruence;
+  intros until m2. destruct op; cbn; try congruence.
+  - destruct cond; cbn; try congruence;
     intros SF; auto; rewrite ? negb_false_iff in SF;
       unfold Val.cmpu_bool, Val.cmplu_bool; rewrite SF; reflexivity.
-  - destruct c0; simpl; try congruence;
+  - destruct c0; cbn; try congruence;
     intros SF; auto; rewrite ? negb_false_iff in SF;
       unfold Val.cmpu_bool, Val.cmplu_bool; rewrite SF; reflexivity.
-  - destruct c0; simpl; try congruence;
+  - destruct c0; cbn; try congruence;
     intros SF; auto; rewrite ? negb_false_iff in SF;
       unfold Val.cmpu_bool, Val.cmplu_bool; rewrite SF; reflexivity.
-  - destruct c0; simpl; try congruence;
+  - destruct c0; cbn; try congruence;
     intros SF; auto; rewrite ? negb_false_iff in SF;
       unfold Val.cmpu_bool, Val.cmplu_bool; rewrite SF; reflexivity.
 Qed.
@@ -1348,19 +1348,19 @@ Lemma eval_condition_inj:
   eval_condition cond vl1 m1 = Some b ->
   eval_condition cond vl2 m2 = Some b.
 Proof.
-  intros. destruct cond; simpl in H0; FuncInv; InvInject; simpl; auto.
-- inv H3; inv H2; simpl in H0; inv H0; auto.
+  intros. destruct cond; cbn in H0; FuncInv; InvInject; cbn; auto.
+- inv H3; inv H2; cbn in H0; inv H0; auto.
 - eauto 3 using Val.cmpu_bool_inject, Mem.valid_pointer_implies.
-- inv H3; simpl in H0; inv H0; auto.
+- inv H3; cbn in H0; inv H0; auto.
 - eauto 3 using Val.cmpu_bool_inject, Mem.valid_pointer_implies.
-- inv H3; inv H2; simpl in H0; inv H0; auto.
+- inv H3; inv H2; cbn in H0; inv H0; auto.
 - eauto 3 using Val.cmplu_bool_inject, Mem.valid_pointer_implies.
-- inv H3; simpl in H0; inv H0; auto.
+- inv H3; cbn in H0; inv H0; auto.
 - eauto 3 using Val.cmplu_bool_inject, Mem.valid_pointer_implies.
-- inv H3; inv H2; simpl in H0; inv H0; auto.
-- inv H3; inv H2; simpl in H0; inv H0; auto.
-- inv H3; inv H2; simpl in H0; inv H0; auto.
-- inv H3; inv H2; simpl in H0; inv H0; auto.
+- inv H3; inv H2; cbn in H0; inv H0; auto.
+- inv H3; inv H2; cbn in H0; inv H0; auto.
+- inv H3; inv H2; cbn in H0; inv H0; auto.
+- inv H3; inv H2; cbn in H0; inv H0; auto.
 Qed.
 
 Lemma eval_condition0_inj:
@@ -1369,10 +1369,10 @@ Lemma eval_condition0_inj:
   eval_condition0 cond v1 m1 = Some b ->
   eval_condition0 cond v2 m2 = Some b.
 Proof.
-  intros. destruct cond; simpl in H0; FuncInv; InvInject; simpl; auto.
-  - inv H; simpl in *; congruence.
+  intros. destruct cond; cbn in H0; FuncInv; InvInject; cbn; auto.
+  - inv H; cbn in *; congruence.
   - eauto 3 using Val.cmpu_bool_inject, Mem.valid_pointer_implies.
-  - inv H; simpl in *; congruence.
+  - inv H; cbn in *; congruence.
   - eauto 3 using Val.cmplu_bool_inject, Mem.valid_pointer_implies.
 Qed.
 
@@ -1393,248 +1393,244 @@ Lemma eval_operation_inj:
   eval_operation ge1 sp1 op vl1 m1 = Some v1 ->
   exists v2, eval_operation ge2 sp2 op vl2 m2 = Some v2 /\ Val.inject f v1 v2.
 Proof.
-  intros until v1; intros GL; intros. destruct op; simpl in H1; simpl; FuncInv; InvInject; TrivialExists.
+  intros until v1; intros GL; intros. destruct op; cbn in H1; cbn; FuncInv; InvInject; TrivialExists.
   (* addrsymbol *)
-  - apply GL; simpl; auto.
+  - apply GL; cbn; auto.
   (* addrstack *)
   - apply Val.offset_ptr_inject; auto. 
   (* castsigned *)
-  - inv H4; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
+  - inv H4; cbn; auto.
   (* add, addimm *)
   - apply Val.add_inject; auto.
   - apply Val.add_inject; auto.
   (* addx, addximm *)
   - apply Val.add_inject; trivial.
-    inv H4; inv H2; simpl; try destruct (Int.ltu _ _); simpl; auto.
-  - inv H4; simpl; trivial.
-    destruct (Int.ltu _ _); simpl; trivial.
+    inv H4; inv H2; cbn; try destruct (Int.ltu _ _); cbn; auto.
+  - inv H4; cbn; trivial.
+    destruct (Int.ltu _ _); cbn; trivial.
   (* neg, sub *)
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
   - apply Val.sub_inject; auto.
   (* revsubimm, revsubx, revsubximm *)
-  - inv H4; simpl; trivial.
+  - inv H4; cbn; trivial.
   - apply Val.sub_inject; trivial.
-    inv H4; inv H2; simpl; try destruct (Int.ltu _ _); simpl; auto.
-  - inv H4; simpl; try destruct (Int.ltu _ _); simpl; auto.
+    inv H4; inv H2; cbn; try destruct (Int.ltu _ _); cbn; auto.
+  - inv H4; cbn; try destruct (Int.ltu _ _); cbn; auto.
   (* mul, mulimm, mulhs, mulhu *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; inv H2; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; inv H2; cbn; auto.
   (* div, divu *)
-  - inv H4; inv H3; simpl in H1; inv H1. simpl.
-    destruct (Int.eq i0 Int.zero
-              || Int.eq i (Int.repr Int.min_signed) && Int.eq i0 Int.mone); inv H2.
+  - inv H4; inv H3; cbn in H1; inv H1. cbn.
+    destruct (_ || _); inv H2.
     TrivialExists.
-  - inv H4; inv H3; simpl in H1; inv H1. simpl.
+  - inv H4; inv H3; cbn in H1; inv H1. cbn.
     destruct (Int.eq i0 Int.zero); inv H2. TrivialExists.
   (* mod, modu *)
-  - inv H4; inv H3; simpl in H1; inv H1. simpl.
-    destruct (Int.eq i0 Int.zero
-                     || Int.eq i (Int.repr Int.min_signed) && Int.eq i0 Int.mone); inv H2.
+  - inv H4; inv H3; cbn in H1; inv H1. cbn.
+    destruct (_ || _); inv H2.
     TrivialExists.
-  - inv H4; inv H3; simpl in H1; inv H1. simpl.
+  - inv H4; inv H3; cbn in H1; inv H1. cbn.
     destruct (Int.eq i0 Int.zero); inv H2. TrivialExists.
   (* and, andimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* nand, nandimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* or, orimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* nor, norimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* xor, xorimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* nxor, nxorimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* not *)
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
   (* andn, andnimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* orn, ornimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* shl, shlimm *)
-  - inv H4; inv H2; simpl; auto. destruct (Int.ltu i0 Int.iwordsize); auto.
-  - inv H4; simpl; auto. destruct (Int.ltu n Int.iwordsize); auto.
+  - inv H4; inv H2; cbn; auto. destruct (Int.ltu i0 Int.iwordsize); auto.
+  - inv H4; cbn; auto. destruct (Int.ltu n Int.iwordsize); auto.
   (* shr, shrimm *)
-  - inv H4; inv H2; simpl; auto. destruct (Int.ltu i0 Int.iwordsize); auto.
-  - inv H4; simpl; auto. destruct (Int.ltu n Int.iwordsize); auto.
+  - inv H4; inv H2; cbn; auto. destruct (Int.ltu i0 Int.iwordsize); auto.
+  - inv H4; cbn; auto. destruct (Int.ltu n Int.iwordsize); auto.
   (* shru, shruimm *)
-  - inv H4; inv H2; simpl; auto. destruct (Int.ltu i0 Int.iwordsize); auto.
-  - inv H4; simpl; auto. destruct (Int.ltu n Int.iwordsize); auto.
+  - inv H4; inv H2; cbn; auto. destruct (Int.ltu i0 Int.iwordsize); auto.
+  - inv H4; cbn; auto. destruct (Int.ltu n Int.iwordsize); auto.
   (* shrx *)
-  - inv H4; simpl; auto.
-    destruct (Int.ltu n (Int.repr 31)); inv H; simpl; auto.
+  - inv H4; cbn; auto.
+    destruct (Int.ltu n (Int.repr 31)); inv H; cbn; auto.
   (* rorimm *)
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
   (* madd, maddim *)
-  - inv H2; inv H3; inv H4; simpl; auto.
-  - inv H2; inv H4; simpl; auto.
+  - inv H2; inv H3; inv H4; cbn; auto.
+  - inv H2; inv H4; cbn; auto.
   (* msub *)
   - apply Val.sub_inject; auto.
-    inv H3; inv H2; simpl; auto.
+    inv H3; inv H2; cbn; auto.
   (* makelong, highlong, lowlong *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
+  - inv H4; cbn; auto.
   (* cast32 *)
-  - inv H4; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
+  - inv H4; cbn; auto.
   (* addl, addlimm *)
   - apply Val.addl_inject; auto.
   - apply Val.addl_inject; auto.
   (* addxl, addxlimm *)
   - apply Val.addl_inject; auto.
-    inv H4; simpl; trivial.
-    destruct (Int.ltu _ _); simpl; trivial.
-  - inv H4; simpl; trivial.
-    destruct (Int.ltu _ _); simpl; trivial.
+    inv H4; cbn; trivial.
+    destruct (Int.ltu _ _); cbn; trivial.
+  - inv H4; cbn; trivial.
+    destruct (Int.ltu _ _); cbn; trivial.
   (* negl, subl *)
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
   - apply Val.subl_inject; auto.
-    inv H4; inv H2; simpl; trivial;
-    destruct (Int.ltu _ _); simpl; trivial.
-  - inv H4; simpl; trivial;
-      destruct (Int.ltu _ _); simpl; trivial.
-  - inv H4; simpl; auto.
+    inv H4; inv H2; cbn; trivial;
+    destruct (Int.ltu _ _); cbn; trivial.
+  - inv H4; cbn; trivial;
+      destruct (Int.ltu _ _); cbn; trivial.
+  - inv H4; cbn; auto.
   - apply Val.subl_inject; auto.
   (* mull, mullhs, mullhu *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; inv H2; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; inv H2; cbn; auto.
   (* divl, divlu *)
-  - inv H4; inv H3; simpl in H1; inv H1. simpl.
-    destruct (Int64.eq i0 Int64.zero
-              || Int64.eq i (Int64.repr Int64.min_signed) && Int64.eq i0 Int64.mone); inv H2.
+  - inv H4; inv H3; cbn in H1; inv H1. cbn.
+    destruct (_ || _); inv H2.
     TrivialExists.
-  - inv H4; inv H3; simpl in H1; inv H1. simpl.
+  - inv H4; inv H3; cbn in H1; inv H1. cbn.
     destruct (Int64.eq i0 Int64.zero); inv H2. TrivialExists.
   (* modl, modlu *)
-  - inv H4; inv H3; simpl in H1; inv H1. simpl.
-    destruct (Int64.eq i0 Int64.zero
-                     || Int64.eq i (Int64.repr Int64.min_signed) && Int64.eq i0 Int64.mone); inv H2.
+  - inv H4; inv H3; cbn in H1; inv H1. cbn.
+    destruct (_ || _); inv H2.
     TrivialExists.
-  - inv H4; inv H3; simpl in H1; inv H1. simpl.
+  - inv H4; inv H3; cbn in H1; inv H1. cbn.
     destruct (Int64.eq i0 Int64.zero); inv H2. TrivialExists.
   (* andl, andlimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* nandl, nandlimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* orl, orlimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* norl, norlimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* xorl, xorlimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* nxorl, nxorlimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* notl *)
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
   (* andnl, andnlimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* ornl, ornlimm *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; cbn; auto.
   (* shll, shllimm *)
-  - inv H4; inv H2; simpl; auto. destruct (Int.ltu i0 Int64.iwordsize'); auto.
-  - inv H4; simpl; auto. destruct (Int.ltu n Int64.iwordsize'); auto.
+  - inv H4; inv H2; cbn; auto. destruct (Int.ltu i0 Int64.iwordsize'); auto.
+  - inv H4; cbn; auto. destruct (Int.ltu n Int64.iwordsize'); auto.
   (* shr, shrimm *)
-  - inv H4; inv H2; simpl; auto. destruct (Int.ltu i0 Int64.iwordsize'); auto.
-  - inv H4; simpl; auto. destruct (Int.ltu n Int64.iwordsize'); auto.
+  - inv H4; inv H2; cbn; auto. destruct (Int.ltu i0 Int64.iwordsize'); auto.
+  - inv H4; cbn; auto. destruct (Int.ltu n Int64.iwordsize'); auto.
   (* shru, shruimm *)
-  - inv H4; inv H2; simpl; auto. destruct (Int.ltu i0 Int64.iwordsize'); auto.
-  - inv H4; simpl; auto. destruct (Int.ltu n Int64.iwordsize'); auto.
+  - inv H4; inv H2; cbn; auto. destruct (Int.ltu i0 Int64.iwordsize'); auto.
+  - inv H4; cbn; auto. destruct (Int.ltu n Int64.iwordsize'); auto.
   (* shrx *)
-  - inv H4; simpl; auto.
-    destruct (Int.ltu n (Int.repr 63)); simpl; auto.
+  - inv H4; cbn; auto.
+    destruct (Int.ltu n (Int.repr 63)); cbn; auto.
 
   (* maddl, maddlimm *)
   - apply Val.addl_inject; auto.
-    inv H2; inv H3; inv H4; simpl; auto.
+    inv H2; inv H3; inv H4; cbn; auto.
   - apply Val.addl_inject; auto.
-    inv H4; inv H2; simpl; auto.
+    inv H4; inv H2; cbn; auto.
   (* msubl, msublimm *)
   - apply Val.subl_inject; auto.
-    inv H2; inv H3; inv H4; simpl; auto.
+    inv H2; inv H3; inv H4; cbn; auto.
     
   (* negf, absf *)
-  - inv H4; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
+  - inv H4; cbn; auto.
   (* addf, subf *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; inv H2; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; inv H2; cbn; auto.
   (* mulf, divf *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; inv H2; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; inv H2; cbn; auto.
   (* minf, maxf *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; inv H2; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; inv H2; cbn; auto.
   (* fmaddf, fmsubf *)
-  - inv H4; inv H3; inv H2; simpl; auto.
-  - inv H4; inv H3; inv H2; simpl; auto.
+  - inv H4; inv H3; inv H2; cbn; auto.
+  - inv H4; inv H3; inv H2; cbn; auto.
   (* negfs, absfs *)
-  - inv H4; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
+  - inv H4; cbn; auto.
   (* addfs, subfs *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; inv H2; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; inv H2; cbn; auto.
   (* mulfs, divfs *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; inv H2; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; inv H2; cbn; auto.
   (* minfs, maxfs *)
-  - inv H4; inv H2; simpl; auto.
-  - inv H4; inv H2; simpl; auto.
+  - inv H4; inv H2; cbn; auto.
+  - inv H4; inv H2; cbn; auto.
   (* invfs *)
-  - inv H4; simpl; auto.    
+  - inv H4; cbn; auto.    
   (* fmaddfs, fmsubfs *)
-  - inv H4; inv H3; inv H2; simpl; auto.
-  - inv H4; inv H3; inv H2; simpl; auto.
+  - inv H4; inv H3; inv H2; cbn; auto.
+  - inv H4; inv H3; inv H2; cbn; auto.
   (* singleoffloat, floatofsingle *)
-  - inv H4; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
+  - inv H4; cbn; auto.
   (* intoffloat, intuoffloat *)
-  - inv H4; simpl; auto. destruct (Float.to_int f0); simpl; auto.
-  - inv H4; simpl; auto. destruct (Float.to_intu f0); simpl; auto.
+  - inv H4; cbn; auto. destruct (Float.to_int f0); cbn; auto.
+  - inv H4; cbn; auto. destruct (Float.to_intu f0); cbn; auto.
   (* intofsingle, intuofsingle *)
-  - inv H4; simpl; auto. destruct (Float32.to_int f0); simpl; auto.
-  - inv H4; simpl; auto. destruct (Float32.to_intu f0); simpl; auto.
+  - inv H4; cbn; auto. destruct (Float32.to_int f0); cbn; auto.
+  - inv H4; cbn; auto. destruct (Float32.to_intu f0); cbn; auto.
   (* singleofint, singleofintu *)
-  - inv H4; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
+  - inv H4; cbn; auto.
   (* longoffloat, longuoffloat *)
-  - inv H4; simpl; auto. destruct (Float.to_long f0); simpl; auto.
-  - inv H4; simpl; auto. destruct (Float.to_longu f0); simpl; auto.
+  - inv H4; cbn; auto. destruct (Float.to_long f0); cbn; auto.
+  - inv H4; cbn; auto. destruct (Float.to_longu f0); cbn; auto.
   (* floatoflong, floatoflongu *)
-  - inv H4; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
+  - inv H4; cbn; auto.
   (* longofsingle, longuofsingle *)
-  - inv H4; simpl; auto. destruct (Float32.to_long f0); simpl; auto.
-  - inv H4; simpl; auto. destruct (Float32.to_longu f0); simpl; auto.
+  - inv H4; cbn; auto. destruct (Float32.to_long f0); cbn; auto.
+  - inv H4; cbn; auto. destruct (Float32.to_longu f0); cbn; auto.
   (* singleoflong, singleoflongu *)
-  - inv H4; simpl; auto.
-  - inv H4; simpl; auto.
+  - inv H4; cbn; auto.
+  - inv H4; cbn; auto.
   (* cmp *)
   - subst v1. destruct (eval_condition cond vl1 m1) eqn:?.
     exploit eval_condition_inj; eauto. intros EQ; rewrite EQ.
-    destruct b; simpl; constructor.
-    simpl; constructor.
+    destruct b; cbn; constructor.
+    cbn; constructor.
 
  (* extfz *)
   - unfold extfz.
@@ -1664,16 +1660,16 @@ Proof.
   - unfold insf.
     destruct (is_bitfield _ _).
     + inv H4; inv H2; trivial.
-      simpl. destruct (Int.ltu _ _); trivial.
-      simpl. trivial.
+      cbn. destruct (Int.ltu _ _); trivial.
+      cbn. trivial.
     + trivial.
 
  (* insfl *)
   - unfold insfl.
     destruct (is_bitfieldl _ _).
     + inv H4; inv H2; trivial.
-      simpl. destruct (Int.ltu _ _); trivial.
-      simpl. trivial.
+      cbn. destruct (Int.ltu _ _); trivial.
+      cbn. trivial.
     + trivial.
 
  (* Osel *)
@@ -1711,13 +1707,13 @@ Lemma eval_addressing_inj:
   eval_addressing ge1 sp1 addr vl1 = Some v1 ->
   exists v2, eval_addressing ge2 sp2 addr vl2 = Some v2 /\ Val.inject f v1 v2.
 Proof.
-  intros. destruct addr; simpl in H2; simpl; FuncInv; InvInject; TrivialExists.
+  intros. destruct addr; cbn in H2; cbn; FuncInv; InvInject; TrivialExists.
   - apply Val.addl_inject; trivial.
-    destruct v0; destruct v'0; simpl; trivial; destruct (Int.ltu _ _); simpl; trivial; inv H3.
+    destruct v0; destruct v'0; cbn; trivial; destruct (Int.ltu _ _); cbn; trivial; inv H3.
     apply Val.inject_long.
   - apply Val.addl_inject; auto.
   - apply Val.offset_ptr_inject; auto.
-  - apply H; simpl; auto.
+  - apply H; cbn; auto.
   - apply Val.offset_ptr_inject; auto. 
 Qed.
 
@@ -1732,7 +1728,7 @@ Lemma eval_addressing_inj_none:
   eval_addressing ge2 sp2 addr vl2 = None.
 Proof.
   intros until vl2. intros Hglobal Hinjsp Hinjvl.
-  destruct addr; simpl in *.
+  destruct addr; cbn in *.
   1,2: inv Hinjvl; trivial;
        inv H0; trivial;
        inv H2; trivial;
@@ -1856,7 +1852,7 @@ Lemma eval_addressing_lessdef_none:
   eval_addressing genv sp addr vl2 = None.
 Proof.
   intros until vl2. intros Hlessdef Heval1.
-  destruct addr; simpl in *.
+  destruct addr; cbn in *.
   1, 2, 4, 5: inv Hlessdef; trivial;
   inv H0; trivial;
   inv H2; trivial;
@@ -1941,7 +1937,7 @@ Lemma eval_operation_inject:
   /\ Val.inject f v1 v2.
 Proof.
   intros.
-  rewrite eval_shift_stack_operation. simpl.
+  rewrite eval_shift_stack_operation. cbn.
   eapply eval_operation_inj with (sp1 := Vptr sp1 Ptrofs.zero) (m1 := m1); eauto.
   intros; eapply Mem.valid_pointer_inject_val; eauto.
   intros; eapply Mem.weak_valid_pointer_inject_val; eauto.
diff --git a/kvx/Peephole.v b/kvx/Peephole.v
index 35f4bbd9..5adb823b 100644
--- a/kvx/Peephole.v
+++ b/kvx/Peephole.v
@@ -153,6 +153,6 @@ Program Definition optimize_bblock (bb : bblock) :=
      exit := exit bb |}.
 Next Obligation.
   destruct (wf_bblockb (optimize_body (body bb))) eqn:Rwf.
-  - rewrite Rwf. simpl. trivial.
+  - rewrite Rwf. cbn. trivial.
   - exact (correct bb).
 Qed.
diff --git a/kvx/PostpassScheduling.v b/kvx/PostpassScheduling.v
index 7518866d..1f1f238a 100644
--- a/kvx/PostpassScheduling.v
+++ b/kvx/PostpassScheduling.v
@@ -12,6 +12,8 @@
 (*                                                             *)
 (* *************************************************************)
 
+(** Implementation (and basic properties) of the verified postpass scheduler *)
+
 Require Import Coqlib Errors AST Integers.
 Require Import Asmblock Axioms Memory Globalenvs.
 Require Import Asmblockdeps Asmblockgenproof0 Asmblockprops.
@@ -19,20 +21,13 @@ Require Peephole.
 
 Local Open Scope error_monad_scope.
 
-(** Oracle taking as input a basic block,
-    returns a schedule expressed as a list of bundles *)
+(** * Oracle taking as input a basic block,
+    returns a scheduled list of bundles *)
 Axiom schedule: bblock -> (list (list basic)) * option control.
 
 Extract Constant schedule => "PostpassSchedulingOracle.schedule".
 
-Definition state' := L.mem.
-Definition outcome' := option state'.
-
-Definition bblock' := L.bblock.
-
-Definition exec' := L.run.
-
-Definition exec := exec_bblock.
+(** * Concat all bundles into one big basic block *)
 
 (* Lemmas necessary for defining concat_all *)
 Lemma app_nonil {A: Type} (l l': list A) : l <> nil -> l ++ l' <> nil.
@@ -49,8 +44,6 @@ Proof.
   - intros. rewrite <- app_comm_cons. discriminate.
 Qed.
 
-
-
 Definition check_size bb :=
   if zlt Ptrofs.max_unsigned (size bb)
     then Error (msg "PostpassSchedulingproof.check_size")
@@ -213,6 +206,8 @@ Qed.
 Inductive is_concat : bblock -> list bblock -> Prop :=
   | mk_is_concat: forall tbb lbb, concat_all lbb = OK tbb -> is_concat tbb lbb.
 
+(** * Remainder of the verified scheduler *)
+
 Definition verify_schedule (bb bb' : bblock) : res unit :=
   match bblock_simub bb bb' with
   | true => OK tt
diff --git a/kvx/PostpassSchedulingOracle.ml b/kvx/PostpassSchedulingOracle.ml
index 822c0dc0..2107ce22 100644
--- a/kvx/PostpassSchedulingOracle.ml
+++ b/kvx/PostpassSchedulingOracle.ml
@@ -504,8 +504,7 @@ let alu_lite_y : int array = let resmap = fun r -> match r with
   | Rissue -> 3 | Rtiny -> 1 | Rlite -> 1 |  _ -> 0
   in Array.of_list (List.map resmap resource_names)
 
-let alu_nop : int array = let resmap = fun r -> match r with 
-  | Rissue -> 1 | Rnop -> 1 | _ -> 0
+let alu_nop : int array = let resmap = fun r -> 0
   in Array.of_list (List.map resmap resource_names)
 
 let alu_tiny : int array = let resmap = fun r -> match r with
@@ -627,16 +626,16 @@ let rec_to_usage r =
                           | Some U27L5 | Some U27L10 -> alu_tiny_x 
                           | Some E27U27L10 -> alu_tiny_y 
                           | _ -> raise InvalidEncoding)
-  | Maddw -> (match encoding with None -> mau_auxr
+  | Maddw | Msbfw -> (match encoding with None -> mau_auxr
                 | Some U6 | Some S10 | Some U27L5 -> mau_auxr_x
                 | _ -> raise InvalidEncoding)
-  | Maddd -> (match encoding with None | Some U6 | Some S10 -> mau_auxr
+  | Maddd | Msbfd -> (match encoding with None | Some U6 | Some S10 -> mau_auxr
                 | Some U27L5 | Some U27L10 -> mau_auxr_x
                 | Some E27U27L10 -> mau_auxr_y)
-  | Mulw| Msbfw -> (match encoding with None -> mau
+  | Mulw -> (match encoding with None -> mau
                                 | Some U6 | Some S10 | Some U27L5 -> mau_x
                                 | _ -> raise InvalidEncoding)
-  | Muld | Msbfd -> (match encoding with None | Some U6 | Some S10 -> mau
+  | Muld -> (match encoding with None | Some U6 | Some S10 -> mau
                                 | Some U27L5 | Some U27L10 -> mau_x
                                 | Some E27U27L10 -> mau_y)
   | Nop -> alu_nop
@@ -914,19 +913,20 @@ let print_bb oc bb =
   let asm_instructions = Asm.unfold_bblock bb
   in List.iter (print_inst oc) asm_instructions
 
+let print_schedule sched =
+  print_string "[ ";
+  Array.iter (fun x -> Printf.printf "%d; " x) sched;
+  print_endline "]";;
+
 let do_schedule bb =
-  let problem = build_problem bb
-  in let solution = (if !Clflags.option_fpostpass_sched = "ilp" then
-                      validated_scheduler cascaded_scheduler
-                    else if !Clflags.option_fpostpass_sched = "list" then
-                      validated_scheduler list_scheduler
-                    else if !Clflags.option_fpostpass_sched = "revlist" then
-                      validated_scheduler reverse_list_scheduler
-                    else if !Clflags.option_fpostpass_sched = "greedy" then
-                      greedy_scheduler else failwith ("Invalid scheduler:" ^ !Clflags.option_fpostpass_sched)) problem
+  let problem = build_problem bb in
+  (if debug then print_problem stdout problem);
+  let solution = scheduler_by_name (!Clflags.option_fpostpass_sched) problem
   in match solution with
   | None -> failwith "Could not find a valid schedule"
-  | Some sol -> let bundles = bundlize_solution bb sol in 
+  | Some sol ->
+     ((if debug then print_schedule sol);
+     let bundles = bundlize_solution bb sol in 
       (if debug then
       begin
         Printf.eprintf "Scheduling the following group of instructions:\n";
@@ -935,7 +935,7 @@ let do_schedule bb =
         List.iter (print_bb stderr) bundles;
         Printf.eprintf "--------------------------------\n"
       end;
-      bundles)
+      bundles))
 
 (**
  * Dumb schedule if the above doesn't work
diff --git a/kvx/SelectOp.vp b/kvx/SelectOp.vp
index 9e5d45a0..65dba3ac 100644
--- a/kvx/SelectOp.vp
+++ b/kvx/SelectOp.vp
@@ -103,8 +103,14 @@ Nondetfunction select0 (ty : typ) (cond0 : condition0) (e1 e2 e3: expr) :=
   | _, _, _ => (Eop (Osel cond0 ty) (e1 ::: e2 ::: e3 ::: Enil))
   end.
 
+Definition same_expr_pure (e1 e2: expr) :=
+  match e1, e2 with
+  | Evar v1, Evar v2 => if ident_eq v1 v2 then true else false
+  | _, _ => false
+  end.
+
 Definition select (ty : typ) (cond : condition) (args : exprlist) (e1 e2: expr)  : option expr :=
-  Some(
+  Some (if same_expr_pure e1 e2 then e1 else
   match cond_to_condition0 cond args with
   | None => select0 ty (Ccomp0 Cne) e1 e2 (Eop (Ocmp cond) args)
   | Some(cond0, ec) => select0 ty cond0 e1 e2 ec
@@ -356,12 +362,6 @@ Nondetfunction orimm (n1: int) (e2: expr) :=
        | _ => Eop (Oorimm n1) (e2:::Enil)
   end.
 
-Definition same_expr_pure (e1 e2: expr) :=
-  match e1, e2 with
-  | Evar v1, Evar v2 => if ident_eq v1 v2 then true else false
-  | _, _ => false
-  end.
-
 Nondetfunction or (e1: expr) (e2: expr) :=
   match e1, e2 with
   | Eop (Ointconst n1) Enil, t2 => orimm n1 t2
diff --git a/kvx/SelectOpproof.v b/kvx/SelectOpproof.v
index d1d0b95c..8c834de5 100644
--- a/kvx/SelectOpproof.v
+++ b/kvx/SelectOpproof.v
@@ -1548,6 +1548,15 @@ Proof.
   intros until b.
   intro Hop; injection Hop; clear Hop; intro; subst a.
   intros HeL He1 He2 HeC.
+  destruct same_expr_pure eqn:SAME.
+  {
+    destruct (eval_same_expr a1 a2 le v1 v2 SAME He1 He2) as [EQ1 EQ2].
+    subst a2. subst v2.
+    exists v1; split; trivial.
+    cbn.
+    rewrite if_same.
+    apply Val.lessdef_normalize.
+  }
   unfold cond_to_condition0.
   destruct (cond_to_condition0_match cond al).
   {
diff --git a/kvx/Stacklayout.v b/kvx/Stacklayout.v
index 46202e03..81ffcebb 100644
--- a/kvx/Stacklayout.v
+++ b/kvx/Stacklayout.v
@@ -63,7 +63,7 @@ Lemma frame_env_separated:
        ** P.
 Proof.
 Local Opaque Z.add Z.mul sepconj range.
-  intros; simpl.
+  intros; cbn.
   set (w := if Archi.ptr64 then 8 else 4).
   set (olink := align (4 * b.(bound_outgoing)) w).
   set (oretaddr := olink + w).
@@ -105,7 +105,7 @@ Lemma frame_env_range:
   let fe := make_env b in
   0 <= fe_stack_data fe /\ fe_stack_data fe + bound_stack_data b <= fe_size fe.
 Proof.
-  intros; simpl.
+  intros; cbn.
   set (w := if Archi.ptr64 then 8 else 4).
   set (olink := align (4 * b.(bound_outgoing)) w).
   set (oretaddr := olink + w).
@@ -133,7 +133,7 @@ Lemma frame_env_aligned:
   /\ (align_chunk Mptr | fe_ofs_link fe)
   /\ (align_chunk Mptr | fe_ofs_retaddr fe).
 Proof.
-  intros; simpl.
+  intros; cbn.
   set (w := if Archi.ptr64 then 8 else 4).
   set (olink := align (4 * b.(bound_outgoing)) w).
   set (oretaddr := olink + w).
diff --git a/kvx/ValueAOp.v b/kvx/ValueAOp.v
index e634fdc0..122c9a60 100644
--- a/kvx/ValueAOp.v
+++ b/kvx/ValueAOp.v
@@ -406,8 +406,8 @@ Lemma intoffloat_total_sound:
     vmatch bc (Val.maketotal (Val.intoffloat v)) (intoffloat_total x).
 Proof.
   unfold Val.intoffloat, intoffloat_total. intros.
-  inv MATCH; simpl in *; try constructor.
-  all: destruct (Float.to_int f) as [i|] eqn:E; simpl; [auto with va | constructor].
+  inv MATCH; cbn in *; try constructor.
+  all: destruct (Float.to_int f) as [i|] eqn:E; cbn; [auto with va | constructor].
   unfold ntop1, provenance.
   destruct (va_strict tt); constructor.
 Qed.
@@ -420,8 +420,8 @@ Lemma intuoffloat_total_sound:
     vmatch bc (Val.maketotal (Val.intuoffloat v)) (intuoffloat_total x).
 Proof.
   unfold Val.intoffloat, intoffloat_total. intros.
-  inv MATCH; simpl in *; try constructor.
-  all: destruct (Float.to_intu f) as [i|] eqn:E; simpl; [auto with va | constructor].
+  inv MATCH; cbn in *; try constructor.
+  all: destruct (Float.to_intu f) as [i|] eqn:E; cbn; [auto with va | constructor].
   unfold ntop1, provenance.
   destruct (va_strict tt); constructor.
 Qed.
@@ -434,8 +434,8 @@ Lemma intofsingle_total_sound:
     vmatch bc (Val.maketotal (Val.intofsingle v)) (intofsingle_total x).
 Proof.
   unfold Val.intofsingle, intofsingle_total. intros.
-  inv MATCH; simpl in *; try constructor.
-  all: destruct (Float32.to_int f) as [i|] eqn:E; simpl; [auto with va | constructor].
+  inv MATCH; cbn in *; try constructor.
+  all: destruct (Float32.to_int f) as [i|] eqn:E; cbn; [auto with va | constructor].
   unfold ntop1, provenance.
   destruct (va_strict tt); constructor.
 Qed.
@@ -448,8 +448,8 @@ Lemma intuofsingle_total_sound:
     vmatch bc (Val.maketotal (Val.intuofsingle v)) (intuofsingle_total x).
 Proof.
   unfold Val.intofsingle, intofsingle_total. intros.
-  inv MATCH; simpl in *; try constructor.
-  all: destruct (Float32.to_intu f) as [i|] eqn:E; simpl; [auto with va | constructor].
+  inv MATCH; cbn in *; try constructor.
+  all: destruct (Float32.to_intu f) as [i|] eqn:E; cbn; [auto with va | constructor].
   unfold ntop1, provenance.
   destruct (va_strict tt); constructor.
 Qed.
@@ -461,7 +461,7 @@ Lemma singleofint_total_sound:
               vmatch bc (Val.maketotal (Val.singleofint v)) (singleofint x).
 Proof.
   unfold Val.singleofint, singleofint; intros.
-  inv H; simpl.
+  inv H; cbn.
   all: auto with va.
   all: unfold ntop1, provenance.
   all: try constructor.
@@ -474,7 +474,7 @@ Lemma singleofintu_total_sound:
               vmatch bc (Val.maketotal (Val.singleofintu v)) (singleofintu x).
 Proof.
   unfold Val.singleofintu, singleofintu; intros.
-  inv H; simpl.
+  inv H; cbn.
   all: auto with va.
   all: unfold ntop1, provenance.
   all: try constructor.
@@ -488,8 +488,8 @@ Lemma longoffloat_total_sound:
     vmatch bc (Val.maketotal (Val.longoffloat v)) (longoffloat_total x).
 Proof.
   unfold Val.longoffloat, longoffloat_total. intros.
-  inv MATCH; simpl in *; try constructor.
-  all: destruct (Float.to_long f) as [i|] eqn:E; simpl; [auto with va | constructor].
+  inv MATCH; cbn in *; try constructor.
+  all: destruct (Float.to_long f) as [i|] eqn:E; cbn; [auto with va | constructor].
   unfold ntop1, provenance.
   destruct (va_strict tt); constructor.
 Qed.
@@ -502,8 +502,8 @@ Lemma longuoffloat_total_sound:
     vmatch bc (Val.maketotal (Val.longuoffloat v)) (longuoffloat_total x).
 Proof.
   unfold Val.longoffloat, longoffloat_total. intros.
-  inv MATCH; simpl in *; try constructor.
-  all: destruct (Float.to_longu f) as [i|] eqn:E; simpl; [auto with va | constructor].
+  inv MATCH; cbn in *; try constructor.
+  all: destruct (Float.to_longu f) as [i|] eqn:E; cbn; [auto with va | constructor].
   unfold ntop1, provenance.
   destruct (va_strict tt); constructor.
 Qed.
@@ -516,8 +516,8 @@ Lemma longofsingle_total_sound:
     vmatch bc (Val.maketotal (Val.longofsingle v)) (longofsingle_total x).
 Proof.
   unfold Val.longofsingle, longofsingle_total. intros.
-  inv MATCH; simpl in *; try constructor.
-  all: destruct (Float32.to_long f) as [i|] eqn:E; simpl; [auto with va | constructor].
+  inv MATCH; cbn in *; try constructor.
+  all: destruct (Float32.to_long f) as [i|] eqn:E; cbn; [auto with va | constructor].
   unfold ntop1, provenance.
   destruct (va_strict tt); constructor.
 Qed.
@@ -530,8 +530,8 @@ Lemma longuofsingle_total_sound:
     vmatch bc (Val.maketotal (Val.longuofsingle v)) (longuofsingle_total x).
 Proof.
   unfold Val.longofsingle, longofsingle_total. intros.
-  inv MATCH; simpl in *; try constructor.
-  all: destruct (Float32.to_longu f) as [i|] eqn:E; simpl; [auto with va | constructor].
+  inv MATCH; cbn in *; try constructor.
+  all: destruct (Float32.to_longu f) as [i|] eqn:E; cbn; [auto with va | constructor].
   unfold ntop1, provenance.
   destruct (va_strict tt); constructor.
 Qed.
@@ -543,7 +543,7 @@ Lemma singleoflong_total_sound:
               vmatch bc (Val.maketotal (Val.singleoflong v)) (singleoflong x).
 Proof.
   unfold Val.singleoflong, singleoflong; intros.
-  inv H; simpl.
+  inv H; cbn.
   all: auto with va.
   all: unfold ntop1, provenance.
   all: try constructor.
@@ -556,7 +556,7 @@ Lemma singleoflongu_total_sound:
               vmatch bc (Val.maketotal (Val.singleoflongu v)) (singleoflongu x).
 Proof.
   unfold Val.singleoflongu, singleoflongu; intros.
-  inv H; simpl.
+  inv H; cbn.
   all: auto with va.
   all: unfold ntop1, provenance.
   all: try constructor.
@@ -569,7 +569,7 @@ Lemma floatoflong_total_sound:
               vmatch bc (Val.maketotal (Val.floatoflong v)) (floatoflong x).
 Proof.
   unfold Val.floatoflong, floatoflong; intros.
-  inv H; simpl.
+  inv H; cbn.
   all: auto with va.
   all: unfold ntop1, provenance.
   all: try constructor.
@@ -582,7 +582,7 @@ Lemma floatoflongu_total_sound:
               vmatch bc (Val.maketotal (Val.floatoflongu v)) (floatoflongu x).
 Proof.
   unfold Val.floatoflongu, floatoflongu; intros.
-  inv H; simpl.
+  inv H; cbn.
   all: auto with va.
   all: unfold ntop1, provenance.
   all: try constructor.
@@ -620,7 +620,7 @@ Proof.
   intros v x;
   intro MATCH;
   inversion MATCH;
-  simpl;
+  cbn;
   constructor.
 Qed.
 
@@ -632,9 +632,9 @@ Lemma triple_op_float_sound:
 Proof.
   intros until z.
   intros Hax Hby Hcz.
-  inv Hax; simpl; try constructor;
-  inv Hby; simpl; try constructor;
-  inv Hcz; simpl; try constructor.
+  inv Hax; cbn; try constructor;
+  inv Hby; cbn; try constructor;
+  inv Hcz; cbn; try constructor.
 Qed.
 
 Lemma triple_op_single_sound:
@@ -645,9 +645,9 @@ Lemma triple_op_single_sound:
 Proof.
   intros until z.
   intros Hax Hby Hcz.
-  inv Hax; simpl; try constructor;
-  inv Hby; simpl; try constructor;
-  inv Hcz; simpl; try constructor.
+  inv Hax; cbn; try constructor;
+  inv Hby; cbn; try constructor;
+  inv Hcz; cbn; try constructor.
 Qed.
 
 Lemma fmaddf_sound :
@@ -691,9 +691,9 @@ Proof.
   intros until aargs; intros VM. inv VM.
   destruct cond; auto with va.
   inv H0.
-  destruct cond; simpl; eauto with va.
+  destruct cond; cbn; eauto with va.
   inv H2.
-  destruct cond; simpl; eauto with va.
+  destruct cond; cbn; eauto with va.
   destruct cond; auto with va.
 Qed.
 
@@ -703,7 +703,7 @@ Theorem eval_static_condition0_sound:
   cmatch (eval_condition0 cond varg m) (eval_static_condition0 cond aarg).
 Proof.
   intros until aarg; intro VM.
-  destruct cond; simpl; eauto with va.
+  destruct cond; cbn; eauto with va.
 Qed.
 
 Lemma symbol_address_sound:
@@ -812,8 +812,8 @@ Proof.
     + eauto with va.
     + destruct n; destruct shift; reflexivity.
   - (* shrx *)
-    inv H1; simpl; try constructor.
-    all: destruct Int.ltu; [simpl | constructor; fail].
+    inv H1; cbn; try constructor.
+    all: destruct Int.ltu; [cbn | constructor; fail].
     all: auto with va.
   - replace (match Val.shll a1 (Vint (int_of_shift1_4 shift)) with
     | Vlong n2 => Vlong (Int64.add n n2)
@@ -833,8 +833,8 @@ Proof.
     + eauto with va.
     + destruct a1; destruct shift; reflexivity.
   - (* shrxl *)
-    inv H1; simpl; try constructor.
-    all: destruct Int.ltu; [simpl | constructor; fail].
+    inv H1; cbn; try constructor.
+    all: destruct Int.ltu; [cbn | constructor; fail].
     all: auto with va.
   - apply of_optbool_sound. eapply eval_static_condition_sound; eauto.
 
@@ -865,12 +865,12 @@ Proof.
   (* insf *)
   - unfold insf, eval_static_insf.
     destruct (is_bitfield _ _).
-    + inv H1; inv H0; simpl; try constructor; destruct (Int.ltu _ _); simpl; constructor.
+    + inv H1; inv H0; cbn; try constructor; destruct (Int.ltu _ _); cbn; constructor.
     + constructor.
   (* insfl *)
   - unfold insfl, eval_static_insfl.
     destruct (is_bitfieldl _ _).
-    + inv H1; inv H0; simpl; try constructor; destruct (Int.ltu _ _); simpl; constructor.
+    + inv H1; inv H0; cbn; try constructor; destruct (Int.ltu _ _); cbn; constructor.
     + constructor.
     (* select *)
   - apply select_sound; auto. eapply eval_static_condition0_sound; eauto.
diff --git a/kvx/abstractbb/AbstractBasicBlocksDef.v b/kvx/abstractbb/AbstractBasicBlocksDef.v
index 0b1c502d..6960f363 100644
--- a/kvx/abstractbb/AbstractBasicBlocksDef.v
+++ b/kvx/abstractbb/AbstractBasicBlocksDef.v
@@ -45,7 +45,7 @@ End LangParam.
 
 
 
-(** * Syntax and (sequential) semantics of "basic blocks" *)
+(** * Syntax and (sequential) semantics of "abstract basic blocks" *)
 Module MkSeqLanguage(P: LangParam).
 
 Export P.
@@ -62,12 +62,12 @@ Definition assign (m: mem) (x:R.t) (v: value): mem
   := fun y => if R.eq_dec x y then v else m y.
 
 
-(** expressions *)
+(** Expressions *)
 
 Inductive exp :=
-  | PReg (x:R.t)
-  | Op (o:op) (le: list_exp)
-  | Old (e: exp)
+  | PReg (x:R.t)              (**r pseudo-register *)
+  | Op (o:op) (le: list_exp)  (**r operation *)
+  | Old (e: exp)              (**r evaluation of [e] in the initial state of the instruction (see [inst] below) *)
 with list_exp :=
   | Enil
   | Econs (e:exp) (le:list_exp)
@@ -95,7 +95,8 @@ with list_exp_eval (le: list_exp) (m old: mem): option (list value) :=
   | LOld le => list_exp_eval le old old
   end.
 
-Definition inst := list (R.t * exp). (* = a sequence of assignments *) 
+(** An instruction represents a sequence of assignments where [Old] refers to the initial state of the sequence. *) 
+Definition inst := list (R.t * exp). 
 
 Fixpoint inst_run (i: inst) (m old: mem): option mem :=
   match i with
@@ -107,6 +108,7 @@ Fixpoint inst_run (i: inst) (m old: mem): option mem :=
      end
   end.
 
+(** A basic block is a sequence of instructions. *) 
 Definition bblock := list inst.
 
 Fixpoint run (p: bblock) (m: mem): option mem :=
@@ -168,7 +170,7 @@ Lemma exp_equiv e old1 old2:
    (exp_eval e m1 old1) = (exp_eval e m2 old2).
 Proof.
   intros H1.
-  induction e using exp_mut with (P0:=fun l =>  forall m1 m2, (forall x, m1 x = m2 x) -> list_exp_eval l m1 old1 = list_exp_eval l m2 old2); simpl; try congruence; auto.
+  induction e using exp_mut with (P0:=fun l =>  forall m1 m2, (forall x, m1 x = m2 x) -> list_exp_eval l m1 old1 = list_exp_eval l m2 old2); cbn; try congruence; auto.
   - intros; erewrite IHe; eauto.
   - intros; erewrite IHe, IHe0; auto.
 Qed.
@@ -181,38 +183,38 @@ Lemma inst_equiv_refl i old1 old2:
   forall m1 m2, (forall x, m1 x = m2 x) -> 
   res_eq (inst_run i m1 old1) (inst_run i m2 old2).
 Proof.
-  intro H; induction i as [ | [x e]]; simpl; eauto.
+  intro H; induction i as [ | [x e]]; cbn; eauto.
   intros m1 m2 H1. erewrite exp_equiv; eauto.
-  destruct (exp_eval e m2 old2); simpl; auto.
+  destruct (exp_eval e m2 old2); cbn; auto.
   apply IHi.
   unfold assign; intro y. destruct (R.eq_dec x y); auto. 
 Qed.
 
 Lemma bblock_equiv_refl p: forall m1 m2, (forall x, m1 x = m2 x) -> res_eq (run p m1) (run p m2).
 Proof.
-  induction p as [ | i p']; simpl; eauto.
+  induction p as [ | i p']; cbn; eauto.
   intros m1 m2 H; lapply (inst_equiv_refl i m1 m2); auto.
   intros X; lapply (X m1 m2); auto; clear X.
-  destruct (inst_run i m1 m1); simpl.
-  - intros [m3 [H1 H2]]; rewrite H1; simpl; auto.
-  - intros H1; rewrite H1; simpl; auto.
+  destruct (inst_run i m1 m1); cbn.
+  - intros [m3 [H1 H2]]; rewrite H1; cbn; auto.
+  - intros H1; rewrite H1; cbn; auto.
 Qed.
 
 Lemma res_eq_sym om1 om2: res_eq om1 om2 -> res_eq om2 om1.
 Proof.
-  destruct om1; simpl.
-  - intros [m2 [H1 H2]]; subst; simpl. eauto.
-  - intros; subst; simpl; eauto.
+  destruct om1; cbn.
+  - intros [m2 [H1 H2]]; subst; cbn. eauto.
+  - intros; subst; cbn; eauto.
 Qed.
 
 Lemma res_eq_trans (om1 om2 om3: option mem): 
   (res_eq om1 om2) -> (res_eq om2 om3) -> (res_eq om1 om3).
 Proof.
-  destruct om1; simpl.
-  - intros [m2 [H1 H2]]; subst; simpl.
-    intros [m3 [H3 H4]]; subst; simpl.
+  destruct om1; cbn.
+  - intros [m2 [H1 H2]]; subst; cbn.
+    intros [m3 [H3 H4]]; subst; cbn.
     eapply ex_intro; intuition eauto. rewrite H2; auto.
-  - intro; subst; simpl; auto.
+  - intro; subst; cbn; auto.
 Qed.
 
 Lemma bblock_simu_alt p1 p2: bblock_simu p1 p2 <-> (forall m1 m2,  (forall x, m1 x = m2 x) -> (run p1 m1)<>None -> res_eq (run p1 m1) (run p2 m2)).
@@ -230,8 +232,8 @@ Lemma run_app p1: forall m1 p2,
      | None => None
      end.
 Proof.
-   induction p1; simpl; try congruence.
-   intros; destruct (inst_run _ _ _); simpl; auto.
+   induction p1; cbn; try congruence.
+   intros; destruct (inst_run _ _ _); cbn; auto.
 Qed.
 
 Lemma run_app_None p1 m1 p2:
@@ -250,12 +252,16 @@ Qed.
 
 End SEQLANG.
 
-Module Terms.
 
-(** terms in the symbolic evaluation 
-NB: such a term represents the successive computations in one given pseudo-register
+(** * Terms in the symbolic execution *)
+
+(** Such a term represents the successive computations in one given pseudo-register. 
+The [hid] has no formal semantics: it is only used by the hash-consing oracle (itself dynamically checked to behave like an identity function).
+
 *)
 
+Module Terms.
+
 Inductive term :=
   | Input (x:R.t) (hid:hashcode)
   | App (o: op) (l: list_term) (hid:hashcode)
@@ -328,17 +334,27 @@ Fixpoint allvalid ge (l: list term) m : Prop :=
 Lemma allvalid_extensionality ge (l: list term) m:
   allvalid ge l m <-> (forall t, List.In t l -> term_eval ge t m <> None).
 Proof.
-  induction l as [|t l]; simpl; try (tauto).
+  induction l as [|t l]; cbn; try (tauto).
   destruct l.
   - intuition (congruence || eauto).
   - rewrite IHl; clear IHl. intuition (congruence || eauto).
 Qed.
 
+(** * Rewriting rules in the symbolic execution *)
+
+(** The symbolic execution is parametrized by rewriting rules on pseudo-terms. *)
+
 Record pseudo_term: Type := intro_fail {
   mayfail: list term;
   effect: term
 }.
 
+(** Simulation relation between a term and a pseudo-term *)
+
+Definition match_pt (t: term) (pt: pseudo_term) :=
+      (forall ge m, term_eval ge t m <> None <-> allvalid ge pt.(mayfail) m)
+   /\ (forall ge m0 m1, term_eval ge t m0 = Some m1 -> term_eval ge pt.(effect) m0 = Some m1).
+
 Lemma inf_option_equivalence (A:Type) (o1 o2: option A):
   (o1 <> None -> o1 = o2) <-> (forall m1, o1 = Some m1 -> o2 = Some m1).
 Proof.
@@ -346,26 +362,24 @@ Proof.
   symmetry; eauto.
 Qed.
 
-Definition match_pt (t: term) (pt: pseudo_term) :=
-      (forall ge m, term_eval ge t m <> None <-> allvalid ge pt.(mayfail) m)
-   /\ (forall ge m0 m1, term_eval ge t m0 = Some m1 -> term_eval ge pt.(effect) m0 = Some m1).
-
 Lemma intro_fail_correct (l: list term) (t: term) :
    (forall ge m, term_eval ge t m <> None <-> allvalid ge l m) -> match_pt t (intro_fail l t).
 Proof.
-  unfold match_pt; simpl; intros; intuition congruence.
+  unfold match_pt; cbn; intros; intuition congruence.
 Qed.
 Hint Resolve intro_fail_correct: wlp.
 
+(** The default reduction of a term to a pseudo-term *) 
 Definition identity_fail (t: term):= intro_fail [t] t.
 
 Lemma identity_fail_correct (t: term): match_pt t (identity_fail t).
 Proof.
-  eapply intro_fail_correct; simpl; tauto.
+  eapply intro_fail_correct; cbn; tauto.
 Qed.
 Global Opaque identity_fail.
 Hint Resolve identity_fail_correct: wlp.
 
+(** The reduction for constant term *) 
 Definition nofail (is_constant: op -> bool) (t: term):=
     match t with
     | Input x _ => intro_fail ([])%list t
@@ -376,15 +390,16 @@ Definition nofail (is_constant: op -> bool) (t: term):=
 Lemma nofail_correct (is_constant: op -> bool) t:
  (forall ge o, is_constant o = true -> op_eval ge o nil <> None) -> match_pt t (nofail is_constant t).
 Proof.
-  destruct t; simpl.
-  + intros; eapply intro_fail_correct; simpl; intuition congruence.
-  + intros; destruct l; simpl; auto with wlp.
-    destruct (is_constant o) eqn:Heqo; simpl; intuition eauto with wlp.
-     eapply intro_fail_correct; simpl; intuition eauto with wlp.
+  destruct t; cbn.
+  + intros; eapply intro_fail_correct; cbn; intuition congruence.
+  + intros; destruct l; cbn; auto with wlp.
+    destruct (is_constant o) eqn:Heqo; cbn; intuition eauto with wlp.
+     eapply intro_fail_correct; cbn; intuition eauto with wlp.
 Qed.
 Global Opaque nofail.
 Hint Resolve nofail_correct: wlp.
 
+(** Term equivalence preserve the simulation by pseudo-terms *)
 Definition term_equiv t1 t2:= forall ge m, term_eval ge t1 m = term_eval ge t2 m.
 
 Global Instance term_equiv_Equivalence : Equivalence term_equiv.
@@ -401,6 +416,7 @@ Proof.
 Qed.
 Hint Resolve match_pt_term_equiv: wlp.
 
+(** Other generic reductions *)
 Definition app_fail (l: list term) (pt: pseudo_term): pseudo_term :=
   {| mayfail := List.rev_append l pt.(mayfail); effect := pt.(effect) |}.
 
@@ -409,7 +425,7 @@ Lemma app_fail_allvalid_correct l pt t1 t2: forall
   (V2: forall (ge : genv) (m : mem), term_eval ge t2 m <> None <-> allvalid ge (mayfail {| mayfail := t1 :: l; effect := t1 |}) m)
   (ge : genv) (m : mem), term_eval ge t2 m <> None <-> allvalid ge (mayfail (app_fail l pt)) m.
 Proof.
-  intros; generalize (V1 ge m) (V2 ge m); rewrite !allvalid_extensionality; simpl. clear V1 V2.
+  intros; generalize (V1 ge m) (V2 ge m); rewrite !allvalid_extensionality; cbn. clear V1 V2.
   intuition subst.
   + rewrite rev_append_rev, in_app_iff, <- in_rev in H3. destruct H3; eauto.
   + eapply H3; eauto.
@@ -431,6 +447,8 @@ Extraction Inline app_fail.
 Import ImpCore.Notations.
 Local Open Scope impure_scope.
 
+(** Specification of rewriting functions in parameter of the symbolic execution: in the impure monad, because the rewriting functions produce hash-consed terms (wrapped in pseudo-terms).
+*)
 Record reduction:= {
   result:> term -> ?? pseudo_term;
   result_correct: forall t, WHEN result t ~> pt THEN match_pt t pt;
diff --git a/kvx/abstractbb/ImpSimuTest.v b/kvx/abstractbb/ImpSimuTest.v
index c914eee1..b1a3b985 100644
--- a/kvx/abstractbb/ImpSimuTest.v
+++ b/kvx/abstractbb/ImpSimuTest.v
@@ -10,13 +10,16 @@
 (*                                                             *)
 (* *************************************************************)
 
-(** Implementation of a symbolic execution of sequential semantics of Abstract Basic Blocks
+(** Implementation of a simulation test (ie a "scheduling verifier") for the sequential semantics of Abstract Basic Blocks.
+
+It is based on a symbolic execution procedure of Abstract Basic Blocks with imperative hash-consing and rewriting.
+
+It also provides debugging information when the test fails.
 
-with imperative hash-consing, and rewriting.
 
 *)
 
-Require Export Impure.ImpHCons.
+Require Export Impure.ImpHCons. (**r Import the Impure library. See https://github.com/boulme/ImpureDemo *)
 Export Notations.
 Import HConsing.
 
@@ -32,6 +35,7 @@ Import ListNotations.
 Local Open Scope list_scope.
 
 
+(** * Interface of (impure) equality tests for operators *)
 Module Type ImpParam.
 
 Include LangParam.
@@ -54,6 +58,8 @@ Include MkSeqLanguage LP.
 End ISeqLanguage.
 
 
+(** * A generic dictinary on PseudoRegisters with an impure equality test  *)
+
 Module Type ImpDict.
 
 Declare Module R: PseudoRegisters.
@@ -91,26 +97,27 @@ Parameter eq_test_correct: forall A (d1 d2: t A),
 
 (* NB: we could also take an eq_test on R.t (but not really useful with "pure" dictionaries  *)
 
-
-(* only for debugging *)
+(** only for debugging *)
 Parameter not_eq_witness: forall {A}, t A -> t A -> ?? option R.t.
 
 End ImpDict.
 
 
+(** * Specification of the provided tests *)
 Module Type ImpSimuInterface.
 
 Declare Module CoreL: ISeqLanguage.
 Import CoreL.
 Import Terms.
 
+(** the silent test (without debugging informations) *)
 Parameter bblock_simu_test: reduction -> bblock -> bblock -> ?? bool.
 
 Parameter bblock_simu_test_correct: forall reduce (p1 p2 : bblock),
     WHEN bblock_simu_test reduce p1 p2 ~> b
     THEN b = true -> forall ge : genv, bblock_simu ge p1 p2.
 
-
+(** the verbose test extended with debugging informations *)
 Parameter verb_bblock_simu_test
      : reduction ->
        (R.t -> ?? pstring) ->
@@ -127,6 +134,7 @@ Parameter verb_bblock_simu_test_correct:
 End ImpSimuInterface.
 
 
+(** * Implementation of the provided tests *)
 
 Module ImpSimu (L: ISeqLanguage) (Dict: ImpDict with Module R:=L.LP.R): ImpSimuInterface with Module CoreL := L.
 
@@ -152,13 +160,13 @@ Definition list_term_set_hid (l: list_term) (hid: hashcode): list_term :=
 Lemma term_eval_set_hid ge t hid m:
   term_eval ge (term_set_hid t hid) m = term_eval ge t m.
 Proof.
-  destruct t; simpl; auto.
+  destruct t; cbn; auto.
 Qed.
 
 Lemma list_term_eval_set_hid ge l hid m:
   list_term_eval ge (list_term_set_hid l hid) m = list_term_eval ge l m.
 Proof.
-  destruct l; simpl; auto.
+  destruct l; cbn; auto.
 Qed.
 
 (* Local nickname *)
@@ -168,7 +176,7 @@ Section SimuWithReduce.
 
 Variable reduce: reduction.
 
-Section CanonBuilding.
+Section CanonBuilding. (** Implementation of the symbolic execution (ie a "canonical form" representing the semantics of an abstract basic block) *)
 
 Variable hC_term: hashinfo term -> ?? term.
 Hypothesis hC_term_correct: forall t, WHEN hC_term t ~> t' THEN forall ge m, term_eval ge (hdata t) m = term_eval ge t' m.
@@ -307,7 +315,7 @@ Proof.
     destruct (DM0 m) as (PRE & VALID0); clear DM0.
     assert (VALID1: allvalid ge hd.(hpre) m -> pre d ge m). { unfold smem_valid in PRE; tauto. }
     assert (VALID2: allvalid ge hd.(hpre) m -> forall x : Dict.R.t, ST.term_eval ge (d x) m <> None). { unfold smem_valid in PRE; tauto. }
-    rewrite !allvalid_extensionality in * |- *; simpl.
+    rewrite !allvalid_extensionality in * |- *; cbn.
     intuition (subst; eauto).
     + eapply smem_valid_set_proof; eauto.
       erewrite <- EQT; eauto.
@@ -315,11 +323,11 @@ Proof.
       intros X1; exploit smem_valid_set_decompose_2; eauto.
       rewrite <- EQT; eauto.
     + exploit smem_valid_set_decompose_1; eauto.
-  - clear DM0. unfold hsmem_post_eval, hsmem_post_eval in * |- *; simpl.
+  - clear DM0. unfold hsmem_post_eval, hsmem_post_eval in * |- *; cbn.
     Local Hint Resolve smem_valid_set_decompose_1: core.
     intros; case (R.eq_dec x x0).
-    + intros; subst; rewrite !Dict.set_spec_eq; simpl; eauto.
-    + intros; rewrite !Dict.set_spec_diff; simpl; eauto.
+    + intros; subst; rewrite !Dict.set_spec_eq; cbn; eauto.
+    + intros; rewrite !Dict.set_spec_diff; cbn; eauto.
 Qed.
 Local Hint Resolve naive_set_correct: core.
 
@@ -396,10 +404,10 @@ Lemma hterm_append_correct l: forall lh,
   WHEN hterm_append l lh ~> lh' THEN (forall ge m, allvalid ge lh' m <-> (allvalid ge l m /\ allvalid ge lh m)).
 Proof.
   Local Hint Resolve eq_trans: localhint.
-  induction l as [|t l']; simpl; wlp_xsimplify ltac:(eauto with wlp).
+  induction l as [|t l']; cbn; wlp_xsimplify ltac:(eauto with wlp).
   - intros; rewrite! allvalid_extensionality; intuition eauto.
   - intros REC ge m; rewrite REC; clear IHl' REC. rewrite !allvalid_extensionality.
-    simpl; intuition (subst; eauto with wlp localhint).
+    cbn; intuition (subst; eauto with wlp localhint).
 Qed.
 (*Local Hint Resolve hterm_append_correct: wlp.*)
 Global Opaque hterm_append.
@@ -423,8 +431,8 @@ Lemma smart_set_correct hd x ht:
     forall ge m y, hsmem_post_eval ge d y m = hsmem_post_eval ge (Dict.set hd x ht) y m.
 Proof.
   destruct ht; wlp_simplify.
-  unfold hsmem_post_eval; simpl. case (R.eq_dec x0 y).
-  - intros; subst. rewrite Dict.set_spec_eq, Dict.rem_spec_eq. simpl; congruence.
+  unfold hsmem_post_eval; cbn. case (R.eq_dec x0 y).
+  - intros; subst. rewrite Dict.set_spec_eq, Dict.rem_spec_eq. cbn; congruence.
   - intros; rewrite Dict.set_spec_diff, Dict.rem_spec_diff; auto.
 Qed.
 (*Local Hint Resolve smart_set_correct: wlp.*)
@@ -448,17 +456,17 @@ Proof.
   generalize (hterm_append_correct _ _ _ Hexta0); intro APPEND.
   generalize (hterm_lift_correct _ _ Hexta1); intro LIFT.
   generalize (smart_set_correct _ _ _ _ Hexta3); intro SMART.
-  eapply equiv_hsmem_models; eauto; unfold equiv_hsmem; simpl. 
+  eapply equiv_hsmem_models; eauto; unfold equiv_hsmem; cbn. 
   destruct H as (VALID & EFFECT); split.
   - intros; rewrite APPEND, <- VALID.
-    rewrite !allvalid_extensionality in * |- *; simpl; intuition (subst; eauto).
+    rewrite !allvalid_extensionality in * |- *; cbn; intuition (subst; eauto).
   - intros m x0 ALLVALID; rewrite SMART.
     destruct (term_eval ge ht m) eqn: Hht.
     * case (R.eq_dec x x0).
-      + intros; subst. unfold hsmem_post_eval; simpl. rewrite !Dict.set_spec_eq.
+      + intros; subst. unfold hsmem_post_eval; cbn. rewrite !Dict.set_spec_eq.
         erewrite LIFT, EFFECT; eauto.
-      + intros; unfold hsmem_post_eval; simpl. rewrite !Dict.set_spec_diff; auto.
-    * rewrite allvalid_extensionality in ALLVALID; destruct (ALLVALID ht); simpl; auto.
+      + intros; unfold hsmem_post_eval; cbn. rewrite !Dict.set_spec_diff; auto.
+    * rewrite allvalid_extensionality in ALLVALID; destruct (ALLVALID ht); cbn; auto.
 Qed.
 Local Hint Resolve hsmem_set_correct: wlp.
 Global Opaque hsmem_set.
@@ -473,7 +481,7 @@ Proof.
   intro H.
   induction e using exp_mut with (P0:=fun le =>  forall d hd,
      smem_model ge d hd -> forall m, smem_valid ge d m -> smem_valid ge od m -> list_term_eval ge (list_exp_term le hd hod) m = list_term_eval ge (list_exp_term le d od) m); 
-     unfold smem_model in * |- * ; simpl; intuition eauto.
+     unfold smem_model in * |- * ; cbn; intuition eauto.
   - erewrite IHe; eauto.
   - erewrite IHe0, IHe; eauto.
 Qed.
@@ -508,10 +516,10 @@ Lemma exp_hterm_correct_x ge e hod od:
    induction e using exp_mut with (P0:=fun le =>  forall d hd,
      smem_model ge d hd ->
      WHEN list_exp_hterm le hd hod ~> lt THEN forall m, smem_valid ge d m -> smem_valid ge od m -> list_term_eval ge lt m = ST.list_term_eval ge (list_exp_term le d od) m); 
-     unfold smem_model, hsmem_post_eval in * |- * ; simpl; wlp_simplify.
+     unfold smem_model, hsmem_post_eval in * |- * ; cbn; wlp_simplify.
   - rewrite H1, <- H4; auto.
-  - rewrite H4, <- H0; simpl; auto.
-  - rewrite H5, <- H0, <- H4; simpl; auto.
+  - rewrite H4, <- H0; cbn; auto.
+  - rewrite H5, <- H0, <- H4; cbn; auto.
 Qed.
 Global Opaque exp_hterm.
 
@@ -536,7 +544,7 @@ Lemma hinst_smem_correct i: forall hd hod,
     forall ge od d, smem_model ge od hod -> smem_model ge d hd -> (forall m, smem_valid ge d m -> smem_valid ge od m) -> smem_model ge (inst_smem i d od) hd'.
 Proof.
   Local Hint Resolve smem_valid_set_proof: core.
-  induction i; simpl; wlp_simplify; eauto 15 with wlp.
+  induction i; cbn; wlp_simplify; eauto 15 with wlp.
 Qed.
 Global Opaque hinst_smem.
 Local Hint Resolve hinst_smem_correct: wlp.
@@ -556,7 +564,7 @@ Fixpoint bblock_hsmem_rec (p: bblock) (d: hsmem): ?? hsmem :=
 Lemma bblock_hsmem_rec_correct p: forall hd,
   WHEN bblock_hsmem_rec p hd ~> hd' THEN forall ge d, smem_model ge d hd -> smem_model ge (bblock_smem_rec p d) hd'.
 Proof.
-  induction p; simpl; wlp_simplify.
+  induction p; cbn; wlp_simplify.
 Qed.
 Global Opaque bblock_hsmem_rec.
 Local Hint Resolve bblock_hsmem_rec_correct: wlp.
@@ -565,8 +573,8 @@ Definition hsmem_empty: hsmem := {| hpre:= nil ; hpost := Dict.empty |}.
 
 Lemma hsmem_empty_correct ge: smem_model ge smem_empty hsmem_empty.
 Proof.
-  unfold smem_model, smem_valid, hsmem_post_eval; simpl; intuition try congruence.
-  rewrite !Dict.empty_spec; simpl; auto.
+  unfold smem_model, smem_valid, hsmem_post_eval; cbn; intuition try congruence.
+  rewrite !Dict.empty_spec; cbn; auto.
 Qed.
 
 Definition bblock_hsmem: bblock -> ?? hsmem
@@ -714,7 +722,7 @@ Theorem g_bblock_simu_test_correct p1 p2:
   WHEN g_bblock_simu_test p1 p2 ~> b THEN b=true -> forall ge, bblock_simu ge p1 p2.
 Proof.
   wlp_simplify.
-  destruct exta0; simpl in * |- *; auto.
+  destruct exta0; cbn in * |- *; auto.
 Qed.
 Global Opaque g_bblock_simu_test.
 
@@ -1117,9 +1125,9 @@ Extraction Inline lift.
 
 End ImpSimu.
 
-Require Import FMapPositive.
-
 
+(** * Implementation of the Dictionary (based on PositiveMap) *)
+Require Import FMapPositive.
 Require Import PArith.
 Require Import FMapPositive.
 
@@ -1201,12 +1209,12 @@ Lemma eq_test_correct A d1: forall (d2: t A),
  WHEN eq_test d1 d2 ~> b THEN
   b=true -> forall x, get d1 x = get d2 x.
 Proof.
-  unfold get; induction d1 as [|l1 Hl1 [x1|] r1 Hr1]; destruct d2 as [|l2 [x2|] r2]; simpl; 
-  wlp_simplify; (discriminate || (subst; destruct x; simpl; auto)).
+  unfold get; induction d1 as [|l1 Hl1 [x1|] r1 Hr1]; destruct d2 as [|l2 [x2|] r2]; cbn; 
+  wlp_simplify; (discriminate || (subst; destruct x; cbn; auto)).
 Qed.
 Global Opaque eq_test.
 
-(* ONLY FOR DEBUGGING INFO: get some key of a non-empty d *)
+(** ONLY FOR DEBUGGING INFO: get some key of a non-empty d *)
 Fixpoint pick {A} (d: t A): ?? R.t :=
   match d with
   | Leaf _ => FAILWITH "unexpected empty dictionary"
@@ -1219,7 +1227,7 @@ Fixpoint pick {A} (d: t A): ?? R.t :=
     RET (xO p)
   end.
 
-(* ONLY FOR DEBUGGING INFO: find one variable on which d1 and d2 differs *)
+(** ONLY FOR DEBUGGING INFO: find one variable on which d1 and d2 differs *)
 Fixpoint not_eq_witness {A} (d1 d2: t A): ?? option R.t :=
   match d1, d2 with
   | Leaf _, Leaf _ => RET None
diff --git a/kvx/abstractbb/Parallelizability.v b/kvx/abstractbb/Parallelizability.v
index feebeee5..e5d21434 100644
--- a/kvx/abstractbb/Parallelizability.v
+++ b/kvx/abstractbb/Parallelizability.v
@@ -26,7 +26,7 @@ Require Import Sorting.Permutation.
 Require Import Bool.
 Local Open Scope lazy_bool_scope.
 
-
+(** * Definition of the parallel semantics *)
 Module ParallelSemantics (L: SeqLanguage).
 
 Export L.
@@ -50,8 +50,8 @@ Fixpoint inst_prun (i: inst) (m tmp old: mem): option mem :=
 Lemma inst_run_prun i: forall m old,
   inst_run ge i m old = inst_prun i m m old.
 Proof.
-  induction i as [|[y e] i']; simpl; auto.
-  intros m old; destruct (exp_eval ge e m old); simpl; auto.
+  induction i as [|[y e] i']; cbn; auto.
+  intros m old; destruct (exp_eval ge e m old); cbn; auto.
 Qed.
 
 
@@ -76,8 +76,8 @@ Lemma inst_prun_equiv i old: forall m1 m2 tmp,
   (forall x, m1 x = m2 x) ->
   res_eq (inst_prun i m1 tmp old) (inst_prun i m2 tmp old).
 Proof.
-  induction i as [|[x e] i']; simpl; eauto.
-  intros m1 m2 tmp H; destruct (exp_eval ge e tmp old); simpl; auto.
+  induction i as [|[x e] i']; cbn; eauto.
+  intros m1 m2 tmp H; destruct (exp_eval ge e tmp old); cbn; auto.
   eapply IHi'; unfold assign. intros; destruct (R.eq_dec x x0); auto.
 Qed.
 
@@ -85,12 +85,12 @@ Lemma prun_iw_equiv p: forall m1 m2 old,
   (forall x, m1 x = m2 x) ->
   res_eq (prun_iw p m1 old) (prun_iw p m2 old).
 Proof.
-  induction p as [|i p']; simpl; eauto.
+  induction p as [|i p']; cbn; eauto.
   - intros m1 m2 old H.
     generalize (inst_prun_equiv i old m1 m2 old H);
-    destruct (inst_prun i m1 old old); simpl.
-    + intros (m3 & H3 & H4); rewrite H3; simpl; eauto.
-    + intros H1; rewrite H1; simpl; auto.
+    destruct (inst_prun i m1 old old); cbn.
+    + intros (m3 & H3 & H4); rewrite H3; cbn; eauto.
+    + intros H1; rewrite H1; cbn; auto.
 Qed.
 
 
@@ -101,8 +101,8 @@ Lemma prun_iw_app p1: forall m1 old p2,
      | None => None
      end.
 Proof.
-   induction p1; simpl; try congruence.
-   intros; destruct (inst_prun _ _ _); simpl; auto.
+   induction p1; cbn; try congruence.
+   intros; destruct (inst_prun _ _ _); cbn; auto.
 Qed.
 
 Lemma prun_iw_app_None p1: forall m1 old p2,
@@ -132,12 +132,12 @@ Fixpoint notIn {A} (x: A) (l:list A): Prop :=
 
 Lemma notIn_iff A (x:A) l: (~List.In x l) <-> notIn x l.
 Proof.
-  induction l; simpl; intuition.
+  induction l; cbn; intuition.
 Qed.
 
 Lemma notIn_app A (x:A) l1: forall l2, notIn x (l1++l2) <-> (notIn x l1 /\ notIn x l2).
 Proof.
-  induction l1; simpl.
+  induction l1; cbn.
   - intuition.
   - intros; rewrite IHl1. intuition.
 Qed.
@@ -145,7 +145,7 @@ Qed.
 
 Lemma In_Permutation A (l1 l2: list A): Permutation l1 l2 -> forall x, In x l1 -> In x l2.
 Proof.
-  induction 1; simpl; intuition.
+  induction 1; cbn; intuition.
 Qed.
 
 Lemma Permutation_incl A (l1 l2: list A): Permutation l1 l2 -> incl l1 l2.
@@ -174,7 +174,7 @@ Qed.
 
 Lemma disjoint_cons_l A (x:A) (l1 l2: list A): disjoint (x::l1) l2 <-> (notIn x l2) /\ (disjoint l1 l2).
 Proof.
-  unfold disjoint. simpl; intuition subst; auto. 
+  unfold disjoint. cbn; intuition subst; auto. 
 Qed.
 
 Lemma disjoint_cons_r A (x:A) (l1 l2: list A): disjoint l1 (x::l2) <-> (notIn x l1) /\ (disjoint l1 l2).
@@ -230,13 +230,13 @@ Fixpoint frame_assign m1 (f: list R.t) m2 :=
 Lemma frame_assign_def f: forall m1 m2 x,
    frame_assign m1 f m2 x = if notIn_dec x f then m1 x else m2 x.
 Proof.
-  induction f as [|y f] ; simpl; auto.
-  - intros; destruct (notIn_dec x []); simpl in *; tauto.
-  - intros; rewrite IHf; destruct (notIn_dec x (y::f)); simpl in *.
-    + destruct (notIn_dec x f); simpl in *; intuition.
+  induction f as [|y f] ; cbn; auto.
+  - intros; destruct (notIn_dec x []); cbn in *; tauto.
+  - intros; rewrite IHf; destruct (notIn_dec x (y::f)); cbn in *.
+    + destruct (notIn_dec x f); cbn in *; intuition.
       rewrite assign_diff; auto.
       rewrite <- notIn_iff in *; intuition.
-    + destruct (notIn_dec x f); simpl in *; intuition subst.
+    + destruct (notIn_dec x f); cbn in *; intuition subst.
       rewrite assign_eq; auto.
       rewrite <- notIn_iff in *; intuition.
 Qed.
@@ -266,7 +266,7 @@ Lemma frame_eq_list_split f1 (f2: R.t -> Prop) om1 om2:
  (forall m1 m2 x, om1 = Some m1 -> om2 = Some m2 -> f2 x -> notIn x f1 -> m1 x = m2 x) ->
  frame_eq f2 om1 om2.
 Proof.
-  unfold frame_eq; destruct om1 as [ m1 | ]; simpl; auto.
+  unfold frame_eq; destruct om1 as [ m1 | ]; cbn; auto.
   intros (m2 & H0 & H1); subst.
   intros H.
   eexists; intuition eauto.
@@ -280,7 +280,7 @@ Lemma frame_eq_res_eq f om1 om2:
  res_eq om1 om2.
 Proof.
   intros H H0; lapply (frame_eq_list_split f (fun _ => True) om1 om2 H); eauto.
-  clear H H0; unfold frame_eq, res_eq. destruct om1; simpl; firstorder.
+  clear H H0; unfold frame_eq, res_eq. destruct om1; cbn; firstorder.
 Qed.
 *)
 
@@ -296,9 +296,9 @@ Lemma inst_wframe_correct i m' old: forall m tmp,
   inst_prun ge i m tmp old = Some m' -> 
   forall x, notIn x (inst_wframe i) -> m' x  = m x.
 Proof.
-  induction i as [|[y e] i']; simpl.
+  induction i as [|[y e] i']; cbn.
   - intros m tmp H x H0; inversion_clear H; auto.
-  - intros m tmp H x (H1 & H2); destruct (exp_eval ge e tmp old); simpl; try congruence.
+  - intros m tmp H x (H1 & H2); destruct (exp_eval ge e tmp old); cbn; try congruence.
     cutrewrite (m x = assign m y v x); eauto.
     rewrite assign_diff; auto.
 Qed.
@@ -306,9 +306,9 @@ Qed.
 Lemma inst_prun_fequiv i old: forall m1 m2 tmp, 
   frame_eq (fun x => In x (inst_wframe i)) (inst_prun ge i m1 tmp old) (inst_prun ge i m2 tmp old).
 Proof.
-  induction i as [|[y e] i']; simpl.
+  induction i as [|[y e] i']; cbn.
   - intros m1 m2 tmp; eexists; intuition eauto.
-  - intros m1 m2 tmp. destruct (exp_eval ge e tmp old); simpl; auto.
+  - intros m1 m2 tmp. destruct (exp_eval ge e tmp old); cbn; auto.
     eapply frame_eq_list_split; eauto. clear IHi'.
     intros m1' m2' x H1 H2.
     lapply (inst_wframe_correct i' m1' old (assign m1 y v) (assign tmp y v)); eauto.
@@ -323,7 +323,7 @@ Lemma inst_prun_None i m1 m2 tmp old:
   inst_prun ge i m2 tmp old = None.
 Proof.
   intros H; generalize (inst_prun_fequiv i old m1 m2 tmp).
-  rewrite H; simpl; auto.
+  rewrite H; cbn; auto.
 Qed.
 
 Lemma inst_prun_Some i m1 m2 tmp old m1': 
@@ -331,7 +331,7 @@ Lemma inst_prun_Some i m1 m2 tmp old m1':
   res_eq (Some (frame_assign m2 (inst_wframe i) m1')) (inst_prun ge i m2 tmp old).
 Proof.
   intros H; generalize (inst_prun_fequiv i old m1 m2 tmp).
-  rewrite H; simpl.
+  rewrite H; cbn.
   intros (m2' & H1 & H2).
   eexists; intuition eauto.
   rewrite frame_assign_def.
@@ -351,7 +351,7 @@ Local Hint Resolve Permutation_app_head Permutation_app_tail Permutation_app_com
 Lemma bblock_wframe_Permutation p p': 
  Permutation p p' -> Permutation (bblock_wframe p)  (bblock_wframe p').
 Proof.
-  induction 1 as [|i p p'|i1 i2 p|p1 p2 p3]; simpl; auto.
+  induction 1 as [|i p p'|i1 i2 p|p1 p2 p3]; cbn; auto.
   - rewrite! app_assoc; auto.
   - eapply Permutation_trans; eauto.
 Qed.
@@ -361,11 +361,11 @@ Lemma bblock_wframe_correct p m' old: forall m,
   prun_iw p m old = Some m' -> 
   forall x, notIn x (bblock_wframe p) -> m' x = m x.
 Proof.
-  induction p as [|i p']; simpl.
+  induction p as [|i p']; cbn.
   - intros m H; inversion_clear H; auto.
   - intros m H x; rewrite notIn_app; intros (H1 & H2). 
     remember (inst_prun i m old old) as om.
-    destruct om as [m1|]; simpl.
+    destruct om as [m1|]; cbn.
     + eapply eq_trans.
       eapply IHp'; eauto.
       eapply inst_wframe_correct; eauto.
@@ -375,12 +375,12 @@ Qed.
 Lemma prun_iw_fequiv p old: forall m1 m2, 
   frame_eq (fun x => In x (bblock_wframe p)) (prun_iw p m1 old) (prun_iw p m2 old).
 Proof.
-  induction p as [|i p']; simpl.
+  induction p as [|i p']; cbn.
   - intros m1 m2; eexists; intuition eauto.
   - intros m1 m2; generalize (inst_prun_fequiv i old m1 m2 old).
     remember (inst_prun i m1 old old) as om.
-    destruct om as [m1'|]; simpl.
-    + intros (m2' & H1 & H2). rewrite H1; simpl.
+    destruct om as [m1'|]; cbn.
+    + intros (m2' & H1 & H2). rewrite H1; cbn.
     eapply frame_eq_list_split; eauto. clear IHp'.
     intros m1'' m2'' x H3 H4. rewrite in_app_iff.
     intros X X2. assert (X1: In x (inst_wframe i)). { destruct X; auto. rewrite <- notIn_iff in X2; tauto. }
@@ -389,7 +389,7 @@ Proof.
     lapply (bblock_wframe_correct p' m2'' old m2'); eauto.
     intros Xm2' Xm1'. 
     rewrite Xm1', Xm2'; auto.
-    + intro H; rewrite H; simpl; auto.
+    + intro H; rewrite H; cbn; auto.
 Qed.
 
 Lemma prun_iw_equiv p m1 m2 old: 
@@ -418,7 +418,7 @@ Fixpoint is_det (p: bblock): Prop :=
 Lemma is_det_Permutation p p': 
  Permutation p p' -> is_det p -> is_det p'.
 Proof.
-  induction 1; simpl; auto.
+  induction 1; cbn; auto.
   - intros; intuition. eapply disjoint_incl_r. 2: eauto.
     eapply Permutation_incl. eapply Permutation_sym. 
     eapply bblock_wframe_Permutation; auto.
@@ -431,20 +431,20 @@ Theorem is_det_correct p p':
   is_det p -> 
   forall m old, res_eq (prun_iw ge p m old) (prun_iw ge p' m old).
 Proof.
-  induction 1 as [ | i p p' | i1 i2 p | p1 p2 p3 ]; simpl; eauto.
+  induction 1 as [ | i p p' | i1 i2 p | p1 p2 p3 ]; cbn; eauto.
   - intros [H0 H1] m old.
     remember (inst_prun ge i m old old) as om0.
-    destruct om0 as [ m0 | ]; simpl; auto.
+    destruct om0 as [ m0 | ]; cbn; auto.
   - rewrite disjoint_app_r.
     intros ([Z1 Z2] & Z3 & Z4) m old.
     remember (inst_prun ge i2 m old old) as om2.
-    destruct om2 as [ m2 | ]; simpl; auto.
+    destruct om2 as [ m2 | ]; cbn; auto.
     + remember (inst_prun ge i1 m old old) as om1.
-      destruct om1 as [ m1 | ]; simpl; auto.
-      * lapply (inst_prun_Some i2 m m1 old old m2); simpl; auto.
-        lapply (inst_prun_Some i1 m m2 old old m1); simpl; auto.
+      destruct om1 as [ m1 | ]; cbn; auto.
+      * lapply (inst_prun_Some i2 m m1 old old m2); cbn; auto.
+        lapply (inst_prun_Some i1 m m2 old old m1); cbn; auto.
         intros (m1' & Hm1' & Xm1') (m2' & Hm2' & Xm2').
-        rewrite Hm1', Hm2'; simpl.
+        rewrite Hm1', Hm2'; cbn.
         eapply prun_iw_equiv.
         intros x; rewrite <- Xm1', <- Xm2'. clear Xm2' Xm1' Hm1' Hm2' m1' m2'.
         rewrite frame_assign_def.
@@ -455,9 +455,9 @@ Proof.
           erewrite (inst_wframe_correct i1 m1 old m old); eauto.
         }
         rewrite frame_assign_notIn; auto.
-     * erewrite inst_prun_None; eauto. simpl; auto.
+     * erewrite inst_prun_None; eauto. cbn; auto.
    + remember (inst_prun ge i1 m old old) as om1.
-     destruct om1 as [ m1 | ]; simpl; auto.
+     destruct om1 as [ m1 | ]; cbn; auto.
      erewrite inst_prun_None; eauto.
   - intros; eapply res_eq_trans.
     eapply IHPermutation1; eauto.
@@ -486,7 +486,7 @@ Lemma exp_frame_correct e old1 old2:
    (exp_eval ge e m1 old1)=(exp_eval ge e m2 old2).
 Proof.
   induction e using exp_mut with (P0:=fun l => (forall x, In x (list_exp_frame l) -> old1 x = old2 x) -> forall m1 m2, (forall x, In x (list_exp_frame l) -> m1 x = m2 x) ->
-   (list_exp_eval ge l m1 old1)=(list_exp_eval ge l m2 old2)); simpl; auto.
+   (list_exp_eval ge l m1 old1)=(list_exp_eval ge l m2 old2)); cbn; auto.
   - intros H1 m1 m2 H2; rewrite H2; auto.
   - intros H1 m1 m2 H2; erewrite IHe; eauto.
   - intros H1 m1 m2 H2; erewrite IHe, IHe0; eauto; 
@@ -501,7 +501,7 @@ Fixpoint inst_frame (i: inst): list R.t :=
 
 Lemma inst_wframe_frame i x: In x (inst_wframe i) -> In x (inst_frame i).
 Proof.
-  induction i as [ | [y e] i']; simpl; intuition.
+  induction i as [ | [y e] i']; cbn; intuition.
 Qed.
 
 
@@ -511,13 +511,13 @@ Lemma inst_frame_correct i wframe old1 old2: forall m tmp1 tmp2,
   (forall x, notIn x wframe -> tmp1 x = tmp2 x) ->
   inst_prun ge i m tmp1 old1 = inst_prun ge i m tmp2 old2.
 Proof.
-  induction i as [|[x e] i']; simpl; auto.
+  induction i as [|[x e] i']; cbn; auto.
   intros m tmp1 tmp2; rewrite disjoint_cons_l, disjoint_app_l.
   intros (H1 & H2 & H3) H6 H7.
   cutrewrite (exp_eval ge e tmp1 old1 = exp_eval ge e tmp2 old2).
   - destruct (exp_eval ge e tmp2 old2); auto.
     eapply IHi'; eauto. 
-    simpl; intros x0 H0; unfold assign. destruct (R.eq_dec x x0); simpl; auto. 
+    cbn; intros x0 H0; unfold assign. destruct (R.eq_dec x x0); cbn; auto. 
   - unfold disjoint in H2; apply exp_frame_correct.
     intros;apply H6; auto.
     intros;apply H7; auto.
@@ -535,8 +535,8 @@ Fixpoint pararec (p: bblock) (wframe: list R.t): Prop :=
 
 Lemma pararec_disjoint (p: bblock): forall wframe, pararec p wframe -> disjoint (bblock_wframe p) wframe.
 Proof.
-  induction p as [|i p']; simpl.
-  - unfold disjoint; simpl; intuition.
+  induction p as [|i p']; cbn.
+  - unfold disjoint; cbn; intuition.
   - intros wframe [H0 H1]; rewrite disjoint_app_l. 
     generalize (IHp' _ H1).
     rewrite disjoint_app_r. intuition. 
@@ -546,7 +546,7 @@ Qed.
 
 Lemma pararec_det p: forall wframe, pararec p wframe -> is_det p.
 Proof.
-  induction p as [|i p']; simpl; auto.
+  induction p as [|i p']; cbn; auto.
   intros wframe [H0 H1]. generalize (pararec_disjoint _ _ H1). rewrite disjoint_app_r.
   intuition. 
   - apply disjoint_sym; auto.
@@ -558,7 +558,7 @@ Lemma pararec_correct p old: forall wframe m,
   (forall x, notIn x wframe -> m x = old x) ->
   run ge p m = prun_iw ge p m old.
 Proof.
-  elim p; clear p; simpl; auto.
+  elim p; clear p; cbn; auto.
   intros i p' X wframe m [H H0] H1.
   erewrite inst_run_prun, inst_frame_correct; eauto.
   remember (inst_prun ge i m old old) as om0.
@@ -590,17 +590,17 @@ End PARALLELI.
 End ParallelizablityChecking.
 
 
-Module Type PseudoRegSet.
-
-Declare Module R: PseudoRegisters.
-
-(** We assume a datatype [t] refining (list R.t)
+(** * We assume a datatype [PseudoRegSet.t] refining [list R.t] *)
 
+(** 
 This data-refinement is given by an abstract "invariant" match_frame below, 
 preserved by the following operations.
-
 *)
 
+Module Type PseudoRegSet.
+
+Declare Module R: PseudoRegisters.
+
 Parameter t: Type.
 Parameter match_frame: t -> (list R.t) -> Prop.
 
@@ -646,7 +646,7 @@ Fixpoint inst_wsframe(i:inst): S.t :=
 
 Lemma inst_wsframe_correct i: S.match_frame (inst_wsframe i) (inst_wframe i).
 Proof.
-  induction i; simpl; auto.
+  induction i; cbn; auto.
 Qed.
 
 Fixpoint exp_sframe (e: exp): S.t :=
@@ -664,7 +664,7 @@ with list_exp_sframe (le: list_exp): S.t :=
 
 Lemma exp_sframe_correct e: S.match_frame (exp_sframe e) (exp_frame e).
 Proof.
-  induction e using exp_mut with (P0:=fun l => S.match_frame (list_exp_sframe l) (list_exp_frame l)); simpl; auto.
+  induction e using exp_mut with (P0:=fun l => S.match_frame (list_exp_sframe l) (list_exp_frame l)); cbn; auto.
 Qed.
 
 Fixpoint inst_sframe (i: inst): S.t :=
@@ -677,7 +677,7 @@ Local Hint Resolve exp_sframe_correct: core.
 
 Lemma inst_sframe_correct i: S.match_frame (inst_sframe i) (inst_frame i).
 Proof.
-  induction i as [|[y e] i']; simpl; auto.
+  induction i as [|[y e] i']; cbn; auto.
 Qed.
 
 Local Hint Resolve inst_wsframe_correct inst_sframe_correct: core.
@@ -692,7 +692,7 @@ Fixpoint is_pararec (p: bblock) (wsframe: S.t): bool :=
 
 Lemma is_pararec_correct (p: bblock): forall s l, S.match_frame s l -> (is_pararec p s)=true -> (pararec p l).
 Proof.
-  induction p; simpl; auto.
+  induction p; cbn; auto.
   intros s l H1 H2; rewrite lazy_andb_bool_true in H2. destruct H2 as [H2 H3].
   constructor 1; eauto.
 Qed.
@@ -716,6 +716,11 @@ End ParallelChecks.
 
 
 
+(** * Implementing the datatype [PosPseudoRegSet.t] refining [list R.t] *)
+
+(* This data-refinement is given by an abstract "invariant" match_frame below, 
+preserved by the following operations.
+*)
 
 Require Import PArith.
 Require Import MSets.MSetPositive.
@@ -724,12 +729,6 @@ Module PosPseudoRegSet <: PseudoRegSet with Module R:=Pos.
 
 Module R:=Pos.
 
-(** We assume a datatype [t] refining (list R.t)
-
-This data-refinement is given by an abstract "invariant" match_frame below, 
-preserved by the following operations.
-
-*)
 
 Definition t:=PositiveSet.t.
 
@@ -740,14 +739,14 @@ Definition empty:=PositiveSet.empty.
 
 Lemma empty_match_frame: match_frame empty nil.
 Proof.
-  unfold match_frame, empty, PositiveSet.In; simpl; intuition.
+  unfold match_frame, empty, PositiveSet.In; cbn; intuition.
 Qed.
 
 Definition add: R.t -> t -> t := PositiveSet.add.
 
 Lemma add_match_frame: forall s x l, match_frame s l -> match_frame (add x s) (x::l).
 Proof.
-  unfold match_frame, add; simpl. 
+  unfold match_frame, add; cbn. 
   intros s x l H y. rewrite PositiveSet.add_spec, H.
   intuition. 
 Qed.
@@ -773,13 +772,13 @@ Fixpoint is_disjoint (s s': PositiveSet.t) : bool :=
 
 Lemma is_disjoint_spec_true s: forall s', is_disjoint s s' = true -> forall x, PositiveSet.In x s -> PositiveSet.In x s' -> False.
 Proof.
-  unfold PositiveSet.In; induction s as [ |l IHl o r IHr]; simpl; try discriminate.
-  destruct s' as [|l' o' r']; simpl; try discriminate.
+  unfold PositiveSet.In; induction s as [ |l IHl o r IHr]; cbn; try discriminate.
+  destruct s' as [|l' o' r']; cbn; try discriminate.
   intros X.
   assert (H: ~(o = true /\ o'=true) /\ is_disjoint l l' = true /\ is_disjoint r r'=true).
-  { destruct o, o', (is_disjoint l l'), (is_disjoint r r'); simpl in X; intuition. }
+  { destruct o, o', (is_disjoint l l'), (is_disjoint r r'); cbn in X; intuition. }
   clear X; destruct H as (H & H1 & H2).
-  destruct x as [i|i|]; simpl; eauto.
+  destruct x as [i|i|]; cbn; eauto.
 Qed.
 
 Lemma is_disjoint_match_frame: forall s1 s2 l1 l2, match_frame s1 l1 -> match_frame s2 l2 -> (is_disjoint s1 s2)=true -> disjoint l1 l2.
diff --git a/kvx/abstractbb/SeqSimuTheory.v b/kvx/abstractbb/SeqSimuTheory.v
index 61f8f2ec..df6b9963 100644
--- a/kvx/abstractbb/SeqSimuTheory.v
+++ b/kvx/abstractbb/SeqSimuTheory.v
@@ -55,13 +55,14 @@ with list_term_eval ge (l: list_term) (m: mem) {struct l}: option (list value) :
     end
   end.
 
-(* the symbolic memory:
-  - pre: pre-condition expressing that the computation has not yet abort on a None.
-  - post: the post-condition for each pseudo-register 
-*)
-Record smem:= {pre: genv -> mem -> Prop; post:> R.t -> term}.
-
-(** initial symbolic memory *)
+(** The (abstract) symbolic memory state *)
+Record smem :=
+{
+   pre: genv -> mem -> Prop; (**r pre-condition expressing that the computation has not yet abort on a None. *)
+   post:> R.t -> term (**r the output term computed on each pseudo-register *)
+}.
+
+(** Initial symbolic memory state *)
 Definition smem_empty := {| pre:=fun _ _ => True; post:=(fun x => Input x) |}.
 
 Fixpoint exp_term (e: exp) (d old: smem) : term :=
@@ -78,11 +79,12 @@ with list_exp_term (le: list_exp) (d old: smem) : list_term :=
   end.
 
 
-(** assignment of the symbolic memory *)
+(** assignment of the symbolic memory state *)
 Definition smem_set (d:smem) x (t:term) := 
   {| pre:=(fun ge m => (term_eval ge (d x) m) <> None /\ (d.(pre) ge m));
      post:=fun y => if R.eq_dec x y then t else d y |}.
 
+(** Simulation theory: the theorem [bblock_smem_simu] ensures that the simulation between two abstract basic blocks is implied by the simulation between their symbolic execution. *)
 Section SIMU_THEORY.
 
 Variable ge: genv.
@@ -90,13 +92,13 @@ Variable ge: genv.
 Lemma set_spec_eq d x t m:
  term_eval ge (smem_set d x t x) m = term_eval ge t m.
 Proof.
-  unfold smem_set; simpl; case (R.eq_dec x x); try congruence.
+  unfold smem_set; cbn; case (R.eq_dec x x); try congruence.
 Qed.
 
 Lemma set_spec_diff d x y t m:
   x <> y -> term_eval ge (smem_set d x t y) m = term_eval ge (d y) m.
 Proof.
-  unfold smem_set; simpl; case (R.eq_dec x y); try congruence.
+  unfold smem_set; cbn; case (R.eq_dec x y); try congruence.
 Qed.
 
 Fixpoint inst_smem (i: inst) (d old: smem): smem :=
@@ -121,15 +123,15 @@ Definition bblock_smem: bblock -> smem
 Lemma inst_smem_pre_monotonic i old: forall d m,
   (pre (inst_smem i d old) ge m) -> (pre d ge m).
 Proof.
-  induction i as [|[y e] i IHi]; simpl; auto.
+  induction i as [|[y e] i IHi]; cbn; auto.
   intros d a H; generalize (IHi _ _ H); clear H IHi.
-  unfold smem_set; simpl; intuition.
+  unfold smem_set; cbn; intuition.
 Qed.
 
 Lemma bblock_smem_pre_monotonic p: forall d m,
   (pre (bblock_smem_rec p d) ge m) -> (pre d ge m).
 Proof.
-  induction p as [|i p' IHp']; simpl; eauto.
+  induction p as [|i p' IHp']; cbn; eauto.
   intros d a H; eapply inst_smem_pre_monotonic; eauto.
 Qed.
 
@@ -144,7 +146,7 @@ Proof.
   intro H.
   induction e using exp_mut with
     (P0:=fun l => forall (d:smem) m1, (forall x, term_eval ge (d x) m0 = Some (m1 x)) -> list_term_eval ge (list_exp_term l d od) m0 = list_exp_eval ge l m1 old);
-  simpl; auto.
+  cbn; auto.
   - intros; erewrite IHe; eauto.
   - intros. erewrite IHe, IHe0; eauto. 
 Qed.
@@ -154,12 +156,12 @@ Lemma inst_smem_abort i m0 x old: forall (d:smem),
     term_eval ge (d x) m0 = None ->
     term_eval ge (inst_smem i d old x) m0 = None.
 Proof.
-  induction i as [|[y e] i IHi]; simpl; auto.
+  induction i as [|[y e] i IHi]; cbn; auto.
   intros d VALID H; erewrite IHi; eauto. clear IHi.
-  unfold smem_set; simpl; destruct (R.eq_dec y x); auto.
+  unfold smem_set; cbn; destruct (R.eq_dec y x); auto.
   subst;
   generalize (inst_smem_pre_monotonic _ _ _ _ VALID); clear VALID.
-  unfold smem_set; simpl. intuition congruence.
+  unfold smem_set; cbn. intuition congruence.
 Qed.
 
 Lemma block_smem_rec_abort p m0 x: forall d,
@@ -167,7 +169,7 @@ Lemma block_smem_rec_abort p m0 x: forall d,
     term_eval ge (d x) m0 = None ->
     term_eval ge (bblock_smem_rec p d x) m0 = None.
 Proof.
-  induction p; simpl; auto.
+  induction p; cbn; auto.
   intros d VALID H; erewrite IHp; eauto. clear IHp.
   eapply inst_smem_abort; eauto.
 Qed.
@@ -179,13 +181,13 @@ Lemma inst_smem_Some_correct1 i m0 old (od:smem):
   (forall x, term_eval ge (d x) m0 = Some (m1 x)) ->
    forall x, term_eval ge (inst_smem i d od x) m0 = Some (m2 x).
 Proof.
-  intro X; induction i as [|[x e] i IHi]; simpl; intros m1 m2 d H.
+  intro X; induction i as [|[x e] i IHi]; cbn; intros m1 m2 d H.
   - inversion_clear H; eauto.
   - intros H0 x0.
     destruct (exp_eval ge e m1 old) eqn:Heqov; try congruence.
     refine (IHi _ _ _ _ _ _); eauto.
     clear x0; intros x0.
-    unfold assign, smem_set; simpl. destruct (R.eq_dec x x0); auto.
+    unfold assign, smem_set; cbn. destruct (R.eq_dec x x0); auto.
     subst; erewrite term_eval_exp; eauto.
 Qed.
 
@@ -195,7 +197,7 @@ Lemma bblocks_smem_rec_Some_correct1 p m0: forall (m1 m2: mem) (d: smem),
   forall x, term_eval ge (bblock_smem_rec p d x) m0 = Some (m2 x).
 Proof.
   Local Hint Resolve inst_smem_Some_correct1: core.
-  induction p as [ | i p]; simpl; intros m1 m2 d H.
+  induction p as [ | i p]; cbn; intros m1 m2 d H.
   - inversion_clear H; eauto.
   - intros H0 x0.
     destruct (inst_run ge i m1 m1) eqn: Heqov.
@@ -216,15 +218,15 @@ Lemma inst_smem_None_correct i m0 old (od: smem):
    (forall x, term_eval ge (d x) m0 = Some (m1 x)) ->
   inst_run ge i m1 old = None -> exists x, term_eval ge (inst_smem i d od x) m0 = None.
 Proof.
-  intro X; induction i as [|[x e] i IHi]; simpl; intros m1 d.
+  intro X; induction i as [|[x e] i IHi]; cbn; intros m1 d.
   - discriminate.
   - intros VALID H0.
     destruct (exp_eval ge e m1 old) eqn: Heqov.
     + refine (IHi _ _ _ _); eauto.
-      intros x0; unfold assign, smem_set; simpl. destruct (R.eq_dec x x0); auto.
+      intros x0; unfold assign, smem_set; cbn. destruct (R.eq_dec x x0); auto.
       subst; erewrite term_eval_exp; eauto.
     + intuition.
-      constructor 1 with (x:=x); simpl.
+      constructor 1 with (x:=x); cbn.
       apply inst_smem_abort; auto.
       rewrite set_spec_eq. 
       erewrite term_eval_exp; eauto.
@@ -239,14 +241,14 @@ Lemma inst_smem_Some_correct2 i m0 old (od: smem):
   res_eq (Some m2) (inst_run ge i m1 old).
 Proof.
   intro X.
-  induction i as [|[x e] i IHi]; simpl; intros m1 m2 d VALID H0.
+  induction i as [|[x e] i IHi]; cbn; intros m1 m2 d VALID H0.
   - intros H; eapply ex_intro; intuition eauto.
     generalize (H0 x); rewrite H.
     congruence.
   - intros H.
     destruct (exp_eval ge e m1 old) eqn: Heqov.
     + refine (IHi _ _ _ _ _ _); eauto.
-      intros x0; unfold assign, smem_set; simpl; destruct (R.eq_dec x x0); auto.
+      intros x0; unfold assign, smem_set; cbn; destruct (R.eq_dec x x0); auto.
       subst; erewrite term_eval_exp; eauto.
     + generalize (H x).
       rewrite inst_smem_abort; discriminate || auto.
@@ -260,7 +262,7 @@ Lemma bblocks_smem_rec_Some_correct2 p m0: forall (m1 m2: mem) d,
   (forall x, term_eval ge (bblock_smem_rec p d x) m0 = Some (m2 x)) ->
     res_eq (Some m2) (run ge p m1).
 Proof.
-  induction p as [|i p]; simpl; intros m1 m2 d VALID H0.
+  induction p as [|i p]; cbn; intros m1 m2 d VALID H0.
   - intros H; eapply ex_intro; intuition eauto.
     generalize (H0 x); rewrite H.
     congruence.
@@ -291,13 +293,13 @@ Lemma inst_valid i m0 old (od:smem):
   (forall x, term_eval ge (d x) m0 = Some (m1 x)) ->
    pre (inst_smem i d od) ge m0.
 Proof.
-  induction i as [|[x e] i IHi]; simpl; auto.
+  induction i as [|[x e] i IHi]; cbn; auto.
   intros Hold m1 m2 d VALID0 H Hm1.
-  destruct (exp_eval ge e m1 old) eqn: Heq; simpl; try congruence.
+  destruct (exp_eval ge e m1 old) eqn: Heq; cbn; try congruence.
   eapply IHi; eauto.
-  + unfold smem_set in * |- *; simpl. 
+  + unfold smem_set in * |- *; cbn. 
     rewrite Hm1; intuition congruence.
-  + intros x0. unfold assign, smem_set; simpl; destruct (R.eq_dec x x0); auto.
+  + intros x0. unfold assign, smem_set; cbn; destruct (R.eq_dec x x0); auto.
     subst; erewrite term_eval_exp; eauto.
 Qed.
 
@@ -309,7 +311,7 @@ Lemma block_smem_rec_valid p m0: forall (m1 m2: mem) (d:smem),
   pre (bblock_smem_rec p d) ge m0.
 Proof.
   Local Hint Resolve inst_valid: core.
-  induction p as [ | i p]; simpl; intros m1 d H; auto.
+  induction p as [ | i p]; cbn; intros m1 d H; auto.
   intros H0 H1.
   destruct (inst_run ge i m1 m1) eqn: Heqov; eauto.
   congruence.
@@ -320,7 +322,7 @@ Lemma bblock_smem_valid p m0 m1:
   pre (bblock_smem p) ge m0.
 Proof.
   intros; eapply block_smem_rec_valid; eauto.
-  unfold smem_empty; simpl. auto.
+  unfold smem_empty; cbn. auto.
 Qed.
 
 Definition smem_valid ge d m := pre d ge m /\ forall x, term_eval ge (d x) m <> None.
@@ -337,7 +339,7 @@ Theorem bblock_smem_simu p1 p2:
 Proof.
   Local Hint Resolve bblock_smem_valid bblock_smem_Some_correct1: core.
   intros (INCL & EQUIV) m DONTFAIL; unfold smem_valid in * |-.
-  destruct (run ge p1 m) as [m1|] eqn: RUN1; simpl; try congruence.
+  destruct (run ge p1 m) as [m1|] eqn: RUN1; cbn; try congruence.
   assert (X: forall x, term_eval ge (bblock_smem p1 x) m = Some (m1 x)); eauto.
   eapply bblock_smem_Some_correct2; eauto.
   + destruct (INCL m); intuition eauto.
@@ -369,21 +371,20 @@ Lemma smem_valid_set_proof d x t m:
 Proof.
   unfold smem_valid; intros (PRE & VALID) PREt. split.
   + split; auto.
-  + intros x0; unfold smem_set; simpl; case (R.eq_dec x x0); intros; subst; auto.
+  + intros x0; unfold smem_set; cbn; case (R.eq_dec x x0); intros; subst; auto.
 Qed.
 
 
 End SIMU_THEORY.
 
-(** REMARKS: more abstract formulation of the proof... 
-    but relying on functional_extensionality.
+(** REMARK: this theorem reformulates the lemma above in a more abstract way (but relies on functional_extensionality axiom).
 *)
 Definition smem_correct ge (d: smem) (m: mem) (om: option mem): Prop:=
    forall m', om=Some m' <-> (d.(pre) ge m /\ forall x, term_eval ge (d x) m = Some (m' x)).
 
 Lemma bblock_smem_correct ge p m: smem_correct ge (bblock_smem p) m (run ge p m).
 Proof.
-  unfold smem_correct; simpl; intros m'; split.
+  unfold smem_correct; cbn; intros m'; split.
   + intros; split.
     * eapply bblock_smem_valid; eauto.
     * eapply bblock_smem_Some_correct1; eauto.
diff --git a/kvx/lib/ForwardSimulationBlock.v b/kvx/lib/ForwardSimulationBlock.v
index f79814f2..61466dad 100644
--- a/kvx/lib/ForwardSimulationBlock.v
+++ b/kvx/lib/ForwardSimulationBlock.v
@@ -42,11 +42,11 @@ Lemma starN_split n s t s':
   forall m k, n=m+k ->
   exists (t1 t2:trace) s0, starN (step L) (globalenv L) m s t1 s0 /\ starN (step L) (globalenv L) k s0 t2 s' /\ t=t1**t2.
 Proof.
-  induction 1; simpl.
+  induction 1; cbn.
   + intros m k H; assert (X: m=0); try omega.
     assert (X0: k=0); try omega.
     subst; repeat (eapply ex_intro); intuition eauto.
-  + intros m; destruct m as [| m']; simpl.
+  + intros m; destruct m as [| m']; cbn.
     - intros k H2; subst; repeat (eapply ex_intro); intuition eauto.
     - intros k H2. inversion H2.
       exploit (IHstarN m' k); eauto. intro.
@@ -61,7 +61,7 @@ Lemma starN_tailstep n s t1 s':
   forall (t t2:trace) s'',
      Step L s' t2 s'' -> t = t1 ** t2 -> starN (step L) (globalenv L) (S n) s t s''.
 Proof.
-  induction 1; simpl. 
+  induction 1; cbn. 
   + intros t t1 s0; autorewrite with trace_rewrite.
     intros; subst; eapply starN_step; eauto.
     autorewrite with trace_rewrite; auto.
@@ -153,8 +153,8 @@ Definition head (s: memostate): state L1 :=
 
 Lemma head_followed (s: memostate): follows_in_block (head s) (real s).
 Proof.
-  destruct s as [rs ms Hs]. simpl.
-  destruct ms as [ms|]; unfold head; simpl; auto.
+  destruct s as [rs ms Hs]. cbn.
+  destruct ms as [ms|]; unfold head; cbn; auto.
   constructor 1.
   omega.
   cutrewrite ((dist_end_block rs - dist_end_block rs)%nat=O).
@@ -198,21 +198,21 @@ Definition memoL1 := {|
 Lemma discr_dist_end s:
   {dist_end_block s = O} + {dist_end_block s <> O}.
 Proof.
-  destruct (dist_end_block s); simpl; intuition.
+  destruct (dist_end_block s); cbn; intuition.
 Qed.
 
 Lemma memo_simulation_step:
   forall s1 t s1', Step L1 s1 t s1' ->
   forall s2, s1 = (real s2) -> exists s2', Step memoL1 s2 t s2' /\ s1' = (real s2').
 Proof.
-  intros s1 t s1' H1 [rs2 ms2 Hmoi] H2. simpl in H2; subst.
+  intros s1 t s1' H1 [rs2 ms2 Hmoi] H2. cbn in H2; subst.
   destruct (discr_dist_end rs2) as [H3 | H3].
-  + refine (ex_intro _ {|real:=s1'; memorized:=None |} _); simpl.
+  + refine (ex_intro _ {|real:=s1'; memorized:=None |} _); cbn.
     intuition.
   + destruct ms2 as [s|].
-    - refine (ex_intro _ {|real:=s1'; memorized:=Some s |} _); simpl.
+    - refine (ex_intro _ {|real:=s1'; memorized:=Some s |} _); cbn.
       intuition.
-    - refine (ex_intro _ {|real:=s1'; memorized:=Some rs2 |} _); simpl.
+    - refine (ex_intro _ {|real:=s1'; memorized:=Some rs2 |} _); cbn.
       intuition.
   Unshelve.
   * intros; discriminate.
@@ -228,7 +228,7 @@ Qed.
 Lemma forward_memo_simulation_1: forward_simulation L1 memoL1.
 Proof.
   apply forward_simulation_step with (match_states:=fun s1 s2 => s1 = (real s2)); auto.
-  + intros s1 H; eapply ex_intro with (x:={|real:=s1; memorized:=None |}); simpl.
+  + intros s1 H; eapply ex_intro with (x:={|real:=s1; memorized:=None |}); cbn.
     intuition.
   + intros; subst; auto.
   + intros; exploit memo_simulation_step; eauto.
@@ -239,8 +239,8 @@ Qed.
 
 Lemma forward_memo_simulation_2: forward_simulation memoL1 L2.
 Proof.
-  unfold memoL1; simpl.
-  apply forward_simulation_opt with (measure:=fun s => dist_end_block (real s)) (match_states:=fun s1 s2 => match_states (head s1) s2); simpl; auto.
+  unfold memoL1; cbn.
+  apply forward_simulation_opt with (measure:=fun s => dist_end_block (real s)) (match_states:=fun s1 s2 => match_states (head s1) s2); cbn; auto.
   + intros s1 [H0 H1]; destruct (match_initial_states (real s1) H0).
     unfold head; rewrite H1.
     intuition eauto.
@@ -254,14 +254,14 @@ Proof.
     - (* MidBloc *)
       constructor 2. destruct (simu_mid_block (real s1) t (real s1')) as [H5 H4]; auto.
       unfold head in * |- *. rewrite H3. rewrite H4. intuition.
-      destruct (memorized s1); simpl; auto. tauto.
+      destruct (memorized s1); cbn; auto. tauto.
     - (* EndBloc *)
       constructor 1.
       destruct (simu_end_block (head s1) t (real s1') s2) as (s2' & H3 & H4); auto.
       * destruct (head_followed s1) as [H4 H3].
         cutrewrite (dist_end_block (head s1) - dist_end_block (real s1) = dist_end_block (head s1)) in H3; try omega.
         eapply starN_tailstep; eauto.
-      * unfold head; rewrite H2; simpl. intuition eauto. 
+      * unfold head; rewrite H2; cbn. intuition eauto. 
 Qed.
 
 Lemma forward_simulation_block_rel: forward_simulation L1 L2.
diff --git a/kvx/lib/Machblock.v b/kvx/lib/Machblock.v
index 08e0eba2..404c2a96 100644
--- a/kvx/lib/Machblock.v
+++ b/kvx/lib/Machblock.v
@@ -12,6 +12,8 @@
 (*                                                             *)
 (* *************************************************************)
 
+(** Abstract syntax and semantics of a Mach variant, structured with basic-blocks. *)
+
 Require Import Coqlib.
 Require Import Maps.
 Require Import AST.
@@ -28,7 +30,9 @@ Require Stacklayout.
 Require Import Mach.
 Require Import Linking.
 
-(** basic instructions (ie no control-flow) *)
+(** * Abstract Syntax *)
+
+(** ** basic instructions (ie no control-flow) *)
 Inductive basic_inst: Type :=
   | MBgetstack: ptrofs -> typ -> mreg -> basic_inst
   | MBsetstack: mreg -> ptrofs -> typ -> basic_inst
@@ -40,7 +44,7 @@ Inductive basic_inst: Type :=
 
 Definition bblock_body := list basic_inst.
 
-(** control flow instructions *)
+(** ** control flow instructions *)
 Inductive control_flow_inst: Type :=
   | MBcall: signature -> mreg + ident -> control_flow_inst
   | MBtailcall: signature -> mreg + ident -> control_flow_inst
@@ -51,6 +55,7 @@ Inductive control_flow_inst: Type :=
   | MBreturn: control_flow_inst
   .
 
+(** ** basic block *)
 Record bblock := mk_bblock {
   header: list label;
   body:   bblock_body;
@@ -65,7 +70,7 @@ Lemma bblock_eq:
   b1 = b2.
 Proof.
   intros. destruct b1. destruct b2.
-  simpl in *. subst. auto.
+  cbn in *. subst. auto.
 Qed.
 
 Definition length_opt {A} (o: option A) : nat :=
@@ -80,17 +85,19 @@ Lemma size_null b:
   size b = 0%nat ->
   header b = nil /\ body b = nil /\ exit b = None.
 Proof.
-  destruct b as [h b e]. simpl. unfold size. simpl.
+  destruct b as [h b e]. cbn. unfold size. cbn.
   intros H.
   assert (length h = 0%nat) as Hh; [ omega |].
   assert (length b = 0%nat) as Hb; [ omega |].
   assert (length_opt e = 0%nat) as He; [ omega|].
   repeat split.
-  destruct h; try (simpl in Hh; discriminate); auto.
-  destruct b; try (simpl in Hb; discriminate); auto.
-  destruct e; try (simpl in He; discriminate); auto.
+  destruct h; try (cbn in Hh; discriminate); auto.
+  destruct b; try (cbn in Hb; discriminate); auto.
+  destruct e; try (cbn in He; discriminate); auto.
 Qed.
 
+(** ** programs *)
+
 Definition code := list bblock.
 
 Record function: Type := mkfunction
@@ -106,7 +113,7 @@ Definition program := AST.program fundef unit.
 
 Definition genv := Genv.t fundef unit.
 
-(*** sémantique ***)
+(** * Operational (blockstep) semantics ***)
 
 Lemma in_dec (lbl: label) (l: list label):  { List.In lbl l } + { ~(List.In lbl l) }.
 Proof.
@@ -120,13 +127,13 @@ Definition is_label (lbl: label) (bb: bblock) : bool :=
 Lemma is_label_correct_true lbl bb:
   List.In lbl (header bb) <-> is_label lbl bb = true. 
 Proof.
-  unfold is_label; destruct (in_dec lbl (header bb)); simpl; intuition.
+  unfold is_label; destruct (in_dec lbl (header bb)); cbn; intuition.
 Qed.
 
 Lemma is_label_correct_false lbl bb:
   ~(List.In lbl (header bb)) <-> is_label lbl bb = false. 
 Proof.
-  unfold is_label; destruct (in_dec lbl (header bb)); simpl; intuition.
+  unfold is_label; destruct (in_dec lbl (header bb)); cbn; intuition.
 Qed.
 
 
@@ -155,7 +162,7 @@ Definition find_function_ptr
       Genv.find_symbol ge symb
   end.
 
-(** Machblock execution states. *)
+(** ** Machblock execution states. *)
 
 Inductive stackframe: Type :=
   | Stackframe:
diff --git a/kvx/lib/Machblockgen.v b/kvx/lib/Machblockgen.v
index 287e4f7b..3d5d7b2c 100644
--- a/kvx/lib/Machblockgen.v
+++ b/kvx/lib/Machblockgen.v
@@ -29,6 +29,8 @@ Require Import Mach.
 Require Import Linking.
 Require Import Machblock.
 
+(** * Tail-recursive (greedy) translation from Mach code to Machblock code *)
+
 Inductive Machblock_inst: Type :=
 | MB_label (lbl: label) 
 | MB_basic (bi: basic_inst)
@@ -71,9 +73,12 @@ Definition add_to_new_bblock (i:Machblock_inst) : bblock :=
   | MB_cfi i => cfi_bblock i
   end.
 
-(** Adding an instruction to the beginning of a bblock list
-  * Either adding the instruction to the head of the list,
-  * or create a new bblock with the instruction *)
+(** Adding an instruction to the beginning of a bblock list by
+
+- either adding the instruction to the head of the list,
+
+- or creating a new bblock with the instruction 
+*)
 Definition add_to_code (i:Machblock_inst) (bl:code) : code :=
   match bl with
   | bh::bl0 => match i with
@@ -112,7 +117,7 @@ Definition transf_program (src: Mach.program) : program :=
   transform_program transf_fundef src.
 
 
-(** Abstracting trans_code *)
+(** * Abstracting trans_code with a simpler inductive relation *)
 
 Inductive is_end_block: Machblock_inst -> code -> Prop :=
   | End_empty mbi: is_end_block mbi nil
@@ -143,11 +148,11 @@ Lemma add_to_code_is_trans_code i c bl:
   is_trans_code c bl ->
   is_trans_code (i::c) (add_to_code (trans_inst i) bl).
 Proof.
-  destruct bl as [|bh0 bl]; simpl.
+  destruct bl as [|bh0 bl]; cbn.
   - intro H. inversion H. subst. eauto.
   - remember (trans_inst i) as ti.
     destruct ti as [l|bi|cfi].
-    + intros; eapply Tr_add_label; eauto. destruct i; simpl in * |- *; congruence.
+    + intros; eapply Tr_add_label; eauto. destruct i; cbn in * |- *; congruence.
     + intros. remember (header bh0) as hbh0. destruct hbh0 as [|b].
       * eapply Tr_add_basic; eauto.
       * cutrewrite (add_basic bi empty_bblock = add_to_new_bblock (MB_basic bi)); auto.
@@ -165,7 +170,7 @@ Lemma trans_code_is_trans_code_rev c1: forall c2 mbi,
   is_trans_code c2 mbi ->
   is_trans_code (rev_append c1 c2) (trans_code_rev c1 mbi).
 Proof.
-  induction c1 as [| i c1]; simpl; auto.
+  induction c1 as [| i c1]; cbn; auto.
 Qed.
 
 Lemma trans_code_is_trans_code c: is_trans_code c (trans_code c).
@@ -181,17 +186,17 @@ Lemma add_to_code_is_trans_code_inv i c bl:
   is_trans_code (i::c) bl -> exists bl0, is_trans_code c bl0 /\ bl = add_to_code (trans_inst i) bl0.
 Proof.
   intro H; inversion H as [|H0 H1 bl0| | H0 bi bh H1 bl0]; clear H; subst; (repeat econstructor); eauto.
-  + (* case Tr_end_block *) inversion H3; subst; simpl; auto.
+  + (* case Tr_end_block *) inversion H3; subst; cbn; auto.
      * destruct (header bh); congruence.
-     * destruct bl0; simpl; congruence.
-  + (* case Tr_add_basic *) rewrite H3. simpl. destruct (header bh); congruence.
+     * destruct bl0; cbn; congruence.
+  + (* case Tr_add_basic *) rewrite H3. cbn. destruct (header bh); congruence.
 Qed. 
 
 Lemma trans_code_is_trans_code_rev_inv c1: forall c2 mbi, 
   is_trans_code (rev_append c1 c2) mbi ->
   exists mbi0, is_trans_code c2 mbi0 /\ mbi=trans_code_rev c1 mbi0.
 Proof.
-  induction c1 as [| i c1]; simpl; eauto.
+  induction c1 as [| i c1]; cbn; eauto.
   intros; exploit IHc1; eauto.
   intros (mbi0 & H1 & H2); subst.
   exploit add_to_code_is_trans_code_inv; eauto.
diff --git a/kvx/lib/Machblockgenproof.v b/kvx/lib/Machblockgenproof.v
index dfb97bfe..fc722887 100644
--- a/kvx/lib/Machblockgenproof.v
+++ b/kvx/lib/Machblockgenproof.v
@@ -146,16 +146,16 @@ Lemma parent_sp_preserved:
   forall s,
   Mach.parent_sp s = parent_sp (trans_stack s).
 Proof.
-  unfold parent_sp. unfold Mach.parent_sp. destruct s; simpl; auto.
-  unfold trans_stackframe. destruct s; simpl; auto.
+  unfold parent_sp. unfold Mach.parent_sp. destruct s; cbn; auto.
+  unfold trans_stackframe. destruct s; cbn; auto.
 Qed.
 
 Lemma parent_ra_preserved:
   forall s,
   Mach.parent_ra s = parent_ra (trans_stack s).
 Proof.
-  unfold parent_ra. unfold Mach.parent_ra. destruct s; simpl; auto.
-  unfold trans_stackframe. destruct s; simpl; auto.
+  unfold parent_ra. unfold Mach.parent_ra. destruct s; cbn; auto.
+  unfold trans_stackframe. destruct s; cbn; auto.
 Qed.
 
 Lemma external_call_preserved:
@@ -175,11 +175,11 @@ Proof.
   destruct i; try (constructor 2; split; auto; discriminate ).
   destruct (peq l0 l) as [P|P].
   - constructor. subst l0; split; auto.
-    revert H. unfold Mach.find_label. simpl. rewrite peq_true.
+    revert H. unfold Mach.find_label. cbn. rewrite peq_true.
     intros H; injection H; auto.
   - constructor 2. split.
     + intro F. injection F. intros. contradict P; auto.
-    + revert H. unfold Mach.find_label. simpl. rewrite peq_false; auto.
+    + revert H. unfold Mach.find_label. cbn. rewrite peq_false; auto.
 Qed.
 
 Lemma find_label_is_end_block_not_label i l c bl:
@@ -192,24 +192,24 @@ Proof.
   remember (is_label l _) as b.
   cutrewrite (b = false); auto.
   subst; unfold is_label.
-  destruct i; simpl in * |- *; try (destruct (in_dec l nil); intuition).
+  destruct i; cbn in * |- *; try (destruct (in_dec l nil); intuition).
   inversion H.
   destruct (in_dec l (l0::nil)) as [H6|H6]; auto.
-  simpl in H6; intuition try congruence.
+  cbn in H6; intuition try congruence.
 Qed.
 
 Lemma find_label_at_begin l bh bl:
   In l (header bh)
   -> find_label l (bh :: bl) = Some (bh::bl).
 Proof.
-  unfold find_label; rewrite is_label_correct_true; intro H; rewrite H; simpl; auto.
+  unfold find_label; rewrite is_label_correct_true; intro H; rewrite H; cbn; auto.
 Qed.
 
 Lemma find_label_add_label_diff l bh bl:
       ~(In l (header bh)) -> 
       find_label l (bh::bl) = find_label l bl.
 Proof.
-  unfold find_label; rewrite is_label_correct_false; intro H; rewrite H; simpl; auto.
+  unfold find_label; rewrite is_label_correct_false; intro H; rewrite H; cbn; auto.
 Qed.
 
 Definition concat (h: list label) (c: code): code :=
@@ -227,18 +227,18 @@ Proof.
   rewrite <- is_trans_code_inv in * |-.
   induction Heqbl. 
   + (* Tr_nil *) 
-    intros; exists (l::nil); simpl in * |- *; intuition.
+    intros; exists (l::nil); cbn in * |- *; intuition.
     discriminate.
   + (* Tr_end_block *)
     intros.
     exploit Mach_find_label_split; eauto.
     clear H0; destruct 1 as [(H0&H2)|(H0&H2)].
-    - subst. rewrite find_label_at_begin; simpl; auto.
+    - subst. rewrite find_label_at_begin; cbn; auto.
       inversion H as [mbi H1 H2| | ].
       subst.
       inversion Heqbl.
       subst.
-      exists (l :: nil); simpl; eauto.
+      exists (l :: nil); cbn; eauto.
     - exploit IHHeqbl; eauto.
       destruct 1 as (h & H3 & H4).
       exists h.
@@ -251,21 +251,21 @@ Proof.
     - subst.
       inversion H0 as [H1].
       clear H0.
-      erewrite find_label_at_begin; simpl; eauto.
+      erewrite find_label_at_begin; cbn; eauto.
       subst_is_trans_code Heqbl.
-      exists (l :: nil); simpl; eauto.
+      exists (l :: nil); cbn; eauto.
     - subst; assert (H: l0 <> l); try congruence; clear H0.
       exploit IHHeqbl; eauto.
       clear IHHeqbl Heqbl.
       intros (h & H3 & H4).
-      simpl; unfold is_label, add_label; simpl.
-      destruct (in_dec l (l0::header bh)) as [H5|H5]; simpl in H5.
+      cbn; unfold is_label, add_label; cbn.
+      destruct (in_dec l (l0::header bh)) as [H5|H5]; cbn in H5.
       * destruct H5; try congruence.
-        exists (l0::h); simpl; intuition.
+        exists (l0::h); cbn; intuition.
         rewrite find_label_at_begin in H4; auto.
         apply f_equal. inversion H4 as [H5]. clear H4.
-        destruct (trans_code c'); simpl in * |- *;
-        inversion H5; subst; simpl; auto.
+        destruct (trans_code c'); cbn in * |- *;
+        inversion H5; subst; cbn; auto.
       * exists h. intuition.
         erewrite <- find_label_add_label_diff; eauto.
   + (* Tr_add_basic *)
@@ -318,12 +318,12 @@ Local Hint Resolve exec_MBgetstack exec_MBsetstack exec_MBgetparam exec_MBop exe
 
 Lemma size_add_label l bh: size (add_label l bh) = size bh + 1.
 Proof.
-  unfold add_label, size; simpl; omega.
+  unfold add_label, size; cbn; omega.
 Qed.
 
 Lemma size_add_basic bi bh: header bh = nil -> size (add_basic bi bh) = size bh + 1.
 Proof.
-  intro H. unfold add_basic, size; rewrite H; simpl. omega.
+  intro H. unfold add_basic, size; rewrite H; cbn. omega.
 Qed.
 
 
@@ -418,8 +418,8 @@ Proof.
   + exists lbl. 
     unfold trans_inst in H1. 
     destruct i; congruence.
-  + unfold add_basic in H; simpl in H; congruence.
-  + unfold cfi_bblock in H; simpl in H; congruence.
+  + unfold add_basic in H; cbn in H; congruence.
+  + unfold cfi_bblock in H; cbn in H; congruence.
 Qed.
 
 Local Hint Resolve Mlabel_is_not_basic: core.
@@ -433,11 +433,11 @@ Proof.
   intros b bl H; remember (trans_inst i) as ti.
   destruct ti as [lbl|bi|cfi];
   inversion H as [|d0 d1 d2 H0 H1| |]; subst;
-  try (rewrite <- Heqti in * |- *); simpl in * |- *;
+  try (rewrite <- Heqti in * |- *); cbn in * |- *;
   try congruence.
   + (* label at end block *)
     inversion H1; subst. inversion H0; subst.
-    assert (X:i=Mlabel lbl). { destruct i; simpl in Heqti; congruence. }
+    assert (X:i=Mlabel lbl). { destruct i; cbn in Heqti; congruence. }
     subst. repeat econstructor; eauto.
   + (* label at mid block *)
     exploit IHc; eauto.
@@ -451,12 +451,12 @@ Proof.
       assert (X:(trans_inst i) = MB_basic bi ). { repeat econstructor; congruence. }
       repeat econstructor; congruence.
     - exists (i::c), c, c.
-      repeat econstructor; eauto; inversion H0; subst; repeat econstructor; simpl; try congruence.
+      repeat econstructor; eauto; inversion H0; subst; repeat econstructor; cbn; try congruence.
       * exploit (add_to_new_block_is_label i0); eauto.
-        intros (l & H8); subst; simpl; congruence.
+        intros (l & H8); subst; cbn; congruence.
       * exploit H3; eauto.
       * exploit (add_to_new_block_is_label i0); eauto.
-        intros (l & H8); subst; simpl; congruence.
+        intros (l & H8); subst; cbn; congruence.
   + (* basic at mid block *)
     inversion H1; subst.
     exploit IHc; eauto.
@@ -476,7 +476,7 @@ Lemma step_simu_header st f sp rs m s c h c' t:
  starN (Mach.step (inv_trans_rao rao)) (Genv.globalenv prog) (length h) (Mach.State st f sp c rs m) t s -> 
  s = Mach.State st f sp c' rs m /\ t = E0.
 Proof.
-  induction 1; simpl; intros hs; try (inversion hs; tauto).
+  induction 1; cbn; intros hs; try (inversion hs; tauto).
   inversion hs as [|n1 s1 t1 t2 s2 t3 s3 H1]. inversion H1. subst. auto. 
 Qed.
 
@@ -487,21 +487,21 @@ Lemma step_simu_basic_step (i: Mach.instruction) (bi: basic_inst) (c: Mach.code)
   Mach.step (inv_trans_rao rao) ge (Mach.State s f sp (i::c) rs m) t s' ->
   exists rs' m', s'=Mach.State s f sp c rs' m' /\ t=E0 /\ basic_step tge (trans_stack s) f sp rs m bi rs' m'.
 Proof.
-  destruct i; simpl in * |-;
+  destruct i; cbn in * |-;
    (discriminate
     || (intro H; inversion_clear H; intro X; inversion_clear X; eapply ex_intro; eapply ex_intro; intuition eauto)).
   - eapply exec_MBgetparam; eauto. exploit (functions_translated); eauto. intro.
     destruct H3 as (tf & A & B). subst. eapply A.
-    all: simpl; rewrite <- parent_sp_preserved; auto.
-  - eapply exec_MBop; eauto. rewrite <- H. destruct o; simpl; auto. destruct (rs ## l); simpl; auto.
+    all: cbn; rewrite <- parent_sp_preserved; auto.
+  - eapply exec_MBop; eauto. rewrite <- H. destruct o; cbn; auto. destruct (rs ## l); cbn; auto.
     unfold Genv.symbol_address; rewrite symbols_preserved; auto.
-  - eapply exec_MBload; eauto; rewrite <- H; destruct a; simpl; auto; destruct (rs ## l); simpl; auto;
+  - eapply exec_MBload; eauto; rewrite <- H; destruct a; cbn; auto; destruct (rs ## l); cbn; auto;
     unfold Genv.symbol_address; rewrite symbols_preserved; auto.
-  - eapply exec_MBload_notrap1; eauto; rewrite <- H; destruct a; simpl; auto; destruct (rs ## l); simpl; auto;
+  - eapply exec_MBload_notrap1; eauto; rewrite <- H; destruct a; cbn; auto; destruct (rs ## l); cbn; auto;
     unfold Genv.symbol_address; rewrite symbols_preserved; auto.
-  - eapply exec_MBload_notrap2; eauto; rewrite <- H; destruct a; simpl; auto; destruct (rs ## l); simpl; auto;
+  - eapply exec_MBload_notrap2; eauto; rewrite <- H; destruct a; cbn; auto; destruct (rs ## l); cbn; auto;
     unfold Genv.symbol_address; rewrite symbols_preserved; auto.
-  - eapply exec_MBstore; eauto; rewrite <- H; destruct a; simpl; auto; destruct (rs ## l); simpl; auto;
+  - eapply exec_MBstore; eauto; rewrite <- H; destruct a; cbn; auto; destruct (rs ## l); cbn; auto;
     unfold Genv.symbol_address; rewrite symbols_preserved; auto.
 Qed.
 
@@ -511,7 +511,7 @@ Lemma star_step_simu_body_step s f sp c bdy c':
   starN (Mach.step (inv_trans_rao rao)) ge (length bdy) (Mach.State s f sp c rs m) t s' ->
   exists rs' m', s'=Mach.State s f sp c' rs' m' /\ t=E0 /\ body_step tge (trans_stack s) f sp bdy rs m rs' m'.
 Proof.
-  induction 1; simpl.
+  induction 1; cbn.
   + intros. inversion H. exists rs. exists m. auto.
   + intros. inversion H0. exists rs. exists m. auto.
   + intros. inversion H1; subst. 
@@ -531,15 +531,15 @@ Local Hint Resolve eval_builtin_args_preserved external_call_symbols_preserved f
 Lemma match_states_concat_trans_code st f sp c rs m h: 
   match_states (Mach.State st f sp c rs m) (State (trans_stack st) f sp (concat h (trans_code c)) rs m).
 Proof.
-  intros; constructor 1; simpl.
+  intros; constructor 1; cbn.
   + intros (t0 & s1' & H0) t s'. 
     remember (trans_code _) as bl.
     destruct bl as [|bh bl]. 
     { rewrite <- is_trans_code_inv in Heqbl; inversion Heqbl; inversion H0; congruence. } 
     clear H0.
-    simpl; constructor 1; 
-    intros X; inversion X as [d1 d2 d3 d4 d5 d6 d7 rs' m' d10 d11 X1 X2| | | ]; subst; simpl in * |- *; 
-    eapply exec_bblock; eauto; simpl;
+    cbn; constructor 1; 
+    intros X; inversion X as [d1 d2 d3 d4 d5 d6 d7 rs' m' d10 d11 X1 X2| | | ]; subst; cbn in * |- *; 
+    eapply exec_bblock; eauto; cbn;
     inversion X2 as [cfi d1 d2 d3 H1|]; subst; eauto;
     inversion H1; subst; eauto.
   + intros H r; constructor 1; intro X; inversion X.
@@ -551,7 +551,7 @@ Lemma step_simu_cfi_step (i: Mach.instruction) (cfi: control_flow_inst) (c: Mach
   Mach.step (inv_trans_rao rao) ge (Mach.State stk f sp (i::c) rs m) t s' ->
   exists s2, cfi_step rao tge cfi (State (trans_stack stk) f sp (b::blc) rs m) t s2 /\ match_states s' s2.
 Proof.
-  destruct i; simpl in * |-;
+  destruct i; cbn in * |-;
   (intro H; intro Htc;apply is_trans_code_inv in Htc;rewrite Htc;inversion_clear H;intro X; inversion_clear X).
   * eapply ex_intro.
     intuition auto.
@@ -561,8 +561,8 @@ Proof.
     intuition auto.
     eapply exec_MBtailcall;eauto. 
     - rewrite <-H; exploit (find_function_ptr_same); eauto.
-    - simpl; rewrite <- parent_sp_preserved; auto.
-    - simpl; rewrite <- parent_ra_preserved; auto.
+    - cbn; rewrite <- parent_sp_preserved; auto.
+    - cbn; rewrite <- parent_ra_preserved; auto.
   * eapply ex_intro.
     intuition auto.
     eapply exec_MBbuiltin ;eauto.
@@ -605,7 +605,7 @@ Proof.
     inversion H1; subst. 
     exploit (step_simu_cfi_step); eauto.
     intros [s2 [Hcfi1 Hcfi3]].
-    inversion H4. subst; simpl.
+    inversion H4. subst; cbn.
     autorewrite  with trace_rewrite.
     exists s2.
     split;eauto.
@@ -616,7 +616,7 @@ Lemma simu_end_block:
   starN (Mach.step (inv_trans_rao rao)) ge (Datatypes.S (dist_end_block s1)) s1 t s1' ->
   exists s2', step rao tge (trans_state s1) t s2' /\ match_states s1' s2'.
 Proof.
-  destruct s1; simpl.
+  destruct s1; cbn.
   + (* State *)
     remember (trans_code _) as tc.
     rewrite <- is_trans_code_inv in Heqtc.
@@ -624,7 +624,7 @@ Proof.
     destruct tc as [|b bl].
     { (* nil => absurd *) 
       inversion Heqtc. subst. 
-      unfold dist_end_block_code; simpl.
+      unfold dist_end_block_code; cbn.
       inversion_clear H;
       inversion_clear H0. 
     }
@@ -659,7 +659,7 @@ Proof.
     intros t s1' H; inversion_clear H.
     eapply ex_intro; constructor 1; eauto.
     inversion H1; subst; clear H1.
-    inversion_clear H0; simpl.
+    inversion_clear H0; cbn.
     - (* function_internal*)
       cutrewrite (trans_code (Mach.fn_code f0) = fn_code (transf_function f0)); eauto.
       eapply exec_function_internal; eauto.
@@ -674,7 +674,7 @@ Proof.
     intros t s1' H; inversion_clear H.
     eapply ex_intro; constructor 1; eauto.
     inversion H1; subst; clear H1.
-    inversion_clear H0; simpl.
+    inversion_clear H0; cbn.
     eapply exec_return.
 Qed.
 
@@ -685,10 +685,10 @@ dist_end_block_code (i :: c) = 0.
 Proof.
   unfold dist_end_block_code.
   intro H. destruct H as [cfi H].
-  destruct i;simpl in H;try(congruence); ( 
+  destruct i;cbn in H;try(congruence); ( 
     remember (trans_code _) as bl; 
     rewrite <- is_trans_code_inv in Heqbl;
-    inversion Heqbl; subst; simpl in * |- *; try (congruence)).
+    inversion Heqbl; subst; cbn in * |- *; try (congruence)).
 Qed.
 
 Theorem transf_program_correct: 
@@ -697,23 +697,23 @@ Proof.
   apply forward_simulation_block_trans with (dist_end_block := dist_end_block) (trans_state := trans_state).
 (* simu_mid_block *)
   - intros s1 t s1' H1 H2.
-    destruct H1; simpl in * |- *; omega || (intuition auto);
-    destruct H2; eapply cfi_dist_end_block; simpl; eauto.
+    destruct H1; cbn in * |- *; omega || (intuition auto);
+    destruct H2; eapply cfi_dist_end_block; cbn; eauto.
 (* public_preserved *)
   - apply senv_preserved.
 (* match_initial_states *)
-  - intros. simpl.
+  - intros. cbn.
     eapply ex_intro; constructor 1.
     eapply match_states_trans_state.
     destruct H. split.
     apply init_mem_preserved; auto.
     rewrite prog_main_preserved. rewrite <- H0. apply symbols_preserved.
 (* match_final_states *)
-  - intros. simpl. destruct H. split with (r := r); auto.
+  - intros. cbn. destruct H. split with (r := r); auto.
 (* final_states_end_block *)
-  - intros. simpl in H0.
+  - intros. cbn in H0.
     inversion H0.
-    inversion H; simpl; auto.
+    inversion H; cbn; auto.
     all: try (subst; discriminate).
     apply cfi_dist_end_block; exists MBreturn; eauto.
 (* simu_end_block *)
@@ -733,8 +733,8 @@ Proof.
   intro H; destruct c as [|i' c]. { inversion H. }
   remember (trans_inst i) as ti.
   destruct ti as [lbl|bi|cfi].
-  - (*i=lbl *) cutrewrite (i = Mlabel lbl). 2: ( destruct i; simpl in * |- *; try congruence ).
-    exists nil; simpl; eexists. eapply Tr_add_label; eauto.
+  - (*i=lbl *) cutrewrite (i = Mlabel lbl). 2: ( destruct i; cbn in * |- *; try congruence ).
+    exists nil; cbn; eexists. eapply Tr_add_label; eauto.
   - (*i=basic*)
     destruct i'.
     10: { exists (add_to_new_bblock (MB_basic bi)::nil).  exists b. 
@@ -742,11 +742,11 @@ Proof.
       rewrite Heqti.        
       eapply  Tr_end_block; eauto.
       rewrite <-Heqti.
-      eapply End_basic. inversion H; try(simpl; congruence).
-      simpl in H5; congruence. }
-    all: try(exists nil; simpl; eexists;  eapply  Tr_add_basic; eauto; inversion H; try(eauto || congruence)).
+      eapply End_basic. inversion H; try(cbn; congruence).
+      cbn in H5; congruence. }
+    all: try(exists nil; cbn; eexists;  eapply  Tr_add_basic; eauto; inversion H; try(eauto || congruence)).
   - (*i=cfi*)
-    destruct i; try(simpl in Heqti; congruence).
+    destruct i; try(cbn in Heqti; congruence).
     all: exists (add_to_new_bblock (MB_cfi cfi)::nil);  exists b; 
         cutrewrite ((add_to_new_bblock (MB_cfi cfi) :: nil) ++ (b::l)=(add_to_new_bblock (MB_cfi cfi) :: (b::l)));eauto;
         rewrite Heqti;        
@@ -768,13 +768,13 @@ Qed.
 (* FIXME: these two lemma should go into [Coqlib.v] *) 
 Lemma is_tail_app A (l1: list A): forall l2, is_tail l2 (l1 ++ l2).
 Proof.
-  induction l1; simpl; auto with coqlib.
+  induction l1; cbn; auto with coqlib.
 Qed.
 Hint Resolve is_tail_app: coqlib.
 
 Lemma is_tail_app_inv A (l1: list A): forall l2 l3, is_tail (l1 ++ l2) l3 -> is_tail l2 l3.
 Proof.
-  induction l1; simpl; auto with coqlib.
+  induction l1; cbn; auto with coqlib.
   intros l2 l3 H; inversion H; eauto with coqlib.
 Qed.
 Hint Resolve is_tail_app_inv: coqlib.
@@ -787,17 +787,17 @@ Proof.
   - intros; subst.
     remember (trans_code (Mcall _ _::c)) as tc2.
     rewrite <- is_trans_code_inv in Heqtc2.
-    inversion Heqtc2; simpl in * |- *; subst; try congruence.
+    inversion Heqtc2; cbn in * |- *; subst; try congruence.
     subst_is_trans_code H1.
     eapply ex_intro; eauto with coqlib.
   - intros; exploit IHis_tail; eauto. clear IHis_tail.
     intros (b & Hb). inversion Hb; clear Hb.
     * exploit (trans_code_monotonic i c2); eauto.
       intros (l' & b' & Hl'); rewrite Hl'.
-      exists b'; simpl; eauto with coqlib.
+      exists b'; cbn; eauto with coqlib.
     * exploit (trans_code_monotonic i c2); eauto.
       intros (l' & b' & Hl'); rewrite Hl'.
-      simpl; eapply ex_intro.
+      cbn; eapply ex_intro.
       eapply is_tail_trans; eauto with coqlib.
 Qed.
 
diff --git a/lib/Coqlib.v b/lib/Coqlib.v
index 02c5d07f..16d880fa 100644
--- a/lib/Coqlib.v
+++ b/lib/Coqlib.v
@@ -1325,3 +1325,9 @@ Lemma nlist_forall2_imply:
 Proof.
   induction 1; simpl; intros; constructor; auto.
 Qed.
+
+Lemma if_same : forall {T : Type} (b : bool) (x : T),
+    (if b then x else x) = x.
+Proof.
+  destruct b; trivial.
+Qed.
diff --git a/runtime/include/ccomp_kvx_fixes.h b/runtime/include/ccomp_kvx_fixes.h
index 65d65e7b..a518a069 100644
--- a/runtime/include/ccomp_kvx_fixes.h
+++ b/runtime/include/ccomp_kvx_fixes.h
@@ -33,13 +33,26 @@ extern __int128 __compcert_acswapd(void *address, unsigned long long new_value,
 #define __builtin_kvx_acswapw __compcert_acswapw
 extern __int128 __compcert_acswapw(void *address, unsigned long long new_value, unsigned long long old_value);
 
+#define __builtin_kvx_aladdd __compcert_aladdd
+extern long long __compcert_aladdd(void *address, unsigned long long incr);
+
+#define __builtin_kvx_aladdw __compcert_aladdw
+extern int __compcert_aladdw(void *address, unsigned int incr);
+
 #define __builtin_kvx_afaddd __compcert_afaddd
 extern long long __compcert_afaddd(void *address, unsigned long long incr);
 
 #define __builtin_kvx_afaddw __compcert_afaddw
 extern int __compcert_afaddw(void *address, unsigned int incr);
-#endif
+
+#define __builtin_kvx_ld __compcert_ld
+extern int __compcert_ld(void *address, const char *str, const int b);
+
+#define __builtin_kvx_lwz __compcert_lwz
+extern int __compcert_lwz(void *address, const char *str, const int b);
 
 /* #define __builtin_expect(x, y) (x) */
 #define __builtin_ctz(x) __builtin_kvx_ctzw(x)
 #define __builtin_clz(x) __builtin_kvx_clzw(x)
+
+#endif
diff --git a/test/kvx/instr/Makefile b/test/kvx/instr/Makefile
index e4f964b3..fce32178 100644
--- a/test/kvx/instr/Makefile
+++ b/test/kvx/instr/Makefile
@@ -1,15 +1,15 @@
 SHELL := /bin/bash
 
-KVXC ?= k1-cos-gcc
+KVXC ?= kvx-elf-gcc
 CC ?= gcc
 CCOMP ?= ccomp
 OPTIM ?= -O2
 CFLAGS ?= $(OPTIM)
 CCOMPFLAGS ?= $(CFLAGS)
-SIMU ?= k1-mppa
+SIMU ?= kvx-mppa
 TIMEOUT ?= --signal=SIGTERM 120s
 DIFF ?= python2.7 floatcmp.py -reltol .00001
-HARDRUN ?= k1-jtag-runner
+HARDRUN ?= kvx-jtag-runner
 
 DIR=./
 SRCDIR=$(DIR)
@@ -64,7 +64,7 @@ simutest: $(X86_GCC_OUT) $(GCC_SIMUOUT)
 		x86out=$(OUTDIR)/$$test.x86-gcc.out;\
 		gccout=$(OUTDIR)/$$test.gcc.simu.out;\
 		if grep "__KVX__" -q $$test.c; then\
-			printf "$(YELLOW)UNTESTED: $$test.c contains an \`#ifdef __KVX__\`\n";\
+			printf "$(YELLOW)UNTESTED: $$test.c contains an \`#ifdef __KVX__\`\n$(NC)";\
 		elif $(DIFF) $$x86out $$gccout > /dev/null; test $${PIPESTATUS[0]} -ne 0; then\
 			>&2 printf "$(RED)ERROR: $$x86out and $$gccout differ$(NC)\n";\
 		else\
@@ -92,7 +92,7 @@ hardtest: $(X86_GCC_OUT) $(GCC_HARDOUT)
 		x86out=$(OUTDIR)/$$test.x86-gcc.out;\
 		gccout=$(OUTDIR)/$$test.gcc.hard.out;\
 		if grep "__KVX__" -q $$test.c; then\
-			printf "$(YELLOW)UNTESTED: $$test.c contains an \`#ifdef __KVX__\`\n";\
+			printf "$(YELLOW)UNTESTED: $$test.c contains an \`#ifdef __KVX__\`\n$(NC)";\
 		elif $(DIFF) $$x86out $$gccout > /dev/null; test $${PIPESTATUS[0]} -ne 0; then\
 			>&2 printf "$(RED)ERROR: $$x86out and $$gccout differ$(NC)\n";\
 		else\
diff --git a/test/kvx/interop/Makefile b/test/kvx/interop/Makefile
index a0d4d7da..aa018aac 100644
--- a/test/kvx/interop/Makefile
+++ b/test/kvx/interop/Makefile
@@ -1,12 +1,12 @@
 SHELL := /bin/bash
 
-KVXC ?= k1-cos-gcc
+KVXC ?= kvx-elf-gcc
 CC ?= gcc
 CCOMP ?= ccomp
 CFLAGS ?= -O2 -Wno-varargs
-SIMU ?= k1-mppa
+SIMU ?= kvx-mppa
 TIMEOUT ?= --signal=SIGTERM 120s
-HARDRUN ?= k1-jtag-runner
+HARDRUN ?= kvx-jtag-runner
 
 DIR=./
 SRCDIR=$(DIR)
diff --git a/test/kvx/lib/Makefile b/test/kvx/lib/Makefile
index 5a947bb3..7df7dd16 100644
--- a/test/kvx/lib/Makefile
+++ b/test/kvx/lib/Makefile
@@ -1,10 +1,10 @@
-KVXC ?= k1-cos-gcc
-K1AR ?= k1-cos-ar
+KVXC ?= kvx-elf-gcc
+K1AR ?= kvx-elf-ar
 CC ?= gcc
 AR ?= gcc-ar
 CCOMP ?= ccomp
 CFLAGS ?= -O1 -Wl,--wrap=printf
-SIMU ?= k1-mppa
+SIMU ?= kvx-mppa
 TIMEOUT ?= --signal=SIGTERM 60s
 
 DIR=./
diff --git a/test/kvx/mmult/Makefile b/test/kvx/mmult/Makefile
index e7cd890e..252f8911 100644
--- a/test/kvx/mmult/Makefile
+++ b/test/kvx/mmult/Makefile
@@ -1,8 +1,8 @@
-KVXC ?= k1-cos-gcc
+KVXC ?= kvx-elf-gcc
 CC ?= gcc
 CCOMP ?= ccomp
 CFLAGS ?= -O2
-SIMU ?= k1-mppa
+SIMU ?= kvx-mppa
 TIMEOUT ?= 10s
 
 KVXCPATH=$(shell which $(KVXC))
@@ -65,3 +65,7 @@ check: $(CCOMP_OUT) $(STUB_OUT)
 	else\
 		echo "GOOD kvx: $< succeeded";\
 	fi
+
+.PHONY:
+clean:
+	rm -f *.out mmult-test-ccomp-kvx mmult-test-gcc-kvx mmult-test-gcc-x86
diff --git a/test/kvx/prng/Makefile b/test/kvx/prng/Makefile
index 68e5ffc9..b97f4aa4 100644
--- a/test/kvx/prng/Makefile
+++ b/test/kvx/prng/Makefile
@@ -1,8 +1,8 @@
-KVXC ?= k1-cos-gcc
+KVXC ?= kvx-elf-gcc
 CC ?= gcc
 CCOMP ?= ccomp
 CFLAGS ?= -O2
-SIMU ?= k1-mppa
+SIMU ?= kvx-mppa
 TIMEOUT ?= 10s
 
 KVXCPATH=$(shell which $(KVXC))
@@ -67,3 +67,4 @@ check: $(CCOMP_OUT) $(STUB_OUT)
 .PHONY:
 clean:
 	rm -f prng-test-gcc-x86 prng-test-gcc-kvx prng-test-ccomp-kvx
+	rm -f *.out
diff --git a/test/kvx/sort/Makefile b/test/kvx/sort/Makefile
index c4090352..46a8f025 100644
--- a/test/kvx/sort/Makefile
+++ b/test/kvx/sort/Makefile
@@ -1,9 +1,9 @@
-KVXC ?= k1-cos-gcc
+KVXC ?= kvx-elf-gcc
 CC ?= gcc
 CCOMP ?= ccomp
 CFLAGS ?= -O2
-SIMU ?= k1-mppa
-TIMEOUT ?= 10s
+SIMU ?= kvx-mppa
+TIMEOUT ?= 20s
 
 KVXCPATH=$(shell which $(KVXC))
 CCPATH=$(shell which $(CC))
@@ -89,3 +89,10 @@ check: $(STUB_OUT) $(CCOMP_OUT)
 			echo "GOOD kvx: $$test succeeded";\
 		fi;\
 	done
+
+.PHONY:
+clean:
+	for test in insertion main merge selection; do\
+		rm -f $$test-ccomp-kvx $$test-gcc-kvx $$test-gcc-x86;\
+	done
+	rm -f *.out
diff --git a/test/monniaux/cycles.h b/test/monniaux/cycles.h
index 1f7a991a..f26060a7 100644
--- a/test/monniaux/cycles.h
+++ b/test/monniaux/cycles.h
@@ -6,7 +6,7 @@
 typedef uint64_t cycle_t;
 #define PRcycle PRId64
 
-#include <../../k1-cos/include/hal/cos_registers.h>
+#include <../../kvx-cos/include/hal/cos_registers.h>
 
 static inline void cycle_count_config(void)
 {
diff --git a/test/monniaux/loop_nest/syrk.c b/test/monniaux/loop_nest/syrk.c
new file mode 100644
index 00000000..490d0a01
--- /dev/null
+++ b/test/monniaux/loop_nest/syrk.c
@@ -0,0 +1,28 @@
+/* Include polybench common header. */
+#include "polybench.h"
+
+/* Include benchmark-specific header. */
+/* Default data type is double, default size is 4000. */
+#include "syrk.h"
+
+/* Main computational kernel. The whole function will be timed,
+   including the call and return. */
+void kernel_syrk(int ni, int nj,
+		 DATA_TYPE alpha,
+		 DATA_TYPE beta,
+		 DATA_TYPE POLYBENCH_2D(C,NI,NI,ni,ni),
+		 DATA_TYPE POLYBENCH_2D(A,NI,NJ,ni,nj))
+{
+  int i, j, k;
+
+  /*  C := alpha*A*A' + beta*C */
+#if 0
+  for (i = 0; i < _PB_NI; i++)
+    for (j = 0; j < _PB_NI; j++)
+      C[i][j] *= beta;
+#endif
+  for (i = 0; i < _PB_NI; i++)
+    for (j = 0; j < _PB_NI; j++)
+      for (k = 0; k < _PB_NJ; k++)
+	  C[i][j] += alpha * A[i][k] * A[j][k];
+}
diff --git a/test/monniaux/rules.mk b/test/monniaux/rules.mk
index f0db6afa..c0594ef9 100644
--- a/test/monniaux/rules.mk
+++ b/test/monniaux/rules.mk
@@ -24,12 +24,12 @@ ALL_GCCFLAGS+=$(ALL_CFLAGS) -std=c99 -Wextra -Werror=implicit
 ALL_CCOMPFLAGS+=$(ALL_CFLAGS)
 
 # The compilers
-KVX_CC?=k1-cos-gcc
+KVX_CC?=kvx-cos-gcc
 KVX_CCOMP?=ccomp
 
 # Command to execute
-#EXECUTE_CYCLES?=timeout --signal=SIGTERM 3m k1-cluster --syscall=libstd_scalls.so --cycle-based --
-EXECUTE_CYCLES?=k1-cluster --syscall=libstd_scalls.so --cycle-based --
+#EXECUTE_CYCLES?=timeout --signal=SIGTERM 3m kvx-cluster --syscall=libstd_scalls.so --cycle-based --
+EXECUTE_CYCLES?=kvx-cluster --syscall=libstd_scalls.so --cycle-based --
 
 # You can define up to GCC4FLAGS and CCOMP4FLAGS
 GCC0FLAGS?=$(ALL_GCCFLAGS) -O0
diff --git a/test/monniaux/scheduling/mal_schedule.c b/test/monniaux/scheduling/mal_schedule.c
new file mode 100644
index 00000000..a6ba967f
--- /dev/null
+++ b/test/monniaux/scheduling/mal_schedule.c
@@ -0,0 +1,14 @@
+#include <stdint.h>
+int16_t meuh;
+extern int uv_encode(double, double, int);
+void f(int *ab, int e) {
+  uint32_t *ao = (uint32_t *)ab;
+  int16_t *aq = &meuh;
+  while (e) {
+    int ar, as;
+    ar = 1. / 2147483647;
+    as = uv_encode(5, *aq, *ab);
+    if (as)
+      *ao++ = ar;
+  }
+}
diff --git a/test/monniaux/yarpgen/Makefile b/test/monniaux/yarpgen/Makefile
index 28bd5ae0..24dd19c3 100644
--- a/test/monniaux/yarpgen/Makefile
+++ b/test/monniaux/yarpgen/Makefile
@@ -1,5 +1,6 @@
 TARGET_CCOMP=../../../ccomp
 TARGET_CC=gcc
+#EXECUTE=kvx-cluster -- 
 
 all:
 
@@ -19,7 +20,7 @@ MAX=129
 PREFIX=ran%06.f
 
 CCOMPOPTS=-static
-CCOMPFLAGS+=-funprototyped -fbitfields -fno-cse2 -stdlib ../../../runtime # FIXME
+CCOMPFLAGS+= -funprototyped -fbitfields -fno-cse2 -stdlib ../../../runtime
 
 TESTS_C=$(shell seq --format $(PREFIX)/func.c 1 $(MAX)) \
 	$(shell seq --format $(PREFIX)/driver.c 1 $(MAX)) \
diff --git a/tools/compiler_expand.ml b/tools/compiler_expand.ml
index 1fa5ad28..e5cab30c 100644
--- a/tools/compiler_expand.ml
+++ b/tools/compiler_expand.ml
@@ -20,13 +20,14 @@ TOTAL, (Option "profile_arcs"), (Some "Profiling insertion"), "Profiling";
 TOTAL, (Option "branch_probabilities"), (Some "Profiling use"), "ProfilingExploit";
 TOTAL, (Option "optim_move_loop_invariants"), (Some "Inserting initial nop"), "FirstNop";
 TOTAL, Always, (Some "Renumbering"), "Renumber";
-PARTIAL, (Option "optim_duplicate"),  (Some "Tail-duplicating"), "Duplicate";
+PARTIAL, (Option "optim_CSE"), (Some "CSE"), "CSE";
+PARTIAL, Always, (Some "Duplicating blocks"), "Duplicate";
 TOTAL, Always, (Some "Renumbering pre constprop"), "Renumber";
 TOTAL, (Option "optim_constprop"), (Some "Constant propagation"), "Constprop";
 TOTAL, Always, (Some "Renumbering pre CSE"), "Renumber";
-PARTIAL, (Option "optim_CSE"), (Some "CSE"), "CSE";
 TOTAL, (Option "optim_CSE2"), (Some "CSE2"), "CSE2";
 PARTIAL, (Option "optim_CSE3"), (Some "CSE3"), "CSE3";
+TOTAL, (Option "optim_CSE3"), (Some "Kill useless moves after CSE3"), "KillUselessMoves";
 TOTAL, (Option "optim_forward_moves"), (Some "Forwarding moves"), "ForwardMoves";
 PARTIAL, (Option "optim_redundancy"), (Some "Redundancy elimination"), "Deadcode";
 PARTIAL, (Option "optim_move_loop_invariants"), (Some "LICM"), "LICM";