64 files changed, 2908 insertions, 532 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9f407912..b3bb418f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -14,6 +14,8 @@ check-admitted:
   rules:
     - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
+    - if: '$CI_COMMIT_BRANCH == "kvx-work-ssa"'
+      when: always
     - if: '$CI_COMMIT_BRANCH == "kvx-work-velus"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "master"'
@@ -35,6 +37,8 @@ build_x86_64:
   rules:
     - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
+    - if: '$CI_COMMIT_BRANCH == "kvx-work-ssa"'
+      when: always
     - if: '$CI_COMMIT_BRANCH == "kvx-work-velus"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "master"'
@@ -58,6 +62,8 @@ build_ia32:
   rules:
     - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
+    - if: '$CI_COMMIT_BRANCH == "kvx-work-ssa"'
+      when: always
     - if: '$CI_COMMIT_BRANCH == "kvx-work-velus"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "master"'
@@ -76,13 +82,13 @@ build_aarch64:
   script:
     - ./config_aarch64.sh
     - make -j "$NJOBS"
-    - export LD_LIBRARY_PATH=/usr/aarch64-linux-gnu/lib
-    - sudo ln -s /usr/aarch64-linux-gnu/lib/ld-linux-aarch64.so.1 /lib
-    - make -C test CCOMPOPTS='-static' SIMU='qemu-aarch64' EXECUTE='qemu-aarch64' all test
-    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='aarch64-linux-gnu-gcc' EXECUTE='qemu-aarch64' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
+    - make -C test SIMU='qemu-aarch64 -L /usr/aarch64-linux-gnu' EXECUTE='qemu-aarch64 -L /usr/aarch64-linux-gnu' all test
+    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='aarch64-linux-gnu-gcc' EXECUTE='qemu-aarch64 -L /usr/aarch64-linux-gnu' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
   rules:
     - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
+    - if: '$CI_COMMIT_BRANCH == "kvx-work-ssa"'
+      when: always
     - if: '$CI_COMMIT_BRANCH == "kvx-work-velus"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "master"'
@@ -101,10 +107,8 @@ build_arm:
   script:
     - ./config_arm.sh
     - make -j "$NJOBS"
-    - export LD_LIBRARY_PATH=/usr/arm-linux-gnueabi/lib
-    - sudo ln -s /usr/arm-linux-gnueabi/lib/ld-linux.so.3 /lib  # FIXME: UGLY !
-    - make -C test CCOMPOPTS=-static SIMU='qemu-arm' EXECUTE='qemu-arm' all test
-    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='arm-linux-gnueabi-gcc' EXECUTE='qemu-arm' CCOMPOPTS='-static' TARGET_CFLAGS='-static' BITS=32
+    - make -C test SIMU='qemu-arm -L /usr/arm-linux-gnueabi' EXECUTE='qemu-arm -L /usr/arm-linux-gnueabi' all test
+    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='arm-linux-gnueabi-gcc' EXECUTE='qemu-arm -L /usr/arm-linux-gnueabi' CCOMPOPTS='-static' TARGET_CFLAGS='-static' BITS=32
   rules:
     - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
@@ -129,10 +133,8 @@ build_armhf:
   script:
     - ./config_armhf.sh
     - make -j "$NJOBS"
-    - export LD_LIBRARY_PATH=/usr/arm-linux-gnueabihf/lib
-    - sudo ln -s /usr/arm-linux-gnueabihf/lib/ld-linux-armhf.so.3 /lib  # FIXME: UGLY !
-    - make -C test CCOMPOPTS=-static SIMU='qemu-arm' EXECUTE='qemu-arm' all test
-    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='arm-linux-gnueabihf-gcc' EXECUTE='qemu-arm' CCOMPOPTS='-static' TARGET_CFLAGS='-static' BITS=32
+    - make -C test SIMU='qemu-arm -L /usr/arm-linux-gnueabihf' EXECUTE='qemu-arm -L /usr/arm-linux-gnueabihf' all test
+    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='arm-linux-gnueabihf-gcc' EXECUTE='qemu-arm -L /usr/arm-linux-gnueabihf' CCOMPOPTS='-static' TARGET_CFLAGS='-static' BITS=32
   rules:
     - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
@@ -144,47 +146,59 @@ build_armhf:
       when: always
     - when: manual
 
-# build_ppc:
-#   stage: build
-#   image: coqorg/coq:8.12.2-ocaml-4.11.1-flambda
-#   before_script:
-#     - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
-#     - sudo apt-get -y install gcc-powerpc-linux-gnu qemu-user
-#     - eval `opam config env`
-#     - opam update
-#     - opam install -y menhir
-#   script:
-#     - ./config_ppc.sh
-#     - make -j "$NJOBS"
-#   rules:
-#     - if: '$CI_COMMIT_BRANCH == "kvx-work"'
-#       when: always
-#     - if: '$CI_COMMIT_BRANCH == "kvx-work-velus"'
-#       when: always
-#     - if: '$CI_COMMIT_BRANCH == "master"'
-#       when: always
-#     - when: manual
+build_ppc:
+  stage: build
+  image: coqorg/coq:8.12.2-ocaml-4.11.1-flambda
+  before_script:
+    - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
+    - sudo apt-get -y install gcc-powerpc-linux-gnu wget ninja-build libglib2.0-dev libpixman-1-dev
+    - wget --no-verbose https://download.qemu.org/qemu-6.0.0.tar.xz
+    - tar xJf qemu-6.0.0.tar.xz
+    - (cd qemu-6.0.0 && ./configure --target-list=ppc-linux-user && make && sudo make install)
+    - eval `opam config env`
+    - opam update
+    - opam install -y menhir
+  script:
+    - ./config_ppc.sh
+    - make -j "$NJOBS"
+    - qemu-ppc --version
+    - make -C test SIMU='qemu-ppc -L /usr/powerpc-linux-gnu -cpu 7400' EXECUTE='qemu-ppc -L /usr/powerpc-linux-gnu -cpu 7400' all test
+    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='powerpc-linux-gnu-gcc' EXECUTE='qemu-ppc -L /usr/powerpc-linux-gnu -cpu 7400' BITS=32
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "kvx-work-ssa"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "kvx-work-velus"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
 
-# build_ppc64:
-#   stage: build
-#   image: coqorg/coq:8.12.2-ocaml-4.11.1-flambda
-#   before_script:
-#     - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
-#     - sudo apt-get -y install gcc-powerpc64-linux-gnu
-#     - eval `opam config env`
-#     - opam update
-#     - opam install -y menhir
-#   script:
-#     - ./config_ppc64.sh
-#     - make -j "$NJOBS"
-#   rules:
-#     - if: '$CI_COMMIT_BRANCH == "kvx-work"'
-#       when: always
-#     - if: '$CI_COMMIT_BRANCH == "kvx-work-velus"'
-#       when: always
-#     - if: '$CI_COMMIT_BRANCH == "master"'
-#       when: always
-#     - when: manual
+build_ppc64:
+  stage: build
+  image: coqorg/coq:8.12.2-ocaml-4.11.1-flambda
+  before_script:
+    - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
+    - sudo apt-get -y install gcc-powerpc64-linux-gnu
+    - eval `opam config env`
+    - opam update
+    - opam install -y menhir
+  script:
+    - ./config_ppc64.sh
+    - make -j "$NJOBS"
+      #- make -C test SIMU='qemu-ppc64 -L /usr/powerpc64-linux-gnu -cpu 7400' EXECUTE='qemu-ppc64 -L /usr/powerpc64-linux-gnu -cpu 7400' all test
+      #- ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='powerpc64-linux-gnu-gcc' EXECUTE='qemu-ppc64 -L /usr/powerpc64-linux-gnu -cpu 7400' BITS=32
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "kvx-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "kvx-work-ssa"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "kvx-work-velus"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
 
 build_rv64:
   stage: build
@@ -198,13 +212,13 @@ build_rv64:
   script:
     - ./config_rv64.sh
     - make -j "$NJOBS"
-    - export LD_LIBRARY_PATH=/usr/riscv64-linux-gnu/lib
-    - sudo ln -s /usr/riscv64-linux-gnu/lib/ld-linux-riscv64-lp64d.so.1 /lib
-    - make -C test CCOMPOPTS=-static SIMU='qemu-riscv64' EXECUTE='qemu-riscv64' all test
-    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='riscv64-linux-gnu-gcc' EXECUTE='qemu-riscv64' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
+    - make -C test SIMU='qemu-riscv64 -L /usr/riscv64-linux-gnu' EXECUTE='qemu-riscv64 -L /usr/riscv64-linux-gnu' all test
+    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='riscv64-linux-gnu-gcc' EXECUTE='qemu-riscv64 -L /usr/riscv64-linux-gnu' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
   rules:
     - if: '$CI_COMMIT_BRANCH == "kvx-work"'
       when: always
+    - if: '$CI_COMMIT_BRANCH == "kvx-work-ssa"'
+      when: always
     - if: '$CI_COMMIT_BRANCH == "kvx-work-velus"'
       when: always
     - if: '$CI_COMMIT_BRANCH == "master"'
diff --git a/Makefile b/Makefile
index 10c4f4bf..3039a375 100644
--- a/Makefile
+++ b/Makefile
@@ -144,7 +144,8 @@ BACKEND=\
   FirstNop.v FirstNopproof.v \
   Allnontrap.v Allnontrapproof.v \
   Allocation.v Allocationproof.v \
-  Tunneling.v Tunnelingproof.v \
+  LTLTunneling.v LTLTunnelingproof.v \
+  RTLTunneling.v RTLTunnelingproof.v \
   Linear.v Lineartyping.v \
   Linearize.v Linearizeproof.v \
   CleanupLabels.v CleanupLabelsproof.v \
diff --git a/aarch64/Machregsaux.ml b/aarch64/Machregsaux.ml
index 41db3bd4..98e461eb 100644
--- a/aarch64/Machregsaux.ml
+++ b/aarch64/Machregsaux.ml
@@ -19,3 +19,6 @@ let class_of_type = function
   | AST.Tint | AST.Tlong -> 0
   | AST.Tfloat | AST.Tsingle -> 1
   | AST.Tany32 | AST.Tany64 -> assert false
+
+(* number of available registers per class *)
+let nr_regs = [| 29; 32 |]
diff --git a/aarch64/Machregsaux.mli b/aarch64/Machregsaux.mli
index 01b0f9fd..23ac1c9a 100644
--- a/aarch64/Machregsaux.mli
+++ b/aarch64/Machregsaux.mli
@@ -15,3 +15,6 @@
 val is_scratch_register: string -> bool
 
 val class_of_type: AST.typ -> int
+
+(* Number of registers in each class *)
+val nr_regs : int array
diff --git a/aarch64/PostpassSchedulingOracle.ml b/aarch64/PostpassSchedulingOracle.ml
index cde3e7a7..6f784238 100644
--- a/aarch64/PostpassSchedulingOracle.ml
+++ b/aarch64/PostpassSchedulingOracle.ml
@@ -507,6 +507,9 @@ let build_problem bb =
   {
     max_latency = -1;
     resource_bounds = opweights.pipelined_resource_bounds;
+    live_regs_entry = Registers.Regset.empty; (* unused here *)
+    typing = (fun x -> AST.Tint);                (* unused here *)
+    reference_counting = None;
     instruction_usages = instruction_usages bb;
     latency_constraints = latency_constraints bb;
   }
diff --git a/aarch64/PrepassSchedulingOracle.ml b/aarch64/PrepassSchedulingOracle.ml
index 1935e785..d7e80cd9 100644
--- a/aarch64/PrepassSchedulingOracle.ml
+++ b/aarch64/PrepassSchedulingOracle.ml
@@ -1,14 +1,26 @@
+(*             The Compcert verified compiler                  *)
+(*                                                             *)
+(*           Sylvain Boulmé     Grenoble-INP, VERIMAG          *)
+(*           David Monniaux     CNRS, VERIMAG                  *)
+(*           Cyril Six          Kalray                         *)
+(*           Léo Gourdin        UGA, VERIMAG                   *)
+(*           Nicolas Nardino    ENS-Lyon, VERIMAG              *)
+(*                                                             *)
+(*                                                             *)
+(* *************************************************************)
+
 open AST
 open BTL
 open Maps
 open InstructionScheduler
 open Registers
 open PrepassSchedulingOracleDeps
-open RTLcommonaux
-   
+open PrintBTL
+open DebugPrint
+
 let use_alias_analysis () = false
 
-let build_constraints_and_resources (opweights : opweights) insts btl =
+let build_constraints_and_resources (opweights : opweights) seqa btl =
   let last_reg_reads : int list PTree.t ref = ref PTree.empty
   and last_reg_write : (int * int) PTree.t ref = ref PTree.empty
   and last_mem_reads : int list ref = ref []
@@ -16,8 +28,7 @@ let build_constraints_and_resources (opweights : opweights) insts btl =
   and last_branch : int option ref = ref None
   and last_non_pipelined_op : int array =
     Array.make opweights.nr_non_pipelined_units (-1)
-  and latency_constraints : latency_constraint list ref = ref []
-  and resources = ref [] in
+  and latency_constraints : latency_constraint list ref = ref [] in
   let add_constraint instr_from instr_to latency =
     assert (instr_from <= instr_to);
     assert (latency >= 0);
@@ -60,7 +71,29 @@ let build_constraints_and_resources (opweights : opweights) insts btl =
     last_reg_write := PTree.set reg (i, latency) !last_reg_write;
     last_reg_reads := PTree.remove reg !last_reg_reads
   in
-  let add_input_regs i regs = List.iter (add_input_reg i) regs
+  let add_input_regs i regs = List.iter (add_input_reg i) regs in
+  let rec add_builtin_res i (res : reg builtin_res) =
+    match res with
+    | BR r -> add_output_reg i 10 r
+    | BR_none -> ()
+    | BR_splitlong (hi, lo) ->
+        add_builtin_res i hi;
+        add_builtin_res i lo
+  in
+  let rec add_builtin_arg i (ba : reg builtin_arg) =
+    match ba with
+    | BA r -> add_input_reg i r
+    | BA_int _ | BA_long _ | BA_float _ | BA_single _ -> ()
+    | BA_loadstack (_, _) -> add_input_mem i
+    | BA_addrstack _ -> ()
+    | BA_loadglobal (_, _, _) -> add_input_mem i
+    | BA_addrglobal _ -> ()
+    | BA_splitlong (hi, lo) ->
+        add_builtin_arg i hi;
+        add_builtin_arg i lo
+    | BA_addptr (a1, a2) ->
+        add_builtin_arg i a1;
+        add_builtin_arg i a2
   and irreversible_action i =
     match !last_branch with None -> () | Some j -> add_constraint j i 1
   in
@@ -77,58 +110,122 @@ let build_constraints_and_resources (opweights : opweights) insts btl =
       resources
   in
   Array.iteri
-    (fun i inst ->
+    (fun i (inst, other_uses) ->
+      List.iter (fun use -> add_input_reg i use) (Regset.elements other_uses);
       match inst with
-      | Bnop _ ->
-          let rs = Array.map (fun _ -> 0) opweights.pipelined_resource_bounds in
-          resources := rs :: !resources
+      | Bnop _ -> ()
       | Bop (op, lr, rd, _) ->
           add_non_pipelined_resources i
             (opweights.non_pipelined_resources_of_op op (List.length lr));
           if Op.is_trapping_op op then irreversible_action i;
           add_input_regs i lr;
-          add_output_reg i (opweights.latency_of_op op (List.length lr)) rd;
-          let rs = opweights.resources_of_op op (List.length lr) in
-          resources := rs :: !resources
+          add_output_reg i (opweights.latency_of_op op (List.length lr)) rd
       | Bload (trap, chk, addr, lr, rd, _) ->
           if trap = TRAP then irreversible_action i;
           add_input_mem i;
           add_input_regs i lr;
           add_output_reg i
             (opweights.latency_of_load trap chk addr (List.length lr))
-            rd;
-          let rs = opweights.resources_of_load trap chk addr (List.length lr) in
-          resources := rs :: !resources
+            rd
       | Bstore (chk, addr, lr, src, _) ->
           irreversible_action i;
           add_input_regs i lr;
           add_input_reg i src;
-          add_output_mem i;
-          let rs = opweights.resources_of_store chk addr (List.length lr) in
-          resources := rs :: !resources
+          add_output_mem i
       | Bcond (cond, lr, BF (Bgoto s, _), ibnot, _) ->
-          let live = (get_some @@ PTree.get s btl).input_regs in
-          add_input_regs i (Regset.elements live);
           set_branch i;
           add_input_mem i;
-          add_input_regs i lr;
-          let rs = opweights.resources_of_cond cond (List.length lr) in
-          resources := rs :: !resources
+          add_input_regs i lr
       | Bcond (_, _, _, _, _) ->
-          failwith "get_simple_dependencies: invalid Bcond"
-      | BF (_, _) -> failwith "get_simple_dependencies: BF"
-      | Bseq (_, _) -> failwith "get_simple_dependencies: Bseq")
-    insts;
-  (!latency_constraints, Array.of_list (List.rev !resources))
+          failwith "build_constraints_and_resources: invalid Bcond"
+      | BF (Bcall (signature, ef, lr, rd, _), _) ->
+          set_branch i;
+          (match ef with
+          | Datatypes.Coq_inl r -> add_input_reg i r
+          | Datatypes.Coq_inr symbol -> ());
+          add_input_mem i;
+          add_input_regs i lr;
+          add_output_reg i (opweights.latency_of_call signature ef) rd;
+          add_output_mem i;
+          failwith "build_constraints_and_resources: invalid Bcall"
+      | BF (Btailcall (signature, ef, lr), _) ->
+          set_branch i;
+          (match ef with
+          | Datatypes.Coq_inl r -> add_input_reg i r
+          | Datatypes.Coq_inr symbol -> ());
+          add_input_mem i;
+          add_input_regs i lr;
+          failwith "build_constraints_and_resources: invalid Btailcall"
+      | BF (Bbuiltin (ef, lr, rd, _), _) ->
+          set_branch i;
+          add_input_mem i;
+          List.iter (add_builtin_arg i) lr;
+          add_builtin_res i rd;
+          add_output_mem i;
+          failwith "build_constraints_and_resources: invalid Bbuiltin"
+      | BF (Bjumptable (lr, _), _) ->
+          set_branch i;
+          add_input_reg i lr;
+          failwith "build_constraints_and_resources: invalid Bjumptable"
+      | BF (Breturn (Some r), _) ->
+          set_branch i;
+          add_input_reg i r;
+          failwith "build_constraints_and_resources: invalid Breturn Some"
+      | BF (Breturn None, _) ->
+          set_branch i;
+          failwith "build_constraints_and_resources: invalid Breturn None"
+      | BF (Bgoto _, _) ->
+          failwith "build_constraints_and_resources: invalid Bgoto"
+      | Bseq (_, _) -> failwith "build_constraints_and_resources: Bseq")
+    seqa;
+  !latency_constraints
 
-let define_problem (opweights : opweights) ibf btl =
-  let simple_deps, resources =
-    build_constraints_and_resources opweights ibf btl
-  in
+let resources_of_instruction (opweights : opweights) = function
+  | Bnop _ -> Array.map (fun _ -> 0) opweights.pipelined_resource_bounds
+  | Bop (op, inputs, output, _) ->
+      opweights.resources_of_op op (List.length inputs)
+  | Bload (trap, chunk, addressing, addr_regs, output, _) ->
+      opweights.resources_of_load trap chunk addressing (List.length addr_regs)
+  | Bstore (chunk, addressing, addr_regs, input, _) ->
+      opweights.resources_of_store chunk addressing (List.length addr_regs)
+  | BF (Bcall (signature, ef, inputs, output, _), _) ->
+      opweights.resources_of_call signature ef
+  | BF (Bbuiltin (ef, builtin_inputs, builtin_output, _), _) ->
+      opweights.resources_of_builtin ef
+  | Bcond (cond, args, _, _, _) ->
+      opweights.resources_of_cond cond (List.length args)
+  | BF (Btailcall _, _) | BF (Bjumptable _, _) | BF (Breturn _, _) ->
+      opweights.pipelined_resource_bounds
+  | BF (Bgoto _, _) | Bseq (_, _) ->
+      failwith "resources_of_instruction: invalid btl instruction"
+
+let print_sequence pp seqa =
+  Array.iteri
+    (fun i (inst, other_uses) ->
+      debug "i=%d\n inst = " i;
+      print_btl_inst pp inst;
+      debug "\n other_uses=";
+      print_regset other_uses;
+      debug "\n")
+    seqa
+
+let length_of_chunk = function
+  | Mint8signed | Mint8unsigned -> 1
+  | Mint16signed | Mint16unsigned -> 2
+  | Mint32 | Mfloat32 | Many32 -> 4
+  | Mint64 | Mfloat64 | Many64 -> 8
+
+let define_problem (opweights : opweights) (live_entry_regs : Regset.t)
+    (typing : RTLtyping.regenv) reference_counting seqa btl =
+  let simple_deps = build_constraints_and_resources opweights seqa btl in
   {
     max_latency = -1;
     resource_bounds = opweights.pipelined_resource_bounds;
-    instruction_usages = resources;
+    live_regs_entry = live_entry_regs;
+    typing;
+    reference_counting = Some reference_counting;
+    instruction_usages =
+      Array.map (resources_of_instruction opweights) (Array.map fst seqa);
     latency_constraints = simple_deps;
   }
 
@@ -154,27 +251,35 @@ let zigzag_scheduler problem early_ones =
         { problem with latency_constraints = !constraints' }
   | None -> None
 
-let prepass_scheduler_by_name name problem insts =
+let prepass_scheduler_by_name name problem seqa =
   match name with
   | "zigzag" ->
       let early_ones =
         Array.map
-          (fun inst ->
+          (fun (inst, _) ->
             match inst with Bcond (_, _, _, _, _) -> true | _ -> false)
-          insts
+          seqa
       in
       zigzag_scheduler problem early_ones
   | _ -> scheduler_by_name name problem
 
-let schedule_sequence insts btl =
+let schedule_sequence seqa btl (live_regs_entry : Registers.Regset.t)
+    (typing : RTLtyping.regenv) reference =
   let opweights = OpWeights.get_opweights () in
   try
-    if Array.length insts <= 1 then None
+    if Array.length seqa <= 1 then None
     else
-      let nr_instructions = Array.length insts in
-      let problem = define_problem opweights insts btl in
+      let nr_instructions = Array.length seqa in
+      if !Clflags.option_debug_compcert > 6 then
+        Printf.printf "prepass scheduling length = %d\n" nr_instructions;
+      let problem =
+        define_problem opweights live_regs_entry typing reference seqa btl
+      in
+      if !Clflags.option_debug_compcert > 7 then (
+        print_sequence stdout seqa;
+        print_problem stdout problem);
       match
-        prepass_scheduler_by_name !Clflags.option_fprepass_sched problem insts
+        prepass_scheduler_by_name !Clflags.option_fprepass_sched problem seqa
       with
       | None ->
           Printf.printf "no solution in prepass scheduling\n";
@@ -190,5 +295,3 @@ let schedule_sequence insts btl =
   with Failure s ->
     Printf.printf "failure in prepass scheduling: %s\n" s;
     None
-
-
diff --git a/backend/CSE2proof.v b/backend/CSE2proof.v
index 49dbd409..252240c9 100644
--- a/backend/CSE2proof.v
+++ b/backend/CSE2proof.v
@@ -1399,7 +1399,7 @@ Proof.
     2: discriminate.
     econstructor; split.
     {
-      eapply exec_Iop with (v := (default_notrap_load_value chunk)); eauto.
+      eapply exec_Iop with (v := Vundef); eauto.
       simpl.
       rewrite <- subst_args_ok with (genv := ge) (f := f) (pc := pc) (sp := sp) (m := m) in H0.
       {
@@ -1472,7 +1472,7 @@ Proof.
     2: discriminate.
     econstructor; split.
     {
-      eapply exec_Iop with (v := (default_notrap_load_value chunk)); eauto.
+      eapply exec_Iop with (v := Vundef); eauto.
       simpl.
       rewrite <- subst_args_ok with (genv := ge) (f := f) (pc := pc) (sp := sp) (m := m) in H0.
       {
diff --git a/backend/CSE3analysisproof.v b/backend/CSE3analysisproof.v
index d53cf604..382c9f4c 100644
--- a/backend/CSE3analysisproof.v
+++ b/backend/CSE3analysisproof.v
@@ -502,7 +502,7 @@ Section SOUNDNESS.
         end
       with
       | Some dat => v' = dat
-      | None => v' = default_notrap_load_value chunk
+      | None => v' = Vundef
       end
     end.
     
diff --git a/backend/CSEdomain.v b/backend/CSEdomain.v
index f78e1d25..9641d012 100644
--- a/backend/CSEdomain.v
+++ b/backend/CSEdomain.v
@@ -113,12 +113,12 @@ Inductive rhs_eval_to (valu: valuation) (ge: genv) (sp: val) (m: mem):
 (*  | load_notrap1_eval_to: forall chunk addr vl,
       eval_addressing ge sp addr (map valu vl) = None ->
       rhs_eval_to valu ge sp m (Load NOTRAP chunk addr vl)
-                  (default_notrap_load_value chunk)
+                  Vundef
   | load_notrap2_eval_to: forall chunk addr vl a,
       eval_addressing ge sp addr (map valu vl) = Some a ->
       Mem.loadv chunk m a = None ->
       rhs_eval_to valu ge sp m (Load NOTRAP chunk addr vl)
-                  (default_notrap_load_value chunk) *).
+                  Vundef *).
 
 Inductive equation_holds (valu: valuation) (ge: genv) (sp: val) (m: mem):
                                                       equation -> Prop :=
diff --git a/backend/CSEproof.v b/backend/CSEproof.v
index cf51f5a2..556b44b3 100644
--- a/backend/CSEproof.v
+++ b/backend/CSEproof.v
@@ -402,7 +402,7 @@ Lemma add_load_holds_none1:
   forall valu1 ge sp rs m n addr (args: list reg) chunk dst,
   numbering_holds valu1 ge sp rs m n ->
   eval_addressing ge sp addr rs##args = None ->
-  exists valu2, numbering_holds valu2 ge sp (rs#dst <- (default_notrap_load_value chunk)) m (add_load n dst chunk addr args).
+  exists valu2, numbering_holds valu2 ge sp (rs#dst <- Vundef) m (add_load n dst chunk addr args).
 Proof.
   unfold add_load; intros.
   destruct (valnum_regs n args) as [n1 vl] eqn:VN.
@@ -418,7 +418,7 @@ Lemma add_load_holds_none2:
   numbering_holds valu1 ge sp rs m n ->
   eval_addressing ge sp addr rs##args = Some a ->
   Mem.loadv chunk m a = None ->
-  exists valu2, numbering_holds valu2 ge sp (rs#dst <- (default_notrap_load_value chunk)) m (add_load n dst NOTRAP chunk addr args).
+  exists valu2, numbering_holds valu2 ge sp (rs#dst <- Vundef) m (add_load n dst NOTRAP chunk addr args).
 Proof.
   unfold add_load; intros.
   destruct (valnum_regs n args) as [n1 vl] eqn:VN.
@@ -1210,7 +1210,6 @@ Proof.
       exists valu1.
       apply set_unknown_holds.
       assumption.
-      unfold default_notrap_load_value.
       apply set_reg_lessdef; eauto.
     }
     {
diff --git a/backend/Deadcodeproof.v b/backend/Deadcodeproof.v
index b51d6cce..be20af0b 100644
--- a/backend/Deadcodeproof.v
+++ b/backend/Deadcodeproof.v
@@ -845,7 +845,6 @@ Ltac UseTransfer :=
   eapply match_succ_states; eauto. simpl; auto.
   apply eagree_update; auto.
   rewrite is_int_zero_sound by auto.
-  unfold default_notrap_load_value.
   constructor.
 + (* preserved *)
   exploit eval_addressing_lessdef_none. eapply add_needs_all_lessdef; eauto. eassumption.
@@ -878,7 +877,6 @@ Ltac UseTransfer :=
   eapply match_succ_states; eauto. simpl; auto.
   apply eagree_update; auto.
   rewrite is_int_zero_sound by auto.
-  unfold default_notrap_load_value.
   constructor.
 + (* preserved *)
   exploit eval_addressing_lessdef. eapply add_needs_all_lessdef; eauto. eauto.
diff --git a/backend/LTL.v b/backend/LTL.v
index 3edd60a2..a382ef0e 100644
--- a/backend/LTL.v
+++ b/backend/LTL.v
@@ -217,14 +217,14 @@ Inductive step: state -> trace -> state -> Prop :=
         E0 (Block s f sp bb rs' m)
   | exec_Lload_notrap1: forall s f sp chunk addr args dst bb rs m rs',
       eval_addressing ge sp addr (reglist rs args) = None ->
-      rs' = Locmap.set (R dst)  (default_notrap_load_value chunk)
+      rs' = Locmap.set (R dst)  Vundef
                        (undef_regs (destroyed_by_load chunk addr) rs) ->
       step (Block s f sp (Lload NOTRAP chunk addr args dst :: bb) rs m)
         E0 (Block s f sp bb rs' m)
   | exec_Lload_notrap2: forall s f sp chunk addr args dst bb rs m a rs',
       eval_addressing ge sp addr (reglist rs args) = Some a ->
       Mem.loadv chunk m a = None ->
-      rs' = Locmap.set (R dst) (default_notrap_load_value chunk)
+      rs' = Locmap.set (R dst) Vundef
                        (undef_regs (destroyed_by_load chunk addr) rs) ->
       step (Block s f sp (Lload NOTRAP chunk addr args dst :: bb) rs m)
         E0 (Block s f sp bb rs' m)
diff --git a/backend/Tunneling.v b/backend/LTLTunneling.v
index c849ea92..4b404724 100644
--- a/backend/Tunneling.v
+++ b/backend/LTLTunneling.v
@@ -77,7 +77,7 @@ Definition UF := PTree.t (node * Z).
 
 (* The oracle returns a map of "nop" node to their target with a distance (ie the number of the "nop" node on the path) to the target. *)
 Axiom branch_target: LTL.function -> UF. 
-Extract Constant branch_target => "Tunnelingaux.branch_target".
+Extract Constant branch_target => "LTLTunnelingaux.branch_target".
 
 Local Open Scope error_monad_scope.
 
diff --git a/backend/LTLTunnelingaux.ml b/backend/LTLTunnelingaux.ml
new file mode 100644
index 00000000..c3b8cf82
--- /dev/null
+++ b/backend/LTLTunnelingaux.ml
@@ -0,0 +1,108 @@
+(* *************************************************************)
+(*                                                             *)
+(*             The Compcert verified compiler                  *)
+(*                                                             *)
+(*           Sylvain Boulmé  Grenoble-INP, VERIMAG             *)
+(*                                                             *)
+(*  Copyright VERIMAG. All rights reserved.                    *)
+(*  This file is distributed under the terms of the INRIA      *)
+(*  Non-Commercial License Agreement.                          *)
+(*                                                             *)
+(* *************************************************************)
+
+(*
+
+This file implements the [branch_target] oracle that identifies "nop" branches in a LTL function,
+and computes their target node with the distance (ie the number of cummulated nops) toward this target.
+
+See [LTLTunneling.v]
+
+*)
+
+open Coqlib
+open LTL
+open Maps
+open Camlcoq
+open Tunnelinglibs
+
+module LANG = struct
+  type code_unit = LTL.bblock
+  type funct = LTL.coq_function
+end
+
+module OPT = struct
+  let langname = "LTL"
+  let limit_tunneling = None
+  let debug_flag = ref false
+  let final_dump = false
+end
+
+module Partial = Tunnelinglibs.Tunneling(LANG)(OPT)
+
+module FUNS = struct
+  let build_simplified_cfg c acc pc bb =
+    match bb with
+    | Lbranch s :: _ ->
+       let ns = get_node c s in
+       set_branch c pc ns;
+       acc
+    | Lcond (_, _, s1, s2, _) :: _ ->
+       c.num_rems <- c.num_rems + 1;
+       let ns1 = get_node c s1 in
+       let ns2 = get_node c s2 in
+       let npc = get_node c pc in
+       npc.inst <- COND(ns1, ns2);
+       npc::acc
+    | _ -> acc
+
+  let print_code_unit c println (pc, bb) =
+    match bb with
+    | Lbranch s::_ -> (if println then Partial.debug "\n"); Partial.debug "%d:Lbranch %d %s\n" pc (P.to_int s) (string_of_labeli c.nodes pc); false
+    | Lcond (_, _, s1, s2, _)::_ -> (if println then Partial.debug "\n"); Partial.debug "%d:Lcond (%d,%d) %s\n" pc (P.to_int s1) (P.to_int s2) (string_of_labeli c.nodes pc); false
+    | _ -> Partial.debug "%d " pc; true
+
+  let fn_code f = f.fn_code
+  let fn_entrypoint f = f.fn_entrypoint
+
+
+  (*************************************************************)
+  (* Copy-paste of the extracted code of the verifier          *)
+  (* with [raise (BugOnPC (P.to_int pc))] instead of [Error.*] *)
+
+  let check_code_unit td pc bb =
+    match PTree.get pc td with
+    | Some p ->
+      let (tpc, dpc) = p in
+      let dpc0 = dpc in
+      (match bb with
+       | [] ->
+         raise (BugOnPC (P.to_int pc))
+       | i :: _ ->
+         (match i with
+          | Lbranch s ->
+            let (ts, ds) = get td s in
+            if peq tpc ts
+            then if zlt ds dpc0
+                 then ()
+                 else raise (BugOnPC (P.to_int pc))
+            else raise (BugOnPC (P.to_int pc))
+          | Lcond (_, _, s1, s2, _) ->
+            let (ts1, ds1) = get td s1 in
+            let (ts2, ds2) = get td s2 in
+            if peq tpc ts1
+            then if peq tpc ts2
+                 then if zlt ds1 dpc0
+                      then if zlt ds2 dpc0
+                           then ()
+                           else raise (BugOnPC (P.to_int pc))
+                      else raise (BugOnPC (P.to_int pc))
+                 else raise (BugOnPC (P.to_int pc))
+            else raise (BugOnPC (P.to_int pc))
+          | _ ->
+            raise (BugOnPC (P.to_int pc))))
+    | None -> ()
+end
+
+module T = Partial.T(FUNS)
+let branch_target = T.branch_target
+
diff --git a/backend/Tunnelingproof.v b/backend/LTLTunnelingproof.v
index 3bc92f75..d36d3c76 100644
--- a/backend/Tunnelingproof.v
+++ b/backend/LTLTunnelingproof.v
@@ -17,7 +17,7 @@ Require Import Coqlib Maps Errors.
 Require Import AST Linking.
 Require Import Values Memory Events Globalenvs Smallstep.
 Require Import Op Locations LTL.
-Require Import Tunneling.
+Require Import LTLTunneling.
 
 Local Open Scope nat.
 
diff --git a/backend/Linear.v b/backend/Linear.v
index 1443f795..cb11f7dc 100644
--- a/backend/Linear.v
+++ b/backend/Linear.v
@@ -170,7 +170,7 @@ Inductive step: state -> trace -> state -> Prop :=
       forall s f sp chunk addr args dst b rs m rs',
       eval_addressing ge sp addr (reglist rs args) = None ->
       rs' = Locmap.set (R dst)
-                       (default_notrap_load_value chunk)
+                       Vundef
                        (undef_regs (destroyed_by_load chunk addr) rs) ->
       step (State s f sp (Lload NOTRAP chunk addr args dst :: b) rs m)
         E0 (State s f sp b rs' m)
@@ -179,7 +179,7 @@ Inductive step: state -> trace -> state -> Prop :=
       eval_addressing ge sp addr (reglist rs args) = Some a ->
       Mem.loadv chunk m a = None ->
       rs' = Locmap.set (R dst)
-                       (default_notrap_load_value chunk)
+                       Vundef
                        (undef_regs (destroyed_by_load chunk addr) rs) ->
       step (State s f sp (Lload NOTRAP chunk addr args dst :: b) rs m)
         E0 (State s f sp b rs' m)
diff --git a/backend/Lineartyping.v b/backend/Lineartyping.v
index 22658fb7..cf903aad 100644
--- a/backend/Lineartyping.v
+++ b/backend/Lineartyping.v
@@ -338,14 +338,12 @@ Local Opaque mreg_type.
   simpl in *; InvBooleans.
   econstructor; eauto.
   apply wt_setreg. eapply Val.has_subtype; eauto.
-  unfold default_notrap_load_value.
   constructor.
   apply wt_undef_regs; auto.
 - (* load notrap2 *)
   simpl in *; InvBooleans.
   econstructor; eauto.
   apply wt_setreg. eapply Val.has_subtype; eauto.
-  unfold default_notrap_load_value.
   constructor.
   apply wt_undef_regs; auto.
 - (* store *)
diff --git a/backend/Mach.v b/backend/Mach.v
index 1c6fdb18..2cfd738d 100644
--- a/backend/Mach.v
+++ b/backend/Mach.v
@@ -330,14 +330,14 @@ Inductive step: state -> trace -> state -> Prop :=
   | exec_Mload_notrap1:
       forall s f sp chunk addr args dst c rs m rs',
       eval_addressing ge sp addr rs##args = None ->
-      rs' = ((undef_regs (destroyed_by_load chunk addr) rs)#dst <- (default_notrap_load_value chunk)) ->
+      rs' = ((undef_regs (destroyed_by_load chunk addr) rs)#dst <- Vundef) ->
       step (State s f sp (Mload NOTRAP chunk addr args dst :: c) rs m)
         E0 (State s f sp c rs' m)
   | exec_Mload_notrap2:
       forall s f sp chunk addr args dst c rs m a rs',
       eval_addressing ge sp addr rs##args = Some a ->
       Mem.loadv chunk m a = None ->
-      rs' = ((undef_regs (destroyed_by_load chunk addr) rs)#dst <- (default_notrap_load_value chunk)) ->
+      rs' = ((undef_regs (destroyed_by_load chunk addr) rs)#dst <- Vundef) ->
       step (State s f sp (Mload NOTRAP chunk addr args dst :: c) rs m)
         E0 (State s f sp c rs' m)
   | exec_Mstore:
diff --git a/backend/RTL.v b/backend/RTL.v
index 31b5cf99..fe350adf 100644
--- a/backend/RTL.v
+++ b/backend/RTL.v
@@ -225,14 +225,14 @@ Inductive step: state -> trace -> state -> Prop :=
       (fn_code f)!pc = Some(Iload NOTRAP chunk addr args dst pc') ->
       eval_addressing ge sp addr rs##args = None ->
       step (State s f sp pc rs m)
-        E0 (State s f sp pc' (rs#dst <- (default_notrap_load_value chunk)) m)
+        E0 (State s f sp pc' (rs#dst <- Vundef) m)
   | exec_Iload_notrap2:
       forall s f sp pc rs m chunk addr args dst pc' a,
       (fn_code f)!pc = Some(Iload NOTRAP chunk addr args dst pc') ->
       eval_addressing ge sp addr rs##args = Some a ->
       Mem.loadv chunk m a = None->
       step (State s f sp pc rs m)
-        E0 (State s f sp pc' (rs#dst <- (default_notrap_load_value chunk)) m)
+        E0 (State s f sp pc' (rs#dst <- Vundef) m)
   | exec_Istore:
       forall s f sp pc rs m chunk addr args src pc' a m',
       (fn_code f)!pc = Some(Istore chunk addr args src pc') ->
diff --git a/backend/RTLTunneling.v b/backend/RTLTunneling.v
new file mode 100644
index 00000000..df663f48
--- /dev/null
+++ b/backend/RTLTunneling.v
@@ -0,0 +1,126 @@
+(* *********************************************************************)
+(*                                                                     *)
+(*              The Compcert verified compiler                         *)
+(*                                                                     *)
+(*          Xavier Leroy, INRIA Paris-Rocquencourt                     *)
+(*          TODO: Proper author information                            *)
+(*                                                                     *)
+(*  Copyright Institut National de Recherche en Informatique et en     *)
+(*  Automatique.  All rights reserved.  This file is distributed       *)
+(*  under the terms of the INRIA Non-Commercial License Agreement.     *)
+(*                                                                     *)
+(* *********************************************************************)
+
+(** Branch tunneling for the RTL representation *)
+
+(* rq: ce code est commenté avec des remarques sur des éléments du code pas évidents, sur lequels j'ai buté, ou qui changent du tunneling LTL. Ils pourront être enlevés ultérieurement *)
+
+Require Import Coqlib Maps Errors.
+Require Import AST.
+Require Import RTL.
+
+(* This is a port of tunneling for LTL. See LTLTunneling.v *)
+
+Definition UF := PTree.t (node * Z).
+
+(* The oracle returns a map of "nop" node to their target with a distance (ie the number of the "nop" node on the path) to the target. *)
+Axiom branch_target: RTL.function -> UF.
+Extract Constant branch_target => "RTLTunnelingaux.branch_target".
+(* TODO: add an extraction command to link branch_target with its implementation *)
+
+Local Open Scope error_monad_scope.
+
+Definition get (td: UF) (pc: node): node*Z :=
+  match td!pc with (* rq: notation pour `PTree.get pc td` *)
+  | Some (t,d) => (t,Z.abs d)
+  | None       => (pc,0)
+  end.
+
+(* rq: "coerce" la structure d'abre UF en une fonction qui retrouve le représentant canonique de la classe d'un noeud *)
+Definition target (td: UF) (pc: node): node := fst (get td pc).
+Coercion target: UF >-> Funclass.
+
+(* we check that the domain of [td] is included in the domain of [c] *)
+(* rq: pas de basic block en RTL *)
+Definition check_included (td: UF) (c: code): option instruction
+  := PTree.fold (fun (ok:option instruction) pc _ => if ok then c!pc else None) td (Some (Inop xH)).
+(* rq: PTree.fold replie l'arbre avec un parcours préfixe. La fonction qui prend en argument l'accumulateur, le label du noeud et le pointeur (`positive`) vers ce noeud *)
+
+(* we check the validity of targets and their bound:
+*  the distance of a "nop" node (w.r.t to the target) must be greater than the one of its parents.
+*)
+Definition check_instr (td: UF) (pc: node) (i: instruction): res unit :=
+  match td!pc with
+  | None => OK tt (* rq: le type `unit` est le singleton {`tt`} *)
+  | Some (tpc, dpc) =>
+      let dpc := Z.abs dpc in
+      match i with
+      | Inop s =>
+          let (ts,ds) := get td s in
+          if peq tpc ts then
+            if zlt ds dpc then OK tt
+            else Error (msg "bad distance in Inop")
+          else Error (msg "invalid skip of Inop")
+      | Icond _ _ ifso ifnot _ =>
+          let (tso,dso) := get td ifso in
+          let (tnot,dnot) := get td ifnot in
+          if peq tpc tso then
+            if peq tpc tnot then
+              if zlt dso dpc then
+                if zlt dnot dpc then OK tt
+                else Error (msg "bad distance on else branch")
+              else Error (msg "bad distance on then branch")
+            else Error (msg "invalid skip of else branch")
+          else Error (msg "invalid skip of then branch")
+      | _ => Error (msg "cannot skip this instruction")
+      end
+  end.
+
+Definition check_code (td: UF) (c: code): res unit :=
+  PTree.fold (fun ok pc i => do _ <- ok; check_instr td pc i) c (OK tt).
+(* rq: `do _ <- ok; ...` exécute le bloc à droite du ';' si `ok` est un `OK _`, sinon il passe simplement l'erreur *)
+
+(* The second pass rewrites all LTL instructions, replacing every
+ * successor [s] of every instruction by [t s], the canonical representative
+ * of its equivalence class in the union-find data structure.
+ *)
+
+(* rq: beaucoup plus de cas que pour LTL, car plein d'instructions ont des successeurs *)
+Definition tunnel_instr (t: node -> node) (i: instruction) : instruction :=
+  match i with
+  | Inop s => Inop (t s)
+  | Iop op args res s => Iop op args res (t s)
+  | Iload trap chunk addr args dst s => Iload trap chunk addr args dst (t s)
+  | Istore chunk addr args src s => Istore chunk addr args src (t s)
+  | Icall sig ros args res s => Icall sig ros args res (t s)
+  | Ibuiltin ef args res s => Ibuiltin ef args res (t s)
+  | Icond cond args ifso ifnot info =>
+      let ifso'  := t ifso in
+      let ifnot' := t ifnot in
+      if peq ifso' ifnot'
+      (* rq: si les deux branches se rejoignent, le if ne sert à rien *)
+      then Inop ifso'
+      else Icond cond args ifso' ifnot' info
+  | Ijumptable arg tbl => Ijumptable arg (List.map t tbl)
+  | _ => i
+  end.
+
+Definition tunnel_function (f: RTL.function): res RTL.function :=
+  let td := branch_target f in
+  let c := fn_code f in
+  if check_included td c then
+    do _ <- check_code td c ; OK
+    (mkfunction (* rq: mkfunction construit un "Record" `RTL.function` *)
+      (fn_sig f)
+      (fn_params f)
+      (fn_stacksize f)
+      (PTree.map1 (tunnel_instr td) c) (* rq: `map1` est un map sans le pointeur *)
+      (td (fn_entrypoint f)))
+  else Error (msg "Some node of the union-find is not in the CFG").
+
+Definition tunnel_fundef (f: fundef): res fundef :=
+  transf_partial_fundef tunnel_function f.
+
+Definition transf_program (p: program): res program :=
+  transform_partial_program tunnel_fundef p.
+
diff --git a/backend/RTLTunnelingaux.ml b/backend/RTLTunnelingaux.ml
new file mode 100644
index 00000000..9333e357
--- /dev/null
+++ b/backend/RTLTunnelingaux.ml
@@ -0,0 +1,112 @@
+(* *************************************************************)
+(*                                                             *)
+(*             The Compcert verified compiler                  *)
+(*                                                             *)
+(*           Sylvain Boulmé  Grenoble-INP, VERIMAG             *)
+(*           TODO: Proper author information                   *)
+(*                                                             *)
+(*  Copyright VERIMAG. All rights reserved.                    *)
+(*  This file is distributed under the terms of the INRIA      *)
+(*  Non-Commercial License Agreement.                          *)
+(*                                                             *)
+(* *************************************************************)
+
+(*
+
+This file implements the [branch_target] oracle that identifies "nop" branches in a RTL function,
+and computes their target node with the distance (ie the number of cummulated nops) toward this target.
+
+See [RTLTunneling.v]
+
+*)
+
+open Coqlib
+open RTL
+open Maps
+open Camlcoq
+open Tunnelinglibs
+
+module LANG = struct
+  type code_unit = RTL.instruction
+  type funct = RTL.coq_function
+end
+
+module OPT = struct
+  let langname = "RTL"
+  let limit_tunneling = None
+  let debug_flag = ref false
+  let final_dump = false
+end
+
+module Partial = Tunnelinglibs.Tunneling(LANG)(OPT)
+
+module FUNS = struct
+  let build_simplified_cfg c acc pc i =
+    match i with
+    | Inop s ->
+       let ns = get_node c s in
+       set_branch c pc ns;
+       incr nopcounter;
+       acc
+    | Icond (_, _, s1, s2, _) ->
+       c.num_rems <- c.num_rems + 1;
+       let ns1 = get_node c s1 in
+       let ns2 = get_node c s2 in
+       let npc = get_node c pc in
+       npc.inst <- COND(ns1, ns2);
+       npc::acc
+    | _ -> acc
+
+  let print_code_unit c println (pc, i) =
+    match i with
+    | Inop s -> (if println then Partial.debug "\n");
+        Partial.debug "%d:Inop %d %s\n" pc (P.to_int s) (string_of_labeli c.nodes pc);
+        false
+    | Icond (_, _, s1, s2, _) -> (if println then Partial.debug "\n");
+        Partial.debug "%d:Icond (%d,%d) %s\n" pc (P.to_int s1) (P.to_int s2) (string_of_labeli c.nodes pc);
+        false
+    | _ -> Partial.debug "%d " pc;
+        true
+
+  let fn_code f = f.fn_code
+  let fn_entrypoint f = f.fn_entrypoint
+
+
+  (*************************************************************)
+  (* Copy-paste of the extracted code of the verifier          *)
+  (* with [raise (BugOnPC (P.to_int pc))] instead of [Error.*] *)
+
+  let check_code_unit td pc i =
+    match PTree.get pc td with
+    | Some p ->
+        let (tpc, dpc) = p in
+        let dpc0 = dpc in begin
+          match i with
+          | Inop s ->
+            let (ts, ds) = get td s in
+            if peq tpc ts
+            then if zlt ds dpc0
+                 then ()
+                 else raise (BugOnPC (P.to_int pc))
+            else raise (BugOnPC (P.to_int pc))
+          | Icond (_, _, s1, s2, _) ->
+            let (ts1, ds1) = get td s1 in
+            let (ts2, ds2) = get td s2 in
+            if peq tpc ts1
+            then if peq tpc ts2
+                 then if zlt ds1 dpc0
+                      then if zlt ds2 dpc0
+                           then ()
+                           else raise (BugOnPC (P.to_int pc))
+                      else raise (BugOnPC (P.to_int pc))
+                 else raise (BugOnPC (P.to_int pc))
+            else raise (BugOnPC (P.to_int pc))
+          | _ ->
+            raise (BugOnPC (P.to_int pc)) end
+    | None -> ()
+
+end
+
+module T = Partial.T(FUNS)
+let branch_target = T.branch_target
+
diff --git a/backend/RTLTunnelingproof.v b/backend/RTLTunnelingproof.v
new file mode 100644
index 00000000..51ef0f7a
--- /dev/null
+++ b/backend/RTLTunnelingproof.v
@@ -0,0 +1,670 @@
+(* *********************************************************************)
+(*                                                                     *)
+(*              The Compcert verified compiler                         *)
+(*                                                                     *)
+(*          Xavier Leroy, INRIA Paris-Rocquencourt                     *)
+(*          TODO: Proper author information                            *)
+(*                                                                     *)
+(*  Copyright Institut National de Recherche en Informatique et en     *)
+(*  Automatique.  All rights reserved.  This file is distributed       *)
+(*  under the terms of the INRIA Non-Commercial License Agreement.     *)
+(*                                                                     *)
+(* *********************************************************************)
+
+(** Correctness proof for the branch tunneling optimization for RTL. *)
+(* This is a port of Tunnelingproof.v, the same optimisation for LTL. *)
+
+Require Import Coqlib Maps Errors.
+Require Import AST Linking.
+Require Import Values Memory Registers Events Globalenvs Smallstep.
+Require Import Op Locations RTL.
+Require Import RTLTunneling.
+Require Import Conventions1.
+
+Local Open Scope nat.
+
+(**) Definition check_included_spec (c:code) (td:UF) (ok: option instruction) :=
+  ok <> None -> forall pc, c!pc = None -> td!pc = None.
+
+Lemma check_included_correct (td:UF) (c:code):
+  check_included_spec c td (check_included td c).
+Proof.
+  apply PTree_Properties.fold_rec with (P:=check_included_spec c); unfold check_included_spec.
+  - intros m m' oi EQ IND N pc. rewrite <- EQ. apply IND. apply N.
+  - intros N pc. rewrite PTree.gempty. auto.
+  - intros m oi pc v N S IND. destruct oi.
+    + intros. rewrite PTree.gsspec. destruct (peq _ _); try congruence. apply IND. congruence. apply H0.
+    + contradiction.
+Qed.
+
+(**) Inductive target_bounds (target: node -> node) (bound: node -> nat) (pc: node) : option instruction -> Prop :=
+  | TB_default (TB: target pc = pc) oi:
+      target_bounds target bound pc oi
+  | TB_nop s
+      (EQ: target pc = target s)
+      (DEC: bound s < bound pc):
+      target_bounds target bound pc (Some (Inop s))
+  | TB_cond cond args ifso ifnot info
+      (EQSO: target pc = target ifso)
+      (EQNOT: target pc = target ifnot)
+      (DECSO: bound ifso < bound pc)
+      (DECNOT: bound ifnot < bound pc):
+      target_bounds target bound pc (Some (Icond cond args ifso ifnot info))
+.
+Local Hint Resolve TB_default: core.
+
+Lemma target_None (td: UF) (pc: node): td!pc = None -> td pc = pc.
+Proof.
+  unfold target, get. intro EQ. rewrite EQ. auto.
+Qed.
+Local Hint Resolve target_None Z.abs_nonneg: core.
+
+Lemma get_nonneg td pc t d: get td pc = (t,d) -> (0 <= d)%Z.
+Proof.
+  unfold get. destruct td!pc as [(tpc,dpc)|]; intro H; inv H; lia.
+Qed.
+Local Hint Resolve get_nonneg: core.
+
+(**) Definition bound (td: UF) (pc: node) := Z.to_nat (snd (get td pc)).
+
+
+(* TODO: à réécrire proprement *)
+Lemma check_instr_correct (td: UF) (pc: node) (i: instruction):
+  check_instr td pc i = OK tt ->
+  target_bounds (target td) (bound td) pc (Some i).
+Proof.
+  unfold check_instr. destruct (td!pc) as [(tpc,dpc)|] eqn:EQ.
+  assert (DPC: snd (get td pc) = Z.abs dpc). { unfold get. rewrite EQ. auto. }
+  - destruct i; try congruence.
+    + destruct (get td n) as (ts,ds) eqn:EQs.
+      destruct (peq _ _); try congruence.
+      destruct (zlt _ _); try congruence. intros _.
+      apply TB_nop. replace (td pc) with tpc.
+      unfold target. rewrite EQs. auto.
+      unfold target. unfold get. rewrite EQ. auto.
+      unfold bound. rewrite DPC. rewrite EQs; simpl. apply Z2Nat.inj_lt; try lia. apply get_nonneg with td n ts. apply EQs.
+    + destruct (get td n) as (tso,dso) eqn:EQSO.
+      destruct (get td n0) as (tnot,dnot) eqn:EQNOT.
+      intro H.
+      repeat ((destruct (peq _ _) in H || destruct (zlt _ _) in H); try congruence).
+      apply TB_cond; subst.
+      * unfold target. replace (fst (get td pc)) with tnot. rewrite EQSO. auto.
+        unfold get. rewrite EQ. auto.
+      * unfold target. replace (fst (get td pc)) with tnot. rewrite EQNOT. auto.
+        unfold get. rewrite EQ. auto.
+      * unfold bound. rewrite DPC. apply Z2Nat.inj_lt; try lia. apply get_nonneg with td n tnot. rewrite EQSO. auto. rewrite EQSO. auto.
+      * unfold bound. rewrite DPC. apply Z2Nat.inj_lt; try lia. apply get_nonneg with td n0 tnot. rewrite EQNOT; auto. rewrite EQNOT; auto.
+  - intros _. apply TB_default. unfold target. unfold get. rewrite EQ. auto.
+Qed.
+
+Definition check_code_spec (td:UF) (c:code) (ok: res unit) :=
+   ok = OK tt -> forall pc i, c!pc = Some i -> target_bounds (target td) (bound td) pc (Some i).
+
+Lemma check_code_correct (td:UF) c:
+   check_code_spec td c (check_code td c).
+Proof.
+  unfold check_code. apply PTree_Properties.fold_rec; unfold check_code_spec.
+  - intros. rewrite <- H in H2. apply H0; auto.
+  - intros. rewrite PTree.gempty in H0. congruence.
+  - intros m [[]|e] pc i N S IND; simpl; try congruence.
+    intros H pc0 i0. rewrite PTree.gsspec. destruct (peq _ _).
+    subst. intro. inv H0. apply check_instr_correct. apply H.
+    auto.
+Qed.
+
+Theorem branch_target_bounds:
+  forall f tf pc,
+  tunnel_function f = OK tf ->
+  target_bounds (branch_target f) (bound (branch_target f)) pc (f.(fn_code)!pc).
+Proof.
+(* ~old
+  intros t tf pc. unfold tunnel_function.
+  destruct (check_included _ _) eqn:HI; try congruence.
+  destruct (check_code _ _) eqn: HC.
+  - intros _. destruct ((fn_code t) ! pc) eqn:EQ.
+    + exploit check_code_correct; eauto.
+    replace tt with u. auto. {destruct u; auto. }
+    + exploit check_included_correct; eauto. rewrite HI. congruence.
+    intro. apply TB_default. unfold target. unfold get. rewrite H. debug auto.
+  - simpl. congruence.
+*)
+
+  intros. unfold tunnel_function in H.
+  destruct (check_included _ _) eqn:EQinc; try congruence.
+  monadInv H. rename EQ into EQcode.
+  destruct (_ ! _) eqn:EQ.
+  - exploit check_code_correct. destruct x. apply EQcode. apply EQ. auto.
+  - exploit check_included_correct.
+    rewrite EQinc. congruence.
+    apply EQ.
+    intro. apply TB_default. apply target_None. apply H.
+Qed.
+
+(** Preservation of semantics *)
+
+Definition match_prog (p tp: program) :=
+  match_program (fun _ f tf => tunnel_fundef f = OK tf) eq p tp.
+  (* rq: `(fun _ ...)` est la fonction pour matcher des fonctions
+   *     `eq` la fonction pour matcher les variables ? (`varinfo` dans la def)
+   *     `p` et `tp` sont les programmes donc on doit dire s'ils match
+   *)
+
+ (**) Lemma transf_program_match:
+  forall prog tprog, transf_program prog = OK tprog -> match_prog prog tprog.
+Proof.
+  intros. eapply match_transform_partial_program_contextual; eauto.
+Qed.
+
+
+Section PRESERVATION.
+
+
+Variables prog tprog: program.
+Hypothesis TRANSL: match_prog prog tprog. (* rq: on suppose que les programmes match *)
+Let ge := Genv.globalenv prog.
+Let tge := Genv.globalenv tprog.
+
+(* rq: pour les deux lemmes suivants, j'ai recopié les preuves, mais je ne les comprends pas du tout... D'où vient `exploit` ?? *)
+(* rq: `exploit` vient de Coqlib! Mais je comprends pas encore vraiment ce qui se passe... *)
+Lemma functions_translated:
+  forall (v: val) (f: fundef),
+  Genv.find_funct ge v = Some f ->
+  exists tf, tunnel_fundef f = OK tf /\ Genv.find_funct tge v = Some tf.
+Proof.
+  intros.
+  exploit (Genv.find_funct_match TRANSL). apply H.
+  intros (cu & tf & A & B & C).
+  (* exists tf. split. apply B. apply A. *)
+  (* rq: on peut remplacer les `split` et `apply` par `eauto`, et ne pas spécifier le
+   * `exists`, avec un `eexists` *)
+  eexists. eauto.
+  (* rq: Je ne comprends pas à quoi sert le `intuition`... *)
+  (* repeat eexists; intuition eauto. *)
+Qed.
+
+Lemma function_ptr_translated:
+  forall v f,
+  Genv.find_funct_ptr ge v = Some f ->
+  exists tf,
+  Genv.find_funct_ptr tge v = Some tf /\ tunnel_fundef f = OK tf.
+Proof.
+  intros. exploit (Genv.find_funct_ptr_match TRANSL).
+  - apply H.
+  - intros (cu & tf & A & B & C). exists tf. split. apply A. apply B.
+Qed.
+
+Lemma symbols_preserved s: Genv.find_symbol tge s = Genv.find_symbol ge s.
+Proof.
+  apply (Genv.find_symbol_match TRANSL).
+  (* rq: ici pas de `exploit` mais un `apply` parce que `Genv.find_symbol_match
+   * prouve vraiment pile ce qu'on veut *)
+Qed.
+
+Lemma sig_preserved:
+  forall f tf, tunnel_fundef f = OK tf -> funsig tf = funsig f.
+Proof.
+  intros. destruct f; simpl in H.
+  -
+    (*
+    unfold bind in H.
+    destruct (tunnel_function _) as [x|] eqn:EQ; try congruence.
+    inversion H.
+    unfold tunnel_function in EQ.
+    destruct (check_included _ _) in EQ; try congruence.
+    unfold bind in EQ. destruct (check_code _ _) in EQ; try congruence. inversion EQ. auto.
+    *)
+
+    monadInv H. (* rq: c'est une tactique maison... *)
+    unfold tunnel_function in EQ.
+    destruct (check_included _ _) in EQ; try congruence.
+    monadInv EQ. auto.
+  - monadInv H. auto.
+Qed.
+
+Lemma senv_preserved:
+  Senv.equiv ge tge.
+Proof.
+  eapply (Genv.senv_match TRANSL). (* Il y a déjà une preuve de cette propriété très exactement, je ne vais pas réinventer la roue ici *)
+Qed.
+
+(* TODO: vérifier s'il faut faire quelque chose avec le `res` ? *)
+Inductive match_stackframes: stackframe -> stackframe -> Prop :=
+  | match_stackframes_intro:
+      forall res f tf sp pc rs trs
+      (TF: tunnel_function f = OK tf)
+      (RS: Registers.regs_lessdef rs trs),
+      match_stackframes
+        (Stackframe res f sp pc rs)
+        (Stackframe res tf sp (branch_target f pc) trs).
+
+(* rq: `match_states s1 s2` correspond à s1 ~ s2 *)
+Inductive match_states: state -> state -> Prop :=
+  | match_states_intro:
+      forall s ts f tf sp pc rs trs m tm
+      (STK: list_forall2 match_stackframes s ts)
+      (TF: tunnel_function f = OK tf)
+      (RS: Registers.regs_lessdef rs trs)
+      (MEM: Mem.extends m tm),
+      match_states
+        (State s f sp pc rs m)
+        (State ts tf sp (branch_target f pc) trs tm)
+  | match_states_call:
+      forall s ts f tf a ta m tm
+      (STK: list_forall2 match_stackframes s ts)
+      (TF: tunnel_fundef f = OK tf)
+      (ARGS: list_forall2 Val.lessdef a ta)
+      (MEM: Mem.extends m tm),
+      match_states
+        (Callstate s f a m)
+        (Callstate ts tf ta tm)
+  | match_states_return:
+      forall s ts v tv m tm
+      (STK: list_forall2 match_stackframes s ts)
+      (VAL: Val.lessdef v tv)
+      (MEM: Mem.extends m tm),
+      match_states
+        (Returnstate s v m)
+        (Returnstate ts tv tm).
+
+(* TODO: il faut définir une bonne mesure *)
+(* (**) Definition measure (st: state) : nat :=
+  match st with
+  | State s f sp pc rs m =>
+      match (fn_code f)!pc with
+      | Some (Inop pc') => (bound (branch_target f) pc') * 2 + 1
+      | Some (Icond _ _ ifso ifnot _) => (max
+                                    (bound (branch_target f) ifso)
+                                    (bound (branch_target f) ifnot)
+                                  ) * 2 + 1
+      | Some _ => (bound (branch_target f) pc) * 2
+      | None => 0
+      end
+  | Callstate s f v m => 0
+  | Returnstate s v m => 0
+  end.
+*)
+
+Definition measure (st: state): nat :=
+  match st with
+  | State s f sp pc rs m => bound (branch_target f) pc
+  | Callstate s f v m => 0
+  | Returnstate s v m => 0
+  end.
+
+
+(* rq: sans les lemmes définis au-dessus je ne vois pas trop comment j'aurais
+ * fait... ni comment j'aurais eu l'idée d'en faire des lemmes ? *)
+(**) Lemma transf_initial_states:
+  forall s1: state, initial_state prog s1 ->
+  exists s2: state, initial_state tprog s2 /\ match_states s1 s2.
+Proof.
+  intros. inversion H as [b f m0 ge0 MEM SYM PTR SIG CALL].
+  exploit function_ptr_translated.
+  - apply PTR.
+  - intros (tf & TPTR & TUN).
+    exists (Callstate nil tf nil m0). split.
+    + apply initial_state_intro with b.
+      * apply (Genv.init_mem_match TRANSL). apply MEM.
+      * rewrite (match_program_main TRANSL).
+        rewrite symbols_preserved. apply SYM.
+      * apply TPTR.
+      * rewrite <- SIG. apply sig_preserved. apply TUN.
+    + apply match_states_call.
+      * apply list_forall2_nil.
+      * apply TUN.
+      * apply list_forall2_nil.
+      * apply Mem.extends_refl.
+Qed.
+
+(**) Lemma transf_final_states:
+  forall (s1 : state)
+  (s2 : state) (r : Integers.Int.int),
+  match_states s1 s2 ->
+  final_state s1 r ->
+  final_state s2 r.
+Proof.
+  (* rq: `inv` au lieu de `inversion` fait beaucoup de nettoyage dans les
+   * hypothèses, mais je sais pas trop ce que ça fait exactement *)
+  intros. inv H0. inv H. inv VAL. inversion STK. apply final_state_intro.
+Qed.
+
+Lemma tunnel_function_unfold:
+  forall f tf pc,
+  tunnel_function f = OK tf ->
+  (fn_code tf) ! pc =
+  option_map (tunnel_instr (branch_target f)) (fn_code f) ! pc.
+Proof.
+  intros f tf pc.
+  unfold tunnel_function.
+  destruct (check_included _ _) eqn:EQinc; try congruence.
+  destruct (check_code _ _) eqn:EQcode; simpl; try congruence.
+  intro. inv H. simpl. rewrite PTree.gmap1. reflexivity.
+Qed.
+
+Lemma reglist_lessdef:
+  forall (rs trs: Registers.Regmap.t val) (args: list Registers.reg),
+  regs_lessdef rs trs -> Val.lessdef_list (rs##args) (trs##args).
+Proof.
+  intros. induction args; simpl; constructor.
+  apply H. apply IHargs.
+Qed.
+
+Lemma instruction_type_preserved:
+  forall (f tf:function) (pc:node) (i ti:instruction)
+  (TF: tunnel_function f = OK tf)
+  (FATPC: (fn_code f) ! pc = Some i)
+  (NOTINOP: forall s, i <> Inop s)
+  (NOTICOND: forall cond args ifso ifnot info, i <> Icond cond args ifso ifnot info)
+  (TI: ti = tunnel_instr (branch_target f) i),
+  (fn_code tf) ! (branch_target f pc) = Some ti.
+Proof.
+  intros.
+  assert ((fn_code tf) ! pc = Some (tunnel_instr (branch_target f) i)) as TFATPC.
+  rewrite (tunnel_function_unfold f tf pc); eauto.
+  rewrite FATPC; eauto.
+  exploit branch_target_bounds; eauto.
+  intro TB. inversion TB as [BT|s|cond args ifso ifnot info]; try (rewrite FATPC in H; congruence).
+Qed.
+
+(* rq: utilisée pour `tunnel_step_correct` mais je ne comprends pas tout *)
+(**) Lemma find_function_translated:
+  forall ros rs trs fd,
+  regs_lessdef rs trs ->
+  find_function ge ros rs = Some fd ->
+  exists tfd, tunnel_fundef fd = OK tfd /\ find_function tge ros trs = Some tfd.
+Proof.
+  intros. destruct ros; simpl in *.
+  - (* reg *)
+    assert (E: trs # r = rs # r).
+    { exploit Genv.find_funct_inv. apply H0. intros (b & EQ).
+      generalize (H r) . rewrite EQ. intro LD. inv LD. auto. }
+    rewrite E. exploit functions_translated; eauto.
+  - (* ident *)
+    rewrite symbols_preserved. destruct (Genv.find_symbol ge i); inv H0.
+    exploit function_ptr_translated; eauto.
+    intros (tf & X1 & X2). exists tf; intuition.
+Qed.
+
+Lemma list_forall2_lessdef_rs:
+  forall rs trs args,
+  regs_lessdef rs trs ->
+  list_forall2 Val.lessdef rs ## args trs ## args.
+Proof.
+  intros rs trs args LD.
+  exploit (reglist_lessdef rs trs args). apply LD.
+  induction args; simpl; intros H; try constructor; inv H.
+  apply H3. apply IHargs. apply H5.
+Qed.
+
+Lemma fn_stacksize_preserved:
+  forall f tf
+  (TF: tunnel_function f = OK tf),
+  fn_stacksize f = fn_stacksize tf.
+Proof.
+  intros f tf. unfold tunnel_function.
+  destruct (check_included _ _); try congruence.
+  intro H. monadInv H. simpl. reflexivity.
+Qed.
+
+Lemma regs_setres_lessdef:
+  forall res vres tvres rs trs,
+  regs_lessdef rs trs -> Val.lessdef vres tvres ->
+  regs_lessdef (regmap_setres res vres rs) (regmap_setres res tvres trs).
+Proof.
+  induction res; intros; simpl; try auto using set_reg_lessdef.
+Qed.
+
+Lemma regmap_optget_lessdef:
+  forall or rs trs,
+  regs_lessdef rs trs -> Val.lessdef (regmap_optget or Vundef rs) (regmap_optget or Vundef trs).
+Proof.
+  intros or rs trs RS.
+  induction or; simpl; auto using set_reg_lessdef.
+Qed.
+
+Lemma tunnel_fundef_Internal:
+  forall (f: function) (tf: fundef)
+  (TF: tunnel_fundef (Internal f) = OK tf),
+  exists (tf': function), tf = (Internal tf') /\ tunnel_function f = OK tf'.
+Proof.
+  intros f tf.
+  unfold tunnel_fundef. simpl. intro H. monadInv H. exists x.
+  split. reflexivity. apply EQ.
+Qed.
+
+Lemma tunnel_fundef_External:
+  forall (ef: external_function) (tf: fundef)
+  (TF: tunnel_fundef (External ef) = OK tf),
+  tf = (External ef).
+Proof.
+  intros f tf.
+  unfold tunnel_fundef. simpl. intro H. monadInv H. reflexivity.
+Qed.
+
+
+Lemma fn_entrypoint_preserved:
+  forall f tf
+  (TF: tunnel_function f = OK tf),
+  fn_entrypoint tf = branch_target f (fn_entrypoint f).
+Proof.
+  intros f tf.
+  unfold tunnel_function. destruct (check_included _ _); try congruence.
+  intro TF. monadInv TF. simpl. reflexivity.
+Qed.
+
+Lemma init_regs_lessdef:
+  forall f tf args targs
+  (ARGS: list_forall2 Val.lessdef args targs)
+  (TF: tunnel_function f = OK tf),
+  regs_lessdef (init_regs args (fn_params f)) (init_regs targs (fn_params tf)).
+Proof.
+  assert (regs_lessdef (Regmap.init Vundef) (Regmap.init Vundef)) as Hundef.
+  { unfold Regmap.init. unfold regs_lessdef. intro. unfold Regmap.get. rewrite PTree.gempty. apply Val.lessdef_undef. }
+
+  intros f tf args targs ARGS.
+
+  unfold tunnel_function. destruct (check_included _ _) eqn:EQinc; try congruence.
+  intro TF. monadInv TF. simpl. 
+  (* 
+  induction ARGS.
+  - induction (fn_params f) eqn:EQP; apply Hundef.
+  - induction (fn_params f) eqn:EQP.
+    * simpl. apply Hundef.
+    * simpl. apply set_reg_lessdef. apply H. 
+    *)
+    
+  generalize (fn_params f) as l. induction ARGS; induction l; try (simpl; apply Hundef).
+    simpl. apply set_reg_lessdef; try assumption. apply IHARGS.
+Qed.
+
+Lemma lessdef_forall2_list:
+  forall args ta,
+  list_forall2 Val.lessdef args ta -> Val.lessdef_list args ta.
+Proof.
+  intros args ta H. induction H. apply Val.lessdef_list_nil. apply Val.lessdef_list_cons. apply H. apply IHlist_forall2.
+Qed.
+
+
+(* `Lemma tunnel_step_correct` correspond au diagramme "option simulation" *)
+Lemma tunnel_step_correct:
+  forall st1 t st2, step ge st1 t st2 ->
+  forall st1' (MS: match_states st1 st1'),
+  (exists st2', step tge st1' t st2' /\ match_states st2 st2')
+  \/ (measure st2 < measure st1 /\ t = E0 /\ match_states st2 st1')%nat.
+Proof.
+  intros st1 t st2 H. induction H; intros; try (inv MS).
+  - (* Inop *)
+    exploit branch_target_bounds. apply TF.
+    rewrite H. intro. inv H0.
+    + (* TB_default *)
+      rewrite TB. left. eexists. split.
+      * apply exec_Inop. rewrite (tunnel_function_unfold f tf pc). rewrite H. simpl. eauto. apply TF.
+      * constructor; try assumption.
+    + (* TB_nop *)
+      simpl. right. repeat split. apply DEC.
+      rewrite EQ. apply match_states_intro; assumption.
+  - (* Iop *)
+    exploit eval_operation_lessdef; try eassumption.
+    apply reglist_lessdef. apply RS.
+    intros (tv & EVAL & LD).
+    left; eexists; split.
+    + eapply exec_Iop with (v:=tv).
+      apply instruction_type_preserved with (Iop op args res pc'); (simpl; auto)||congruence.
+      rewrite <- EVAL. apply eval_operation_preserved. apply symbols_preserved.
+    + apply match_states_intro; eauto. apply set_reg_lessdef. apply LD. apply RS.
+  - (* Iload *)
+    exploit eval_addressing_lessdef; try eassumption.
+    apply reglist_lessdef. apply RS.
+    intros (ta & EVAL & LD).
+    exploit Mem.loadv_extends; try eassumption.
+    intros (tv & LOAD & LD').
+    left. eexists. split.
+    + eapply exec_Iload.
+      * exploit instruction_type_preserved; (simpl; eauto)||congruence.
+      * rewrite <- EVAL. apply eval_addressing_preserved. apply symbols_preserved.
+      * apply LOAD.
+    + apply match_states_intro; try assumption. apply set_reg_lessdef. apply LD'. apply RS.
+  - (* Iload NOTRAP1 *)
+    exploit eval_addressing_lessdef_none; try eassumption.
+    apply reglist_lessdef; apply RS.
+    left. eexists. split.
+    + eapply exec_Iload_notrap1.
+      * exploit instruction_type_preserved; (simpl; eauto)||congruence.
+      * rewrite <- H1. apply eval_addressing_preserved. apply symbols_preserved.
+    + apply match_states_intro; try assumption. apply set_reg_lessdef. apply Val.lessdef_undef. apply RS.
+  - (* Iload NOTRAP2 *)
+    exploit eval_addressing_lessdef; try eassumption.
+    apply reglist_lessdef; apply RS.
+    intros (ta & EVAL & LD).
+    (* TODO: on peut sans doute factoriser ça *)
+    destruct (Mem.loadv chunk tm ta) eqn:Htload.
+    + left; eexists; split.
+      eapply exec_Iload.
+      * exploit instruction_type_preserved; (simpl; eauto)||congruence.
+      * rewrite <- EVAL. apply eval_addressing_preserved. apply symbols_preserved.
+      * apply Htload.
+      * apply match_states_intro; try assumption. apply set_reg_lessdef; eauto.
+    + left; eexists; split.
+      eapply exec_Iload_notrap2.
+      * exploit instruction_type_preserved; (simpl; eauto)||congruence.
+      * rewrite <- EVAL. apply eval_addressing_preserved. apply symbols_preserved.
+      * apply Htload.
+      * apply match_states_intro; try assumption. apply set_reg_lessdef; eauto.
+  - (* Lstore *)
+    exploit eval_addressing_lessdef; try eassumption.
+    apply reglist_lessdef; apply RS.
+    intros (ta & EVAL & LD).
+    exploit Mem.storev_extends; try eassumption. apply RS.
+    intros (tm' & STORE & MEM').
+    left. eexists. split.
+    + eapply exec_Istore.
+      * exploit instruction_type_preserved; (simpl; eauto)||congruence.
+      * rewrite <- EVAL. apply eval_addressing_preserved. apply symbols_preserved.
+      * rewrite STORE. reflexivity.
+    + apply match_states_intro; try eassumption.
+  - (* Icall *)
+    left.
+    exploit find_function_translated; try eassumption.
+    intros (tfd & TFD & FIND).
+    eexists. split.
+    + eapply exec_Icall.
+      * exploit instruction_type_preserved; (simpl; eauto)||congruence.
+      * apply FIND.
+      * apply sig_preserved. apply TFD.
+    + apply match_states_call; try assumption.
+      * apply list_forall2_cons; try assumption. apply match_stackframes_intro; try assumption.
+      * apply list_forall2_lessdef_rs. apply RS.
+  - (* Itailcall *)
+    exploit find_function_translated; try eassumption.
+    intros (tfd & TFD & FIND).
+    exploit Mem.free_parallel_extends; try eassumption.
+    intros (tm' & FREE & MEM').
+    left. eexists. split.
+    + eapply exec_Itailcall.
+      * exploit instruction_type_preserved; (simpl; eauto)||congruence.
+      * apply FIND.
+      * apply sig_preserved. apply TFD.
+      * erewrite <- fn_stacksize_preserved; try eassumption.
+    + apply match_states_call; try assumption.
+      apply list_forall2_lessdef_rs. apply RS.
+  - (* Ibuiltin *)
+    exploit eval_builtin_args_lessdef; try eassumption. apply RS.
+    intros (vl2 & EVAL & LD).
+    exploit external_call_mem_extends; try eassumption.
+    intros (tvres & tm' & EXT & LDRES & MEM' & UNCHGD).
+    left. eexists. split.
+    + eapply exec_Ibuiltin.
+      * exploit instruction_type_preserved; (simpl; eauto)||congruence.
+      * eapply eval_builtin_args_preserved. eapply symbols_preserved. eapply EVAL.
+      * eapply external_call_symbols_preserved. eapply senv_preserved. eapply EXT.
+    + apply match_states_intro; try assumption. apply regs_setres_lessdef; try assumption.
+  - (* Icond *)
+    simpl. exploit branch_target_bounds. apply TF. rewrite H. intro. inv H1.
+    + (* TB_default *)
+      rewrite TB.
+      destruct (fn_code tf)!pc as [[]|] eqn:EQ;
+      assert (tunnel_function f = OK tf) as TF'; auto;
+      unfold tunnel_function in TF; destruct (check_included _ _) in TF; monadInv TF;
+      simpl in EQ; rewrite PTree.gmap1 in EQ; rewrite H in EQ; simpl in EQ; destruct (peq _ _) eqn: EQpeq in EQ; try congruence.
+      * left. eexists. split. 
+        -- eapply exec_Inop. simpl. rewrite PTree.gmap1. rewrite H. simpl. rewrite EQpeq. reflexivity.
+        -- destruct b. apply match_states_intro; eauto. rewrite e. apply match_states_intro; eauto.
+      * left. eexists. split.
+        -- eapply exec_Icond; auto. simpl. rewrite PTree.gmap1. rewrite H. simpl. rewrite EQpeq. reflexivity. eapply eval_condition_lessdef; try eassumption. apply reglist_lessdef. apply RS.
+        -- destruct b; apply match_states_intro; auto.
+    + (* TB_cond *) right; repeat split; destruct b; try assumption.
+      * rewrite EQSO. apply match_states_intro; try assumption.
+      * rewrite EQNOT. apply match_states_intro; try assumption.
+  - (* Ijumptable *)
+    left. eexists. split.
+    + eapply exec_Ijumptable.
+      * exploit instruction_type_preserved; (simpl; eauto)||congruence.
+      * generalize (RS arg). rewrite H0. intro. inv H2. reflexivity.
+      * rewrite list_nth_z_map. rewrite H1. simpl. reflexivity.
+    + apply match_states_intro; try eassumption.
+  - (* Ireturn *)
+    exploit Mem.free_parallel_extends; try eassumption.
+    intros (tm' & FREE & MEM').
+    left. eexists. split.
+    + eapply exec_Ireturn.
+      * exploit instruction_type_preserved; (simpl; eauto)||congruence.
+      * erewrite <- fn_stacksize_preserved. eapply FREE. eapply TF.
+    + apply match_states_return; try eassumption.
+      apply regmap_optget_lessdef. apply RS.
+  - (* internal function *)
+    exploit tunnel_fundef_Internal; try eassumption.
+    intros (tf' & EQ & TF'). subst.
+    exploit Mem.alloc_extends; try eassumption. reflexivity. reflexivity.
+    intros (m2' & ALLOC & EXT).
+    left. eexists. split.
+    + eapply exec_function_internal.
+      rewrite <- (fn_stacksize_preserved f tf'). eapply ALLOC. eapply TF'.
+    + rewrite (fn_entrypoint_preserved f tf'); try eassumption. apply match_states_intro; try eassumption.
+      apply init_regs_lessdef. apply ARGS. apply TF'.
+  - (* external function *)
+    exploit external_call_mem_extends. eapply H. eapply MEM.  eapply lessdef_forall2_list. eapply ARGS.
+    intros (tvres & tm' & EXTCALL & LD & EXT & MEMUNCHGD).
+    left. eexists. split.
+    + erewrite (tunnel_fundef_External ef tf); try eassumption.
+      eapply exec_function_external. eapply external_call_symbols_preserved. eapply senv_preserved. eapply EXTCALL.
+    + eapply match_states_return; try assumption.
+  - (* return *)
+    inv STK. inv H1.
+    left. eexists. split.
+    + eapply exec_return.
+    + eapply match_states_intro; try assumption.
+      apply set_reg_lessdef; try assumption.
+Qed.
+
+
+Theorem transf_program_correct:
+  forward_simulation (RTL.semantics prog) (RTL.semantics tprog).
+Proof.
+  eapply forward_simulation_opt.
+  apply senv_preserved.
+  apply transf_initial_states.
+  apply transf_final_states.
+  exact tunnel_step_correct.
+Qed.
+
+End PRESERVATION.
diff --git a/backend/RTLcommonaux.ml b/backend/RTLcommonaux.ml
index 0e369d04..f7e49aa4 100644
--- a/backend/RTLcommonaux.ml
+++ b/backend/RTLcommonaux.ml
@@ -6,12 +6,15 @@ open Kildall
 open Lattice
 
 let p2i r = P.to_int r
+
 let i2p i = P.of_int i
 
 let get_some = function
   | None -> failwith "Got None instead of Some _"
   | Some thing -> thing
 
+let get_ok r = match r with Errors.OK x -> x | _ -> failwith "Did not get OK"
+
 let successors_inst = function
   | Inop n
   | Iop (_, _, _, n)
@@ -42,49 +45,70 @@ let get_join_points code entry =
   let reached = ref (PTree.map (fun n i -> false) code) in
   let reached_twice = ref (PTree.map (fun n i -> false) code) in
   let rec traverse pc =
-    if get_some @@ PTree.get pc !reached then begin
+    if get_some @@ PTree.get pc !reached then (
       if not (get_some @@ PTree.get pc !reached_twice) then
-        reached_twice := PTree.set pc true !reached_twice
-    end else begin
+        reached_twice := PTree.set pc true !reached_twice)
+    else (
       reached := PTree.set pc true !reached;
-      traverse_succs (successors_inst @@ get_some @@ PTree.get pc code)
-    end
+      traverse_succs (successors_inst @@ get_some @@ PTree.get pc code))
   and traverse_succs = function
     | [] -> ()
-    | [pc] -> traverse pc
-    | pc :: l -> traverse pc; traverse_succs l
-  in traverse entry; !reached_twice
+    | [ pc ] -> traverse pc
+    | pc :: l ->
+        traverse pc;
+        traverse_succs l
+  in
+  traverse entry;
+  !reached_twice
 
-let transfer f pc after = let open Liveness in
+let transfer f pc after =
+  let open Liveness in
   match PTree.get pc f.fn_code with
-  | Some i ->
-    (match i with
-    | Inop _ -> after
-    | Iop (_, args, res, _) ->
-        reg_list_live args (Regset.remove res after)
-    | Iload (_, _, _, args, dst, _) ->
-        reg_list_live args (Regset.remove dst after)
-    | Istore (_, _, args, src, _) ->
-        reg_list_live args (Regset.add src after)
-    | Icall (_, ros, args, res, _) ->
-        reg_list_live args (reg_sum_live ros (Regset.remove res after))
-    | Itailcall (_, ros, args) ->
-        reg_list_live args (reg_sum_live ros Regset.empty)
-    | Ibuiltin (_, args, res, _) ->
-        reg_list_live (AST.params_of_builtin_args args)
-          (reg_list_dead (AST.params_of_builtin_res res) after)
-    | Icond (_, args, _, _, _) ->
-        reg_list_live args after
-    | Ijumptable (arg, _) ->
-        Regset.add arg after
-    | Ireturn optarg ->
-        reg_option_live optarg Regset.empty)
+  | Some i -> (
+      match i with
+      | Inop _ -> after
+      | Iop (_, args, res, _) -> reg_list_live args (Regset.remove res after)
+      | Iload (_, _, _, args, dst, _) ->
+          reg_list_live args (Regset.remove dst after)
+      | Istore (_, _, args, src, _) -> reg_list_live args (Regset.add src after)
+      | Icall (_, ros, args, res, _) ->
+          reg_list_live args (reg_sum_live ros (Regset.remove res after))
+      | Itailcall (_, ros, args) ->
+          reg_list_live args (reg_sum_live ros Regset.empty)
+      | Ibuiltin (_, args, res, _) ->
+          reg_list_live
+            (AST.params_of_builtin_args args)
+            (reg_list_dead (AST.params_of_builtin_res res) after)
+      | Icond (_, args, _, _, _) -> reg_list_live args after
+      | Ijumptable (arg, _) -> Regset.add arg after
+      | Ireturn optarg -> reg_option_live optarg Regset.empty)
   | None -> Regset.empty
 
-module RegsetLat = LFSet(Regset)
-
-module DS = Backward_Dataflow_Solver(RegsetLat)(NodeSetBackward)
+module RegsetLat = LFSet (Regset)
+module DS = Backward_Dataflow_Solver (RegsetLat) (NodeSetBackward)
 
 let analyze f =
-  let liveouts = get_some @@ DS.fixpoint f.fn_code successors_instr (transfer f) in
-  PTree.map (fun n _ -> let lo = PMap.get n liveouts in transfer f n lo) f.fn_code
+  let liveouts =
+    get_some @@ DS.fixpoint f.fn_code successors_instr (transfer f)
+  in
+  PTree.map
+    (fun n _ ->
+      let lo = PMap.get n liveouts in
+      transfer f n lo)
+    f.fn_code
+
+let get_outputs liveness n last =
+  let path_last_successors = successors_inst last in
+  let list_input_regs =
+    List.map (fun n -> get_some @@ PTree.get n liveness) path_last_successors
+  in
+  List.fold_left Regset.union Regset.empty list_input_regs
+(* TODO gourdinl to remove, as we do not need por anymore?
+   let por = match last_instruction with (* see RTLpathLivegen.final_inst_checker *)
+     | Icall (_, _, _, res, _) -> Regset.remove res outputs
+     | Ibuiltin (_, _, res, _) ->  Liveness.reg_list_dead (AST.params_of_builtin_res res) outputs
+     | Itailcall (_, _, _) | Ireturn _ ->
+        assert (outputs = Regset.empty); (* defensive check for performance *)
+        outputs
+     | _ -> outputs
+   in (por, outputs) *)
diff --git a/backend/RTLtyping.v b/backend/RTLtyping.v
index 15ed6d8a..6048f895 100644
--- a/backend/RTLtyping.v
+++ b/backend/RTLtyping.v
@@ -858,7 +858,7 @@ Lemma wt_exec_Iload_notrap:
   forall env f chunk addr args dst s rs,
   wt_instr f env (Iload NOTRAP chunk addr args dst s) ->
   wt_regset env rs ->
-  wt_regset env (rs#dst <- (default_notrap_load_value chunk)).
+  wt_regset env (rs#dst <- Vundef).
 Proof.
   intros. 
   eapply wt_regset_assign; eauto. simpl. trivial.
diff --git a/backend/Tailcallproof.v b/backend/Tailcallproof.v
index 80a68327..39fc10fb 100644
--- a/backend/Tailcallproof.v
+++ b/backend/Tailcallproof.v
@@ -440,7 +440,7 @@ Proof.
   TransfInstr.
   assert (Val.lessdef_list (rs##args) (rs'##args)). apply regs_lessdef_regs; auto.
   left.
-  exists (State s' (transf_function f) (Vptr sp0 Ptrofs.zero) pc' (rs'#dst <- (default_notrap_load_value chunk)) m'); split.
+  exists (State s' (transf_function f) (Vptr sp0 Ptrofs.zero) pc' (rs'#dst <- Vundef) m'); split.
   eapply exec_Iload_notrap1.
   eassumption.
   eapply eval_addressing_lessdef_none. eassumption.
@@ -465,7 +465,7 @@ Proof.
     exact symbols_preserved.
     assumption.
     econstructor; eauto. apply set_reg_lessdef; auto.
-  + exists (State s' (transf_function f) (Vptr sp0 Ptrofs.zero) pc' (rs'#dst <- (default_notrap_load_value chunk)) m'); split.
+  + exists (State s' (transf_function f) (Vptr sp0 Ptrofs.zero) pc' (rs'#dst <- Vundef) m'); split.
     eapply exec_Iload_notrap2. eassumption.
     erewrite eval_addressing_preserved.
     eassumption.
diff --git a/backend/Tunnelingaux.ml b/backend/Tunnelingaux.ml
deleted file mode 100644
index 87e6d303..00000000
--- a/backend/Tunnelingaux.ml
+++ /dev/null
@@ -1,283 +0,0 @@
-(* *************************************************************)
-(*                                                             *)
-(*             The Compcert verified compiler                  *)
-(*                                                             *)
-(*           Sylvain Boulmé  Grenoble-INP, VERIMAG             *)
-(*                                                             *)
-(*  Copyright VERIMAG. All rights reserved.                    *)
-(*  This file is distributed under the terms of the INRIA      *)
-(*  Non-Commercial License Agreement.                          *)
-(*                                                             *)
-(* *************************************************************)
-
-(*
-
-This file implements the [branch_target] oracle that identifies "nop" branches in a LTL function,
-and computes their target node with the distance (ie the number of cummulated nops) toward this target.
-
-See [Tunneling.v]
-
-*)
-
-open Coqlib
-open LTL
-open Maps
-open Camlcoq
-
-let limit_tunneling = None (* for debugging: [Some x] limit the number of iterations *)
-let debug_flag = ref false
-let final_dump = false   (* set to true to have a more verbose debugging *)
-
-let debug fmt =
-  if !debug_flag then Printf.eprintf fmt
-  else Printf.ifprintf stderr fmt
-
-exception BugOnPC of int
-
-(* type of labels in the cfg *)
-type label = int * P.t
-
-(* instructions under analyzis *)
-type simple_inst = (* a simplified view of LTL instructions *)
-  LBRANCH of node
-| LCOND of node * node
-| OTHER
-and node = {
-    lab : label;
-    mutable inst: simple_inst;
-    mutable link: node; (* link in the union-find: itself for non "nop"-nodes, target of the "nop" otherwise *)
-    mutable dist: int;
-    mutable tag: int
-  }
-
-(* type of the (simplified) CFG *)
-type cfg = {
-    nodes: (int, node) Hashtbl.t;
-    mutable rems: node list; (* remaining conditions that may become lbranch or not *)
-    mutable num_rems: int;
-    mutable iter_num: int (* number of iterations in elimination of conditions *)
-  }
-
-let lab_i (n: node): int = fst n.lab
-let lab_p (n: node): P.t = snd n.lab
-
-let rec target c n = (* inspired from the "find" of union-find algorithm *)
-  match n.inst with
-  | LCOND(s1,s2) ->
-     if n.link != n
-     then update c n
-     else if n.tag < c.iter_num then (
-       (* we try to change the condition ... *)
-       n.tag <- c.iter_num; (* ... but at most once by iteration *)
-       let ts1 = target c s1 in
-       let ts2 = target c s2 in
-       if ts1 == ts2 then (n.link <- ts1; ts1) else n
-     ) else n
-  | _ ->
-     if n.link != n
-     then update c n
-     else n
-and update c n =
-  let t = target c n.link in
-  n.link <- t; t
-
-let get_node c p =
-  let li = P.to_int p in
-  try
-    Hashtbl.find c.nodes li
-  with
-    Not_found ->
-      let rec n = { lab = (li, p); inst = OTHER; link = n ; dist = 0; tag = 0 }  in
-      Hashtbl.add c.nodes li n;
-      n
-
-let set_branch c p s =
-  let li = P.to_int p in
-  try
-    let n = Hashtbl.find c.nodes li in
-    n.inst <- LBRANCH s;
-    n.link <- target c s
-  with
-    Not_found ->
-      let n = { lab = (li,p); inst = LBRANCH s; link = target c s; dist = 0; tag = 0 } in
-      Hashtbl.add c.nodes li n
-
-
-(* build [c.nodes] and accumulate in [acc] conditions at beginning of LTL basic-blocks *)
-let build_simplified_cfg c acc pc bb =
-  match bb with
-  | Lbranch s :: _ ->
-     let ns = get_node c s in
-     set_branch c pc ns;
-     acc
-  | Lcond (_, _, s1, s2, _) :: _ ->
-     c.num_rems <- c.num_rems + 1;
-     let ns1 = get_node c s1 in
-     let ns2 = get_node c s2 in
-     let npc = get_node c pc in
-     npc.inst <- LCOND(ns1, ns2);
-     npc::acc
-  | _ -> acc
-
-(* try to change a condition into a branch
-[acc] is the current accumulator of conditions to consider in the next iteration of repeat_change_cond
-*)
-let try_change_cond c acc pc =
-  match pc.inst with
-  | LCOND(s1,s2) ->
-     let ts1 = target c s1 in
-     let ts2 = target c s2 in
-     if ts1 == ts2 then (
-       pc.link <- ts1;
-       c.num_rems <- c.num_rems - 1;
-       acc
-     ) else
-       pc::acc
-  | _  -> raise (BugOnPC (lab_i pc)) (* LCOND expected *)
-
-(* repeat [try_change_cond] until no condition is changed into a branch *)
-let rec repeat_change_cond c =
-  c.iter_num <- c.iter_num + 1;
-  debug "++ Tunneling.branch_target %d: remaining number of conds to consider = %d\n" (c.iter_num) (c.num_rems);
-  let old =  c.num_rems in
-  c.rems <- List.fold_left (try_change_cond c) [] c.rems;
-  let curr = c.num_rems in
-  let continue =
-    match limit_tunneling with
-    | Some n -> curr < old && c.iter_num < n
-    | None -> curr < old
-  in
-  if continue
-  then repeat_change_cond c
-
-
-(* compute the final distance of each nop nodes to its target *)
-let undef_dist = -1
-let self_dist = undef_dist-1
-let rec dist n =
-  if n.dist = undef_dist
-  then (
-    n.dist <- self_dist; (* protection against an unexpected loop in the data-structure *)
-    n.dist <-
-      (match n.inst with
-       | OTHER -> 0
-       | LBRANCH p -> 1 + dist p
-       | LCOND (p1,p2) -> 1 + (max (dist p1) (dist p2)));
-    n.dist
-  ) else if n.dist=self_dist then raise (BugOnPC (lab_i n))
-    else n.dist
-
-let final_export f c =
-  let count = ref 0 in
-  let filter_nops_init_dist _ n acc =
-    let tn = target c n in
-    if tn == n
-    then (
-      n.dist <- 0; (* force [n] to be a base case in the recursion of [dist] *)
-      acc
-    ) else (
-      n.dist <- undef_dist; (* force [dist] to compute the actual [n.dist] *)
-      count := !count+1;
-      n::acc
-    )
-  in
-  let nops = Hashtbl.fold filter_nops_init_dist c.nodes [] in
-  let res = List.fold_left (fun acc n -> PTree.set (lab_p n) (lab_p n.link, Z.of_uint (dist n)) acc) PTree.empty nops in
-  debug "* Tunneling.branch_target: final number of eliminated nops = %d\n" !count;
-  res
-
-(*********************************************)
-(*** START: printing and debugging functions *)
-
-let string_of_labeli nodes ipc =
-  try
-    let pc = Hashtbl.find nodes ipc in
-    if pc.link == pc
-    then Printf.sprintf "(Target@%d)" (dist pc)
-    else Printf.sprintf "(Nop %d @%d)" (lab_i pc.link) (dist pc)
-  with
-    Not_found -> ""
-
-let print_bblock c println (pc, bb) =
-  match bb with
-  | Lbranch s::_ -> (if println then debug "\n"); debug "%d:Lbranch %d %s\n" pc (P.to_int s) (string_of_labeli c.nodes pc); false
-  | Lcond (_, _, s1, s2, _)::_ -> (if println then debug "\n"); debug "%d:Lcond (%d,%d) %s\n" pc (P.to_int s1) (P.to_int s2) (string_of_labeli c.nodes pc); false
-  | _ -> debug "%d " pc; true
-
-
-let print_cfg f c  =
-  let a = Array.of_list (PTree.fold (fun acc pc bb -> (P.to_int pc,bb)::acc) f.fn_code []) in
-  Array.fast_sort (fun (i1,_) (i2,_) -> i2 - i1) a;
-  let ep = P.to_int f.fn_entrypoint in
-  debug "entrypoint: %d %s\n" ep (string_of_labeli c.nodes ep);
-  let println = Array.fold_left (print_bblock c) false a in
-  (if println then debug "\n");debug "remaining cond:";
-  List.iter (fun n -> debug "%d " (lab_i n)) c.rems;
-  debug "\n"
-
-(*************************************************************)
-(* Copy-paste of the extracted code of the verifier          *)
-(* with [raise (BugOnPC (P.to_int pc))] instead of [Error.*] *)
-
-let get td pc =
-  match PTree.get pc td with
-  | Some p -> let (t0, d) = p in (t0, d)
-  | None -> (pc, Z.of_uint 0)
-
-let check_bblock td pc bb =
-  match PTree.get pc td with
-  | Some p ->
-    let (tpc, dpc) = p in
-    let dpc0 = dpc in
-    (match bb with
-     | [] ->
-       raise (BugOnPC (P.to_int pc))
-     | i :: _ ->
-       (match i with
-        | Lbranch s ->
-          let (ts, ds) = get td s in
-          if peq tpc ts
-          then if zlt ds dpc0
-               then ()
-               else raise (BugOnPC (P.to_int pc))
-          else raise (BugOnPC (P.to_int pc))
-        | Lcond (_, _, s1, s2, _) ->
-          let (ts1, ds1) = get td s1 in
-          let (ts2, ds2) = get td s2 in
-          if peq tpc ts1
-          then if peq tpc ts2
-               then if zlt ds1 dpc0
-                    then if zlt ds2 dpc0
-                         then ()
-                         else raise (BugOnPC (P.to_int pc))
-                    else raise (BugOnPC (P.to_int pc))
-               else raise (BugOnPC (P.to_int pc))
-          else raise (BugOnPC (P.to_int pc))
-        | _ ->
-          raise (BugOnPC (P.to_int pc))))
-  | None -> ()
-
-(** val check_code : coq_UF -> code -> unit res **)
-
-let check_code td c =
-  PTree.fold (fun _ pc bb -> check_bblock td pc bb) c (())
-
-(*** END: copy-paste & debugging functions *******)
-
-let branch_target f =
-  debug "* Tunneling.branch_target: starting on a new function\n";
-  if limit_tunneling <> None then debug "* WARNING: limit_tunneling <> None\n";
-  let c = { nodes = Hashtbl.create 100; rems = []; num_rems = 0; iter_num = 0 } in
-  c.rems <- PTree.fold (build_simplified_cfg c) f.fn_code [];
-  repeat_change_cond c;
-  let res = final_export f c in
-  if !debug_flag then (
-    try
-      check_code res f.fn_code;
-      if final_dump then print_cfg f c;
-    with e -> (
-      print_cfg f c;
-      check_code res f.fn_code
-    )
-  );
-  res
diff --git a/backend/Tunnelinglibs.ml b/backend/Tunnelinglibs.ml
new file mode 100644
index 00000000..7c826ba4
--- /dev/null
+++ b/backend/Tunnelinglibs.ml
@@ -0,0 +1,272 @@
+(* *************************************************************)
+(*                                                             *)
+(*             The Compcert verified compiler                  *)
+(*                                                             *)
+(*           Sylvain Boulmé  Grenoble-INP, VERIMAG             *)
+(*           TODO: Proper author information                   *)
+(*                                                             *)
+(*  Copyright VERIMAG. All rights reserved.                    *)
+(*  This file is distributed under the terms of the INRIA      *)
+(*  Non-Commercial License Agreement.                          *)
+(*                                                             *)
+(* *************************************************************)
+
+(*
+
+This file implements the core functions of the tunneling passes, for both RTL
+and LTL, by using a simplified CFG as a transparent interface
+
+See [LTLTunneling.v] and [RTLTunneling.v]
+
+*)
+
+open Maps
+open Camlcoq
+
+(* type of labels in the cfg *)
+type label = int * P.t
+
+(* instructions under analyzis *)
+type simple_inst = (* a simplified view of instructions *)
+  BRANCH of node
+| COND of node * node
+| OTHER
+and node = {
+    lab: label;
+    mutable inst: simple_inst;
+    mutable link: node; (* link in the union-find: itself for non "nop"-nodes, target of the "nop" otherwise *)
+    mutable dist: int;
+    mutable tag: int
+  }
+
+type positive = P.t
+type integer = Z.t
+
+(* type of the (simplified) CFG *)
+type cfg = {
+    nodes: (int, node) Hashtbl.t;
+    mutable rems: node list; (* remaining conditions that may become lbranch or not *)
+    mutable num_rems: int;
+    mutable iter_num: int (* number of iterations in elimination of conditions *)
+  }
+
+exception BugOnPC of int
+
+(* keeps track of the total number of nops seen, for debugging purposes *)
+let nopcounter = ref 0
+
+(* General functions that do not require language-specific context, and that
+ are used for building language-specific functions *)
+
+let rec target c n = (* inspired from the "find" of union-find algorithm *)
+  match n.inst with
+  | COND(s1,s2) ->
+     if n.link != n
+     then update c n
+     else if n.tag < c.iter_num then (
+       (* we try to change the condition ... *)
+       n.tag <- c.iter_num; (* ... but at most once by iteration *)
+       let ts1 = target c s1 in
+       let ts2 = target c s2 in
+       if ts1 == ts2 then (n.link <- ts1; ts1) else n
+     ) else n
+  | _ ->
+     if n.link != n
+     then update c n
+     else n
+and update c n =
+  let t = target c n.link in
+  n.link <- t; t
+
+let get_node c p =
+  let li = P.to_int p in
+  try
+    Hashtbl.find c.nodes li
+  with
+    Not_found ->
+      let rec n = { lab = (li, p); inst = OTHER; link = n ; dist = 0; tag = 0 }  in
+      Hashtbl.add c.nodes li n;
+      n
+
+let set_branch c p s =
+  let li = P.to_int p in
+  try
+    let n = Hashtbl.find c.nodes li in
+    n.inst <- BRANCH s;
+    n.link <- target c s
+  with
+    Not_found ->
+      let n = { lab = (li,p); inst = BRANCH s; link = target c s; dist = 0; tag = 0 } in
+      Hashtbl.add c.nodes li n
+
+let get td pc =
+  match PTree.get pc td with
+  | Some p -> let (t0, d) = p in (t0, d)
+  | None -> (pc, Z.of_uint 0)
+
+let lab_i (n: node): int = fst n.lab
+let lab_p (n: node): P.t = snd n.lab
+
+let undef_dist = -1
+let self_dist = undef_dist-1
+let rec dist n =
+  if n.dist = undef_dist
+  then (
+    n.dist <- self_dist; (* protection against an unexpected loop in the data-structure *)
+    n.dist <-
+      (match n.inst with
+       | OTHER -> 0
+       | BRANCH p -> 1 + dist p
+       | COND (p1,p2) -> 1 + (max (dist p1) (dist p2)));
+    n.dist
+  ) else if n.dist=self_dist then raise (BugOnPC (lab_i n))
+    else n.dist
+
+let string_of_labeli nodes ipc =
+  try
+    let pc = Hashtbl.find nodes ipc in
+    if pc.link == pc
+    then Printf.sprintf "(Target@%d)" (dist pc)
+    else Printf.sprintf "(Nop %d @%d)" (lab_i pc.link) (dist pc)
+  with
+    Not_found -> ""
+
+(*
+ * When given the necessary types and options as context, and then some
+ * language-specific functions that cannot be factorised between LTL and RTL, the
+ * `Tunneling` functor returns a module containing the corresponding
+ * `branch_target` function.
+ *)
+
+module Tunneling = functor
+  (* Language-specific types *)
+  (LANG: sig
+    type code_unit (* the type of a node of the code cfg (an instruction or a bblock *)
+    type funct (* type of internal functions *)
+  end)
+
+  (* Compilation options for debugging *)
+  (OPT: sig
+    val langname: string
+    val limit_tunneling: int option (* for debugging: [Some x] limit the number of iterations *)
+    val debug_flag: bool ref
+    val final_dump: bool (* set to true to have a more verbose debugging *)
+  end)
+  -> struct
+
+  (* The `debug` function uses values from `OPT`, and is used in functions passed to `F`
+     so it must be defined between the two *)
+  let debug fmt =
+    if !OPT.debug_flag then Printf.eprintf fmt
+    else Printf.ifprintf stderr fmt
+
+  module T
+    (* Language-specific functions *)
+    (FUNS: sig
+      (* build [c.nodes] and accumulate in [acc] conditions at beginning of LTL basic-blocks *)
+      val build_simplified_cfg: cfg -> node list -> positive -> LANG.code_unit -> node list
+      val print_code_unit: cfg -> bool -> int * LANG.code_unit -> bool
+      val fn_code: LANG.funct -> LANG.code_unit PTree.t
+      val fn_entrypoint: LANG.funct -> positive
+      val check_code_unit: (positive * integer) PTree.t -> positive -> LANG.code_unit -> unit
+    end)
+    (* only export what's needed *)
+    : sig val branch_target: LANG.funct -> (positive * integer) PTree.t end
+  = struct
+
+    (* try to change a condition into a branch [acc] is the current accumulator of
+       conditions to consider in the next iteration of repeat_change_cond *)
+    let try_change_cond c acc pc =
+      match pc.inst with
+      | COND(s1,s2) ->
+         let ts1 = target c s1 in
+         let ts2 = target c s2 in
+         if ts1 == ts2 then (
+           pc.link <- ts1;
+           c.num_rems <- c.num_rems - 1;
+           acc
+         ) else
+           pc::acc
+      | _  -> raise (BugOnPC (lab_i pc)) (* COND expected *)
+
+    (* repeat [try_change_cond] until no condition is changed into a branch *)
+    let rec repeat_change_cond c =
+      c.iter_num <- c.iter_num + 1;
+      debug "++ %sTunneling.branch_target %d: remaining number of conds to consider = %d\n" OPT.langname (c.iter_num) (c.num_rems);
+      let old =  c.num_rems in
+      c.rems <- List.fold_left (try_change_cond c) [] c.rems;
+      let curr = c.num_rems in
+      let continue =
+        match OPT.limit_tunneling with
+        | Some n -> curr < old && c.iter_num < n
+        | None -> curr < old
+      in
+      if continue
+      then repeat_change_cond c
+
+
+    (*********************************************)
+    (*** START: printing and debugging functions *)
+
+    let print_cfg (f: LANG.funct) c  =
+      let a = Array.of_list (PTree.fold (fun acc pc cu -> (P.to_int pc,cu)::acc) (FUNS.fn_code f) []) in
+      Array.fast_sort (fun (i1,_) (i2,_) -> i2 - i1) a;
+      let ep = P.to_int (FUNS.fn_entrypoint f) in
+      debug "entrypoint: %d %s\n" ep (string_of_labeli c.nodes ep);
+      let println = Array.fold_left (FUNS.print_code_unit c) false a in
+      (if println then debug "\n");debug "remaining cond:";
+      List.iter (fun n -> debug "%d " (lab_i n)) c.rems;
+      debug "\n"
+
+
+    (*************************************************************)
+    (* Copy-paste of the extracted code of the verifier          *)
+    (* with [raise (BugOnPC (P.to_int pc))] instead of [Error.*] *)
+
+    (** val check_code : coq_UF -> code -> unit res **)
+
+    let check_code td c =
+      PTree.fold (fun _ pc cu -> FUNS.check_code_unit td pc cu) c (())
+
+    (*** END: copy-paste & debugging functions *******)
+
+    (* compute the final distance of each nop nodes to its target *)
+    let final_export f c =
+      let count = ref 0 in
+      let filter_nops_init_dist _ n acc =
+        let tn = target c n in
+        if tn == n
+        then (
+          n.dist <- 0; (* force [n] to be a base case in the recursion of [dist] *)
+          acc
+        ) else (
+          n.dist <- undef_dist; (* force [dist] to compute the actual [n.dist] *)
+          count := !count+1;
+          n::acc
+        )
+      in
+      let nops = Hashtbl.fold filter_nops_init_dist c.nodes [] in
+      let res = List.fold_left (fun acc n -> PTree.set (lab_p n) (lab_p n.link, Z.of_uint (dist n)) acc) PTree.empty nops in
+      debug "* %sTunneling.branch_target: initial number of nops = %d\n" OPT.langname !nopcounter;
+      debug "* %sTunneling.branch_target: final number of eliminated nops = %d\n" OPT.langname !count;
+      res
+
+    let branch_target f =
+      debug "* %sTunneling.branch_target: starting on a new function\n" OPT.langname;
+      if OPT.limit_tunneling <> None then debug "* WARNING: limit_tunneling <> None\n";
+      let c = { nodes = Hashtbl.create 100; rems = []; num_rems = 0; iter_num = 0 } in
+      c.rems <- PTree.fold (FUNS.build_simplified_cfg c) (FUNS.fn_code f) [];
+      repeat_change_cond c;
+      let res = final_export f c in
+      if !OPT.debug_flag then (
+        try
+          check_code res (FUNS.fn_code f);
+          if OPT.final_dump then print_cfg f c;
+        with e -> (
+          print_cfg f c;
+          check_code res (FUNS.fn_code f)
+        )
+      );
+      res
+  end
+end
diff --git a/backend/ValueAnalysis.v b/backend/ValueAnalysis.v
index 561e94c9..e20edff7 100644
--- a/backend/ValueAnalysis.v
+++ b/backend/ValueAnalysis.v
@@ -1287,13 +1287,11 @@ Proof.
   eapply sound_succ_state; eauto. simpl; auto.
   unfold transfer; rewrite H. eauto.
   apply ematch_update; auto.
-  unfold default_notrap_load_value.
   constructor.
 - (* load notrap2 *)
   eapply sound_succ_state; eauto. simpl; auto.
   unfold transfer; rewrite H. eauto.
   apply ematch_update; auto.
-  unfold default_notrap_load_value.
   constructor.
 - (* store *)
   exploit eval_static_addressing_sound; eauto with va. intros VMADDR.
diff --git a/common/DebugPrint.ml b/common/DebugPrint.ml
index d94b31d8..83a485b0 100644
--- a/common/DebugPrint.ml
+++ b/common/DebugPrint.ml
@@ -133,7 +133,7 @@ let print_instructions insts code =
     debug "[ ";
     List.iter (
       fun n -> (PrintRTL.print_instruction stdout (P.to_int n, get_some @@ PTree.get n code))
-    ) insts; debug "]"
+    ) insts; debug "  ]"
   end
 
 let print_arrayp arr = begin
diff --git a/common/Memory.v b/common/Memory.v
index bf8ca083..ff17efb0 100644
--- a/common/Memory.v
+++ b/common/Memory.v
@@ -41,8 +41,6 @@ Require Export Memdata.
 Require Export Memtype.
 Require Import Lia.
 
-Definition default_notrap_load_value (chunk : memory_chunk) := Vundef.
-
 
 (* To avoid useless definitions of inductors in extracted code. *)
 Local Unset Elimination Schemes.
diff --git a/doc/index-kvx.html b/doc/index-kvx.html
index 62afb423..daa4cdc4 100644
--- a/doc/index-kvx.html
+++ b/doc/index-kvx.html
@@ -315,10 +315,10 @@ This IR is generic over the processor, even if currently, only used for KVX.
 </TR>
 
 <TR valign="top">
-  <TD>Branch tunneling</TD>
+  <TD>(LTL) Branch tunneling</TD>
   <TD>LTL to LTL</TD>
-  <TD><A HREF="html/compcert.backend.Tunneling.html">Tunneling</A></TD>
-  <TD><A HREF="html/compcert.backend.Tunnelingproof.html">Tunnelingproof</A></TD>
+  <TD><A HREF="html/compcert.backend.LTLTunneling.html">LTLTunneling</A></TD>
+  <TD><A HREF="html/compcert.backend.LTLTunnelingproof.html">LTLTunnelingproof</A></TD>
 </TR>
 
 <TR valign="top">
diff --git a/doc/index.html b/doc/index.html
index c3912cb2..34b87924 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -270,10 +270,10 @@ code.
 </TR>
 
 <TR valign="top">
-  <TD>Branch tunneling</TD>
+  <TD>(LTL) Branch tunneling</TD>
   <TD>LTL to LTL</TD>
-  <TD><A HREF="html/compcert.backend.Tunneling.html">Tunneling</A></TD>
-  <TD><A HREF="html/compcert.backend.Tunnelingproof.html">Tunnelingproof</A></TD>
+  <TD><A HREF="html/compcert.backend.LTLTunneling.html">LTLTunneling</A></TD>
+  <TD><A HREF="html/compcert.backend.LTLTunnelingproof.html">LTLTunnelingproof</A></TD>
 </TR>
 
 <TR valign="top">
diff --git a/driver/Clflags.ml b/driver/Clflags.ml
index fa17c2d9..25bd2c78 100644
--- a/driver/Clflags.ml
+++ b/driver/Clflags.ml
@@ -51,7 +51,7 @@ let option_flooprotate = ref 0 (* rotate the innermost loops to have the conditi
 let option_mtune = ref ""
                 
 let option_fprepass = ref true
-let option_fprepass_sched = ref "list"
+let option_fprepass_sched = ref "regpres"
 
 let option_fpostpass = ref true
 let option_fpostpass_sched = ref "list"
@@ -115,4 +115,6 @@ let option_inline_auto_threshold = ref 0
 let option_profile_arcs = ref false
 let option_fbranch_probabilities = ref true
 let option_debug_compcert = ref 0
+let option_regpres_threshold = ref 2
+let option_regpres_wait_window = ref false
 let main_function_name = ref "main"
diff --git a/driver/Compiler.vexpand b/driver/Compiler.vexpand
index 1d218657..9673267d 100644
--- a/driver/Compiler.vexpand
+++ b/driver/Compiler.vexpand
@@ -306,7 +306,7 @@ EXPAND_RTL_FORWARD_SIMULATIONS
   eapply compose_forward_simulations.
     eapply Allocationproof.transf_program_correct; eassumption.
   eapply compose_forward_simulations.
-    eapply Tunnelingproof.transf_program_correct; eassumption.
+    eapply LTLTunnelingproof.transf_program_correct; eassumption.
   eapply compose_forward_simulations.
     eapply Linearizeproof.transf_program_correct; eassumption.
   eapply compose_forward_simulations.
diff --git a/driver/Driver.ml b/driver/Driver.ml
index 7192ba4b..3f5a4bd9 100644
--- a/driver/Driver.ml
+++ b/driver/Driver.ml
@@ -210,7 +210,9 @@ Processing options:
   -mtune=         Type of CPU (for scheduling on some architectures)
   -fprepass      Perform prepass scheduling (only on some architectures) [on]
   -fprepass= <optim> Perform postpass scheduling with the specified optimization [list]
-                   (<optim>=list: list scheduling, <optim>=revlist: reverse list scheduling, <optim>=zigzag: zigzag scheduling, <optim>=ilp: ILP, <optim>=greedy: just packing bundles)
+                   (<optim>=list: list scheduling, <optim>=revlist: reverse list scheduling, <optim>=regpres: list scheduling aware of register pressure, <optim>=regpres_bis: variant of regpres, <optim>=zigzag: zigzag scheduling, <optim>=ilp: ILP, <optim>=greedy: just packing bundles)
+  -regpres-threshold n   With `-fprepass= regpres`, set threshold value for number of free registers before trying to decrease register pressure
+  -fregpres-wait-window  When register pressure is high, use a 5-cycle waiting window instead of scheduling short paths first (default no)
   -fpostpass     Perform postpass scheduling (only for K1 architecture) [on] 
   -fpostpass= <optim> Perform postpass scheduling with the specified optimization [list]
                    (<optim>=list: list scheduling, <optim>=ilp: ILP, <optim>=greedy: just packing bundles)
@@ -296,9 +298,9 @@ let num_input_files = ref 0
 let cmdline_actions =
   let f_opt name ref =
     [Exact("-f" ^ name), Set ref; Exact("-fno-" ^ name), Unset ref] in
-  let f_opt_str name ref strref =
+  let f_opt_str name default ref strref =
     [Exact("-f" ^ name ^ "="), String 
-      (fun s -> (strref := (if s == "" then "list" else s)); ref := true)
+      (fun s -> (strref := (if s == "" then default else s)); ref := true)
      ] in
   let f_str name strref default =
     [Exact("-f" ^ name ^ "="), String 
@@ -342,6 +344,7 @@ let cmdline_actions =
   Exact "-fprofile-use=", String (fun s -> Profilingaux.load_profiling_info s);
   Exact "-finline-auto-threshold", Integer (fun n -> option_inline_auto_threshold := n);
   Exact "-debug-compcert", Integer (fun n -> option_debug_compcert := n);
+  Exact "-regpres-threshold", Integer (fun n -> option_regpres_threshold := n);
   Exact "-fsmall-data", Integer(fun n -> option_small_data := n);
   Exact "-fsmall-const", Integer(fun n -> option_small_const := n);
   Exact "-ffloat-const-prop", Integer(fun n -> option_ffloatconstprop := n); 
@@ -424,6 +427,7 @@ let cmdline_actions =
   @ f_opt "redundancy" option_fredundancy
   @ [ Exact "-mtune", String (fun s -> option_mtune := s) ]
   @ f_opt "prepass" option_fprepass
+  @ f_opt "regpres-wait-window" option_regpres_wait_window
   @ f_opt "postpass" option_fpostpass
   @ [ Exact "-ftailduplicate", Integer (fun n -> option_ftailduplicate := n) ]
   @ f_opt "predict" option_fpredict
@@ -431,8 +435,8 @@ let cmdline_actions =
   @ [ Exact "-funrollbody", Integer (fun n -> option_funrollbody := n) ]
   @ [ Exact "-flooprotate", Integer (fun n -> option_flooprotate := n) ]
   @ f_opt "tracelinearize" option_ftracelinearize
-  @ f_opt_str "prepass" option_fprepass option_fprepass_sched
-  @ f_opt_str "postpass" option_fpostpass option_fpostpass_sched
+  @ f_opt_str "prepass" "regpress" option_fprepass option_fprepass_sched
+  @ f_opt_str "postpass" "list" option_fpostpass option_fpostpass_sched
   @ f_opt "inline" option_finline
   @ f_opt "inline-functions-called-once" option_finline_functions_called_once
   @ f_opt "globaladdrtmp" option_fglobaladdrtmp
diff --git a/kvx/Asmblockdeps.v b/kvx/Asmblockdeps.v
index b6d18c3e..a9786e0a 100644
--- a/kvx/Asmblockdeps.v
+++ b/kvx/Asmblockdeps.v
@@ -164,17 +164,17 @@ Definition arith_eval (ao: arith_op) (l: list value) :=
   | _, _ => None
   end.
 
-Definition exec_incorrect_load trap chunk :=
+Definition exec_incorrect_load trap :=
   match trap with
   | TRAP => None
-  | NOTRAP => Some (Val (concrete_default_notrap_load_value chunk))
+  | NOTRAP => Some (Val Vundef)
   end.
 
 Definition exec_load_deps_offset (trap: trapping_mode) (chunk: memory_chunk) (m: mem) (v: val) (ofs: offset) :=
   let (ge, fn) := Ge in
   match (eval_offset ofs) with
   | OK ptr => match Mem.loadv chunk m (Val.offset_ptr v ptr) with 
-              | None => exec_incorrect_load trap chunk
+              | None => exec_incorrect_load trap
               | Some vl => Some (Val vl)
               end
   | _ => None
@@ -182,13 +182,13 @@ Definition exec_load_deps_offset (trap: trapping_mode) (chunk: memory_chunk) (m:
 
 Definition exec_load_deps_reg (trap: trapping_mode) (chunk: memory_chunk) (m: mem) (v vo: val) :=
   match Mem.loadv chunk m (Val.addl v vo) with
-  | None => exec_incorrect_load trap chunk
+  | None => exec_incorrect_load trap
   | Some vl => Some (Val vl)
   end.
 
 Definition exec_load_deps_regxs (trap: trapping_mode) (chunk: memory_chunk) (m: mem) (v vo: val) :=
   match Mem.loadv chunk m (Val.addl v (Val.shll vo (scale_of_chunk chunk))) with
-  | None => exec_incorrect_load trap chunk
+  | None => exec_incorrect_load trap
   | Some vl => Some (Val vl)
   end.
 
diff --git a/kvx/Asmblockgenproof1.v b/kvx/Asmblockgenproof1.v
index a65bd5bc..259c4f9c 100644
--- a/kvx/Asmblockgenproof1.v
+++ b/kvx/Asmblockgenproof1.v
@@ -1914,7 +1914,7 @@ Lemma transl_load_access2_correct_notrap2:
   Mem.loadv chunk m v = None ->
   exists rs',
      exec_straight ge (basics_to_code c) rs m (basics_to_code k) rs' m
-  /\ rs'#rd = concrete_default_notrap_load_value chunk
+  /\ rs'#rd = Vundef
   /\ forall r, r <> PC -> r <> RTMP -> r <> rd -> rs'#r = rs#r.
 Proof.
   intros until ro; intros ARGS IREGE INSTR TR EV LOAD. 
@@ -1963,7 +1963,7 @@ Lemma transl_load_access2XS_correct_notrap2:
   Mem.loadv chunk m v = None ->
   exists rs',
      exec_straight ge (basics_to_code c) rs m (basics_to_code k) rs' m
-  /\ rs'#rd = concrete_default_notrap_load_value chunk
+  /\ rs'#rd = Vundef
   /\ forall r, r <> PC -> r <> RTMP -> r <> rd -> rs'#r = rs#r.
 Proof.
   intros until ro; intros ARGS IREGE INSTR TR EV LOAD. 
@@ -2008,7 +2008,7 @@ Lemma transl_load_access_correct_notrap2:
   Mem.loadv chunk m v = None ->
   exists rs',
      exec_straight ge (basics_to_code c) rs m (basics_to_code k) rs' m
-  /\ rs'#rd = concrete_default_notrap_load_value chunk
+  /\ rs'#rd = Vundef
   /\ forall r, r <> PC -> r <> RTMP -> r <> rd -> rs'#r = rs#r.
 Proof.
   intros until v; intros INSTR TR EV LOAD. 
@@ -2185,7 +2185,7 @@ Lemma transl_load_correct_notrap2:
   Mem.loadv chunk m a = None ->
   exists rs',
      exec_straight ge (basics_to_code c) rs m (basics_to_code k) rs' m
-  /\ rs'#(preg_of dst) = (concrete_default_notrap_load_value chunk)
+  /\ rs'#(preg_of dst) = Vundef
   /\ forall r, r <> PC -> r <> RTMP -> r <> preg_of dst -> rs'#r = rs#r.
 Proof.
   intros until a; intros TR EV LOAD. destruct addr.
diff --git a/kvx/Asmblockprops.v b/kvx/Asmblockprops.v
index c3929be5..a732d29b 100644
--- a/kvx/Asmblockprops.v
+++ b/kvx/Asmblockprops.v
@@ -96,9 +96,9 @@ Theorem exec_basic_instr_pc:
 Proof.
   intros. destruct b; try destruct i; try destruct i.
   all: try (inv H; Simpl).
-  1-10: unfold parexec_load_offset in H1; destruct (eval_offset ofs); try discriminate; destruct (Mem.loadv _ _ _); unfold parexec_incorrect_load in *; destruct trap; try discriminate; unfold concrete_default_notrap_load_value in *; inv H1; Simpl; fail.
+  1-10: unfold parexec_load_offset in H1; destruct (eval_offset ofs); try discriminate; destruct (Mem.loadv _ _ _); unfold parexec_incorrect_load in *; destruct trap; try discriminate; inv H1; Simpl; fail.
 
-  1-20: unfold parexec_load_reg, parexec_load_regxs in H1; destruct (Mem.loadv _ _ _); unfold parexec_incorrect_load in *; destruct trap; try discriminate; unfold concrete_default_notrap_load_value in *; inv H1; Simpl; fail.
+  1-20: unfold parexec_load_reg, parexec_load_regxs in H1; destruct (Mem.loadv _ _ _); unfold parexec_incorrect_load in *; destruct trap; try discriminate; inv H1; Simpl; fail.
 
   { (* PLoadQRRO *)
     unfold  parexec_load_q_offset in H1.
diff --git a/kvx/Asmvliw.v b/kvx/Asmvliw.v
index aa2e0885..45b230e6 100644
--- a/kvx/Asmvliw.v
+++ b/kvx/Asmvliw.v
@@ -313,7 +313,11 @@ Inductive cf_instruction : Type :=
 .
 
 (** *** Loads *)
-Definition concrete_default_notrap_load_value (chunk : memory_chunk) :=
+
+(* What follows was the original spec, but is subtly incorrect.
+   Our definition of the assembly-level memory model is already an abstraction of the real world.
+   In particular, we consider that a load is incorrect when it points outside of CompCert's visible memory, whereas this memory could be correct at the assembly level.
+   This means that CompCert would believe an incorrect load would yield 0 whereas it would yield another value.
   match chunk with
   | Mint8signed | Mint8unsigned | Mint16signed | Mint16unsigned
   | Mint32 => Vint Int.zero
@@ -321,7 +325,7 @@ Definition concrete_default_notrap_load_value (chunk : memory_chunk) :=
   | Many32 | Many64 => Vundef
   | Mfloat32 => Vsingle Float32.zero
   | Mfloat64 => Vfloat Float.zero
-  end.
+  end. *)
 
 Inductive load_name : Type :=
   | Plb                                             (**r load byte *)
@@ -1169,16 +1173,16 @@ Definition eval_offset (ofs: offset) : res ptrofs := OK ofs.
 
 (** *** load/store instructions *)
 
-Definition parexec_incorrect_load trap chunk d rsw mw :=
+Definition parexec_incorrect_load trap d rsw mw :=
   match trap with
   | TRAP => Stuck
-  | NOTRAP => Next (rsw#d <- (concrete_default_notrap_load_value chunk)) mw
+  | NOTRAP => Next (rsw#d <- Vundef) mw
   end.
 
 Definition parexec_load_offset (trap: trapping_mode) (chunk: memory_chunk) (rsr rsw: regset) (mr mw: mem) (d a: ireg) (ofs: offset) :=
   match (eval_offset ofs) with
   | OK ptr => match Mem.loadv chunk mr (Val.offset_ptr (rsr a) ptr) with
-              | None => parexec_incorrect_load trap chunk d rsw mw
+              | None => parexec_incorrect_load trap d rsw mw
               | Some v => Next (rsw#d <- v) mw
               end
   | _ => Stuck
@@ -1225,13 +1229,13 @@ Definition parexec_load_o_offset (rsr rsw: regset) (mr mw: mem) (d : gpreg_o) (a
 
 Definition parexec_load_reg (trap: trapping_mode) (chunk: memory_chunk) (rsr rsw: regset) (mr mw: mem) (d a ro: ireg) :=
   match Mem.loadv chunk mr (Val.addl (rsr a) (rsr ro)) with
-  | None => parexec_incorrect_load trap chunk d rsw mw
+  | None => parexec_incorrect_load trap d rsw mw
   | Some v => Next (rsw#d <- v) mw
   end.
 
 Definition parexec_load_regxs (trap: trapping_mode) (chunk: memory_chunk) (rsr rsw: regset) (mr mw: mem) (d a ro: ireg) :=
   match Mem.loadv chunk mr (Val.addl (rsr a) (Val.shll (rsr ro) (scale_of_chunk chunk))) with
-  | None => parexec_incorrect_load trap chunk d rsw mw
+  | None => parexec_incorrect_load trap d rsw mw
   | Some v => Next (rsw#d <- v) mw
   end.
 
diff --git a/kvx/Machregsaux.ml b/kvx/Machregsaux.ml
index e3b18181..dbb89727 100644
--- a/kvx/Machregsaux.ml
+++ b/kvx/Machregsaux.ml
@@ -31,3 +31,5 @@ let class_of_type = function
   | AST.Tint | AST.Tlong 
   | AST.Tfloat | AST.Tsingle -> 0
   | AST.Tany32 | AST.Tany64 -> assert false
+
+let nr_regs = [| 59 |]
diff --git a/kvx/Machregsaux.mli b/kvx/Machregsaux.mli
index 01b0f9fd..23ac1c9a 100644
--- a/kvx/Machregsaux.mli
+++ b/kvx/Machregsaux.mli
@@ -15,3 +15,6 @@
 val is_scratch_register: string -> bool
 
 val class_of_type: AST.typ -> int
+
+(* Number of registers in each class *)
+val nr_regs : int array
diff --git a/kvx/PostpassSchedulingOracle.ml b/kvx/PostpassSchedulingOracle.ml
index 2107ce22..5ebad421 100644
--- a/kvx/PostpassSchedulingOracle.ml
+++ b/kvx/PostpassSchedulingOracle.ml
@@ -787,8 +787,14 @@ let latency_constraints bb =
  *)
 
 let build_problem bb = 
-  { max_latency = -1; resource_bounds = resource_bounds;
-    instruction_usages = instruction_usages bb; latency_constraints = latency_constraints bb }
+{ max_latency = -1;
+  resource_bounds = resource_bounds;
+  instruction_usages = instruction_usages bb;
+  latency_constraints = latency_constraints bb;
+  live_regs_entry = Registers.Regset.empty; (* unused here *)
+  typing = (fun x -> AST.Tint);             (* unused here *)
+  reference_counting = None
+}
 
 let rec find_min_opt (l: int option list) =
   match l with
diff --git a/riscV/Machregsaux.ml b/riscV/Machregsaux.ml
index 840943e7..e3e47946 100644
--- a/riscV/Machregsaux.ml
+++ b/riscV/Machregsaux.ml
@@ -18,3 +18,5 @@ let class_of_type = function
   | AST.Tint | AST.Tlong -> 0
   | AST.Tfloat | AST.Tsingle -> 1
   | AST.Tany32 | AST.Tany64 -> assert false
+
+let nr_regs = [| 26; 32|]
diff --git a/riscV/Machregsaux.mli b/riscV/Machregsaux.mli
index 01b0f9fd..bb3777bf 100644
--- a/riscV/Machregsaux.mli
+++ b/riscV/Machregsaux.mli
@@ -15,3 +15,6 @@
 val is_scratch_register: string -> bool
 
 val class_of_type: AST.typ -> int
+  
+(* Number of registers in each class *)
+val nr_regs : int array
diff --git a/riscV/OpWeights.ml b/riscV/OpWeights.ml
index 0a1d9ad4..a5ece6d5 100644
--- a/riscV/OpWeights.ml
+++ b/riscV/OpWeights.ml
@@ -66,7 +66,8 @@ module Rocket = struct
     | OEmayundef _ -> 0
     | _ -> 1
 
-  let resources_of_op (op : operation) (nargs : int) = resource_bounds
+  let resources_of_op (op : operation) (nargs : int) =
+    match op with OEmayundef _ -> [| 0 |] | _ -> resource_bounds
 
   let non_pipelined_resources_of_op (op : operation) (nargs : int) =
     match op with
diff --git a/runtime/Makefile b/runtime/Makefile
index 6f70fa87..ff51b7d7 100644
--- a/runtime/Makefile
+++ b/runtime/Makefile
@@ -1,5 +1,7 @@
 include ../Makefile.config
 
+.PRECIOUS: .s
+
 CFLAGS=-O1 -Wall
 
 ifeq ($(ARCH),x86)
diff --git a/scheduling/BTL.v b/scheduling/BTL.v
index 96377943..f832085c 100644
--- a/scheduling/BTL.v
+++ b/scheduling/BTL.v
@@ -214,7 +214,7 @@ Inductive has_loaded sp rs m chunk addr args v: trapping_mode -> Prop :=
       : has_loaded sp rs m chunk addr args v trap
   | has_loaded_default
       (LOAD: forall a, eval_addressing ge sp addr rs##args = Some a -> Mem.loadv chunk m a = None)
-      (DEFAULT: v = default_notrap_load_value chunk)
+      (DEFAULT: v = Vundef)
       : has_loaded sp rs m chunk addr args v NOTRAP 
   .
 Local Hint Constructors has_loaded: core.
@@ -272,12 +272,10 @@ Fixpoint iblock_istep_run sp ib rs m: option outcome :=
           match Mem.loadv chunk m a with
           | Some v => Some {| _rs := rs#dst <- v; _m:= m; _fin := None |}
           | None =>
-              let v := default_notrap_load_value chunk in
-              Some {| _rs := rs#dst <- v; _m:= m; _fin := None |}
+              Some {| _rs := rs#dst <- Vundef; _m:= m; _fin := None |}
           end
       | None =>
-          let v := default_notrap_load_value chunk in
-          Some {| _rs := rs#dst <- v; _m:= m; _fin := None |}
+          Some {| _rs := rs#dst <- Vundef; _m:= m; _fin := None |}
       end
   | Bstore chunk addr args src _ =>
       SOME a <- eval_addressing ge sp addr rs##args IN
diff --git a/scheduling/BTLRenumber.ml b/scheduling/BTLRenumber.ml
index 6ff42a27..2f4f9203 100644
--- a/scheduling/BTLRenumber.ml
+++ b/scheduling/BTLRenumber.ml
@@ -94,7 +94,7 @@ let regenerate_btl_tree btl entry =
         let ib = renumber_iblock old_ibf.entry in
         let n = get_inumb_or_next ib in
         let n_pos = i2p n in
-        let bi = mk_binfo n in
+        let bi = mk_binfo n old_ibf.binfo.s_output_regs old_ibf.binfo.typing in
         let ibf = { entry = ib; input_regs = old_ibf.input_regs; binfo = bi } in
         if old_n = entry then new_entry := n_pos;
         dm := PTree.set old_n n_pos !dm;
diff --git a/scheduling/BTLScheduleraux.ml b/scheduling/BTLScheduleraux.ml
index 00be2aa7..75672243 100644
--- a/scheduling/BTLScheduleraux.ml
+++ b/scheduling/BTLScheduleraux.ml
@@ -4,7 +4,9 @@ open BTLtypes
 open Machine
 open DebugPrint
 open PrintBTL
+open Registers
 open RTLcommonaux
+open BTLcommonaux
 open ExpansionOracle
 open PrepassSchedulingOracle
 
@@ -88,11 +90,179 @@ let apply_schedule bseq olast positions =
   in
   build_iblock ibl
 
+(** the useful one. Returns a hashtable with bindings of shape
+ ** [(r,(t, n)], where [r] is a pseudo-register (Registers.reg),
+ ** [t] is its class (according to [typing]), and [n] the number of
+ ** times it's referenced as an argument in instructions of [seqa] ;
+ ** and an arrray containg the list of regs referenced by each
+ ** instruction, with a boolean to know whether it's as arg or dest *)
+let reference_counting (seqa : (iblock * Regset.t) array) (out_regs : Regset.t)
+    (typing : RTLtyping.regenv) :
+    (reg, int * int) Hashtbl.t * (reg * bool) list array =
+  let retl = Hashtbl.create 42 in
+  let retr = Array.make (Array.length seqa) [] in
+  (* retr.(i) : (r, b) -> (r', b') -> ...
+   * where b = true if seen as arg, false if seen as dest
+   *)
+  List.iter
+    (fun reg ->
+      Hashtbl.add retl reg (Machregsaux.class_of_type (typing reg), 1))
+    (Regset.elements out_regs);
+  let add_reg reg =
+    match Hashtbl.find_opt retl reg with
+    | Some (t, n) -> Hashtbl.add retl reg (t, n + 1)
+    | None -> Hashtbl.add retl reg (Machregsaux.class_of_type (typing reg), 1)
+  in
+  let map_true = List.map (fun r -> (r, true)) in
+  Array.iteri
+    (fun i (ins, _) ->
+      match ins with
+      | Bop (_, args, dest, _) | Bload (_, _, _, args, dest, _) ->
+          List.iter add_reg args;
+          retr.(i) <- (dest, false) :: map_true args
+      | Bcond (_, args, _, _, _) ->
+          List.iter add_reg args;
+          retr.(i) <- map_true args
+      | Bstore (_, _, args, src, _) ->
+          List.iter add_reg args;
+          add_reg src;
+          retr.(i) <- (src, true) :: map_true args
+      | BF (Bcall (_, fn, args, dest, _), _) ->
+          List.iter add_reg args;
+          retr.(i) <-
+            (match fn with
+            | Datatypes.Coq_inl reg ->
+                add_reg reg;
+                (dest, false) :: (reg, true) :: map_true args
+            | _ -> (dest, false) :: map_true args)
+      | BF (Btailcall (_, fn, args), _) ->
+          List.iter add_reg args;
+          retr.(i) <-
+            (match fn with
+            | Datatypes.Coq_inl reg ->
+                add_reg reg;
+                (reg, true) :: map_true args
+            | _ -> map_true args)
+      | BF (Bbuiltin (_, args, dest, _), _) ->
+          let rec bar = function
+            | AST.BA r ->
+                add_reg r;
+                retr.(i) <- (r, true) :: retr.(i)
+            | AST.BA_splitlong (hi, lo) | AST.BA_addptr (hi, lo) ->
+                bar hi;
+                bar lo
+            | _ -> ()
+          in
+          List.iter bar args;
+          let rec bad = function
+            | AST.BR r -> retr.(i) <- (r, false) :: retr.(i)
+            | AST.BR_splitlong (hi, lo) ->
+                bad hi;
+                bad lo
+            | _ -> ()
+          in
+          bad dest
+      | BF (Bjumptable (reg, _), _) | BF (Breturn (Some reg), _) ->
+          add_reg reg;
+          retr.(i) <- [ (reg, true) ]
+      | _ -> ())
+    seqa;
+  (* print_string "mentions\n";
+   * Array.iteri (fun i l ->
+   *     print_int i;
+   *     print_string ": [";
+   *     List.iter (fun (r, b) ->
+   *         print_int (Camlcoq.P.to_int r);
+   *         print_string ":";
+   *         print_string (if b then "a:" else "d");
+   *         if b then print_int (snd (Hashtbl.find retl r));
+   *         print_string ", "
+   *       ) l;
+   *     print_string "]\n";
+   *     flush stdout;
+   *   ) retr; *)
+  (retl, retr)
+
+let get_live_regs_entry seqa ibf btl =
+  if !Clflags.option_debug_compcert > 6 then debug_flag := true;
+  let ret =
+    Array.fold_right
+      (fun (ins, liveins) regset_i ->
+        let regset = Regset.union liveins regset_i in
+        match ins with
+        | Bnop _ -> regset
+        | Bop (_, args, dest, _) | Bload (_, _, _, args, dest, _) ->
+            List.fold_left
+              (fun set reg -> Regset.add reg set)
+              (Regset.remove dest regset)
+              args
+        | Bstore (_, _, args, src, _) ->
+            List.fold_left
+              (fun set reg -> Regset.add reg set)
+              (Regset.add src regset) args
+        | BF (Bcall (_, fn, args, dest, _), _) ->
+            List.fold_left
+              (fun set reg -> Regset.add reg set)
+              ((match fn with
+               | Datatypes.Coq_inl reg -> Regset.add reg
+               | Datatypes.Coq_inr _ -> fun x -> x)
+                 (Regset.remove dest regset))
+              args
+        | BF (Btailcall (_, fn, args), _) ->
+            List.fold_left
+              (fun set reg -> Regset.add reg set)
+              (match fn with
+              | Datatypes.Coq_inl reg -> Regset.add reg regset
+              | Datatypes.Coq_inr _ -> regset)
+              args
+        | BF (Bbuiltin (_, args, dest, _), _) ->
+            List.fold_left
+              (fun set arg ->
+                let rec add reg set =
+                  match reg with
+                  | AST.BA r -> Regset.add r set
+                  | AST.BA_splitlong (hi, lo) | AST.BA_addptr (hi, lo) ->
+                      add hi (add lo set)
+                  | _ -> set
+                in
+                add arg set)
+              (let rec rem dest set =
+                 match dest with
+                 | AST.BR r -> Regset.remove r set
+                 | AST.BR_splitlong (hi, lo) -> rem hi (rem lo set)
+                 | _ -> set
+               in
+               rem dest regset)
+              args
+        | BF (Bjumptable (reg, _), _) | BF (Breturn (Some reg), _) ->
+            Regset.add reg regset
+        | Bcond (_, args, _, _, _) ->
+            List.fold_left (fun set reg -> Regset.add reg set) regset args
+        | _ -> regset)
+      seqa ibf.binfo.s_output_regs
+  in
+  debug "live in regs: ";
+  print_regset ret;
+  debug "\n";
+  debug_flag := false;
+  ret
+
 let schedule_blk n ibf btl =
   if not !Clflags.option_fprepass then btl
   else
     let bseq, olast = flatten_blk_basics ibf in
-    match schedule_sequence bseq btl with
+    let seqa =
+      Array.mapi
+        (fun i inst -> (inst, get_liveins inst))
+        bseq
+    in
+    let live_regs_entry = get_live_regs_entry seqa ibf btl in
+    let refcount =
+      reference_counting seqa ibf.binfo.s_output_regs ibf.binfo.typing
+    in
+    match
+      schedule_sequence seqa btl live_regs_entry ibf.binfo.typing refcount
+    with
     | Some positions ->
         let new_ib = apply_schedule bseq olast positions in
         let new_ibf =
@@ -116,29 +286,52 @@ let turn_all_loads_nontrap n ibf btl =
     in
     PTree.set n ibf' btl
 
+let compute_liveins n ibf btl =
+  let rec traverse_rec ib =
+    match ib with
+    | Bseq (ib1, ib2) ->
+        traverse_rec ib1;
+        traverse_rec ib2
+    | BF (Bgoto succ, iinfo)
+    | BF (Bcall (_, _, _, _, succ), iinfo)
+    | BF (Bbuiltin (_, _, _, succ), iinfo) ->
+        let lives = (get_some @@ PTree.get succ btl).input_regs in
+        iinfo.liveins <- lives
+    | BF (Bjumptable (_, tbl), iinfo) ->
+        List.iter
+          (fun ex ->
+            let lives = (get_some @@ PTree.get ex btl).input_regs in
+            iinfo.liveins <- Regset.union iinfo.liveins lives)
+          tbl
+    | Bcond (_, _, BF (Bgoto succ, _), Bnop None, iinfo) -> (
+        match iinfo.pcond with
+        | Some predb ->
+            assert (predb = false);
+            let lives = (get_some @@ PTree.get succ btl).input_regs in
+            iinfo.liveins <- lives
+        | None -> ())
+    | _ -> ()
+  in
+  traverse_rec ibf.entry
+
 let rec do_schedule btl = function
   | [] -> btl
   | (n, ibf) :: blks ->
+      compute_liveins n ibf btl;
       let code_exp = expanse n ibf btl in
       let ibf_exp = get_some @@ PTree.get n code_exp in
       let code_nt = turn_all_loads_nontrap n ibf_exp code_exp in
       let ibf_nt = get_some @@ PTree.get n code_nt in
       let btl' = schedule_blk n ibf_nt code_nt in
-      begin
-        (*debug_flag := true;*)
-        print_btl_code stderr code_nt;
-        print_btl_code stderr btl';
-        (*debug_flag := false;*)
-        do_schedule btl' blks
-      end
+      (*debug_flag := true;*)
+      print_btl_code stderr btl;
+      print_btl_code stderr btl';
+      (*debug_flag := false;*)
+      do_schedule btl' blks
 
 let btl_scheduler f =
   let btl = f.fn_code in
-  (*debug_flag := true;*)
   let elts = PTree.elements btl in
   find_last_reg elts;
   let btl' = do_schedule btl elts in
-  debug "Scheduled BTL Code:\n";
-  print_btl_code stderr btl';
-  (*debug_flag := false;*)
   btl'
diff --git a/scheduling/BTL_SEtheory.v b/scheduling/BTL_SEtheory.v
index e2e6be6b..033ed843 100644
--- a/scheduling/BTL_SEtheory.v
+++ b/scheduling/BTL_SEtheory.v
@@ -93,11 +93,11 @@ Fixpoint eval_sval ctx (sv: sval): option val :=
       | NOTRAP =>
           SOME args <- eval_list_sval ctx lsv IN
           match (eval_addressing (cge ctx) (csp ctx) addr args) with
-          | None => Some (default_notrap_load_value chunk)
+          | None => Some Vundef
           | Some a =>
               SOME m <- eval_smem ctx sm IN
               match (Mem.loadv chunk m a) with
-              | None => Some (default_notrap_load_value chunk)
+              | None => Some Vundef
               | Some val => Some val
               end
           end
diff --git a/scheduling/BTLcommonaux.ml b/scheduling/BTLcommonaux.ml
index 4605d613..577e4828 100644
--- a/scheduling/BTLcommonaux.ml
+++ b/scheduling/BTLcommonaux.ml
@@ -1,15 +1,22 @@
 open Maps
 open BTL
+open Registers
 open BTLtypes
 open RTLcommonaux
 
 let undef_node = -1
 
-let mk_iinfo _inumb _pcond = { inumb = _inumb; pcond = _pcond; visited = false }
+let mk_iinfo _inumb _pcond = { inumb = _inumb; pcond = _pcond; visited = false; liveins = Regset.empty }
 
-let def_iinfo () = { inumb = undef_node; pcond = None; visited = false }
+let def_iinfo () = { inumb = undef_node; pcond = None; visited = false; liveins = Regset.empty }
 
-let mk_binfo _bnumb = { bnumb = _bnumb; visited = false }
+let mk_binfo _bnumb _s_output_regs _typing =
+  {
+    bnumb = _bnumb;
+    visited = false;
+    s_output_regs = _s_output_regs;
+    typing = _typing;
+  }
 
 let reset_visited_ibf btl =
   PTree.map
@@ -68,3 +75,13 @@ let rec get_inumb_or_next = function
       iinfo.inumb
   | Bseq (ib1, _) -> get_inumb_or_next ib1
   | _ -> failwith "get_inumb_or_next: Bnop None"
+
+let get_liveins = function
+  | BF (_, iinfo)
+  | Bnop (Some iinfo)
+  | Bop (_, _, _, iinfo)
+  | Bload (_, _, _, _, _, iinfo)
+  | Bstore (_, _, _, _, iinfo)
+  | Bcond (_, _, _, _, iinfo) ->
+      iinfo.liveins
+  | _ -> failwith "get_liveins: invalid iblock"
diff --git a/scheduling/BTLtypes.ml b/scheduling/BTLtypes.ml
index 3972fd6b..12ca30e8 100644
--- a/scheduling/BTLtypes.ml
+++ b/scheduling/BTLtypes.ml
@@ -1,7 +1,15 @@
+open Registers
+
 type inst_info = {
   mutable inumb : int;
   mutable pcond : bool option;
   mutable visited : bool;
+  mutable liveins: Regset.t;
 }
 
-type block_info = { mutable bnumb : int; mutable visited : bool }
+type block_info = {
+  mutable bnumb : int;
+  mutable visited : bool;
+  s_output_regs : Regset.t;
+  typing : RTLtyping.regenv;
+}
diff --git a/scheduling/InstructionScheduler.ml b/scheduling/InstructionScheduler.ml
index eab0b21a..e3a421a5 100644
--- a/scheduling/InstructionScheduler.ml
+++ b/scheduling/InstructionScheduler.ml
@@ -33,6 +33,10 @@ type latency_constraint = {
 type problem = {
   max_latency : int;
   resource_bounds : int array;
+  live_regs_entry : Registers.Regset.t;
+  typing : RTLtyping.regenv;
+  reference_counting : ((Registers.reg, int * int) Hashtbl.t
+                       * ((Registers.reg * bool) list array)) option;
   instruction_usages : int array array;
   latency_constraints : latency_constraint list;
   };;
@@ -118,6 +122,13 @@ let vector_less_equal a b =
     true
   with Exit -> false;;
 
+(* let vector_add a b =
+ *   assert ((Array.length a) = (Array.length b));
+ *   for i=0 to (Array.length a)-1
+ *   do
+ *     b.(i) <- b.(i) + a.(i)
+ *   done;; *)
+
 let vector_subtract a b =
   assert ((Array.length a) = (Array.length b));
   for i=0 to (Array.length a)-1
@@ -257,8 +268,8 @@ let priority_list_scheduler (order : list_scheduler_order)
       assert(!time >= 0);
       !time
     with Exit -> -1
-               
   in
+  
   let advance_time() =
     begin
       (if !current_time < max_time-1
@@ -267,7 +278,8 @@ let priority_list_scheduler (order : list_scheduler_order)
           Array.blit problem.resource_bounds 0 current_resources 0
             (Array.length current_resources);
           ready.(!current_time + 1) <-
-            InstrSet.union (ready.(!current_time)) (ready.(!current_time + 1));
+            InstrSet.union (ready.(!current_time))
+              (ready.(!current_time + 1));
           ready.(!current_time) <- InstrSet.empty;
         end);
       incr current_time
@@ -334,6 +346,470 @@ let list_scheduler = priority_list_scheduler CRITICAL_PATH_ORDER;;
 (* dummy code for placating ocaml's warnings *)
 let _ = fun x -> priority_list_scheduler INSTRUCTION_ORDER x;;
 
+
+(* A scheduler sensitive to register pressure *)
+let reg_pres_scheduler (problem : problem) : solution option =
+  (* DebugPrint.debug_flag := true; *)
+  let nr_instructions = get_nr_instructions problem in
+
+  if !Clflags.option_debug_compcert > 6 then
+    (Printf.eprintf "\nSCHEDULING_SUPERBLOCK %d\n" nr_instructions;
+     flush stderr);
+  
+  let successors = get_successors problem
+  and predecessors = get_predecessors problem
+  and times = Array.make (nr_instructions+1) (-1) in
+  let live_regs_entry = problem.live_regs_entry in
+
+  let available_regs = Array.copy Machregsaux.nr_regs in
+
+  let nr_types_regs = Array.length available_regs in
+
+  let thres = Array.fold_left (min)
+                (max !(Clflags.option_regpres_threshold) 0)
+                Machregsaux.nr_regs
+  in
+  
+
+  let regs_thresholds = Array.make nr_types_regs thres in
+  (* placeholder value *)
+  
+  let class_r r =
+    Machregsaux.class_of_type (problem.typing r) in
+  
+  let live_regs = Hashtbl.create 42 in
+
+  List.iter (fun r -> let classe = Machregsaux.class_of_type
+                                  (problem.typing r) in
+                   available_regs.(classe)
+                   <- available_regs.(classe) - 1;
+                   Hashtbl.add live_regs r classe)
+    (Registers.Regset.elements live_regs_entry);
+
+  let csr_b = ref false in
+
+  let counts, mentions =
+    match problem.reference_counting with
+    | Some (l, r) -> l, r
+    | None -> assert false
+  in
+  
+  let fold_delta i  = (fun a (r, b) ->
+    a +
+      if class_r r <> i then 0 else
+        (if b then
+           if (Hashtbl.find counts r = (i, 1))
+           then 1 else 0
+         else
+           match Hashtbl.find_opt live_regs r with
+           | None -> -1
+           | Some t -> 0
+  )) in
+  
+  let priorities = critical_paths successors in
+  
+  let current_resources = Array.copy problem.resource_bounds in
+
+  let module InstrSet =
+    struct 
+      module MSet = 
+        Set.Make (struct
+            type t=int
+            let compare x y =
+              match priorities.(y) - priorities.(x) with
+              | 0 -> x - y
+              | z -> z
+          end)
+
+      let empty = MSet.empty
+      let is_empty = MSet.is_empty
+      let add = MSet.add
+      let remove = MSet.remove
+      let union = MSet.union
+      let iter = MSet.iter
+
+      let compare_regs i x y =
+        let pyi = List.fold_left (fold_delta i) 0 mentions.(y) in
+        (* print_int y;
+         * print_string " ";
+         * print_int pyi;
+         * print_newline ();
+         * flush stdout; *)
+        let pxi = List.fold_left (fold_delta i) 0 mentions.(x) in
+        match pyi - pxi with
+        | 0 -> (match priorities.(y) - priorities.(x) with
+               | 0 -> x - y
+               | z -> z)
+        | z -> z
+
+      (** t is the register class *)
+      let sched_CSR t ready usages =
+        (* print_string "looking for max delta";
+         * print_newline ();
+         * flush stdout; *)
+        let result = ref (-1) in
+        iter (fun i ->
+            if vector_less_equal usages.(i) current_resources
+            then if !result = -1 || (compare_regs t !result i > 0)
+                 then result := i) ready;
+        !result
+    end
+  in
+
+  let max_time = bound_max_time problem + 5*nr_instructions in
+  let ready = Array.make max_time InstrSet.empty in
+
+  Array.iteri (fun i preds ->
+      if i < nr_instructions && preds = []
+      then ready.(0) <- InstrSet.add i ready.(0)) predecessors;
+
+  let current_time = ref 0
+  and earliest_time i =
+    try
+      let time = ref (-1) in
+      List.iter (fun (j, latency) ->
+          if times.(j) < 0
+          then raise Exit
+          else let t = times.(j) + latency in
+               if t > !time
+               then time := t) predecessors.(i);
+      assert (!time >= 0);
+      !time
+    with Exit -> -1
+  in
+
+  let advance_time () =
+    (if !current_time < max_time-1
+     then (
+       Array.blit problem.resource_bounds 0 current_resources 0
+         (Array.length current_resources);
+       ready.(!current_time + 1) <-
+         InstrSet.union (ready.(!current_time))
+           (ready.(!current_time +1));
+       ready.(!current_time) <- InstrSet.empty));
+    incr current_time
+  in
+
+  (* ALL MENTIONS TO cnt ARE PLACEHOLDERS *)
+  let cnt = ref 0 in
+
+  let attempt_scheduling ready usages =
+    let result = ref (-1) in
+    DebugPrint.debug "\n\nREADY: ";
+    InstrSet.iter (fun i -> DebugPrint.debug "%d " i) ready;
+    DebugPrint.debug "\n\n";
+    try
+      Array.iteri (fun i avlregs ->
+          DebugPrint.debug "avlregs: %d %d\nlive regs: %d\n"
+            i avlregs (Hashtbl.length live_regs); 
+          if !cnt < 5 && avlregs <= regs_thresholds.(i)
+          then (
+            csr_b := true;
+            let maybe = InstrSet.sched_CSR i ready usages in
+            DebugPrint.debug "maybe %d\n" maybe;
+            (if maybe >= 0 &&
+                  let delta = 
+                    List.fold_left (fold_delta i) 0 mentions.(maybe) in
+                  DebugPrint.debug "delta %d\n" delta;
+                  delta > 0
+             then
+               (vector_subtract usages.(maybe) current_resources;
+                result := maybe)
+             else
+               if not !Clflags.option_regpres_wait_window
+               then
+                 (InstrSet.iter (fun ins ->
+                      if vector_less_equal usages.(ins) current_resources &&
+                           List.fold_left (fold_delta i) 0 mentions.(maybe) >= 0
+                      then result := ins
+                    ) ready;
+                  if !result <> -1 then
+                    vector_subtract usages.(!result) current_resources;
+                  incr cnt)
+               else
+                 (incr cnt)
+            );
+            raise Exit)) available_regs;
+      InstrSet.iter (fun i ->
+          if vector_less_equal usages.(i) current_resources
+          then (
+            vector_subtract usages.(i) current_resources;
+            result := i;
+            raise Exit)) ready;
+      -1
+    with Exit ->
+      !result in
+  
+  while !current_time < max_time
+  do
+    if (InstrSet.is_empty ready.(!current_time))
+    then advance_time ()
+    else
+      match attempt_scheduling ready.(!current_time)
+              problem.instruction_usages with
+      | -1 -> advance_time()
+      | i -> (assert(times.(i) < 0);
+             (DebugPrint.debug "INSTR ISSUED: %d\n" i;
+              if !csr_b && !Clflags.option_debug_compcert > 6 then
+                (Printf.eprintf "REGPRES: high pres class %d\n" i;
+                 flush stderr);
+              csr_b := false;
+              (* if !Clflags.option_regpres_wait_window then *)
+              cnt := 0;
+              List.iter (fun (r,b) ->
+                  if b then
+                    (match Hashtbl.find_opt counts r with
+                     | None -> assert false
+                     | Some (t, n) ->
+                        Hashtbl.remove counts r;
+                        if n = 1 then
+                          (Hashtbl.remove live_regs r;
+                           available_regs.(t)
+                           <- available_regs.(t) + 1))
+                  else
+                    let t = class_r r in
+                    match Hashtbl.find_opt live_regs r with
+                    | None -> (Hashtbl.add live_regs r t;
+                              available_regs.(t)
+                              <- available_regs.(t) - 1)
+                    | Some i -> ()
+                ) mentions.(i));
+             times.(i) <- !current_time;
+             ready.(!current_time)
+             <- InstrSet.remove i (ready.(!current_time));
+             List.iter (fun (instr_to, latency) ->
+                 if instr_to < nr_instructions then
+                   match earliest_time instr_to with
+                   | -1 -> ()
+                   | to_time ->
+                      ((* DebugPrint.debug "TO TIME %d : %d\n" to_time
+                        *   (Array.length ready); *)
+                       ready.(to_time)
+                       <- InstrSet.add instr_to ready.(to_time))
+               ) successors.(i);
+             successors.(i) <- []
+            )
+  done;
+
+  try
+    let final_time = ref (-1) in
+    for i = 0 to nr_instructions - 1 do
+      DebugPrint.debug "%d " i;
+      (if times.(i) < 0 then raise Exit);
+      (if !final_time < times.(i) + 1 then final_time := times.(i) + 1)
+    done;
+    List.iter (fun (i, latency) ->
+        let target_time = latency + times.(i) in
+        if target_time > !final_time then
+          final_time := target_time) predecessors.(nr_instructions);
+    times.(nr_instructions) <- !final_time;
+    (* DebugPrint.debug_flag := false; *)
+    Some times
+  with Exit ->
+    DebugPrint.debug "reg_pres_sched failed\n";
+    (* DebugPrint.debug_flag := false; *)
+    None
+              
+;;
+
+
+(********************************************************************)
+
+let reg_pres_scheduler_bis (problem : problem) : solution option =
+  (* Printf.printf "\nNEW\n\n"; *)
+  let nr_instructions = get_nr_instructions problem in
+  let successors = get_successors problem
+  and predecessors = get_predecessors problem
+  and times = Array.make (nr_instructions+1) (-1) in
+  let live_regs_entry = problem.live_regs_entry in
+
+  (* let available_regs = Array.copy Machregsaux.nr_regs in *)
+
+  let class_r r =
+    Machregsaux.class_of_type (problem.typing r) in
+  
+  let live_regs = Hashtbl.create 42 in
+
+  List.iter (fun r -> let classe = Machregsaux.class_of_type
+                                  (problem.typing r) in
+                   (* available_regs.(classe)
+                    * <- available_regs.(classe) - 1; *)
+                   Hashtbl.add live_regs r classe)
+    (Registers.Regset.elements live_regs_entry);
+
+
+  let counts, mentions =
+    match problem.reference_counting with
+    | Some (l, r) -> l, r
+    | None -> assert false
+  in
+  
+  let fold_delta a (r, b) =
+    a + (if b then
+           match Hashtbl.find_opt counts r with
+           | Some (_, 1) -> 1
+           | _ -> 0
+         else
+           match Hashtbl.find_opt live_regs r with
+           | None -> -1
+           | Some t -> 0
+        ) in
+  
+  let priorities = critical_paths successors in
+  
+  let current_resources = Array.copy problem.resource_bounds in
+
+  let compare_pres x y =
+    let pdy = List.fold_left (fold_delta) 0 mentions.(y) in
+    let pdx = List.fold_left (fold_delta) 0 mentions.(x) in
+    match pdy - pdx with
+    | 0 -> x - y
+    | z -> z
+  in
+  
+  let module InstrSet =
+    Set.Make (struct
+        type t = int
+        let compare x y =
+          match priorities.(y) - priorities.(x) with
+          | 0 -> x - y
+          | z -> z
+      end) in
+
+  let max_time = bound_max_time problem (* + 5*nr_instructions *) in
+  let ready = Array.make max_time InstrSet.empty in
+
+  Array.iteri (fun i preds ->
+      if i < nr_instructions && preds = []
+      then ready.(0) <- InstrSet.add i ready.(0)) predecessors;
+
+  let current_time = ref 0
+  and earliest_time i =
+    try
+      let time = ref (-1) in
+      List.iter (fun (j, latency) ->
+          if times.(j) < 0
+          then raise Exit
+          else let t = times.(j) + latency in
+               if t > !time
+               then time := t) predecessors.(i);
+      assert (!time >= 0);
+      !time
+    with Exit -> -1
+  in
+
+  let advance_time () =
+    (* Printf.printf "ADV\n"; 
+     * flush stdout; *)
+    (if !current_time < max_time-1
+     then (
+       Array.blit problem.resource_bounds 0 current_resources 0
+         (Array.length current_resources);
+       ready.(!current_time + 1) <-
+         InstrSet.union (ready.(!current_time))
+           (ready.(!current_time +1));
+       ready.(!current_time) <- InstrSet.empty));
+    incr current_time
+  in
+
+  
+  let attempt_scheduling ready usages =
+    let result = ref [] in
+    try
+      InstrSet.iter (fun i ->
+          if vector_less_equal usages.(i) current_resources
+          then
+            if !result = [] || priorities.(i) = priorities.(List.hd (!result))
+            then
+              result := i::(!result)
+            else raise Exit
+        ) ready;
+      if !result <> [] then raise Exit;
+      -1
+    with
+      Exit ->
+      let mini =  List.fold_left (fun a b ->
+                      if a = -1 || compare_pres a b > 0
+                      then b else a
+                    ) (-1) !result in
+      vector_subtract usages.(mini) current_resources;
+      mini
+  in
+  
+  while !current_time < max_time
+  do
+    if (InstrSet.is_empty ready.(!current_time))
+    then advance_time ()
+    else
+      match attempt_scheduling ready.(!current_time)
+              problem.instruction_usages with
+      | -1 -> advance_time()
+      | i -> (
+        DebugPrint.debug "ISSUED: %d\nREADY: " i;
+        InstrSet.iter (fun i -> DebugPrint.debug "%d " i)
+          ready.(!current_time);
+        DebugPrint.debug "\nSUCC: ";
+        List.iter (fun (i, l) -> DebugPrint.debug "%d " i)
+          successors.(i);
+        DebugPrint.debug "\n\n";
+        assert(times.(i) < 0);
+        times.(i) <- !current_time;
+        ready.(!current_time)
+        <- InstrSet.remove i (ready.(!current_time));
+        (List.iter (fun (r,b) ->
+             if b then
+               (match Hashtbl.find_opt counts r with
+                | None -> assert false
+                | Some (t, n) ->
+                   Hashtbl.remove counts r;
+                   if n = 1 then
+                     (Hashtbl.remove live_regs r;
+               (* available_regs.(t)
+                * <- available_regs.(t) + 1 *)))
+             else
+               let t = class_r r in
+               match Hashtbl.find_opt live_regs r with
+               | None -> (Hashtbl.add live_regs r t;
+                        (* available_regs.(t)
+                         * <- available_regs.(t) - 1 *))
+               | Some i -> ()
+           ) mentions.(i));
+        List.iter (fun (instr_to, latency) ->
+            if instr_to < nr_instructions then
+              match earliest_time instr_to with
+              | -1 -> ()
+              | to_time ->
+                 ((* DebugPrint.debug "TO TIME %d : %d\n" to_time
+                   *   (Array.length ready); *)
+                  ready.(to_time)
+                  <- InstrSet.add instr_to ready.(to_time))
+          ) successors.(i);
+        successors.(i) <- []
+      )
+  done;
+
+  try
+    let final_time = ref (-1) in
+    for i = 0 to nr_instructions - 1 do
+      (* print_int i;
+       * flush stdout; *)
+      (if times.(i) < 0 then raise Exit);
+      (if !final_time < times.(i) + 1 then final_time := times.(i) + 1)
+    done;
+    List.iter (fun (i, latency) ->
+        let target_time = latency + times.(i) in
+        if target_time > !final_time then
+          final_time := target_time) predecessors.(nr_instructions);
+    times.(nr_instructions) <- !final_time;
+    Some times
+  with Exit ->
+    DebugPrint.debug "reg_pres_sched failed\n";
+    None
+              
+;;
+
+(********************************************************************)
+
 type bundle = int list;;
 
 let rec extract_deps_to index = function
@@ -438,6 +914,12 @@ let reverse_problem problem =
   {
     max_latency = problem.max_latency;
     resource_bounds = problem.resource_bounds;
+    live_regs_entry = Registers.Regset.empty; (* PLACEHOLDER *)
+    (* Not needed for the revlist sched, and for now we wont bother
+       with creating a reverse scheduler aware of reg press *)
+    
+    typing = problem.typing;
+    reference_counting = problem.reference_counting;
     instruction_usages = Array.init (nr_instructions + 1)
       (fun i ->
         if i=0
@@ -1259,5 +1741,7 @@ let scheduler_by_name name =
   | "ilp" -> validated_scheduler cascaded_scheduler
   | "list" -> validated_scheduler list_scheduler
   | "revlist" -> validated_scheduler reverse_list_scheduler
+  | "regpres" -> validated_scheduler reg_pres_scheduler
+  | "regpres_bis" -> validated_scheduler reg_pres_scheduler_bis
   | "greedy" -> greedy_scheduler
   | s -> failwith ("unknown scheduler: " ^ s);;
diff --git a/scheduling/InstructionScheduler.mli b/scheduling/InstructionScheduler.mli
index fb7af3f6..48c7bc09 100644
--- a/scheduling/InstructionScheduler.mli
+++ b/scheduling/InstructionScheduler.mli
@@ -23,6 +23,16 @@ type problem = {
     resource_bounds : int array;
     (** An array of number of units available indexed by the kind of resources to be allocated. It can be empty, in which case the problem is scheduling without resource constraints. *)
 
+    live_regs_entry : Registers.Regset.t;
+    (** The set of live pseudo-registers at entry. *)
+    
+    typing : RTLtyping.regenv;
+    (** Register type map. *)
+    
+    reference_counting : ((Registers.reg, int * int) Hashtbl.t
+                       * ((Registers.reg * bool) list array)) option;
+    (** See RTLpathScheduleraux.reference_counting. *)
+    
     instruction_usages: int array array;
     (** At index {i i} the vector of resources used by instruction number {i i}. It must be the same length as [resource_bounds] *)
 
@@ -68,6 +78,12 @@ Once a clock tick is full go to the next.
 @return [Some solution] when a solution is found, [None] if not. *)
 val list_scheduler : problem -> solution option
 
+(** WIP : Same as list_scheduler, but schedules instructions which decrease
+register pressure when it gets too high. *)
+val reg_pres_scheduler : problem -> solution option
+
+val reg_pres_scheduler_bis : problem -> solution option
+  
 (** Schedule the problem using the order of instructions without any reordering *)
 val greedy_scheduler : problem -> solution option
 
diff --git a/scheduling/RTLpath.v b/scheduling/RTLpath.v
index a4fce97e..b29a7759 100644
--- a/scheduling/RTLpath.v
+++ b/scheduling/RTLpath.v
@@ -156,7 +156,7 @@ Definition istep (ge: RTL.genv) (i: instruction) (sp: val) (rs: regset) (m: mem)
       SOME v <- Mem.loadv chunk m a IN
       Some (mk_istate true pc' (rs#dst <- v) m)
   | Iload NOTRAP chunk addr args dst pc' =>
-      let default_state := mk_istate true pc' rs#dst <- (default_notrap_load_value chunk) m in
+      let default_state := mk_istate true pc' rs#dst <- Vundef m in
       match (eval_addressing ge sp addr rs##args) with
       | None => Some default_state
       | Some a => match (Mem.loadv chunk m a) with
diff --git a/scheduling/RTLpathSE_theory.v b/scheduling/RTLpathSE_theory.v
index aa8db342..2a791feb 100644
--- a/scheduling/RTLpathSE_theory.v
+++ b/scheduling/RTLpathSE_theory.v
@@ -87,11 +87,11 @@ Fixpoint seval_sval (ge: RTL.genv) (sp:val) (sv: sval) (rs0: regset) (m0: mem):
       | NOTRAP =>
           SOME args <- seval_list_sval ge sp lsv rs0 m0 IN
           match (eval_addressing ge sp addr args) with
-          | None => Some (default_notrap_load_value chunk)
+          | None => Some Vundef
           | Some a =>
               SOME m <- seval_smem ge sp sm rs0 m0 IN
               match (Mem.loadv chunk m a) with
-              | None => Some (default_notrap_load_value chunk)
+              | None => Some Vundef
               | Some val => Some val
               end
           end
diff --git a/scheduling/RTLpathScheduleraux.ml b/scheduling/RTLpathScheduleraux.ml
index 3db25d82..659a8ba7 100644
--- a/scheduling/RTLpathScheduleraux.ml
+++ b/scheduling/RTLpathScheduleraux.ml
@@ -18,7 +18,7 @@ let print_superblock (sb: superblock) code =
   begin
     debug "{ instructions = "; print_instructions (Array.to_list insts) code; debug "\n";
     debug "  liveins = "; print_ptree_regset li; debug "\n";
-    debug "  output_regs = "; print_regset outs; debug "}"
+    debug "  output_regs = "; print_regset outs; debug "\n}"
   end
 
 let print_superblocks lsb code =
@@ -73,6 +73,168 @@ let get_superblocks code entry pm typing =
     lsb
 end
 
+(** the useful one. Returns a hashtable with bindings of shape
+ ** [(r,(t, n)], where [r] is a pseudo-register (Registers.reg),
+ ** [t] is its class (according to [typing]), and [n] the number of
+ ** times it's referenced as an argument in instructions of [seqa] ;
+ ** and an arrray containg the list of regs referenced by each
+ ** instruction, with a boolean to know whether it's as arg or dest *)
+let reference_counting (seqa : (instruction * Regset.t) array)
+      (out_regs : Registers.Regset.t) (typing : RTLtyping.regenv) :
+      (Registers.reg, int * int) Hashtbl.t *
+        (Registers.reg * bool) list  array =
+  let retl = Hashtbl.create 42 in
+  let retr = Array.make (Array.length seqa) [] in
+  (* retr.(i) : (r, b) -> (r', b') -> ...
+   * where b = true if seen as arg, false if seen as dest
+   *)
+  List.iter (fun reg ->
+      Hashtbl.add retl
+        reg (Machregsaux.class_of_type (typing reg), 1)
+    ) (Registers.Regset.elements out_regs);
+  let add_reg reg = 
+    match Hashtbl.find_opt retl reg with
+    | Some (t, n) -> Hashtbl.add retl reg (t, n+1)
+    | None -> Hashtbl.add retl reg (Machregsaux.class_of_type
+                                    (typing reg), 1)
+  in
+  let map_true = List.map (fun r -> r, true) in
+  Array.iteri (fun i  (ins, _) ->
+      match ins with
+      | Iop(_,args,dest,_) | Iload(_,_,_,args,dest,_) ->
+         List.iter (add_reg) args;
+         retr.(i) <- (dest, false)::(map_true args)
+      | Icond(_,args,_,_,_) ->
+         List.iter (add_reg) args;
+         retr.(i) <- map_true args
+      | Istore(_,_,args,src,_) ->
+         List.iter (add_reg) args;
+         add_reg src;
+         retr.(i) <- (src, true)::(map_true args)
+      | Icall(_,fn,args,dest,_) ->
+         List.iter (add_reg) args;
+         retr.(i) <- (match fn with
+                     | Datatypes.Coq_inl reg ->
+                        add_reg reg;
+                        (dest,false)::(reg, true)::(map_true args)
+                     | _ -> (dest,false)::(map_true args))
+
+      | Itailcall(_,fn,args) ->
+         List.iter (add_reg) args;
+         retr.(i) <- (match fn with
+                     | Datatypes.Coq_inl reg ->
+                        add_reg reg;
+                        (reg, true)::(map_true args)
+                     | _ -> map_true args)
+      | Ibuiltin(_,args,dest,_) ->
+         let rec bar = function
+           | AST.BA r -> add_reg r;
+                        retr.(i) <- (r, true)::retr.(i)
+           | AST.BA_splitlong  (hi, lo) | AST.BA_addptr (hi, lo) ->
+              bar hi; bar lo
+           | _ -> ()
+         in
+         List.iter (bar) args;
+         let rec bad = function
+           | AST.BR r -> retr.(i) <- (r, false)::retr.(i)
+           | AST.BR_splitlong (hi, lo) ->
+              bad hi; bad lo
+           | _ -> ()
+         in
+         bad dest;
+      | Ijumptable (reg,_) | Ireturn (Some reg) ->
+         add_reg reg;
+         retr.(i) <- [reg, true]
+      | _ -> ()
+    ) seqa;
+  (* print_string "mentions\n";
+   * Array.iteri (fun i l ->
+   *     print_int i;
+   *     print_string ": [";
+   *     List.iter (fun (r, b) ->
+   *         print_int (Camlcoq.P.to_int r);
+   *         print_string ":";
+   *         print_string (if b then "a:" else "d");
+   *         if b then print_int (snd (Hashtbl.find retl r));
+   *         print_string ", "
+   *       ) l;
+   *     print_string "]\n";
+   *     flush stdout;
+   *   ) retr; *)
+  retl, retr
+
+  
+let get_live_regs_entry (sb : superblock) code =
+  (if !Clflags.option_debug_compcert > 6
+   then debug_flag := true);
+  debug "getting live regs for superblock:\n";
+  print_superblock sb code;
+  debug "\n";
+  let seqa = Array.map (fun i ->
+                 (match PTree.get i code with
+                  | Some ii -> ii
+                  | None -> failwith "RTLpathScheduleraux.get_live_regs_entry"
+                 ),
+                 (match PTree.get i sb.liveins with
+                  | Some s -> s
+                  | None -> Regset.empty))
+               sb.instructions in
+  let ret = 
+    Array.fold_right (fun (ins, liveins) regset_i ->
+        let regset = Registers.Regset.union liveins regset_i in
+        match ins with
+        | Inop _ -> regset
+        | Iop (_, args, dest, _)
+        | Iload (_, _, _, args, dest, _) ->
+           List.fold_left (fun set reg -> Registers.Regset.add reg set)
+             (Registers.Regset.remove dest regset) args
+        | Istore (_, _, args, src, _) ->
+           List.fold_left (fun set reg -> Registers.Regset.add reg set)
+             (Registers.Regset.add src regset) args
+        | Icall (_, fn, args, dest, _) ->
+           List.fold_left (fun set reg -> Registers.Regset.add reg set)
+             ((match fn with
+               | Datatypes.Coq_inl reg -> (Registers.Regset.add reg)
+               | Datatypes.Coq_inr _ -> (fun x -> x))
+                (Registers.Regset.remove dest regset))
+             args
+        | Itailcall (_, fn, args) ->
+           List.fold_left (fun set reg -> Registers.Regset.add reg set)
+             (match fn with
+              | Datatypes.Coq_inl reg -> (Registers.Regset.add reg regset)
+              | Datatypes.Coq_inr _ -> regset)
+             args
+        | Ibuiltin (_, args, dest, _) ->
+           List.fold_left (fun set arg ->
+               let rec add reg set =
+                 match reg with
+                 | AST.BA r -> Registers.Regset.add r set
+                 | AST.BA_splitlong (hi, lo)
+                 | AST.BA_addptr (hi, lo) -> add hi (add lo set)
+                 | _ -> set
+             in add arg set)
+             (let rec rem dest set =
+                match dest with
+                | AST.BR r -> Registers.Regset.remove r set
+                | AST.BR_splitlong (hi, lo) -> rem hi (rem lo set)
+                | _ -> set
+             in rem dest regset)
+             args
+        | Icond (_, args, _, _, _) ->
+           List.fold_left (fun set reg ->
+               Registers.Regset.add reg set)
+             regset args
+        | Ijumptable (reg, _)
+          | Ireturn (Some reg) ->
+           Registers.Regset.add reg regset
+        | _ -> regset
+      ) seqa sb.s_output_regs
+  in debug "live in regs: ";
+     print_regset ret;
+     debug "\n";
+     debug_flag := false;
+     ret
+  
 (* TODO David *)
 let schedule_superblock sb code =
   if not !Clflags.option_fprepass
@@ -91,15 +253,22 @@ let schedule_superblock sb code =
          match predicted_successor ii with
          | Some _ -> 0
          | None -> 1 in
+    debug "hello\n";
+    let live_regs_entry = get_live_regs_entry sb code in
+    let seqa = 
+      Array.map (fun i ->
+          (match PTree.get i code with
+           | Some ii -> ii
+           | None -> failwith "RTLpathScheduleraux.schedule_superblock"),
+          (match PTree.get i sb.liveins with
+           | Some s -> s
+           | None -> Regset.empty))
+        (Array.sub sb.instructions 0 (nr_instr-trailer_length)) in
     match PrepassSchedulingOracle.schedule_sequence
-            (Array.map (fun i ->
-                 (match PTree.get i code with
-                 | Some ii -> ii
-                 | None -> failwith "RTLpathScheduleraux.schedule_superblock"),
-                 (match PTree.get i sb.liveins with
-                 | Some s -> s
-                 | None -> Regset.empty))
-               (Array.sub sb.instructions 0 (nr_instr-trailer_length))) with
+            seqa
+            live_regs_entry
+            sb.typing
+            (reference_counting seqa sb.s_output_regs sb.typing) with
     | None -> sb.instructions
     | Some order ->
        let ins' =
diff --git a/scheduling/RTLtoBTLaux.ml b/scheduling/RTLtoBTLaux.ml
index d544e87f..3c06f4cb 100644
--- a/scheduling/RTLtoBTLaux.ml
+++ b/scheduling/RTLtoBTLaux.ml
@@ -45,7 +45,7 @@ let translate_inst (iinfo : BTL.inst_info) inst is_final =
   in
   if is_final then encaps_final btli !osucc else btli
 
-let translate_function code entry join_points liveness =
+let translate_function code entry join_points liveness (typing : RTLtyping.regenv) =
   let reached = ref (PTree.map (fun n i -> false) code) in
   let btl_code = ref PTree.empty in
   let rec build_btl_tree e =
@@ -53,6 +53,7 @@ let translate_function code entry join_points liveness =
     else (
       reached := PTree.set e true !reached;
       let next_nodes = ref [] in
+      let last = ref None in
       let rec build_btl_block n =
         let inst = get_some @@ PTree.get n code in
         let psucc = predicted_successor inst in
@@ -78,13 +79,16 @@ let translate_function code entry join_points liveness =
         | None ->
             debug "BLOCK END.\n";
             next_nodes := !next_nodes @ successors_inst inst;
+            last := Some inst;
             translate_inst iinfo inst true
       in
       let ib = build_btl_block e in
       let succs = !next_nodes in
 
       let inputs = get_some @@ PTree.get e liveness in
-      let bi = mk_binfo (p2i e) in
+      let soutput = get_outputs liveness e (get_some @@ !last) in
+
+      let bi = mk_binfo (p2i e) soutput typing in
       let ibf = { entry = ib; input_regs = inputs; binfo = bi } in
       btl_code := PTree.set e ibf !btl_code;
       List.iter build_btl_tree succs)
@@ -97,7 +101,8 @@ let rtl2btl (f : RTL.coq_function) =
   let entry = f.fn_entrypoint in
   let join_points = get_join_points code entry in
   let liveness = analyze f in
-  let btl = translate_function code entry join_points liveness in
+  let typing = get_ok @@ RTLtyping.type_function f in
+  let btl = translate_function code entry join_points liveness typing in
   let dm = PTree.map (fun n i -> n) btl in
   (*debug_flag := true;*)
   debug "Entry %d\n" (p2i entry);
diff --git a/scheduling/postpass_lib/Machblock.v b/scheduling/postpass_lib/Machblock.v
index c8eadbd7..b588cca8 100644
--- a/scheduling/postpass_lib/Machblock.v
+++ b/scheduling/postpass_lib/Machblock.v
@@ -237,13 +237,13 @@ Inductive basic_step (s: list stackframe) (fb: block) (sp: val) (rs: regset) (m:
   | exec_MBload_notrap1:
       forall addr args rs' chunk dst,
       eval_addressing ge sp addr rs##args = None ->
-      rs' = ((undef_regs (destroyed_by_load chunk addr) rs)#dst <- (default_notrap_load_value chunk)) ->
+      rs' = ((undef_regs (destroyed_by_load chunk addr) rs)#dst <- Vundef) ->
       basic_step s fb sp rs m (MBload NOTRAP chunk addr args dst) rs' m
   | exec_MBload_notrap2:
       forall addr args a rs' chunk dst,
       eval_addressing ge sp addr rs##args = Some a ->
       Mem.loadv chunk m a = None ->
-      rs' = ((undef_regs (destroyed_by_load chunk addr) rs)#dst <- (default_notrap_load_value chunk)) ->
+      rs' = ((undef_regs (destroyed_by_load chunk addr) rs)#dst <- Vundef) ->
       basic_step s fb sp rs m (MBload NOTRAP chunk addr args dst) rs' m
   | exec_MBstore:
       forall chunk addr args src m' a rs',
diff --git a/test/nardino/scheduling/entry_regs.c b/test/nardino/scheduling/entry_regs.c
new file mode 100644
index 00000000..9e6adacb
--- /dev/null
+++ b/test/nardino/scheduling/entry_regs.c
@@ -0,0 +1,19 @@
+#include <stdio.h>
+
+int f(int n) {
+	if (n > 0) 
+		return 42;
+	else
+		return n;
+}
+
+
+int main(int argc, char *argv[]) {
+    int a=1;
+	float b=2.;
+	int c = f(a);
+	a = 3;
+	int d = f(a);
+	printf("%e, %d, %d, %d", b, a, c, d);
+    return 0;
+}
diff --git a/test/nardino/scheduling/spille_backw.c b/test/nardino/scheduling/spille_backw.c
new file mode 100644
index 00000000..1c36ee86
--- /dev/null
+++ b/test/nardino/scheduling/spille_backw.c
@@ -0,0 +1,114 @@
+int f(int k) {
+	int a1 = k;
+	int b1 = 2*a1;
+	int c = a1;
+	int a2 = k+1;
+	int b2 = 2*a2;
+	c += a2;
+	int a3 = k+2;
+	int b3 = 2*a3;
+	c += a3;
+	int a4 = k+3;
+	int b4 = 2*a4;
+	c += a4;
+	int a5 = k+4;
+	int b5 = 2*a5;
+	c += a5;
+	int a6 = k+5;
+	int b6 = 2*a6;
+	c += a6;
+	int a7 = k+6;
+	int b7 = 2*a7;
+	c += a7;
+	int a8 = k+7;
+	int b8 = 2*a8;
+	c += a8;
+	int a9 = k+8;
+	int b9 = 2*a9;
+	c += a9;
+	int a10 = k+9;
+	int b10 = 2*a10;
+	c += a10;
+	int a11 = k+10;
+	int b11 = 2*a11;
+	c += a11;
+	int a12 = k+11;
+	int b12 = 2*a12;
+	c += a12;
+	int a13 = k+12;
+	int b13 = 2*a13;
+	c += a13;
+	int a14 = k+13;
+	int b14 = 2*a14;
+	c += a14;
+	int a15 = k+14;
+	int b15 = 2*a15;
+	c += a15;
+	int a16 = k+15;
+	int b16 = 2*a16;
+	c += a16;
+	int a17 = k+16;
+	int b17 = 2*a17;
+	c += a17;
+	int a18 = k+17;
+	int b18 = 2*a18;
+	c += a18;
+	int a19 = k+18;
+	int b19 = 2*a19;
+	c += a19;
+	int a20 = k+19;
+	int b20 = 2*a20;
+	c += a20;
+	int a21 = k+20;
+	int b21 = 2*a21;
+	c += a21;
+	int a22 = k+21;
+	int b22 = 2*a22;
+	c += a22;
+	int a23 = k+22;
+	int b23 = 2*a23;
+	c += a23;
+	int a24 = k+23;
+	int b24 = 2*a24;
+	c += a24;
+	int a25 = k+24;
+	int b25 = 2*a25;
+	c += a25;
+	int a26 = k+25;
+	int b26 = 2*a26;
+	c += a26;
+	return
+		b13+	
+		b12+
+		b11+
+		b10+
+		b9+
+		b8+
+		b7+
+		b6+
+		b5+
+		b4+
+		b3+
+		b2+
+		b1+
+		b14+
+		b15+
+		b16+
+		b17+
+		b18+
+		b19+
+		b20+
+		b21+
+		b22+
+		b23+
+		b23+
+		b24+
+		b25+
+		b26+
+		c;
+}
+
+int main(int argc, char *argv[]) {
+	f(3);
+    return 0;
+}
diff --git a/test/nardino/scheduling/spille_forw.c b/test/nardino/scheduling/spille_forw.c
new file mode 100644
index 00000000..db88588b
--- /dev/null
+++ b/test/nardino/scheduling/spille_forw.c
@@ -0,0 +1,166 @@
+#include <stdio.h>
+
+int f(int n, float * arr) {
+	float a1 = (float) n;
+	float b1 = 2.*a1;
+	float c = a1;
+	float a2 = (float) n+1;
+	float b2 = 2.*a2;
+	c += a2;
+	float a3 = (float) n+2;
+	float b3 = 2.*a3;
+	c += a3;
+	float a4 = (float) n+3;
+	float b4 = 2.*a4;
+	c += a4;
+	float a5 = (float) n+4;
+	float b5 = 2.*a5;
+	c += a5;
+	float a6 = (float) n+5;
+	float b6 = 2.*a6;
+	c += a6;
+	float a7 = (float) n+6;
+	float b7 = 2.*a7;
+	c += a7;
+	float a8 = (float) n+7;
+	float b8 = 2.*a8;
+	c += a8;
+	float a9 = (float) n+8;
+	float b9 = 2.*a9;
+	c += a9;
+	float a10 = (float) n+9;
+	float b10 = 2.*a10;
+	c += a10;
+	float a11 = (float) n+10;
+	float b11 = 2.*a11;
+	c += a11;
+	float a12 = (float) n+11;
+	float b12 = 2.*a12;
+	c += a12;
+	float a13 = (float) n+12;
+	float b13 = 2.*a13;
+	c += a13;
+	float a14 = (float) n+13;
+	float b14 = 2.*a14;
+	c += a14;
+	float a15 = (float) n+14;
+	float b15 = 2.*a15;
+	c += a15;
+	float a16 = (float) n+15;
+	float b16 = 2.*a16;
+	c += a16;
+	float a17 = (float) n+16;
+	float b17 = 2.*a17;
+	c += a17;
+	float a18 = (float) n+17;
+	float b18 = 2.*a18;
+	c += a18;
+	float a19 = (float) n+18;
+	float b19 = 2.*a19;
+	c += a19;
+	float a20 = (float) n+19;
+	float b20 = 2.*a20;
+	c += a20;
+	float a21 = (float) n+20;
+	float b21 = 2.*a21;
+	c += a21;
+	float a22 = (float) n+21;
+	float b22 = 2.*a22;
+	c += a22;
+	float a23 = (float) n+22;
+	float b23 = 2.*a23;
+	c += a23;
+	float a24 = (float) n+23;
+	float b24 = 2.*a24;
+	c += a24;
+	float a25 = (float) n+24;
+	float b25 = 2.*a25;
+	c += a25;
+	float a26 = (float) n+25;
+	float b26 = 2.*a26;
+	c += a26;
+	float a27 = (float) n+26;
+	float b27 = 2.*a27;
+	c += a27;
+	float a28 = (float) n+27;
+	float b28 = 2.*a28;
+	c += a28;
+	float a29 = (float) n+28;
+	float b29 = 2.*a29;
+	c += a29;
+	float a30 = (float) n+29;
+	float b30 = 2.*a30;
+	c += a30;
+	/* arr[0] = a1; */
+	/* arr[1] = a2; */
+	/* arr[2] = a3; */
+	/* arr[3] = a4; */
+	/* arr[4] = a5; */
+	/* arr[5] = a6; */
+	/* arr[6] = a7; */
+	/* arr[7] = a8; */
+	/* arr[8] = a9; */
+	/* arr[9] = a10; */
+	/* arr[10] = a11; */
+	/* arr[11] = a12; */
+	/* arr[12] = a13; */
+	/* arr[13] = a14; */
+	/* arr[14] = a15; */
+	/* arr[15] = a16; */
+	/* arr[16] = a17; */
+	/* arr[17] = a18; */
+	/* arr[18] = a19; */
+	/* arr[19] = a20; */
+	/* arr[20] = a21; */
+	/* arr[21] = a22; */
+	/* arr[22] = a23; */
+	/* arr[23] = a24; */
+	/* arr[24] = a25; */
+	/* arr[25] = a26; */
+	/* arr[26] = a27; */
+	/* arr[27] = a28; */
+	/* arr[28] = a29; */
+	/* arr[29] = a30; */
+	return c +
+		b1+
+		b2+
+		b3+
+		b4+
+		b5+
+		b6+
+		b7+
+		b8+
+		b9+
+		b10+
+		b11+
+		b12+
+		b13+
+		b14+
+		b15+
+		b16+
+		b17+
+		b18+
+		b19+
+		b20+
+		b21+
+		b22+
+		b23+
+		b24+
+		b25+
+		b26+
+		b27+
+		b28+
+		b29+
+		b30;
+}
+
+
+
+
+
+
+int main(int argc, char *argv[]) {
+	float arr[30];
+    f(5, arr);
+    return 0;
+}
diff --git a/tools/compiler_expand.ml b/tools/compiler_expand.ml
index 5c90af6c..45ffc828 100644
--- a/tools/compiler_expand.ml
+++ b/tools/compiler_expand.ml
@@ -13,6 +13,8 @@ type print_result = Noprint | Print of string;;
 type when_triggered = Always | Option of string;;
 type needs_require = Require | NoRequire;;
 
+let rtl_tunneling = PARTIAL, Always, Require, (Some "RTL Branch Tunneling"), "RTLTunneling"
+
 (* FIXME - The gestion of NoRequire is a bit ugly right now. *)
 let rtl_passes =
 [|
@@ -38,6 +40,7 @@ PARTIAL, (Option "optim_CSE3"), Require, (Some "CSE3"), "CSE3";
 TOTAL, (Option "optim_CSE3"), Require, (Some "Kill useless moves after CSE3"), "KillUselessMoves";
 TOTAL, (Option "optim_forward_moves"), Require, (Some "Forwarding moves"), "ForwardMoves";
 PARTIAL, (Option "optim_redundancy"), Require, (Some "Redundancy elimination"), "Deadcode";
+rtl_tunneling;
 TOTAL, Always, Require, (Some "Renumbering pre rotate"), "Renumber";
 PARTIAL, Always, NoRequire, (Some "Loop Rotate"), "Looprotate";
 TOTAL, (Option "optim_move_loop_invariants"), NoRequire, (Some "Renumbering for LICM"), "Renumber";
@@ -46,7 +49,7 @@ TOTAL, (Option "optim_move_loop_invariants"), NoRequire, (Some "Renumbering for
 PARTIAL, (Option "optim_move_loop_invariants"), NoRequire, (Some "CSE3 for LICM"), "CSE3";
 PARTIAL, (Option "optim_move_loop_invariants"), NoRequire, (Some "Redundancy elimination for LICM"), "Deadcode";
 TOTAL, (Option "all_loads_nontrap"), Require, None, "Allnontrap";
-PARTIAL, Always, Require, (Some "Unused globals"), "Unusedglob"
+PARTIAL, Always, Require, (Some "Unused globals"), "Unusedglob";
 |];;
 
 let post_rtl_passes =
@@ -55,7 +58,7 @@ let post_rtl_passes =
   PARTIAL, Always, Require, (Some "Prepass scheduling"), "BTL_Scheduler", Noprint;
   PARTIAL, Always, Require,  (Some "Projection to RTL"), "BTLtoRTL", (Print (Printf.sprintf "RTL %d" ((Array.length rtl_passes) + 1)));
   PARTIAL, Always, Require, (Some "Register allocation"), "Allocation", (Print "LTL 1");
-  PARTIAL, Always, Require, (Some "Branch tunneling"), "Tunneling", (Print "LTL 2");
+  PARTIAL, Always, Require, (Some "LTL Branch tunneling"), "LTLTunneling", (Print "LTL 2");
   PARTIAL, Always, Require, (Some "CFG linearization"), "Linearize", Noprint;
   TOTAL, Always, Require, (Some "Label cleanup"), "CleanupLabels", Noprint;
   PARTIAL, (Option "debug"), Require, (Some "Debugging info for local variables"), "Debugvar", Noprint;
diff --git a/x86/TargetPrinter.ml b/x86/TargetPrinter.ml
index 50c871e4..00e70f65 100644
--- a/x86/TargetPrinter.ml
+++ b/x86/TargetPrinter.ml
@@ -95,9 +95,6 @@ let z oc n = output_string oc (Z.to_string n)
 
 let data_pointer = if Archi.ptr64 then ".quad" else ".long"
 
-(* The comment deliminiter *)
-let comment = "##"
-
 (* Base-2 log of a Caml integer *)
 let rec log2 n =
   assert (n > 0);
@@ -106,6 +103,7 @@ let rec log2 n =
 (* System dependent printer functions *)
 module type SYSTEM =
     sig
+      val comment: string
       val raw_symbol: out_channel -> string -> unit
       val symbol: out_channel -> P.t -> unit
       val label: out_channel -> int -> unit
@@ -124,6 +122,9 @@ module type SYSTEM =
 module ELF_System : SYSTEM =
   struct
 
+    (* The comment delimiter *)
+    let comment = "#"
+
     let raw_symbol oc s =
       fprintf oc "%s" s
 
@@ -219,6 +220,10 @@ module ELF_System : SYSTEM =
 module MacOS_System : SYSTEM =
   struct
 
+    (* The comment delimiter.
+        `##` instead of `#` to please the Clang assembler. *)
+    let comment = "##"
+
     let raw_symbol oc s =
      fprintf oc "_%s" s
 
@@ -280,6 +285,9 @@ module MacOS_System : SYSTEM =
 module Cygwin_System : SYSTEM =
   struct
 
+    (* The comment delimiter *)
+    let comment = "#"
+
     let symbol_prefix =
       if Archi.ptr64 then "" else "_"