Merge branch 'kvx-work' into kvx-work-merge3.8

Conflicts: Makefile configure
author: Cyril SIX <cyril.six@kalray.eu> 2020-12-04 17:41:14 +0100
committer: Cyril SIX <cyril.six@kalray.eu> 2020-12-04 17:41:14 +0100
commit: 60ff1e39bac5ab35c46698cbc1ed7a76fc936cab (patch)
tree: 87ff5f3b209e5659e967f862dab1517bb2b32baa /backend
parent: f2fb8540c94ceb9892510f83bd7d6734fe9d422f (diff)
parent: d2197102d6b81e225865cfac5f1d319d168e1e23 (diff)
download: compcert-kvx-60ff1e39bac5ab35c46698cbc1ed7a76fc936cab.tar.gz
compcert-kvx-60ff1e39bac5ab35c46698cbc1ed7a76fc936cab.zip
11 files changed, 1393 insertions, 286 deletions
diff --git a/backend/CSE3analysis.v b/backend/CSE3analysis.v
index 5ed04bc4..8b7f1190 100644
--- a/backend/CSE3analysis.v
+++ b/backend/CSE3analysis.v
@@ -181,12 +181,24 @@ Definition eq_depends_on_mem eq :=
   | SOp op => op_depends_on_memory op
   end.
 
+Definition eq_depends_on_store eq :=
+  match eq_op eq with
+  | SLoad _ _ => true
+  | SOp op => false
+  end.
+
 Definition get_mem_kills (eqs : PTree.t equation) : PSet.t :=
   PTree.fold (fun already (eqno : eq_id) (eq : equation) =>
                 if eq_depends_on_mem eq
                 then PSet.add eqno already
                 else already) eqs PSet.empty.
 
+Definition get_store_kills (eqs : PTree.t equation) : PSet.t :=
+  PTree.fold (fun already (eqno : eq_id) (eq : equation) =>
+                if eq_depends_on_store eq
+                then PSet.add eqno already
+                else already) eqs PSet.empty.
+
 Definition is_move (op : operation) :
   { op = Omove } + { op <> Omove }.
 Proof.
@@ -216,6 +228,7 @@ Record eq_context := mkeqcontext
                          eq_rhs_oracle : node -> sym_op -> list reg -> PSet.t;
                          eq_kill_reg : reg -> PSet.t;
                          eq_kill_mem : unit -> PSet.t;
+                         eq_kill_store : unit -> PSet.t;
                          eq_moves : reg -> PSet.t }.
 
 Section OPERATIONS.
@@ -342,6 +355,9 @@ Section OPERATIONS.
                   (oper1 dst op args' rel)
       end.
   
+  Definition kill_store (rel : RELATION.t) : RELATION.t :=
+    PSet.subtract rel (eq_kill_store ctx tt).
+
   Definition clever_kill_store
              (chunk : memory_chunk) (addr: addressing) (args : list reg)
              (src : reg)
@@ -358,7 +374,7 @@ Section OPERATIONS.
                 may_overlap chunk addr args chunk' addr' (eq_args eq)
               end
             end)
-         (PSet.inter rel (eq_kill_mem ctx tt))).
+         (PSet.inter rel (eq_kill_store ctx tt))).
 
   Definition store2
              (chunk : memory_chunk) (addr: addressing) (args : list reg)
@@ -366,7 +382,7 @@ Section OPERATIONS.
              (rel : RELATION.t) : RELATION.t :=
     if Compopts.optim_CSE3_alias_analysis tt
     then clever_kill_store chunk addr args src rel
-    else kill_mem rel.
+    else kill_store rel.
 
   Definition store1
              (chunk : memory_chunk) (addr: addressing) (args : list reg)
@@ -501,6 +517,7 @@ Definition context_from_hints (hints : analysis_hints) :=
   let eqs := hint_eq_catalog hints in
   let reg_kills := get_reg_kills eqs in 
   let mem_kills := get_mem_kills eqs in
+  let store_kills := get_store_kills eqs in
   let moves := get_moves eqs in
   {|
     eq_catalog := fun eq_id => PTree.get eq_id eqs;
@@ -508,5 +525,6 @@ Definition context_from_hints (hints : analysis_hints) :=
     eq_rhs_oracle  := hint_eq_rhs_oracle hints;
     eq_kill_reg := fun reg => PMap.get reg reg_kills;
     eq_kill_mem := fun _ => mem_kills;
+    eq_kill_store := fun _ => store_kills;
     eq_moves    := fun reg => PMap.get reg moves
   |}.
diff --git a/backend/CSE3analysisaux.ml b/backend/CSE3analysisaux.ml
index e37ef61f..e038331c 100644
--- a/backend/CSE3analysisaux.ml
+++ b/backend/CSE3analysisaux.ml
@@ -174,6 +174,7 @@ let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
   and rhs_table = Hashtbl.create 100
   and cur_kill_reg = ref (PMap.init PSet.empty)
   and cur_kill_mem = ref PSet.empty
+  and cur_kill_store = ref PSet.empty
   and cur_moves = ref (PMap.init PSet.empty) in
   let eq_find_oracle node eq =
     assert (not (is_trivial eq));
@@ -216,6 +217,8 @@ let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
            (eq.eq_lhs :: eq.eq_args);
          (if eq_depends_on_mem eq
           then cur_kill_mem := PSet.add coq_id !cur_kill_mem);
+         (if eq_depends_on_store eq
+          then cur_kill_store := PSet.add coq_id !cur_kill_store);
          (match eq.eq_op, eq.eq_args with
           | (SOp Op.Omove), [rhs] -> imp_add_i_j cur_moves eq.eq_lhs coq_id
           | _, _ -> ());
@@ -232,6 +235,7 @@ let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
               eq_rhs_oracle  = rhs_find_oracle ;
               eq_kill_reg    = (fun reg -> PMap.get reg !cur_kill_reg);
               eq_kill_mem    = (fun () -> !cur_kill_mem);
+              eq_kill_store  = (fun () -> !cur_kill_store);
               eq_moves       = (fun reg -> PMap.get reg !cur_moves)
             } in
   match internal_analysis ctx tenv f
diff --git a/backend/CSE3analysisproof.v b/backend/CSE3analysisproof.v
index 1e5b88c3..b298ea65 100644
--- a/backend/CSE3analysisproof.v
+++ b/backend/CSE3analysisproof.v
@@ -133,13 +133,18 @@ Definition xlget_kills (eqs : list (eq_id * equation)) (m :  Regmap.t PSet.t) :
     add_i_j (eq_lhs (snd item)) (fst item)
             (add_ilist_j (eq_args (snd item)) (fst item) already)) eqs m.
 
-
 Definition xlget_mem_kills (eqs : list (positive * equation)) (m : PSet.t) : PSet.t :=
 (fold_left
        (fun (a : PSet.t) (p : positive * equation) =>
         if eq_depends_on_mem (snd p) then PSet.add (fst p) a else a)
        eqs m).
 
+Definition xlget_store_kills (eqs : list (positive * equation)) (m : PSet.t) : PSet.t :=
+(fold_left
+       (fun (a : PSet.t) (p : positive * equation) =>
+        if eq_depends_on_store (snd p) then PSet.add (fst p) a else a)
+       eqs m).
+
 Lemma xlget_kills_monotone :
   forall eqs m i j,
     PSet.contains (Regmap.get i m) j = true ->
@@ -170,6 +175,24 @@ Qed.
 
 Hint Resolve xlget_mem_kills_monotone : cse3.
 
+Lemma xlget_store_kills_monotone :
+  forall eqs m j,
+    PSet.contains m j = true ->
+    PSet.contains (xlget_store_kills eqs m) j = true.
+Proof.
+  induction eqs; simpl; trivial.
+  intros.
+  destruct eq_depends_on_store.
+  - apply IHeqs.
+    destruct (peq (fst a) j).
+    + subst j. apply PSet.gadds.
+    + rewrite PSet.gaddo by congruence.
+      trivial.
+  - auto.
+Qed.
+
+Hint Resolve xlget_store_kills_monotone : cse3.
+
 Lemma xlget_kills_has_lhs :
   forall eqs m lhs sop args j,
     In (j, {| eq_lhs := lhs;
@@ -333,6 +356,60 @@ Qed.
 
 Hint Resolve context_from_hints_get_kills_has_eq_depends_on_mem : cse3.
 
+Lemma xlget_kills_has_eq_depends_on_store :
+  forall eqs eq j m,
+    In (j, eq) eqs ->
+    eq_depends_on_store eq = true ->
+    PSet.contains (xlget_store_kills eqs m) j = true.
+Proof.
+  induction eqs; simpl.
+  contradiction.
+  intros.
+  destruct H.
+  { subst a.
+    simpl.
+    rewrite H0.
+    apply xlget_store_kills_monotone.
+    apply PSet.gadds.
+  }
+  eauto.
+Qed.
+
+Hint Resolve xlget_kills_has_eq_depends_on_store : cse3.
+
+Lemma get_kills_has_eq_depends_on_store :
+  forall eqs eq j,
+    PTree.get j eqs = Some eq ->
+    eq_depends_on_store eq = true ->
+    PSet.contains (get_store_kills eqs) j = true.
+Proof.
+  intros.
+  unfold get_store_kills.
+  rewrite PTree.fold_spec.
+  change (fold_left
+       (fun (a : PSet.t) (p : positive * equation) =>
+        if eq_depends_on_store (snd p) then PSet.add (fst p) a else a)
+       (PTree.elements eqs) PSet.empty)
+    with (xlget_store_kills (PTree.elements eqs) PSet.empty).
+  eapply xlget_kills_has_eq_depends_on_store.
+  apply PTree.elements_correct.
+  eassumption.
+  trivial.
+Qed.
+  
+Lemma context_from_hints_get_kills_has_eq_depends_on_store :
+  forall hints eq j,
+    PTree.get j (hint_eq_catalog hints) = Some eq ->
+    eq_depends_on_store eq = true ->
+    PSet.contains (eq_kill_store (context_from_hints hints) tt) j = true.
+Proof.
+  intros.
+  simpl.
+  eapply get_kills_has_eq_depends_on_store; eassumption.
+Qed.
+
+Hint Resolve context_from_hints_get_kills_has_eq_depends_on_store : cse3.
+
 Definition eq_involves (eq : equation) (i : reg) :=
   i = (eq_lhs eq) \/ In i (eq_args eq).
 
@@ -418,6 +495,12 @@ Section SOUNDNESS.
       eq_depends_on_mem eq = true ->
       PSet.contains (eq_kill_mem ctx tt) j = true.
 
+  Hypothesis ctx_kill_store_has_depends_on_store :
+    forall eq j,
+      eq_catalog ctx j = Some eq ->
+      eq_depends_on_store eq = true ->
+      PSet.contains (eq_kill_store ctx tt) j = true.
+
   Theorem kill_reg_sound :
     forall rel rs m dst v,
       (sem_rel rel rs m) ->
@@ -574,6 +657,55 @@ Section SOUNDNESS.
 
   Hint Resolve kill_mem_sound : cse3.
 
+  (* TODO: shouldn't this already be proved somewhere else? *)
+  Lemma store_preserves_validity:
+    forall m m' wchunk a v
+      (STORE : Mem.storev wchunk m a v = Some m')
+      (b : block) (z : Z),
+      Mem.valid_pointer m' b z = Mem.valid_pointer m b z.
+  Proof.
+    unfold Mem.storev.
+    intros.
+    destruct a; try discriminate.
+    Local Transparent Mem.store.
+    unfold Mem.store in STORE.
+    destruct Mem.valid_access_dec in STORE.
+    2: discriminate.
+    inv STORE.
+    reflexivity.
+  Qed.
+
+  Hint Resolve store_preserves_validity : cse3.
+  
+  Theorem kill_store_sound :
+    forall rel rs m m' wchunk a v,
+      (sem_rel rel rs m) ->
+      (Mem.storev wchunk m a v = Some m') ->
+      (sem_rel (kill_store (ctx:=ctx) rel) rs m').
+  Proof.
+    unfold sem_rel, sem_eq, sem_rhs, kill_store.
+    intros until v.
+    intros REL STORE i eq.
+    specialize REL with (i := i) (eq0 := eq).
+    intros SUBTRACT CATALOG.
+    rewrite PSet.gsubtract in SUBTRACT.
+    rewrite andb_true_iff in SUBTRACT.
+    intuition.
+    destruct (eq_op eq) as [op | chunk addr] eqn:OP.
+    - rewrite op_valid_pointer_eq with (m2 := m).
+      assumption.
+      eapply store_preserves_validity; eauto.
+    - specialize ctx_kill_store_has_depends_on_store with (eq0 := eq) (j := i).
+      destruct eq as [lhs op args]; simpl in *.
+      rewrite OP in ctx_kill_store_has_depends_on_store.
+      rewrite negb_true_iff in H0.
+      rewrite OP in CATALOG.
+      intuition.
+      congruence.
+  Qed.
+ 
+  Hint Resolve kill_store_sound : cse3.
+ 
   Theorem eq_find_sound:
     forall no eq id,
       eq_find (ctx := ctx) no eq = Some id ->
@@ -895,13 +1027,12 @@ Section SOUNDNESS.
     unfold sem_eq in *.
     simpl in *.
     destruct eq_op as [op' | chunk' addr']; simpl.
-    - destruct (op_depends_on_memory op') eqn:DEPENDS.
-      + erewrite ctx_kill_mem_has_depends_on_mem in CONTAINS by eauto.
-        discriminate.
-      + rewrite op_depends_on_memory_correct with (m2:=m); trivial.
+    - rewrite op_valid_pointer_eq with (m2 := m).
+      + cbn in *.
         apply REL; auto.
+      + eapply store_preserves_validity; eauto.
     - simpl in REL.
-      erewrite ctx_kill_mem_has_depends_on_mem in CONTAINS by eauto.
+      erewrite ctx_kill_store_has_depends_on_store in CONTAINS by eauto.
       simpl in CONTAINS.
       rewrite negb_true_iff in CONTAINS.
       destruct (eval_addressing genv sp addr' rs ## eq_args) as [a'|] eqn:ADDR'.
diff --git a/backend/Duplicateaux.ml b/backend/Duplicateaux.ml
index 76b5616b..c9985dc4 100644
--- a/backend/Duplicateaux.ml
+++ b/backend/Duplicateaux.ml
@@ -115,16 +115,18 @@ let ptree_printbool pt =
 
 (* Looks ahead (until a branch) to see if a node further down verifies
  * the given predicate *)
-let rec look_ahead code node is_loop_header predicate =
+let rec look_ahead_gen (successors: RTL.instruction -> P.t list) code node is_loop_header predicate =
   if (predicate node) then true
-  else match (rtl_successors @@ get_some @@ PTree.get node code) with
+  else match (successors @@ get_some @@ PTree.get node code) with
     | [n] -> if (predicate n) then true
         else (
           if (get_some @@ PTree.get n is_loop_header) then false
-          else look_ahead code n is_loop_header predicate
+          else look_ahead_gen successors code n is_loop_header predicate
         )
     | _ -> false
 
+let look_ahead = look_ahead_gen rtl_successors
+
 (** 
  * Heuristics mostly based on the paper Branch Prediction for Free 
  *)
@@ -545,7 +547,8 @@ let is_a_nop code n =
  * preds: mapping node -> predecessors
  * ptree: the revmap
  * trace: the trace to follow tail duplication on *)
-let tail_duplicate code preds ptree trace =
+let tail_duplicate code preds is_loop_header ptree trace =
+  debug "Tail_duplicate on that trace: %a\n" print_trace trace;
   (* next_int: unused integer that can be used for the next duplication *)
   let next_int = ref (next_free_pc code)
   (* last_node and last_duplicate store resp. the last processed node of the trace, and its duplication *)
@@ -560,7 +563,12 @@ let tail_duplicate code preds ptree trace =
           if is_first then (code, ptree) (* first node is never duplicated regardless of its inputs *)
           else
             let node_preds = ptree_get_some n preds
-            in let node_preds_nolast = List.filter (fun e -> e <> get_some !last_node) node_preds
+            in let node_preds_nolast = 
+              (* We traverse loop headers without initiating tail duplication 
+               * (see case of two imbricated loops) *)
+              if (get_some @@ PTree.get n is_loop_header) then []
+              else List.filter (fun e -> e <> get_some !last_node) node_preds
+            (* in let node_preds_nolast = List.filter (fun e -> not @@ List.mem e t) node_preds_nolast *)
             in let final_node_preds = match !last_duplicate with
               | None -> node_preds_nolast
               | Some n' -> n' :: node_preds_nolast
@@ -581,12 +589,12 @@ let tail_duplicate code preds ptree trace =
   in let new_code, new_ptree = f code ptree true trace
   in (new_code, new_ptree, !nb_duplicated)
 
-let superblockify_traces code preds traces ptree =
+let superblockify_traces code preds is_loop_header traces ptree =
   let max_nb_duplicated = !Clflags.option_ftailduplicate (* FIXME - should be architecture dependent *)
   in let rec f code ptree = function
     | [] -> (code, ptree, 0)
     | trace :: traces ->
-        let new_code, new_ptree, nb_duplicated = tail_duplicate code preds ptree trace
+        let new_code, new_ptree, nb_duplicated = tail_duplicate code preds is_loop_header ptree trace
         in if (nb_duplicated < max_nb_duplicated)
           then (debug "End duplication\n"; f new_code new_ptree traces)
           else (debug "Too many duplicated nodes, aborting tail duplication\n"; (code, ptree, 0))
@@ -615,15 +623,29 @@ type innerLoop = {
   preds: P.t list;
   body: P.t list;
   head: P.t; (* head of the loop *)
-  finals: P.t list (* the final instructions, which loops back to the head *)
+  finals: P.t list; (* the final instructions, which loops back to the head *)
   (* There may be more than one ; for instance if there is an if inside the loop with both
-   * branches leading to a goto backedge *)
+   * branches leading to a goto backedge
+   * Such cases usually happen after a tail-duplication *)
+  sb_final: P.t option; (* if the innerloop wraps a superblock, this is its final instruction *)
+    (* may be None if we predict that we do not loop *)
 }
 
 let print_pset = LICMaux.pp_pset
 
+let print_option_pint oc o =
+  if !debug_flag then
+    match o with
+    | None -> Printf.fprintf oc "None"
+    | Some n -> Printf.fprintf oc "Some %d" (P.to_int n)
+
 let print_inner_loop iloop =
-  debug "{preds: %a, body: %a}" print_intlist iloop.preds print_intlist iloop.body
+  debug "{preds: %a, body: %a, head: %d, finals: %a, sb_final: %a}\n"
+    print_intlist iloop.preds
+    print_intlist iloop.body
+    (P.to_int iloop.head)
+    print_intlist iloop.finals
+    print_option_pint iloop.sb_final
 
 let rec print_inner_loops = function
 | [] -> ()
@@ -692,6 +714,34 @@ let get_inner_loops f code is_loop_header =
   !iloops
   *)
 
+let rtl_successors_pref = function
+| Itailcall _ | Ireturn _ -> []
+| Icall(_,_,_,_,n) | Ibuiltin(_,_,_,n) | Inop n | Iop (_,_,_,n)
+| Iload (_,_,_,_,_,n) | Istore (_,_,_,_,n) -> [n]
+| Icond (_,_,n1,n2,p) -> (match p with
+  | Some true -> [n1]
+  | Some false -> [n2]
+  | None -> [n1; n2])
+| Ijumptable (_,ln) -> ln
+
+(* Find the last node of a trace (starting at "node"), until a loop is encountered.
+ * If a non-predicted branch is encountered, returns None *)
+let rec find_last_node_before_loop code node trace is_loop_header =
+  let rtl_succ = rtl_successors @@ get_some @@ PTree.get node code in
+  let headers = List.filter (fun n -> 
+    get_some @@ PTree.get n is_loop_header && HashedSet.PSet.contains trace n) rtl_succ in 
+  match headers with
+  | [] -> (
+      let next_nodes = List.filter (fun n -> HashedSet.PSet.contains trace n)
+        (rtl_successors_pref @@ get_some @@ PTree.get node code) in
+      match next_nodes with
+      | [n] -> find_last_node_before_loop code n trace is_loop_header
+      | _ -> None (* May happen when we predict that a loop is not taken *)
+    )
+  | [h] -> Some node
+  | _ -> failwith "Multiple branches leading to a loop"
+
+(* The computation of sb_final requires to already have branch prediction *)
 let get_inner_loops f code is_loop_header =
   let fake_f = { fn_sig = f.fn_sig; fn_params = f.fn_params; 
     fn_stacksize = f.fn_stacksize; fn_code = code; fn_entrypoint = f.fn_entrypoint } in
@@ -718,7 +768,10 @@ let get_inner_loops f code is_loop_header =
           debug "HEADPREDS: %a\n" print_intlist head_preds;
           filtered
         end in
-      { preds = preds; body = (HashedSet.PSet.elements body); head = head; finals = finals }
+      let sb_final = find_last_node_before_loop code head body is_loop_header in
+      let body = HashedSet.PSet.elements body in
+      { preds = preds; body = body; head = head; finals = finals;
+        sb_final = sb_final; }
     ) 
     (* LICMaux.inner_loops also returns non-inner loops, but with a body of 1 instruction
      * We remove those to get just the inner loops *)
@@ -836,31 +889,36 @@ let unroll_inner_loops_single f code revmap =
     (!code', !revmap')
   end
 
+let is_some o = match o with Some _ -> true | None -> false
+
 (* Unrolls the body of the inner loop once - duplicating the exit condition as well 
  * 1) Clones body into body'
- * 2) Links the last instruction of body into the first of body' 
+ * 2) Links the last instruction of body (sb_final) into the first of body' 
  * 3) Links the last instruction of body' into the first of body
  *)
 let unroll_inner_loop_body code revmap iloop =
+  debug "iloop = "; print_inner_loop iloop;
   let body = iloop.body in
   let limit = !Clflags.option_funrollbody in
   if count_ignore_nops code body > limit then begin
     debug "Too many nodes in the loop body (%d > %d)" (List.length body) limit;
     (code, revmap)
+  end else if not @@ is_some iloop.sb_final then begin
+    debug "The loop body does not form a superblock OR we have predicted that we do not loop";
+    (code, revmap)
   end else
     let (code2, revmap2, dupbody, fwmap) = clone code revmap body in
     let code' = ref code2 in
     let head' = apply_map fwmap (iloop.head) in
     let finals' = apply_map_list fwmap (iloop.finals) in
     begin
-      code' := change_pointers !code' iloop.head head' iloop.finals;
+      code' := change_pointers !code' iloop.head head' [get_some @@ iloop.sb_final];
       code' := change_pointers !code' head' iloop.head finals'; 
       (!code', revmap2)
     end
 
 let unroll_inner_loops_body f code revmap =
   let is_loop_header = get_loop_headers code (f.fn_entrypoint) in
-  (* debug_flag := true; *)
   let inner_loops = get_inner_loops f code is_loop_header in
   debug "Number of loops found: %d\n" (List.length inner_loops);
   let code' = ref code in
@@ -870,7 +928,7 @@ let unroll_inner_loops_body f code revmap =
     List.iter (fun iloop ->
       let (new_code, new_revmap) = unroll_inner_loop_body !code' !revmap' iloop in
       code' := new_code; revmap' := new_revmap
-    ) inner_loops; (* debug_flag := false; *)
+    ) inner_loops;
     (!code', !revmap')
   end
 
@@ -966,6 +1024,7 @@ let tail_duplicate f =
     if !Clflags.option_ftailduplicate > 0 then
       let traces = select_traces code entrypoint in
       let preds = get_predecessors_rtl code in
-      superblockify_traces code preds traces revmap
+      let is_loop_header = get_loop_headers code entrypoint in
+      superblockify_traces code preds is_loop_header traces revmap
     else (code, revmap) in
   ((code, entrypoint), revmap)
diff --git a/backend/LICMaux.ml b/backend/LICMaux.ml
index 6283e129..602d078d 100644
--- a/backend/LICMaux.ml
+++ b/backend/LICMaux.ml
@@ -255,8 +255,8 @@ let rewrite_loop_body (last_alloc : reg ref)
                                (List.map (map_reg mapper) args),
                                new_res));
                 PTree.set res new_res mapper
-             | Iload(_, chunk, addr, args, res, pc')
-             | Istore(chunk, addr, args, res, pc')
+             | Iload(_, chunk, addr, args, v, pc')
+             | Istore(chunk, addr, args, v, pc')
                   when Archi.has_notrap_loads &&
                        !Clflags.option_fnontrap_loads ->
                 let new_res = P.succ !last_alloc in
@@ -264,7 +264,7 @@ let rewrite_loop_body (last_alloc : reg ref)
                 add_inj (INJload(chunk, addr,
                                  (List.map (map_reg mapper) args),
                                  new_res));
-                PTree.set res new_res mapper
+                PTree.set v new_res mapper
              | _ -> mapper in
            List.iter (fun x ->
                if PSet.contains loop_body x
diff --git a/backend/Lineartyping.v b/backend/Lineartyping.v
index 3fe61470..22658fb7 100644
--- a/backend/Lineartyping.v
+++ b/backend/Lineartyping.v
@@ -324,7 +324,6 @@ Local Opaque mreg_type.
  
     apply wt_setreg; auto; try (apply wt_undef_regs; auto).
     eapply Val.has_subtype; eauto.
-    
     change ty_res with (snd (ty_args, ty_res)). rewrite <- TYOP. eapply type_of_operation_sound; eauto.
     red; intros; subst op. simpl in ISMOVE.
     destruct args; try discriminate. destruct args; discriminate.
diff --git a/backend/PrintLTL.ml b/backend/PrintLTL.ml
index 4c172020..87e8a1fc 100644
--- a/backend/PrintLTL.ml
+++ b/backend/PrintLTL.ml
@@ -133,10 +133,10 @@ let print_program pp (prog: LTL.program) =
 
 let destination : string option ref = ref None
 
-let print_if prog =
+let print_if passno prog =
   match !destination with
   | None -> ()
   | Some f ->
-      let oc = open_out f in
+      let oc = open_out (f ^ "." ^ Z.to_string passno) in
       print_program oc prog;
       close_out oc
diff --git a/backend/Tunneling.v b/backend/Tunneling.v
index 78458582..269ebb6f 100644
--- a/backend/Tunneling.v
+++ b/backend/Tunneling.v
@@ -3,6 +3,7 @@
 (*              The Compcert verified compiler                         *)
 (*                                                                     *)
 (*          Xavier Leroy, INRIA Paris-Rocquencourt                     *)
+(*          Sylvain Boulmé  Grenoble-INP, VERIMAG                      *)
 (*                                                                     *)
 (*  Copyright Institut National de Recherche en Informatique et en     *)
 (*  Automatique.  All rights reserved.  This file is distributed       *)
@@ -12,7 +13,7 @@
 
 (** Branch tunneling (optimization of branches to branches). *)
 
-Require Import Coqlib Maps UnionFind.
+Require Import Coqlib Maps Errors.
 Require Import AST.
 Require Import LTL.
 
@@ -21,10 +22,10 @@ Require Import LTL.
   so that they jump directly to the end of the branch sequence.
   For example:
 <<
-     L1: nop L2;                          L1: nop L3;
-     L2; nop L3;               becomes    L2: nop L3;
+     L1: if (cond) nop L2;                L1: nop L3;
+     L2: nop L3;               becomes    L2: nop L3;
      L3: instr;                           L3: instr;
-     L4: if (cond) goto L1;               L4: if (cond) goto L3;
+     L4: if (cond) goto L1;               L4: if (cond) nop L1;
 >>
   This optimization can be applied to several of our intermediate
   languages.  We choose to perform it on the [LTL] language,
@@ -37,11 +38,14 @@ Require Import LTL.
   dead code (as the "nop L3" in the example above).
 *)
 
-(** The naive implementation of branch tunneling would replace
-  any branch to a node [pc] by a branch to the node
-  [branch_target f pc], defined as follows:
+(** The implementation consists in two passes: the first pass
+    records the branch t of each "nop"
+    and the second pass replace any "nop" node to [pc]
+    by a branch to a "nop" at [branch_t f pc]
+
+Naively, we may define [branch_t f pc] as follows:
 <<
-  branch_target f pc = branch_target f pc'  if f(pc) = nop pc'
+  branch_t f pc = branch_t f pc'  if f(pc) = nop pc'
                      = pc                   otherwise
 >>
   However, this definition can fail to terminate if
@@ -50,56 +54,114 @@ Require Import LTL.
      L1: nop L1;
 >>
   or
-<<   L1: nop L2;
+<<
+     L1: nop L2;
      L2: nop L1;
 >>
   Coq warns us of this fact by not accepting the definition
-  of [branch_target] above.
+  of [branch_t] above.
+
+  To handle this problem, we use a union-find data structure, adding equalities [pc = pc']
+  for every instruction [pc: nop pc'] in the function.
+
+  Moreover, because the elimination of "useless" [Lcond] depends on the current [uf] datastructure,
+  we need to iterate until we reach a fixpoint.
+
+  Actually, it is simpler and more efficient to perform this in an external oracle, that also returns a measure
+  in order to help the proof.
+
+  A verifier checks that this data-structure is correct.
+*)
+
+Definition UF := PTree.t (node * Z).
 
-  To handle this problem, we proceed in two passes.  The first pass
-  populates a union-find data structure, adding equalities [pc = pc']
-  for every instruction [pc: nop pc'] in the function. *)
+(* The oracle returns a map of "nop" node to their target with a distance (ie the number of the "nop" node on the path) to the target. *)
+Axiom branch_target: LTL.function -> UF. 
+Extract Constant branch_target => "Tunnelingaux.branch_target".
 
-Module U := UnionFind.UF(PTree).
+Local Open Scope error_monad_scope.
 
-Definition record_goto (uf: U.t) (pc: node) (b: bblock) : U.t :=
-  match b with
-  | Lbranch s :: _ => U.union uf pc s
-  | _ => uf
+Definition get (td: UF) pc:node*Z :=
+  match td!pc with
+  | Some (t,d) => (t,Z.abs d)
+  | _ => (pc,0)
   end.
 
-Definition record_gotos (f: LTL.function) : U.t :=
-  PTree.fold record_goto f.(fn_code) U.empty.
+Definition target (td: UF) (pc:node): node := fst (get td pc).
+Coercion target: UF >-> Funclass.
+
+(* we check that the domain of [td] is included in the domain of [c] *)
+Definition check_included (td: UF) (c: code): option bblock
+  := PTree.fold (fun (ok:option bblock) pc _ => if ok then c!pc else None) td (Some nil).
+
+(* we check the validity of targets and their bound:
+   the distance of a "nop" node (w.r.t to the target) must be greater than the one of its parents.
+*)
+Definition check_bblock (td: UF) (pc:node) (bb: bblock): res unit
+ := match td!pc with
+    | None => OK tt
+    | Some (tpc, dpc) =>
+       let dpc := Z.abs dpc in
+       match bb with
+       | Lbranch s ::_ =>
+         let (ts, ds) := get td s in 
+         if peq tpc ts then
+            if zlt ds dpc then OK tt
+            else Error (msg "bad distance in Lbranch")
+         else Error (msg "invalid skip of Lbranch")
+       | Lcond _ _ s1 s2 _ :: _ =>
+          let (ts1, ds1) := get td s1 in
+          let (ts2, ds2) := get td s2 in
+          if peq tpc ts1 then
+            if peq tpc ts2 then
+              if zlt ds1 dpc then
+                if zlt ds2 dpc then OK tt
+                else Error (msg "bad distance on else branch")
+              else Error (msg "bad distance on then branch")
+            else Error (msg "invalid skip of else branch")
+          else Error (msg "invalid skip of then branch")
+      | _ => Error (msg "cannot skip this block")
+      end
+   end.
+
+Definition check_code (td: UF) (c:code): res unit
+  := PTree.fold (fun ok pc bb => do _ <- ok; check_bblock td pc bb) c (OK tt).
 
 (** The second pass rewrites all LTL instructions, replacing every
-  successor [s] of every instruction by the canonical representative
+  successor [s] of every instruction by [t s], the canonical representative
   of its equivalence class in the union-find data structure. *)
 
-Definition tunnel_instr (uf: U.t) (i: instruction) : instruction :=
+Definition tunnel_instr (t: node -> node) (i: instruction) : instruction :=
   match i with
-  | Lbranch s => Lbranch (U.repr uf s)
+  | Lbranch s => Lbranch (t s)
   | Lcond cond args s1 s2 info =>
-      let s1' := U.repr uf s1 in let s2' := U.repr uf s2 in
+      let s1' := t s1 in let s2' := t s2 in
       if peq s1' s2'
       then Lbranch s1'
       else Lcond cond args s1' s2' info
-  | Ljumptable arg tbl => Ljumptable arg (List.map (U.repr uf) tbl)
+  | Ljumptable arg tbl => Ljumptable arg (List.map t tbl)
   | _ => i
   end.
 
-Definition tunnel_block (uf: U.t) (b: bblock) : bblock :=
-  List.map (tunnel_instr uf) b.
+Definition tunnel_block (t: node -> node) (b: bblock) : bblock :=
+  List.map (tunnel_instr t) b.
 
-Definition tunnel_function (f: LTL.function) : LTL.function :=
-  let uf := record_gotos f in
-  mkfunction
-    (fn_sig f)
-    (fn_stacksize f)
-    (PTree.map1 (tunnel_block uf) (fn_code f))
-    (U.repr uf (fn_entrypoint f)).
+Definition tunnel_function (f: LTL.function) : res LTL.function :=
+  let td := branch_target f in
+  let c := (fn_code f) in
+  if check_included td c then 
+    do _ <- check_code td c ; OK
+    (mkfunction
+      (fn_sig f)
+      (fn_stacksize f)
+      (PTree.map1 (tunnel_block td) c)
+      (td (fn_entrypoint f)))
+  else 
+   Error (msg "Some node of the union-find is not in the CFG")
+   .
 
-Definition tunnel_fundef (f: LTL.fundef) : LTL.fundef :=
-  transf_fundef tunnel_function f.
+Definition tunnel_fundef (f: fundef) : res fundef :=
+  transf_partial_fundef tunnel_function f.
 
-Definition transf_program (p: LTL.program) : LTL.program :=
-  transform_program tunnel_fundef p.
+Definition transf_program (p: program) : res program :=
+  transform_partial_program tunnel_fundef p.
diff --git a/backend/Tunnelingaux.ml b/backend/Tunnelingaux.ml
new file mode 100644
index 00000000..87e6d303
--- /dev/null
+++ b/backend/Tunnelingaux.ml
@@ -0,0 +1,283 @@
+(* *************************************************************)
+(*                                                             *)
+(*             The Compcert verified compiler                  *)
+(*                                                             *)
+(*           Sylvain Boulmé  Grenoble-INP, VERIMAG             *)
+(*                                                             *)
+(*  Copyright VERIMAG. All rights reserved.                    *)
+(*  This file is distributed under the terms of the INRIA      *)
+(*  Non-Commercial License Agreement.                          *)
+(*                                                             *)
+(* *************************************************************)
+
+(*
+
+This file implements the [branch_target] oracle that identifies "nop" branches in a LTL function,
+and computes their target node with the distance (ie the number of cummulated nops) toward this target.
+
+See [Tunneling.v]
+
+*)
+
+open Coqlib
+open LTL
+open Maps
+open Camlcoq
+
+let limit_tunneling = None (* for debugging: [Some x] limit the number of iterations *)
+let debug_flag = ref false
+let final_dump = false   (* set to true to have a more verbose debugging *)
+
+let debug fmt =
+  if !debug_flag then Printf.eprintf fmt
+  else Printf.ifprintf stderr fmt
+
+exception BugOnPC of int
+
+(* type of labels in the cfg *)
+type label = int * P.t
+
+(* instructions under analyzis *)
+type simple_inst = (* a simplified view of LTL instructions *)
+  LBRANCH of node
+| LCOND of node * node
+| OTHER
+and node = {
+    lab : label;
+    mutable inst: simple_inst;
+    mutable link: node; (* link in the union-find: itself for non "nop"-nodes, target of the "nop" otherwise *)
+    mutable dist: int;
+    mutable tag: int
+  }
+
+(* type of the (simplified) CFG *)
+type cfg = {
+    nodes: (int, node) Hashtbl.t;
+    mutable rems: node list; (* remaining conditions that may become lbranch or not *)
+    mutable num_rems: int;
+    mutable iter_num: int (* number of iterations in elimination of conditions *)
+  }
+
+let lab_i (n: node): int = fst n.lab
+let lab_p (n: node): P.t = snd n.lab
+
+let rec target c n = (* inspired from the "find" of union-find algorithm *)
+  match n.inst with
+  | LCOND(s1,s2) ->
+     if n.link != n
+     then update c n
+     else if n.tag < c.iter_num then (
+       (* we try to change the condition ... *)
+       n.tag <- c.iter_num; (* ... but at most once by iteration *)
+       let ts1 = target c s1 in
+       let ts2 = target c s2 in
+       if ts1 == ts2 then (n.link <- ts1; ts1) else n
+     ) else n
+  | _ ->
+     if n.link != n
+     then update c n
+     else n
+and update c n =
+  let t = target c n.link in
+  n.link <- t; t
+
+let get_node c p =
+  let li = P.to_int p in
+  try
+    Hashtbl.find c.nodes li
+  with
+    Not_found ->
+      let rec n = { lab = (li, p); inst = OTHER; link = n ; dist = 0; tag = 0 }  in
+      Hashtbl.add c.nodes li n;
+      n
+
+let set_branch c p s =
+  let li = P.to_int p in
+  try
+    let n = Hashtbl.find c.nodes li in
+    n.inst <- LBRANCH s;
+    n.link <- target c s
+  with
+    Not_found ->
+      let n = { lab = (li,p); inst = LBRANCH s; link = target c s; dist = 0; tag = 0 } in
+      Hashtbl.add c.nodes li n
+
+
+(* build [c.nodes] and accumulate in [acc] conditions at beginning of LTL basic-blocks *)
+let build_simplified_cfg c acc pc bb =
+  match bb with
+  | Lbranch s :: _ ->
+     let ns = get_node c s in
+     set_branch c pc ns;
+     acc
+  | Lcond (_, _, s1, s2, _) :: _ ->
+     c.num_rems <- c.num_rems + 1;
+     let ns1 = get_node c s1 in
+     let ns2 = get_node c s2 in
+     let npc = get_node c pc in
+     npc.inst <- LCOND(ns1, ns2);
+     npc::acc
+  | _ -> acc
+
+(* try to change a condition into a branch
+[acc] is the current accumulator of conditions to consider in the next iteration of repeat_change_cond
+*)
+let try_change_cond c acc pc =
+  match pc.inst with
+  | LCOND(s1,s2) ->
+     let ts1 = target c s1 in
+     let ts2 = target c s2 in
+     if ts1 == ts2 then (
+       pc.link <- ts1;
+       c.num_rems <- c.num_rems - 1;
+       acc
+     ) else
+       pc::acc
+  | _  -> raise (BugOnPC (lab_i pc)) (* LCOND expected *)
+
+(* repeat [try_change_cond] until no condition is changed into a branch *)
+let rec repeat_change_cond c =
+  c.iter_num <- c.iter_num + 1;
+  debug "++ Tunneling.branch_target %d: remaining number of conds to consider = %d\n" (c.iter_num) (c.num_rems);
+  let old =  c.num_rems in
+  c.rems <- List.fold_left (try_change_cond c) [] c.rems;
+  let curr = c.num_rems in
+  let continue =
+    match limit_tunneling with
+    | Some n -> curr < old && c.iter_num < n
+    | None -> curr < old
+  in
+  if continue
+  then repeat_change_cond c
+
+
+(* compute the final distance of each nop nodes to its target *)
+let undef_dist = -1
+let self_dist = undef_dist-1
+let rec dist n =
+  if n.dist = undef_dist
+  then (
+    n.dist <- self_dist; (* protection against an unexpected loop in the data-structure *)
+    n.dist <-
+      (match n.inst with
+       | OTHER -> 0
+       | LBRANCH p -> 1 + dist p
+       | LCOND (p1,p2) -> 1 + (max (dist p1) (dist p2)));
+    n.dist
+  ) else if n.dist=self_dist then raise (BugOnPC (lab_i n))
+    else n.dist
+
+let final_export f c =
+  let count = ref 0 in
+  let filter_nops_init_dist _ n acc =
+    let tn = target c n in
+    if tn == n
+    then (
+      n.dist <- 0; (* force [n] to be a base case in the recursion of [dist] *)
+      acc
+    ) else (
+      n.dist <- undef_dist; (* force [dist] to compute the actual [n.dist] *)
+      count := !count+1;
+      n::acc
+    )
+  in
+  let nops = Hashtbl.fold filter_nops_init_dist c.nodes [] in
+  let res = List.fold_left (fun acc n -> PTree.set (lab_p n) (lab_p n.link, Z.of_uint (dist n)) acc) PTree.empty nops in
+  debug "* Tunneling.branch_target: final number of eliminated nops = %d\n" !count;
+  res
+
+(*********************************************)
+(*** START: printing and debugging functions *)
+
+let string_of_labeli nodes ipc =
+  try
+    let pc = Hashtbl.find nodes ipc in
+    if pc.link == pc
+    then Printf.sprintf "(Target@%d)" (dist pc)
+    else Printf.sprintf "(Nop %d @%d)" (lab_i pc.link) (dist pc)
+  with
+    Not_found -> ""
+
+let print_bblock c println (pc, bb) =
+  match bb with
+  | Lbranch s::_ -> (if println then debug "\n"); debug "%d:Lbranch %d %s\n" pc (P.to_int s) (string_of_labeli c.nodes pc); false
+  | Lcond (_, _, s1, s2, _)::_ -> (if println then debug "\n"); debug "%d:Lcond (%d,%d) %s\n" pc (P.to_int s1) (P.to_int s2) (string_of_labeli c.nodes pc); false
+  | _ -> debug "%d " pc; true
+
+
+let print_cfg f c  =
+  let a = Array.of_list (PTree.fold (fun acc pc bb -> (P.to_int pc,bb)::acc) f.fn_code []) in
+  Array.fast_sort (fun (i1,_) (i2,_) -> i2 - i1) a;
+  let ep = P.to_int f.fn_entrypoint in
+  debug "entrypoint: %d %s\n" ep (string_of_labeli c.nodes ep);
+  let println = Array.fold_left (print_bblock c) false a in
+  (if println then debug "\n");debug "remaining cond:";
+  List.iter (fun n -> debug "%d " (lab_i n)) c.rems;
+  debug "\n"
+
+(*************************************************************)
+(* Copy-paste of the extracted code of the verifier          *)
+(* with [raise (BugOnPC (P.to_int pc))] instead of [Error.*] *)
+
+let get td pc =
+  match PTree.get pc td with
+  | Some p -> let (t0, d) = p in (t0, d)
+  | None -> (pc, Z.of_uint 0)
+
+let check_bblock td pc bb =
+  match PTree.get pc td with
+  | Some p ->
+    let (tpc, dpc) = p in
+    let dpc0 = dpc in
+    (match bb with
+     | [] ->
+       raise (BugOnPC (P.to_int pc))
+     | i :: _ ->
+       (match i with
+        | Lbranch s ->
+          let (ts, ds) = get td s in
+          if peq tpc ts
+          then if zlt ds dpc0
+               then ()
+               else raise (BugOnPC (P.to_int pc))
+          else raise (BugOnPC (P.to_int pc))
+        | Lcond (_, _, s1, s2, _) ->
+          let (ts1, ds1) = get td s1 in
+          let (ts2, ds2) = get td s2 in
+          if peq tpc ts1
+          then if peq tpc ts2
+               then if zlt ds1 dpc0
+                    then if zlt ds2 dpc0
+                         then ()
+                         else raise (BugOnPC (P.to_int pc))
+                    else raise (BugOnPC (P.to_int pc))
+               else raise (BugOnPC (P.to_int pc))
+          else raise (BugOnPC (P.to_int pc))
+        | _ ->
+          raise (BugOnPC (P.to_int pc))))
+  | None -> ()
+
+(** val check_code : coq_UF -> code -> unit res **)
+
+let check_code td c =
+  PTree.fold (fun _ pc bb -> check_bblock td pc bb) c (())
+
+(*** END: copy-paste & debugging functions *******)
+
+let branch_target f =
+  debug "* Tunneling.branch_target: starting on a new function\n";
+  if limit_tunneling <> None then debug "* WARNING: limit_tunneling <> None\n";
+  let c = { nodes = Hashtbl.create 100; rems = []; num_rems = 0; iter_num = 0 } in
+  c.rems <- PTree.fold (build_simplified_cfg c) f.fn_code [];
+  repeat_change_cond c;
+  let res = final_export f c in
+  if !debug_flag then (
+    try
+      check_code res f.fn_code;
+      if final_dump then print_cfg f c;
+    with e -> (
+      print_cfg f c;
+      check_code res f.fn_code
+    )
+  );
+  res
diff --git a/backend/Tunnelingproof.v b/backend/Tunnelingproof.v
index cdf6c800..126b7b87 100644
--- a/backend/Tunnelingproof.v
+++ b/backend/Tunnelingproof.v
@@ -3,6 +3,7 @@
 (*              The Compcert verified compiler                         *)
 (*                                                                     *)
 (*          Xavier Leroy, INRIA Paris-Rocquencourt                     *)
+(*          Sylvain Boulmé  Grenoble-INP, VERIMAG                      *)
 (*                                                                     *)
 (*  Copyright Institut National de Recherche en Informatique et en     *)
 (*  Automatique.  All rights reserved.  This file is distributed       *)
@@ -12,131 +13,163 @@
 
 (** Correctness proof for the branch tunneling optimization. *)
 
-Require Import Coqlib Maps UnionFind.
+Require Import Coqlib Maps Errors.
 Require Import AST Linking.
 Require Import Values Memory Events Globalenvs Smallstep.
 Require Import Op Locations LTL.
 Require Import Tunneling.
 
-Definition match_prog (p tp: program) :=
-  match_program (fun ctx f tf => tf = tunnel_fundef f) eq p tp.
+Local Open Scope nat.
 
-Lemma transf_program_match:
-  forall p, match_prog p (transf_program p).
+
+(** * Properties of the branch_target, when the verifier succeeds *)
+
+Definition check_included_spec (c:code) (td:UF) (ok: option bblock) :=
+   ok <> None -> forall pc, c!pc = None -> td!pc = None.
+
+Lemma check_included_correct (td: UF) (c: code): 
+  check_included_spec c td (check_included td c).
+Proof. 
+  apply PTree_Properties.fold_rec with (P := check_included_spec c).
+- (* extensionality *)
+  unfold check_included_spec. intros m m' a EQ IND X pc. rewrite <- EQ; auto.
+- (* base case *)
+  intros _ pc.  rewrite PTree.gempty; try congruence.
+- (* inductive case *)
+  unfold check_included_spec.
+  intros m [|] pc bb NEW ATPC IND; simpl; try congruence.
+  intros H pc0. rewrite PTree.gsspec; destruct (peq _ _); subst; simpl; try congruence.
+  intros; eapply IND; try congruence.
+Qed.
+
+Inductive target_bounds (target: node -> node) (bound: node -> nat) (pc: node): (option bblock) -> Prop :=
+ | TB_default (TB: target pc = pc) ob
+     : target_bounds target bound pc ob
+ | TB_branch s bb
+     (EQ: target pc = target s)
+     (DECREASE: bound s < bound pc)
+     : target_bounds target bound pc (Some (Lbranch s::bb))
+ | TB_cond cond args s1 s2 info bb
+     (EQ1: target pc = target s1)
+     (EQ2: target pc = target s2)
+     (DEC1: bound s1 < bound pc)
+     (DEC2: bound s2 < bound pc)
+     : target_bounds target bound pc (Some (Lcond cond args s1 s2 info::bb))
+ .
+Local Hint Resolve TB_default: core.
+
+Lemma target_None (td:UF) (pc: node): td!pc = None -> td pc = pc.
 Proof.
-  intros. eapply match_transform_program; eauto.
+  unfold target, get. intros H; rewrite H; auto.
 Qed.
+Local Hint Resolve target_None Z.abs_nonneg: core.
 
-(** * Properties of the branch map computed using union-find. *)
+Lemma get_nonneg td pc t d: get td pc = (t, d) -> (0 <= d)%Z.
+Proof.
+  unfold get. destruct (td!_) as [(t0&d0)|]; intros H; inversion H; subst; simpl; omega || auto.
+Qed.
+Local Hint Resolve get_nonneg: core.
 
-(** A variant of [record_goto] that also incrementally computes a measure [f: node -> nat]
-  counting the number of [Lnop] instructions starting at a given [pc] that were eliminated. *)
+Definition bound (td: UF) (pc: node) := Z.to_nat (snd (get td pc)).
 
-Definition measure_edge (u: U.t) (pc s: node) (f: node -> nat) : node -> nat :=
-  fun x => if peq (U.repr u s) pc then f x
-           else if peq (U.repr u x) pc then (f x + f s + 1)%nat
-           else f x.
+Lemma check_bblock_correct (td:UF) (pc:node) (bb: bblock):
+  check_bblock td pc bb = OK tt -> 
+  target_bounds (target td) (bound td) pc (Some bb).
+Proof.
+  unfold check_bblock, bound.
+  destruct (td!pc) as [(tpc&dpc)|] eqn:Hpc; auto.
+  assert (Tpc: td pc = tpc). { unfold target, get; rewrite Hpc; simpl; auto. }
+  assert (Dpc: snd (get td pc) = Z.abs dpc). { unfold get; rewrite Hpc; simpl; auto. }
+  destruct bb as [|[ ] bb]; simpl; try congruence.
+  + destruct (get td s) as (ts, ds) eqn:Hs.
+    repeat (destruct (peq _ _) || destruct (zlt _ _)); simpl; try congruence.
+    intros; apply TB_branch.
+    * rewrite Tpc. unfold target; rewrite Hs; simpl; auto.
+    * rewrite Dpc, Hs; simpl. apply Z2Nat.inj_lt; eauto.
+  + destruct (get td s1) as (ts1, ds1) eqn:Hs1.
+    destruct (get td s2) as (ts2, ds2) eqn:Hs2.
+    repeat (destruct (peq _ _) || destruct (zlt _ _)); simpl; try congruence.
+    intros; apply TB_cond.
+    * rewrite Tpc. unfold target; rewrite Hs1; simpl; auto.
+    * rewrite Tpc. unfold target; rewrite Hs2; simpl; auto.
+    * rewrite Dpc, Hs1; simpl. apply Z2Nat.inj_lt; eauto.
+    * rewrite Dpc, Hs2; simpl. apply Z2Nat.inj_lt; eauto.
+Qed.
 
-Definition record_goto' (uf: U.t * (node -> nat)) (pc: node) (b: bblock) : U.t * (node -> nat) :=
-  match b with
-  | Lbranch s :: b' => let (u, f) := uf in (U.union u pc s, measure_edge u pc s f)
-  | _ => uf
-  end.
+Definition check_code_spec (td:UF) (c:code) (ok: res unit) :=
+   ok = OK tt -> forall pc bb, c!pc = Some bb -> target_bounds (target td) (bound td) pc (Some bb).
 
-Definition branch_map_correct (c: code) (uf: U.t * (node -> nat)): Prop :=
-  forall pc,
-  match c!pc with
-  | Some(Lbranch s :: b) =>
-      U.repr (fst uf) pc = pc \/ (U.repr (fst uf) pc = U.repr (fst uf) s /\ snd uf s < snd uf pc)%nat
-  | _ =>
-      U.repr (fst uf) pc = pc
-  end.
+Lemma check_code_correct (td:UF) c:
+   check_code_spec td c (check_code td c).
+Proof.
+  apply PTree_Properties.fold_rec with (P := check_code_spec td).
+- (* extensionality *)
+  unfold check_code_spec. intros m m' a EQ IND X pc bb; subst. rewrite  <- ! EQ; eauto.
+- (* base case *)
+  intros _ pc.  rewrite PTree.gempty; try congruence.
+- (* inductive case *)
+  unfold check_code_spec.
+  intros m [[]|] pc bb NEW ATPC IND; simpl; try congruence.
+  intros H pc0 bb0. rewrite PTree.gsspec; destruct (peq _ _); subst; simpl; auto.
+  intros X; inversion X; subst.
+  apply check_bblock_correct; auto.
+Qed.
 
-Lemma record_gotos'_correct:
-  forall c,
-  branch_map_correct c (PTree.fold record_goto' c (U.empty, fun (x: node) => O)).
+Theorem branch_target_bounds:
+  forall f tf pc, 
+  tunnel_function f = OK tf ->
+  target_bounds (branch_target f) (bound (branch_target f)) pc (f.(fn_code)!pc).
 Proof.
-  intros.
-  apply PTree_Properties.fold_rec with (P := fun c uf => branch_map_correct c uf).
+  unfold tunnel_function; intros f f' pc.
+  destruct (check_included _ _) eqn:H1; try congruence.
+  destruct (check_code _ _) as [[]|] eqn:H2; simpl; try congruence.
+  intros _.
+  destruct ((fn_code f)!pc) eqn:X.
+  - exploit check_code_correct; eauto.
+  - exploit check_included_correct; eauto.
+    congruence.
+Qed.
 
-- (* extensionality *)
-  intros. red; intros. rewrite <- H. apply H0.
+Lemma tunnel_function_unfold:
+  forall f tf pc,
+  tunnel_function f = OK tf -> 
+  (fn_code tf)!pc = option_map (tunnel_block (branch_target f)) (fn_code f)!pc.
+Proof.
+  unfold tunnel_function; intros f f' pc.
+  destruct (check_included _ _) eqn:H1; try congruence.
+  destruct (check_code _ _) as [[]|] eqn:H2; simpl; try congruence.
+  intros X; inversion X; clear X; subst.
+  simpl. rewrite PTree.gmap1. auto.
+Qed.
 
-- (* base case *)
-  red; intros; simpl. rewrite PTree.gempty. apply U.repr_empty.
+Lemma tunnel_fundef_Internal:
+  forall f tf, tunnel_fundef (Internal f) = OK tf
+  -> exists tf', tunnel_function f = OK tf' /\ tf = Internal tf'.
+Proof.
+  intros f tf; simpl.
+  destruct (tunnel_function f) eqn:X; simpl; try congruence.
+  intros EQ; inversion EQ.
+  eexists; split; eauto.
+Qed.
 
-- (* inductive case *)
-  intros m uf pc bb; intros. destruct uf as [u f].
-  assert (PC: U.repr u pc = pc).
-    generalize (H1 pc). rewrite H. auto.
-  assert (record_goto' (u, f) pc bb = (u, f)
-          \/ exists s, exists bb', bb = Lbranch s :: bb' /\ record_goto' (u, f) pc bb = (U.union u pc s, measure_edge u pc s f)).
-    unfold record_goto'; simpl. destruct bb; auto. destruct i; auto. right. exists s; exists bb; auto.
-  destruct H2 as [B | [s [bb' [EQ B]]]].
-
-+ (* u and f are unchanged *)
-  rewrite B.
-  red. intro pc'. simpl. rewrite PTree.gsspec. destruct (peq pc' pc). subst pc'.
-  destruct bb; auto. destruct i; auto.
-  apply H1.
-
-+ (* b is Lbranch s, u becomes union u pc s, f becomes measure_edge u pc s f *)
-  rewrite B.
-  red. intro pc'. simpl. rewrite PTree.gsspec. destruct (peq pc' pc). subst pc'. rewrite EQ.
-
-* (* The new instruction *)
-  rewrite (U.repr_union_2 u pc s); auto. rewrite U.repr_union_3.
-  unfold measure_edge. destruct (peq (U.repr u s) pc). auto. right. split. auto.
-  rewrite PC. rewrite peq_true. omega.
-
-* (* An old instruction *)
-  assert (U.repr u pc' = pc' -> U.repr (U.union u pc s) pc' = pc').
-  { intro. rewrite <- H2 at 2. apply U.repr_union_1. congruence. }
-  generalize (H1 pc'). simpl. destruct (m!pc'); auto. destruct b; auto. destruct i; auto.
-  intros [P | [P Q]]. left; auto. right.
-  split. apply U.sameclass_union_2. auto.
-  unfold measure_edge. destruct (peq (U.repr u s) pc). auto.
-  rewrite P. destruct (peq (U.repr u s0) pc). omega. auto.
-Qed.
-
-Definition record_gotos' (f: function) :=
-  PTree.fold record_goto' f.(fn_code) (U.empty, fun (x: node) => O).
-
-Lemma record_gotos_gotos':
-  forall f, fst (record_gotos' f) = record_gotos f.
-Proof.
-  intros. unfold record_gotos', record_gotos.
-  repeat rewrite PTree.fold_spec.
-  generalize (PTree.elements (fn_code f)) (U.empty) (fun _ : node => O).
-  induction l; intros; simpl.
-  auto.
-  unfold record_goto' at 2. unfold record_goto at 2.
-  destruct (snd a). apply IHl. destruct i; apply IHl.
-Qed.
-
-Definition branch_target (f: function) (pc: node) : node :=
-  U.repr (record_gotos f) pc.
-
-Definition count_gotos (f: function) (pc: node) : nat :=
-  snd (record_gotos' f) pc.
-
-Theorem record_gotos_correct:
-  forall f pc,
-  match f.(fn_code)!pc with
-  | Some(Lbranch s :: b) =>
-       branch_target f pc = pc \/
-       (branch_target f pc = branch_target f s /\ count_gotos f s < count_gotos f pc)%nat
-  | _ => branch_target f pc = pc
-  end.
+Lemma tunnel_fundef_External:
+  forall tf ef, tunnel_fundef (External ef) = OK tf
+  -> tf = External ef.
 Proof.
-  intros.
-  generalize (record_gotos'_correct f.(fn_code) pc). simpl.
-  fold (record_gotos' f). unfold branch_map_correct, branch_target, count_gotos.
-  rewrite record_gotos_gotos'. auto.
+  intros tf ef; simpl. intros H; inversion H; auto.
 Qed.
 
 (** * Preservation of semantics *)
 
+Definition match_prog (p tp: program) :=
+  match_program (fun _ f tf => tunnel_fundef f = OK tf) eq p tp.
+
+Lemma transf_program_match:
+  forall prog tprog, transf_program prog = OK tprog -> match_prog prog tprog.
+Proof.
+  intros. eapply match_transform_partial_program_contextual; eauto.
+Qed.
+
 Section PRESERVATION.
 
 Variables prog tprog: program.
@@ -145,32 +178,65 @@ Let ge := Genv.globalenv prog.
 Let tge := Genv.globalenv tprog.
 
 Lemma functions_translated:
-  forall v f,
+  forall (v: val) (f: fundef),
   Genv.find_funct ge v = Some f ->
-  Genv.find_funct tge v = Some (tunnel_fundef f).
-Proof (Genv.find_funct_transf TRANSL).
+  exists tf, tunnel_fundef f = OK tf /\ Genv.find_funct tge v = Some tf.
+Proof.
+  intros. exploit (Genv.find_funct_match TRANSL); eauto.
+  intros (cu & tf & A & B & C).
+  repeat eexists; intuition eauto.
+Qed.
 
 Lemma function_ptr_translated:
   forall v f,
   Genv.find_funct_ptr ge v = Some f ->
-  Genv.find_funct_ptr tge v = Some (tunnel_fundef f).
-Proof (Genv.find_funct_ptr_transf TRANSL).
+  exists tf,
+  Genv.find_funct_ptr tge v = Some tf /\ tunnel_fundef f = OK tf.
+Proof.
+  intros.
+  exploit (Genv.find_funct_ptr_transf_partial TRANSL); eauto.
+Qed.
 
-Lemma symbols_preserved:
-  forall id,
-  Genv.find_symbol tge id = Genv.find_symbol ge id.
-Proof (Genv.find_symbol_transf TRANSL).
+Lemma symbols_preserved s: Genv.find_symbol tge s = Genv.find_symbol ge s.
+Proof.
+  rewrite <- (Genv.find_symbol_match TRANSL). reflexivity.
+Qed.
 
 Lemma senv_preserved:
   Senv.equiv ge tge.
-Proof (Genv.senv_transf TRANSL).
+Proof.
+  eapply (Genv.senv_match TRANSL).
+Qed.
 
 Lemma sig_preserved:
-  forall f, funsig (tunnel_fundef f) = funsig f.
+  forall f tf, tunnel_fundef f = OK tf -> funsig tf = funsig f.
 Proof.
-  destruct f; reflexivity.
+  intros. destruct f.
+  - simpl in H. monadInv H. unfold tunnel_function in EQ. 
+    destruct (check_included _ _); try congruence.
+    monadInv EQ. simpl; auto.
+  - simpl in H. monadInv H. reflexivity.
 Qed.
 
+Lemma fn_stacksize_preserved:
+  forall f tf, tunnel_function f = OK tf -> fn_stacksize tf = fn_stacksize f.
+Proof.
+  intros f tf; unfold tunnel_function.
+  destruct (check_included _ _); try congruence.
+  destruct (check_code _ _); simpl; try congruence.
+  intros H; inversion H; simpl; auto.
+Qed.
+
+Lemma fn_entrypoint_preserved:
+  forall f tf, tunnel_function f = OK tf -> fn_entrypoint tf = branch_target f (fn_entrypoint f).
+Proof.
+  intros f tf; unfold tunnel_function.
+  destruct (check_included _ _); try congruence.
+  destruct (check_code _ _); simpl; try congruence.
+  intros H; inversion H; simpl; auto.
+Qed.
+
+
 (** The proof of semantic preservation is a simulation argument
   based on diagrams of the following form:
 <<
@@ -185,7 +251,7 @@ Qed.
   between states [st1] and [st2], as well as the postcondition between
   [st1'] and [st2'].  One transition in the source code (left) can correspond
   to zero or one transition in the transformed code (right).  The
-  "zero transition" case occurs when executing a [Lgoto] instruction
+  "zero transition" case occurs when executing a [Lnop] instruction
   in the source code that has been removed by tunneling.
 
   In the definition of [match_states], what changes between the original and
@@ -194,52 +260,52 @@ Qed.
   and memory states, since some [Vundef] values can become more defined
   as a consequence of eliminating useless [Lcond] instructions. *)
 
-Definition tunneled_block (f: function) (b: bblock) :=
-  tunnel_block (record_gotos f) b.
-
-Definition tunneled_code (f: function) :=
-  PTree.map1 (tunneled_block f) (fn_code f).
-
 Definition locmap_lessdef (ls1 ls2: locset) : Prop :=
   forall l, Val.lessdef (ls1 l) (ls2 l).
 
 Inductive match_stackframes: stackframe -> stackframe -> Prop :=
   | match_stackframes_intro:
-      forall f sp ls0 bb tls0,
+      forall f tf sp ls0 bb tls0,
       locmap_lessdef ls0 tls0 ->
+      tunnel_function f = OK tf ->
       match_stackframes
          (Stackframe f sp ls0 bb)
-         (Stackframe (tunnel_function f) sp tls0 (tunneled_block f bb)).
+         (Stackframe tf sp tls0 (tunnel_block (branch_target f) bb)).
 
 Inductive match_states: state -> state -> Prop :=
   | match_states_intro:
-      forall s f sp pc ls m ts tls tm
+      forall s f tf sp pc ls m ts tls tm
         (STK: list_forall2 match_stackframes s ts)
         (LS: locmap_lessdef ls tls)
-        (MEM: Mem.extends m tm),
+        (MEM: Mem.extends m tm)
+        (TF: tunnel_function f = OK tf),
       match_states (State s f sp pc ls m)
-                   (State ts (tunnel_function f) sp (branch_target f pc) tls tm)
+                   (State ts tf sp (branch_target f pc) tls tm)
   | match_states_block:
-      forall s f sp bb ls m ts tls tm
+      forall s f tf sp bb ls m ts tls tm
         (STK: list_forall2 match_stackframes s ts)
         (LS: locmap_lessdef ls tls)
-        (MEM: Mem.extends m tm),
+        (MEM: Mem.extends m tm)
+        (TF: tunnel_function f = OK tf),
       match_states (Block s f sp bb ls m)
-                   (Block ts (tunnel_function f) sp (tunneled_block f bb) tls tm)
+                   (Block ts tf sp (tunnel_block (branch_target f) bb) tls tm)
   | match_states_interm:
-      forall s f sp pc bb ls m ts tls tm
+      forall s f tf sp pc i bb ls m ts tls tm
         (STK: list_forall2 match_stackframes s ts)
         (LS: locmap_lessdef ls tls)
-        (MEM: Mem.extends m tm),
-      match_states (Block s f sp (Lbranch pc :: bb) ls m)
-                   (State ts (tunnel_function f) sp (branch_target f pc) tls tm)
+        (MEM: Mem.extends m tm)
+        (IBRANCH: tunnel_instr (branch_target f) i = Lbranch pc)
+        (TF: tunnel_function f = OK tf),
+      match_states (Block s f sp (i :: bb) ls m)
+                   (State ts tf sp pc tls tm)
   | match_states_call:
-      forall s f ls m ts tls tm
+      forall s f tf ls m ts tls tm
         (STK: list_forall2 match_stackframes s ts)
         (LS: locmap_lessdef ls tls)
-        (MEM: Mem.extends m tm),
+        (MEM: Mem.extends m tm)
+        (TF: tunnel_fundef f = OK tf),
       match_states (Callstate s f ls m)
-                   (Callstate ts (tunnel_fundef f) tls tm)
+                   (Callstate ts tf tls tm)
   | match_states_return:
       forall s ls m ts tls tm
         (STK: list_forall2 match_stackframes s ts)
@@ -289,22 +355,6 @@ Proof.
   induction rl as [ | r rl]; intros; simpl. auto. apply locmap_set_undef_lessdef; auto. 
 Qed.
 
-(*
-Lemma locmap_undef_lessdef:
-  forall ll ls1 ls2,
-  locmap_lessdef ls1 ls2 -> locmap_lessdef (Locmap.undef ll ls1) (Locmap.undef ll ls2).
-Proof.
-  induction ll as [ | l ll]; intros; simpl. auto. apply IHll. apply locmap_set_lessdef; auto. 
-Qed.
-
-Lemma locmap_undef_lessdef_1:
-  forall ll ls1 ls2,
-  locmap_lessdef ls1 ls2 -> locmap_lessdef (Locmap.undef ll ls1) ls2.
-Proof.
-  induction ll as [ | l ll]; intros; simpl. auto. apply IHll. apply locmap_set_undef_lessdef; auto. 
-Qed.
-*)
-
 Lemma locmap_getpair_lessdef:
   forall p ls1 ls2,
   locmap_lessdef ls1 ls2 -> Val.lessdef (Locmap.getpair p ls1) (Locmap.getpair p ls2).
@@ -348,15 +398,16 @@ Lemma find_function_translated:
   forall ros ls tls fd,
   locmap_lessdef ls tls ->
   find_function ge ros ls = Some fd ->
-  find_function tge ros tls = Some (tunnel_fundef fd).
+  exists tfd, tunnel_fundef fd = OK tfd /\ find_function tge ros tls = Some tfd.
 Proof.
   intros. destruct ros; simpl in *.
 - assert (E: tls (R m) = ls (R m)).
   { exploit Genv.find_funct_inv; eauto. intros (b & EQ). 
     generalize (H (R m)). rewrite EQ. intros LD; inv LD. auto. }
-  rewrite E. apply functions_translated; auto.
+  rewrite E. exploit functions_translated; eauto.
 - rewrite symbols_preserved. destruct (Genv.find_symbol ge i); inv H0. 
-  apply function_ptr_translated; auto.
+  exploit function_ptr_translated; eauto.
+  intros (tf & X1 & X2). exists tf; intuition.
 Qed.
 
 Lemma call_regs_lessdef:
@@ -383,11 +434,12 @@ Qed.
 
 Definition measure (st: state) : nat :=
   match st with
-  | State s f sp pc ls m => (count_gotos f pc * 2)%nat
-  | Block s f sp (Lbranch pc :: _) ls m => (count_gotos f pc * 2 + 1)%nat
-  | Block s f sp bb ls m => 0%nat
-  | Callstate s f ls m => 0%nat
-  | Returnstate s ls m => 0%nat
+  | State s f sp pc ls m => (bound (branch_target f) pc) * 2
+  | Block s f sp (Lbranch pc :: _) ls m => (bound (branch_target f) pc) * 2 + 1
+  | Block s f sp (Lcond _ _ pc1 pc2 _ :: _) ls m => (max (bound (branch_target f) pc1) (bound (branch_target f) pc2)) * 2 + 1
+  | Block s f sp bb ls m => 0
+  | Callstate s f ls m => 0
+  | Returnstate s ls m => 0
   end.
 
 Lemma match_parent_locset:
@@ -406,24 +458,23 @@ Lemma tunnel_step_correct:
   (exists st2', step tge st1' t st2' /\ match_states st2 st2')
   \/ (measure st2 < measure st1 /\ t = E0 /\ match_states st2 st1')%nat.
 Proof.
-  induction 1; intros; try inv MS.
+  induction 1; intros; try inv MS; try (simpl in IBRANCH; inv IBRANCH).
 
 - (* entering a block *)
-  assert (DEFAULT: branch_target f pc = pc ->
-    (exists st2' : state,
-     step tge (State ts (tunnel_function f) sp (branch_target f pc) tls tm) E0 st2'
-     /\ match_states (Block s f sp bb rs m) st2')).
-  { intros. rewrite H0. econstructor; split.
-    econstructor. simpl. rewrite PTree.gmap1. rewrite H. simpl. eauto.
-    econstructor; eauto. }
-
-  generalize (record_gotos_correct f pc). rewrite H.
-  destruct bb; auto. destruct i; auto.
-  intros [A | [B C]]. auto.
-  right. split. simpl. omega.
-  split. auto.
-  rewrite B. econstructor; eauto.
-
+  exploit (branch_target_bounds f tf pc); eauto.
+  rewrite H. intros X; inversion X.
+  + (* TB_default *) 
+    rewrite TB; left. econstructor; split.
+    * econstructor. simpl. erewrite tunnel_function_unfold, H ; simpl; eauto.
+    * econstructor; eauto.
+  + (* FT_branch *)
+    simpl; right.
+    rewrite EQ; repeat (econstructor; omega || eauto).
+  + (* FT_cond *)
+    simpl; right.
+    repeat (econstructor; omega || eauto); simpl.
+    apply Nat.max_case; omega.
+    destruct (peq _ _); try congruence.
 - (* Lop *)
   exploit eval_operation_lessdef. apply reglist_lessdef; eauto. eauto. eauto. 
   intros (tv & EV & LD).
@@ -485,20 +536,25 @@ Proof.
   eauto. eauto.
   econstructor; eauto using locmap_undef_regs_lessdef.
 - (* Lcall *)
-  left; simpl; econstructor; split.
-  eapply exec_Lcall with (fd := tunnel_fundef fd); eauto.
-  eapply find_function_translated; eauto.
-  rewrite sig_preserved. auto.
-  econstructor; eauto.
-  constructor; auto.
-  constructor; auto.
+  left; simpl.
+  exploit find_function_translated; eauto.
+  intros (tfd & Htfd & FIND).
+  econstructor; split.
+  + eapply exec_Lcall; eauto.
+    erewrite sig_preserved; eauto.
+  + econstructor; eauto.
+    constructor; auto.
+    constructor; auto.
 - (* Ltailcall *)
-  exploit Mem.free_parallel_extends. eauto. eauto. intros (tm' & FREE & MEM'). 
+  exploit find_function_translated. 2: eauto.
+  { eauto using return_regs_lessdef, match_parent_locset. }
+  intros (tfd & Htfd & FIND).
+  exploit Mem.free_parallel_extends. eauto. eauto. intros (tm' & FREE & MEM').
   left; simpl; econstructor; split.
-  eapply exec_Ltailcall with (fd := tunnel_fundef fd); eauto.
-  eapply find_function_translated; eauto using return_regs_lessdef, match_parent_locset.
-  apply sig_preserved.
-  econstructor; eauto using return_regs_lessdef, match_parent_locset.
+  + eapply exec_Ltailcall; eauto.
+    * eapply sig_preserved; eauto.
+    * erewrite fn_stacksize_preserved; eauto.
+  + econstructor; eauto using return_regs_lessdef, match_parent_locset.
 - (* Lbuiltin *)
   exploit eval_builtin_args_lessdef. eexact LS. eauto. eauto. intros (tvargs & EVA & LDA).
   exploit external_call_mem_extends; eauto. intros (tvres & tm' & A & B & C & D).
@@ -513,45 +569,58 @@ Proof.
   fold (branch_target f pc). econstructor; eauto.
 - (* Lbranch (eliminated) *)
   right; split. simpl. omega. split. auto. constructor; auto.
-
-- (* Lcond *)
-  simpl tunneled_block.
-  set (s1 := U.repr (record_gotos f) pc1). set (s2 := U.repr (record_gotos f) pc2).
-  destruct (peq s1 s2).
-+ left; econstructor; split.
-  eapply exec_Lbranch. 
-  destruct b.
-* constructor; eauto using locmap_undef_regs_lessdef_1.
-* rewrite e. constructor; eauto using locmap_undef_regs_lessdef_1.
-+ left; econstructor; split.
-  eapply exec_Lcond; eauto. eapply eval_condition_lessdef; eauto using reglist_lessdef.
-  destruct b; econstructor; eauto using locmap_undef_regs_lessdef.
-
+- (* Lcond (preserved) *)
+  simpl; left; destruct (peq _ _) eqn: EQ.
+  + econstructor; split.
+    eapply exec_Lbranch. 
+    destruct b.
+    * constructor; eauto using locmap_undef_regs_lessdef_1.
+    * rewrite e. constructor; eauto using locmap_undef_regs_lessdef_1.
+  + econstructor; split.
+    eapply exec_Lcond; eauto. eapply eval_condition_lessdef; eauto using reglist_lessdef.
+    destruct b; econstructor; eauto using locmap_undef_regs_lessdef.
+- (* Lcond (eliminated) *)
+  destruct (peq _ _) eqn: EQ; try inv H1.
+  right; split; simpl. 
+  + destruct b.
+    generalize (Nat.le_max_l (bound (branch_target f) pc1) (bound (branch_target f) pc2)); omega.
+    generalize (Nat.le_max_r (bound (branch_target f) pc1) (bound (branch_target f) pc2)); omega.
+  + destruct b.
+    -- repeat (constructor; auto).
+    -- rewrite e; repeat (constructor; auto).
 - (* Ljumptable *)
   assert (tls (R arg) = Vint n).
   { generalize (LS (R arg)); rewrite H; intros LD; inv LD; auto. }
   left; simpl; econstructor; split.
   eapply exec_Ljumptable.
-  eauto. rewrite list_nth_z_map. change U.elt with node. rewrite H0. reflexivity. eauto.
+  eauto. rewrite list_nth_z_map, H0; simpl; eauto. eauto.
   econstructor; eauto using locmap_undef_regs_lessdef.
 - (* Lreturn *)
   exploit Mem.free_parallel_extends. eauto. eauto. intros (tm' & FREE & MEM'). 
   left; simpl; econstructor; split.
-  eapply exec_Lreturn; eauto.
-  constructor; eauto using return_regs_lessdef, match_parent_locset.
+  + eapply exec_Lreturn; eauto.
+    erewrite fn_stacksize_preserved; eauto.
+  + constructor; eauto using return_regs_lessdef, match_parent_locset.
 - (* internal function *)
+  exploit tunnel_fundef_Internal; eauto.
+  intros (tf' & TF' & ITF). subst.
   exploit Mem.alloc_extends. eauto. eauto. apply Z.le_refl. apply Z.le_refl.
-  intros (tm' & ALLOC & MEM'). 
-  left; simpl; econstructor; split.
-  eapply exec_function_internal; eauto.
-  simpl. econstructor; eauto using locmap_undef_regs_lessdef, call_regs_lessdef.
+  intros (tm' & ALLOC & MEM').
+  left; simpl.
+  econstructor; split.
+  + eapply exec_function_internal; eauto.
+    erewrite fn_stacksize_preserved; eauto.
+  + simpl.
+    erewrite (fn_entrypoint_preserved f tf'); auto.
+    econstructor; eauto using locmap_undef_regs_lessdef, call_regs_lessdef.
 - (* external function *)
   exploit external_call_mem_extends; eauto using locmap_getpairs_lessdef.
   intros (tvres & tm' & A & B & C & D).
   left; simpl; econstructor; split.
-  eapply exec_function_external; eauto.
-  eapply external_call_symbols_preserved; eauto. apply senv_preserved.
-  simpl. econstructor; eauto using locmap_setpair_lessdef, locmap_undef_caller_save_regs_lessdef.
+  + erewrite (tunnel_fundef_External tf ef); eauto.
+    eapply exec_function_external; eauto.
+    eapply external_call_symbols_preserved; eauto. apply senv_preserved.
+  + simpl. econstructor; eauto using locmap_setpair_lessdef, locmap_undef_caller_save_regs_lessdef.
 - (* return *)
   inv STK. inv H1.
   left; econstructor; split.
@@ -564,14 +633,15 @@ Lemma transf_initial_states:
   exists st2, initial_state tprog st2 /\ match_states st1 st2.
 Proof.
   intros. inversion H.
-  exists (Callstate nil (tunnel_fundef f) (Locmap.init Vundef) m0); split.
+  exploit function_ptr_translated; eauto.
+  intros (tf & Htf & Hf).
+  exists (Callstate nil tf (Locmap.init Vundef) m0); split.
   econstructor; eauto.
-  apply (Genv.init_mem_transf TRANSL); auto.
+  apply (Genv.init_mem_transf_partial TRANSL); auto.
   rewrite (match_program_main TRANSL).
   rewrite symbols_preserved. eauto.
-  apply function_ptr_translated; auto.
-  rewrite <- H3. apply sig_preserved.
-  constructor. constructor. red; simpl; auto. apply Mem.extends_refl.
+  rewrite <- H3. apply sig_preserved. auto.
+  constructor. constructor. red; simpl; auto. apply Mem.extends_refl. auto.
 Qed.
 
 Lemma transf_final_states:
diff --git a/backend/ValueDomain.v b/backend/ValueDomain.v
index 779e7bb9..f1a46baa 100644
--- a/backend/ValueDomain.v
+++ b/backend/ValueDomain.v
@@ -2069,7 +2069,6 @@ Definition divfs := binop_single Float32.div.
 Lemma divfs_sound:
   forall v x w y, vmatch v x -> vmatch w y -> vmatch (Val.divfs v w) (divfs x y).
 Proof (binop_single_sound Float32.div).
-
 (** Conversions *)
 
 Definition zero_ext (nbits: Z) (v: aval) :=
@@ -2483,6 +2482,468 @@ Proof.
   destruct 1; simpl; auto with va.
 Qed.
 
+
+(* Extensions for KVX and Risc-V *)
+
+Definition intoffloat_total (x: aval) :=
+  match x with
+  | F f =>
+      match Float.to_int f with
+      | Some i => I i
+      | None => ntop
+      end
+  | _ => ntop1 x
+  end.
+
+Definition intuoffloat_total (x: aval) :=
+  match x with
+  | F f =>
+      match Float.to_intu f with
+      | Some i => I i
+      | None => ntop
+      end
+  | _ => ntop1 x
+  end.
+
+Definition intofsingle_total (x: aval) :=
+  match x with
+  | FS f =>
+      match Float32.to_int f with
+      | Some i => I i
+      | None => ntop
+      end
+  | _ => ntop1 x
+  end.
+
+Definition intuofsingle_total (x: aval) :=
+  match x with
+  | FS f =>
+      match Float32.to_intu f with
+      | Some i => I i
+      | None => ntop
+      end
+  | _ => ntop1 x
+  end.
+
+Definition longoffloat_total (x: aval) :=
+  match x with
+  | F f =>
+      match Float.to_long f with
+      | Some i => L i
+      | None => ntop
+      end
+  | _ => ntop1 x
+  end.
+
+Definition longuoffloat_total (x: aval) :=
+  match x with
+  | F f =>
+      match Float.to_longu f with
+      | Some i => L i
+      | None => ntop
+      end
+  | _ => ntop1 x
+  end.
+
+Definition longofsingle_total (x: aval) :=
+  match x with
+  | FS f =>
+      match Float32.to_long f with
+      | Some i => L i
+      | None => ntop
+      end
+  | _ => ntop1 x
+  end.
+
+Definition longuofsingle_total (x: aval) :=
+  match x with
+  | FS f =>
+      match Float32.to_longu f with
+      | Some i => L i
+      | None => ntop
+      end
+  | _ => ntop1 x
+  end.
+
+Lemma intoffloat_total_sound:
+  forall v x
+         (MATCH : vmatch v x),
+    vmatch (Val.maketotal (Val.intoffloat v)) (intoffloat_total x).
+Proof.
+  unfold Val.intoffloat, intoffloat_total. intros.
+  inv MATCH; simpl in *; try constructor.
+  all: destruct (Float.to_int f) as [i|] eqn:E; simpl; [auto with va | constructor].
+Qed.
+
+Lemma intuoffloat_total_sound:
+  forall v x
+         (MATCH : vmatch v x),
+    vmatch (Val.maketotal (Val.intuoffloat v)) (intuoffloat_total x).
+Proof.
+  unfold Val.intoffloat, intoffloat_total. intros.
+  inv MATCH; simpl in *; try constructor.
+  all: destruct (Float.to_intu f) as [i|] eqn:E; simpl; [auto with va | constructor].
+Qed.
+
+Lemma intofsingle_total_sound:
+  forall v x
+         (MATCH : vmatch v x),
+    vmatch (Val.maketotal (Val.intofsingle v)) (intofsingle_total x).
+Proof.
+  unfold Val.intofsingle, intofsingle_total. intros.
+  inv MATCH; simpl in *; try constructor.
+  all: destruct (Float32.to_int f) as [i|] eqn:E; simpl; [auto with va | constructor].
+Qed.
+
+Lemma intuofsingle_total_sound:
+  forall v x
+         (MATCH : vmatch v x),
+    vmatch (Val.maketotal (Val.intuofsingle v)) (intuofsingle_total x).
+Proof.
+  unfold Val.intofsingle, intofsingle_total. intros.
+  inv MATCH; simpl in *; try constructor.
+  all: destruct (Float32.to_intu f) as [i|] eqn:E; simpl; [auto with va | constructor].
+Qed.
+
+Lemma singleofint_total_sound:
+  forall v x, vmatch v x ->
+              vmatch (Val.maketotal (Val.singleofint v)) (singleofint x).
+Proof.
+  unfold Val.singleofint, singleofint; intros.
+  inv H; simpl.
+  all: auto with va.
+  all: unfold ntop1, provenance.
+  all: try constructor.
+Qed.
+
+Lemma singleofintu_total_sound:
+  forall v x, vmatch v x ->
+              vmatch (Val.maketotal (Val.singleofintu v)) (singleofintu x).
+Proof.
+  unfold Val.singleofintu, singleofintu; intros.
+  inv H; simpl.
+  all: auto with va.
+  all: unfold ntop1, provenance.
+  all: try constructor.
+Qed.
+
+Lemma longoffloat_total_sound:
+  forall v x
+         (MATCH : vmatch v x),
+    vmatch (Val.maketotal (Val.longoffloat v)) (longoffloat_total x).
+Proof.
+  unfold Val.longoffloat, longoffloat_total. intros.
+  inv MATCH; simpl in *; try constructor.
+  all: destruct (Float.to_long f) as [i|] eqn:E; simpl; [auto with va | constructor].
+Qed.
+
+Lemma longuoffloat_total_sound:
+  forall v x
+         (MATCH : vmatch v x),
+    vmatch (Val.maketotal (Val.longuoffloat v)) (longuoffloat_total x).
+Proof.
+  unfold Val.longoffloat, longoffloat_total. intros.
+  inv MATCH; simpl in *; try constructor.
+  all: destruct (Float.to_longu f) as [i|] eqn:E; simpl; [auto with va | constructor].
+Qed.
+
+Lemma longofsingle_total_sound:
+  forall v x
+         (MATCH : vmatch v x),
+    vmatch (Val.maketotal (Val.longofsingle v)) (longofsingle_total x).
+Proof.
+  unfold Val.longofsingle, longofsingle_total. intros.
+  inv MATCH; simpl in *; try constructor.
+  all: destruct (Float32.to_long f) as [i|] eqn:E; simpl; [auto with va | constructor].
+Qed.
+
+Lemma longuofsingle_total_sound:
+  forall v x
+         (MATCH : vmatch v x),
+    vmatch (Val.maketotal (Val.longuofsingle v)) (longuofsingle_total x).
+Proof.
+  unfold Val.longofsingle, longofsingle_total. intros.
+  inv MATCH; simpl in *; try constructor.
+  all: destruct (Float32.to_longu f) as [i|] eqn:E; simpl; [auto with va | constructor].
+Qed.
+
+Lemma singleoflong_total_sound:
+  forall v x, vmatch v x ->
+              vmatch (Val.maketotal (Val.singleoflong v)) (singleoflong x).
+Proof.
+  unfold Val.singleoflong, singleoflong; intros.
+  inv H; simpl.
+  all: auto with va.
+  all: unfold ntop1, provenance.
+  all: try constructor.
+Qed.
+
+Lemma singleoflongu_total_sound:
+  forall v x, vmatch v x ->
+              vmatch (Val.maketotal (Val.singleoflongu v)) (singleoflongu x).
+Proof.
+  unfold Val.singleoflongu, singleoflongu; intros.
+  inv H; simpl.
+  all: auto with va.
+  all: unfold ntop1, provenance.
+  all: try constructor.
+Qed.
+
+Lemma floatoflong_total_sound:
+  forall v x, vmatch v x ->
+              vmatch (Val.maketotal (Val.floatoflong v)) (floatoflong x).
+Proof.
+  unfold Val.floatoflong, floatoflong; intros.
+  inv H; simpl.
+  all: auto with va.
+  all: unfold ntop1, provenance.
+  all: try constructor.
+Qed.
+
+Lemma floatoflongu_total_sound:
+  forall v x, vmatch v x ->
+              vmatch (Val.maketotal (Val.floatoflongu v)) (floatoflongu x).
+Proof.
+  unfold Val.floatoflongu, floatoflongu; intros.
+  inv H; simpl.
+  all: auto with va.
+  all: unfold ntop1, provenance.
+  all: try constructor.
+Qed.
+
+Lemma floatofint_total_sound:
+  forall v x, vmatch v x ->
+              vmatch (Val.maketotal (Val.floatofint v)) (floatofint x).
+Proof.
+  unfold Val.floatofint, floatofint; intros.
+  inv H; simpl.
+  all: auto with va.
+  all: unfold ntop1, provenance.
+  all: try constructor.
+Qed.
+
+Lemma floatofintu_total_sound:
+  forall v x, vmatch v x ->
+              vmatch (Val.maketotal (Val.floatofintu v)) (floatofintu x).
+Proof.
+  unfold Val.floatofintu, floatofintu; intros.
+  inv H; simpl.
+  all: auto with va.
+  all: unfold ntop1, provenance.
+  all: try constructor.
+Qed.
+
+
+Definition divs_total (v w: aval) := 
+  match w, v with
+  | I i2, I i1 =>
+      if Int.eq i2 Int.zero
+      || Int.eq i1 (Int.repr Int.min_signed) && Int.eq i2 Int.mone
+      then ntop
+      else I (Int.divs i1 i2)
+  | _, _ => ntop2 v w
+  end.
+
+Lemma divs_total_sound:
+  forall v w x y, vmatch v x -> vmatch w y -> vmatch (Val.maketotal (Val.divs v w)) (divs_total x y).
+Proof.
+  intros until y.
+  intros HX HY.
+  inv HX; inv HY; cbn in *.
+  { destruct (_ || _) eqn:E; cbn; unfold ntop; auto with va.
+  }
+  all: unfold ntop2; auto with va.
+  all: destruct (_ || _) eqn:E; unfold ntop2; cbn; auto with va.
+Qed.
+
+Definition divu_total (v w: aval) :=
+  match w, v with
+  | I i2, I i1 =>
+      if Int.eq i2 Int.zero
+       then ntop
+      else I (Int.divu i1 i2)
+  | _, _ => ntop2 v w
+  end.
+
+Lemma divu_total_sound:
+  forall v w x y, vmatch v x -> vmatch w y -> vmatch (Val.maketotal (Val.divu v w)) (divu_total x y).
+Proof.
+  intros until y.
+  intros HX HY.
+  inv HX; inv HY; cbn in *.
+  { destruct Int.eq eqn:E; cbn; unfold ntop; auto with va.
+  }
+  all: unfold ntop2; auto with va.
+  all: destruct Int.eq eqn:E; unfold ntop2; cbn; auto with va.
+Qed.
+
+Definition mods_total (v w: aval) :=
+  match w, v with
+  | I i2, I i1 =>
+      if Int.eq i2 Int.zero
+      || Int.eq i1 (Int.repr Int.min_signed) && Int.eq i2 Int.mone
+      then ntop
+      else I (Int.mods i1 i2)
+  | _, _ => ntop2 v w
+  end.
+
+Lemma mods_total_sound:
+  forall v w x y, vmatch v x -> vmatch w y -> vmatch (Val.maketotal (Val.mods v w)) (mods_total x y).
+Proof.
+  intros until y.
+  intros HX HY.
+  inv HX; inv HY; cbn in *.
+  { destruct (_ || _) eqn:E; cbn; unfold ntop; auto with va.
+  }
+  all: unfold ntop2; auto with va.
+  all: destruct (_ || _) eqn:E; unfold ntop2; cbn; auto with va.
+Qed.
+
+Definition modu_total (v w: aval) :=
+  match w, v with
+  | I i2, I i1 =>
+      if Int.eq i2 Int.zero
+      then ntop
+      else I (Int.modu i1 i2)
+  | I i2, _ => uns (provenance v) (usize i2)
+  | _, _ => ntop2 v w
+  end.
+
+Lemma modu_total_sound:
+  forall v w x y, vmatch v x -> vmatch w y -> vmatch (Val.maketotal (Val.modu v w)) (modu_total x y).
+Proof.
+  assert (UNS: forall i j, j <> Int.zero -> is_uns (usize j) (Int.modu i j)).
+  {
+    intros. apply is_uns_mon with (usize (Int.modu i j)).
+    { apply is_uns_usize.
+    }
+    unfold usize, Int.size.
+    apply Zsize_monotone.
+    generalize (Int.unsigned_range_2 j); intros RANGE.
+    assert (Int.unsigned j <> 0).
+    { red; intros; elim H. rewrite <- (Int.repr_unsigned j). rewrite H0. auto. }
+    exploit (Z_mod_lt (Int.unsigned i) (Int.unsigned j)). omega. intros MOD.
+    unfold Int.modu. rewrite Int.unsigned_repr. omega. omega.
+  }
+  intros until y.
+  intros HX HY.
+  inv HX; inv HY; cbn in *.
+  { destruct Int.eq eqn:E; unfold ntop; cbn; auto with va.
+  }
+  all: try discriminate.
+  all: unfold ntop2; auto with va.
+  all: try (destruct Int.eq eqn:E; cbn; unfold ntop2; auto with va; fail).
+  all: try apply vmatch_uns_undef.
+  
+  all:
+    generalize (Int.eq_spec i0 Int.zero);
+    destruct (Int.eq i0 Int.zero);
+    cbn;
+    intro.
+  all: try apply vmatch_uns_undef.
+  all: apply vmatch_uns; auto.
+Qed.
+
+
+Lemma shrx_total_sound:
+  forall v w x y, vmatch v x -> vmatch w y -> vmatch (Val.maketotal (Val.shrx v w)) (shrx x y).
+Proof.
+  intros until y. intros HX HY.
+  inv HX; inv HY; cbn.
+  all: unfold ntop1; auto with va.
+  all: destruct Int.ltu eqn:LTU; cbn; unfold ntop; auto with va.
+Qed.
+
+
+Definition divls_total (v w: aval) :=
+  match w, v with
+  | L i2, L i1 =>
+      if Int64.eq i2 Int64.zero
+      || Int64.eq i1 (Int64.repr Int64.min_signed) && Int64.eq i2 Int64.mone
+      then ntop
+      else L (Int64.divs i1 i2)
+  | _, _ => ntop2 v w
+  end.
+
+Lemma divls_total_sound:
+  forall v w x y, vmatch v x -> vmatch w y -> vmatch (Val.maketotal (Val.divls v w)) (divls_total x y).
+Proof.
+  intros until y.
+  intros HX HY.
+  inv HX; inv HY; cbn in *.
+  all: unfold ntop2; auto with va.
+  all: destruct (_ || _) eqn:E; unfold ntop2, ntop; cbn; auto with va.
+Qed.
+
+Definition divlu_total (v w: aval) :=
+  match w, v with
+  | L i2, L i1 =>
+      if Int64.eq i2 Int64.zero
+       then ntop
+      else L (Int64.divu i1 i2)
+  | _, _ => ntop2 v w
+  end.
+
+Lemma divlu_total_sound:
+  forall v w x y, vmatch v x -> vmatch w y -> vmatch (Val.maketotal (Val.divlu v w)) (divlu_total x y).
+Proof.
+  intros until y.
+  intros HX HY.
+  inv HX; inv HY; cbn in *.
+  all: unfold ntop2; auto with va.
+  all: destruct Int64.eq eqn:E; unfold ntop2, ntop; cbn; auto with va.
+Qed.
+
+
+Definition modls_total (v w: aval) :=
+  match w, v with
+  | L i2, L i1 =>
+      if Int64.eq i2 Int64.zero
+      || Int64.eq i1 (Int64.repr Int64.min_signed) && Int64.eq i2 Int64.mone
+      then ntop
+      else L (Int64.mods i1 i2)
+  | _, _ => ntop2 v w
+  end.
+
+Lemma modls_total_sound:
+  forall v w x y, vmatch v x -> vmatch w y -> vmatch (Val.maketotal (Val.modls v w)) (modls_total x y).
+Proof.
+  intros until y.
+  intros HX HY.
+  inv HX; inv HY; cbn in *.
+  all: unfold ntop2; auto with va.
+  all: destruct (_ || _) eqn:E; unfold ntop2, ntop; cbn; auto with va.
+Qed.
+
+
+Definition modlu_total (v w: aval) :=
+  match w, v with
+  | L i2, L i1 =>
+      if Int64.eq i2 Int64.zero
+      then ntop
+      else L (Int64.modu i1 i2)
+  | _, _ => ntop2 v w
+  end.
+
+Lemma modlu_total_sound:
+  forall v w x y, vmatch v x -> vmatch w y -> vmatch (Val.maketotal (Val.modlu v w)) (modlu_total x y).
+Proof.
+  intros until y.
+  intros HX HY.
+  inv HX; inv HY; cbn in *.
+  all: unfold ntop2; auto with va.
+  all: destruct Int64.eq eqn:E; cbn; unfold ntop2, ntop; auto with va.
+Qed.
+
+Lemma shrxl_total_sound:
+  forall v w x y, vmatch v x -> vmatch w y -> vmatch (Val.maketotal (Val.shrxl v w)) (shrxl x y).
+Proof.
+  intros until y. intros HX HY.
+  inv HX; inv HY; cbn.
+  all: unfold ntop1; auto with va.
+  all: destruct Int.ltu eqn:LTU; cbn; unfold ntop; auto with va.
+Qed.
+
 (** Comparisons and variation intervals *)
 
 Definition cmp_intv (c: comparison) (i: Z * Z) (n: Z) : abool :=
@@ -4734,6 +5195,26 @@ Hint Resolve cnot_sound symbol_address_sound
        longoffloat_sound longuoffloat_sound floatoflong_sound floatoflongu_sound
        longofsingle_sound longuofsingle_sound singleoflong_sound singleoflongu_sound
        longofwords_sound loword_sound hiword_sound
+       intoffloat_total_sound
+       intuoffloat_total_sound
+       intofsingle_total_sound
+       intuofsingle_total_sound
+       singleofint_total_sound
+       singleofintu_total_sound
+       longoffloat_total_sound
+       longuoffloat_total_sound
+       longofsingle_total_sound
+       longuofsingle_total_sound
+       singleoflong_total_sound
+       singleoflongu_total_sound
+       floatoflong_total_sound
+       floatoflongu_total_sound
+       floatofint_total_sound
+       floatofintu_total_sound
+       divu_total_sound divs_total_sound
+       modu_total_sound mods_total_sound shrx_total_sound
+       divlu_total_sound divls_total_sound
+       modlu_total_sound modls_total_sound shrxl_total_sound
        cmpu_bool_sound cmp_bool_sound cmplu_bool_sound cmpl_bool_sound
        cmpf_bool_sound cmpfs_bool_sound
        maskzero_sound : va.
author	Cyril SIX <cyril.six@kalray.eu>	2020-12-04 17:41:14 +0100
committer	Cyril SIX <cyril.six@kalray.eu>	2020-12-04 17:41:14 +0100
commit	60ff1e39bac5ab35c46698cbc1ed7a76fc936cab (patch)
tree	87ff5f3b209e5659e967f862dab1517bb2b32baa /backend
parent	f2fb8540c94ceb9892510f83bd7d6734fe9d422f (diff)
parent	d2197102d6b81e225865cfac5f1d319d168e1e23 (diff)
download	compcert-kvx-60ff1e39bac5ab35c46698cbc1ed7a76fc936cab.tar.gz compcert-kvx-60ff1e39bac5ab35c46698cbc1ed7a76fc936cab.zip