Merge remote-tracking branch 'origin/mppa-work' into mppa-thread

author: David Monniaux <david.monniaux@univ-grenoble-alpes.fr> 2020-04-08 21:45:42 +0200
committer: David Monniaux <david.monniaux@univ-grenoble-alpes.fr> 2020-04-08 21:45:42 +0200
commit: 63915fbebe707cc1de7c0ed5a24148cac45a742c (patch)
tree: da503cba224f14281a2ee841930b8843459cb42b
parent: f78d61faf3db94ac1704ce0d11291211b5307629 (diff)
parent: e326ed9f28a2ed6869f0cb356ef9a8e189cb0a47 (diff)
download: compcert-kvx-63915fbebe707cc1de7c0ed5a24148cac45a742c.tar.gz
compcert-kvx-63915fbebe707cc1de7c0ed5a24148cac45a742c.zip
124 files changed, 4766 insertions, 1881 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 00000000..1f854fc3
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,240 @@
+stages:
+  - build
+
+check-admitted:
+  stage: build
+  image: "coqorg/coq"
+  before_script:
+    - opam switch 4.07.1+flambda
+    - eval `opam config env`
+    - opam install -y menhir
+  script:
+    - ./config_x86_64.sh
+    - make check-admitted
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "mppa-k1c"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
+    
+build_x86_64:
+  stage: build
+  image: "coqorg/coq"
+  before_script:
+    - opam switch 4.07.1+flambda
+    - eval `opam config env`
+    - opam install -y menhir
+  script:
+    - ./config_x86_64.sh
+    - make -j "$NJOBS"
+    - make -C test all test
+    - ulimit -s65536 && make -C test/monniaux/yarpgen
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "mppa-k1c"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
+
+build_ia32:
+  stage: build
+  image: "coqorg/coq"
+  before_script:
+    - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
+    - sudo apt-get -y install gcc-multilib
+    - opam switch 4.07.1+flambda
+    - eval `opam config env`
+    - opam install -y menhir
+  script:
+    - ./config_ia32.sh
+    - make -j "$NJOBS"
+    - make -C test all test
+    - ulimit -s65536 && make -C test/monniaux/yarpgen BITS=32 TARGET_CC='gcc -m32'
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "mppa-k1c"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
+
+build_aarch64:
+  stage: build
+  image: "coqorg/coq"
+  before_script:
+    - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
+    - sudo apt-get -y install gcc-aarch64-linux-gnu qemu-user
+    - opam switch 4.07.1+flambda
+    - eval `opam config env`
+    - opam install -y menhir
+  script:
+    - ./config_aarch64.sh
+    - make -j "$NJOBS"
+    - make -C test CCOMPOPTS='-static' SIMU='qemu-aarch64' EXECUTE='qemu-aarch64' all test
+    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='aarch64-linux-gnu-gcc' EXECUTE='qemu-aarch64' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "mppa-k1c"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
+
+build_arm:
+  stage: build
+  image: "coqorg/coq"
+  before_script:
+    - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
+    - sudo apt-get -y install gcc-arm-linux-gnueabi qemu-user
+    - opam switch 4.07.1+flambda
+    - eval `opam config env`
+    - opam install -y menhir
+  script:
+    - ./config_arm.sh
+    - make -j "$NJOBS"
+    - make -C test CCOMPOPTS=-static SIMU='qemu-arm' EXECUTE='qemu-arm' all test
+    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='arm-linux-gnueabi-gcc' EXECUTE='qemu-arm' CCOMPOPTS='-static' TARGET_CFLAGS='-static' BITS=32
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "mppa-k1c"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
+
+
+build_armhf:
+  stage: build
+  image: "coqorg/coq"
+  before_script:
+    - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
+    - sudo apt-get -y install gcc-arm-linux-gnueabihf qemu-user
+    - opam switch 4.07.1+flambda
+    - eval `opam config env`
+    - opam install -y menhir
+  script:
+    - ./config_armhf.sh
+    - make -j "$NJOBS"
+    - make -C test CCOMPOPTS=-static SIMU='qemu-arm' EXECUTE='qemu-arm' all test
+    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='arm-linux-gnueabihf-gcc' EXECUTE='qemu-arm' CCOMPOPTS='-static' TARGET_CFLAGS='-static' BITS=32
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "mppa-k1c"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
+
+build_ppc:
+  stage: build
+  image: "coqorg/coq"
+  before_script:
+    - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
+    - sudo apt-get -y install gcc-powerpc-linux-gnu qemu-user
+    - opam switch 4.07.1+flambda
+    - eval `opam config env`
+    - opam install -y menhir
+  script:
+    - ./config_ppc.sh
+    - make -j "$NJOBS"
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "mppa-k1c"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
+
+build_ppc64:
+  stage: build
+  image: "coqorg/coq"
+  before_script:
+    - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
+    - sudo apt-get -y install gcc-powerpc64-linux-gnu
+    - opam switch 4.07.1+flambda
+    - eval `opam config env`
+    - opam install -y menhir
+  script:
+    - ./config_ppc64.sh
+    - make -j "$NJOBS"
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "mppa-k1c"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
+
+build_rv64:
+  stage: build
+  image: "coqorg/coq"
+  before_script:
+    - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
+    - sudo apt-get -y install gcc-riscv64-linux-gnu qemu-user
+    - opam switch 4.07.1+flambda
+    - eval `opam config env`
+    - opam install -y menhir
+  script:
+    - ./config_rv64.sh
+    - make -j "$NJOBS"
+    - make -C test CCOMPOPTS=-static SIMU='qemu-riscv64' EXECUTE='qemu-riscv64' all test
+    - ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='riscv64-linux-gnu-gcc' EXECUTE='qemu-riscv64' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "mppa-k1c"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
+
+build_rv32:
+  stage: build
+  image: "coqorg/coq"
+  before_script:
+    - sudo apt-get -o Acquire::Check-Valid-Until=false -o Acquire::Check-Date=false update
+    - sudo apt-get -y install gcc-riscv64-linux-gnu qemu-user
+    - opam switch 4.07.1+flambda
+    - eval `opam config env`
+    - opam install -y menhir
+  script:
+    - ./config_rv32.sh -no-runtime-lib
+    - make -j "$NJOBS"
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "mppa-k1c"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
+
+build_k1c:
+  stage: build
+  image: "coqorg/coq"
+  before_script:
+    - opam switch 4.07.1+flambda
+    - eval `opam config env`
+    - opam install -y menhir
+  script:
+    - ./config_k1c.sh -no-runtime-lib
+    - make -j "$NJOBS"
+  rules:
+    - if: '$CI_COMMIT_BRANCH == "mppa-work"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "mppa-k1c"'
+      when: always
+    - if: '$CI_COMMIT_BRANCH == "master"'
+      when: always
+    - when: manual
diff --git a/Changelog b/Changelog
index 08586da5..8cf4e548 100644
--- a/Changelog
+++ b/Changelog
@@ -1,7 +1,41 @@
-Coq development:
-- Compatibility with Coq version 8.11.0 (#316)
+Release 3.7, 2020-03-31
+=======================
+
+ISO C conformance:
+- Functions declared `extern` then implemented `inline` remain `extern`
+- The type of a wide char constant is `wchar_t`, not `int`
+- Support vertical tabs and treat them as whitespace
+- Define the semantics of `free(NULL)`
+
+Bug fixing:
+- Take sign into account for conversions from 32-bit integers to 64-bit pointers
+- PowerPC: more precise determination of small data accesses
+- AArch64: when addressing global variables, check for correct alignment
+- PowerPC, ARM: double rounding error in int64->float32 conversions
+
+ABI conformance:
+- x86, AArch64: re-normalize values of small integer types returned by
+  function calls
+- PowerPC: `float` arguments passed on stack are passed in 64-bit format
+- RISC-V: use the new ELF psABI instead of the old ABI from ISA 2.1
+
+Usability and diagnostics:
+- Unknown builtin functions trigger a specific error message
+- Improved error messages
+
+Coq formalization:
+- Revised modeling of the PowerPC/EREF `isel` instruction
+- Weaker `ec_readonly` condition over external calls
+  (permissions can be dropped on read-only locations)
+
+Coq and OCaml development:
+- Compatibility with Coq version 8.10.1, 8.10.2, 8.11.0
+- Compatibility with OCaml 4.10 and up
+- Compatibility with Menhir 20200123 and up
+- Coq versions prior to 8.8.0 are no longer supported
+- OCaml versions prior to 4.05.0 are no longer supported
+
 
-	
 Release 3.6, 2019-09-17
 =======================
 
diff --git a/Makefile b/Makefile
index 5566cf57..2cd40800 100644
--- a/Makefile
+++ b/Makefile
@@ -86,6 +86,7 @@ BACKEND=\
   ValueDomain.v ValueAOp.v ValueAnalysis.v \
   ConstpropOp.v Constprop.v ConstpropOpproof.v Constpropproof.v \
   CSEdomain.v CombineOp.v CSE.v CombineOpproof.v CSEproof.v \
+  CSE2deps.v CSE2depsproof.v \
   CSE2.v CSE2proof.v \
   NeedDomain.v NeedOp.v Deadcode.v Deadcodeproof.v \
   Unusedglob.v Unusedglobproof.v \
diff --git a/VERSION b/VERSION
index 92686b06..b60e8d9b 100644
--- a/VERSION
+++ b/VERSION
@@ -1,3 +1,3 @@
-version=3.6
+version=3.7
 buildnr=
 tag=
diff --git a/aarch64/Asmgen.v b/aarch64/Asmgen.v
index 46dd875d..024c9a17 100644
--- a/aarch64/Asmgen.v
+++ b/aarch64/Asmgen.v
@@ -1061,8 +1061,13 @@ Definition storeptr (src: ireg) (base: iregsp) (ofs: ptrofs) (k: code) :=
 (** Function epilogue *)
 
 Definition make_epilogue (f: Mach.function) (k: code) :=
-  loadptr XSP f.(fn_retaddr_ofs) RA
-    (Pfreeframe f.(fn_stacksize) f.(fn_link_ofs) :: k).
+  (* FIXME
+     Cannot be used because memcpy destroys X30;
+     issue being discussed with X. Leroy *)
+  (* if is_leaf_function f
+  then Pfreeframe f.(fn_stacksize) f.(fn_link_ofs) :: k
+  else*) loadptr XSP f.(fn_retaddr_ofs) RA
+         (Pfreeframe f.(fn_stacksize) f.(fn_link_ofs) :: k).
   
 (** Translation of a Mach instruction. *)
 
diff --git a/aarch64/Asmgenproof.v b/aarch64/Asmgenproof.v
index 88258cd6..6831509f 100644
--- a/aarch64/Asmgenproof.v
+++ b/aarch64/Asmgenproof.v
@@ -337,7 +337,12 @@ Qed.
 Remark make_epilogue_label:
   forall f k, tail_nolabel k (make_epilogue f k).
 Proof.
-  unfold make_epilogue; intros. eapply tail_nolabel_trans. apply loadptr_label. TailNoLabel.
+  unfold make_epilogue; intros.
+  (* FIXME destruct is_leaf_function.
+  { TailNoLabel. } *)
+  eapply tail_nolabel_trans.
+  apply loadptr_label.
+  TailNoLabel.
 Qed.
 
 Lemma transl_instr_label:
@@ -472,7 +477,8 @@ Inductive match_states: Mach.state -> Asm.state -> Prop :=
         (MEXT: Mem.extends m m')
         (AT: transl_code_at_pc ge (rs PC) fb f c ep tf tc)
         (AG: agree ms sp rs)
-        (DXP: ep = true -> rs#X29 = parent_sp s),
+        (DXP: ep = true -> rs#X29 = parent_sp s)
+        (LEAF: is_leaf_function f = true -> rs#RA = parent_ra s),
       match_states (Mach.State s fb sp c ms m)
                    (Asm.State rs m')
   | match_states_call:
@@ -503,16 +509,17 @@ Lemma exec_straight_steps:
    exists rs2,
        exec_straight tge tf c rs1 m1' k rs2 m2'
     /\ agree ms2 sp rs2
-    /\ (it1_is_parent ep i = true -> rs2#X29 = parent_sp s)) ->
+    /\ (it1_is_parent ep i = true -> rs2#X29 = parent_sp s)
+    /\ (is_leaf_function f = true -> rs2#RA = parent_ra s)) ->
   exists st',
   plus step tge (State rs1 m1') E0 st' /\
   match_states (Mach.State s fb sp c ms2 m2) st'.
 Proof.
   intros. inversion H2. subst. monadInv H7.
-  exploit H3; eauto. intros [rs2 [A [B C]]].
+  exploit H3; eauto. intros [rs2 [A [B [C D]]]].
   exists (State rs2 m2'); split.
-  eapply exec_straight_exec; eauto.
-  econstructor; eauto. eapply exec_straight_at; eauto.
+  - eapply exec_straight_exec; eauto.
+  - econstructor; eauto. eapply exec_straight_at; eauto.
 Qed.
 
 Lemma exec_straight_steps_goto:
@@ -527,13 +534,14 @@ Lemma exec_straight_steps_goto:
    exists jmp, exists k', exists rs2,
        exec_straight tge tf c rs1 m1' (jmp :: k') rs2 m2'
     /\ agree ms2 sp rs2
-    /\ exec_instr tge tf jmp rs2 m2' = goto_label tf lbl rs2 m2') ->
+    /\ exec_instr tge tf jmp rs2 m2' = goto_label tf lbl rs2 m2'
+    /\ (is_leaf_function f = true -> rs2#RA = parent_ra s)) ->
   exists st',
   plus step tge (State rs1 m1') E0 st' /\
   match_states (Mach.State s fb sp c' ms2 m2) st'.
 Proof.
   intros. inversion H3. subst. monadInv H9.
-  exploit H5; eauto. intros [jmp [k' [rs2 [A [B C]]]]].
+  exploit H5; eauto. intros [jmp [k' [rs2 [A [B [C D]]]]]].
   generalize (functions_transl _ _ _ H7 H8); intro FN.
   generalize (transf_function_no_overflow _ _ H8); intro NOOV.
   exploit exec_straight_steps_2; eauto.
@@ -550,6 +558,7 @@ Proof.
   econstructor; eauto.
   apply agree_exten with rs2; auto with asmgen.
   congruence.
+  rewrite OTH by congruence; auto.
 Qed.
 
 Lemma exec_straight_opt_steps_goto:
@@ -564,13 +573,14 @@ Lemma exec_straight_opt_steps_goto:
    exists jmp, exists k', exists rs2,
        exec_straight_opt tge tf c rs1 m1' (jmp :: k') rs2 m2'
     /\ agree ms2 sp rs2
-    /\ exec_instr tge tf jmp rs2 m2' = goto_label tf lbl rs2 m2') ->
+    /\ exec_instr tge tf jmp rs2 m2' = goto_label tf lbl rs2 m2'
+    /\ (is_leaf_function f = true -> rs2#RA = parent_ra s)) ->
   exists st',
   plus step tge (State rs1 m1') E0 st' /\
   match_states (Mach.State s fb sp c' ms2 m2) st'.
 Proof.
   intros. inversion H3. subst. monadInv H9.
-  exploit H5; eauto. intros [jmp [k' [rs2 [A [B C]]]]].
+  exploit H5; eauto. intros [jmp [k' [rs2 [A [B [C D]]]]]].
   generalize (functions_transl _ _ _ H7 H8); intro FN.
   generalize (transf_function_no_overflow _ _ H8); intro NOOV.
   inv A.
@@ -583,6 +593,7 @@ Proof.
   econstructor; eauto.
   apply agree_exten with rs2; auto with asmgen.
   congruence.
+  rewrite OTH by congruence; auto.
 - exploit exec_straight_steps_2; eauto.
   intros [ofs' [PC2 CT2]].
   exploit find_label_goto_label; eauto.
@@ -597,6 +608,7 @@ Proof.
   econstructor; eauto.
   apply agree_exten with rs2; auto with asmgen.
   congruence.
+  rewrite OTH by congruence; auto.
 Qed.
 
 (** We need to show that, in the simulation diagram, we cannot
@@ -629,7 +641,7 @@ Qed.
 
 Theorem step_simulation:
   forall S1 t S2, Mach.step return_address_offset ge S1 t S2 ->
-  forall S1' (MS: match_states S1 S1'),
+  forall S1' (MS: match_states S1 S1')  (WF: wf_state ge S1),
   (exists S2', plus step tge S1' t S2' /\ match_states S2 S2')
   \/ (measure S2 < measure S1 /\ t = E0 /\ match_states S2 S1')%nat.
 Proof.
@@ -638,17 +650,20 @@ Proof.
 - (* Mlabel *)
   left; eapply exec_straight_steps; eauto; intros.
   monadInv TR. econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split. apply agree_nextinstr; auto. simpl; congruence.
+  split. { apply agree_nextinstr; auto. }
+  split. { simpl; congruence. }
+  rewrite nextinstr_inv by congruence; assumption.
 
 - (* Mgetstack *)
   unfold load_stack in H.
   exploit Mem.loadv_extends; eauto. intros [v' [A B]].
   rewrite (sp_val _ _ _ AG) in A.
   left; eapply exec_straight_steps; eauto. intros. simpl in TR.
-  exploit loadind_correct; eauto with asmgen. intros [rs' [P [Q R]]].
+  exploit loadind_correct; eauto with asmgen. intros [rs' [P [Q [R S]]]].
   exists rs'; split. eauto.
-  split. eapply agree_set_mreg; eauto with asmgen. congruence.
-  simpl; congruence.
+  split. { eapply agree_set_mreg; eauto with asmgen. congruence. }
+  split. { simpl; congruence. }
+  rewrite S. assumption.
 
 - (* Msetstack *)
   unfold store_stack in H.
@@ -656,10 +671,12 @@ Proof.
   exploit Mem.storev_extends; eauto. intros [m2' [A B]].
   left; eapply exec_straight_steps; eauto.
   rewrite (sp_val _ _ _ AG) in A. intros. simpl in TR.
-  exploit storeind_correct; eauto with asmgen. intros [rs' [P Q]].
+  exploit storeind_correct; eauto with asmgen. intros [rs' [P [Q R]]].
   exists rs'; split. eauto.
   split. eapply agree_undef_regs; eauto with asmgen.
-  simpl; intros. rewrite Q; auto with asmgen.
+  simpl; intros.
+  split. rewrite Q; auto with asmgen.
+  rewrite R. assumption.
 
 - (* Mgetparam *)
   assert (f0 = f) by congruence; subst f0.
@@ -675,39 +692,45 @@ Opaque loadind.
 (* X30 contains parent *)
   exploit loadind_correct. eexact EQ.
   instantiate (2 := rs0). simpl; rewrite DXP; eauto. simpl; congruence.
-  intros [rs1 [P [Q R]]].
+  intros [rs1 [P [Q [R S]]]].
   exists rs1; split. eauto.
   split. eapply agree_set_mreg. eapply agree_set_mreg; eauto. congruence. auto with asmgen.
-  simpl; intros. rewrite R; auto with asmgen.
-  apply preg_of_not_X29; auto.
+  simpl; split; intros.
+  { rewrite R; auto with asmgen.
+    apply preg_of_not_X29; auto.
+  }
+  { rewrite S; auto. }
+    
 (* X30 does not contain parent *)
   exploit loadptr_correct. eexact A. simpl; congruence. intros [rs1 [P [Q R]]].
   exploit loadind_correct. eexact EQ. instantiate (2 := rs1). simpl; rewrite Q. eauto. simpl; congruence.
-  intros [rs2 [S [T U]]].
+  intros [rs2 [S [T [U V]]]].
   exists rs2; split. eapply exec_straight_trans; eauto.
   split. eapply agree_set_mreg. eapply agree_set_mreg. eauto. eauto.
   instantiate (1 := rs1#X29 <- (rs2#X29)). intros.
   rewrite Pregmap.gso; auto with asmgen.
   congruence.
   intros. unfold Pregmap.set. destruct (PregEq.eq r' X29). congruence. auto with asmgen.
-  simpl; intros. rewrite U; auto with asmgen.
+  split; simpl; intros. rewrite U; auto with asmgen.
   apply preg_of_not_X29; auto.
-
+  rewrite V. rewrite R by congruence. auto.
+  
 - (* Mop *)
   assert (eval_operation tge sp op (map rs args) m = Some v).
   { rewrite <- H. apply eval_operation_preserved. exact symbols_preserved. }
   exploit eval_operation_lessdef. eapply preg_vals; eauto. eauto. eexact H0.
   intros [v' [A B]]. rewrite (sp_val _ _ _ AG) in A.
   left; eapply exec_straight_steps; eauto; intros. simpl in TR.
-  exploit transl_op_correct; eauto. intros [rs2 [P [Q R]]].
+  exploit transl_op_correct; eauto. intros [rs2 [P [Q [R S]]]].
   exists rs2; split. eauto. split.
   apply agree_set_undef_mreg with rs0; auto. 
   apply Val.lessdef_trans with v'; auto.
-  simpl; intros. InvBooleans. 
+  split; simpl; intros. InvBooleans. 
   rewrite R; auto. apply preg_of_not_X29; auto.
 Local Transparent destroyed_by_op.
   destruct op; try exact I; simpl; congruence.
-
+  rewrite S.
+  auto.
 - (* Mload *)
   destruct trap.
   {
@@ -717,10 +740,11 @@ Local Transparent destroyed_by_op.
   intros [a' [A B]]. rewrite (sp_val _ _ _ AG) in A.
   exploit Mem.loadv_extends; eauto. intros [v' [C D]].
   left; eapply exec_straight_steps; eauto; intros. simpl in TR.
-  exploit transl_load_correct; eauto. intros [rs2 [P [Q R]]].
+  exploit transl_load_correct; eauto. intros [rs2 [P [Q [R S]]]].
   exists rs2; split. eauto.
   split. eapply agree_set_undef_mreg; eauto. congruence.
-  simpl; congruence.
+  split. simpl; congruence.
+  rewrite S. assumption.
   }
   
   (* Mload notrap1 *)
@@ -740,10 +764,11 @@ Local Transparent destroyed_by_op.
   assert (Val.lessdef (rs src) (rs0 (preg_of src))) by (eapply preg_val; eauto).
   exploit Mem.storev_extends; eauto. intros [m2' [C D]].
   left; eapply exec_straight_steps; eauto.
-  intros. simpl in TR. exploit transl_store_correct; eauto. intros [rs2 [P Q]].
+  intros. simpl in TR. exploit transl_store_correct; eauto. intros [rs2 [P [Q R]]].
   exists rs2; split. eauto.
   split. eapply agree_undef_regs; eauto with asmgen.
-  simpl; congruence.
+  split. simpl; congruence.
+  rewrite R. assumption.
 
 - (* Mcall *)
   assert (f0 = f) by congruence.  subst f0.
@@ -852,6 +877,18 @@ Local Transparent destroyed_by_op.
   eapply agree_undef_regs; eauto. intros. rewrite undef_regs_other_2; auto.
   congruence.
 
+  Simpl.
+  rewrite set_res_other by trivial.
+  rewrite undef_regs_other.
+  assumption.
+  intro.
+  rewrite in_map_iff.
+  intros (x0 & PREG & IN).
+  subst r'.
+  intro.
+  apply (preg_of_not_RA x0).
+  congruence.
+  
 - (* Mgoto *)
   assert (f0 = f) by congruence. subst f0.
   inv AT. monadInv H4.
@@ -865,25 +902,33 @@ Local Transparent destroyed_by_op.
   eapply agree_exten; eauto with asmgen.
   congruence.
 
+  rewrite INV by congruence.
+  assumption.
+  
 - (* Mcond true *)
   assert (f0 = f) by congruence. subst f0.
   exploit eval_condition_lessdef. eapply preg_vals; eauto. eauto. eauto. intros EC.
   left; eapply exec_straight_opt_steps_goto; eauto.
   intros. simpl in TR.
-  exploit transl_cond_branch_correct; eauto. intros (rs' & jmp & A & B & C).
+  exploit transl_cond_branch_correct; eauto. intros (rs' & jmp & A & B & C & D).
   exists jmp; exists k; exists rs'.
   split. eexact A. 
   split. apply agree_exten with rs0; auto with asmgen.
-  exact B. 
+  split.
+  exact B.
+  rewrite D. exact LEAF.
 
 - (* Mcond false *)
   exploit eval_condition_lessdef. eapply preg_vals; eauto. eauto. eauto. intros EC.
   left; eapply exec_straight_steps; eauto. intros. simpl in TR.
-  exploit transl_cond_branch_correct; eauto. intros (rs' & jmp & A & B & C).
+  exploit transl_cond_branch_correct; eauto. intros (rs' & jmp & A & B & C & D).
   econstructor; split.
   eapply exec_straight_opt_right. eexact A. apply exec_straight_one. eexact B. auto.
   split. apply agree_exten with rs0; auto. intros. Simpl.
+  split.
   simpl; congruence.
+  Simpl. rewrite D.
+  exact LEAF.
 
 - (* Mjumptable *)
   assert (f0 = f) by congruence. subst f0.
@@ -905,6 +950,10 @@ Local Transparent destroyed_by_op.
   simpl. intros. rewrite C; auto with asmgen. Simpl.
   congruence.
 
+  rewrite C by congruence.
+  repeat rewrite Pregmap.gso by congruence.
+  assumption.
+  
 - (* Mreturn *)
   assert (f0 = f) by congruence. subst f0.
   inversion AT; subst. simpl in H6; monadInv H6.
@@ -947,7 +996,7 @@ Local Transparent destroyed_by_op.
     simpl preg_of_iregsp. change (rs2 X30) with (rs0 X30). rewrite ATLR. 
     change (rs2 X2) with sp. eexact P. 
     simpl; congruence. congruence.
-  intros (rs3 & U & V).
+  intros (rs3 & U & V & W).
   assert (EXEC_PROLOGUE:
             exec_straight tge tf
               tf.(fn_code) rs0 m'
@@ -974,6 +1023,10 @@ Local Transparent destroyed_at_function_entry. simpl.
   unfold sp; congruence.
   intros. rewrite V by auto with asmgen. reflexivity.
 
+  rewrite W.
+  unfold rs2.
+  Simpl.
+  
 - (* external function *)
   exploit functions_translated; eauto.
   intros [tf [A B]]. simpl in B. inv B.
@@ -993,6 +1046,10 @@ Local Transparent destroyed_at_function_entry. simpl.
   right. split. omega. split. auto.
   rewrite <- ATPC in H5.
   econstructor; eauto. congruence.
+  inv WF.
+  inv STACK.
+  inv H1.
+  congruence.
 Qed.
 
 Lemma transf_initial_states:
@@ -1028,11 +1085,17 @@ Qed.
 Theorem transf_program_correct:
   forward_simulation (Mach.semantics return_address_offset prog) (Asm.semantics tprog).
 Proof.
-  eapply forward_simulation_star with (measure := measure).
-  apply senv_preserved.
-  eexact transf_initial_states.
-  eexact transf_final_states.
-  exact step_simulation.
+  eapply forward_simulation_star with (measure := measure)
+         (match_states := fun S1 S2 => match_states S1 S2 /\ wf_state ge S1).
+  - apply senv_preserved.
+  - simpl; intros. exploit transf_initial_states; eauto.
+    intros (s2 & A & B).
+    exists s2; intuition auto. apply wf_initial; auto.
+  - simpl; intros. destruct H as [MS WF]. eapply transf_final_states; eauto.
+  - simpl; intros. destruct H0 as [MS WF]. 
+    exploit step_simulation; eauto. intros [ (s2' & A & B) | (A & B & C) ].
+    + left; exists s2'; intuition auto. eapply wf_step; eauto. 
+    + right; intuition auto. eapply wf_step; eauto.
 Qed.
 
 End PRESERVATION.
diff --git a/aarch64/Asmgenproof1.v b/aarch64/Asmgenproof1.v
index 6f296f56..0e36bd05 100644
--- a/aarch64/Asmgenproof1.v
+++ b/aarch64/Asmgenproof1.v
@@ -22,6 +22,51 @@ Local Transparent Archi.ptr64.
 
 (** Properties of registers *)
 
+Lemma preg_of_not_RA:
+  forall r, (preg_of r) <> RA.
+Proof.
+  destruct r; discriminate.
+Qed.
+
+Lemma RA_not_written:
+  forall (rs : regset) dst v,
+    rs # (preg_of dst) <- v RA = rs RA.
+Proof.
+  intros.
+  apply Pregmap.gso.
+  intro.
+  symmetry in H.
+  exact (preg_of_not_RA dst H).
+Qed.
+
+Hint Resolve RA_not_written : asmgen.
+
+Lemma RA_not_written2:
+  forall (rs : regset) dst v i,
+    preg_of dst = i ->
+    rs # i <- v RA = rs RA.
+Proof.
+  intros.
+  subst i.
+  apply RA_not_written.
+Qed.
+
+Hint Resolve RA_not_written2 : asmgen.
+
+Lemma RA_not_written3:
+  forall (rs : regset) dst v i,
+    ireg_of dst = OK i ->
+    rs # i <- v RA = rs RA.
+Proof.
+  intros.
+  unfold ireg_of in H.
+  destruct preg_of eqn:PREG; try discriminate.
+  replace i0 with i in * by congruence.
+  eapply RA_not_written2; eassumption.
+Qed.
+
+Hint Resolve RA_not_written3 : asmgen.
+
 Lemma preg_of_iregsp_not_PC: forall r, preg_of_iregsp r <> PC.
 Proof.
   destruct r; simpl; congruence.
@@ -39,6 +84,26 @@ Proof.
   red; intros; subst x. elim (preg_of_not_X16 r); auto.
 Qed.
 
+Lemma ireg_of_not_RA: forall r x, ireg_of r = OK x -> x <> RA.
+Proof.
+  unfold ireg_of; intros. destruct (preg_of r) eqn:E; inv H.
+  red; intros; subst x. elim (preg_of_not_RA r); auto.
+Qed.
+
+Lemma ireg_of_not_RA': forall r x, ireg_of r = OK x -> RA <> x.
+Proof.
+  intros. intro.
+  apply (ireg_of_not_RA r x); auto.
+Qed.
+
+Lemma ireg_of_not_RA'': forall r x, ireg_of r = OK x -> IR RA <> IR x.
+Proof.
+  intros. intro.
+  apply (ireg_of_not_RA' r x); auto. congruence.
+Qed.
+
+Hint Resolve ireg_of_not_RA ireg_of_not_RA' ireg_of_not_RA'' : asmgen.
+
 Lemma ireg_of_not_X16': forall r x, ireg_of r = OK x -> IR x <> IR X16.
 Proof.
   intros. apply ireg_of_not_X16 in H. congruence.
@@ -205,42 +270,49 @@ Qed.
 Lemma exec_loadimm_k_w:
   forall (rd: ireg) k m l,
   wf_decomposition l ->
+  rd <> RA ->
   forall (rs: regset) accu,
   rs#rd = Vint (Int.repr accu) ->
   exists rs',
      exec_straight_opt ge fn (loadimm_k W rd l k) rs m k rs' m
   /\ rs'#rd = Vint (Int.repr (recompose_int accu l))
-  /\ forall r, r <> PC -> r <> rd -> rs'#r = rs#r.
+  /\ (forall r, r <> PC -> r <> rd -> rs'#r = rs#r)
+  /\ rs' # RA = rs # RA.
 Proof.
-  induction 1; intros rs accu ACCU; simpl.
+  induction 1; intros RD_NOT_RA rs accu ACCU; simpl.                          
 - exists rs; split. apply exec_straight_opt_refl. auto.
-- destruct (IHwf_decomposition
+- destruct (IHwf_decomposition RD_NOT_RA
                 (nextinstr (rs#rd <- (insert_in_int rs#rd n p 16)))
                 (Zinsert accu n p 16))
-  as (rs' & P & Q & R).
+  as (rs' & P & Q & R & S).
   Simpl. rewrite ACCU. simpl. f_equal. apply Int.eqm_samerepr. 
   apply Zinsert_eqmod. auto. omega. apply Int.eqm_sym; apply Int.eqm_unsigned_repr.
   exists rs'; split.
   eapply exec_straight_opt_step_opt. simpl; eauto. auto. exact P.
-  split. exact Q. intros; Simpl. rewrite R by auto. Simpl.
+  split. exact Q.
+  split.
+  { intros; Simpl.
+    rewrite R by auto. Simpl. }
+  { rewrite S. Simpl. }
 Qed.
 
 Lemma exec_loadimm_z_w:
   forall rd l k rs m,
   wf_decomposition l ->
+  rd <> RA ->
   exists rs',
      exec_straight ge fn (loadimm_z W rd l k) rs m k rs' m
   /\ rs'#rd = Vint (Int.repr (recompose_int 0 l))
   /\ forall r, r <> PC -> r <> rd -> rs'#r = rs#r.
 Proof.
-  unfold loadimm_z; destruct 1.
+  unfold loadimm_z; destruct 1; intro RD_NOT_RA.
 - econstructor; split.
   apply exec_straight_one. simpl; eauto. auto.
   split. Simpl.
   intros; Simpl.
 - set (accu0 := Zinsert 0 n p 16).
   set (rs1 := nextinstr (rs#rd <- (Vint (Int.repr accu0)))).
-  destruct (exec_loadimm_k_w rd k m l H1 rs1 accu0) as (rs2 & P & Q & R); auto.
+  destruct (exec_loadimm_k_w rd k m l H1 RD_NOT_RA rs1 accu0) as (rs2 & P & Q & R & S); auto.
   unfold rs1; Simpl.
   exists rs2; split.
   eapply exec_straight_opt_step; eauto.
@@ -253,12 +325,13 @@ Qed.
 Lemma exec_loadimm_n_w:
   forall rd l k rs m,
   wf_decomposition l ->
+  rd <> RA ->
   exists rs',
      exec_straight ge fn (loadimm_n W rd l k) rs m k rs' m
   /\ rs'#rd = Vint (Int.repr (Z.lnot (recompose_int 0 l)))
   /\ forall r, r <> PC -> r <> rd -> rs'#r = rs#r.
 Proof.
-  unfold loadimm_n; destruct 1.
+  unfold loadimm_n; destruct 1; intro RD_NOT_RA.
 - econstructor; split.
   apply exec_straight_one. simpl; eauto. auto.
   split. Simpl. 
@@ -267,7 +340,8 @@ Proof.
   set (rs1 := nextinstr (rs#rd <- (Vint (Int.repr accu0)))).
   destruct (exec_loadimm_k_w rd k m (negate_decomposition l) 
                                     (negate_decomposition_wf l H1)
-                                    rs1 accu0) as (rs2 & P & Q & R).
+                                    RD_NOT_RA rs1 accu0)
+    as (rs2 & P & Q & R & S).
   unfold rs1; Simpl.
   exists rs2; split.
   eapply exec_straight_opt_step; eauto.
@@ -279,7 +353,8 @@ Proof.
 Qed.
 
 Lemma exec_loadimm32:
-  forall rd n k rs m,
+  forall rd n k rs m
+  (RD_NOT_RA : rd <> RA),
   exists rs',
      exec_straight ge fn (loadimm32 rd n k) rs m k rs' m
   /\ rs'#rd = Vint n
@@ -302,13 +377,14 @@ Proof.
     apply Int.eqm_samerepr. apply decompose_notint_eqmod. 
     apply Int.repr_unsigned. }
   destruct Nat.leb.
-+ rewrite <- A. apply exec_loadimm_z_w. apply decompose_int_wf; omega.
-+ rewrite <- B. apply exec_loadimm_n_w. apply decompose_int_wf; omega.
++ rewrite <- A. apply exec_loadimm_z_w. apply decompose_int_wf; omega. trivial.
++ rewrite <- B. apply exec_loadimm_n_w. apply decompose_int_wf; omega. trivial.
 Qed.
 
 Lemma exec_loadimm_k_x:
   forall (rd: ireg) k m l,
-  wf_decomposition l ->
+    wf_decomposition l ->
+    rd <> RA ->
   forall (rs: regset) accu,
   rs#rd = Vlong (Int64.repr accu) ->
   exists rs',
@@ -316,9 +392,9 @@ Lemma exec_loadimm_k_x:
   /\ rs'#rd = Vlong (Int64.repr (recompose_int accu l))
   /\ forall r, r <> PC -> r <> rd -> rs'#r = rs#r.
 Proof.
-  induction 1; intros rs accu ACCU; simpl.
+  induction 1; intros RD_NOT_RA rs accu ACCU; simpl.
 - exists rs; split. apply exec_straight_opt_refl. auto.
-- destruct (IHwf_decomposition
+- destruct (IHwf_decomposition RD_NOT_RA
                 (nextinstr (rs#rd <- (insert_in_long rs#rd n p 16)))
                 (Zinsert accu n p 16))
   as (rs' & P & Q & R).
@@ -332,19 +408,20 @@ Qed.
 Lemma exec_loadimm_z_x:
   forall rd l k rs m,
   wf_decomposition l ->
+    rd <> RA ->
   exists rs',
      exec_straight ge fn (loadimm_z X rd l k) rs m k rs' m
   /\ rs'#rd = Vlong (Int64.repr (recompose_int 0 l))
   /\ forall r, r <> PC -> r <> rd -> rs'#r = rs#r.
 Proof.
-  unfold loadimm_z; destruct 1.
+  unfold loadimm_z; destruct 1; intro RD_NOT_RA.
 - econstructor; split.
   apply exec_straight_one. simpl; eauto. auto.
   split. Simpl.
   intros; Simpl.
 - set (accu0 := Zinsert 0 n p 16).
   set (rs1 := nextinstr (rs#rd <- (Vlong (Int64.repr accu0)))).
-  destruct (exec_loadimm_k_x rd k m l H1 rs1 accu0) as (rs2 & P & Q & R); auto.
+  destruct (exec_loadimm_k_x rd k m l H1 RD_NOT_RA rs1 accu0) as (rs2 & P & Q & R); auto.
   unfold rs1; Simpl.
   exists rs2; split.
   eapply exec_straight_opt_step; eauto.
@@ -357,12 +434,13 @@ Qed.
 Lemma exec_loadimm_n_x:
   forall rd l k rs m,
   wf_decomposition l ->
+  rd <> RA ->
   exists rs',
      exec_straight ge fn (loadimm_n X rd l k) rs m k rs' m
   /\ rs'#rd = Vlong (Int64.repr (Z.lnot (recompose_int 0 l)))
   /\ forall r, r <> PC -> r <> rd -> rs'#r = rs#r.
 Proof.
-  unfold loadimm_n; destruct 1.
+  unfold loadimm_n; destruct 1; intro RD_NOT_RA.
 - econstructor; split.
   apply exec_straight_one. simpl; eauto. auto.
   split. Simpl. 
@@ -371,7 +449,7 @@ Proof.
   set (rs1 := nextinstr (rs#rd <- (Vlong (Int64.repr accu0)))).
   destruct (exec_loadimm_k_x rd k m (negate_decomposition l) 
                                     (negate_decomposition_wf l H1)
-                                    rs1 accu0) as (rs2 & P & Q & R).
+                                    RD_NOT_RA rs1 accu0) as (rs2 & P & Q & R).
   unfold rs1; Simpl.
   exists rs2; split.
   eapply exec_straight_opt_step; eauto.
@@ -384,12 +462,13 @@ Qed.
 
 Lemma exec_loadimm64:
   forall rd n k rs m,
+    rd <> RA ->
   exists rs',
      exec_straight ge fn (loadimm64 rd n k) rs m k rs' m
   /\ rs'#rd = Vlong n
   /\ forall r, r <> PC -> r <> rd -> rs'#r = rs#r.
 Proof.
-  unfold loadimm64, loadimm; intros.
+  unfold loadimm64, loadimm; intros until m; intro RD_NOT_RA.
   destruct (is_logical_imm64 n).
 - econstructor; split.
   apply exec_straight_one. simpl; eauto. auto.
@@ -406,8 +485,8 @@ Proof.
     apply Int64.eqm_samerepr. apply decompose_notint_eqmod. 
     apply Int64.repr_unsigned. }
   destruct Nat.leb.
-+ rewrite <- A. apply exec_loadimm_z_x. apply decompose_int_wf; omega.
-+ rewrite <- B. apply exec_loadimm_n_x. apply decompose_int_wf; omega.
++ rewrite <- A. apply exec_loadimm_z_x. apply decompose_int_wf; omega. trivial.
++ rewrite <- B. apply exec_loadimm_n_x. apply decompose_int_wf; omega. trivial.
 Qed.
 
 (** Add immediate *)
@@ -419,55 +498,59 @@ Lemma exec_addimm_aux_32:
       Next (nextinstr (rs#rd <- (sem rs#r1 (Vint (Int.repr n))))) m) ->
   (forall v n1 n2, sem (sem v (Vint n1)) (Vint n2) = sem v (Vint (Int.add n1 n2))) ->
   forall rd r1 n k rs m,
+    (IR RA) <> (preg_of_iregsp (RR1 rd)) ->
   exists rs',
      exec_straight ge fn (addimm_aux insn rd r1 (Int.unsigned n) k) rs m k rs' m
   /\ rs'#rd = sem rs#r1 (Vint n)
-  /\ forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r.
+  /\ (forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r)
+  /\ rs' # RA = rs # RA.
 Proof.
-  intros insn sem SEM ASSOC; intros. unfold addimm_aux.
+  intros insn sem SEM ASSOC; intros until m; intro RD_NOT_RA. unfold addimm_aux.
   set (nlo := Zzero_ext 12 (Int.unsigned n)). set (nhi := Int.unsigned n - nlo).
   assert (E: Int.unsigned n = nhi + nlo) by (unfold nhi; omega).
   rewrite <- (Int.repr_unsigned n).
   destruct (Z.eqb_spec nhi 0); [|destruct (Z.eqb_spec nlo 0)].
 - econstructor; split. apply exec_straight_one. apply SEM. Simpl. 
   split. Simpl. do 3 f_equal; omega.
-  intros; Simpl.
+  split; intros; Simpl.
 - econstructor; split. apply exec_straight_one. apply SEM. Simpl. 
   split. Simpl. do 3 f_equal; omega.
-  intros; Simpl.
+  split; intros; Simpl.
 - econstructor; split. eapply exec_straight_two.
   apply SEM. apply SEM. Simpl. Simpl.
   split. Simpl. rewrite ASSOC. do 2 f_equal. apply Int.eqm_samerepr.
   rewrite E. auto with ints.
-  intros; Simpl.
+  split; intros; Simpl.
 Qed.
 
 Lemma exec_addimm32:
   forall rd r1 n k rs m,
   r1 <> X16 ->
+  (IR RA) <> (preg_of_iregsp (RR1 rd)) ->
   exists rs',
      exec_straight ge fn (addimm32 rd r1 n k) rs m k rs' m
   /\ rs'#rd = Val.add rs#r1 (Vint n)
-  /\ forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r.
+  /\ (forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r)
+  /\ rs' # RA = rs # RA.
 Proof.
   intros. unfold addimm32. set (nn := Int.neg n).
   destruct (Int.eq n (Int.zero_ext 24 n)); [| destruct (Int.eq nn (Int.zero_ext 24 nn))].
-- apply exec_addimm_aux_32 with (sem := Val.add). auto. intros; apply Val.add_assoc. 
+- apply exec_addimm_aux_32 with (sem := Val.add); auto. intros; apply Val.add_assoc. 
 - rewrite <- Val.sub_opp_add.
-  apply exec_addimm_aux_32 with (sem := Val.sub). auto.
+  apply exec_addimm_aux_32 with (sem := Val.sub); auto.
   intros. rewrite ! Val.sub_add_opp, Val.add_assoc. rewrite Int.neg_add_distr. auto.
 - destruct (Int.lt n Int.zero).
 + rewrite <- Val.sub_opp_add; fold nn.
-  edestruct (exec_loadimm32 X16 nn) as (rs1 & A & B & C).
+  edestruct (exec_loadimm32 X16 nn) as (rs1 & A & B & C). congruence.
   econstructor; split.
   eapply exec_straight_trans. eexact A. eapply exec_straight_one. simpl; eauto. auto.
   split. Simpl. rewrite B, C; eauto with asmgen.
-  intros; Simpl.
-+ edestruct (exec_loadimm32 X16 n) as (rs1 & A & B & C).
+  split; intros; Simpl.
++ edestruct (exec_loadimm32 X16 n) as (rs1 & A & B & C). congruence.
   econstructor; split.
   eapply exec_straight_trans. eexact A. eapply exec_straight_one. simpl; eauto. auto.
   split. Simpl. rewrite B, C; eauto with asmgen.
-  intros; Simpl.
+  split; intros; Simpl.
 Qed.
 
 Lemma exec_addimm_aux_64:
@@ -477,10 +560,12 @@ Lemma exec_addimm_aux_64:
       Next (nextinstr (rs#rd <- (sem rs#r1 (Vlong (Int64.repr n))))) m) ->
   (forall v n1 n2, sem (sem v (Vlong n1)) (Vlong n2) = sem v (Vlong (Int64.add n1 n2))) ->
   forall rd r1 n k rs m,
+  (IR RA) <> (preg_of_iregsp (RR1 rd)) ->
   exists rs',
      exec_straight ge fn (addimm_aux insn rd r1 (Int64.unsigned n) k) rs m k rs' m
   /\ rs'#rd = sem rs#r1 (Vlong n)
-  /\ forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r.
+  /\ (forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r)
+  /\ rs' # RA = rs # RA.
 Proof.
   intros insn sem SEM ASSOC; intros. unfold addimm_aux.
   set (nlo := Zzero_ext 12 (Int64.unsigned n)). set (nhi := Int64.unsigned n - nlo).
@@ -489,44 +574,46 @@ Proof.
   destruct (Z.eqb_spec nhi 0); [|destruct (Z.eqb_spec nlo 0)].
 - econstructor; split. apply exec_straight_one. apply SEM. Simpl. 
   split. Simpl. do 3 f_equal; omega.
-  intros; Simpl.
+  split; intros; Simpl.
 - econstructor; split. apply exec_straight_one. apply SEM. Simpl. 
   split. Simpl. do 3 f_equal; omega.
-  intros; Simpl.
+  split; intros; Simpl.
 - econstructor; split. eapply exec_straight_two.
   apply SEM. apply SEM. Simpl. Simpl.
   split. Simpl. rewrite ASSOC. do 2 f_equal. apply Int64.eqm_samerepr.
   rewrite E. auto with ints.
-  intros; Simpl.
+  split; intros; Simpl.
 Qed.
 
 Lemma exec_addimm64:
   forall rd r1 n k rs m,
   preg_of_iregsp r1 <> X16 ->
+  (IR RA) <> (preg_of_iregsp (RR1 rd)) ->
   exists rs',
      exec_straight ge fn (addimm64 rd r1 n k) rs m k rs' m
   /\ rs'#rd = Val.addl rs#r1 (Vlong n)
-  /\ forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r.
+  /\ (forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r)
+  /\ rs' # RA = rs # RA.
 Proof.
   intros. 
   unfold addimm64. set (nn := Int64.neg n).
   destruct (Int64.eq n (Int64.zero_ext 24 n)); [| destruct (Int64.eq nn (Int64.zero_ext 24 nn))].
-- apply exec_addimm_aux_64 with (sem := Val.addl). auto. intros; apply Val.addl_assoc. 
+- apply exec_addimm_aux_64 with (sem := Val.addl); auto. intros; apply Val.addl_assoc. 
 - rewrite <- Val.subl_opp_addl.
-  apply exec_addimm_aux_64 with (sem := Val.subl). auto.
+  apply exec_addimm_aux_64 with (sem := Val.subl); auto.
   intros. rewrite ! Val.subl_addl_opp, Val.addl_assoc. rewrite Int64.neg_add_distr. auto.
 - destruct (Int64.lt n Int64.zero).
 + rewrite <- Val.subl_opp_addl; fold nn.
-  edestruct (exec_loadimm64 X16 nn) as (rs1 & A & B & C).
+  edestruct (exec_loadimm64 X16 nn) as (rs1 & A & B & C). congruence.
   econstructor; split.
   eapply exec_straight_trans. eexact A. eapply exec_straight_one. simpl; eauto. Simpl. 
   split. Simpl. rewrite B, C; eauto with asmgen. simpl. rewrite Int64.shl'_zero. auto.
-  intros; Simpl.
-+ edestruct (exec_loadimm64 X16 n) as (rs1 & A & B & C).
+  split; intros; Simpl.
++ edestruct (exec_loadimm64 X16 n) as (rs1 & A & B & C). congruence.
   econstructor; split.
   eapply exec_straight_trans. eexact A. eapply exec_straight_one. simpl; eauto. Simpl. 
   split. Simpl. rewrite B, C; eauto with asmgen. simpl. rewrite Int64.shl'_zero. auto.
-  intros; Simpl.
+  split; intros; Simpl.
 Qed.
 
 (** Logical immediate *)
@@ -543,22 +630,25 @@ Lemma exec_logicalimm32:
       Next (nextinstr (rs#rd <- (sem rs##r1 (eval_shift_op_int rs#r2 s)))) m) ->
   forall rd r1 n k rs m,
   r1 <> X16 ->
+  (IR RA) <> (preg_of_iregsp (RR1 rd)) ->
   exists rs',
      exec_straight ge fn (logicalimm32 insn1 insn2 rd r1 n k) rs m k rs' m
   /\ rs'#rd = sem rs#r1 (Vint n)
-  /\ forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r.
+  /\ (forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r)
+  /\ rs' # RA = rs # RA.
 Proof.
   intros until sem; intros SEM1 SEM2; intros. unfold logicalimm32.
   destruct (is_logical_imm32 n).
 - econstructor; split. 
   apply exec_straight_one. apply SEM1. reflexivity. 
-  split. Simpl. rewrite Int.repr_unsigned; auto. intros; Simpl.
-- edestruct (exec_loadimm32 X16 n) as (rs1 & A & B & C).
+  split. Simpl. rewrite Int.repr_unsigned; auto.
+  split; intros; Simpl.
+- edestruct (exec_loadimm32 X16 n) as (rs1 & A & B & C). congruence.
   econstructor; split.
   eapply exec_straight_trans. eexact A.  
   apply exec_straight_one. apply SEM2. reflexivity.
   split. Simpl. f_equal; auto. apply C; auto with asmgen.
-  intros; Simpl. 
+  split; intros; Simpl. 
 Qed.
 
 Lemma exec_logicalimm64:
@@ -573,50 +663,58 @@ Lemma exec_logicalimm64:
       Next (nextinstr (rs#rd <- (sem rs###r1 (eval_shift_op_long rs#r2 s)))) m) ->
   forall rd r1 n k rs m,
   r1 <> X16 ->
+  (IR RA) <> (preg_of_iregsp (RR1 rd)) ->
   exists rs',
      exec_straight ge fn (logicalimm64 insn1 insn2 rd r1 n k) rs m k rs' m
   /\ rs'#rd = sem rs#r1 (Vlong n)
-  /\ forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r.
+  /\ (forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r)
+  /\ rs' # RA = rs # RA.
 Proof.
   intros until sem; intros SEM1 SEM2; intros. unfold logicalimm64.
   destruct (is_logical_imm64 n).
 - econstructor; split. 
   apply exec_straight_one. apply SEM1. reflexivity. 
-  split. Simpl. rewrite Int64.repr_unsigned. auto. intros; Simpl.
-- edestruct (exec_loadimm64 X16 n) as (rs1 & A & B & C).
+  split. Simpl. rewrite Int64.repr_unsigned. auto.
+  split; intros; Simpl.
+- edestruct (exec_loadimm64 X16 n) as (rs1 & A & B & C). congruence.
   econstructor; split.
   eapply exec_straight_trans. eexact A.  
   apply exec_straight_one. apply SEM2. reflexivity.
   split. Simpl. f_equal; auto. apply C; auto with asmgen.
-  intros; Simpl. 
+  split; intros; Simpl. 
 Qed.
 
 (** Load address of symbol *)
 
 Lemma exec_loadsymbol: forall rd s ofs k rs m,
-  rd <> X16 \/ Archi.pic_code tt = false ->
+    rd <> X16 \/ Archi.pic_code tt = false ->
+    (IR RA) <> (preg_of_iregsp (RR1 rd)) ->
   exists rs',
      exec_straight ge fn (loadsymbol rd s ofs k) rs m k rs' m
   /\ rs'#rd = Genv.symbol_address ge s ofs
-  /\ forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r.
+  /\ (forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r)
+  /\ rs'#RA = rs#RA.
 Proof.
   unfold loadsymbol; intros. destruct (Archi.pic_code tt).
 - predSpec Ptrofs.eq Ptrofs.eq_spec ofs Ptrofs.zero.
 + subst ofs. econstructor; split.
   apply exec_straight_one; [simpl; eauto | reflexivity].
-  split. Simpl. intros; Simpl.
+  split. Simpl. split; intros; Simpl.
+  
 + exploit exec_addimm64. instantiate (1 := rd). simpl. destruct H; congruence.
-  intros (rs1 & A & B & C).
+  instantiate (1 := rd). assumption.
+  intros (rs1 & A & B & C & D).
   econstructor; split.
   econstructor. simpl; eauto. auto. eexact A. 
   split. simpl in B; rewrite B. Simpl. 
   rewrite <- Genv.shift_symbol_address_64 by auto.
   rewrite Ptrofs.add_zero_l, Ptrofs.of_int64_to_int64 by auto. auto.
-  intros. rewrite C by auto. Simpl.
+  split; intros. rewrite C by auto; Simpl.
+  rewrite D. Simpl.
 - econstructor; split.
   eapply exec_straight_two. simpl; eauto. simpl; eauto. auto. auto.
   split. Simpl. rewrite symbol_high_low; auto. 
-  intros; Simpl.
+  split; intros; Simpl.
 Qed.
 
 (** Shifted operands *)
@@ -725,23 +823,25 @@ Lemma exec_arith_extended:
       Next (nextinstr (rs#rd <- (sem rs###r1 (eval_shift_op_long rs#r2 s)))) m) ->
   forall (rd r1 r2: ireg) (ex: extension) (a: amount64) (k: code) rs m,
   r1 <> X16 ->
+  (IR RA) <> (preg_of_iregsp (RR1 rd)) ->
   exists rs',
      exec_straight ge fn (arith_extended insnX insnS rd r1 r2 ex a k) rs m k rs' m
   /\ rs'#rd = sem rs#r1 (Op.eval_extend ex rs#r2 a)
-  /\ forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r.
+  /\ (forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r)
+  /\ rs' # RA = rs # RA.
 Proof.
   intros sem insnX insnS EX ES; intros. unfold arith_extended. destruct (Int.ltu a (Int.repr 5)).
 - econstructor; split. 
   apply exec_straight_one. rewrite EX; eauto. auto.
   split. Simpl. f_equal. destruct ex; auto.
-  intros; Simpl.
+  split; intros; Simpl.
 - exploit (exec_move_extended_base X16 r2 ex). intros (rs' & A & B & C).
   econstructor; split.
   eapply exec_straight_trans. eexact A. apply exec_straight_one. 
   rewrite ES. eauto. auto.
   split. Simpl. unfold ir0x. rewrite C by eauto with asmgen. f_equal. 
   rewrite B. destruct ex; auto.
-  intros; Simpl.
+  split; intros; Simpl.
 Qed. 
 
 (** Extended right shift *)
@@ -749,15 +849,18 @@ Qed.
 Lemma exec_shrx32: forall (rd r1: ireg) (n: int) k v (rs: regset) m,
   Val.shrx rs#r1 (Vint n) = Some v ->
   r1 <> X16 ->
+  (IR RA) <> (preg_of_iregsp (RR1 rd)) ->
   exists rs',
      exec_straight ge fn (shrx32 rd r1 n k) rs m k rs' m
   /\ rs'#rd = v
-  /\ forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r.
+  /\ (forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r)
+  /\ rs' # RA = rs # RA.
 Proof.
   unfold shrx32; intros. apply Val.shrx_shr_3 in H.
   destruct (Int.eq n Int.zero) eqn:E.
 - econstructor; split. apply exec_straight_one; [simpl;eauto|auto]. 
-  split. Simpl. subst v; auto. intros; Simpl.
+  split. Simpl. subst v; auto.
+  split; intros; Simpl.
 - generalize (Int.eq_spec n Int.one).
   destruct (Int.eq n Int.one); intro ONE.
   * subst n.
@@ -769,27 +872,31 @@ Proof.
        change (Int.ltu Int.one Int.iwordsize) with true; simpl.
        rewrite Int.or_zero_l.
        reflexivity.
-    ** intros; Simpl.
+    ** split; intros; Simpl.
   * econstructor; split. eapply exec_straight_three.
     unfold exec_instr. rewrite or_zero_eval_shift_op_int by congruence. eauto.
     simpl; eauto.
     unfold exec_instr. rewrite or_zero_eval_shift_op_int by congruence. eauto.
     auto. auto. auto.
-    split. subst v; Simpl. intros; Simpl.
+    split. subst v; Simpl.
+    split; intros; Simpl.
 Qed.
  
 Lemma exec_shrx64: forall (rd r1: ireg) (n: int) k v (rs: regset) m,
   Val.shrxl rs#r1 (Vint n) = Some v ->
   r1 <> X16 ->
+  (IR RA) <> (preg_of_iregsp (RR1 rd)) ->
   exists rs',
      exec_straight ge fn (shrx64 rd r1 n k) rs m k rs' m
   /\ rs'#rd = v
-  /\ forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r.
+  /\ (forall r, data_preg r = true -> r <> rd -> rs'#r = rs#r)
+  /\ rs' # RA = rs # RA.
 Proof.
   unfold shrx64; intros. apply Val.shrxl_shrl_3 in H.
   destruct (Int.eq n Int.zero) eqn:E.
 - econstructor; split. apply exec_straight_one; [simpl;eauto|auto]. 
-  split. Simpl. subst v; auto. intros; Simpl.
+  split. Simpl. subst v; auto.
+  split; intros; Simpl.
 - generalize (Int.eq_spec n Int.one).
   destruct (Int.eq n Int.one); intro ONE.
   * subst n.
@@ -801,13 +908,14 @@ Proof.
        change (Int.ltu Int.one Int64.iwordsize') with true; simpl.
        rewrite Int64.or_zero_l.
        reflexivity.
-    ** intros; Simpl.
+    ** split; intros; Simpl.
   * econstructor; split. eapply exec_straight_three.
     unfold exec_instr. rewrite or_zero_eval_shift_op_long by congruence. eauto.
     simpl; eauto.
     unfold exec_instr. rewrite or_zero_eval_shift_op_long by congruence. eauto.
     auto. auto. auto.
-    split. subst v; Simpl. intros; Simpl.
+    split. subst v; Simpl.
+    split; intros; Simpl.
 Qed.
 
 (** Condition bits *)
@@ -1063,6 +1171,56 @@ Ltac ArgsInv :=
   | [ H: freg_of _ = OK _ |- _ ] => simpl in *; rewrite (freg_of_eq _ _ H) in *
   end).
 
+Lemma compare_int_RA:
+  forall rs a b m,
+    compare_int rs a b m X30 = rs X30.
+Proof.
+  unfold compare_int.
+  intros.
+  repeat rewrite Pregmap.gso by congruence.
+  trivial.
+Qed.
+
+Hint Resolve compare_int_RA : asmgen.
+
+Lemma compare_long_RA:
+  forall rs a b m,
+    compare_long rs a b m X30 = rs X30.
+Proof.
+  unfold compare_long.
+  intros.
+  repeat rewrite Pregmap.gso by congruence.
+  trivial.
+Qed.
+
+Hint Resolve compare_long_RA : asmgen.
+
+Lemma compare_float_RA:
+  forall rs a b,
+    compare_float rs a b X30 = rs X30.
+Proof.
+  unfold compare_float.
+  intros.
+  destruct a; destruct b.
+  all: repeat rewrite Pregmap.gso by congruence; trivial.
+Qed.
+
+Hint Resolve compare_float_RA : asmgen.
+  
+
+Lemma compare_single_RA:
+  forall rs a b,
+    compare_single rs a b X30 = rs X30.
+Proof.
+  unfold compare_single.
+  intros.
+  destruct a; destruct b.
+  all: repeat rewrite Pregmap.gso by congruence; trivial.
+Qed.
+
+Hint Resolve compare_single_RA : asmgen.
+  
+                                         
 Lemma transl_cond_correct:
   forall cond args k c rs m,
   transl_cond cond args k = OK c ->
@@ -1071,185 +1229,218 @@ Lemma transl_cond_correct:
   /\ (forall b,
       eval_condition cond (map rs (map preg_of args)) m = Some b ->
       eval_testcond (cond_for_cond cond) rs' = Some b)
-  /\ forall r, data_preg r = true -> rs'#r = rs#r.
+  /\ (forall r, data_preg r = true -> rs'#r = rs#r)
+  /\ rs' # RA = rs # RA.
 Proof.
   intros until m; intros TR. destruct cond; simpl in TR; ArgsInv.
 - (* Ccomp *)
   econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. apply eval_testcond_compare_sint; auto. 
+  repeat split; intros. apply eval_testcond_compare_sint; auto. 
   destruct r; reflexivity || discriminate.
 - (* Ccompu *)
   econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. apply eval_testcond_compare_uint; auto. 
+  repeat split; intros. apply eval_testcond_compare_uint; auto. 
   destruct r; reflexivity || discriminate.
 - (* Ccompimm *)
   destruct (is_arith_imm32 n); [|destruct (is_arith_imm32 (Int.neg n))].
 + econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. rewrite Int.repr_unsigned. apply eval_testcond_compare_sint; auto. 
+  repeat split; intros. rewrite Int.repr_unsigned. apply eval_testcond_compare_sint; auto. 
   destruct r; reflexivity || discriminate.
 + econstructor; split.
   apply exec_straight_one. simpl. rewrite Int.repr_unsigned, Int.neg_involutive. eauto. auto.
-  split; intros. apply eval_testcond_compare_sint; auto. 
+  repeat split; intros. apply eval_testcond_compare_sint; auto. 
   destruct r; reflexivity || discriminate.
-+ exploit (exec_loadimm32 X16 n). intros (rs' & A & B & C).
++ exploit (exec_loadimm32 X16 n). congruence. intros (rs' & A & B & C).
   econstructor; split.
   eapply exec_straight_trans. eexact A. apply exec_straight_one.
   simpl. rewrite B, C by eauto with asmgen. eauto. auto.
-  split; intros. apply eval_testcond_compare_sint; auto. 
-  transitivity (rs' r). destruct r; reflexivity || discriminate. auto with asmgen.
+  repeat split; intros. apply eval_testcond_compare_sint; auto. 
+  transitivity (rs' r). destruct r; reflexivity || discriminate.
+  auto with asmgen.
+  Simpl. rewrite compare_int_RA.
+  apply C; congruence.
 - (* Ccompuimm *)
   destruct (is_arith_imm32 n); [|destruct (is_arith_imm32 (Int.neg n))].
 + econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. rewrite Int.repr_unsigned. apply eval_testcond_compare_uint; auto. 
+  repeat split; intros. rewrite Int.repr_unsigned. apply eval_testcond_compare_uint; auto. 
   destruct r; reflexivity || discriminate.
 + econstructor; split.
   apply exec_straight_one. simpl. rewrite Int.repr_unsigned, Int.neg_involutive. eauto. auto.
-  split; intros. apply eval_testcond_compare_uint; auto. 
+  repeat split; intros. apply eval_testcond_compare_uint; auto. 
   destruct r; reflexivity || discriminate.
-+ exploit (exec_loadimm32 X16 n). intros (rs' & A & B & C).
++ exploit (exec_loadimm32 X16 n). congruence. intros (rs' & A & B & C).
   econstructor; split.
   eapply exec_straight_trans. eexact A. apply exec_straight_one.
   simpl. rewrite B, C by eauto with asmgen. eauto. auto.
-  split; intros. apply eval_testcond_compare_uint; auto. 
+  repeat split; intros. apply eval_testcond_compare_uint; auto. 
   transitivity (rs' r). destruct r; reflexivity || discriminate. auto with asmgen.
+  Simpl. rewrite compare_int_RA.
+  apply C; congruence.  
 - (* Ccompshift *)
   econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. rewrite transl_eval_shift. apply eval_testcond_compare_sint; auto. 
+  repeat split; intros. rewrite transl_eval_shift. apply eval_testcond_compare_sint; auto. 
   destruct r; reflexivity || discriminate.
 - (* Ccompushift *)
   econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. rewrite transl_eval_shift. apply eval_testcond_compare_uint; auto. 
+  repeat split; intros. rewrite transl_eval_shift. apply eval_testcond_compare_uint; auto. 
   destruct r; reflexivity || discriminate.
 - (* Cmaskzero *)
   destruct (is_logical_imm32 n).
 + econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. rewrite Int.repr_unsigned. apply (eval_testcond_compare_sint Ceq); auto.
+  repeat split; intros. rewrite Int.repr_unsigned. apply (eval_testcond_compare_sint Ceq); auto.
   destruct r; reflexivity || discriminate.
-+ exploit (exec_loadimm32 X16 n). intros (rs' & A & B & C).
++ exploit (exec_loadimm32 X16 n). congruence. intros (rs' & A & B & C).
   econstructor; split.
   eapply exec_straight_trans. eexact A.
   apply exec_straight_one. simpl. rewrite B, C by eauto with asmgen. eauto. auto.
-  split; intros. apply (eval_testcond_compare_sint Ceq); auto.
+  repeat split; intros. apply (eval_testcond_compare_sint Ceq); auto.
   transitivity (rs' r). destruct r; reflexivity || discriminate. auto with asmgen.
+  Simpl. rewrite compare_int_RA.
+  apply C; congruence.
+
 - (* Cmasknotzero *)
   destruct (is_logical_imm32 n).
 + econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. rewrite Int.repr_unsigned. apply (eval_testcond_compare_sint Cne); auto.
+  repeat split; intros. rewrite Int.repr_unsigned. apply (eval_testcond_compare_sint Cne); auto.
   destruct r; reflexivity || discriminate.
-+ exploit (exec_loadimm32 X16 n). intros (rs' & A & B & C).
+
++ exploit (exec_loadimm32 X16 n). congruence. intros (rs' & A & B & C).
   econstructor; split.
   eapply exec_straight_trans. eexact A.
   apply exec_straight_one. simpl. rewrite B, C by eauto with asmgen. eauto. auto.
-  split; intros. apply (eval_testcond_compare_sint Cne); auto.
+  repeat split; intros. apply (eval_testcond_compare_sint Cne); auto.
   transitivity (rs' r). destruct r; reflexivity || discriminate. auto with asmgen.
+  Simpl. rewrite compare_int_RA.
+  apply C; congruence.
+  
 - (* Ccompl *)
   econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. apply eval_testcond_compare_slong; auto. 
+  repeat split; intros. apply eval_testcond_compare_slong; auto. 
   destruct r; reflexivity || discriminate.
 - (* Ccomplu *)
   econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. apply eval_testcond_compare_ulong; auto. 
+  repeat split; intros. apply eval_testcond_compare_ulong; auto. 
   destruct r; reflexivity || discriminate.
 - (* Ccomplimm *)
   destruct (is_arith_imm64 n); [|destruct (is_arith_imm64 (Int64.neg n))].
 + econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. rewrite Int64.repr_unsigned. apply eval_testcond_compare_slong; auto. 
+  repeat split; intros. rewrite Int64.repr_unsigned. apply eval_testcond_compare_slong; auto. 
   destruct r; reflexivity || discriminate.
 + econstructor; split.
   apply exec_straight_one. simpl. rewrite Int64.repr_unsigned, Int64.neg_involutive. eauto. auto.
-  split; intros. apply eval_testcond_compare_slong; auto. 
+  repeat split; intros. apply eval_testcond_compare_slong; auto. 
   destruct r; reflexivity || discriminate.
-+ exploit (exec_loadimm64 X16 n). intros (rs' & A & B & C).
++ exploit (exec_loadimm64 X16 n). congruence. intros (rs' & A & B & C).
   econstructor; split.
   eapply exec_straight_trans. eexact A. apply exec_straight_one.
   simpl. rewrite B, C by eauto with asmgen. eauto. auto.
-  split; intros. apply eval_testcond_compare_slong; auto. 
+  repeat split; intros. apply eval_testcond_compare_slong; auto. 
   transitivity (rs' r). destruct r; reflexivity || discriminate. auto with asmgen.
+  Simpl. rewrite compare_long_RA.
+  apply C; congruence.
+
 - (* Ccompluimm *)
   destruct (is_arith_imm64 n); [|destruct (is_arith_imm64 (Int64.neg n))].
 + econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. rewrite Int64.repr_unsigned. apply eval_testcond_compare_ulong; auto. 
+  repeat split; intros. rewrite Int64.repr_unsigned. apply eval_testcond_compare_ulong; auto. 
   destruct r; reflexivity || discriminate.
 + econstructor; split.
   apply exec_straight_one. simpl. rewrite Int64.repr_unsigned, Int64.neg_involutive. eauto. auto.
-  split; intros. apply eval_testcond_compare_ulong; auto. 
+  repeat split; intros. apply eval_testcond_compare_ulong; auto. 
   destruct r; reflexivity || discriminate.
-+ exploit (exec_loadimm64 X16 n). intros (rs' & A & B & C).
++ exploit (exec_loadimm64 X16 n). congruence. intros (rs' & A & B & C).
   econstructor; split.
   eapply exec_straight_trans. eexact A. apply exec_straight_one.
   simpl. rewrite B, C by eauto with asmgen. eauto. auto.
-  split; intros. apply eval_testcond_compare_ulong; auto. 
+  repeat split; intros. apply eval_testcond_compare_ulong; auto. 
   transitivity (rs' r). destruct r; reflexivity || discriminate. auto with asmgen.
+  Simpl. rewrite compare_long_RA.
+  apply C; congruence.
+  
 - (* Ccomplshift *)
   econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. rewrite transl_eval_shiftl. apply eval_testcond_compare_slong; auto. 
+  repeat split; intros. rewrite transl_eval_shiftl. apply eval_testcond_compare_slong; auto. 
   destruct r; reflexivity || discriminate.
 - (* Ccomplushift *)
   econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. rewrite transl_eval_shiftl. apply eval_testcond_compare_ulong; auto. 
+  repeat split; intros. rewrite transl_eval_shiftl. apply eval_testcond_compare_ulong; auto. 
   destruct r; reflexivity || discriminate.
 - (* Cmasklzero *)
   destruct (is_logical_imm64 n).
 + econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. rewrite Int64.repr_unsigned. apply (eval_testcond_compare_slong Ceq); auto.
+  repeat split; intros. rewrite Int64.repr_unsigned. apply (eval_testcond_compare_slong Ceq); auto.
   destruct r; reflexivity || discriminate.
-+ exploit (exec_loadimm64 X16 n). intros (rs' & A & B & C).
++ exploit (exec_loadimm64 X16 n). congruence. intros (rs' & A & B & C).
   econstructor; split.
   eapply exec_straight_trans. eexact A.
   apply exec_straight_one. simpl. rewrite B, C by eauto with asmgen. eauto. auto.
-  split; intros. apply (eval_testcond_compare_slong Ceq); auto.
+  repeat split; intros. apply (eval_testcond_compare_slong Ceq); auto.
   transitivity (rs' r). destruct r; reflexivity || discriminate. auto with asmgen.
+  Simpl. rewrite compare_long_RA.
+  apply C; congruence.
+
 - (* Cmasknotzero *)
   destruct (is_logical_imm64 n).
 + econstructor; split. apply exec_straight_one. simpl; eauto. auto.
-  split; intros. rewrite Int64.repr_unsigned. apply (eval_testcond_compare_slong Cne); auto.
+  repeat split; intros. rewrite Int64.repr_unsigned. apply (eval_testcond_compare_slong Cne); auto.
   destruct r; reflexivity || discriminate.
-+ exploit (exec_loadimm64 X16 n). intros (rs' & A & B & C).
++ exploit (exec_loadimm64 X16 n). congruence. intros (rs' & A & B & C).
   econstructor; split.
   eapply exec_straight_trans. eexact A.
   apply exec_straight_one. simpl. rewrite B, C by eauto with asmgen. eauto. auto.
-  split; intros. apply (eval_testcond_compare_slong Cne); auto.
+  repeat split; intros. apply (eval_testcond_compare_slong Cne); auto.
   transitivity (rs' r). destruct r; reflexivity || discriminate. auto with asmgen.
+    Simpl. rewrite compare_long_RA.
+  apply C; congruence.
+
 - (* Ccompf *)
   econstructor; split. apply exec_straight_one. simpl; eauto.
   rewrite compare_float_inv; auto.
-  split; intros. apply eval_testcond_compare_float; auto.
+  repeat split; intros. apply eval_testcond_compare_float; auto.
   destruct r; discriminate || rewrite compare_float_inv; auto.
+  Simpl. 
 - (* Cnotcompf *)
   econstructor; split. apply exec_straight_one. simpl; eauto.
   rewrite compare_float_inv; auto.
-  split; intros. apply eval_testcond_compare_not_float; auto.
+  repeat split; intros. apply eval_testcond_compare_not_float; auto.
   destruct r; discriminate || rewrite compare_float_inv; auto.
+  Simpl. 
 - (* Ccompfzero *)
   econstructor; split. apply exec_straight_one. simpl; eauto.
   rewrite compare_float_inv; auto.
-  split; intros. apply eval_testcond_compare_float; auto.
+  repeat split; intros. apply eval_testcond_compare_float; auto.
   destruct r; discriminate || rewrite compare_float_inv; auto.
+  Simpl.
 - (* Cnotcompfzero *)
   econstructor; split. apply exec_straight_one. simpl; eauto.
   rewrite compare_float_inv; auto.
-  split; intros. apply eval_testcond_compare_not_float; auto.
+  repeat split; intros. apply eval_testcond_compare_not_float; auto.
   destruct r; discriminate || rewrite compare_float_inv; auto.
+  Simpl.
 - (* Ccompfs *)
   econstructor; split. apply exec_straight_one. simpl; eauto.
   rewrite compare_single_inv; auto.
-  split; intros. apply eval_testcond_compare_single; auto.
+  repeat split; intros. apply eval_testcond_compare_single; auto.
   destruct r; discriminate || rewrite compare_single_inv; auto.
+  Simpl.
 - (* Cnotcompfs *)
   econstructor; split. apply exec_straight_one. simpl; eauto.
   rewrite compare_single_inv; auto.
-  split; intros. apply eval_testcond_compare_not_single; auto.
+  repeat split; intros. apply eval_testcond_compare_not_single; auto.
   destruct r; discriminate || rewrite compare_single_inv; auto.
+  Simpl.
 - (* Ccompfszero *)
   econstructor; split. apply exec_straight_one. simpl; eauto.
   rewrite compare_single_inv; auto.
-  split; intros. apply eval_testcond_compare_single; auto.
+  repeat split; intros. apply eval_testcond_compare_single; auto.
   destruct r; discriminate || rewrite compare_single_inv; auto.
+  Simpl.
 - (* Cnotcompfszero *)
   econstructor; split. apply exec_straight_one. simpl; eauto.
   rewrite compare_single_inv; auto.
-  split; intros. apply eval_testcond_compare_not_single; auto.
+  repeat split; intros. apply eval_testcond_compare_not_single; auto.
   destruct r; discriminate || rewrite compare_single_inv; auto.
+  Simpl.
 Qed.
 
 (** Translation of conditional branches *)
@@ -1262,7 +1453,8 @@ Lemma transl_cond_branch_correct:
      exec_straight_opt ge fn c rs m (insn :: k) rs' m
   /\ exec_instr ge fn insn rs' m =
          (if b then goto_label fn lbl rs' m else Next (nextinstr rs') m)
-  /\ forall r, data_preg r = true -> rs'#r = rs#r.
+  /\ (forall r, data_preg r = true -> rs'#r = rs#r)
+  /\ rs' # RA = rs # RA.
 Proof.
   intros until b; intros TR EV.
   assert (DFL:
@@ -1271,13 +1463,14 @@ Proof.
        exec_straight_opt ge fn c rs m (insn :: k) rs' m
     /\ exec_instr ge fn insn rs' m =
          (if b then goto_label fn lbl rs' m else Next (nextinstr rs') m)
-    /\ forall r, data_preg r = true -> rs'#r = rs#r).
+    /\ (forall r, data_preg r = true -> rs'#r = rs#r)
+     /\ rs' # RA = rs # RA ).
   {
     unfold transl_cond_branch_default; intros.
-    exploit transl_cond_correct; eauto. intros (rs' & A & B & C).
+    exploit transl_cond_correct; eauto. intros (rs' & A & B & C & D).
     exists rs', (Pbc (cond_for_cond cond) lbl); split.
     apply exec_straight_opt_intro. eexact A.
-    split; auto. simpl. rewrite (B b) by auto. auto. 
+    repeat split; auto. simpl. rewrite (B b) by auto. auto.
   }
 Local Opaque transl_cond transl_cond_branch_default.
   destruct args as [ | a1 args]; simpl in TR; auto.
@@ -1371,13 +1564,15 @@ Ltac TranslOpSimpl :=
   [ apply exec_straight_one; [simpl; eauto | reflexivity]
   | split; [ rewrite ? transl_eval_shift, ? transl_eval_shiftl;
              apply Val.lessdef_same; Simpl; fail
-           | intros; Simpl; fail ] ].
+           | split; [ intros; Simpl; fail
+                    | intros; Simpl; eauto with asmgen; fail] ]].
 
 Ltac TranslOpBase :=
   econstructor; split;
   [ apply exec_straight_one; [simpl; eauto | reflexivity]
   | split; [ rewrite ? transl_eval_shift, ? transl_eval_shiftl; Simpl
-           | intros; Simpl; fail ] ].
+           | split; [ intros; Simpl; fail
+                    | intros; Simpl; eapply RA_not_written2; eauto] ]].
 
 Lemma transl_op_correct:
   forall op args res k (rs: regset) m v c,
@@ -1386,21 +1581,29 @@ Lemma transl_op_correct:
   exists rs',
      exec_straight ge fn c rs m k rs' m
   /\ Val.lessdef v rs'#(preg_of res)
-  /\ forall r, data_preg r = true -> r <> preg_of res -> preg_notin r (destroyed_by_op op) -> rs' r = rs r.
+  /\ (forall r, data_preg r = true -> r <> preg_of res -> preg_notin r (destroyed_by_op op) -> rs' r = rs r)
+  /\ rs' RA = rs RA.
 Proof.
 Local Opaque Int.eq Int64.eq Val.add Val.addl Int.zwordsize Int64.zwordsize.
   intros until c; intros TR EV.
   unfold transl_op in TR; destruct op; ArgsInv; simpl in EV; SimplEval EV; try TranslOpSimpl.
 - (* move *)
   destruct (preg_of res) eqn:RR; try discriminate; destruct (preg_of m0) eqn:R1; inv TR.
-+ TranslOpSimpl.
-+ TranslOpSimpl.
+  all: TranslOpSimpl.
 - (* intconst *)
-  exploit exec_loadimm32. intros (rs' & A & B & C).
-  exists rs'; split. eexact A. split. rewrite B; auto. intros; auto with asmgen.
+  exploit exec_loadimm32. apply (ireg_of_not_RA res); eassumption.
+  intros (rs' & A & B & C).
+  exists rs'; split. eexact A. split. rewrite B; auto.
+  split. intros; auto with asmgen.
+  apply C. congruence.
+  eapply ireg_of_not_RA''; eauto.
 - (* longconst *)
-  exploit exec_loadimm64. intros (rs' & A & B & C).
-  exists rs'; split. eexact A. split. rewrite B; auto. intros; auto with asmgen.
+  exploit exec_loadimm64. apply (ireg_of_not_RA res); eassumption.
+  intros (rs' & A & B & C).
+  exists rs'; split. eexact A. split. rewrite B; auto.
+  split. intros; auto with asmgen.
+  apply C. congruence.
+  eapply ireg_of_not_RA''; eauto.
 - (* floatconst *)
   destruct (Float.eq_dec n Float.zero).
 + subst n. TranslOpSimpl. 
@@ -1410,11 +1613,15 @@ Local Opaque Int.eq Int64.eq Val.add Val.addl Int.zwordsize Int64.zwordsize.
 + subst n. TranslOpSimpl. 
 + TranslOpSimpl.
 - (* loadsymbol *)
-  exploit (exec_loadsymbol x id ofs). eauto with asmgen. intros (rs' & A & B & C).
-  exists rs'; split. eexact A. split. rewrite B; auto. auto.
+  exploit (exec_loadsymbol x id ofs). eauto with asmgen.
+  apply (ireg_of_not_RA'' res); eassumption. 
+  intros (rs' & A & B & C & D).
+  exists rs'; split. eexact A. split. rewrite B; auto.
+  split; auto.
 - (* addrstack *)
   exploit (exec_addimm64 x XSP (Ptrofs.to_int64 ofs)). simpl; eauto with asmgen.
-  intros (rs' & A & B & C).
+  apply (ireg_of_not_RA'' res); eassumption. 
+  intros (rs' & A & B & C & D).
   exists rs'; split. eexact A. split. simpl in B; rewrite B.
 Local Transparent Val.addl.
   destruct (rs SP); simpl; auto. rewrite Ptrofs.of_int64_to_int64 by auto. auto.
@@ -1422,7 +1629,8 @@ Local Transparent Val.addl.
 - (* shift *)
   rewrite <- transl_eval_shift'. TranslOpSimpl.
 - (* addimm *)
-  exploit (exec_addimm32 x x0 n). eauto with asmgen. intros (rs' & A & B & C).
+  exploit (exec_addimm32 x x0 n). eauto with asmgen. eapply ireg_of_not_RA''; eassumption.
+  intros (rs' & A & B & C & D).
   exists rs'; split. eexact A. split. rewrite B; auto. auto.
 - (* mul *)
   TranslOpBase.
@@ -1430,18 +1638,20 @@ Local Transparent Val.add.
   destruct (rs x0); auto; destruct (rs x1); auto. simpl. rewrite Int.add_zero_l; auto.
 - (* andimm *)
   exploit (exec_logicalimm32 (Pandimm W) (Pand W)). 
-  intros; reflexivity. intros; reflexivity. instantiate (1 := x0). eauto with asmgen.
-  intros (rs' & A & B & C). 
-  exists rs'; split. eexact A. split. rewrite B; auto. auto.
+  intros; reflexivity. intros; reflexivity. instantiate (1 := x0). eauto with asmgen. apply (ireg_of_not_RA'' res); eassumption.
+  intros (rs' & A & B & C & D). 
+  exists rs'; split. eexact A. split. rewrite B; auto.
+  split; auto.
 - (* orimm *)
   exploit (exec_logicalimm32 (Porrimm W) (Porr W)). 
-  intros; reflexivity. intros; reflexivity. instantiate (1 := x0). eauto with asmgen.
-  intros (rs' & A & B & C). 
-  exists rs'; split. eexact A. split. rewrite B; auto. auto.
+  intros; reflexivity. intros; reflexivity. instantiate (1 := x0). eauto with asmgen.  apply (ireg_of_not_RA'' res); eassumption.
+  intros (rs' & A & B & C & D). 
+  exists rs'; split. eexact A. split. rewrite B; auto.
+  split; auto.
 - (* xorimm *)
   exploit (exec_logicalimm32 (Peorimm W) (Peor W)). 
-  intros; reflexivity. intros; reflexivity. instantiate (1 := x0). eauto with asmgen.
-  intros (rs' & A & B & C). 
+  intros; reflexivity. intros; reflexivity. instantiate (1 := x0). eauto with asmgen. apply (ireg_of_not_RA'' res); eassumption.
+  intros (rs' & A & B & C & D). 
   exists rs'; split. eexact A. split. rewrite B; auto. auto.
 - (* not *)
   TranslOpBase.
@@ -1450,8 +1660,10 @@ Local Transparent Val.add.
   TranslOpBase.
   destruct (eval_shift s (rs x0) a); auto. simpl. rewrite Int.or_zero_l; auto.
 - (* shrx *)
-  exploit (exec_shrx32 x x0 n); eauto with asmgen. intros (rs' & A & B & C).
-  econstructor; split. eexact A. split. rewrite B; auto. auto.
+  exploit (exec_shrx32 x x0 n); eauto with asmgen. apply (ireg_of_not_RA'' res); eassumption.
+  intros (rs' & A & B & C & D).
+  econstructor; split. eexact A. split. rewrite B; auto.
+  split; auto.
 - (* zero-ext *)
   TranslOpBase.
   destruct (rs x0); auto; simpl. rewrite Int.shl_zero. auto.
@@ -1475,36 +1687,47 @@ Local Transparent Val.add.
 - (* extend *)
   exploit (exec_move_extended x0 x1 x a k). intros (rs' & A & B & C).
   econstructor; split. eexact A. 
-  split. rewrite B; auto. eauto with asmgen.
+  split. rewrite B; auto.
+  split; eauto with asmgen.
 - (* addext *)
   exploit (exec_arith_extended Val.addl Paddext (Padd X)).
-  auto. auto. instantiate (1 := x1). eauto with asmgen. intros (rs' & A & B & C).
-  econstructor; split. eexact A. split. rewrite B; auto. auto.
+  auto. auto. instantiate (1 := x1). eauto with asmgen.
+  apply (ireg_of_not_RA'' res); eassumption.
+  intros (rs' & A & B & C & D).
+  econstructor; split. eexact A. split. rewrite B; auto.
+  split; auto.
 - (* addlimm *)
   exploit (exec_addimm64 x x0 n). simpl. generalize (ireg_of_not_X16 _ _ EQ1). congruence.
-  intros (rs' & A & B & C).
+  apply (ireg_of_not_RA'' res); eassumption.
+  intros (rs' & A & B & C & D).
   exists rs'; split. eexact A. split. simpl in B; rewrite B; auto. auto.
 - (* subext *)
   exploit (exec_arith_extended Val.subl Psubext (Psub X)).
-  auto. auto. instantiate (1 := x1). eauto with asmgen. intros (rs' & A & B & C).
-  econstructor; split. eexact A. split. rewrite B; auto. auto.
+  auto. auto. instantiate (1 := x1). eauto with asmgen.
+  apply (ireg_of_not_RA'' res); eassumption.
+  intros (rs' & A & B & C & D).
+  econstructor; split. eexact A. split. rewrite B; auto.
+  split; auto.
 - (* mull *)
   TranslOpBase.
   destruct (rs x0); auto; destruct (rs x1); auto. simpl. rewrite Int64.add_zero_l; auto.
 - (* andlimm *)
   exploit (exec_logicalimm64 (Pandimm X) (Pand X)). 
   intros; reflexivity. intros; reflexivity. instantiate (1 := x0). eauto with asmgen.
-  intros (rs' & A & B & C). 
+  apply (ireg_of_not_RA'' res); eassumption.
+  intros (rs' & A & B & C & D). 
   exists rs'; split. eexact A. split. rewrite B; auto. auto.
 - (* orlimm *)
   exploit (exec_logicalimm64 (Porrimm X) (Porr X)). 
   intros; reflexivity. intros; reflexivity. instantiate (1 := x0). eauto with asmgen.
-  intros (rs' & A & B & C). 
+  apply (ireg_of_not_RA'' res); eassumption.
+  intros (rs' & A & B & C & D). 
   exists rs'; split. eexact A. split. rewrite B; auto. auto.
 - (* xorlimm *)
   exploit (exec_logicalimm64 (Peorimm X) (Peor X)). 
   intros; reflexivity. intros; reflexivity. instantiate (1 := x0). eauto with asmgen.
-  intros (rs' & A & B & C). 
+  apply (ireg_of_not_RA'' res); eassumption.
+  intros (rs' & A & B & C & D). 
   exists rs'; split. eexact A. split. rewrite B; auto. auto.
 - (* notl *)
   TranslOpBase.
@@ -1513,7 +1736,8 @@ Local Transparent Val.add.
   TranslOpBase.
   destruct (eval_shiftl s (rs x0) a); auto. simpl. rewrite Int64.or_zero_l; auto.
 - (* shrx *)
-  exploit (exec_shrx64 x x0 n); eauto with asmgen. intros (rs' & A & B & C).
+  exploit (exec_shrx64 x x0 n); eauto with asmgen.
+  apply (ireg_of_not_RA'' res); eassumption. intros (rs' & A & B & C & D ).
   econstructor; split. eexact A. split. rewrite B; auto. auto.
 - (* zero-ext-l *)
   TranslOpBase.
@@ -1534,35 +1758,37 @@ Local Transparent Val.add.
   TranslOpBase.
   destruct (rs x0); simpl; auto. rewrite ! a64_range; simpl. rewrite <- Int64.sign_ext_shr'_min; auto using a64_range.
 - (* condition *)
-  exploit (transl_cond_correct cond args); eauto. intros (rs' & A & B & C).
+  exploit (transl_cond_correct cond args); eauto. intros (rs' & A & B & C & D).
   econstructor; split.
   eapply exec_straight_trans. eexact A. apply exec_straight_one. simpl; eauto. auto.
   split. Simpl. destruct (eval_condition cond (map rs (map preg_of args)) m) as [b|]; simpl in *.
   rewrite (B b) by auto. auto. 
   auto.
-  intros; Simpl.
+  split; intros; Simpl.
 - (* select *)
   destruct (preg_of res) eqn:RES; monadInv TR.
   + (* integer *)
     generalize (ireg_of_eq _ _ EQ) (ireg_of_eq _ _ EQ1); intros E1 E2; rewrite E1, E2.
-    exploit (transl_cond_correct cond args); eauto. intros (rs' & A & B & C).
+    exploit (transl_cond_correct cond args); eauto. intros (rs' & A & B & C & D).
     econstructor; split.
     eapply exec_straight_trans. eexact A. apply exec_straight_one. simpl; eauto. auto.
     split. Simpl. destruct (eval_condition cond (map rs (map preg_of args)) m) as [b|]; simpl in *.
     rewrite (B b) by auto. rewrite !C. apply Val.lessdef_normalize.
     rewrite <- E2; auto with asmgen. rewrite <- E1; auto with asmgen.
     auto.
-    intros; Simpl.
+    split; intros; Simpl.
+    rewrite <- D.
+    eapply RA_not_written2; eassumption.
   + (* FP *)
     generalize (freg_of_eq _ _ EQ) (freg_of_eq _ _ EQ1); intros E1 E2; rewrite E1, E2.
-    exploit (transl_cond_correct cond args); eauto. intros (rs' & A & B & C).
+    exploit (transl_cond_correct cond args); eauto. intros (rs' & A & B & C & D).
     econstructor; split.
     eapply exec_straight_trans. eexact A. apply exec_straight_one. simpl; eauto. auto.
     split. Simpl. destruct (eval_condition cond (map rs (map preg_of args)) m) as [b|]; simpl in *.
     rewrite (B b) by auto. rewrite !C. apply Val.lessdef_normalize.
     rewrite <- E2; auto with asmgen. rewrite <- E1; auto with asmgen.
     auto.
-    intros; Simpl.
+    split; intros; Simpl.
 Qed.
 
 (** Translation of addressing modes, loads, stores *)
@@ -1574,7 +1800,8 @@ Lemma transl_addressing_correct:
   exists ad rs',
      exec_straight_opt ge fn c rs m (insn ad :: k) rs' m
   /\ Asm.eval_addressing ge ad rs' = Vptr b o
-  /\ forall r, data_preg r = true -> rs' r = rs r.
+  /\ (forall r, data_preg r = true -> rs' r = rs r)
+  /\ rs' # RA = rs # RA.
 Proof.
   intros until o; intros TR EV.
   unfold transl_addressing in TR; destruct addr; ArgsInv; SimplEval EV.
@@ -1582,10 +1809,10 @@ Proof.
   destruct (offset_representable sz ofs); inv EQ0.
 + econstructor; econstructor; split. apply exec_straight_opt_refl.
   auto.
-+ exploit (exec_loadimm64 X16 ofs). intros (rs' & A & B & C).
++ exploit (exec_loadimm64 X16 ofs). congruence. intros (rs' & A & B & C).
   econstructor; exists rs'; split. apply exec_straight_opt_intro; eexact A.
   split. simpl. rewrite B, C by eauto with asmgen. auto.
-  eauto with asmgen.
+  split; eauto with asmgen.
 - (* Aindexed2 *)
   econstructor; econstructor; split. apply exec_straight_opt_refl.
   auto.
@@ -1601,33 +1828,38 @@ Proof.
 + econstructor; econstructor; split.
   apply exec_straight_opt_intro. apply exec_straight_one. simpl; eauto. auto.
   split. simpl. Simpl. rewrite H0. simpl. rewrite Ptrofs.add_zero. auto.
-  intros; Simpl.
+  split; intros; Simpl.
 - (* Aindexed2ext *)
   destruct (Int.eq a Int.zero || Int.eq (Int.shl Int.one a) (Int.repr sz)); inv EQ2.
 + econstructor; econstructor; split. apply exec_straight_opt_refl.
   split; auto. destruct x; auto.
 + exploit (exec_arith_extended Val.addl Paddext (Padd X)); auto.
   instantiate (1 := x0). eauto with asmgen.
-  intros (rs' & A & B & C).
+  instantiate (1 := X16). simpl. congruence.
+  intros (rs' & A & B & C & D).
   econstructor; exists rs'; split.
   apply exec_straight_opt_intro. eexact A. 
   split. simpl. rewrite B. rewrite Val.addl_assoc. f_equal.
   unfold Op.eval_extend; destruct x, (rs x1); simpl; auto; rewrite ! a64_range;
   simpl; rewrite Int64.add_zero; auto.
-  intros. apply C; eauto with asmgen.
+  split; intros.
+  apply C; eauto with asmgen.
+  trivial.
 - (* Aglobal *)
   destruct (Ptrofs.eq (Ptrofs.modu ofs (Ptrofs.repr sz)) Ptrofs.zero && symbol_is_aligned id sz); inv TR.
 + econstructor; econstructor; split.
   apply exec_straight_opt_intro. apply exec_straight_one. simpl; eauto. auto.
   split. simpl. Simpl. rewrite symbol_high_low. simpl in EV. congruence.
-  intros; Simpl.
-+ exploit (exec_loadsymbol X16 id ofs). auto. intros (rs' & A & B & C).
+  split; intros; Simpl.
++ exploit (exec_loadsymbol X16 id ofs). auto.
+  simpl. congruence.
+  intros (rs' & A & B & C & D).
   econstructor; exists rs'; split.
   apply exec_straight_opt_intro. eexact A.
   split. simpl. 
   rewrite B. rewrite <- Genv.shift_symbol_address_64, Ptrofs.add_zero by auto. 
   simpl in EV. congruence. 
-  auto with asmgen.
+  split; auto with asmgen.
 - (* Ainstrack *)
   assert (E: Val.addl (rs SP) (Vlong (Ptrofs.to_int64 ofs)) = Vptr b o).
   { simpl in EV. inv EV. destruct (rs SP); simpl in H1; inv H1. simpl. 
@@ -1635,7 +1867,9 @@ Proof.
   destruct (offset_representable sz (Ptrofs.to_int64 ofs)); inv TR.
 + econstructor; econstructor; split. apply exec_straight_opt_refl.
   auto.
-+ exploit (exec_loadimm64 X16 (Ptrofs.to_int64 ofs)). intros (rs' & A & B & C).
++ exploit (exec_loadimm64 X16 (Ptrofs.to_int64 ofs)).
+  simpl. congruence.
+  intros (rs' & A & B & C).
   econstructor; exists rs'; split.
   apply exec_straight_opt_intro. eexact A.
   split. simpl. rewrite B, C by eauto with asmgen. auto.
@@ -1650,7 +1884,8 @@ Lemma transl_load_correct:
   exists rs',
      exec_straight ge fn c rs m k rs' m
   /\ rs'#(preg_of dst) = v
-  /\ forall r, data_preg r = true -> r <> preg_of dst -> rs' r = rs r.
+  /\ (forall r, data_preg r = true -> r <> preg_of dst -> rs' r = rs r)
+  /\ rs' # RA = rs # RA.
 Proof.
   intros. destruct vaddr; try discriminate. 
   assert (A: exists sz insn,
@@ -1663,14 +1898,17 @@ Proof.
     do 2 econstructor; (split; [eassumption|auto]).
   }
   destruct A as (sz & insn & B & C).
-  exploit transl_addressing_correct. eexact B. eexact H0. intros (ad & rs' & P & Q & R).
+  exploit transl_addressing_correct. eexact B. eexact H0. intros (ad & rs' & P & Q & R & S).
   assert (X: exec_load ge chunk (fun v => v) ad (preg_of dst) rs' m =
              Next (nextinstr (rs'#(preg_of dst) <- v)) m).
   { unfold exec_load. rewrite Q, H1. auto. }
   econstructor; split.
   eapply exec_straight_opt_right. eexact P.
   apply exec_straight_one. rewrite C, X; eauto. Simpl. 
-  split. Simpl. intros; Simpl.
+  split. Simpl.
+  split; intros; Simpl.
+  rewrite <- S.
+  apply RA_not_written.
 Qed.
 
 Lemma transl_store_correct:
@@ -1680,7 +1918,8 @@ Lemma transl_store_correct:
   Mem.storev chunk m vaddr rs#(preg_of src) = Some m' ->
   exists rs',
      exec_straight ge fn c rs m k rs' m'
-  /\ forall r, data_preg r = true -> rs' r = rs r.
+     /\ (forall r, data_preg r = true -> rs' r = rs r)
+     /\ rs' # RA = rs # RA.
 Proof.
   intros. destruct vaddr; try discriminate. 
   set (chunk' := match chunk with Mint8signed => Mint8unsigned
@@ -1696,7 +1935,7 @@ Proof.
     do 2 econstructor; (split; [eassumption|auto]).
   }
   destruct A as (sz & insn & B & C).
-  exploit transl_addressing_correct. eexact B. eexact H0. intros (ad & rs' & P & Q & R).
+  exploit transl_addressing_correct. eexact B. eexact H0. intros (ad & rs' & P & Q & R & S).                                                  
   assert (X: Mem.storev chunk' m (Vptr b i) rs#(preg_of src) = Some m').
   { rewrite <- H1. unfold chunk'. destruct chunk; auto; simpl; symmetry.
     apply Mem.store_signed_unsigned_8.
@@ -1707,7 +1946,7 @@ Proof.
   econstructor; split.
   eapply exec_straight_opt_right. eexact P.
   apply exec_straight_one. rewrite C, Y; eauto. Simpl. 
-  intros; Simpl.
+  split; intros; Simpl.
 Qed.
 
 (** Translation of indexed memory accesses *)
@@ -1725,7 +1964,9 @@ Proof.
   { destruct (rs base); try discriminate. simpl in *. rewrite Ptrofs.of_int64_to_int64 by auto. auto. }
   destruct offset_representable.
 - econstructor; econstructor; split. apply exec_straight_opt_refl. auto. 
-- exploit (exec_loadimm64 X16); eauto. intros (rs' & A & B & C).
+- exploit (exec_loadimm64 X16); eauto.
+  simpl. congruence.
+  intros (rs' & A & B & C).
   econstructor; econstructor; split. apply exec_straight_opt_intro; eexact A.
   split. simpl. rewrite B, C by eauto with asmgen. auto. auto.
 Qed.
@@ -1736,7 +1977,7 @@ Lemma loadptr_correct: forall (base: iregsp) ofs dst k m v (rs: regset),
   exists rs',
      exec_straight ge fn (loadptr base ofs dst k) rs m k rs' m
   /\ rs'#dst = v
-  /\ forall r, r <> PC -> r <> X16 -> r <> dst -> rs' r = rs r.
+  /\ (forall r, r <> PC -> r <> X16 -> r <> dst -> rs' r = rs r).
 Proof.
   intros. 
   destruct (Val.offset_ptr rs#base ofs) eqn:V; try discriminate.
@@ -1744,7 +1985,8 @@ Proof.
   econstructor; split.
   eapply exec_straight_opt_right. eexact A.
   apply exec_straight_one. simpl. unfold exec_load. rewrite B, H. eauto. auto.
-  split. Simpl. intros; Simpl.
+  split. Simpl.
+  intros; Simpl.
 Qed.
 
 Lemma storeptr_correct: forall (base: iregsp) ofs (src: ireg) k m m' (rs: regset),
@@ -1753,7 +1995,8 @@ Lemma storeptr_correct: forall (base: iregsp) ofs (src: ireg) k m m' (rs: regset
   src <> X16 ->
   exists rs',
      exec_straight ge fn (storeptr src base ofs k) rs m k rs' m'
-  /\ forall r, r <> PC -> r <> X16 -> rs' r = rs r.
+  /\ (forall r, r <> PC -> r <> X16 -> rs' r = rs r)
+  /\ rs' RA = rs RA.
 Proof.
   intros. 
   destruct (Val.offset_ptr rs#base ofs) eqn:V; try discriminate.
@@ -1761,7 +2004,7 @@ Proof.
   econstructor; split.
   eapply exec_straight_opt_right. eexact A.
   apply exec_straight_one. simpl. unfold exec_store. rewrite B, C, H by eauto with asmgen. eauto. auto.
-  intros; Simpl.
+  split; intros; Simpl.
 Qed.
 
 Lemma loadind_correct: forall (base: iregsp) ofs ty dst k c (rs: regset) m v,
@@ -1771,7 +2014,8 @@ Lemma loadind_correct: forall (base: iregsp) ofs ty dst k c (rs: regset) m v,
   exists rs',
      exec_straight ge fn c rs m k rs' m
   /\ rs'#(preg_of dst) = v
-  /\ forall r, data_preg r = true -> r <> preg_of dst -> rs' r = rs r.
+  /\ (forall r, data_preg r = true -> r <> preg_of dst -> rs' r = rs r)
+  /\ rs' RA = rs RA.
 Proof.
   intros. 
   destruct (Val.offset_ptr rs#base ofs) eqn:V; try discriminate.
@@ -1787,7 +2031,10 @@ Proof.
   econstructor; split.
   eapply exec_straight_opt_right. eexact A.
   apply exec_straight_one. rewrite SEM. unfold exec_load. rewrite B, H0. eauto. Simpl.
-  split. Simpl. intros; Simpl.
+  split. Simpl.
+  split. intros; Simpl.
+  Simpl. rewrite RA_not_written.
+  apply C; congruence.
 Qed.
 
 Lemma storeind_correct: forall (base: iregsp) ofs ty src k c (rs: regset) m m',
@@ -1796,7 +2043,8 @@ Lemma storeind_correct: forall (base: iregsp) ofs ty src k c (rs: regset) m m',
   preg_of_iregsp base <> IR X16 ->
   exists rs',
      exec_straight ge fn c rs m k rs' m'
-  /\ forall r, data_preg r = true -> rs' r = rs r.
+     /\ (forall r, data_preg r = true -> rs' r = rs r)
+     /\ rs' RA = rs RA.
 Proof.
   intros. 
   destruct (Val.offset_ptr rs#base ofs) eqn:V; try discriminate.
@@ -1814,13 +2062,15 @@ Proof.
   apply exec_straight_one. rewrite SEM.
   unfold exec_store. rewrite B, C, H0 by eauto with asmgen. eauto.
   Simpl.
-  intros; Simpl.
+  split. intros; Simpl.
+  Simpl. 
 Qed.
 
 Lemma make_epilogue_correct:
   forall ge0 f m stk soff cs m' ms rs k tm,
+  (is_leaf_function f = true -> rs # (IR RA) = parent_ra cs) ->
   load_stack m (Vptr stk soff) Tptr f.(fn_link_ofs) = Some (parent_sp cs) ->
-  load_stack m (Vptr stk soff) Tptr f.(fn_retaddr_ofs) = Some (parent_ra cs) ->
+  ((* FIXME is_leaf_function f = false -> *) load_stack m (Vptr stk soff) Tptr f.(fn_retaddr_ofs) = Some (parent_ra cs)) ->
   Mem.free m stk 0 f.(fn_stacksize) = Some m' ->
   agree ms (Vptr stk soff) rs ->
   Mem.extends m tm ->
@@ -1831,18 +2081,46 @@ Lemma make_epilogue_correct:
   /\ Mem.extends m' tm'
   /\ rs'#RA = parent_ra cs
   /\ rs'#SP = parent_sp cs
-  /\ (forall r, r <> PC -> r <> SP -> r <> X30 -> r <> X16 -> rs'#r = rs#r).
+  /\ (forall r, r <> PC -> r <> SP -> r <> RA -> r <> X16 -> rs'#r = rs#r).
 Proof.
-  intros until tm; intros LP LRA FREE AG MEXT MCS.
+  intros until tm; intros LEAF_RA LP LRA FREE AG MEXT MCS.
+
+  (* FIXME
+  Cannot be used at this point
+  destruct (is_leaf_function f) eqn:IS_LEAF.
+  {
+  exploit Mem.loadv_extends. eauto. eexact LP. auto. simpl. intros (parent' & LP' & LDP').
+  exploit lessdef_parent_sp; eauto. intros EQ; subst parent'; clear LDP'.
+  exploit Mem.free_parallel_extends; eauto. intros (tm' & FREE' & MEXT').
+  unfold make_epilogue.
+  rewrite IS_LEAF.
+
+  econstructor; econstructor; split.
+  apply exec_straight_one. simpl.
+  rewrite <- (sp_val _ _ _ AG). simpl; rewrite LP'. 
+    rewrite FREE'. eauto. auto. 
+  split. apply agree_nextinstr. apply agree_set_other; auto.
+  apply agree_change_sp with (Vptr stk soff).
+  apply agree_exten with rs; auto.
+  eapply parent_sp_def; eauto.
+  split. auto.
+  split. Simpl.
+  split. Simpl.
+  intros. Simpl.
+  }
+  lapply LRA. 2: reflexivity.
+  clear LRA. intro LRA. *)
   exploit Mem.loadv_extends. eauto. eexact LP. auto. simpl. intros (parent' & LP' & LDP').
   exploit Mem.loadv_extends. eauto. eexact LRA. auto. simpl. intros (ra' & LRA' & LDRA').
   exploit lessdef_parent_sp; eauto. intros EQ; subst parent'; clear LDP'.
   exploit lessdef_parent_ra; eauto. intros EQ; subst ra'; clear LDRA'.
   exploit Mem.free_parallel_extends; eauto. intros (tm' & FREE' & MEXT').
-  unfold make_epilogue. 
+  unfold make_epilogue.
+  (* FIXME rewrite IS_LEAF. *)
   exploit (loadptr_correct XSP (fn_retaddr_ofs f)).
     instantiate (2 := rs). simpl. rewrite <- (sp_val _ _ _ AG). simpl. eexact LRA'. simpl; congruence.
   intros (rs1 & A1 & B1 & C1).
+
   econstructor; econstructor; split.
   eapply exec_straight_trans. eexact A1. apply exec_straight_one. simpl. 
     simpl; rewrite (C1 SP) by auto with asmgen. rewrite <- (sp_val _ _ _ AG). simpl; rewrite LP'. 
@@ -1857,4 +2135,4 @@ Proof.
   intros. Simpl.
 Qed.
 
-End CONSTRUCTORS.
-\ No newline at end of file
+End CONSTRUCTORS.
diff --git a/aarch64/CSE2deps.v b/aarch64/CSE2deps.v
new file mode 100644
index 00000000..90b514a2
--- /dev/null
+++ b/aarch64/CSE2deps.v
@@ -0,0 +1,20 @@
+Require Import BoolEqual Coqlib.
+Require Import AST Integers Floats.
+Require Import Values Memory Globalenvs Events.
+Require Import Op.
+
+
+Definition can_swap_accesses_ofs ofsr chunkr ofsw chunkw :=
+     (0 <=? ofsw) && (ofsw <=? (Ptrofs.modulus - largest_size_chunk))
+  && (0 <=? ofsr) && (ofsr <=? (Ptrofs.modulus - largest_size_chunk))
+  && ((ofsw + size_chunk chunkw <=? ofsr) ||
+      (ofsr + size_chunk chunkr <=? ofsw)).
+
+Definition may_overlap chunk addr args chunk' addr' args' :=
+  match addr, addr', args, args' with
+  | (Aindexed ofs), (Aindexed ofs'),
+    (base :: nil), (base' :: nil) =>
+    if peq base base'
+    then negb (can_swap_accesses_ofs (Int64.unsigned ofs') chunk' (Int64.unsigned ofs) chunk)
+    else true  | _, _, _, _ => true
+  end.
diff --git a/aarch64/CSE2depsproof.v b/aarch64/CSE2depsproof.v
new file mode 100644
index 00000000..4aac23af
--- /dev/null
+++ b/aarch64/CSE2depsproof.v
@@ -0,0 +1,128 @@
+Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
+Require Import AST Linking.
+Require Import Memory Registers Op RTL Maps.
+
+Require Import Globalenvs Values.
+Require Import Linking Values Memory Globalenvs Events Smallstep.
+Require Import Registers Op RTL.
+Require Import CSE2 CSE2deps.
+Require Import Lia.
+
+Lemma ptrofs_size :
+  Ptrofs.wordsize = 64%nat.
+Proof.
+  unfold Ptrofs.wordsize.
+  unfold Wordsize_Ptrofs.wordsize.
+  trivial.
+Qed.
+
+Lemma ptrofs_modulus :
+  Ptrofs.modulus = 18446744073709551616.
+Proof.
+  reflexivity.
+Qed.
+
+Section SOUNDNESS.
+  Variable F V : Type.
+  Variable genv: Genv.t F V.
+  Variable sp : val.
+
+Section MEMORY_WRITE.
+  Variable m m2 : mem.
+  Variable chunkw chunkr : memory_chunk.
+  Variable base : val.
+  
+  Variable addrw addrr valw : val.
+  Hypothesis STORE : Mem.storev chunkw m addrw valw = Some m2.
+
+  Section INDEXED_AWAY.
+  Variable ofsw ofsr : int64.
+  Hypothesis ADDRW : eval_addressing genv sp
+                       (Aindexed ofsw) (base :: nil) = Some addrw. 
+  Hypothesis ADDRR : eval_addressing genv sp
+                       (Aindexed ofsr) (base :: nil) = Some addrr.
+
+  Lemma load_store_away1 :
+    forall RANGEW : 0 <= Int64.unsigned ofsw <= Ptrofs.modulus - largest_size_chunk,
+    forall RANGER : 0 <= Int64.unsigned ofsr <= Ptrofs.modulus - largest_size_chunk,
+    forall SWAPPABLE :    Int64.unsigned ofsw + size_chunk chunkw <= Int64.unsigned ofsr
+                       \/ Int64.unsigned ofsr + size_chunk chunkr <= Int64.unsigned ofsw,
+    Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+  Proof.
+    intros.
+    
+    pose proof (max_size_chunk chunkr) as size_chunkr_bounded.
+    pose proof (max_size_chunk chunkw) as size_chunkw_bounded.
+    unfold largest_size_chunk in *.
+
+    rewrite ptrofs_modulus in *.
+    simpl in *.
+    inv ADDRR.
+    inv ADDRW.
+
+    destruct base; try discriminate.
+    eapply Mem.load_store_other with (chunk := chunkw) (v := valw) (b := b).
+    exact STORE.
+    right.
+
+    all: unfold Ptrofs.of_int64.
+
+    all: try (destruct (Ptrofs.unsigned_add_either i (Ptrofs.repr (Int64.unsigned ofsr))) as [OFSR | OFSR];
+              rewrite OFSR).
+    all: try (destruct (Ptrofs.unsigned_add_either i (Ptrofs.repr (Int64.unsigned ofsw))) as [OFSW | OFSW];
+              rewrite OFSW).
+    all: repeat rewrite Ptrofs.unsigned_repr by (change Ptrofs.max_unsigned with 18446744073709551615; lia).
+
+    all: try rewrite ptrofs_modulus in *.
+
+    all: intuition lia.
+  Qed.
+  
+  Theorem load_store_away :
+    can_swap_accesses_ofs (Int64.unsigned ofsr) chunkr (Int64.unsigned ofsw) chunkw = true ->
+    Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+  Proof.
+    intro SWAP.
+    unfold can_swap_accesses_ofs in SWAP.
+    repeat rewrite andb_true_iff in SWAP.
+    repeat rewrite orb_true_iff in SWAP.
+    repeat rewrite Z.leb_le in SWAP.
+    apply load_store_away1.
+    all: tauto.
+  Qed.
+  End INDEXED_AWAY.
+End MEMORY_WRITE.
+End SOUNDNESS.
+
+
+Section SOUNDNESS.
+  Variable F V : Type.
+  Variable genv: Genv.t F V.
+  Variable sp : val.
+
+Lemma may_overlap_sound:
+  forall m m' : mem,
+  forall chunk addr args chunk' addr' args' v a a' rs,
+    (eval_addressing genv sp addr (rs ## args)) = Some a ->
+    (eval_addressing genv sp addr' (rs ## args')) = Some a' ->
+    (may_overlap chunk addr args chunk' addr' args') = false ->
+    (Mem.storev chunk m a v) = Some m' ->
+    (Mem.loadv chunk' m' a') = (Mem.loadv chunk' m  a').
+Proof.
+  intros until rs.
+  intros ADDR ADDR' OVERLAP STORE.
+  destruct addr; destruct addr'; try discriminate.
+  { (* Aindexed / Aindexed *)
+  destruct args as [ | base [ | ]]. 1,3: discriminate.
+  destruct args' as [ | base' [ | ]]. 1,3: discriminate.
+  simpl in OVERLAP.
+  destruct (peq base base'). 2: discriminate.
+  subst base'.
+  destruct (can_swap_accesses_ofs (Int64.unsigned ofs0) chunk' (Int64.unsigned ofs) chunk) eqn:SWAP.
+  2: discriminate.
+  simpl in *.
+  eapply load_store_away with (F:=F) (V:=V) (genv:=genv) (sp:=sp); eassumption.
+  }
+Qed.
+
+End SOUNDNESS.
diff --git a/aarch64/Conventions1.v b/aarch64/Conventions1.v
index 14cb199f..efda835d 100644
--- a/aarch64/Conventions1.v
+++ b/aarch64/Conventions1.v
@@ -190,27 +190,6 @@ Fixpoint loc_arguments_rec
 Definition loc_arguments (s: signature) : list (rpair loc) :=
   loc_arguments_rec s.(sig_args) 0 0 0.
 
-(** [size_arguments s] returns the number of [Outgoing] slots used
-  to call a function with signature [s]. *)
-
-Fixpoint size_arguments_rec (tyl: list typ) (ir fr ofs: Z) {struct tyl} : Z :=
-  match tyl with
-  | nil => ofs
-  | (Tint | Tlong | Tany32 | Tany64) :: tys =>
-      match list_nth_z int_param_regs ir with
-      | None => size_arguments_rec tys ir fr (ofs + 2)
-      | Some ireg => size_arguments_rec tys (ir + 1) fr ofs
-      end
-  | (Tfloat | Tsingle) :: tys =>
-      match list_nth_z float_param_regs fr with
-      | None => size_arguments_rec tys ir fr (ofs + 2)
-      | Some freg => size_arguments_rec tys ir (fr + 1) ofs
-      end
-  end.
-
-Definition size_arguments (s: signature) : Z :=
-  size_arguments_rec s.(sig_args) 0 0 0.
-
 (** Argument locations are either caller-save registers or [Outgoing]
   stack slots at nonnegative offsets. *)
 
@@ -285,92 +264,6 @@ Qed.
 
 Hint Resolve loc_arguments_acceptable: locs.
 
-(** The offsets of [Outgoing] arguments are below [size_arguments s]. *)
-
-Remark size_arguments_rec_above:
-  forall tyl ir fr ofs0,
-  ofs0 <= size_arguments_rec tyl ir fr ofs0.
-Proof.
-  induction tyl; simpl; intros.
-  omega.
-  assert (A: ofs0 <=
-    match list_nth_z int_param_regs ir with
-    | Some _ => size_arguments_rec tyl (ir + 1) fr ofs0
-    | None => size_arguments_rec tyl ir fr (ofs0 + 2)
-    end).
-  { destruct (list_nth_z int_param_regs ir); eauto.
-    apply Z.le_trans with (ofs0 + 2); auto. omega. }
-  assert (B: ofs0 <=
-    match list_nth_z float_param_regs fr with
-    | Some _ => size_arguments_rec tyl ir (fr + 1) ofs0
-    | None => size_arguments_rec tyl ir fr (ofs0 + 2)
-    end).
-  { destruct (list_nth_z float_param_regs fr); eauto.
-    apply Z.le_trans with (ofs0 + 2); auto. omega. }
-  destruct a; auto.
-Qed.
-
-Lemma size_arguments_above:
-  forall s, size_arguments s >= 0.
-Proof.
-  intros; unfold size_arguments. apply Z.le_ge. apply size_arguments_rec_above.
-Qed.
-
-Lemma loc_arguments_rec_bounded:
-  forall ofs ty tyl ir fr ofs0,
-  In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments_rec tyl ir fr ofs0)) ->
-  ofs + typesize ty <= size_arguments_rec tyl ir fr ofs0.
-Proof.
-  induction tyl; simpl; intros.
-- contradiction.
-- assert (T: forall ty0, typesize ty0 <= 2).
-  { destruct ty0; simpl; omega. }
-  assert (A: forall ty0,
-             In (S Outgoing ofs ty) (regs_of_rpairs
-              match list_nth_z int_param_regs ir with
-              | Some ireg =>
-                  One (R ireg) :: loc_arguments_rec tyl (ir + 1) fr ofs0
-              | None => One (S Outgoing ofs0 ty0) :: loc_arguments_rec tyl ir fr (ofs0 + 2)
-              end) ->
-             ofs + typesize ty <=
-             match list_nth_z int_param_regs ir with
-             | Some _ => size_arguments_rec tyl (ir + 1) fr ofs0
-             | None => size_arguments_rec tyl ir fr (ofs0 + 2)
-             end).
-  { intros. destruct (list_nth_z int_param_regs ir); simpl in H0; destruct H0.
-  - discriminate.
-  - eapply IHtyl; eauto.
-  - inv H0. apply Z.le_trans with (ofs + 2). specialize (T ty). omega. apply size_arguments_rec_above.
-  - eapply IHtyl; eauto. }
-  assert (B: forall ty0,
-             In (S Outgoing ofs ty) (regs_of_rpairs
-              match list_nth_z float_param_regs fr with
-              | Some ireg =>
-                  One (R ireg) :: loc_arguments_rec tyl ir (fr + 1) ofs0
-              | None => One (S Outgoing ofs0 ty0) :: loc_arguments_rec tyl ir fr (ofs0 + 2)
-              end) ->
-             ofs + typesize ty <=
-             match list_nth_z float_param_regs fr with
-             | Some _ => size_arguments_rec tyl ir (fr + 1) ofs0
-             | None => size_arguments_rec tyl ir fr (ofs0 + 2)
-             end).
-  { intros. destruct (list_nth_z float_param_regs fr); simpl in H0; destruct H0.
-  - discriminate.
-  - eapply IHtyl; eauto.
-  - inv H0. apply Z.le_trans with (ofs + 2). specialize (T ty). omega. apply size_arguments_rec_above.
-  - eapply IHtyl; eauto. }
-  destruct a; eauto.
-Qed.
-
-Lemma loc_arguments_bounded:
-  forall (s: signature) (ofs: Z) (ty: typ),
-  In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments s)) ->
-  ofs + typesize ty <= size_arguments s.
-Proof.
-  unfold loc_arguments, size_arguments; intros.
-  eauto using loc_arguments_rec_bounded.
-Qed.
-
 Lemma loc_arguments_main:
   loc_arguments signature_main = nil.
 Proof.
diff --git a/aarch64/DuplicateOpcodeHeuristic.ml b/aarch64/DuplicateOpcodeHeuristic.ml
index 85505245..5fc2156c 100644
--- a/aarch64/DuplicateOpcodeHeuristic.ml
+++ b/aarch64/DuplicateOpcodeHeuristic.ml
@@ -1,3 +1,27 @@
-exception HeuristicSucceeded
-        
-let opcode_heuristic code cond ifso ifnot preferred = ()
+open Op
+open Integers
+
+let opcode_heuristic code cond ifso ifnot is_loop_header =
+  match cond with
+  | Ccompimm (c, n) | Ccompuimm (c, n) -> if n == Integers.Int.zero then (match c with
+      | Clt | Cle -> Some false
+      | Cgt | Cge -> Some true
+      | _ -> None
+      ) else None
+  | Ccomplimm (c, n) | Ccompluimm (c, n) -> if n == Integers.Int64.zero then (match c with
+      | Clt | Cle -> Some false
+      | Cgt | Cge -> Some true
+      | _ -> None
+      ) else None
+  | Ccompf c | Ccompfs c -> (match c with
+      | Ceq -> Some false
+      | Cne -> Some true
+      | _ -> None
+      )
+  | Cnotcompf c | Cnotcompfs c -> (match c with
+      | Ceq -> Some true
+      | Cne -> Some false
+      | _ -> None
+      )
+  | _ -> None
+
diff --git a/arm/CSE2deps.v b/arm/CSE2deps.v
new file mode 100644
index 00000000..9db51bbb
--- /dev/null
+++ b/arm/CSE2deps.v
@@ -0,0 +1,20 @@
+Require Import BoolEqual Coqlib.
+Require Import AST Integers Floats.
+Require Import Values Memory Globalenvs Events.
+Require Import Op.
+
+
+Definition can_swap_accesses_ofs ofsr chunkr ofsw chunkw :=
+     (0 <=? ofsw) && (ofsw <=? (Ptrofs.modulus - largest_size_chunk))
+  && (0 <=? ofsr) && (ofsr <=? (Ptrofs.modulus - largest_size_chunk))
+  && ((ofsw + size_chunk chunkw <=? ofsr) ||
+      (ofsr + size_chunk chunkr <=? ofsw)).
+
+Definition may_overlap chunk addr args chunk' addr' args' :=
+  match addr, addr', args, args' with
+  | (Aindexed ofs), (Aindexed ofs'),
+    (base :: nil), (base' :: nil) =>
+    if peq base base'
+    then negb (can_swap_accesses_ofs (Int.unsigned ofs') chunk' (Int.unsigned ofs) chunk)
+    else true  | _, _, _, _ => true
+  end.
diff --git a/arm/CSE2depsproof.v b/arm/CSE2depsproof.v
new file mode 100644
index 00000000..61fe5980
--- /dev/null
+++ b/arm/CSE2depsproof.v
@@ -0,0 +1,129 @@
+Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
+Require Import AST Linking.
+Require Import Memory Registers Op RTL Maps.
+
+Require Import Globalenvs Values.
+Require Import Linking Values Memory Globalenvs Events Smallstep.
+Require Import Registers Op RTL.
+Require Import CSE2 CSE2deps.
+Require Import Lia.
+
+Lemma ptrofs_size :
+  Ptrofs.wordsize = 32%nat.
+Proof.
+  unfold Ptrofs.wordsize.
+  unfold Wordsize_Ptrofs.wordsize.
+  trivial.
+Qed.
+
+Lemma ptrofs_modulus :
+  Ptrofs.modulus = 4294967296.
+Proof.
+  unfold Ptrofs.modulus.
+  rewrite ptrofs_size.
+  destruct Archi.ptr64; reflexivity.
+Qed.
+
+Section SOUNDNESS.
+  Variable F V : Type.
+  Variable genv: Genv.t F V.
+  Variable sp : val.
+
+Section MEMORY_WRITE.
+  Variable m m2 : mem.
+  Variable chunkw chunkr : memory_chunk.
+  Variable base : val.
+  
+  Variable addrw addrr valw : val.
+  Hypothesis STORE : Mem.storev chunkw m addrw valw = Some m2.
+
+  Section INDEXED_AWAY.
+  Variable ofsw ofsr : int.
+  Hypothesis ADDRW : eval_addressing genv sp
+                       (Aindexed ofsw) (base :: nil) = Some addrw. 
+  Hypothesis ADDRR : eval_addressing genv sp
+                       (Aindexed ofsr) (base :: nil) = Some addrr.
+
+  Lemma load_store_away1 :
+    forall RANGEW : 0 <= Int.unsigned ofsw <= Ptrofs.modulus - largest_size_chunk,
+    forall RANGER : 0 <= Int.unsigned ofsr <= Ptrofs.modulus - largest_size_chunk,
+    forall SWAPPABLE :    Int.unsigned ofsw + size_chunk chunkw <= Int.unsigned ofsr
+                       \/ Int.unsigned ofsr + size_chunk chunkr <= Int.unsigned ofsw,
+    Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+  Proof.
+    intros.
+    
+    pose proof (max_size_chunk chunkr) as size_chunkr_bounded.
+    pose proof (max_size_chunk chunkw) as size_chunkw_bounded.
+    unfold largest_size_chunk in *.
+
+    rewrite ptrofs_modulus in *.
+    simpl in *.
+    inv ADDRR.
+    inv ADDRW.
+    destruct base; try discriminate.
+    eapply Mem.load_store_other with (chunk := chunkw) (v := valw) (b := b).
+    exact STORE.
+    right.
+
+    all: try (destruct (Ptrofs.unsigned_add_either i (Ptrofs.of_int ofsr)) as [OFSR | OFSR];
+              rewrite OFSR).
+    all: try (destruct (Ptrofs.unsigned_add_either i (Ptrofs.of_int ofsw)) as [OFSW | OFSW];
+              rewrite OFSW).
+    
+    all: try rewrite ptrofs_modulus in *.
+
+    all: unfold Ptrofs.of_int.
+   
+    all: repeat rewrite Ptrofs.unsigned_repr by (change Ptrofs.max_unsigned with 4294967295; lia).
+    all: intuition lia.
+  Qed.
+  
+  Theorem load_store_away :
+    can_swap_accesses_ofs (Int.unsigned ofsr) chunkr (Int.unsigned ofsw) chunkw = true ->
+    Mem.loadv chunkr m2 addrr =  Mem.loadv chunkr m addrr.
+  Proof.
+    intro SWAP.
+    unfold can_swap_accesses_ofs in SWAP.
+    repeat rewrite andb_true_iff in SWAP.
+    repeat rewrite orb_true_iff in SWAP.
+    repeat rewrite Z.leb_le in SWAP.
+    apply load_store_away1.
+    all: tauto.
+  Qed.
+  End INDEXED_AWAY.
+End MEMORY_WRITE.
+End SOUNDNESS.
+
+
+Section SOUNDNESS.
+  Variable F V : Type.
+  Variable genv: Genv.t F V.
+  Variable sp : val.
+
+Lemma may_overlap_sound:
+  forall m m' : mem,
+  forall chunk addr args chunk' addr' args' v a a' rs,
+    (eval_addressing genv sp addr (rs ## args)) = Some a ->
+    (eval_addressing genv sp addr' (rs ## args')) = Some a' ->
+    (may_overlap chunk addr args chunk' addr' args') = false ->
+    (Mem.storev chunk m a v) = Some m' ->
+    (Mem.loadv chunk' m'  a') = (Mem.loadv chunk' m a').
+Proof.
+  intros until rs.
+  intros ADDR ADDR' OVERLAP STORE.
+  destruct addr; destruct addr'; try discriminate.
+  { (* Aindexed / Aindexed *)
+  destruct args as [ | base [ | ]]. 1,3: discriminate.
+  destruct args' as [ | base' [ | ]]. 1,3: discriminate.
+  simpl in OVERLAP.
+  destruct (peq base base'). 2: discriminate.
+  subst base'.
+  destruct (can_swap_accesses_ofs (Int.unsigned i0) chunk' (Int.unsigned i) chunk) eqn:SWAP.
+  2: discriminate.
+  simpl in *.
+  eapply load_store_away with (F:=F) (V:=V) (genv:=genv) (sp:=sp); eassumption.
+  }
+Qed.
+
+End SOUNDNESS.
diff --git a/arm/Conventions1.v b/arm/Conventions1.v
index 7016c1ee..fe49a781 100644
--- a/arm/Conventions1.v
+++ b/arm/Conventions1.v
@@ -269,48 +269,6 @@ Definition loc_arguments (s: signature) : list (rpair loc) :=
       else loc_arguments_hf s.(sig_args) 0 0 0
   end.
 
-(** [size_arguments s] returns the number of [Outgoing] slots used
-  to call a function with signature [s]. *)
-
-Fixpoint size_arguments_hf (tyl: list typ) (ir fr ofs: Z) {struct tyl} : Z :=
-  match tyl with
-  | nil => ofs
-  | (Tint|Tany32) :: tys =>
-      if zlt ir 4
-      then size_arguments_hf tys (ir + 1) fr ofs
-      else size_arguments_hf tys ir fr (ofs + 1)
-  | (Tfloat|Tany64) :: tys =>
-      if zlt fr 8
-      then size_arguments_hf tys ir (fr + 1) ofs
-      else size_arguments_hf tys ir fr (align ofs 2 + 2)
-  | Tsingle :: tys =>
-      if zlt fr 8
-      then size_arguments_hf tys ir (fr + 1) ofs
-      else size_arguments_hf tys ir fr (ofs + 1)
-  | Tlong :: tys =>
-      let ir := align ir 2 in
-      if zlt ir 4
-      then size_arguments_hf tys (ir + 2) fr ofs
-      else size_arguments_hf tys ir fr (align ofs 2 + 2)
-  end.
-
-Fixpoint size_arguments_sf (tyl: list typ) (ofs: Z) {struct tyl} : Z :=
-  match tyl with
-  | nil => Z.max 0 ofs
-  | (Tint | Tsingle | Tany32) :: tys => size_arguments_sf tys (ofs + 1)
-  | (Tfloat | Tlong | Tany64) :: tys => size_arguments_sf tys (align ofs 2 + 2)
-  end.
-
-Definition size_arguments (s: signature) : Z :=
-  match Archi.abi with
-  | Archi.Softfloat =>
-      size_arguments_sf s.(sig_args) (-4)
-  | Archi.Hardfloat =>
-      if s.(sig_cc).(cc_vararg)
-      then size_arguments_sf s.(sig_args) (-4)
-      else size_arguments_hf s.(sig_args) 0 0 0
-  end.
-
 (** Argument locations are either non-temporary registers or [Outgoing]
   stack slots at nonnegative offsets. *)
 
@@ -471,170 +429,6 @@ Qed.
 
 Hint Resolve loc_arguments_acceptable: locs.
 
-(** The offsets of [Outgoing] arguments are below [size_arguments s]. *)
-
-Remark size_arguments_hf_above:
-  forall tyl ir fr ofs0,
-  ofs0 <= size_arguments_hf tyl ir fr ofs0.
-Proof.
-  induction tyl; simpl; intros.
-  omega.
-  destruct a.
-  destruct (zlt ir 4); eauto. apply Z.le_trans with (ofs0 + 1); auto; omega.
-  destruct (zlt fr 8); eauto.
-  apply Z.le_trans with (align ofs0 2). apply align_le; omega.
-  apply Z.le_trans with (align ofs0 2 + 2); auto; omega.
-  set (ir' := align ir 2).
-  destruct (zlt ir' 4); eauto.
-  apply Z.le_trans with (align ofs0 2). apply align_le; omega.
-  apply Z.le_trans with (align ofs0 2 + 2); auto; omega.
-  destruct (zlt fr 8); eauto.
-  apply Z.le_trans with (ofs0 + 1); eauto. omega.
-  destruct (zlt ir 4); eauto. apply Z.le_trans with (ofs0 + 1); auto; omega.
-  destruct (zlt fr 8); eauto.
-  apply Z.le_trans with (align ofs0 2). apply align_le; omega.
-  apply Z.le_trans with (align ofs0 2 + 2); auto; omega.
-Qed.
-
-Remark size_arguments_sf_above:
-  forall tyl ofs0,
-  Z.max 0 ofs0 <= size_arguments_sf tyl ofs0.
-Proof.
-  induction tyl; simpl; intros.
-  omega.
-  destruct a; (eapply Z.le_trans; [idtac|eauto]).
-  xomega.
-  assert (ofs0 <= align ofs0 2) by (apply align_le; omega). xomega.
-  assert (ofs0 <= align ofs0 2) by (apply align_le; omega). xomega.
-  xomega.
-  xomega.
-  assert (ofs0 <= align ofs0 2) by (apply align_le; omega). xomega.
-Qed.
-
-Lemma size_arguments_above:
-  forall s, size_arguments s >= 0.
-Proof.
-  intros; unfold size_arguments. apply Z.le_ge.
-  assert (0 <= size_arguments_sf (sig_args s) (-4)).
-  { change 0 with (Z.max 0 (-4)). apply size_arguments_sf_above. }
-  assert (0 <= size_arguments_hf (sig_args s) 0 0 0).
-  { apply size_arguments_hf_above. }
-  destruct Archi.abi; [ | destruct (cc_vararg (sig_cc s)) ]; auto.
-Qed.
-
-Lemma loc_arguments_hf_bounded:
-  forall ofs ty tyl ir fr ofs0,
-  In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments_hf tyl ir fr ofs0)) ->
-  ofs + typesize ty <= size_arguments_hf tyl ir fr ofs0.
-Proof.
-  induction tyl; simpl; intros.
-  elim H.
-  destruct a.
-- (* int *)
-  destruct (zlt ir 4); destruct H.
-  discriminate.
-  eauto.
-  inv H. apply size_arguments_hf_above.
-  eauto.
-- (* float *)
-  destruct (zlt fr 8); destruct H.
-  discriminate.
-  eauto.
-  inv H. apply size_arguments_hf_above.
-  eauto.
-- (* long *)
-  destruct (zlt (align ir 2) 4).
-  destruct H. discriminate. destruct H. discriminate. eauto.
-  destruct Archi.big_endian.
-  destruct H. inv H.
-  eapply Z.le_trans. 2: apply size_arguments_hf_above. simpl; omega.
-  destruct H. inv H.
-  rewrite <- Z.add_assoc. simpl. apply size_arguments_hf_above.
-  eauto.
-  destruct H. inv H.
-  rewrite <- Z.add_assoc. simpl. apply size_arguments_hf_above.
-  destruct H. inv H.
-  eapply Z.le_trans. 2: apply size_arguments_hf_above. simpl; omega.
-  eauto.
-- (* float *)
-  destruct (zlt fr 8); destruct H.
-  discriminate.
-  eauto.
-  inv H. apply size_arguments_hf_above.
-  eauto.
-- (* any32 *)
-  destruct (zlt ir 4); destruct H.
-  discriminate.
-  eauto.
-  inv H. apply size_arguments_hf_above.
-  eauto.
-- (* any64 *)
-  destruct (zlt fr 8); destruct H.
-  discriminate.
-  eauto.
-  inv H. apply size_arguments_hf_above.
-  eauto.
-Qed.
-
-Lemma loc_arguments_sf_bounded:
-  forall ofs ty tyl ofs0,
-  In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments_sf tyl ofs0)) ->
-  Z.max 0 (ofs + typesize ty) <= size_arguments_sf tyl ofs0.
-Proof.
-  induction tyl; simpl; intros.
-  elim H.
-  destruct a.
-- (* int *)
-  destruct H.
-  destruct (zlt ofs0 0); inv H. apply size_arguments_sf_above.
-  eauto.
-- (* float *)
-  destruct H.
-  destruct (zlt (align ofs0 2) 0); inv H. apply size_arguments_sf_above.
-  eauto.
-- (* long *)
-  destruct H.
-  destruct Archi.big_endian.
-  destruct (zlt (align ofs0 2) 0); inv H.
-  eapply Z.le_trans. 2: apply size_arguments_sf_above. simpl; xomega.
-  destruct (zlt (align ofs0 2) 0); inv H.
-  rewrite <- Z.add_assoc. simpl. apply size_arguments_sf_above.
-  destruct H.
-  destruct Archi.big_endian.
-  destruct (zlt (align ofs0 2) 0); inv H.
-  rewrite <- Z.add_assoc. simpl. apply size_arguments_sf_above.
-  destruct (zlt (align ofs0 2) 0); inv H.
-  eapply Z.le_trans. 2: apply size_arguments_sf_above. simpl; xomega.
-  eauto.
-- (* float *)
-  destruct H.
-  destruct (zlt ofs0 0); inv H. apply size_arguments_sf_above.
-  eauto.
-- (* any32 *)
-  destruct H.
-  destruct (zlt ofs0 0); inv H. apply size_arguments_sf_above.
-  eauto.
-- (* any64 *)
-  destruct H.
-  destruct (zlt (align ofs0 2) 0); inv H. apply size_arguments_sf_above.
-  eauto.
-Qed.
-
-Lemma loc_arguments_bounded:
-  forall (s: signature) (ofs: Z) (ty: typ),
-  In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments s)) ->
-  ofs + typesize ty <= size_arguments s.
-Proof.
-  unfold loc_arguments, size_arguments; intros.
-  assert (In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments_sf (sig_args s) (-4))) ->
-          ofs + typesize ty <= size_arguments_sf (sig_args s) (-4)).
-  { intros. eapply Z.le_trans. 2: eapply loc_arguments_sf_bounded; eauto. xomega. }
-  assert (In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments_hf (sig_args s) 0 0 0)) ->
-          ofs + typesize ty <= size_arguments_hf (sig_args s) 0 0 0).
-  { intros. eapply loc_arguments_hf_bounded; eauto. }
-  destruct Archi.abi; [ | destruct (cc_vararg (sig_cc s)) ]; eauto.
-Qed.
-
 Lemma loc_arguments_main:
   loc_arguments signature_main = nil.
 Proof.
diff --git a/arm/DuplicateOpcodeHeuristic.ml b/arm/DuplicateOpcodeHeuristic.ml
index 85505245..9b6a6409 100644
--- a/arm/DuplicateOpcodeHeuristic.ml
+++ b/arm/DuplicateOpcodeHeuristic.ml
@@ -1,3 +1,22 @@
-exception HeuristicSucceeded
-        
-let opcode_heuristic code cond ifso ifnot preferred = ()
+open Op
+open Integers
+
+let opcode_heuristic code cond ifso ifnot is_loop_header =
+  match cond with
+  | Ccompimm (c, n) | Ccompuimm (c, n) -> if n == Integers.Int.zero then (match c with
+      | Clt | Cle -> Some false
+      | Cgt | Cge -> Some true
+      | _ -> None
+      ) else None
+  | Ccompf c | Ccompfs c -> (match c with
+      | Ceq -> Some false
+      | Cne -> Some true
+      | _ -> None
+      )
+  | Cnotcompf c | Cnotcompfs c -> (match c with
+      | Ceq -> Some true
+      | Cne -> Some false
+      | _ -> None
+      )
+  | _ -> None
+
diff --git a/backend/Allocation.v b/backend/Allocation.v
index d18b07a9..2323c050 100644
--- a/backend/Allocation.v
+++ b/backend/Allocation.v
@@ -314,10 +314,10 @@ Definition pair_instr_block
           Some(BSbuiltin ef args res mv1 args' res' mv2 s)
       | _ => None
       end
-  | Icond cond args s1 s2 =>
+  | Icond cond args s1 s2 i =>
       let (mv1, b1) := extract_moves nil b in
       match b1 with
-      | Lcond cond' args' s1' s2' :: b2 =>
+      | Lcond cond' args' s1' s2' i' :: b2 =>
           assertion (eq_condition cond cond');
           assertion (peq s1 s1');
           assertion (peq s2 s2');
diff --git a/backend/Allocproof.v b/backend/Allocproof.v
index b6880860..3c7df58a 100644
--- a/backend/Allocproof.v
+++ b/backend/Allocproof.v
@@ -169,11 +169,11 @@ Inductive expand_block_shape: block_shape -> RTL.instruction -> LTL.bblock -> Pr
                          (Ibuiltin ef args res s)
                          (expand_moves mv1
                            (Lbuiltin ef args' res' :: expand_moves mv2 (Lbranch s :: k)))
-  | ebs_cond: forall cond args mv args' s1 s2 k,
+  | ebs_cond: forall cond args mv args' s1 s2 k i i',
       wf_moves mv ->
       expand_block_shape (BScond cond args mv args' s1 s2)
-                         (Icond cond args s1 s2)
-                         (expand_moves mv (Lcond cond args' s1 s2 :: k))
+                         (Icond cond args s1 s2 i)
+                         (expand_moves mv (Lcond cond args' s1 s2 i' :: k))
   | ebs_jumptable: forall arg mv arg' tbl k,
       wf_moves mv ->
       expand_block_shape (BSjumptable arg mv arg' tbl)
diff --git a/backend/CSE.v b/backend/CSE.v
index 2827161d..1936d4e4 100644
--- a/backend/CSE.v
+++ b/backend/CSE.v
@@ -496,7 +496,7 @@ Definition transfer (f: function) (approx: PMap.t VA.t) (pc: node) (before: numb
           | EF_vload _ | EF_annot _ _ _ | EF_annot_val _ _ _ | EF_debug _ _ _ =>
               set_res_unknown before res
           end
-      | Icond cond args ifso ifnot =>
+      | Icond cond args ifso ifnot _ =>
           before
       | Ijumptable arg tbl =>
           before
@@ -549,10 +549,10 @@ Definition transf_instr (n: numbering) (instr: instruction) :=
       let (n1, vl) := valnum_regs n args in
       let (addr', args') := reduce _ combine_addr n1 addr args vl in
       Istore chunk addr' args' src s
-  | Icond cond args s1 s2 =>
+  | Icond cond args s1 s2 i =>
       let (n1, vl) := valnum_regs n args in
       let (cond', args') := reduce _ combine_cond n1 cond args vl in
-      Icond cond' args' s1 s2
+      Icond cond' args' s1 s2 i
   | _ =>
       instr
   end.
diff --git a/backend/CSE2.v b/backend/CSE2.v
index 38a46c1b..900a7517 100644
--- a/backend/CSE2.v
+++ b/backend/CSE2.v
@@ -6,7 +6,7 @@ David Monniaux, CNRS, VERIMAG
 
 Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
 Require Import AST Linking.
-Require Import Memory Registers Op RTL Maps.
+Require Import Memory Registers Op RTL Maps CSE2deps.
 
 (* Static analysis *)
 
@@ -265,7 +265,7 @@ Definition kill_sym_val (dst : reg) (sv : sym_val) :=
 Definition kill_reg (dst : reg) (rel : RELATION.t) :=
   PTree.filter1 (fun x => negb (kill_sym_val dst x))
                 (PTree.remove dst rel).
-
+  
 Definition kill_sym_val_mem (sv: sym_val) :=
   match sv with
   | SMove _ => false
@@ -273,16 +273,28 @@ Definition kill_sym_val_mem (sv: sym_val) :=
   | SLoad _ _ _ => true
   end.
 
+Definition kill_sym_val_store chunk addr args (sv: sym_val) :=
+  match sv with
+  | SMove _ => false
+  | SOp op _ => op_depends_on_memory op
+  | SLoad chunk' addr' args' => may_overlap chunk addr args chunk' addr' args'
+  end.
+
 Definition kill_mem (rel : RELATION.t) :=
   PTree.filter1 (fun x => negb (kill_sym_val_mem x)) rel.
 
-
 Definition forward_move (rel : RELATION.t) (x : reg) : reg :=
   match rel ! x with
   | Some (SMove org) => org
   | _ => x
   end.
 
+Definition kill_store1 chunk addr args rel :=
+  PTree.filter1 (fun x => negb (kill_sym_val_store chunk addr args x)) rel.
+
+Definition kill_store chunk addr args rel :=
+  kill_store1 chunk addr (List.map (forward_move rel) args) rel.
+
 Definition move (src dst : reg) (rel : RELATION.t) :=
   PTree.set dst (SMove (forward_move rel src)) (kill_reg dst rel).
 
@@ -393,9 +405,9 @@ Qed.
 Definition apply_instr instr (rel : RELATION.t) : RB.t :=
   match instr with
   | Inop _
-  | Icond _ _ _ _
+  | Icond _ _ _ _ _
   | Ijumptable _ _ => Some rel
-  | Istore _ _ _ _ _ => Some (kill_mem rel)
+  | Istore chunk addr args _ _ => Some (kill_store chunk addr args rel)
   | Iop op args dst _ => Some (gen_oper op dst args rel)
   | Iload trap chunk addr args dst _ => Some (load chunk addr dst args rel)
   | Icall _ _ _ dst _ => Some (kill_reg dst (kill_mem rel))
@@ -457,7 +469,7 @@ Definition transf_instr (fmap : option (PMap.t RB.t))
   match instr with
   | Iop op args dst s =>
     let args' := subst_args fmap pc args in
-    match find_op_in_fmap fmap pc op args' with
+    match (if is_trivial_op op then None else find_op_in_fmap fmap pc op args') with
     | None => Iop op args' dst s
     | Some src => Iop Omove (src::nil) dst s
     end
@@ -473,8 +485,8 @@ Definition transf_instr (fmap : option (PMap.t RB.t))
     Icall sig ros (subst_args fmap pc args) dst s
   | Itailcall sig ros args =>
     Itailcall sig ros (subst_args fmap pc args)
-  | Icond cond args s1 s2 =>
-    Icond cond (subst_args fmap pc args) s1 s2
+  | Icond cond args s1 s2 i =>
+    Icond cond (subst_args fmap pc args) s1 s2 i
   | Ijumptable arg tbl =>
     Ijumptable (subst_arg fmap pc arg) tbl
   | Ireturn (Some arg) =>
diff --git a/backend/CSE2proof.v b/backend/CSE2proof.v
index 254cc4ce..309ccce1 100644
--- a/backend/CSE2proof.v
+++ b/backend/CSE2proof.v
@@ -13,7 +13,8 @@ Require Import Memory Registers Op RTL Maps.
 Require Import Globalenvs Values.
 Require Import Linking Values Memory Globalenvs Events Smallstep.
 Require Import Registers Op RTL.
-Require Import CSE2.
+Require Import CSE2 CSE2deps CSE2depsproof.
+Require Import Lia.
 
 Lemma args_unaffected:
   forall rs : regset,
@@ -54,9 +55,9 @@ Definition sem_sym_val sym rs (v : option val) : Prop :=
     match eval_addressing genv sp addr rs##args with
     | Some a => match Mem.loadv chunk m a with
                 | Some dat => v = Some dat
-                | None => v = None \/ v = Some (default_notrap_load_value chunk)
+                | None => v = None \/ v = Some Vundef
                 end
-    | None   => v = None \/ v = Some (default_notrap_load_value chunk)
+    | None   => v = None \/ v = Some Vundef
     end
   end.
     
@@ -390,6 +391,7 @@ Proof.
   apply REC; auto.
 Qed.
 
+
 Lemma find_load_sound :
   forall rel : RELATION.t,
   forall chunk : memory_chunk,
@@ -402,9 +404,9 @@ Lemma find_load_sound :
     match eval_addressing genv sp addr rs##args with
     | Some a => match Mem.loadv chunk m a with
                 | Some dat => rs#src = dat
-                | None => rs#src = default_notrap_load_value chunk
+                | None => rs#src = Vundef
                 end
-    | None   => rs#src = default_notrap_load_value chunk
+    | None   => rs#src = Vundef
     end.
 Proof.
   intros until rs.
@@ -419,9 +421,9 @@ Proof.
                match eval_addressing genv sp addr rs##args with
                | Some a => match Mem.loadv chunk m a with
                            | Some dat => rs#src = dat
-                           | None => rs#src = default_notrap_load_value chunk
+                           | None => rs#src = Vundef
                            end
-               | None   => rs#src = default_notrap_load_value chunk
+               | None   => rs#src = Vundef
                end
              end ->
     fold_left
@@ -431,9 +433,9 @@ Proof.
     match eval_addressing genv sp addr rs##args with
                | Some a => match Mem.loadv chunk m a with
                            | Some dat => rs#src = dat
-                           | None => rs#src = default_notrap_load_value chunk
+                           | None => rs#src = Vundef
                            end
-               | None   => rs#src = default_notrap_load_value chunk
+               | None   => rs#src = Vundef
                end) as REC.
   
   {
@@ -521,7 +523,7 @@ Lemma find_load_notrap1_sound' :
     sem_rel rel rs ->
     find_load rel chunk addr args = Some src ->
     eval_addressing genv sp addr rs##args = None ->
-    rs # src = (default_notrap_load_value chunk).
+    rs # src = Vundef.
 Proof.
   intros until rs. intros REL FINDLOAD ADDR.
   pose proof (find_load_sound rel chunk addr src args rs REL FINDLOAD) as Z.
@@ -541,7 +543,7 @@ Lemma find_load_notrap2_sound' :
     find_load rel chunk addr args = Some src ->
     eval_addressing genv sp addr rs##args = Some a ->
     Mem.loadv chunk m a = None ->
-    rs # src = (default_notrap_load_value chunk).
+    rs # src = Vundef.
 Proof.
   intros until a. intros REL FINDLOAD ADDR LOAD.
   pose proof (find_load_sound rel chunk addr src args rs REL FINDLOAD) as Z.
@@ -566,6 +568,20 @@ Proof.
   destruct s; congruence.
 Qed.
 
+
+Lemma forward_move_rs:
+  forall rel arg rs,
+    sem_rel rel rs ->
+    rs # (forward_move rel arg) = rs # arg.
+Proof.
+  unfold forward_move, sem_rel, sem_reg, sem_sym_val in *.
+  intros until rs.
+  intro REL.
+  pose proof (REL arg) as RELarg.
+  destruct (rel ! arg); trivial.
+  destruct s; congruence.
+Qed.
+
 Lemma oper_sound :
   forall rel : RELATION.t,
   forall op : operation,
@@ -673,11 +689,11 @@ Lemma load2_notrap1_sound :
     sem_rel rel rs ->
     not (In dst args) ->
     eval_addressing genv sp addr (rs ## args) = None ->
-    sem_rel (load2 chunk addr dst args rel) (rs # dst <- (default_notrap_load_value chunk)).
+    sem_rel (load2 chunk addr dst args rel) (rs # dst <- Vundef).
 Proof.
   intros until rs.
   intros REL NOT_IN ADDR x.
-  pose proof (kill_reg_sound rel dst rs (default_notrap_load_value chunk) REL x) as KILL.
+  pose proof (kill_reg_sound rel dst rs Vundef REL x) as KILL.
   unfold load2.
   destruct (peq x dst).
   {
@@ -710,11 +726,11 @@ Lemma load2_notrap2_sound :
     not (In dst args) ->
     eval_addressing genv sp addr (rs ## args) = Some a ->
     Mem.loadv chunk m a = None ->
-    sem_rel (load2 chunk addr dst args rel) (rs # dst <- (default_notrap_load_value chunk)).
+    sem_rel (load2 chunk addr dst args rel) (rs # dst <- Vundef).
 Proof.
   intros until a.
   intros REL NOT_IN ADDR LOAD x.
-  pose proof (kill_reg_sound rel dst rs  (default_notrap_load_value chunk) REL x) as KILL.
+  pose proof (kill_reg_sound rel dst rs Vundef REL x) as KILL.
   unfold load2.
   destruct (peq x dst).
   {
@@ -768,7 +784,7 @@ Lemma load1_notrap1_sound :
   forall rs : regset,
     sem_rel rel rs ->
     eval_addressing genv sp addr (rs ## args) = None ->
-    sem_rel (load1 chunk addr dst args rel) (rs # dst <- (default_notrap_load_value chunk)).
+    sem_rel (load1 chunk addr dst args rel) (rs # dst <- Vundef).
 Proof.
   intros until rs.
   intros REL ADDR LOAD.
@@ -791,7 +807,7 @@ Lemma load1_notrap2_sound :
     sem_rel rel rs ->
     eval_addressing genv sp addr (rs ## args) = Some a ->
     Mem.loadv chunk m a = None ->
-    sem_rel (load1 chunk addr dst args rel) (rs # dst <- (default_notrap_load_value chunk)).
+    sem_rel (load1 chunk addr dst args rel) (rs # dst <- Vundef).
 Proof.
   intros until a.
   intros REL ADDR LOAD.
@@ -825,9 +841,9 @@ Proof.
     assert (match eval_addressing genv sp addr rs## (map (forward_move rel) args) with
     | Some a => match Mem.loadv chunk m a with
                 | Some dat => rs#src = dat
-                | None => rs#src = default_notrap_load_value chunk
+                | None => rs#src = Vundef
                 end
-    | None   => rs#src = default_notrap_load_value chunk
+    | None   => rs#src = Vundef
     end) as FIND_LOAD.
     {
       apply (find_load_sound rel); trivial.
@@ -853,7 +869,7 @@ Lemma load_notrap1_sound :
   forall rs : regset,
     sem_rel rel rs ->
     eval_addressing genv sp addr (rs ## args) = None ->
-    sem_rel (load chunk addr dst args rel) (rs # dst <- (default_notrap_load_value chunk)).
+    sem_rel (load chunk addr dst args rel) (rs # dst <- Vundef).
 Proof.
   intros until rs.
   intros REL ADDR.
@@ -863,9 +879,9 @@ Proof.
     assert (match eval_addressing genv sp addr rs## (map (forward_move rel) args) with
     | Some a => match Mem.loadv chunk m a with
                 | Some dat => rs#src = dat
-                | None => rs#src = default_notrap_load_value chunk
+                | None => rs#src = Vundef
                 end
-    | None   => rs#src = default_notrap_load_value chunk
+    | None   => rs#src = Vundef
     end) as FIND_LOAD.
     {
       apply (find_load_sound rel); trivial.
@@ -890,7 +906,7 @@ Lemma load_notrap2_sound :
     sem_rel rel rs ->
     eval_addressing genv sp addr (rs ## args) = Some a ->
     Mem.loadv chunk m a = None ->
-    sem_rel (load chunk addr dst args rel) (rs # dst <- (default_notrap_load_value chunk)).
+    sem_rel (load chunk addr dst args rel) (rs # dst <- Vundef).
 Proof.
   intros until a.
   intros REL ADDR.
@@ -900,9 +916,9 @@ Proof.
     assert (match eval_addressing genv sp addr rs## (map (forward_move rel) args) with
     | Some a => match Mem.loadv chunk m a with
                 | Some dat => rs#src = dat
-                | None => rs#src = default_notrap_load_value chunk
+                | None => rs#src = Vundef
                 end
-    | None   => rs#src = default_notrap_load_value chunk
+    | None   => rs#src = Vundef
     end) as FIND_LOAD.
     {
       apply (find_load_sound rel); trivial.
@@ -984,7 +1000,40 @@ Proof.
     apply op_depends_on_memory_correct; auto.
   }
 Qed.
-
+  
+Lemma kill_store_sound :
+  forall m m' : mem,
+  forall rel : RELATION.t,
+  forall chunk addr args a v rs,
+    (eval_addressing genv sp addr (rs ## args)) = Some a ->
+    (Mem.storev chunk m a v) = Some m' ->
+    sem_rel m rel rs -> sem_rel m' (kill_store chunk addr args rel) rs.
+Proof.
+  unfold sem_rel, sem_reg.
+  intros until rs.
+  intros ADDR STORE SEM x.
+  pose proof (SEM x) as SEMx.
+  unfold kill_store, kill_store1.
+  rewrite PTree.gfilter1.
+  destruct (rel ! x) as [ sv | ].
+  2: reflexivity.
+  destruct sv; simpl in *; trivial.
+  {
+    destruct op_depends_on_memory eqn:DEPENDS; simpl; trivial.
+    rewrite SEMx.
+    apply op_depends_on_memory_correct; auto.
+  }
+  destruct may_overlap eqn:OVERLAP; simpl; trivial.
+  destruct (eval_addressing genv sp addr0 rs ## args0) eqn:ADDR0.
+  {
+    erewrite may_overlap_sound with (args := (map (forward_move rel) args)).
+    all: try eassumption.
+    
+    erewrite forward_move_map by eassumption.
+    assumption.
+  }
+  intuition congruence.
+Qed. 
 End SOUNDNESS.
 
 Definition match_prog (p tp: RTL.program) :=
@@ -1072,6 +1121,7 @@ Definition fmap_sem' := fmap_sem fundef unit ge.
 Definition subst_arg_ok' := subst_arg_ok fundef unit ge.
 Definition subst_args_ok' := subst_args_ok fundef unit ge.
 Definition kill_mem_sound' := kill_mem_sound fundef unit ge.
+Definition kill_store_sound' := kill_store_sound fundef unit ge.
 
 Lemma sem_rel_b_ge:
   forall rb1 rb2 : RB.t,
@@ -1150,8 +1200,11 @@ Proof.
   reflexivity.
 - (* op *)
   unfold transf_instr in *.
-  destruct find_op_in_fmap eqn:FIND_OP.
+  destruct (if is_trivial_op op then None else find_op_in_fmap (forward_map f) pc op
+               (subst_args (forward_map f) pc args)) eqn:FIND_OP.
   {
+    destruct (is_trivial_op op).
+    discriminate.
     unfold find_op_in_fmap, fmap_sem', fmap_sem in *.
     destruct (forward_map f) as [map |] eqn:MAP.
     2: discriminate.
@@ -1241,9 +1294,9 @@ Proof.
       {
         f_equal.
         symmetry.
-        apply find_load_sound' with (chunk := chunk) (m := m) (a := a) (genv := ge) (sp := sp) (addr := addr) (args := subst_args (Some map) pc args) (rel := mpc) (src := r) (rs := rs); trivial.
         rewrite MAP in H0.
-        assumption.
+        eapply find_load_sound' with (genv := ge) (sp := sp) (addr := addr) (args := subst_args (Some map) pc args) (rel := mpc) (src := r) (rs := rs).
+        all: eassumption.
       }
       unfold fmap_sem. rewrite MAP. rewrite MPC. assumption.
     }
@@ -1268,36 +1321,37 @@ Proof.
     apply load_sound with (a := a); auto.
   }
   {  
-    econstructor; split.
-    assert (eval_addressing tge sp addr rs ## args = Some a).
-    rewrite <- H0.
-    apply eval_addressing_preserved. exact symbols_preserved.
-    eapply exec_Iload; eauto.
-    rewrite (subst_args_ok' sp m); assumption.
-    constructor; auto.
-    
-    simpl in *.
-    unfold fmap_sem', fmap_sem in *.
-    destruct (forward_map _) as [map |] eqn:MAP in *; trivial.
-    destruct (map # pc) as [mpc |] eqn:MPC in *; try contradiction.
-    apply sem_rel_b_ge with (rb2 := Some (load chunk addr dst args mpc)).
+  econstructor; split.
+  assert (eval_addressing tge sp addr rs ## args = Some a).
+  rewrite <- H0.
+  apply eval_addressing_preserved. exact symbols_preserved.
+  eapply exec_Iload; eauto.
+  rewrite (subst_args_ok' sp m); assumption.
+  constructor; auto.
+
+  simpl in *.
+  unfold fmap_sem', fmap_sem in *.
+  destruct (forward_map _) as [map |] eqn:MAP in *; trivial.
+  destruct (map # pc) as [mpc |] eqn:MPC in *; try contradiction.
+  apply sem_rel_b_ge with (rb2 := Some (load chunk addr dst args mpc)).
+  {
+    replace (Some (load chunk addr dst args mpc)) with (apply_instr' (fn_code f) pc (map # pc)).
     {
-      replace (Some (load chunk addr dst args mpc)) with (apply_instr' (fn_code f) pc (map # pc)).
-      {
-        eapply DS.fixpoint_solution with (code := fn_code f) (successors := successors_instr); try eassumption.
-        2: apply apply_instr'_bot.
-        simpl. tauto.
-      }
-      unfold apply_instr'.
-      rewrite H.
-      rewrite MPC.
-      simpl.
-      reflexivity.
+      eapply DS.fixpoint_solution with (code := fn_code f) (successors := successors_instr); try eassumption.
+      2: apply apply_instr'_bot.
+      simpl. tauto.
     }
-    apply load_sound with (a := a); assumption.
+    unfold apply_instr'.
+    rewrite H.
+    rewrite MPC.
+    simpl.
+    reflexivity.
+  }
+  apply load_sound with (a := a); assumption.
   }
   
--  unfold transf_instr in *.
+- (* load notrap1 *)
+  unfold transf_instr in *.
   destruct find_load_in_fmap eqn:FIND_LOAD.
   {
     unfold find_load_in_fmap, fmap_sem', fmap_sem in *.
@@ -1306,16 +1360,16 @@ Proof.
     change (@PMap.get (option RELATION.t) pc map) with (map # pc) in *. 
     destruct (map # pc) as [mpc | ] eqn:MPC.
     2: discriminate.
-
     econstructor; split.
     {
       eapply exec_Iop with (v := (default_notrap_load_value chunk)); eauto.
+      simpl.
       rewrite <- subst_args_ok with (genv := ge) (f := f) (pc := pc) (sp := sp) (m := m) in H0.
-      { simpl.
+      {
         f_equal.
-        apply find_load_notrap1_sound' with (chunk := chunk) (m := m) (genv := ge) (sp := sp) (addr := addr) (args := subst_args (Some map) pc args) (rel := mpc) (src := r) (rs := rs); trivial.
         rewrite MAP in H0.
-        assumption.
+        eapply find_load_notrap1_sound' with (genv := ge) (sp := sp) (addr := addr) (args := subst_args (Some map) pc args) (rel := mpc) (src := r) (rs := rs).
+        all: eassumption.
       }
       unfold fmap_sem. rewrite MAP. rewrite MPC. assumption.
     }
@@ -1339,37 +1393,38 @@ Proof.
     unfold sem_rel_b', sem_rel_b.
     apply load_notrap1_sound; auto.
   }
-  {
-    econstructor; split.
-    assert (eval_addressing tge sp addr rs ## args = None).
-    rewrite <- H0. apply eval_addressing_preserved. exact symbols_preserved.
-    eapply exec_Iload_notrap1; eauto.
-    rewrite subst_args_ok with (genv := ge) (sp := sp) (m := m) ; assumption.
-    constructor; auto.
+  {  
+  econstructor; split.
+  assert (eval_addressing tge sp addr rs ## args = None).
+  rewrite <- H0.
+  apply eval_addressing_preserved. exact symbols_preserved.
+  eapply exec_Iload_notrap1; eauto.
+  rewrite (subst_args_ok' sp m); assumption.
+  constructor; auto.
 
-    simpl in *.
-    unfold fmap_sem', fmap_sem in *.
-    destruct (forward_map _) as [map |] eqn:MAP in *; trivial.
-    destruct (map # pc) as [mpc |] eqn:MPC in *; try contradiction.
-    apply sem_rel_b_ge with (rb2 := Some (load chunk addr dst args mpc)).
+  simpl in *.
+  unfold fmap_sem', fmap_sem in *.
+  destruct (forward_map _) as [map |] eqn:MAP in *; trivial.
+  destruct (map # pc) as [mpc |] eqn:MPC in *; try contradiction.
+  apply sem_rel_b_ge with (rb2 := Some (load chunk addr dst args mpc)).
+  {
+    replace (Some (load chunk addr dst args mpc)) with (apply_instr' (fn_code f) pc (map # pc)).
     {
-      replace (Some (load chunk addr dst args mpc)) with (apply_instr' (fn_code f) pc (map # pc)).
-      {
-        eapply DS.fixpoint_solution with (code := fn_code f) (successors := successors_instr); try eassumption.
-        2: apply apply_instr'_bot.
-        simpl. tauto.
-      }
-      unfold apply_instr'.
-      rewrite H.
-      rewrite MPC.
-      simpl.
-      reflexivity.
+      eapply DS.fixpoint_solution with (code := fn_code f) (successors := successors_instr); try eassumption.
+      2: apply apply_instr'_bot.
+      simpl. tauto.
     }
-    apply load_notrap1_sound; trivial.
+    unfold apply_instr'.
+    rewrite H.
+    rewrite MPC.
+    simpl.
+    reflexivity.
+  }
+  apply load_notrap1_sound; assumption.
   }
   
-- (* load notrap2 *)
-   unfold transf_instr in *.
+(* load notrap2 *)
+- unfold transf_instr in *.
   destruct find_load_in_fmap eqn:FIND_LOAD.
   {
     unfold find_load_in_fmap, fmap_sem', fmap_sem in *.
@@ -1385,9 +1440,9 @@ Proof.
       rewrite <- subst_args_ok with (genv := ge) (f := f) (pc := pc) (sp := sp) (m := m) in H0.
       {
         f_equal.
-        apply find_load_notrap2_sound' with (chunk := chunk) (m := m) (a := a) (genv := ge) (sp := sp) (addr := addr) (args := subst_args (Some map) pc args) (rel := mpc) (src := r) (rs := rs); trivial.
         rewrite MAP in H0.
-        assumption.
+        eapply find_load_notrap2_sound' with (genv := ge) (sp := sp) (addr := addr) (args := subst_args (Some map) pc args) (rel := mpc) (src := r) (rs := rs).
+        all: try eassumption.
       }
       unfold fmap_sem. rewrite MAP. rewrite MPC. assumption.
     }
@@ -1411,19 +1466,20 @@ Proof.
     unfold sem_rel_b', sem_rel_b.
     apply load_notrap2_sound with (a := a); auto.
   }
-  {
+  {  
   econstructor; split.
   assert (eval_addressing tge sp addr rs ## args = Some a).
-  rewrite <- H0. apply eval_addressing_preserved. exact symbols_preserved.
+  rewrite <- H0.
+  apply eval_addressing_preserved. exact symbols_preserved.
   eapply exec_Iload_notrap2; eauto.
-  rewrite subst_args_ok with (genv := ge) (sp := sp) (m := m); assumption.
+  rewrite (subst_args_ok' sp m); assumption.
   constructor; auto.
 
   simpl in *.
   unfold fmap_sem', fmap_sem in *.
   destruct (forward_map _) as [map |] eqn:MAP in *; trivial.
   destruct (map # pc) as [mpc |] eqn:MPC in *; try contradiction.
-  apply sem_rel_b_ge with (rb2 := Some  (load chunk addr dst args mpc)).
+  apply sem_rel_b_ge with (rb2 := Some (load chunk addr dst args mpc)).
   {
     replace (Some (load chunk addr dst args mpc)) with (apply_instr' (fn_code f) pc (map # pc)).
     {
@@ -1454,9 +1510,9 @@ Proof.
   unfold fmap_sem', fmap_sem in *.
   destruct (forward_map _) as [map |] eqn:MAP in *; trivial.
   destruct (map # pc) as [mpc |] eqn:MPC in *; try contradiction.
-  apply sem_rel_b_ge with (rb2 := Some (kill_mem mpc)); trivial.
+  apply sem_rel_b_ge with (rb2 := Some (kill_store chunk addr args mpc)); trivial.
   {
-  replace (Some (kill_mem mpc)) with (apply_instr' (fn_code f) pc (map # pc)).
+  replace (Some (kill_store chunk addr args mpc)) with (apply_instr' (fn_code f) pc (map # pc)).
   {
     eapply DS.fixpoint_solution with (code := fn_code f) (successors := successors_instr); try eassumption.
     2: apply apply_instr'_bot.
@@ -1468,8 +1524,7 @@ Proof.
   rewrite H.
   reflexivity.
   }
-  apply (kill_mem_sound' sp m).
-  assumption.
+  eapply (kill_store_sound' sp m); eassumption.
   
 (* call *)
 - econstructor; split.
@@ -1625,7 +1680,7 @@ Proof.
   econstructor; split.
   eapply exec_return; eauto.
   constructor; auto.
-Admitted.
+Qed.
 
 
 Lemma transf_initial_states:
@@ -1657,4 +1712,4 @@ Proof.
   exact step_simulation.
 Qed.
 
-End PRESERVATION.
-\ No newline at end of file
+End PRESERVATION.
diff --git a/backend/Constprop.v b/backend/Constprop.v
index eda41b39..0be9438c 100644
--- a/backend/Constprop.v
+++ b/backend/Constprop.v
@@ -69,7 +69,7 @@ Fixpoint successor_rec (n: nat) (f: function) (ae: AE.t) (pc: node) : node :=
       match f.(fn_code)!pc with
       | Some (Inop s) =>
           successor_rec n' f ae s
-      | Some (Icond cond args s1 s2) =>
+      | Some (Icond cond args s1 s2 _) =>
           match resolve_branch (eval_static_condition cond (aregs ae args)) with
           | Some b => successor_rec n' f ae (if b then s1 else s2)
           | None => pc
@@ -217,14 +217,14 @@ Definition transf_instr (f: function) (an: PMap.t VA.t) (rm: romem)
              end
           | _, _ => dfl
           end
-      | Icond cond args s1 s2 =>
+      | Icond cond args s1 s2 i =>
           let aargs := aregs ae args in
           match resolve_branch (eval_static_condition cond aargs) with
           | Some b =>
               if b then Inop s1 else Inop s2
           | None =>
               let (cond', args') := cond_strength_reduction cond args aargs in
-              Icond cond' args' s1 s2
+              Icond cond' args' s1 s2 i
           end
       | Ijumptable arg tbl =>
           match areg ae arg with
diff --git a/backend/Constpropproof.v b/backend/Constpropproof.v
index 63cfee24..60663503 100644
--- a/backend/Constpropproof.v
+++ b/backend/Constpropproof.v
@@ -142,8 +142,8 @@ Inductive match_pc (f: function) (rs: regset) (m: mem): nat -> node -> node -> P
       f.(fn_code)!pc = Some (Inop s) ->
       match_pc f rs m n s pcx ->
       match_pc f rs m (S n) pc pcx
-  | match_pc_cond: forall n pc cond args s1 s2 pcx,
-      f.(fn_code)!pc = Some (Icond cond args s1 s2) ->
+  | match_pc_cond: forall n pc cond args s1 s2 pcx i,
+      f.(fn_code)!pc = Some (Icond cond args s1 s2 i) ->
       (forall b,
         eval_condition cond rs##args m = Some b ->
         match_pc f rs m n (if b then s1 else s2) pcx) ->
diff --git a/backend/Conventions.v b/backend/Conventions.v
index 6025c6b4..14ffb587 100644
--- a/backend/Conventions.v
+++ b/backend/Conventions.v
@@ -34,6 +34,73 @@ Proof.
   apply IHpl; auto.
 Qed.
 
+(** ** Stack size of function arguments *)
+
+(** [size_arguments s] returns the number of [Outgoing] slots used
+  to call a function with signature [s]. *)
+
+Definition max_outgoing_1 (accu: Z) (l: loc) : Z :=
+  match l with
+  | S Outgoing ofs ty => Z.max accu (ofs + typesize ty)
+  | _ => accu
+  end.
+
+Definition max_outgoing_2 (accu: Z) (rl: rpair loc) : Z :=
+  match rl with
+  | One l => max_outgoing_1 accu l
+  | Twolong l1 l2 => max_outgoing_1 (max_outgoing_1 accu l1) l2
+  end.
+
+Definition size_arguments (s: signature) : Z :=
+  List.fold_left max_outgoing_2 (loc_arguments s) 0.
+
+(** The offsets of [Outgoing] arguments are below [size_arguments s]. *)
+
+Remark fold_max_outgoing_above:
+  forall l n, fold_left max_outgoing_2 l n >= n.
+Proof.
+  assert (A: forall n l, max_outgoing_1 n l >= n).
+  { intros; unfold max_outgoing_1. destruct l as [_ | []]; xomega. }
+  induction l; simpl; intros. 
+  - omega.
+  - eapply Zge_trans. eauto.
+    destruct a; simpl. apply A. eapply Zge_trans; eauto.
+Qed.
+
+Lemma size_arguments_above:
+  forall s, size_arguments s >= 0.
+Proof.
+  intros. apply fold_max_outgoing_above.
+Qed.
+
+Lemma loc_arguments_bounded:
+  forall (s: signature) (ofs: Z) (ty: typ),
+  In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments s)) ->
+  ofs + typesize ty <= size_arguments s.
+Proof.
+  intros until ty.
+  assert (A: forall n l, n <= max_outgoing_1 n l).
+  { intros; unfold max_outgoing_1. destruct l as [_ | []]; xomega. }
+  assert (B: forall p n,
+             In (S Outgoing ofs ty) (regs_of_rpair p) ->
+             ofs + typesize ty <= max_outgoing_2 n p).
+  { intros. destruct p; simpl in H; intuition; subst; simpl.
+  - xomega.
+  - eapply Z.le_trans. 2: apply A. xomega.
+  - xomega. }
+  assert (C: forall l n,
+             In (S Outgoing ofs ty) (regs_of_rpairs l) ->
+             ofs + typesize ty <= fold_left max_outgoing_2 l n).
+  { induction l; simpl; intros.
+  - contradiction.
+  - rewrite in_app_iff in H. destruct H.
+  + eapply Z.le_trans. eapply B; eauto.
+    apply Z.ge_le. apply fold_max_outgoing_above.
+  + apply IHl; auto.
+  }
+  apply C. 
+Qed.
+
 (** ** Location of function parameters *)
 
 (** A function finds the values of its parameter in the same locations
diff --git a/backend/Deadcode.v b/backend/Deadcode.v
index 1f208a91..3412a6fa 100644
--- a/backend/Deadcode.v
+++ b/backend/Deadcode.v
@@ -142,7 +142,7 @@ Definition transfer (f: function) (approx: PMap.t VA.t)
        nmem_dead_stack f.(fn_stacksize))
   | Some(Ibuiltin ef args res s) =>
       transfer_builtin approx!!pc ef args res ne nm
-  | Some(Icond cond args s1 s2) =>
+  | Some(Icond cond args s1 s2 _) =>
       if peq s1 s2 then after else 
         (add_needs args (needs_of_condition cond) ne, nm)
   | Some(Ijumptable arg tbl) =>
@@ -192,7 +192,7 @@ Definition transf_instr (approx: PMap.t VA.t) (an: PMap.t NA.t)
       if nmem_contains (snd an!!pc) (aaddr_arg approx!!pc dst) sz
       then instr
       else Inop s
-  | Icond cond args s1 s2 =>
+  | Icond cond args s1 s2 _ =>
       if peq s1 s2 then Inop s1 else instr
   | _ =>
       instr
diff --git a/backend/Duplicate.v b/backend/Duplicate.v
index 82c17367..af85efe4 100644
--- a/backend/Duplicate.v
+++ b/backend/Duplicate.v
@@ -134,8 +134,8 @@ Definition verify_match_inst dupmap inst tinst :=
           else Error (msg "Different ef in Ibuiltin")
       | _ => Error (msg "verify_match_inst Ibuiltin") end
 
-  | Icond cond lr n1 n2 => match tinst with
-      | Icond cond' lr' n1' n2' =>
+  | Icond cond lr n1 n2 i => match tinst with
+      | Icond cond' lr' n1' n2' i' =>
           if (list_eq_dec Pos.eq_dec lr lr') then
             if (eq_condition cond cond') then
               do u1 <- verify_is_copy dupmap n1 n1';
diff --git a/backend/Duplicateaux.ml b/backend/Duplicateaux.ml
index d0b7129e..89f187da 100644
--- a/backend/Duplicateaux.ml
+++ b/backend/Duplicateaux.ml
@@ -1,25 +1,32 @@
+(* Oracle for Duplicate pass.
+ * - Add static prediction information to Icond nodes
+ * - Performs tail duplication on interesting traces to form superblocks
+ * - (TODO: perform partial loop unrolling inside innermost loops)
+ *)
+
 open RTL
 open Maps
 open Camlcoq
 
-(* TTL : IR emphasizing the preferred next node *)
-module TTL = struct
-  type instruction =
-  | Tleaf of RTL.instruction
-  | Tnext of node * RTL.instruction
-
-  type code = instruction PTree.t
-end;;
-
-open TTL
+let debug_flag = ref false
 
-(** RTL to TTL *)
+let debug fmt =
+  if !debug_flag then Printf.eprintf fmt
+  else Printf.ifprintf stderr fmt
 
 let get_some = function
 | None -> failwith "Did not get some"
 | Some thing -> thing
 
-let bfs code entrypoint =
+let rtl_successors = function
+| Itailcall _ | Ireturn _ -> []
+| Icall(_,_,_,_,n) | Ibuiltin(_,_,_,n) | Inop n | Iop (_,_,_,n)
+| Iload (_,_,_,_,_,n) | Istore (_,_,_,_,n) -> [n]
+| Icond (_,_,n1,n2,_) -> [n1; n2]
+| Ijumptable (_,ln) -> ln
+
+let bfs code entrypoint = begin
+  debug "bfs\n";
   let visited = ref (PTree.map (fun n i -> false) code)
   and bfs_list = ref []
   and to_visit = Queue.create ()
@@ -33,32 +40,24 @@ let bfs code entrypoint =
         match PTree.get !node code with
         | None -> failwith "No such node"
         | Some i ->
-            bfs_list := !bfs_list @ [!node];
-            match i with
-            | Icall(_, _, _, _, n) -> Queue.add n to_visit
-            | Ibuiltin(_, _, _, n) -> Queue.add n to_visit
-            | Ijumptable(_, ln) -> List.iter (fun n -> Queue.add n to_visit) ln
-            | Itailcall _ | Ireturn _ -> ()
-            | Icond (_, _, n1, n2) -> Queue.add n1 to_visit; Queue.add n2 to_visit
-            | Inop n | Iop (_, _, _, n) | Iload (_, _, _, _, _, n) | Istore (_, _, _, _, n) -> Queue.add n to_visit
+            bfs_list := !node :: !bfs_list;
+            let succ = rtl_successors i in
+            List.iter (fun n -> Queue.add n to_visit) succ
       end
     done;
-    !bfs_list
+    List.rev !bfs_list
   end
+end
 
 let optbool o = match o with Some _ -> true | None -> false
 
 let ptree_get_some n ptree = get_some @@ PTree.get n ptree
 
-let get_predecessors_rtl code =
+let get_predecessors_rtl code = begin
+  debug "get_predecessors_rtl\n";
   let preds = ref (PTree.map (fun n i -> []) code) in
   let process_inst (node, i) =
-    let succ = match i with
-      | Inop n | Iop (_,_,_,n) | Iload (_, _,_,_,_,n) | Istore (_,_,_,_,n)
-      | Icall (_,_,_,_,n) | Ibuiltin (_, _, _, n) -> [n]
-      | Icond (_,_,n1,n2) -> [n1;n2]
-      | Ijumptable (_,ln) -> ln
-      | Itailcall _ | Ireturn _ -> []
+    let succ = rtl_successors i
     in List.iter (fun s ->
       let previous_preds = ptree_get_some s !preds in
       if optbool @@ List.find_opt (fun e -> e == node) previous_preds then ()
@@ -67,6 +66,7 @@ let get_predecessors_rtl code =
     List.iter process_inst (PTree.elements code);
     !preds
   end
+end
 
 module PInt = struct
   type t = P.t
@@ -80,65 +80,23 @@ let print_intlist l =
   | [] -> ()
   | n::ln -> (Printf.printf "%d " (P.to_int n); f ln)
   in begin
-    Printf.printf "[";
-    f l;
-    Printf.printf "]"
+    if !debug_flag then begin
+      Printf.printf "[";
+      f l;
+      Printf.printf "]"
+    end
   end
 
 let print_intset s =
   let seq = PSet.to_seq s
   in begin
-    Printf.printf "{";
-    Seq.iter (fun n ->
-      Printf.printf "%d " (P.to_int n)
-    ) seq;
-    Printf.printf "}"
-  end
-
-(* FIXME - dominators not working well because the order of dataflow update isn't right *)
-  (*
-let get_dominators code entrypoint =
-  let bfs_order = bfs code entrypoint
-  and predecessors = get_predecessors_rtl code
-  in let doms = ref (PTree.map (fun n i -> PSet.of_list bfs_order) code)
-  in begin
-    Printf.printf "BFS: ";
-    print_intlist bfs_order;
-    Printf.printf "\n";
-    List.iter (fun n ->
-      let preds = get_some @@ PTree.get n predecessors
-      and single = PSet.singleton n
-      in match preds with
-      | [] -> doms := PTree.set n single !doms
-      | p::lp ->
-          let set_p = get_some @@ PTree.get p !doms
-          and set_lp = List.map (fun p -> get_some @@ PTree.get p !doms) lp
-          in let inter = List.fold_left PSet.inter set_p set_lp
-          in let union = PSet.union inter single
-          in begin
-            Printf.printf "----------------------------------------\n";
-            Printf.printf "n = %d\n" (P.to_int n);
-            Printf.printf "set_p = "; print_intset set_p; Printf.printf "\n";
-            Printf.printf "set_lp = ["; List.iter (fun s -> print_intset s; Printf.printf ", ") set_lp; Printf.printf "]\n";
-            Printf.printf "=> inter = "; print_intset inter; Printf.printf "\n";
-            Printf.printf "=> union = "; print_intset union; Printf.printf "\n";
-            doms := PTree.set n union !doms
-          end
-    ) bfs_order;
-    !doms
-  end
-*)
-
-let print_dominators dominators =
-  let domlist = PTree.elements dominators
-  in begin
-    Printf.printf "{\n";
-    List.iter (fun (n, doms) ->
-      Printf.printf "\t";
-      Printf.printf "%d:" (P.to_int n);
-      print_intset doms;
-      Printf.printf "\n"
-    ) domlist
+    if !debug_flag then begin
+      Printf.printf "{";
+      Seq.iter (fun n ->
+        Printf.printf "%d " (P.to_int n)
+      ) seq;
+      Printf.printf "}"
+    end
   end
 
 type vstate = Unvisited | Processed | Visited
@@ -150,7 +108,8 @@ type vstate = Unvisited | Processed | Visited
   *
   * If we come accross an edge to a Processed node, it's a loop!
   *)
-let get_loop_headers code entrypoint =
+let get_loop_headers code entrypoint = begin
+  debug "get_loop_headers\n";
   let visited = ref (PTree.map (fun n i -> Unvisited) code)
   and is_loop_header = ref (PTree.map (fun n i -> false) code)
   in let rec dfs_visit code = function
@@ -159,6 +118,7 @@ let get_loop_headers code entrypoint =
       match (get_some @@ PTree.get node !visited) with
       | Visited -> ()
       | Processed -> begin
+          debug "Node %d is a loop header\n" (P.to_int node);
           is_loop_header := PTree.set node true !is_loop_header;
           visited := PTree.set node Visited !visited
         end
@@ -166,13 +126,7 @@ let get_loop_headers code entrypoint =
           visited := PTree.set node Processed !visited;
           match PTree.get node code with
           | None -> failwith "No such node"
-          | Some i -> let next_visits = (match i with
-            | Icall (_, _, _, _, n) | Ibuiltin (_, _, _, n) | Inop n | Iop (_, _, _, n)
-            | Iload (_, _, _, _, _, n) | Istore (_, _, _, _, n) -> [n]
-            | Icond (_, _, n1, n2) -> [n1; n2]
-            | Itailcall _ | Ireturn _ -> []
-            | Ijumptable (_, ln) -> ln
-            ) in dfs_visit code next_visits;
+          | Some i -> let next_visits = rtl_successors i in dfs_visit code next_visits;
           visited := PTree.set node Visited !visited;
           dfs_visit code ln
         end
@@ -180,140 +134,220 @@ let get_loop_headers code entrypoint =
     dfs_visit code [entrypoint];
     !is_loop_header
   end
+end
 
 let ptree_printbool pt =
   let elements = PTree.elements pt
   in begin
-    Printf.printf "[";
-    List.iter (fun (n, b) ->
-      if b then Printf.printf "%d, " (P.to_int n) else ()
-    ) elements;
-    Printf.printf "]"
+    if !debug_flag then begin
+      Printf.printf "[";
+      List.iter (fun (n, b) ->
+        if b then Printf.printf "%d, " (P.to_int n) else ()
+      ) elements;
+      Printf.printf "]"
+    end
   end
 
 (* Looks ahead (until a branch) to see if a node further down verifies
  * the given predicate *)
 let rec look_ahead code node is_loop_header predicate =
   if (predicate node) then true
-  else match (get_some @@ PTree.get node code) with
-    | Ireturn _ | Itailcall _ | Icond _ | Ijumptable _ -> false
-    | Inop n | Iop (_, _, _, n) | Iload (_, _, _, _, _, n)
-    | Istore (_, _, _, _, n) | Icall (_, _, _, _, n)
-    | Ibuiltin (_, _, _, n) ->
-      if (predicate n) then true
-      else (
-        if (get_some @@ PTree.get n is_loop_header) then false
-        else look_ahead code n is_loop_header predicate
-      )
-
-exception HeuristicSucceeded
-
-let do_call_heuristic code ifso ifnot is_loop_header preferred =
-  let predicate n = (function
-  | Icall _ -> true
-  | _ -> false) @@ get_some @@ PTree.get n code
-  in if (look_ahead code ifso is_loop_header predicate) then
-    (preferred := false; raise HeuristicSucceeded)
-  else if (look_ahead code ifnot is_loop_header predicate) then
-    (preferred := true; raise HeuristicSucceeded)
-  else ()
-
-let do_opcode_heuristic code cond ifso ifnot preferred = DuplicateOpcodeHeuristic.opcode_heuristic code cond ifso ifnot preferred
-
-let do_return_heuristic code ifso ifnot is_loop_header preferred =
-  let predicate n = (function
-  | Ireturn _ -> true
-  | _ -> false) @@ get_some @@ PTree.get n code
-  in if (look_ahead code ifso is_loop_header predicate) then
-    (preferred := false; raise HeuristicSucceeded)
-  else if (look_ahead code ifnot is_loop_header predicate) then
-    (preferred := true; raise HeuristicSucceeded)
-  else ()
-
-let do_store_heuristic code ifso ifnot is_loop_header preferred =
-  let predicate n = (function
-  | Istore _ -> true
-  | _ -> false) @@ get_some @@ PTree.get n code
-  in if (look_ahead code ifso is_loop_header predicate) then
-    (preferred := false; raise HeuristicSucceeded)
-  else if (look_ahead code ifnot is_loop_header predicate) then
-    (preferred := true; raise HeuristicSucceeded)
-  else ()
-
-let do_loop_heuristic code ifso ifnot is_loop_header preferred =
-  let predicate n = get_some @@ PTree.get n is_loop_header
-  in if (look_ahead code ifso is_loop_header predicate) then
-    (preferred := true; raise HeuristicSucceeded)
-  else if (look_ahead code ifnot is_loop_header predicate) then
-    (preferred := false; raise HeuristicSucceeded)
-  else ()
-
-let get_directions code entrypoint =
-  let bfs_order = bfs code entrypoint
-  and is_loop_header = get_loop_headers code entrypoint
-  and directions = ref (PTree.map (fun n i -> false) code) (* false <=> fallthru *)
+  else match (rtl_successors @@ get_some @@ PTree.get node code) with
+    | [n] -> if (predicate n) then true
+        else (
+          if (get_some @@ PTree.get n is_loop_header) then false
+          else look_ahead code n is_loop_header predicate
+        )
+    | _ -> false
+
+let do_call_heuristic code cond ifso ifnot is_loop_header =
+  begin
+    debug "\tCall heuristic..\n";
+    let predicate n = (function
+    | Icall _ -> true
+    | _ -> false) @@ get_some @@ PTree.get n code
+    in let ifso_call = look_ahead code ifso is_loop_header predicate
+    in let ifnot_call = look_ahead code ifnot is_loop_header predicate
+    in if ifso_call && ifnot_call then None
+    else if ifso_call then Some false
+    else if ifnot_call then Some true
+    else None
+  end
+
+let do_opcode_heuristic code cond ifso ifnot is_loop_header =
+  begin
+    debug "\tOpcode heuristic..\n";
+    DuplicateOpcodeHeuristic.opcode_heuristic code cond ifso ifnot is_loop_header
+  end
+
+let do_return_heuristic code cond ifso ifnot is_loop_header =
+  begin
+    debug "\tReturn heuristic..\n";
+    let predicate n = (function
+    | Ireturn _ -> true
+    | _ -> false) @@ get_some @@ PTree.get n code
+    in let ifso_return = look_ahead code ifso is_loop_header predicate
+    in let ifnot_return = look_ahead code ifnot is_loop_header predicate
+    in if ifso_return && ifnot_return then None
+    else if ifso_return then Some false
+    else if ifnot_return then Some true
+    else None
+  end
+
+let do_store_heuristic code cond ifso ifnot is_loop_header =
+  begin
+    debug "\tStore heuristic..\n";
+    let predicate n = (function
+    | Istore _ -> true
+    | _ -> false) @@ get_some @@ PTree.get n code
+    in let ifso_store = look_ahead code ifso is_loop_header predicate
+    in let ifnot_store = look_ahead code ifnot is_loop_header predicate
+    in if ifso_store && ifnot_store then None
+    else if ifso_store then Some false
+    else if ifnot_store then Some true
+    else None
+  end
+
+let do_loop_heuristic code cond ifso ifnot is_loop_header =
+  begin
+    debug "\tLoop heuristic..\n";
+    let predicate n = get_some @@ PTree.get n is_loop_header in
+    let ifso_loop = look_ahead code ifso is_loop_header predicate in
+    let ifnot_loop = look_ahead code ifnot is_loop_header predicate in
+    if ifso_loop && ifnot_loop then None (* TODO - take the innermost loop ? *)
+    else if ifso_loop then Some true
+    else if ifnot_loop then Some false
+    else None
+  end
+
+let do_loop2_heuristic loop_info n code cond ifso ifnot is_loop_header =
+  begin
+    debug "\tLoop2 heuristic..\n";
+    match get_some @@ PTree.get n loop_info with
+    | None -> None
+    | Some b -> Some b
+  end
+
+(* Returns a PTree of either None or Some b where b determines the node following the loop, for a cb instruction *)
+(* It uses the fact that loops in CompCert are done by a branch (backedge) instruction followed by a cb *)
+let get_loop_info is_loop_header bfs_order code =
+  let loop_info = ref (PTree.map (fun n i -> None) code) in
+  let mark_path s n =
+    let visited = ref (PTree.map (fun n i -> false) code) in
+    let rec explore src dest =
+      if (get_some @@ PTree.get src !visited) then false
+      else if src == dest then true
+      else begin
+        visited := PTree.set src true !visited;
+        match rtl_successors @@ get_some @@ PTree.get src code with
+        | [] -> false
+        | [s] -> explore s dest
+        | [s1; s2] -> (explore s1 dest) || (explore s2 dest)
+        | _ -> false
+      end
+    in let rec advance_to_cb src =
+      if (get_some @@ PTree.get src !visited) then None
+      else begin
+        visited := PTree.set src true !visited;
+        match get_some @@ PTree.get src code with
+        | Inop s | Iop (_, _, _, s) | Iload (_,_,_,_,_,s) | Istore (_,_,_,_,s) | Icall (_,_,_,_,s)
+        | Ibuiltin (_,_,_,s) -> advance_to_cb s
+        | Icond _ -> Some src
+        | Ijumptable _ | Itailcall _ | Ireturn _ -> None
+      end
+    in begin
+      debug "Marking path from %d to %d\n" (P.to_int n) (P.to_int s);
+      match advance_to_cb s with
+      | None -> (debug "Nothing found\n")
+      | Some s -> ( debug "Advancing to %d\n" (P.to_int s);
+          match get_some @@ PTree.get s !loop_info with
+          | None | Some _ -> begin
+              match get_some @@ PTree.get s code with
+              | Icond (_, _, n1, n2, _) ->
+                  let b1 = explore n1 n in
+                  let b2 = explore n2 n in
+                  if (b1 && b2) then (debug "both true\n")
+                  else if b1 then (debug "true privileged\n"; loop_info := PTree.set s (Some true) !loop_info)
+                  else if b2 then (debug "false privileged\n"; loop_info := PTree.set s (Some false) !loop_info)
+                  else (debug "none true\n")
+              | _ -> ( debug "not an icond\n" )
+            end
+          (* | Some _ -> ( debug "already loop info there\n" ) FIXME - we don't know yet whether a branch to a loop head is a backedge or not *)
+        )
+    end
   in begin
-    Printf.printf "Loop headers: ";
-    ptree_printbool is_loop_header;
-    Printf.printf "\n";
+    List.iter (fun n ->
+      match get_some @@ PTree.get n code with
+      | Inop s | Iop (_,_,_,s) | Iload (_,_,_,_,_,s) | Istore (_,_,_,_,s) | Icall (_,_,_,_,s)
+      | Ibuiltin (_, _, _, s) ->
+          if get_some @@ PTree.get s is_loop_header then mark_path s n
+      | Icond _ -> () (* loop backedges are never Icond in CompCert RTL.3 *)
+      | Ijumptable _ -> ()
+      | Itailcall _ | Ireturn _ -> ()
+    ) bfs_order;
+    !loop_info
+  end
+
+(* Remark - compared to the original paper, we don't use the store heuristic *)
+let get_directions code entrypoint = begin
+  debug "get_directions\n";
+  let bfs_order = bfs code entrypoint in
+  let is_loop_header = get_loop_headers code entrypoint in
+  let loop_info = get_loop_info is_loop_header bfs_order code in
+  let directions = ref (PTree.map (fun n i -> None) code) in (* None <=> no predicted direction *)
+  begin
+    (* ptree_printbool is_loop_header; *)
+    (* debug "\n"; *)
     List.iter (fun n ->
       match (get_some @@ PTree.get n code) with
-      | Icond (cond, lr, ifso, ifnot) ->
-          Printf.printf "Analyzing %d.." (P.to_int n);
-          let preferred = ref false
-          in (try
-            Printf.printf " call..";
-            do_call_heuristic code ifso ifnot is_loop_header preferred;
-            Printf.printf " opcode..";
-            do_opcode_heuristic code cond ifso ifnot preferred;
-            Printf.printf " return..";
-            do_return_heuristic code ifso ifnot is_loop_header preferred;
-            Printf.printf " store..";
-            do_store_heuristic code ifso ifnot is_loop_header preferred;
-            Printf.printf " loop..";
-            do_loop_heuristic code ifso ifnot is_loop_header preferred;
-            Printf.printf "Random choice for %d\n" (P.to_int n);
-            preferred := Random.bool ()
-            with HeuristicSucceeded | DuplicateOpcodeHeuristic.HeuristicSucceeded
-              -> Printf.printf " %s\n" (match !preferred with true -> "BRANCH"
-                                        | false -> "FALLTHROUGH")
-          ); directions := PTree.set n !preferred !directions
+      | Icond (cond, lr, ifso, ifnot, _) ->
+          (* debug "Analyzing %d.." (P.to_int n); *)
+          let heuristics = [ do_opcode_heuristic;
+            do_return_heuristic; do_loop2_heuristic loop_info n; do_loop_heuristic; do_call_heuristic;
+             (* do_store_heuristic *) ] in
+          let preferred = ref None in
+          begin
+            debug "Deciding condition for RTL node %d\n" (P.to_int n);
+            List.iter (fun do_heur ->
+              match !preferred with
+              | None -> preferred := do_heur code cond ifso ifnot is_loop_header
+              | Some _ -> ()
+            ) heuristics;
+            directions := PTree.set n !preferred !directions;
+            (match !preferred with | Some false -> debug "\tFALLTHROUGH\n"
+                                   | Some true -> debug "\tBRANCH\n"
+                                   | None -> debug "\tUNSURE\n");
+            debug "---------------------------------------\n"
+          end
       | _ -> ()
     ) bfs_order;
     !directions
   end
+end
+
+let update_direction direction = function
+| Icond (cond, lr, n, n', _) -> Icond (cond, lr, n, n', direction)
+| i -> i
 
-let to_ttl_inst direction = function
-| Ireturn o -> Tleaf (Ireturn o)
-| Inop n -> Tnext (n, Inop n)
-| Iop (op, lr, r, n) -> Tnext (n, Iop(op, lr, r, n))
-| Iload (tm, m, a, lr, r, n) -> Tnext (n, Iload(tm, m, a, lr, r, n))
-| Istore (m, a, lr, r, n) -> Tnext (n, Istore(m, a, lr, r, n))
-| Icall (s, ri, lr, r, n) -> Tleaf (Icall(s, ri, lr, r, n))
-| Itailcall (s, ri, lr) -> Tleaf (Itailcall(s, ri, lr))
-| Ibuiltin (ef, lbr, br, n) -> Tleaf (Ibuiltin(ef, lbr, br, n))
-| Icond (cond, lr, n, n') -> (match direction with
-    | false -> Tnext (n', Icond(cond, lr, n, n'))
-    | true -> Tnext (n, Icond(cond, lr, n, n')))
-| Ijumptable (r, ln) -> Tleaf (Ijumptable(r, ln))
-
-let rec to_ttl_code_rec directions = function
+let rec update_direction_rec directions = function
 | [] -> PTree.empty
 | m::lm -> let (n, i) = m
     in let direction = get_some @@ PTree.get n directions
-    in PTree.set n (to_ttl_inst direction i) (to_ttl_code_rec directions lm)
+    in PTree.set n (update_direction direction i) (update_direction_rec directions lm)
 
-let to_ttl_code code entrypoint =
+(* Uses branch prediction to write prediction annotations in Icond *)
+let update_directions code entrypoint = begin
+  debug "Update_directions\n";
   let directions = get_directions code entrypoint
   in begin
-    Printf.printf "Ifso directions: ";
+    (* debug "Ifso directions: ";
     ptree_printbool directions;
-    Printf.printf "\n";
-    Random.init(0); (* using same seed to make it deterministic *)
-    to_ttl_code_rec directions (PTree.elements code)
+    debug "\n"; *)
+    update_direction_rec directions (PTree.elements code)
   end
+end
 
-(** Trace selection on TTL *)
+(** Trace selection *)
 
 let rec exists_false_rec = function
   | [] -> false
@@ -321,50 +355,29 @@ let rec exists_false_rec = function
 
 let exists_false boolmap = exists_false_rec (PTree.elements boolmap)
 
-(* DFS on TTL to guide the exploration *)
-let dfs code entrypoint =
+(* DFS using prediction info to guide the exploration *)
+let dfs code entrypoint = begin
+  debug "dfs\n";
   let visited = ref (PTree.map (fun n i -> false) code) in
   let rec dfs_list code = function
   | [] -> []
   | node :: ln ->
-      let node_dfs =
-        if not (get_some @@ PTree.get node !visited) then begin
-          visited := PTree.set node true !visited;
-          match PTree.get node code with
-          | None -> failwith "No such node"
-          | Some ti -> [node] @ match ti with
-              | Tleaf i -> (match i with
-                  | Icall(_, _, _, _, n) -> dfs_list code [n]
-                  | Ibuiltin(_, _, _, n) -> dfs_list code [n]
-                  | Ijumptable(_, ln) -> dfs_list code ln
-                  | Itailcall _ | Ireturn _ -> [] 
-                  | _ -> failwith "Tleaf case not handled in dfs" )
-              | Tnext (n,i) -> (dfs_list code [n]) @ match i with
-                  | Icond (_, _, n1, n2) -> dfs_list code [n1; n2]
-                  | Inop _ | Iop _ | Iload _ | Istore _ -> []
-                  | _ -> failwith "Tnext case not handled in dfs"
-          end
-        else []
-      in node_dfs @ (dfs_list code ln)
+      if get_some @@ PTree.get node !visited then dfs_list code ln
+      else begin
+        visited := PTree.set node true !visited;
+        let next_nodes = (match get_some @@ PTree.get node code with
+        | Icall(_, _, _, _, n) | Ibuiltin (_, _, _, n) | Iop (_, _, _, n)
+        | Iload (_, _, _, _, _, n) | Istore (_, _, _, _, n) | Inop n -> [n]
+        | Ijumptable (_, ln) -> ln
+        | Itailcall _ | Ireturn _ -> []
+        | Icond (_, _, n1, n2, info) -> (match info with
+          | Some false -> [n2; n1]
+          | _ -> [n1; n2]
+          )
+        ) in node :: dfs_list code (next_nodes @ ln)
+      end
   in dfs_list code [entrypoint]
-
-let get_predecessors_ttl code =
-  let preds = ref (PTree.map (fun n i -> []) code) in
-  let process_inst (node, ti) = match ti with
-  | Tleaf _ -> ()
-  | Tnext (_, i) -> let succ = match i with
-      | Inop n | Iop (_,_,_,n) | Iload (_, _,_,_,_,n) | Istore (_,_,_,_,n)
-      | Icall (_,_,_,_,n) | Ibuiltin (_, _, _, n) -> [n]
-      | Icond (_,_,n1,n2) -> [n1;n2]
-      | Ijumptable (_,ln) -> ln
-      | _ -> []
-      in List.iter (fun s -> preds := PTree.set s (node::(get_some @@ PTree.get s !preds)) !preds) succ
-  in begin
-    List.iter process_inst (PTree.elements code);
-    !preds
-  end
-
-let rtl_proj code = PTree.map (fun n ti -> match ti with Tleaf i | Tnext(_, i) -> i) code
+end
 
 let rec select_unvisited_node is_visited = function
 | [] -> failwith "Empty list"
@@ -373,24 +386,87 @@ let rec select_unvisited_node is_visited = function
 let best_successor_of node code is_visited =
   match (PTree.get node code) with
   | None -> failwith "No such node in the code"
-  | Some ti -> match ti with
-      | Tleaf _ -> None
-      | Tnext (n,_) -> if not (ptree_get_some n is_visited) then Some n
-                       else None
-
-let best_predecessor_of node predecessors order is_visited =
+  | Some i ->
+      let next_node = match i with
+      | Inop n | Iop (_,_,_,n) | Iload (_,_,_,_,_,n) | Istore(_,_,_,_,n)
+      | Icall (_,_,_,_,n) | Ibuiltin (_,_,_,n) -> Some n
+      | Icond (_, _, n1, n2, ob) -> (match ob with None -> None | Some false -> Some n2 | Some true -> Some n1)
+      | _ -> None
+      in match next_node with
+      | None -> None
+      | Some n -> if not (ptree_get_some n is_visited) then Some n else None
+
+(* FIXME - could be improved by selecting in priority the predicted paths *)
+let best_predecessor_of node predecessors code order is_visited =
   match (PTree.get node predecessors) with
   | None -> failwith "No predecessor list found"
-  | Some lp -> try Some (List.find (fun n -> (List.mem n lp) && (not (ptree_get_some n is_visited))) order)
-               with Not_found -> None
+  | Some lp ->
+      try Some (List.find (fun n ->
+          if (List.mem n lp) && (not (ptree_get_some n is_visited)) then
+            match ptree_get_some n code with
+            | Icond (_, _, n1, n2, ob) -> (match ob with
+              | None -> false
+              | Some false -> n == n2
+              | Some true -> n == n1
+              )
+            | _ -> true
+          else false
+        ) order)
+      with Not_found -> None
+
+let print_trace t = print_intlist t
+
+let print_traces traces =
+  let rec f = function
+  | [] -> ()
+  | t::lt -> Printf.printf "\n\t"; print_trace t; Printf.printf ",\n"; f lt
+  in begin
+    if !debug_flag then begin
+      Printf.printf "Traces: {";
+      f traces;
+      Printf.printf "}\n";
+    end
+  end
+
+(* Dumb (but linear) trace selection *)
+let select_traces_linear code entrypoint =
+  let is_visited = ref (PTree.map (fun n i -> false) code) in
+  let bfs_order = bfs code entrypoint in
+  let rec go_through node = begin
+    is_visited := PTree.set node true !is_visited;
+    let next_node = match (get_some @@ PTree.get node code) with
+      | Icall(_, _, _, _, n) | Ibuiltin (_, _, _, n) | Iop (_, _, _, n)
+      | Iload (_, _, _, _, _, n) | Istore (_, _, _, _, n) | Inop n -> Some n
+      | Ijumptable _ | Itailcall _ | Ireturn _ -> None
+      | Icond (_, _, n1, n2, info) -> (match info with
+        | Some false -> Some n2
+        | Some true -> Some n1
+        | None -> None
+        )
+    in match next_node with
+    | None -> [node]
+    | Some n ->
+        if not (get_some @@ PTree.get n !is_visited) then node :: go_through n
+        else [node]
+    end
+  in let traces = ref [] in begin
+    List.iter (fun n ->
+      if not (get_some @@ PTree.get n !is_visited) then
+        traces := (go_through n) :: !traces
+    ) bfs_order;
+    !traces
+  end
+
 
 (* Algorithm mostly inspired from Chang and Hwu 1988
  * "Trace Selection for Compiling Large C Application Programs to Microcode" *)
-let select_traces code entrypoint =
+let select_traces_chang code entrypoint = begin
+  debug "select_traces\n";
   let order = dfs code entrypoint in
-  let predecessors = get_predecessors_ttl code in
+  let predecessors = get_predecessors_rtl code in
   let traces = ref [] in
   let is_visited = ref (PTree.map (fun n i -> false) code) in begin (* mark all nodes visited *)
+    debug "Length: %d\n" (List.length order);
     while exists_false !is_visited do (* while (there are unvisited nodes) *)
       let seed = select_unvisited_node !is_visited order in
       let trace = ref [seed] in
@@ -410,7 +486,7 @@ let select_traces code entrypoint =
           current := seed;
           quit_loop := false;
           while not !quit_loop do
-            let s = best_predecessor_of !current predecessors order !is_visited in
+            let s = best_predecessor_of !current predecessors code order !is_visited in
             match s with
             | None -> quit_loop := true (* if (s==0) exit loop *)
             | Some pred -> begin
@@ -423,21 +499,16 @@ let select_traces code entrypoint =
         end
       end
     done;
-    Printf.printf "DFS: \t"; print_intlist order; Printf.printf "\n";
+    (* debug "DFS: \t"; print_intlist order; debug "\n"; *)
+    debug "Traces: "; print_traces !traces;
     !traces
   end
+end
 
-let print_trace t = print_intlist t
-
-let print_traces traces =
-  let rec f = function
-  | [] -> ()
-  | t::lt -> Printf.printf "\n\t"; print_trace t; Printf.printf ",\n"; f lt
-  in begin
-    Printf.printf "Traces: {";
-    f traces;
-    Printf.printf "}\n";
-  end
+let select_traces code entrypoint =
+  let length = List.length @@ PTree.elements code in
+  if (length < 5000) then select_traces_chang code entrypoint
+  else select_traces_linear code entrypoint
 
 let rec make_identity_ptree_rec = function
 | [] -> PTree.empty
@@ -454,10 +525,10 @@ let rec change_pointers code n n' = function
         | Ibuiltin(a, b, c, n0) -> assert (n0 == n); Ibuiltin(a, b, c, n')
         | Ijumptable(a, ln) -> assert (optbool @@ List.find_opt (fun e -> e == n) ln);
                                Ijumptable(a, List.map (fun e -> if (e == n) then n' else e) ln)
-        | Icond(a, b, n1, n2) -> assert (n1 == n || n2 == n);
+        | Icond(a, b, n1, n2, i) -> assert (n1 == n || n2 == n);
                                  let n1' = if (n1 == n) then n' else n1
                                  in let n2' = if (n2 == n) then n' else n2
-                                 in Icond(a, b, n1', n2')
+                                 in Icond(a, b, n1', n2', i)
         | Inop n0 -> assert (n0 == n); Inop n'
         | Iop (a, b, c, n0) -> assert (n0 == n); Iop (a, b, c, n')
         | Iload (a, b, c, d, e, n0) -> assert (n0 == n); Iload (a, b, c, d, e, n')
@@ -471,7 +542,7 @@ let rec change_pointers code n n' = function
  * n': the integer which should contain the duplicate of n
  * returns: new code, new ptree *)
 let duplicate code ptree parent n preds n' =
-  Printf.printf "Duplicating node %d into %d..\n" (P.to_int n) (P.to_int n');
+  debug "Duplicating node %d into %d..\n" (P.to_int n) (P.to_int n');
   match PTree.get n' code with
   | Some _ -> failwith "The PTree already has a node n'"
   | None ->
@@ -529,30 +600,30 @@ let tail_duplicate code preds ptree trace =
   in (new_code, new_ptree, !nb_duplicated)
 
 let superblockify_traces code preds traces =
-  let max_nb_duplicated = 1 (* FIXME - should be architecture dependent *)
+  let max_nb_duplicated = !Clflags.option_fduplicate (* FIXME - should be architecture dependent *)
   in let ptree = make_identity_ptree code
   in let rec f code ptree = function
     | [] -> (code, ptree, 0)
     | trace :: traces ->
         let new_code, new_ptree, nb_duplicated = tail_duplicate code preds ptree trace
-        in if (nb_duplicated < max_nb_duplicated) then f new_code new_ptree traces
-           else (Printf.printf "Too many duplicated nodes, aborting tail duplication\n"; (code, ptree, 0))
+        in if (nb_duplicated < max_nb_duplicated)
+          then (debug "End duplication\n"; f new_code new_ptree traces)
+          else (debug "Too many duplicated nodes, aborting tail duplication\n"; (code, ptree, 0))
   in let new_code, new_ptree, _ = f code ptree traces
   in (new_code, new_ptree)
 
 let rec invert_iconds_trace code = function
   | [] -> code
-  | n::[] -> code
-  | n :: n' :: t ->
+  | n :: ln ->
       let code' = match ptree_get_some n code with
-        | Icond (c, lr, ifso, ifnot) ->
-            assert (n' == ifso || n' == ifnot);
-            if (n' == ifso) then (
-              Printf.printf "Reversing ifso/ifnot for node %d\n" (P.to_int n);
-              PTree.set n (Icond (Op.negate_condition c, lr, ifnot, ifso)) code )
-            else code
+        | Icond (c, lr, ifso, ifnot, info) -> (match info with
+            | Some true -> begin
+                (* debug "Reversing ifso/ifnot for node %d\n" (P.to_int n); *)
+                PTree.set n (Icond (Op.negate_condition c, lr, ifnot, ifso, Some false)) code
+              end
+            | _ -> code)
         | _ -> code
-      in invert_iconds_trace code' (n'::t)
+      in invert_iconds_trace code' ln
 
 let rec invert_iconds code = function
   | [] -> code
@@ -561,12 +632,17 @@ let rec invert_iconds code = function
                   else code
       in invert_iconds code' ts
 
-(* For now, identity function *)
 let duplicate_aux f =
   let entrypoint = f.fn_entrypoint in
-  let code = f.fn_code in
-  let traces = select_traces (to_ttl_code code entrypoint) entrypoint in
-  let icond_code = invert_iconds code traces in
-  let preds = get_predecessors_rtl icond_code in
-  let (new_code, pTreeId) = (print_traces traces; superblockify_traces icond_code preds traces) in
-  ((new_code, f.fn_entrypoint), pTreeId)
+  if !Clflags.option_fduplicate < 0 then
+    ((f.fn_code, entrypoint), make_identity_ptree f.fn_code)
+  else
+    let code = update_directions (f.fn_code) entrypoint in
+    let traces = select_traces code entrypoint in
+    let icond_code = invert_iconds code traces in
+    let preds = get_predecessors_rtl icond_code in
+    if !Clflags.option_fduplicate >= 1 then
+      let (new_code, pTreeId) = ((* print_traces traces; *) superblockify_traces icond_code preds traces) in
+      ((new_code, f.fn_entrypoint), pTreeId)
+    else
+      ((icond_code, entrypoint), make_identity_ptree code)
diff --git a/backend/Duplicateproof.v b/backend/Duplicateproof.v
index a8e9b16b..6b598dc7 100644
--- a/backend/Duplicateproof.v
+++ b/backend/Duplicateproof.v
@@ -23,12 +23,12 @@ Inductive match_inst (dupmap: PTree.t node): instruction -> instruction -> Prop
       match_inst dupmap (Itailcall s ri lr) (Itailcall s ri lr)
   | match_inst_builtin: forall n n' ef la br,
       dupmap!n' = (Some n) -> match_inst dupmap (Ibuiltin ef la br n) (Ibuiltin ef la br n')
-  | match_inst_cond: forall ifso ifso' ifnot ifnot' c lr,
+  | match_inst_cond: forall ifso ifso' ifnot ifnot' c lr i i',
       dupmap!ifso' = (Some ifso) -> dupmap!ifnot' = (Some ifnot) ->
-      match_inst dupmap (Icond c lr ifso ifnot) (Icond c lr ifso' ifnot')
-  | match_inst_revcond: forall ifso ifso' ifnot ifnot' c lr,
+      match_inst dupmap (Icond c lr ifso ifnot i) (Icond c lr ifso' ifnot' i')
+  | match_inst_revcond: forall ifso ifso' ifnot ifnot' c lr i i',
       dupmap!ifso' = (Some ifso) -> dupmap!ifnot' = (Some ifnot) ->
-      match_inst dupmap (Icond c lr ifso ifnot) (Icond (negate_condition c) lr ifnot' ifso')
+      match_inst dupmap (Icond c lr ifso ifnot i) (Icond (negate_condition c) lr ifnot' ifso' i')
   | match_inst_jumptable: forall ln ln' r,
       list_forall2 (fun n n' => (dupmap!n' = (Some n))) ln ln' ->
       match_inst dupmap (Ijumptable r ln) (Ijumptable r ln')
@@ -378,7 +378,7 @@ Theorem step_simulation:
      step tge s2 t s2'
   /\ match_states s1' s2'.
 Proof.
-  Local Hint Resolve transf_fundef_correct.
+  Local Hint Resolve transf_fundef_correct: core.
   induction 1; intros; inv MS.
 (* Inop *)
   - eapply dupmap_correct in DUPLIC; eauto.
diff --git a/backend/ForwardMoves.v b/backend/ForwardMoves.v
index c73b0213..7cfd411f 100644
--- a/backend/ForwardMoves.v
+++ b/backend/ForwardMoves.v
@@ -250,7 +250,7 @@ Fixpoint kill_builtin_res (res : builtin_res reg) (rel : RELATION.t) :=
 Definition apply_instr instr x :=
   match instr with
   | Inop _
-  | Icond _ _ _ _
+  | Icond _ _ _ _ _
   | Ijumptable _ _
   | Istore _ _ _ _ _ => Some x
   | Iop Omove (src :: nil) dst _ => Some (move src dst x)
@@ -309,8 +309,8 @@ Definition transf_instr (fmap : option (PMap.t RB.t))
     Icall sig ros (subst_args fmap pc args) dst s
   | Itailcall sig ros args =>
     Itailcall sig ros (subst_args fmap pc args)
-  | Icond cond args s1 s2 =>
-    Icond cond (subst_args fmap pc args) s1 s2
+  | Icond cond args s1 s2 i =>
+    Icond cond (subst_args fmap pc args) s1 s2 i
   | Ijumptable arg tbl =>
     Ijumptable (subst_arg fmap pc arg) tbl
   | Ireturn (Some arg) =>
diff --git a/backend/IRC.ml b/backend/IRC.ml
index 67da47da..785b0a2d 100644
--- a/backend/IRC.ml
+++ b/backend/IRC.ml
@@ -238,7 +238,6 @@ type graph = {
    according to their types.  A variable can be forced into class 2
    by giving it a negative spill cost. *)
 
-
 let class_of_reg r =
   if Conventions1.is_float_reg r then 1 else 0
 
diff --git a/backend/Inlining.v b/backend/Inlining.v
index 9cf535b9..8c7e1898 100644
--- a/backend/Inlining.v
+++ b/backend/Inlining.v
@@ -397,9 +397,9 @@ Definition expand_instr (ctx: context) (pc: node) (i: instruction): mon unit :=
   | Ibuiltin ef args res s =>
       set_instr (spc ctx pc)
                 (Ibuiltin ef (map (sbuiltinarg ctx) args) (sbuiltinres ctx res) (spc ctx s))
-  | Icond cond args s1 s2 =>
+  | Icond cond args s1 s2 info =>
       set_instr (spc ctx pc)
-                (Icond cond (sregs ctx args) (spc ctx s1) (spc ctx s2))
+                (Icond cond (sregs ctx args) (spc ctx s1) (spc ctx s2) info)
   | Ijumptable r tbl =>
       set_instr (spc ctx pc)
                 (Ijumptable (sreg ctx r) (List.map (spc ctx) tbl))
diff --git a/backend/Inliningaux.ml b/backend/Inliningaux.ml
index d58704ca..cf308962 100644
--- a/backend/Inliningaux.ml
+++ b/backend/Inliningaux.ml
@@ -17,7 +17,8 @@ open Maps
 open Op
 open Ordered
 open! RTL
-
+open Camlcoq
+   
 module PSet = Make(OrderedPositive)
 
 type inlining_info = {
@@ -83,13 +84,15 @@ let static_called_once id io =
   else
     false
 
-(* To be considered: heuristics based on size of function? *)
+(* D. Monniaux: attempt at heuristic based on size *)
+let small_enough (f : coq_function) =
+  P.to_int (RTL.max_pc_function f) <= !Clflags.option_inline_auto_threshold
 
 let should_inline (io: inlining_info) (id: ident) (f: coq_function) =
   if !Clflags.option_finline then begin
     match C2C.atom_inline id with
     | C2C.Inline -> true
     | C2C.Noinline -> false
-    | C2C.No_specifier -> static_called_once id io
+    | C2C.No_specifier -> static_called_once id io || small_enough f
   end else
     false
diff --git a/backend/Inliningspec.v b/backend/Inliningspec.v
index e20fb373..eba026ec 100644
--- a/backend/Inliningspec.v
+++ b/backend/Inliningspec.v
@@ -312,9 +312,9 @@ Inductive tr_instr: context -> node -> instruction -> code -> Prop :=
       match res with BR r => Ple r ctx.(mreg) | _ => True end ->
       c!(spc ctx pc) = Some (Ibuiltin ef (map (sbuiltinarg ctx) args) (sbuiltinres ctx res) (spc ctx s)) ->
       tr_instr ctx pc (Ibuiltin ef args res s) c
-  | tr_cond: forall ctx pc cond args s1 s2 c,
-      c!(spc ctx pc) = Some (Icond cond (sregs ctx args) (spc ctx s1) (spc ctx s2)) ->
-      tr_instr ctx pc (Icond cond args s1 s2) c
+  | tr_cond: forall ctx pc cond args s1 s2 c i,
+      c!(spc ctx pc) = Some (Icond cond (sregs ctx args) (spc ctx s1) (spc ctx s2) i) ->
+      tr_instr ctx pc (Icond cond args s1 s2 i) c
   | tr_jumptable: forall ctx pc r tbl c,
       c!(spc ctx pc) = Some (Ijumptable (sreg ctx r) (List.map (spc ctx) tbl)) ->
       tr_instr ctx pc (Ijumptable r tbl) c
diff --git a/backend/LTL.v b/backend/LTL.v
index ee8b4826..3edd60a2 100644
--- a/backend/LTL.v
+++ b/backend/LTL.v
@@ -37,7 +37,7 @@ Inductive instruction: Type :=
   | Ltailcall (sg: signature) (ros: mreg + ident)
   | Lbuiltin (ef: external_function) (args: list (builtin_arg loc)) (res: builtin_res mreg)
   | Lbranch (s: node)
-  | Lcond (cond: condition) (args: list mreg) (s1 s2: node)
+  | Lcond (cond: condition) (args: list mreg) (s1 s2: node) (info: option bool)
   | Ljumptable (arg: mreg) (tbl: list node)
   | Lreturn.
 
@@ -263,11 +263,11 @@ Inductive step: state -> trace -> state -> Prop :=
   | exec_Lbranch: forall s f sp pc bb rs m,
       step (Block s f sp (Lbranch pc :: bb) rs m)
         E0 (State s f sp pc rs m)
-  | exec_Lcond: forall s f sp cond args pc1 pc2 bb rs b pc rs' m,
+  | exec_Lcond: forall s f sp cond args pc1 pc2 bb rs b pc rs' m i,
       eval_condition cond (reglist rs args) m = Some b ->
       pc = (if b then pc1 else pc2) ->
       rs' = undef_regs (destroyed_by_cond cond) rs ->
-      step (Block s f sp (Lcond cond args pc1 pc2 :: bb) rs m)
+      step (Block s f sp (Lcond cond args pc1 pc2 i :: bb) rs m)
         E0 (State s f sp pc rs' m)
   | exec_Ljumptable: forall s f sp arg tbl bb rs m n pc rs',
       rs (R arg) = Vint n ->
@@ -328,7 +328,7 @@ Fixpoint successors_block (b: bblock) : list node :=
   | nil => nil                          (**r should never happen *)
   | Ltailcall _ _ :: _ => nil
   | Lbranch s :: _ => s :: nil
-  | Lcond _ _ s1 s2 :: _ => s1 :: s2 :: nil
+  | Lcond _ _ s1 s2 _ :: _ => s1 :: s2 :: nil
   | Ljumptable _ tbl :: _ => tbl
   | Lreturn :: _ => nil
   | instr :: b' => successors_block b'
diff --git a/backend/Linearize.v b/backend/Linearize.v
index 4216958c..66b36428 100644
--- a/backend/Linearize.v
+++ b/backend/Linearize.v
@@ -179,7 +179,7 @@ Fixpoint linearize_block (b: LTL.bblock) (k: code) : code :=
       Lbuiltin ef args res :: linearize_block b' k
   | LTL.Lbranch s :: b' =>
       add_branch s k
-  | LTL.Lcond cond args s1 s2 :: b' =>
+  | LTL.Lcond cond args s1 s2 _ :: b' =>
       if starts_with s1 k then
         Lcond (negate_condition cond) args s2 :: add_branch s1 k
       else
diff --git a/backend/Linearizeaux.ml b/backend/Linearizeaux.ml
index a6964233..1381877b 100644
--- a/backend/Linearizeaux.ml
+++ b/backend/Linearizeaux.ml
@@ -13,6 +13,12 @@
 open LTL
 open Maps
 
+let debug_flag = ref false
+
+let debug fmt =
+  if !debug_flag then Printf.eprintf fmt
+  else Printf.ifprintf stderr fmt
+
 (* Trivial enumeration, in decreasing order of PC *)
 
 (***
@@ -81,7 +87,7 @@ let basic_blocks f joins =
        | [] -> assert false
        | Lbranch s :: _ -> next_in_block blk minpc s
        | Ltailcall (sig0, ros) :: _ -> end_block blk minpc
-       | Lcond (cond, args, ifso, ifnot) :: _ ->
+       | Lcond (cond, args, ifso, ifnot, _) :: _ ->
              end_block blk minpc; start_block ifso; start_block ifnot
        | Ljumptable(arg, tbl) :: _ ->
              end_block blk minpc; List.iter start_block tbl
@@ -115,14 +121,11 @@ let enumerate_aux_flat f reach =
   flatten_blocks (basic_blocks f (join_points f))
 
 (**
- * Enumeration based on traces as identified by Duplicate.v 
- *
- * The Duplicate phase heuristically identifies the most frequented paths. Each
- * Icond is modified so that the preferred condition is a fallthrough (ifnot)
- * rather than a branch (ifso).
+ * Alternate enumeration based on traces as identified by Duplicate.v 
  *
- * The enumeration below takes advantage of this - preferring to layout nodes
- * following the fallthroughs of the Lcond branches
+ * This is a slight alteration to the above heuristic, ensuring that any
+ * superblock will be contiguous in memory, while still following the original
+ * heuristic
  *)
 
 let get_some = function
@@ -136,29 +139,385 @@ let rec last_element = function
   | e :: [] -> e
   | e' :: e :: l -> last_element (e::l)
 
-let dfs code entrypoint =
+let print_plist l =
+  let rec f = function
+  | [] -> ()
+  | n :: l -> Printf.printf "%d, " (P.to_int n); f l
+  in begin
+    if !debug_flag then begin
+      Printf.printf "[";
+      f l;
+      Printf.printf "]"
+    end
+  end
+
+(* adapted from the above join_points function, but with PTree *)
+let get_join_points code entry =
+  let reached = ref (PTree.map (fun n i -> false) code) in
+  let reached_twice = ref (PTree.map (fun n i -> false) code) in
+  let rec traverse pc =
+    if get_some @@ PTree.get pc !reached then begin
+      if not (get_some @@ PTree.get pc !reached_twice) then
+        reached_twice := PTree.set pc true !reached_twice
+    end else begin
+      reached := PTree.set pc true !reached;
+      traverse_succs (successors_block @@ get_some @@ PTree.get pc code)
+    end
+  and traverse_succs = function
+    | [] -> ()
+    | [pc] -> traverse pc
+    | pc :: l -> traverse pc; traverse_succs l
+  in traverse entry; !reached_twice
+
+let forward_sequences code entry =
   let visited = ref (PTree.map (fun n i -> false) code) in
-  let rec dfs_list code = function
+  let join_points = get_join_points code entry in
+  (* returns the list of traversed nodes, and a list of nodes to start traversing next *)
+  let rec traverse_fallthrough code node =
+    (* debug "Traversing %d..\n" (P.to_int node); *)
+    if not (get_some @@ PTree.get node !visited) then begin
+      visited := PTree.set node true !visited;
+      match PTree.get node code with
+      | None -> failwith "No such node"
+      | Some bb ->
+          let ln, rem = match (last_element bb) with
+          | Lop _ | Lload _ | Lgetstack _ | Lsetstack _ | Lstore _ | Lcall _
+          | Lbuiltin _ -> assert false
+          | Ltailcall _ | Lreturn -> begin (* debug "STOP tailcall/return\n"; *) ([], []) end
+          | Lbranch n ->
+              if get_some @@ PTree.get n join_points then ([], [n])
+              else let ln, rem = traverse_fallthrough code n in (ln, rem)
+          | Lcond (_, _, ifso, ifnot, info) -> (match info with
+            | None -> begin (* debug "STOP Lcond None\n"; *) ([], [ifso; ifnot]) end
+            | Some false ->
+                if get_some @@ PTree.get ifnot join_points then ([], [ifso; ifnot])
+                else let ln, rem = traverse_fallthrough code ifnot in (ln, [ifso] @ rem)
+            | Some true ->
+                let errstr = Printf.sprintf ("Inconsistency detected in node %d: ifnot is not the preferred branch") (P.to_int node) in
+                  failwith errstr)
+          | Ljumptable(_, ln) -> begin (* debug "STOP Ljumptable\n"; *) ([], ln) end
+          in ([node] @ ln, rem)
+      end
+    else ([], [])
+  in let rec f code = function
   | [] -> []
   | node :: ln ->
-      let node_dfs =
-        if not (get_some @@ PTree.get node !visited) then begin
-          visited := PTree.set node true !visited;
-          match PTree.get node code with
-          | None -> failwith "No such node"
-          | Some bb -> [node] @ match (last_element bb) with
-            | Lop _ | Lload _ | Lgetstack _ | Lsetstack _ | Lstore _ | Lcall _
-            | Lbuiltin _ -> assert false
-            | Ltailcall _ | Lreturn -> []
-            | Lbranch n -> dfs_list code [n]
-            | Lcond (_, _, ifso, ifnot) -> dfs_list code [ifnot; ifso]
-            | Ljumptable(_, ln) -> dfs_list code ln
+      let fs, rem_from_node = traverse_fallthrough code node
+      in [fs] @ ((f code rem_from_node) @ (f code ln))
+  in (f code [entry])
+
+(** Unused code
+module PInt = struct
+  type t = P.t
+  let compare x y = compare (P.to_int x) (P.to_int y)
+end
+
+module PSet = Set.Make(PInt)
+
+module LPInt = struct
+  type t = P.t list
+  let rec compare x y =
+    match x with
+    | [] -> ( match y with
+      | [] -> 0
+      | _ -> 1 )
+    | e :: l -> match y with
+      | [] -> -1
+      | e' :: l' ->
+          let e_cmp = PInt.compare e e' in
+          if e_cmp == 0 then compare l l' else e_cmp
+end
+
+module LPSet = Set.Make(LPInt)
+
+let iter_lpset f s = Seq.iter f (LPSet.to_seq s)
+
+let first_of = function
+  | [] -> None
+  | e :: l -> Some e
+
+let rec last_of = function
+  | [] -> None
+  | e :: l -> (match l with [] -> Some e | e :: l -> last_of l)
+
+let can_be_merged code s s' =
+  let last_s = get_some @@ last_of s in
+  let first_s' = get_some @@ first_of s' in
+  match get_some @@ PTree.get last_s code with
+  | Lop _ | Lload _ | Lgetstack _ | Lsetstack _ | Lstore _ | Lcall _
+  | Lbuiltin _ | Ltailcall _ | Lreturn -> false
+  | Lbranch n -> n == first_s'
+  | Lcond (_, _, ifso, ifnot, info) -> (match info with
+    | None -> false
+    | Some false -> ifnot == first_s'
+    | Some true -> failwith "Inconsistency detected - ifnot is not the preferred branch")
+  | Ljumptable (_, ln) ->
+      match ln with
+      | [] -> false
+      | n :: ln -> n == first_s'
+
+let merge s s' = Some s
+
+let try_merge code (fs: (BinNums.positive list) list) =
+  let seqs = ref (LPSet.of_list fs) in
+  let oldLength = ref (LPSet.cardinal !seqs) in
+  let continue = ref true in
+  let found = ref false in
+  while !continue do
+    begin
+      found := false;
+      iter_lpset (fun s ->
+        if !found then ()
+        else iter_lpset (fun s' ->
+          if (!found || s == s') then ()
+          else if (can_be_merged code s s') then
+            begin
+              seqs := LPSet.remove s !seqs;
+              seqs := LPSet.remove s' !seqs;
+              seqs := LPSet.add (get_some (merge s s')) !seqs;
+              found := true;
+            end
+          else ()
+        ) !seqs
+      ) !seqs;
+      if !oldLength == LPSet.cardinal !seqs then
+        continue := false
+      else
+        oldLength := LPSet.cardinal !seqs
+    end
+  done;
+  !seqs
+*)
+
+(** Code adapted from Duplicateaux.get_loop_headers
+  *
+  * Getting loop branches with a DFS visit :
+  * Each node is either Unvisited, Visited, or Processed
+  * pre-order: node becomes Processed
+  * post-order: node becomes Visited
+  *
+  * If we come accross an edge to a Processed node, it's a loop!
+  *)
+type pos = BinNums.positive
+
+module PP = struct
+  type t = pos * pos
+  let compare a b =
+    let ax, ay = a in
+    let bx, by = b in
+    let dx = compare ax bx in
+    if (dx == 0) then compare ay by
+    else dx
+end
+
+module PPMap = Map.Make(PP)
+
+type vstate = Unvisited | Processed | Visited
+
+let get_loop_edges code entry =
+  let visited = ref (PTree.map (fun n i -> Unvisited) code) in
+  let is_loop_edge = ref PPMap.empty
+  in let rec dfs_visit code from = function
+  | [] -> ()
+  | node :: ln ->
+      match (get_some @@ PTree.get node !visited) with
+      | Visited -> ()
+      | Processed -> begin
+          let from_node = get_some from in
+          is_loop_edge := PPMap.add (from_node, node) true !is_loop_edge;
+          visited := PTree.set node Visited !visited
+        end
+      | Unvisited -> begin
+          visited := PTree.set node Processed !visited;
+          let bb = get_some @@ PTree.get node code in
+          let next_visits = (match (last_element bb) with
+          | Lop _ | Lload _ | Lgetstack _ | Lsetstack _ | Lstore _ | Lcall _
+          | Lbuiltin _ -> assert false
+          | Ltailcall _ | Lreturn -> []
+          | Lbranch n -> [n]
+          | Lcond (_, _, ifso, ifnot, _) -> [ifso; ifnot]
+          | Ljumptable(_, ln) -> ln
+          ) in dfs_visit code (Some node) next_visits;
+          visited := PTree.set node Visited !visited;
+          dfs_visit code from ln
+        end
+  in begin
+    dfs_visit code None [entry];
+    !is_loop_edge
+  end
+
+let ppmap_is_true pp ppmap = PPMap.mem pp ppmap && PPMap.find pp ppmap
+
+module Int = struct
+  type t = int
+  let compare x y = compare x y
+end
+
+module ISet = Set.Make(Int)
+
+let print_iset s = begin
+  if !debug_flag then begin
+    Printf.printf "{";
+    ISet.iter (fun e -> Printf.printf "%d, " e) s;
+    Printf.printf "}"
+  end
+end
+
+let print_depmap dm = begin
+  if !debug_flag then begin
+    Printf.printf "[|";
+    Array.iter (fun s -> print_iset s; Printf.printf ", ") dm;
+    Printf.printf "|]\n"
+  end
+end
+
+let construct_depmap code entry fs =
+  let is_loop_edge = get_loop_edges code entry in
+  let visited = ref (PTree.map (fun n i -> false) code) in
+  let depmap = Array.map (fun e -> ISet.empty) fs in
+  let find_index_of_node n =
+    let index = ref 0 in
+    begin
+      Array.iteri (fun i s ->
+        match List.find_opt (fun e -> e == n) s with
+        | Some _ -> index := i
+        | None -> ()
+      ) fs;
+      !index
+    end
+  in let check_and_update_depmap from target =
+    (* debug "From %d to %d\n" (P.to_int from) (P.to_int target); *)
+    if not (ppmap_is_true (from, target) is_loop_edge) then
+      let in_index_fs = find_index_of_node from in
+      let out_index_fs = find_index_of_node target in
+      if out_index_fs != in_index_fs then
+        depmap.(out_index_fs) <- ISet.add in_index_fs depmap.(out_index_fs)
+      else ()
+    else ()
+  in let rec dfs_visit code = function
+  | [] -> ()
+  | node :: ln ->
+      begin
+        match (get_some @@ PTree.get node !visited) with
+        | true -> ()
+        | false -> begin
+            visited := PTree.set node true !visited;
+            let bb = get_some @@ PTree.get node code in
+            let next_visits =
+              match (last_element bb) with
+              | Ltailcall _ | Lreturn -> []
+              | Lbranch n -> (check_and_update_depmap node n; [n])
+              | Lcond (_, _, ifso, ifnot, _) -> begin
+                  check_and_update_depmap node ifso;
+                  check_and_update_depmap node ifnot;
+                  [ifso; ifnot]
+                end
+              | Ljumptable(_, ln) -> begin
+                  List.iter (fun n -> check_and_update_depmap node n) ln;
+                  ln
+                end
+              (* end of bblocks should not be another value than one of the above *)
+              | _ -> failwith "last_element gave an invalid output"
+            in dfs_visit code next_visits
+          end;
+        dfs_visit code ln
+      end
+  in begin
+    dfs_visit code [entry];
+    depmap
+  end
+
+let print_sequence s =
+  if !debug_flag then begin
+    Printf.printf "[";
+    List.iter (fun n -> Printf.printf "%d, " (P.to_int n)) s;
+    Printf.printf "]\n"
+  end
+
+let print_ssequence ofs =
+  if !debug_flag then begin
+    Printf.printf "[";
+    List.iter (fun s -> print_sequence s) ofs;
+    Printf.printf "]\n"
+  end
+
+let order_sequences code entry fs =
+  let fs_a = Array.of_list fs in
+  let depmap = construct_depmap code entry fs_a in
+  let fs_evaluated = Array.map (fun e -> false) fs_a in
+  let ordered_fs = ref [] in
+  let evaluate s_id =
+    begin
+      assert (not fs_evaluated.(s_id));
+      ordered_fs := fs_a.(s_id) :: !ordered_fs;
+      fs_evaluated.(s_id) <- true;
+      (* debug "++++++\n";
+      debug "Scheduling %d\n" s_id;
+      debug "Initial depmap: "; print_depmap depmap; *)
+      Array.iteri (fun i deps ->
+        depmap.(i) <- ISet.remove s_id deps
+      ) depmap;
+      (* debug "Final depmap: "; print_depmap depmap; *)
+    end
+  in let choose_best_of candidates =
+    let current_best_id = ref None in
+    let current_best_score = ref None in
+    begin
+      List.iter (fun id ->
+        match !current_best_id with
+        | None -> begin
+            current_best_id := Some id;
+            match fs_a.(id) with
+            | [] -> current_best_score := None
+            | n::l -> current_best_score := Some (P.to_int n)
+          end
+        | Some b -> begin
+            match fs_a.(id) with
+            | [] -> ()
+            | n::l -> let nscore = P.to_int n in
+              match !current_best_score with
+              | None -> (current_best_id := Some id; current_best_score := Some nscore)
+              | Some bs -> if nscore > bs then (current_best_id := Some id; current_best_score := Some nscore)
           end
-        else []
-      in node_dfs @ (dfs_list code ln)
-  in dfs_list code [entrypoint]
+      ) candidates;
+      !current_best_id
+    end
+  in let select_next () =
+    let candidates = ref [] in
+    begin
+      Array.iteri (fun i deps ->
+        begin
+          (* debug "Deps of %d: " i; print_iset deps; debug "\n"; *)
+          (* FIXME - if we keep it that way (no dependency check), remove all the unneeded stuff *)
+          if ((* deps == ISet.empty && *) not fs_evaluated.(i)) then
+            candidates := i :: !candidates
+        end
+      ) depmap;
+      if not (List.length !candidates > 0) then begin
+        Array.iteri (fun i deps ->
+          if (not fs_evaluated.(i)) then candidates := i :: !candidates
+        ) depmap;
+      end;
+      get_some (choose_best_of !candidates)
+    end
+  in begin
+    debug "-------------------------------\n";
+    debug "depmap: "; print_depmap depmap;
+    debug "forward sequences identified: "; print_ssequence fs;
+    while List.length !ordered_fs != List.length fs do
+      let next_id = select_next () in
+      evaluate next_id
+    done;
+    debug "forward sequences ordered: "; print_ssequence (List.rev (!ordered_fs));
+    List.rev (!ordered_fs)
+  end
 
-let enumerate_aux_trace f reach = dfs f.fn_code f.fn_entrypoint
+let enumerate_aux_trace f reach =
+  let code = f.fn_code in
+  let entry = f.fn_entrypoint in
+  let fs = forward_sequences code entry in
+  let ofs = order_sequences code entry fs in
+  List.flatten ofs
 
 let enumerate_aux f reach =
   if !Clflags.option_ftracelinearize then enumerate_aux_trace f reach
diff --git a/backend/Liveness.v b/backend/Liveness.v
index afe11ae6..9652b363 100644
--- a/backend/Liveness.v
+++ b/backend/Liveness.v
@@ -94,7 +94,7 @@ Definition transfer
       | Ibuiltin ef args res s =>
           reg_list_live (params_of_builtin_args args)
             (reg_list_dead (params_of_builtin_res res) after)
-      | Icond cond args ifso ifnot =>
+      | Icond cond args ifso ifnot _ =>
           reg_list_live args after
       | Ijumptable arg tbl =>
           reg_live arg after
diff --git a/backend/PrintLTL.ml b/backend/PrintLTL.ml
index b309a9f2..d8f2ac12 100644
--- a/backend/PrintLTL.ml
+++ b/backend/PrintLTL.ml
@@ -83,10 +83,11 @@ let print_instruction pp succ = function
         (print_builtin_args loc) args
   | Lbranch s ->
       print_succ pp s succ
-  | Lcond(cond, args, s1, s2) ->
-      fprintf pp "if (%a) goto %d else goto %d"
+  | Lcond(cond, args, s1, s2, info) ->
+      fprintf pp "if (%a) goto %d else goto %d (prediction: %s)"
         (print_condition mreg) (cond, args)
         (P.to_int s1) (P.to_int s2)
+        (match info with None -> "none" | Some true -> "branch" | Some false -> "fallthrough")
   | Ljumptable(arg, tbl) ->
       let tbl = Array.of_list tbl in
       fprintf pp "jumptable (%a)" mreg arg;
diff --git a/backend/PrintRTL.ml b/backend/PrintRTL.ml
index c25773e5..b2ef05ca 100644
--- a/backend/PrintRTL.ml
+++ b/backend/PrintRTL.ml
@@ -75,10 +75,11 @@ let print_instruction pp (pc, i) =
         (name_of_external ef)
         (print_builtin_args reg) args;
       print_succ pp s (pc - 1)
-  | Icond(cond, args, s1, s2) ->
-      fprintf pp "if (%a) goto %d else goto %d\n"
+  | Icond(cond, args, s1, s2, info) ->
+      fprintf pp "if (%a) goto %d else goto %d (prediction: %s)\n"
         (PrintOp.print_condition reg) (cond, args)
         (P.to_int s1) (P.to_int s2)
+        (match info with None -> "none" | Some true -> "branch" | Some false -> "fallthrough")
   | Ijumptable(arg, tbl) ->
       let tbl = Array.of_list tbl in
       fprintf pp "jumptable (%a)\n" reg arg;
diff --git a/backend/PrintXTL.ml b/backend/PrintXTL.ml
index 1c7655fb..d1b79623 100644
--- a/backend/PrintXTL.ml
+++ b/backend/PrintXTL.ml
@@ -104,7 +104,7 @@ let print_instruction pp succ = function
         (print_builtin_args var) args
   | Xbranch s ->
       print_succ pp s succ
-  | Xcond(cond, args, s1, s2) ->
+  | Xcond(cond, args, s1, s2, _) ->
       fprintf pp "if (%a) goto %d else goto %d"
         (print_condition var) (cond, args)
         (P.to_int s1) (P.to_int s2)
diff --git a/backend/RTL.v b/backend/RTL.v
index 29a49311..dec59ca2 100644
--- a/backend/RTL.v
+++ b/backend/RTL.v
@@ -67,11 +67,12 @@ Inductive instruction: Type :=
       (** [Ibuiltin ef args dest succ] calls the built-in function
           identified by [ef], giving it the values of [args] as arguments.
           It stores the return value in [dest] and branches to [succ]. *)
-  | Icond: condition -> list reg -> node -> node -> instruction
-      (** [Icond cond args ifso ifnot] evaluates the boolean condition
+  | Icond: condition -> list reg -> node -> node -> option bool -> instruction
+      (** [Icond cond args ifso ifnot info] evaluates the boolean condition
           [cond] over the values of registers [args].  If the condition
           is true, it transitions to [ifso].  If the condition is false,
-          it transitions to [ifnot]. *)
+          it transitions to [ifnot]. [info] is a ghost field there to provide
+          information relative to branch prediction. *)
   | Ijumptable: reg -> list node -> instruction
       (** [Ijumptable arg tbl] transitions to the node that is the [n]-th
           element of the list [tbl], where [n] is the unsigned integer
@@ -262,8 +263,8 @@ Inductive step: state -> trace -> state -> Prop :=
       step (State s f sp pc rs m)
          t (State s f sp pc' (regmap_setres res vres rs) m')
   | exec_Icond:
-      forall s f sp pc rs m cond args ifso ifnot b pc',
-      (fn_code f)!pc = Some(Icond cond args ifso ifnot) ->
+      forall s f sp pc rs m cond args ifso ifnot b pc' predb,
+      (fn_code f)!pc = Some(Icond cond args ifso ifnot predb) ->
       eval_condition cond rs##args m = Some b ->
       pc' = (if b then ifso else ifnot) ->
       step (State s f sp pc rs m)
@@ -403,7 +404,7 @@ Definition successors_instr (i: instruction) : list node :=
   | Icall sig ros args res s => s :: nil
   | Itailcall sig ros args => nil
   | Ibuiltin ef args res s => s :: nil
-  | Icond cond args ifso ifnot => ifso :: ifnot :: nil
+  | Icond cond args ifso ifnot _ => ifso :: ifnot :: nil
   | Ijumptable arg tbl => tbl
   | Ireturn optarg => nil
   end.
@@ -424,7 +425,7 @@ Definition instr_uses (i: instruction) : list reg :=
   | Itailcall sig (inl r) args => r :: args
   | Itailcall sig (inr id) args => args
   | Ibuiltin ef args res s => params_of_builtin_args args
-  | Icond cond args ifso ifnot => args
+  | Icond cond args ifso ifnot _ => args
   | Ijumptable arg tbl => arg :: nil
   | Ireturn None => nil
   | Ireturn (Some arg) => arg :: nil
@@ -442,7 +443,7 @@ Definition instr_defs (i: instruction) : option reg :=
   | Itailcall sig ros args => None
   | Ibuiltin ef args res s =>
       match res with BR r => Some r | _ => None end
-  | Icond cond args ifso ifnot => None
+  | Icond cond args ifso ifnot _ => None
   | Ijumptable arg tbl => None
   | Ireturn optarg => None
   end.
@@ -485,7 +486,7 @@ Definition max_reg_instr (m: positive) (pc: node) (i: instruction) :=
   | Ibuiltin ef args res s =>
       fold_left Pos.max (params_of_builtin_args args)
         (fold_left Pos.max (params_of_builtin_res res) m)
-  | Icond cond args ifso ifnot => fold_left Pos.max args m
+  | Icond cond args ifso ifnot _ => fold_left Pos.max args m
   | Ijumptable arg tbl => Pos.max arg m
   | Ireturn None => m
   | Ireturn (Some arg) => Pos.max arg m
diff --git a/backend/RTLgen.v b/backend/RTLgen.v
index 2c27944a..ac98f3a1 100644
--- a/backend/RTLgen.v
+++ b/backend/RTLgen.v
@@ -479,7 +479,7 @@ with transl_condexpr (map: mapping) (a: condexpr) (ntrue nfalse: node)
   match a with
   | CEcond c al =>
       do rl <- alloc_regs map al;
-      do nt <- add_instr (Icond c rl ntrue nfalse);
+      do nt <- add_instr (Icond c rl ntrue nfalse None);
          transl_exprlist map al rl nt
   | CEcondition a b c =>
       do nc <- transl_condexpr map c ntrue nfalse;
diff --git a/backend/RTLgenspec.v b/backend/RTLgenspec.v
index 92b48e2b..30ad7d82 100644
--- a/backend/RTLgenspec.v
+++ b/backend/RTLgenspec.v
@@ -744,9 +744,9 @@ Inductive tr_expr (c: code):
 
 with tr_condition (c: code):
        mapping -> list reg -> condexpr -> node -> node -> node -> Prop :=
-  | tr_CEcond: forall map pr cond bl ns ntrue nfalse n1 rl,
+  | tr_CEcond: forall map pr cond bl ns ntrue nfalse n1 rl i,
       tr_exprlist c map pr bl ns n1 rl ->
-      c!n1 = Some (Icond cond rl ntrue nfalse) ->
+      c!n1 = Some (Icond cond rl ntrue nfalse i) ->
       tr_condition c map pr (CEcond cond bl) ns ntrue nfalse
   | tr_CEcondition: forall map pr a1 a2 a3 ns ntrue nfalse n2 n3,
       tr_condition c map pr a1 ns n2 n3 ->
diff --git a/backend/RTLtyping.v b/backend/RTLtyping.v
index 857f2211..15ed6d8a 100644
--- a/backend/RTLtyping.v
+++ b/backend/RTLtyping.v
@@ -139,11 +139,11 @@ Inductive wt_instr : instruction -> Prop :=
       valid_successor s ->
       wt_instr (Ibuiltin ef args res s)
   | wt_Icond:
-      forall cond args s1 s2,
+      forall cond args s1 s2 i,
       map env args = type_of_condition cond ->
       valid_successor s1 ->
       valid_successor s2 ->
-      wt_instr (Icond cond args s1 s2)
+      wt_instr (Icond cond args s1 s2 i)
   | wt_Ijumptable:
       forall arg tbl,
       env arg = Tint ->
@@ -313,7 +313,7 @@ Definition type_instr (e: S.typenv) (i: instruction) : res S.typenv :=
         | _ => type_builtin_args e args sig.(sig_args)
         end;
       type_builtin_res e1 res (proj_sig_res sig)
- | Icond cond args s1 s2 =>
+ | Icond cond args s1 s2 _ =>
       do x1 <- check_successor s1;
       do x2 <- check_successor s2;
       S.set_list e args (type_of_condition cond)
diff --git a/backend/Regalloc.ml b/backend/Regalloc.ml
index f2658b04..ffe26933 100644
--- a/backend/Regalloc.ml
+++ b/backend/Regalloc.ml
@@ -295,8 +295,8 @@ let block_of_RTL_instr funsig tyenv = function
          (Xbuiltin(ef, args2, res2) ::
             movelist (params_of_builtin_res res2) (params_of_builtin_res res1)
                [Xbranch s])
-  | RTL.Icond(cond, args, s1, s2) ->
-      [Xcond(cond, vregs tyenv args, s1, s2)]
+  | RTL.Icond(cond, args, s1, s2, i) ->
+      [Xcond(cond, vregs tyenv args, s1, s2, i)]
   | RTL.Ijumptable(arg, tbl) ->
       [Xjumptable(vreg tyenv arg, tbl)]
   | RTL.Ireturn None ->
@@ -380,7 +380,7 @@ let live_before instr after =
       vset_addargs args (vset_removeres res after)
   | Xbranch s ->
       after
-  | Xcond(cond, args, s1, s2) ->
+  | Xcond(cond, args, s1, s2, _) ->
       List.fold_right VSet.add args after
   | Xjumptable(arg, tbl) ->
       VSet.add arg after
@@ -575,7 +575,7 @@ let spill_costs f =
             charge_list 10 1 (params_of_builtin_res res)
         end
     | Xbranch _ -> ()
-    | Xcond(cond, args, _, _) ->
+    | Xcond(cond, args, _, _, _) ->
         charge_list 10 1 args
     | Xjumptable(arg, _) ->
         charge 10 1 arg
@@ -718,7 +718,7 @@ let add_interfs_instr g instr live =
       end
   | Xbranch s ->
       ()
-  | Xcond(cond, args, s1, s2) ->
+  | Xcond(cond, args, s1, s2, _) ->
       add_interfs_destroyed g live (destroyed_by_cond cond)
   | Xjumptable(arg, tbl) ->
       add_interfs_destroyed g live destroyed_by_jumptable
@@ -797,7 +797,7 @@ let tospill_instr alloc instr ts =
          (addlist_tospill alloc (params_of_builtin_res res) ts)
   | Xbranch s ->
       ts
-  | Xcond(cond, args, s1, s2) ->
+  | Xcond(cond, args, s1, s2, _) ->
       addlist_tospill alloc args ts
   | Xjumptable(arg, tbl) ->
       add_tospill alloc arg ts
@@ -990,9 +990,9 @@ let spill_instr tospill eqs instr =
       (c1 @ Xbuiltin(ef, args', res') :: c2, eqs2)
   | Xbranch s ->
       ([instr], eqs)
-  | Xcond(cond, args, s1, s2) ->
+  | Xcond(cond, args, s1, s2, i) ->
      let (args', c1, eqs1) = reload_vars tospill eqs args in
-     (c1 @ [Xcond(cond, args', s1, s2)], eqs1)
+     (c1 @ [Xcond(cond, args', s1, s2, i)], eqs1)
   | Xjumptable(arg, tbl) ->
       let (arg', c1, eqs1) = reload_var tospill eqs arg in
       (c1 @ [Xjumptable(arg', tbl)], eqs1)
@@ -1128,8 +1128,8 @@ let transl_instr alloc instr k =
                        AST.map_builtin_res (mreg_of alloc) res) :: k
   | Xbranch s ->
       LTL.Lbranch s :: []
-  | Xcond(cond, args, s1, s2) ->
-      LTL.Lcond(cond, mregs_of alloc args, s1, s2) :: []
+  | Xcond(cond, args, s1, s2, i) ->
+      LTL.Lcond(cond, mregs_of alloc args, s1, s2, i) :: []
   | Xjumptable(arg, tbl) ->
       LTL.Ljumptable(mreg_of alloc arg, tbl) :: []
   | Xreturn optarg ->
diff --git a/backend/Renumber.v b/backend/Renumber.v
index 7ba16658..2727b979 100644
--- a/backend/Renumber.v
+++ b/backend/Renumber.v
@@ -48,7 +48,7 @@ Definition renum_instr (i: instruction) : instruction :=
   | Icall sg ros args res s => Icall sg ros args res (renum_pc s)
   | Itailcall sg ros args => i
   | Ibuiltin ef args res s => Ibuiltin ef args res (renum_pc s)
-  | Icond cond args s1 s2 => Icond cond args (renum_pc s1) (renum_pc s2)
+  | Icond cond args s1 s2 info => Icond cond args (renum_pc s1) (renum_pc s2) info
   | Ijumptable arg tbl => Ijumptable arg (List.map renum_pc tbl)
   | Ireturn or => i
   end.
diff --git a/backend/Splitting.ml b/backend/Splitting.ml
index 78eb66a5..3ca45c3b 100644
--- a/backend/Splitting.ml
+++ b/backend/Splitting.ml
@@ -162,8 +162,8 @@ let ren_instr f maps pc i =
   | Ibuiltin(ef, args, res, s) ->
       Ibuiltin(ef, List.map (AST.map_builtin_arg (ren_reg before)) args,
                    AST.map_builtin_res (ren_reg after) res, s)
-  | Icond(cond, args, s1, s2) ->
-      Icond(cond, ren_regs before args, s1, s2)
+  | Icond(cond, args, s1, s2, i) ->
+      Icond(cond, ren_regs before args, s1, s2, i)
   | Ijumptable(arg, tbl) ->
       Ijumptable(ren_reg before arg, tbl)
   | Ireturn optarg ->
diff --git a/backend/Tunneling.v b/backend/Tunneling.v
index da1ce45a..a4c4a195 100644
--- a/backend/Tunneling.v
+++ b/backend/Tunneling.v
@@ -78,11 +78,11 @@ Definition record_gotos (f: LTL.function) : U.t :=
 Definition tunnel_instr (uf: U.t) (i: instruction) : instruction :=
   match i with
   | Lbranch s => Lbranch (U.repr uf s)
-  | Lcond cond args s1 s2 =>
+  | Lcond cond args s1 s2 info =>
       let s1' := U.repr uf s1 in let s2' := U.repr uf s2 in
       if peq s1' s2'
       then Lbranch s1'
-      else Lcond cond args s1' s2'
+      else Lcond cond args s1' s2' info
   | Ljumptable arg tbl => Ljumptable arg (List.map (U.repr uf) tbl)
   | _ => i
   end.
diff --git a/backend/Unusedglob.v b/backend/Unusedglob.v
index 1b5f2547..93ca7af4 100644
--- a/backend/Unusedglob.v
+++ b/backend/Unusedglob.v
@@ -53,7 +53,7 @@ Definition ref_instruction (i: instruction) : list ident :=
   | Itailcall _ (inl r) _ => nil
   | Itailcall _ (inr id) _ => id :: nil
   | Ibuiltin _ args _ _ => globals_of_builtin_args args
-  | Icond cond _ _ _ => nil
+  | Icond cond _ _ _ _ => nil
   | Ijumptable _ _ => nil
   | Ireturn _ => nil
   end.
diff --git a/backend/ValueAnalysis.v b/backend/ValueAnalysis.v
index 9a33768c..2e79d1a9 100644
--- a/backend/ValueAnalysis.v
+++ b/backend/ValueAnalysis.v
@@ -156,7 +156,7 @@ Definition transfer (f: function) (rm: romem) (pc: node) (ae: aenv) (am: amem) :
       VA.Bot
   | Some(Ibuiltin ef args res s) =>
       transfer_builtin ae am rm ef args res
-  | Some(Icond cond args s1 s2) =>
+  | Some(Icond cond args s1 s2 _) =>
       VA.State ae am
   | Some(Ijumptable arg tbl) =>
       VA.State ae am
@@ -1044,9 +1044,8 @@ Proof.
   red; simpl; intros. destruct (plt b (Mem.nextblock m)).
   exploit RO; eauto. intros (R & P & Q).
   split; auto.
-  split. apply bmatch_incr with bc; auto. apply bmatch_inv with m; auto.
-  intros. eapply Mem.loadbytes_unchanged_on_1. eapply external_call_readonly; eauto.
-  auto. intros; red. apply Q.
+  split. apply bmatch_incr with bc; auto. apply bmatch_ext with m; auto.
+  intros. eapply external_call_readonly with (m2 := m'); eauto.
   intros; red; intros; elim (Q ofs).
   eapply external_call_max_perm with (m2 := m'); eauto.
   destruct (j' b); congruence.
diff --git a/backend/ValueDomain.v b/backend/ValueDomain.v
index c132ce7c..779e7bb9 100644
--- a/backend/ValueDomain.v
+++ b/backend/ValueDomain.v
@@ -3502,11 +3502,6 @@ Proof.
 - omegaContradiction.
 Qed.
 
-Lemma max_size_chunk: forall chunk, size_chunk chunk <= 8.
-Proof.
-  destruct chunk; simpl; omega.
-Qed.
-
 Remark inval_before_contents:
   forall i c chunk' av' j,
   (inval_before i (i - 7) c)##j = Some (ACval chunk' av') ->
diff --git a/backend/XTL.ml b/backend/XTL.ml
index c496fafb..1d8e89c0 100644
--- a/backend/XTL.ml
+++ b/backend/XTL.ml
@@ -36,7 +36,7 @@ type instruction =
   | Xtailcall of signature * (var, ident) sum * var list
   | Xbuiltin of external_function * var builtin_arg list * var builtin_res
   | Xbranch of node
-  | Xcond of condition * var list * node * node
+  | Xcond of condition * var list * node * node * bool option
   | Xjumptable of var * node list
   | Xreturn of var list
 
@@ -105,7 +105,7 @@ let twin_reg r =
 let rec successors_block = function
   | Xbranch s :: _ -> [s]
   | Xtailcall(sg, vos, args) :: _ -> []
-  | Xcond(cond, args, s1, s2) :: _ -> [s1; s2]
+  | Xcond(cond, args, s1, s2, _) :: _ -> [s1; s2]
   | Xjumptable(arg, tbl) :: _ -> tbl
   | Xreturn  _:: _ -> []
   | instr :: blk -> successors_block blk
@@ -179,7 +179,7 @@ let type_instr = function
       type_builtin_res res (proj_sig_res sg)
   | Xbranch s ->
       ()
-  | Xcond(cond, args, s1, s2) ->
+  | Xcond(cond, args, s1, s2, _) ->
       set_vars_type args (type_of_condition cond)
   | Xjumptable(arg, tbl) ->
       set_var_type arg Tint
diff --git a/backend/XTL.mli b/backend/XTL.mli
index b4b77fab..7b7f7186 100644
--- a/backend/XTL.mli
+++ b/backend/XTL.mli
@@ -37,7 +37,7 @@ type instruction =
   | Xtailcall of signature * (var, ident) sum * var list
   | Xbuiltin of external_function * var builtin_arg list * var builtin_res
   | Xbranch of node
-  | Xcond of condition * var list * node * node
+  | Xcond of condition * var list * node * node * bool option
   | Xjumptable of var * node list
   | Xreturn of var list
 
diff --git a/cfrontend/C2C.ml b/cfrontend/C2C.ml
index f2a68156..f637b5e4 100644
--- a/cfrontend/C2C.ml
+++ b/cfrontend/C2C.ml
@@ -1272,7 +1272,7 @@ let convertFundef loc env fd =
     { a_storage = fd.fd_storage;
       a_alignment = None;
       a_size = None;
-      a_sections = Sections.for_function env id' fd.fd_attrib;
+      a_sections = Sections.for_function env loc id' fd.fd_attrib;
       a_access = Sections.Access_default;
       a_inline = inline;
       a_loc = loc };
@@ -1358,7 +1358,7 @@ let convertGlobvar loc env (sto, id, ty, optinit) =
     | Some i ->
         convertInitializer env ty i in
   let (section, access) =
-    Sections.for_variable env id' ty (optinit <> None)
+    Sections.for_variable env loc id' ty (optinit <> None)
       (match sto with
        | Storage_thread_local | Storage_thread_local_extern
        | Storage_thread_local_static -> true
diff --git a/cfrontend/Cexec.v b/cfrontend/Cexec.v
index 2942080b..b08c3ad7 100644
--- a/cfrontend/Cexec.v
+++ b/cfrontend/Cexec.v
@@ -460,6 +460,14 @@ Definition do_ef_free
       check (zlt 0 (Ptrofs.unsigned sz));
       do m' <- Mem.free m b (Ptrofs.unsigned lo - size_chunk Mptr) (Ptrofs.unsigned lo + Ptrofs.unsigned sz);
       Some(w, E0, Vundef, m')
+  | Vint n :: nil =>
+      if Int.eq_dec n Int.zero && negb Archi.ptr64
+      then Some(w, E0, Vundef, m)
+      else None
+  | Vlong n :: nil =>
+      if Int64.eq_dec n Int64.zero && Archi.ptr64
+      then Some(w, E0, Vundef, m)
+      else None
   | _ => None
   end.
 
@@ -544,45 +552,51 @@ Proof with try congruence.
   - eapply do_external_function_sound; eauto.
   }
   destruct ef; simpl.
-(* EF_external *)
+- (* EF_external *)
   eapply do_external_function_sound; eauto.
-(* EF_builtin *)
+- (* EF_builtin *)
   eapply BF_EX; eauto.
-(* EF_runtime *)
+- (* EF_runtime *)
   eapply BF_EX; eauto.
-(* EF_vload *)
+- (* EF_vload *)
   unfold do_ef_volatile_load. destruct vargs... destruct v... destruct vargs...
   mydestr. destruct p as [[w'' t''] v]; mydestr.
   exploit do_volatile_load_sound; eauto. intuition. econstructor; eauto.
-  auto.
-(* EF_vstore *)
+- (* EF_vstore *)
   unfold do_ef_volatile_store. destruct vargs... destruct v... destruct vargs... destruct vargs...
   mydestr. destruct p as [[w'' t''] m'']. mydestr.
   exploit do_volatile_store_sound; eauto. intuition. econstructor; eauto.
-  auto.
-(* EF_malloc *)
+- (* EF_malloc *)
   unfold do_ef_malloc. destruct vargs... destruct vargs... mydestr.
   destruct (Mem.alloc m (- size_chunk Mptr) (Ptrofs.unsigned i)) as [m1 b] eqn:?. mydestr.
   split. apply SIZE in Heqo. subst v. econstructor; eauto. constructor.
-(* EF_free *)
-  unfold do_ef_free. destruct vargs... destruct v... destruct vargs...
-  mydestr. split. apply SIZE in Heqo0. econstructor; eauto. congruence. omega. constructor.
-(* EF_memcpy *)
+- (* EF_free *)
+  unfold do_ef_free. destruct vargs... destruct v... 
++ destruct vargs... mydestr; InvBooleans; subst i.
+  replace (Vint Int.zero) with Vnullptr. split; constructor.
+  apply negb_true_iff in H0. unfold Vnullptr; rewrite H0; auto.
++ destruct vargs... mydestr; InvBooleans; subst i.
+  replace (Vlong Int64.zero) with Vnullptr. split; constructor.
+  unfold Vnullptr; rewrite H0; auto.
++ destruct vargs... mydestr.
+  split. apply SIZE in Heqo0. econstructor; eauto. congruence. omega.
+  constructor.
+- (* EF_memcpy *)
   unfold do_ef_memcpy. destruct vargs... destruct v... destruct vargs...
   destruct v... destruct vargs... mydestr. 
   apply Decidable_sound in Heqb1. red in Heqb1.
   split. econstructor; eauto; tauto. constructor.
-(* EF_annot *)
+- (* EF_annot *)
   unfold do_ef_annot. mydestr.
   split. constructor. apply list_eventval_of_val_sound; auto.
   econstructor. constructor; eauto. constructor.
-(* EF_annot_val *)
+- (* EF_annot_val *)
   unfold do_ef_annot_val. destruct vargs... destruct vargs... mydestr.
   split. constructor. apply eventval_of_val_sound; auto.
   econstructor. constructor; eauto. constructor.
-(* EF_inline_asm *)
+- (* EF_inline_asm *)
   eapply do_inline_assembly_sound; eauto.
-(* EF_debug *)
+- (* EF_debug *)
   unfold do_ef_debug. mydestr. split; constructor.
 Qed.
 
@@ -605,37 +619,38 @@ Proof.
   - eapply do_external_function_complete; eauto.
   }
   destruct ef; simpl in *.
-(* EF_external *)
+- (* EF_external *)
   eapply do_external_function_complete; eauto.
-(* EF_builtin *)
+- (* EF_builtin *)
   eapply BF_EX; eauto.
-(* EF_runtime *)
+- (* EF_runtime *)
   eapply BF_EX; eauto.
-(* EF_vload *)
+- (* EF_vload *)
   inv H; unfold do_ef_volatile_load.
   exploit do_volatile_load_complete; eauto. intros EQ; rewrite EQ; auto.
-(* EF_vstore *)
+- (* EF_vstore *)
   inv H; unfold do_ef_volatile_store.
   exploit do_volatile_store_complete; eauto. intros EQ; rewrite EQ; auto.
-(* EF_malloc *)
+- (* EF_malloc *)
   inv H; unfold do_ef_malloc.
   inv H0. erewrite SIZE by eauto. rewrite H1, H2. auto.
-(* EF_free *)
+- (* EF_free *)
   inv H; unfold do_ef_free.
-  inv H0. rewrite H1. erewrite SIZE by eauto. rewrite zlt_true. rewrite H3. auto. omega.
-(* EF_memcpy *)
++ inv H0. rewrite H1. erewrite SIZE by eauto. rewrite zlt_true. rewrite H3. auto. omega.
++ inv H0. unfold Vnullptr; destruct Archi.ptr64; auto.
+- (* EF_memcpy *)
   inv H; unfold do_ef_memcpy.
   inv H0. rewrite Decidable_complete. rewrite H7; rewrite H8; auto.
   red. tauto.
-(* EF_annot *)
+- (* EF_annot *)
   inv H; unfold do_ef_annot. inv H0. inv H6. inv H4.
   rewrite (list_eventval_of_val_complete _ _ _ H1). auto.
-(* EF_annot_val *)
+- (* EF_annot_val *)
   inv H; unfold do_ef_annot_val. inv H0. inv H6. inv H4.
   rewrite (eventval_of_val_complete _ _ _ H1). auto.
-(* EF_inline_asm *)
+- (* EF_inline_asm *)
   eapply do_inline_assembly_complete; eauto.
-(* EF_debug *)
+- (* EF_debug *)
   inv H. inv H0. reflexivity.
 Qed.
 
diff --git a/common/Events.v b/common/Events.v
index 10e0c232..28bb992a 100644
--- a/common/Events.v
+++ b/common/Events.v
@@ -649,9 +649,12 @@ Record extcall_properties (sem: extcall_sem) (sg: signature) : Prop :=
 (** External call cannot modify memory unless they have [Max, Writable]
    permissions. *)
   ec_readonly:
-    forall ge vargs m1 t vres m2,
+    forall ge vargs m1 t vres m2 b ofs n bytes,
     sem ge vargs m1 t vres m2 ->
-    Mem.unchanged_on (loc_not_writable m1) m1 m2;
+    Mem.valid_block m1 b ->
+    Mem.loadbytes m2 b ofs n = Some bytes ->
+    (forall i, ofs <= i < ofs + n -> ~Mem.perm m1 b i Max Writable) ->
+    Mem.loadbytes m1 b ofs n = Some bytes;
 
 (** External calls must commute with memory extensions, in the
   following sense. *)
@@ -784,7 +787,7 @@ Proof.
 (* max perms *)
 - inv H; auto.
 (* readonly *)
-- inv H. apply Mem.unchanged_on_refl.
+- inv H; auto.
 (* mem extends *)
 - inv H. inv H1. inv H6. inv H4.
   exploit volatile_load_extends; eauto. intros [v' [A B]].
@@ -833,14 +836,27 @@ Proof.
   rewrite C; auto.
 Qed.
 
+Lemma unchanged_on_readonly:
+  forall m1 m2 b ofs n bytes,
+  Mem.unchanged_on (loc_not_writable m1) m1 m2 ->
+  Mem.valid_block m1 b ->
+  Mem.loadbytes m2 b ofs n = Some bytes ->
+  (forall i, ofs <= i < ofs + n -> ~Mem.perm m1 b i Max Writable) ->
+  Mem.loadbytes m1 b ofs n = Some bytes.
+Proof.
+  intros.
+  rewrite <- H1. symmetry.
+  apply Mem.loadbytes_unchanged_on_1 with (P := loc_not_writable m1); auto.
+Qed.
+
 Lemma volatile_store_readonly:
   forall ge chunk1 m1 b1 ofs1 v t m2,
   volatile_store ge chunk1 m1 b1 ofs1 v t m2 ->
   Mem.unchanged_on (loc_not_writable m1) m1 m2.
 Proof.
   intros. inv H.
-  apply Mem.unchanged_on_refl.
-  eapply Mem.store_unchanged_on; eauto.
+- apply Mem.unchanged_on_refl.
+- eapply Mem.store_unchanged_on; eauto.
   exploit Mem.store_valid_access_3; eauto. intros [P Q].
   intros. unfold loc_not_writable. red; intros. elim H2.
   apply Mem.perm_cur_max. apply P. auto.
@@ -934,7 +950,7 @@ Proof.
 (* perms *)
 - inv H. inv H2. auto. eauto with mem.
 (* readonly *)
-- inv H. eapply volatile_store_readonly; eauto.
+- inv H. eapply unchanged_on_readonly; eauto. eapply volatile_store_readonly; eauto.
 (* mem extends*)
 - inv H. inv H1. inv H6. inv H7. inv H4.
   exploit volatile_store_extends; eauto. intros [m2' [A [B C]]].
@@ -994,7 +1010,7 @@ Proof.
   rewrite dec_eq_false. auto.
   apply Mem.valid_not_valid_diff with m1; eauto with mem.
 (* readonly *)
-- inv H. eapply UNCHANGED; eauto.
+- inv H. eapply unchanged_on_readonly; eauto. 
 (* mem extends *)
 - inv H. inv H1. inv H7.
   assert (SZ: v2 = Vptrofs sz).
@@ -1045,11 +1061,13 @@ Qed.
 
 Inductive extcall_free_sem (ge: Senv.t):
               list val -> mem -> trace -> val -> mem -> Prop :=
-  | extcall_free_sem_intro: forall b lo sz m m',
+  | extcall_free_sem_ptr: forall b lo sz m m',
       Mem.load Mptr m b (Ptrofs.unsigned lo - size_chunk Mptr) = Some (Vptrofs sz) ->
       Ptrofs.unsigned sz > 0 ->
       Mem.free m b (Ptrofs.unsigned lo - size_chunk Mptr) (Ptrofs.unsigned lo + Ptrofs.unsigned sz) = Some m' ->
-      extcall_free_sem ge (Vptr b lo :: nil) m E0 Vundef m'.
+      extcall_free_sem ge (Vptr b lo :: nil) m E0 Vundef m'
+  | extcall_free_sem_null: forall m,
+      extcall_free_sem ge (Vnullptr :: nil) m E0 Vundef m.
 
 Lemma extcall_free_ok:
   extcall_properties extcall_free_sem
@@ -1057,26 +1075,29 @@ Lemma extcall_free_ok:
 Proof.
   constructor; intros.
 (* well typed *)
-- inv H. simpl. auto.
+- inv H; simpl; auto.
 (* symbols preserved *)
 - inv H0; econstructor; eauto.
 (* valid block *)
-- inv H. eauto with mem.
+- inv H; eauto with mem.
 (* perms *)
-- inv H. eapply Mem.perm_free_3; eauto.
+- inv H; eauto using Mem.perm_free_3.
 (* readonly *)
-- inv H. eapply Mem.free_unchanged_on; eauto.
-  intros. red; intros. elim H3.
+- eapply unchanged_on_readonly; eauto. inv H.
++ eapply Mem.free_unchanged_on; eauto.
+  intros. red; intros. elim H6.
   apply Mem.perm_cur_max. apply Mem.perm_implies with Freeable; auto with mem.
   eapply Mem.free_range_perm; eauto.
++ apply Mem.unchanged_on_refl.
 (* mem extends *)
-- inv H. inv H1. inv H8. inv H6.
+- inv H.
++ inv H1. inv H8. inv H6.
   exploit Mem.load_extends; eauto. intros [v' [A B]].
   assert (v' = Vptrofs sz).
   { unfold Vptrofs in *; destruct Archi.ptr64; inv B; auto. }
   subst v'.
   exploit Mem.free_parallel_extends; eauto. intros [m2' [C D]].
-  exists Vundef; exists m2'; intuition.
+  exists Vundef; exists m2'; intuition auto.
   econstructor; eauto.
   eapply Mem.free_unchanged_on; eauto.
   unfold loc_out_of_bounds; intros.
@@ -1084,8 +1105,14 @@ Proof.
   { apply Mem.perm_cur_max. apply Mem.perm_implies with Freeable; auto with mem.
     eapply Mem.free_range_perm. eexact H4. eauto. }
   tauto.
++ inv H1. inv H5. replace v2 with Vnullptr.
+  exists Vundef; exists m1'; intuition auto.
+  constructor.
+  apply Mem.unchanged_on_refl.
+  unfold Vnullptr in *; destruct Archi.ptr64; inv H3; auto.
 (* mem inject *)
-- inv H0. inv H2. inv H7. inv H9.
+- inv H0.
++ inv H2. inv H7. inv H9.
   exploit Mem.load_inject; eauto. intros [v' [A B]].
   assert (v' = Vptrofs sz).
   { unfold Vptrofs in *; destruct Archi.ptr64; inv B; auto. }
@@ -1099,7 +1126,7 @@ Proof.
   intro EQ.
   exploit Mem.free_parallel_inject; eauto. intros (m2' & C & D).
   exists f, Vundef, m2'; split.
-  apply extcall_free_sem_intro with (sz := sz) (m' := m2').
+  apply extcall_free_sem_ptr with (sz := sz) (m' := m2').
     rewrite EQ. rewrite <- A. f_equal. omega.
     auto. auto.
     rewrite ! EQ. rewrite <- C. f_equal; omega.
@@ -1112,14 +1139,19 @@ Proof.
     apply P. omega.
   split. auto.
   red; intros. congruence.
++ inv H2. inv H6. replace v' with Vnullptr.
+  exists f, Vundef, m1'; intuition auto using Mem.unchanged_on_refl.
+  constructor.
+  red; intros; congruence.
+  unfold Vnullptr in *; destruct Archi.ptr64; inv H4; auto.
 (* trace length *)
 - inv H; simpl; omega.
 (* receptive *)
-- assert (t1 = t2). inv H; inv H0; auto. subst t2.
+- assert (t1 = t2) by (inv H; inv H0; auto). subst t2.
   exists vres1; exists m1; auto.
 (* determ *)
-- inv H; inv H0.
-  assert (EQ1: Vptrofs sz0 = Vptrofs sz) by congruence.
+- inv H; inv H0; try (unfold Vnullptr in *; destruct Archi.ptr64; discriminate).
++ assert (EQ1: Vptrofs sz0 = Vptrofs sz) by congruence.
   assert (EQ2: sz0 = sz).
   { unfold Vptrofs in EQ1; destruct Archi.ptr64 eqn:SF.
     rewrite <- (Ptrofs.of_int64_to_int64 SF sz0), <- (Ptrofs.of_int64_to_int64 SF sz). congruence.
@@ -1127,6 +1159,7 @@ Proof.
   }
   subst sz0.
   split. constructor. intuition congruence.
++ split. constructor. intuition auto.
 Qed.
 
 (** ** Semantics of [memcpy] operations. *)
@@ -1159,8 +1192,9 @@ Proof.
 - (* perms *)
   intros. inv H. eapply Mem.perm_storebytes_2; eauto.
 - (* readonly *)
-  intros. inv H. eapply Mem.storebytes_unchanged_on; eauto.
-  intros; red; intros. elim H8.
+  intros. inv H. eapply unchanged_on_readonly; eauto. 
+  eapply Mem.storebytes_unchanged_on; eauto.
+  intros; red; intros. elim H11.
   apply Mem.perm_cur_max. eapply Mem.storebytes_range_perm; eauto.
 - (* extensions *)
   intros. inv H.
@@ -1271,7 +1305,7 @@ Proof.
 (* perms *)
 - inv H; auto.
 (* readonly *)
-- inv H. apply Mem.unchanged_on_refl.
+- inv H; auto.
 (* mem extends *)
 - inv H.
   exists Vundef; exists m1'; intuition.
@@ -1316,7 +1350,7 @@ Proof.
 (* perms *)
 - inv H; auto.
 (* readonly *)
-- inv H. apply Mem.unchanged_on_refl.
+- inv H; auto.
 (* mem extends *)
 - inv H. inv H1. inv H6.
   exists v2; exists m1'; intuition.
@@ -1359,7 +1393,7 @@ Proof.
 (* perms *)
 - inv H; auto.
 (* readonly *)
-- inv H. apply Mem.unchanged_on_refl.
+- inv H; auto.
 (* mem extends *)
 - inv H.
   exists Vundef; exists m1'; intuition.
@@ -1406,7 +1440,7 @@ Proof.
 (* perms *)
 - inv H; auto.
 (* readonly *)
-- inv H. apply Mem.unchanged_on_refl.
+- inv H; auto.
 (* mem extends *)
 - inv H. fold bsem in H2. apply val_inject_list_lessdef in H1.
   specialize (bs_inject _ bsem _ _ _ H1).
diff --git a/common/Memdata.v b/common/Memdata.v
index f3016efe..a09b90f5 100644
--- a/common/Memdata.v
+++ b/common/Memdata.v
@@ -44,6 +44,13 @@ Definition size_chunk (chunk: memory_chunk) : Z :=
   | Many64 => 8
   end.
 
+Definition largest_size_chunk := 8.
+
+Lemma max_size_chunk: forall chunk, size_chunk chunk <= 8.
+Proof.
+  destruct chunk; simpl; omega.
+Qed.
+
 Lemma size_chunk_pos:
   forall chunk, size_chunk chunk > 0.
 Proof.
diff --git a/common/Memory.v b/common/Memory.v
index f21d99af..cd8a2001 100644
--- a/common/Memory.v
+++ b/common/Memory.v
@@ -1307,6 +1307,23 @@ Proof.
   }
 Qed.
     
+Section STOREV.
+Variable chunk: memory_chunk.
+Variable m1: mem.
+Variables addr v: val.
+Variable m2: mem.
+Hypothesis STORE: storev chunk m1 addr v = Some m2.
+
+
+Theorem loadv_storev_same:
+  loadv chunk m2 addr = Some (Val.load_result chunk v).
+Proof.
+  destruct addr; simpl in *; try discriminate.
+  eapply load_store_same.
+  eassumption.
+Qed.
+End STOREV.
+
 Lemma load_store_overlap:
   forall chunk m1 b ofs v m2 chunk' ofs' v',
   store chunk m1 b ofs v = Some m2 ->
diff --git a/common/Sections.ml b/common/Sections.ml
index 9555c203..ea0b6dbc 100644
--- a/common/Sections.ml
+++ b/common/Sections.ml
@@ -166,9 +166,22 @@ let gcc_section name readonly exec =
     sec_writable = not readonly; sec_executable = exec;
     sec_access = Access_default }
 
+(* Check and extract whether a section was given as attribute *)
+
+let get_attr_section loc attr =
+  match Cutil.find_custom_attributes ["section"; "__section__"] attr with
+  | [] -> None
+  | [[C.AString name]] -> Some name
+  | [[_]] ->
+    Diagnostics.error loc "'section' attribute requires a string";
+    None
+  | _ ->
+    Diagnostics.error loc "ambiguous 'section' attribute";
+    None
+
 (* Determine section for a variable definition *)
 
-let for_variable env id ty init thrl =
+let for_variable env loc id ty init thrl =
   let attr = Cutil.attributes_of_type env ty in
   let readonly = List.mem C.AConst attr && not(List.mem C.AVolatile attr) in
   let si =
@@ -176,11 +189,11 @@ let for_variable env id ty init thrl =
       (* 1- Section explicitly associated with #use_section *)
       Hashtbl.find use_section_table id
     with Not_found ->
-      match Cutil.find_custom_attributes ["section"; "__section__"] attr with
-      | [[C.AString name]] ->
+      match get_attr_section loc attr with
+      | Some name ->
         (* 2- Section given as an attribute, gcc-style *)
         gcc_section name readonly false
-      | _ ->
+      | None ->
         (* 3- Default section appropriate for size and const-ness *)
         let size =
           match Cutil.sizeof env ty with Some sz -> sz | None -> max_int in
@@ -197,17 +210,17 @@ let for_variable env id ty init thrl =
 
 (* Determine sections for a function definition *)
 
-let for_function env id attr =
+let for_function env loc id attr =
   let si_code =
     try
       (* 1- Section explicitly associated with #use_section *)
       Hashtbl.find use_section_table id
     with Not_found ->
-      match Cutil.find_custom_attributes ["section"; "__section__"] attr with
-      | [[C.AString name]] ->
+      match get_attr_section loc attr with
+      | Some name ->
           (* 2- Section given as an attribute, gcc-style *)
           gcc_section name true true
-      | _ ->
+      | None ->
           (* 3- Default section *)
           try
             Hashtbl.find current_section_table "CODE"
diff --git a/common/Sections.mli b/common/Sections.mli
index e882f042..00c06c20 100644
--- a/common/Sections.mli
+++ b/common/Sections.mli
@@ -47,7 +47,7 @@ val define_section:
          -> ?writable:bool -> ?executable:bool -> ?access:access_mode -> unit -> unit
 val use_section_for: AST.ident -> string -> bool
 
-val for_variable: Env.t -> AST.ident -> C.typ -> bool -> bool ->
+val for_variable: Env.t -> C.location -> AST.ident -> C.typ -> bool -> bool ->
                                           section_name * access_mode
-val for_function: Env.t -> AST.ident -> C.attributes -> section_name list
+val for_function: Env.t -> C.location -> AST.ident -> C.attributes -> section_name list
 val for_stringlit: unit -> section_name
diff --git a/config_arm.sh b/config_arm.sh
index eed55fab..1861e029 100755
--- a/config_arm.sh
+++ b/config_arm.sh
@@ -1 +1 @@
-exec ./config_simple.sh arm-linux --toolprefix arm-linux-gnueabihf- "$@"
+exec ./config_simple.sh arm-linux --toolprefix arm-linux-gnueabi- "$@"
diff --git a/config_armhf.sh b/config_armhf.sh
new file mode 100755
index 00000000..8a1302bd
--- /dev/null
+++ b/config_armhf.sh
@@ -0,0 +1 @@
+exec ./config_simple.sh arm-eabihf --toolprefix arm-linux-gnueabihf- "$@"
diff --git a/config_ppc64.sh b/config_ppc64.sh
new file mode 100755
index 00000000..df31c18f
--- /dev/null
+++ b/config_ppc64.sh
@@ -0,0 +1 @@
+exec ./config_simple.sh ppc64-linux --toolprefix powerpc64-linux-gnu- "$@"
diff --git a/config_rv32.sh b/config_rv32.sh
index 654cacfa..a5a5cf1c 100755
--- a/config_rv32.sh
+++ b/config_rv32.sh
@@ -1 +1 @@
-exec ./config_simple.sh rv32-linux --toolprefix riscv64-unknown-elf- "$@"
+exec ./config_simple.sh rv32-linux --toolprefix riscv64-linux-gnu- "$@"
diff --git a/config_rv64.sh b/config_rv64.sh
index e95f8a70..0698c2ff 100755
--- a/config_rv64.sh
+++ b/config_rv64.sh
@@ -1 +1 @@
-exec ./config_simple.sh rv64-linux --toolprefix riscv64-unknown-elf- "$@"
+exec ./config_simple.sh rv64-linux --toolprefix riscv64-linux-gnu- "$@"
diff --git a/config_simple.sh b/config_simple.sh
index f02680c4..e2d3844c 100755
--- a/config_simple.sh
+++ b/config_simple.sh
@@ -3,4 +3,9 @@ shift
 version=`git rev-parse --short HEAD`
 branch=`git rev-parse --abbrev-ref HEAD`
 date=`date -I`
-./configure --prefix /opt/CompCert/${branch}/${date}_${version}/$arch "$@" $arch
+
+if test "x$CCOMP_INSTALL_PREFIX" = "x" ;
+then CCOMP_INSTALL_PREFIX=/opt/CompCert ;
+fi
+
+./configure --prefix ${CCOMP_INSTALL_PREFIX}/${branch}/${date}_${version}/$arch "$@" $arch
diff --git a/configure b/configure
index f13d1af3..cb2f52ba 100755
--- a/configure
+++ b/configure
@@ -568,7 +568,7 @@ missingtools=false
 echo "Testing Coq... " | tr -d '\n'
 coq_ver=$(${COQBIN}coqc -v 2>/dev/null | sed -n -e 's/The Coq Proof Assistant, version \([^ ]*\).*$/\1/p')
 case "$coq_ver" in
-  8.8.0|8.8.1|8.8.2|8.9.0|8.9.1|8.10.0|8.10.1|8.10.2|8.11.0)
+  8.9.0|8.9.1|8.10.0|8.10.1|8.10.2|8.11.0|8.11.1)
         echo "version $coq_ver -- good!";;
   ?*)
         echo "version $coq_ver -- UNSUPPORTED"
diff --git a/cparser/Elab.ml b/cparser/Elab.ml
index c5295347..0504ad0b 100644
--- a/cparser/Elab.ml
+++ b/cparser/Elab.ml
@@ -469,7 +469,8 @@ let elab_constant loc = function
       let (v, fk) = elab_float_constant f in
       CFloat(v, fk)
   | CONST_CHAR(wide, s) ->
-      CInt(elab_char_constant loc wide s, IInt, "")
+      let ikind = if wide then wchar_ikind () else IInt in
+      CInt(elab_char_constant loc wide s, ikind, "")
   | CONST_STRING(wide, s) ->
       elab_string_literal loc wide s
 
@@ -2457,8 +2458,8 @@ let enter_typedef loc env sto (s, ty, init) =
       env
     end
     else begin
-      error loc "typedef redefinition with different types (%a vs %a)"
-        (print_typ env) ty (print_typ env) ty';
+      error loc "redefinition of typedef '%s' with different type (%a vs %a)"
+        s (print_typ env) ty (print_typ env) ty';
       env
     end
   | _ ->
diff --git a/doc/index.html b/doc/index.html
index 3a4cf6ba..631c5d99 100644
--- a/doc/index.html
+++ b/doc/index.html
@@ -24,7 +24,7 @@ a:active {color : Red; text-decoration : underline; }
 
 <H1 align="center">The CompCert verified compiler</H1>
 <H2 align="center">Commented Coq development</H2>
-<H3 align="center">Version 3.6, 2019-09-17</H3>
+<H3 align="center">Version 3.7, 2020-03-31</H3>
 
 <H2>Introduction</H2>
 
@@ -101,6 +101,8 @@ See also: <A HREF="html/compcert.common.Memdata.html">Memdata</A> (in-memory rep
 <LI> <A HREF="html/compcert.common.Determinism.html">Determinism</A>: determinism properties of small-step semantics.
 <LI> <A HREF="html/compcert.powerpc.Op.html"><I>Op</I></A>: operators, addressing modes and their
 semantics.
+<LI> <A HREF="html/compcert.common.Builtins.html">Builtins</A>: semantics of built-in functions. <BR>
+See also: <A HREF="html/compcert.common.Builtins0.html">Builtins0</A> (target-independent part), <A HREF="html/compcert.powerpc.Builtins1.html"><I>Builtins1</I></A> (target-dependent part).
 <LI> <A HREF="html/compcert.common.Unityping.html">Unityping</A>: a solver for atomic unification constraints.
 </UL>
 
diff --git a/driver/Clflags.ml b/driver/Clflags.ml
index 6d6f1df4..6986fb96 100644
--- a/driver/Clflags.ml
+++ b/driver/Clflags.ml
@@ -28,8 +28,8 @@ let option_fconstprop = ref true
 let option_fcse = ref true
 let option_fcse2 = ref true
 let option_fredundancy = ref true
-let option_fduplicate = ref false
-let option_finvertcond = ref true (* only active if option_fduplicate is also true *)
+let option_fduplicate = ref (-1)
+let option_finvertcond = ref true
 let option_ftracelinearize = ref false
 let option_fpostpass = ref true
 let option_fpostpass_sched = ref "list"
@@ -81,3 +81,4 @@ let option_faddx = ref false
 let option_fcoalesce_mem = ref true
 let option_fforward_moves = ref true
 let option_all_loads_nontrap = ref false
+let option_inline_auto_threshold = ref 0
diff --git a/driver/Compopts.v b/driver/Compopts.v
index b4b9f30d..848657e5 100644
--- a/driver/Compopts.v
+++ b/driver/Compopts.v
@@ -27,8 +27,7 @@ Parameter generate_float_constants: unit -> bool.
 (** For value analysis.  Currently always false. *)
 Parameter va_strict: unit -> bool.
 
-(** Flag -fduplicate. For tail duplication optimization. Necessary to have
-  * bigger superblocks *)
+(** Flag -fduplicate. Branch prediction annotation + tail duplication *)
 Parameter optim_duplicate: unit -> bool.
 
 (** Flag -ftailcalls.  For tail call optimization. *)
diff --git a/driver/Driver.ml b/driver/Driver.ml
index db71aef9..388482a0 100644
--- a/driver/Driver.ml
+++ b/driver/Driver.ml
@@ -190,6 +190,7 @@ Processing options:
   -Os            Optimize for code size in preference to code speed
   -Obranchless   Optimize to generate fewer conditional branches; try to produce
                  branch-free instruction sequences as much as possible
+  -finline-auto-threshold n   Inline functions under size n
   -ftailcalls    Optimize function calls in tail position [on]
   -fconst-prop   Perform global constant propagation  [on]
   -ffloat-const-prop <n>  Control constant propagation of floats
@@ -200,11 +201,15 @@ Processing options:
   -fpostpass     Perform postpass scheduling (only for K1 architecture) [on]
   -fpostpass= <optim> Perform postpass scheduling with the specified optimization [list]
                    (<optim>=list: list scheduling, <optim>=ilp: ILP, <optim>=greedy: just packing bundles)
-  -fduplicate    Perform tail duplication to form superblocks on predicted traces
+  -fduplicate <nb_nodes> Perform tail duplication to form superblocks on predicted traces
+    nb_nodes control the heuristic deciding to duplicate or not
+    A value of -1 desactivates the entire pass (including branch prediction)
+    A value of 0 desactivates the duplication (but activates the branch prediction)
+    FIXME : this is desactivated by default for now
     -finvertcond    Invert conditions based on predicted paths (to prefer fallthrough).
                     Requires -fduplicate to be also activated [on]
     -ftracelinearize Linearizes based on the traces identified by duplicate phase
-                    It is recommended to also activate -fduplicate with this pass [off]
+                    It is heavily recommended to activate -finvertcond with this pass [off]
   -fforward-moves   Forward moves after CSE
   -finline       Perform inlining of functions [on]
   -finline-functions-called-once Integrate functions only required by their
@@ -318,10 +323,11 @@ let cmdline_actions =
  [
   Exact "-O0", Unit (unset_all optimization_options);
   Exact "-O", Unit (set_all optimization_options);
-  _Regexp "-O1", Self (fun _ -> set_all optimization_options (); option_fpostpass := false; option_fduplicate := false);
+  _Regexp "-O1", Self (fun _ -> set_all optimization_options (); option_fpostpass := false);
   _Regexp "-O[123]$", Unit (set_all optimization_options);
   Exact "-Os", Set option_Osize;
   Exact "-Obranchless", Set option_Obranchless;
+  Exact "-finline-auto-threshold", Integer (fun n -> option_inline_auto_threshold := n);
   Exact "-fsmall-data", Integer(fun n -> option_small_data := n);
   Exact "-fsmall-const", Integer(fun n -> option_small_const := n);
   Exact "-ffloat-const-prop", Integer(fun n -> option_ffloatconstprop := n); 
@@ -393,7 +399,7 @@ let cmdline_actions =
   @ f_opt "cse2" option_fcse2
   @ f_opt "redundancy" option_fredundancy
   @ f_opt "postpass" option_fpostpass
-  @ f_opt "duplicate" option_fduplicate
+  @ [ Exact "-fduplicate", Integer (fun n -> option_fduplicate := n) ]
   @ f_opt "invertcond" option_finvertcond
   @ f_opt "tracelinearize" option_ftracelinearize
   @ f_opt_str "postpass" option_fpostpass option_fpostpass_sched
diff --git a/extraction/extraction.v b/extraction/extraction.v
index 929c21e0..9b568951 100644
--- a/extraction/extraction.v
+++ b/extraction/extraction.v
@@ -106,7 +106,7 @@ Extract Constant Compopts.generate_float_constants =>
 Extract Constant Compopts.optim_tailcalls =>
   "fun _ -> !Clflags.option_ftailcalls".
 Extract Constant Compopts.optim_duplicate =>
-  "fun _ -> !Clflags.option_fduplicate".
+  "fun _ -> (if !Clflags.option_fduplicate = -1 then false else true)".
 Extract Constant Compopts.optim_constprop =>
   "fun _ -> !Clflags.option_fconstprop".
 Extract Constant Compopts.optim_CSE =>
diff --git a/lib/Maps.v b/lib/Maps.v
index 1dec59a2..8de3c892 100644
--- a/lib/Maps.v
+++ b/lib/Maps.v
@@ -116,6 +116,19 @@ Module Type TREE.
     forall (m1: t A) (m2: t B) (i: elt),
     get i (combine f m1 m2) = f (get i m1) (get i m2).
 
+  Parameter combine_null :
+    forall (A B C: Type) (f: A -> B -> option C),
+    t A -> t B -> t C.
+
+  Axiom gcombine_null:
+    forall (A B C: Type) (f: A -> B -> option C),
+    forall (m1: t A) (m2: t B) (i: elt),
+      get i (combine_null f m1 m2) =
+      match (get i m1), (get i m2) with
+      | (Some x1), (Some x2) => f x1 x2
+      | _, _ => None
+      end.
+
   (** Enumerating the bindings of a tree. *)
   Parameter elements:
     forall (A: Type), t A -> list (elt * A).
@@ -151,6 +164,12 @@ Module Type TREE.
     forall (A B: Type) (f: B -> A -> B) (v: B) (m: t A),
     fold1 f m v =
     List.fold_left (fun a p => f a (snd p)) (elements m) v.
+
+  Parameter bempty_canon :
+    forall (A : Type), t A -> bool.
+  Axiom bempty_canon_correct:
+    forall (A : Type) (tr : t A) (i : elt),
+      bempty_canon tr = true -> get i tr = None.
 End TREE.
 
 (** * The abstract signatures of maps *)
@@ -261,6 +280,12 @@ Module PTree <: TREE.
     induction i; simpl; auto.
   Qed.
 
+  Definition bempty_canon (A : Type) (tr : t A) : bool :=
+    match tr with
+    | Leaf => true
+    | _ => false
+    end.
+
   Theorem gss:
     forall (A: Type) (i: positive) (x: A) (m: t A), get i (set i x m) = Some x.
   Proof.
@@ -269,7 +294,16 @@ Module PTree <: TREE.
 
     Lemma gleaf : forall (A : Type) (i : positive), get i (Leaf : t A) = None.
     Proof. exact gempty. Qed.
-
+    
+  Lemma bempty_canon_correct:
+    forall (A : Type) (tr : t A) (i : elt),
+      bempty_canon tr = true -> get i tr = None.
+  Proof.
+    destruct tr; intros.
+    - rewrite gleaf; trivial.
+    - discriminate.
+  Qed.
+  
   Theorem gso:
     forall (A: Type) (i j: positive) (x: A) (m: t A),
     i <> j -> get i (set j x m) = get i m.
@@ -625,7 +659,81 @@ Module PTree <: TREE.
      auto.
   Qed.
 
-    Fixpoint xelements (A : Type) (m : t A) (i : positive)
+  Section COMBINE_NULL.
+
+  Variables A B C: Type.
+  Variable f: A -> B -> option C.
+
+  
+  Fixpoint combine_null (m1: t A) (m2: t B) {struct m1} : t C :=
+    match m1, m2 with
+    | (Node l1 o1 r1), (Node l2 o2 r2) =>
+      Node' (combine_null l1 l2)
+            (match o1, o2 with
+             | (Some x1), (Some x2) => f x1 x2
+             | _, _ => None
+             end)
+            (combine_null r1 r2)
+    | _, _ => Leaf
+    end.
+
+  Theorem gcombine_null:
+      forall (m1: t A) (m2: t B) (i: positive),
+        get i (combine_null m1 m2) =
+        match (get i m1), (get i m2) with
+        | (Some x1), (Some x2) => f x1 x2
+        | _, _ => None
+        end.
+  Proof.
+    induction m1; intros; simpl.
+    - rewrite gleaf. rewrite gleaf.
+      reflexivity.
+    - destruct m2; simpl.
+      + rewrite gleaf. rewrite gleaf.
+        destruct get; reflexivity.
+      + rewrite gnode'.
+        destruct i; simpl; try rewrite IHm1_1; try rewrite IHm1; trivial.
+  Qed.
+
+  End COMBINE_NULL.
+
+  Section REMOVE_TREE.
+
+  Variables A B: Type.
+
+  Fixpoint remove_t (m1: t A) (m2: t B) {struct m1} : t A :=
+    match m1, m2 with
+    | Leaf, _ | _, Leaf => m1
+    | (Node l1 o1 r1), (Node l2 o2 r2) =>
+      Node' (remove_t l1 l2)
+            (match o2 with
+             | Some _ => None
+             | None => o1
+             end)
+            (remove_t r1 r2)
+    end.
+
+  Theorem gremove_t:
+    forall m1 : t A,
+    forall m2 : t B,
+    forall i : positive,
+      get i (remove_t m1 m2) = match get i m2 with
+                               | None => get i m1
+                               | Some _ => None
+                               end.
+  Proof.
+    induction m1; intros; simpl.
+    - rewrite gleaf.
+      destruct get; reflexivity.
+    - destruct m2; simpl.
+      + rewrite gleaf.
+        reflexivity.
+      + rewrite gnode'.
+        destruct i; simpl; try rewrite IHm1_1; try rewrite IHm1; trivial.
+  Qed.      
+  End REMOVE_TREE.
+  
+  Fixpoint xelements (A : Type) (m : t A) (i : positive)
                        (k: list (positive * A)) {struct m}
                        : list (positive * A) :=
       match m with
diff --git a/mppa_k1c/Asmblockdeps.v b/mppa_k1c/Asmblockdeps.v
index 02f9141b..01eda623 100644
--- a/mppa_k1c/Asmblockdeps.v
+++ b/mppa_k1c/Asmblockdeps.v
@@ -339,7 +339,7 @@ Proof.
   }
   destruct (Mem.load _ m1 _ _) in *; destruct (Mem.load _ m0 _ _) in *; congruence.
 Qed.
-             
+
 Definition goto_label_deps (f: function) (lbl: label) (vpc: val) :=
   match label_pos lbl 0 (fn_blocks f) with
   | None => None
@@ -1005,7 +1005,7 @@ Theorem bisimu_par_wio_basic ge fn rsr rsw mr mw sr sw bi:
 Proof.
 
 (* a little tactic to automate reasoning on preg_eq *)
-Local Hint Resolve not_eq_sym ppos_pmem_discr ppos_discr.
+Local Hint Resolve not_eq_sym ppos_pmem_discr ppos_discr: core.
 Local Ltac preg_eq_discr r rd :=
   destruct (preg_eq r rd); try (subst r; rewrite assign_eq, Pregmap.gss; auto);
   rewrite (assign_diff _ (#rd) (#r) _); auto;
@@ -1053,7 +1053,7 @@ Local Ltac preg_eq_discr r rd :=
         preg_eq_discr r rd0. }
 
     (* Load Octuple word *)
-    + Local Hint Resolve not_eq_sym ppos_pmem_discr ppos_discr.
+    + Local Hint Resolve not_eq_sym ppos_pmem_discr ppos_discr: core.
       unfold parexec_load_o_offset.
       destruct (gpreg_o_expand rd) as [[[rd0 rd1] rd2] rd3]; destruct Ge; simpl.
       rewrite H0, H.
@@ -1423,7 +1423,7 @@ Section SECT_BBLOCK_EQUIV.
 
 Variable Ge: genv.
 
-Local Hint Resolve trans_state_match.
+Local Hint Resolve trans_state_match: core.
 
 Lemma bblock_simu_reduce:
   forall p1 p2 ge fn,
diff --git a/mppa_k1c/Asmblockgen.v b/mppa_k1c/Asmblockgen.v
index 50637723..36269954 100644
--- a/mppa_k1c/Asmblockgen.v
+++ b/mppa_k1c/Asmblockgen.v
@@ -28,6 +28,8 @@ Require Import Chunks.
 Local Open Scope string_scope.
 Local Open Scope error_monad_scope.
 
+Import PArithCoercions.
+
 (** The code generation functions take advantage of several
   characteristics of the [Mach] code generated by earlier passes of the
   compiler, mostly that argument and result registers are of the correct
diff --git a/mppa_k1c/Asmblockgenproof1.v b/mppa_k1c/Asmblockgenproof1.v
index ecb4629b..00df01e3 100644
--- a/mppa_k1c/Asmblockgenproof1.v
+++ b/mppa_k1c/Asmblockgenproof1.v
@@ -23,6 +23,8 @@ Require Import Op Locations Machblock Conventions.
 Require Import Asmblock Asmblockgen Asmblockgenproof0 Asmblockprops.
 Require Import Chunks.
 
+Import PArithCoercions.
+
 (** Decomposition of integer constants. *)
 
 Lemma make_immed32_sound:
@@ -859,7 +861,7 @@ Proof.
     destruct cmp; discriminate.
 Qed.
 
-Local Hint Resolve Val_cmpu_bool_correct Val_cmplu_bool_correct.
+Local Hint Resolve Val_cmpu_bool_correct Val_cmplu_bool_correct: core.
 
 Lemma transl_cbranch_correct_1:
   forall cond args lbl k c m ms b sp rs m' tbb,
@@ -1163,7 +1165,7 @@ Proof.
   split; intros; Simpl.
 Qed.
 
-Local Hint Resolve Val_cmpu_correct Val_cmplu_correct.
+Local Hint Resolve Val_cmpu_correct Val_cmplu_correct: core.
 
 Lemma transl_condimm_int32u_correct:
   forall cmp rd r1 n k rs m,
@@ -1481,6 +1483,8 @@ Proof.
   destruct (Z_lt_dec _ _); destruct (Z.eq_dec _ _); trivial; omega.
 Qed.
 
+Ltac splitall := repeat match goal with |- _ /\ _ => split end.
+
 Lemma transl_op_correct:
   forall op args res k (rs: regset) m v c,
   transl_op op args res k = OK c ->
@@ -1515,21 +1519,21 @@ Opaque Int.eq.
 - (* Ocast8signed *)
   econstructor; split.
   eapply exec_straight_two. simpl;eauto. simpl;eauto.
-  split; intros; simpl; Simpl.
+  repeat split; intros; simpl; Simpl.
   assert (A: Int.ltu (Int.repr 24) Int.iwordsize = true) by auto.
   destruct (rs x0); auto; simpl. rewrite A; simpl. Simpl. unfold Val.shr. rewrite A.
   apply Val.lessdef_same. f_equal. apply Int.sign_ext_shr_shl. split; reflexivity.
 - (* Ocast16signed *)
   econstructor; split.
   eapply exec_straight_two. simpl;eauto. simpl;eauto.
-  split; intros; Simpl.
+  repeat split; intros; Simpl.
   assert (A: Int.ltu (Int.repr 16) Int.iwordsize = true) by auto.
   destruct (rs x0); auto; simpl. rewrite A; simpl. Simpl. unfold Val.shr. rewrite A. 
   apply Val.lessdef_same. f_equal. apply Int.sign_ext_shr_shl. split; reflexivity.
 - (* Oshrximm *)
   econstructor; split.
   + apply exec_straight_one. simpl. eauto.
-  + split.
+  + repeat split.
     * rewrite Pregmap.gss.
       subst v.
       destruct (rs x0); simpl; trivial.
@@ -1540,7 +1544,7 @@ Opaque Int.eq.
 - (* Oshrxlimm *)
   econstructor; split.
   + apply exec_straight_one. simpl. eauto.
-  + split.
+  + repeat split.
     * rewrite Pregmap.gss.
       subst v.
       destruct (rs x0); simpl; trivial.
@@ -1551,7 +1555,7 @@ Opaque Int.eq.
       
 - (* Ocmp *)
   exploit transl_cond_op_correct; eauto. intros (rs' & A & B & C).
-  exists rs'; split. eexact A. eauto with asmgen.
+  exists rs'; repeat split; eauto with asmgen.
   
 - (* Osel *)
   unfold conditional_move in *.
@@ -1570,72 +1574,73 @@ Opaque Int.eq.
 
   destruct c0; simpl in *.
 
-  all:
-    destruct c; simpl in *; inv EQ2;
-    econstructor; split; try (apply exec_straight_one; constructor);
-    split; try (simpl; intros; rewrite Pregmap.gso; trivial; assumption);
-    unfold Val.select; simpl;
-    unfold cmove, cmoveu;   
-    rewrite Pregmap.gss;
-    destruct (rs x1); simpl; trivial;
-    try rewrite int_ltu_to_neq;
-    try rewrite int64_ltu_to_neq;
-    try change (Int64.eq Int64.zero Int64.zero) with true;
-    try destruct Archi.ptr64;
-    repeat rewrite if_neg;
-    simpl;
-    trivial;
-    try destruct (_ || _);
-    trivial;
-    try apply Val.lessdef_normalize.
+    all: destruct c.
+    all: simpl in *.
+    all: inv EQ2.
+    all: econstructor; splitall.
+    all: try apply exec_straight_one.
+    all: intros; simpl; trivial.
+    all: unfold Val.select, cmove, cmoveu; simpl.
+    all: destruct (rs x1); simpl; trivial.
+    all: try rewrite int_ltu_to_neq.
+    all: try rewrite int64_ltu_to_neq.
+    all: try change (Int64.eq Int64.zero Int64.zero) with true.
+    all: try destruct Archi.ptr64.
+    all: try rewrite Pregmap.gss.
+    all: repeat rewrite if_neg.
+    all: simpl.
+    all: try destruct (_ || _).
+    all: try apply Val.lessdef_normalize.
+    all: trivial. (* no more lessdef *)
+    all: apply Pregmap.gso; congruence.
 
 - (* Oselimm *)
   unfold conditional_move_imm32 in *.
   destruct c0; simpl in *.
 
-  all:
-    destruct c; simpl in *; inv EQ0;
-    econstructor; split; try (apply exec_straight_one; constructor);
-    split; try (simpl; intros; rewrite Pregmap.gso; trivial; assumption);
-    unfold Val.select; simpl;
-    unfold cmove, cmoveu;   
-    rewrite Pregmap.gss;
-    destruct (rs x0); simpl; trivial;
-    try rewrite int_ltu_to_neq;
-    try rewrite int64_ltu_to_neq;
-    try change (Int64.eq Int64.zero Int64.zero) with true;
-    try destruct Archi.ptr64;
-    repeat rewrite if_neg;
-    simpl;
-    trivial;
-    try destruct (_ || _);
-    trivial;
-    try apply Val.lessdef_normalize.
-
+    all: destruct c.
+    all: simpl in *.
+    all: inv EQ0.
+    all: econstructor; splitall.
+    all: try apply exec_straight_one.
+    all: intros; simpl; trivial.
+    all: unfold Val.select, cmove, cmoveu; simpl.
+    all: destruct (rs x0); simpl; trivial.
+    all: try rewrite int_ltu_to_neq.
+    all: try rewrite int64_ltu_to_neq.
+    all: try change (Int64.eq Int64.zero Int64.zero) with true.
+    all: try destruct Archi.ptr64.
+    all: try rewrite Pregmap.gss.
+    all: repeat rewrite if_neg.
+    all: simpl.
+    all: try destruct (_ || _).
+    all: try apply Val.lessdef_normalize.
+    all: trivial. (* no more lessdef *)
+    all: apply Pregmap.gso; congruence.
 
 - (* Osellimm *)
   unfold conditional_move_imm64 in *.
   destruct c0; simpl in *.
 
-  all:
-    destruct c; simpl in *; inv EQ0;
-    econstructor; split; try (apply exec_straight_one; constructor);
-    split; try (simpl; intros; rewrite Pregmap.gso; trivial; assumption);
-    unfold Val.select; simpl;
-    unfold cmove, cmoveu;   
-    rewrite Pregmap.gss;
-    destruct (rs x0); simpl; trivial;
-    try rewrite int_ltu_to_neq;
-    try rewrite int64_ltu_to_neq;
-    try change (Int64.eq Int64.zero Int64.zero) with true;
-    try destruct Archi.ptr64;
-    repeat rewrite if_neg;
-    simpl;
-    trivial;
-    try destruct (_ || _);
-    trivial;
-    try apply Val.lessdef_normalize.
-
+    all: destruct c.
+    all: simpl in *.
+    all: inv EQ0.
+    all: econstructor; splitall.
+    all: try apply exec_straight_one.
+    all: intros; simpl; trivial.
+    all: unfold Val.select, cmove, cmoveu; simpl.
+    all: destruct (rs x0); simpl; trivial.
+    all: try rewrite int_ltu_to_neq.
+    all: try rewrite int64_ltu_to_neq.
+    all: try change (Int64.eq Int64.zero Int64.zero) with true.
+    all: try destruct Archi.ptr64.
+    all: try rewrite Pregmap.gss.
+    all: repeat rewrite if_neg.
+    all: simpl.
+    all: try destruct (_ || _).
+    all: try apply Val.lessdef_normalize.
+    all: trivial. (* no more lessdef *)
+    all: apply Pregmap.gso; congruence.
 Qed.
 
 (** Memory accesses *)
diff --git a/mppa_k1c/Asmvliw.v b/mppa_k1c/Asmvliw.v
index e042d95a..946007c1 100644
--- a/mppa_k1c/Asmvliw.v
+++ b/mppa_k1c/Asmvliw.v
@@ -555,6 +555,8 @@ Inductive ar_instruction : Type :=
   | PArithARRI64 (i: arith_name_arri64) (rd rs: ireg) (imm: int64)
 .
 
+Module PArithCoercions.
+
 Coercion PArithR:       arith_name_r        >-> Funclass.
 Coercion PArithRR:      arith_name_rr       >-> Funclass.
 Coercion PArithRI32:    arith_name_ri32     >-> Funclass.
@@ -569,6 +571,8 @@ Coercion PArithARR:     arith_name_arr      >-> Funclass.
 Coercion PArithARRI32:   arith_name_arri32    >-> Funclass.
 Coercion PArithARRI64:   arith_name_arri64    >-> Funclass.
 
+End PArithCoercions.
+
 Inductive basic : Type :=
   | PArith          (i: ar_instruction)
   | PLoad           (i: ld_instruction)
@@ -1709,7 +1713,7 @@ Proof.
 Qed.
 
 
-Local Hint Resolve parexec_bblock_write_in_order.
+Local Hint Resolve parexec_bblock_write_in_order: core.
 
 Lemma det_parexec_write_in_order f b rs m rs' m':
    det_parexec f b rs m rs' m' -> parexec_wio_bblock f b rs m = Next rs' m'.
diff --git a/mppa_k1c/CSE2deps.v b/mppa_k1c/CSE2deps.v
new file mode 100644
index 00000000..8ab9242a
--- /dev/null
+++ b/mppa_k1c/CSE2deps.v
@@ -0,0 +1,20 @@
+Require Import BoolEqual Coqlib.
+Require Import AST Integers Floats.
+Require Import Values Memory Globalenvs Events.
+Require Import Op.
+
+
+Definition can_swap_accesses_ofs ofsr chunkr ofsw chunkw :=
+     (0 <=? ofsw) && (ofsw <=? (Ptrofs.modulus - largest_size_chunk))
+  && (0 <=? ofsr) && (ofsr <=? (Ptrofs.modulus - largest_size_chunk))
+  && ((ofsw + size_chunk chunkw <=? ofsr) ||
+      (ofsr + size_chunk chunkr <=? ofsw)).
+
+Definition may_overlap chunk addr args chunk' addr' args' :=
+  match addr, addr', args, args' with
+  | (Aindexed ofs), (Aindexed ofs'),
+    (base :: nil), (base' :: nil) =>
+    if peq base base'
+    then negb (can_swap_accesses_ofs (Ptrofs.unsigned ofs') chunk' (Ptrofs.unsigned ofs) chunk)
+    else true  | _, _, _, _ => true
+  end.
diff --git a/mppa_k1c/CSE2depsproof.v b/mppa_k1c/CSE2depsproof.v
new file mode 100644
index 00000000..a3811e78
--- /dev/null
+++ b/mppa_k1c/CSE2depsproof.v
@@ -0,0 +1,127 @@
+Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
+Require Import AST Linking.
+Require Import Memory Registers Op RTL Maps.
+
+Require Import Globalenvs Values.
+Require Import Linking Values Memory Globalenvs Events Smallstep.
+Require Import Registers Op RTL.
+Require Import CSE2 CSE2deps.
+Require Import Lia.
+
+Lemma ptrofs_size :
+  Ptrofs.wordsize = (if Archi.ptr64 then 64 else 32)%nat.
+Proof.
+  unfold Ptrofs.wordsize.
+  unfold Wordsize_Ptrofs.wordsize.
+  trivial.
+Qed.
+
+Lemma ptrofs_modulus :
+  Ptrofs.modulus = if Archi.ptr64 then 18446744073709551616 else 4294967296.
+Proof.
+  unfold Ptrofs.modulus.
+  rewrite ptrofs_size.
+  destruct Archi.ptr64; reflexivity.
+Qed.
+
+Section SOUNDNESS.
+  Variable F V : Type.
+  Variable genv: Genv.t F V.
+  Variable sp : val.
+
+Section MEMORY_WRITE.
+  Variable m m2 : mem.
+  Variable chunkw chunkr : memory_chunk.
+  Variable base : val.
+  
+  Variable addrw addrr valw : val.
+  Hypothesis STORE : Mem.storev chunkw m addrw valw = Some m2.
+
+  Section INDEXED_AWAY.
+  Variable ofsw ofsr : ptrofs.
+  Hypothesis ADDRW : eval_addressing genv sp
+                       (Aindexed ofsw) (base :: nil) = Some addrw. 
+  Hypothesis ADDRR : eval_addressing genv sp
+                       (Aindexed ofsr) (base :: nil) = Some addrr.
+
+  Lemma load_store_away1 :
+    forall RANGEW : 0 <= Ptrofs.unsigned ofsw <= Ptrofs.modulus - largest_size_chunk,
+    forall RANGER : 0 <= Ptrofs.unsigned ofsr <= Ptrofs.modulus - largest_size_chunk,
+    forall SWAPPABLE :    Ptrofs.unsigned ofsw + size_chunk chunkw <= Ptrofs.unsigned ofsr
+                          \/ Ptrofs.unsigned ofsr + size_chunk chunkr <= Ptrofs.unsigned ofsw,
+    Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+
+  Proof.
+    intros.
+    
+    pose proof (max_size_chunk chunkr) as size_chunkr_bounded.
+    pose proof (max_size_chunk chunkw) as size_chunkw_bounded.
+    unfold largest_size_chunk in *.
+
+    rewrite ptrofs_modulus in *.
+    simpl in *.
+    inv ADDRR.
+    inv ADDRW.
+    destruct base; try discriminate.
+    eapply Mem.load_store_other with (chunk := chunkw) (v := valw) (b := b).
+    exact STORE.
+    right.
+
+    all: try (destruct (Ptrofs.unsigned_add_either i ofsr) as [OFSR | OFSR];
+              rewrite OFSR).
+    all: try (destruct (Ptrofs.unsigned_add_either i ofsw) as [OFSW | OFSW];
+              rewrite OFSW).
+    all: try rewrite ptrofs_modulus in *.
+    all: destruct Archi.ptr64.
+
+    all: intuition lia.
+  Qed.
+  
+  Theorem load_store_away :
+    can_swap_accesses_ofs (Ptrofs.unsigned ofsr) chunkr (Ptrofs.unsigned ofsw) chunkw = true ->
+    Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+  Proof.
+    intro SWAP.
+    unfold can_swap_accesses_ofs in SWAP.
+    repeat rewrite andb_true_iff in SWAP.
+    repeat rewrite orb_true_iff in SWAP.
+    repeat rewrite Z.leb_le in SWAP.
+    apply load_store_away1.
+    all: tauto.
+  Qed.
+  End INDEXED_AWAY.
+End MEMORY_WRITE.
+End SOUNDNESS.
+
+
+Section SOUNDNESS.
+  Variable F V : Type.
+  Variable genv: Genv.t F V.
+  Variable sp : val.
+
+Lemma may_overlap_sound:
+  forall m m' : mem,
+  forall chunk addr args chunk' addr' args' v a a' rs,
+    (eval_addressing genv sp addr (rs ## args)) = Some a ->
+    (eval_addressing genv sp addr' (rs ## args')) = Some a' ->
+    (may_overlap chunk addr args chunk' addr' args') = false ->
+    (Mem.storev chunk m a v) = Some m' ->
+    (Mem.loadv chunk' m' a') = (Mem.loadv chunk' m  a').
+Proof.
+  intros until rs.
+  intros ADDR ADDR' OVERLAP STORE.
+  destruct addr; destruct addr'; try discriminate.
+  { (* Aindexed / Aindexed *)
+  destruct args as [ | base [ | ]]. 1,3: discriminate.
+  destruct args' as [ | base' [ | ]]. 1,3: discriminate.
+  simpl in OVERLAP.
+  destruct (peq base base'). 2: discriminate.
+  subst base'.
+  destruct (can_swap_accesses_ofs (Ptrofs.unsigned i0) chunk' (Ptrofs.unsigned i) chunk) eqn:SWAP.
+  2: discriminate.
+  simpl in *.
+  eapply load_store_away with (F:=F) (V:=V) (genv:=genv) (sp:=sp); eassumption.
+  }
+Qed.
+
+End SOUNDNESS.
diff --git a/mppa_k1c/DuplicateOpcodeHeuristic.ml b/mppa_k1c/DuplicateOpcodeHeuristic.ml
index 690553ce..2ec314c1 100644
--- a/mppa_k1c/DuplicateOpcodeHeuristic.ml
+++ b/mppa_k1c/DuplicateOpcodeHeuristic.ml
@@ -2,10 +2,8 @@
 open Op
 open Integers
 
-exception HeuristicSucceeded
-
-let opcode_heuristic code cond ifso ifnot preferred =
-  let decision = match cond with
+let opcode_heuristic code cond ifso ifnot is_loop_header =
+  match cond with
   | Ccompimm (c, n) | Ccompuimm (c, n) -> if n == Integers.Int.zero then (match c with
       | Clt | Cle -> Some false
       | Cgt | Cge -> Some true
@@ -27,6 +25,3 @@ let opcode_heuristic code cond ifso ifnot preferred =
       | _ -> None
       )
   | _ -> None
-  in match decision with
-  | Some b -> (preferred := b; raise HeuristicSucceeded)
-  | None -> ()
diff --git a/mppa_k1c/PostpassSchedulingproof.v b/mppa_k1c/PostpassSchedulingproof.v
index fbb06c9b..8cc7f0ab 100644
--- a/mppa_k1c/PostpassSchedulingproof.v
+++ b/mppa_k1c/PostpassSchedulingproof.v
@@ -61,9 +61,9 @@ Proof.
   - subst. repeat (rewrite Pregmap.gss); auto.
     destruct v; simpl; auto.
     rewrite Ptrofs.add_assoc.
-    cutrewrite (Ptrofs.repr (x + y) = Ptrofs.add (Ptrofs.repr x) (Ptrofs.repr y)); auto.
+    enough (Ptrofs.repr (x + y) = Ptrofs.add (Ptrofs.repr x) (Ptrofs.repr y)) as ->; auto.
     unfold Ptrofs.add.
-    cutrewrite (x + y = Ptrofs.unsigned (Ptrofs.repr x) + Ptrofs.unsigned (Ptrofs.repr y)); auto.
+    enough (x + y = Ptrofs.unsigned (Ptrofs.repr x) + Ptrofs.unsigned (Ptrofs.repr y)) as ->; auto.
     repeat (rewrite Ptrofs.unsigned_repr); auto.
   - repeat (rewrite Pregmap.gso; auto).
 Qed.
@@ -220,7 +220,8 @@ Proof.
     destruct (zeq pos 0).
     + inv H. exists lbb. constructor; auto.
     + apply IHlbb in H. destruct H as (c & TAIL). exists c.
-      cutrewrite (pos = pos - size a + size a). apply code_tail_S; auto.
+      enough (pos = pos - size a + size a) as ->.
+      apply code_tail_S; auto.
       omega.
 Qed.
 
@@ -567,7 +568,7 @@ Proof.
   unfold builtin_alone in H0. erewrite H0; eauto.
 Qed.
 
-Local Hint Resolve verified_schedule_nob_checks_alls_bundles.
+Local Hint Resolve verified_schedule_nob_checks_alls_bundles: core.
 
 Lemma verified_schedule_checks_alls_bundles bb lb bundle:
   verified_schedule bb = OK lb ->
diff --git a/mppa_k1c/abstractbb/AbstractBasicBlocksDef.v b/mppa_k1c/abstractbb/AbstractBasicBlocksDef.v
index 5c94d435..cf46072f 100644
--- a/mppa_k1c/abstractbb/AbstractBasicBlocksDef.v
+++ b/mppa_k1c/abstractbb/AbstractBasicBlocksDef.v
@@ -403,7 +403,7 @@ Proof.
     * eapply H2; eauto. intros; eapply H0; eauto. rewrite rev_append_rev, in_app_iff; auto.
     * intros; eapply H0; eauto. rewrite rev_append_rev, in_app_iff, <- in_rev; auto.
 Qed.
-Local Hint Resolve app_fail_allvalid_correct.
+Local Hint Resolve app_fail_allvalid_correct: core.
 
 Lemma app_fail_correct l pt t1 t2: 
   match_pt t1 pt -> 
diff --git a/mppa_k1c/abstractbb/ImpSimuTest.v b/mppa_k1c/abstractbb/ImpSimuTest.v
index ea55b735..7a77ec15 100644
--- a/mppa_k1c/abstractbb/ImpSimuTest.v
+++ b/mppa_k1c/abstractbb/ImpSimuTest.v
@@ -304,12 +304,12 @@ Proof.
       rewrite <- EQT; eauto.
     + exploit smem_valid_set_decompose_1; eauto.
   - clear DM0. unfold hsmem_post_eval, hsmem_post_eval in * |- *; simpl.
-    Local Hint Resolve smem_valid_set_decompose_1.
+    Local Hint Resolve smem_valid_set_decompose_1: core.
     intros; case (R.eq_dec x x0).
     + intros; subst; rewrite !Dict.set_spec_eq; simpl; eauto.
     + intros; rewrite !Dict.set_spec_diff; simpl; eauto.
 Qed.
-Local Hint Resolve naive_set_correct.
+Local Hint Resolve naive_set_correct: core.
 
 Definition equiv_hsmem ge (hd1 hd2: hsmem) := 
       (forall m, allvalid ge hd1.(hpre) m <-> allvalid ge hd2.(hpre) m)
@@ -523,7 +523,7 @@ Lemma hinst_smem_correct i: forall hd hod,
   WHEN hinst_smem i hd hod ~> hd' THEN
     forall ge od d, smem_model ge od hod -> smem_model ge d hd -> (forall m, smem_valid ge d m -> smem_valid ge od m) -> smem_model ge (inst_smem i d od) hd'.
 Proof.
-  Local Hint Resolve smem_valid_set_proof.
+  Local Hint Resolve smem_valid_set_proof: core.
   induction i; simpl; wlp_simplify; eauto 15 with wlp.
 Qed.
 Global Opaque hinst_smem.
@@ -563,7 +563,7 @@ Definition bblock_hsmem: bblock -> ?? hsmem
 Lemma bblock_hsmem_correct p:
   WHEN bblock_hsmem p ~> hd THEN forall ge, smem_model ge (bblock_smem p) hd.
 Proof.
-  Local Hint Resolve hsmem_empty_correct.
+  Local Hint Resolve hsmem_empty_correct: core.
   wlp_simplify.
 Qed.
 Global Opaque bblock_hsmem.
@@ -775,7 +775,7 @@ Proof.
   intro H; erewrite <- list_term_eval_set_hid; rewrite H. apply list_term_eval_set_hid.
 Qed.
 
-Local Hint Resolve term_eval_set_hid_equiv list_term_eval_set_hid_equiv.
+Local Hint Resolve term_eval_set_hid_equiv list_term_eval_set_hid_equiv: core.
 
 Program Definition bblock_simu_test (p1 p2: bblock): ?? bool :=
   DO log <~ count_logger ();;
@@ -802,7 +802,7 @@ Obligation 2.
   wlp_simplify.
 Qed.
 
-Local Hint Resolve g_bblock_simu_test_correct.
+Local Hint Resolve g_bblock_simu_test_correct: core.
 
 Theorem bblock_simu_test_correct p1 p2:
   WHEN bblock_simu_test p1 p2 ~> b THEN b=true -> forall ge, bblock_simu ge p1 p2.
@@ -1123,7 +1123,7 @@ Definition get {A} (d:t A) (x:R.t): option A
 Definition set {A} (d:t A) (x:R.t) (v:A): t A
  := PositiveMap.add x v d.
 
-Local Hint Unfold PositiveMap.E.eq.
+Local Hint Unfold PositiveMap.E.eq: core.
 
 Lemma set_spec_eq A d x (v: A):
   get (set d x v) x = Some v.
diff --git a/mppa_k1c/abstractbb/Impure/ImpHCons.v b/mppa_k1c/abstractbb/Impure/ImpHCons.v
index d8002375..637116cc 100644
--- a/mppa_k1c/abstractbb/Impure/ImpHCons.v
+++ b/mppa_k1c/abstractbb/Impure/ImpHCons.v
@@ -95,7 +95,7 @@ Proof.
   wlp_simplify.
 Qed.
 Global Opaque assert_list_incl.
-Hint Resolve assert_list_incl_correct.
+Hint Resolve assert_list_incl_correct: wlp.
 
 End Sets.
 
@@ -165,7 +165,7 @@ Lemma hConsV_correct A (hasheq: A -> A -> ?? bool):
     (forall x y, WHEN hasheq x y ~> b THEN b=true -> x=y) -> 
     forall x, WHEN hco.(hC) x ~> x' THEN x.(hdata).(data)=x'.(data).
 Proof.
-  Local Hint Resolve f_equal2.
+  Local Hint Resolve f_equal2: core.
   wlp_simplify.
   exploit H; eauto.
   + wlp_simplify.
diff --git a/mppa_k1c/abstractbb/Parallelizability.v b/mppa_k1c/abstractbb/Parallelizability.v
index 22809095..30904b5d 100644
--- a/mppa_k1c/abstractbb/Parallelizability.v
+++ b/mppa_k1c/abstractbb/Parallelizability.v
@@ -332,7 +332,7 @@ Fixpoint bblock_wframe(p:bblock): list R.t :=
   | i::p' => (inst_wframe i)++(bblock_wframe p') 
   end.
 
-Local Hint Resolve Permutation_app_head Permutation_app_tail Permutation_app_comm.
+Local Hint Resolve Permutation_app_head Permutation_app_tail Permutation_app_comm: core.
 
 Lemma bblock_wframe_Permutation p p': 
  Permutation p p' -> Permutation (bblock_wframe p)  (bblock_wframe p').
@@ -620,7 +620,7 @@ Include ParallelizablityChecking L.
 Section PARALLEL2.
 Variable ge: genv.
 
-Local Hint Resolve S.empty_match_frame S.add_match_frame S.union_match_frame S.is_disjoint_match_frame.
+Local Hint Resolve S.empty_match_frame S.add_match_frame S.union_match_frame S.is_disjoint_match_frame: core.
 
 (** Now, refinement of each operation toward parallelizable *)
 
@@ -659,14 +659,14 @@ Fixpoint inst_sframe (i: inst): S.t :=
   | a::i' => S.add (fst a) (S.union (exp_sframe (snd a)) (inst_sframe i'))
   end.
 
-Local Hint Resolve exp_sframe_correct.
+Local Hint Resolve exp_sframe_correct: core.
 
 Lemma inst_sframe_correct i: S.match_frame (inst_sframe i) (inst_frame i).
 Proof.
   induction i as [|[y e] i']; simpl; auto.
 Qed.
 
-Local Hint Resolve inst_wsframe_correct inst_sframe_correct.
+Local Hint Resolve inst_wsframe_correct inst_sframe_correct: core.
 
 Fixpoint is_pararec (p: bblock) (wsframe: S.t): bool :=
   match p with
diff --git a/mppa_k1c/abstractbb/SeqSimuTheory.v b/mppa_k1c/abstractbb/SeqSimuTheory.v
index 649dd083..e234883f 100644
--- a/mppa_k1c/abstractbb/SeqSimuTheory.v
+++ b/mppa_k1c/abstractbb/SeqSimuTheory.v
@@ -102,9 +102,6 @@ Fixpoint bblock_smem_rec (p: bblock) (d: smem): smem :=
      let d':=inst_smem i d d in
      bblock_smem_rec p' d'
   end.
-(*
-Local Hint Resolve smem_eval_empty.
-*)
 
 Definition bblock_smem: bblock -> smem
  := fun p => bblock_smem_rec p smem_empty.
@@ -124,7 +121,7 @@ Proof.
   intros d a H; eapply inst_smem_pre_monotonic; eauto.
 Qed.
 
-Local Hint Resolve inst_smem_pre_monotonic bblock_smem_pre_monotonic.
+Local Hint Resolve inst_smem_pre_monotonic bblock_smem_pre_monotonic: core.
 
 Lemma term_eval_exp e (od:smem) m0 old:
   (forall x, term_eval ge (od x) m0 = Some (old x)) ->
@@ -185,7 +182,7 @@ Lemma bblocks_smem_rec_Some_correct1 p m0: forall (m1 m2: mem) (d: smem),
   (forall x, term_eval ge (d x) m0 = Some (m1 x)) ->
   forall x, term_eval ge (bblock_smem_rec p d x) m0 = Some (m2 x).
 Proof.
-  Local Hint Resolve inst_smem_Some_correct1.
+  Local Hint Resolve inst_smem_Some_correct1: core.
   induction p as [ | i p]; simpl; intros m1 m2 d H.
   - inversion_clear H; eauto.
   - intros H0 x0.
@@ -299,7 +296,7 @@ Lemma block_smem_rec_valid p m0: forall (m1 m2: mem) (d:smem),
   (forall x, term_eval ge (d x) m0 = Some (m1 x)) ->
   pre (bblock_smem_rec p d) ge m0.
 Proof.
-  Local Hint Resolve inst_valid.
+  Local Hint Resolve inst_valid: core.
   induction p as [ | i p]; simpl; intros m1 d H; auto.
   intros H0 H1.
   destruct (inst_run ge i m1 m1) eqn: Heqov; eauto.
@@ -326,7 +323,7 @@ Theorem bblock_smem_simu p1 p2:
    smem_simu (bblock_smem p1) (bblock_smem p2) ->
    bblock_simu ge p1 p2.
 Proof.
-  Local Hint Resolve bblock_smem_valid bblock_smem_Some_correct1.
+  Local Hint Resolve bblock_smem_valid bblock_smem_Some_correct1: core.
   intros (INCL & EQUIV) m DONTFAIL; unfold smem_valid in * |-.
   destruct (run ge p1 m) as [m1|] eqn: RUN1; simpl; try congruence.
   assert (X: forall x, term_eval ge (bblock_smem p1 x) m = Some (m1 x)); eauto.
diff --git a/mppa_k1c/lib/Asmblockgenproof0.v b/mppa_k1c/lib/Asmblockgenproof0.v
index 940c6563..58455ada 100644
--- a/mppa_k1c/lib/Asmblockgenproof0.v
+++ b/mppa_k1c/lib/Asmblockgenproof0.v
@@ -414,7 +414,7 @@ Proof.
 Qed.
 
 
-Local Hint Resolve code_tail_0 code_tail_S.
+Local Hint Resolve code_tail_0 code_tail_S: core.
 
 Lemma code_tail_next:
   forall fn ofs c0,
@@ -458,7 +458,7 @@ Proof.
   omega.
 Qed.
 
-Local Hint Resolve code_tail_next.
+Local Hint Resolve code_tail_next: core.
 
 Lemma code_tail_next_int:
   forall fn ofs bi c,
diff --git a/mppa_k1c/lib/ForwardSimulationBlock.v b/mppa_k1c/lib/ForwardSimulationBlock.v
index 39dd2234..224eda0a 100644
--- a/mppa_k1c/lib/ForwardSimulationBlock.v
+++ b/mppa_k1c/lib/ForwardSimulationBlock.v
@@ -21,7 +21,7 @@ Section starN_lemma.
 
 Variable L: semantics.
 
-Local Hint Resolve starN_refl starN_step Eapp_assoc.
+Local Hint Resolve starN_refl starN_step Eapp_assoc: core.
 
 Lemma starN_split n s t s':
   starN (step L) (globalenv L) n s t s' ->
@@ -93,7 +93,7 @@ Hypothesis simu_end_block:
 
 (** Introduction d'une sémantique par bloc sur L1 appelée "memoL1" *)
 
-Local Hint Resolve starN_refl starN_step.
+Local Hint Resolve starN_refl starN_step: core.
 
 Definition follows_in_block (head current: state L1): Prop :=
   dist_end_block head >= dist_end_block current 
@@ -164,7 +164,7 @@ Inductive is_well_memorized (s s': memostate): Prop :=
     memorized s' = None ->
     is_well_memorized s s'.
 
-Local Hint Resolve StartBloc MidBloc ExitBloc.
+Local Hint Resolve StartBloc MidBloc ExitBloc: core.
 
 Definition memoL1 := {| 
   state := memostate;
diff --git a/mppa_k1c/lib/Machblockgen.v b/mppa_k1c/lib/Machblockgen.v
index a65b218f..2ba42814 100644
--- a/mppa_k1c/lib/Machblockgen.v
+++ b/mppa_k1c/lib/Machblockgen.v
@@ -105,7 +105,7 @@ Inductive is_end_block: Machblock_inst -> code -> Prop :=
   | End_basic bi bh bl: header bh <> nil -> is_end_block (MB_basic bi) (bh::bl)
   | End_cfi cfi bl: bl <> nil -> is_end_block (MB_cfi cfi) bl. 
 
-Local Hint Resolve End_empty End_basic End_cfi.
+Local Hint Resolve End_empty End_basic End_cfi: core.
 
 Inductive is_trans_code: Mach.code -> code -> Prop :=
   | Tr_nil: is_trans_code nil nil
@@ -123,7 +123,7 @@ Inductive is_trans_code: Mach.code -> code -> Prop :=
       header bh = nil ->
       is_trans_code (i::c) (add_basic bi bh::bl).
 
-Local Hint Resolve Tr_nil Tr_end_block.
+Local Hint Resolve Tr_nil Tr_end_block: core.
 
 Lemma add_to_code_is_trans_code i c bl:
   is_trans_code c bl ->
@@ -145,7 +145,7 @@ Proof.
       rewrite <- Heqti. eapply End_cfi. congruence.
 Qed.
 
-Local Hint Resolve add_to_code_is_trans_code.
+Local Hint Resolve add_to_code_is_trans_code: core.
 
 Lemma trans_code_is_trans_code_rev c1: forall c2 mbi, 
   is_trans_code c2 mbi ->
@@ -185,7 +185,7 @@ Proof.
   exists mbi1. split; congruence.
 Qed.
 
-Local Hint Resolve trans_code_is_trans_code.
+Local Hint Resolve trans_code_is_trans_code: core.
 
 Theorem is_trans_code_inv c bl: is_trans_code c bl <-> bl=(trans_code c).
 Proof.
diff --git a/mppa_k1c/lib/Machblockgenproof.v b/mppa_k1c/lib/Machblockgenproof.v
index 91be5e2e..0de2df52 100644
--- a/mppa_k1c/lib/Machblockgenproof.v
+++ b/mppa_k1c/lib/Machblockgenproof.v
@@ -72,7 +72,7 @@ Proof.
   apply match_states_trans_state.
 Qed.
 
-Local Hint Resolve match_states_trans_state.
+Local Hint Resolve match_states_trans_state: core.
 
 Lemma symbols_preserved:
   forall (s: ident), Genv.find_symbol tge s = Genv.find_symbol ge s.
@@ -284,7 +284,7 @@ Proof.
 Qed.
 
 Local Hint Resolve symbols_preserved senv_preserved init_mem_preserved prog_main_preserved functions_translated
-                   parent_sp_preserved.
+                   parent_sp_preserved: core.
 
 
 Definition dist_end_block_code (c: Mach.code) := 
@@ -299,8 +299,8 @@ Definition dist_end_block (s: Mach.state): nat :=
   | _ => 0
   end.
 
-Local Hint Resolve exec_nil_body exec_cons_body.
-Local Hint Resolve exec_MBgetstack exec_MBsetstack exec_MBgetparam exec_MBop exec_MBload exec_MBstore.
+Local Hint Resolve exec_nil_body exec_cons_body: core.
+Local Hint Resolve exec_MBgetstack exec_MBsetstack exec_MBgetparam exec_MBop exec_MBload exec_MBstore: core.
 
 Lemma size_add_label l bh: size (add_label l bh) = size bh + 1.
 Proof.
@@ -336,7 +336,7 @@ Proof.
     omega.
 Qed.
 
-Local Hint Resolve dist_end_block_code_simu_mid_block.
+Local Hint Resolve dist_end_block_code_simu_mid_block: core.
 
 
 Lemma size_nonzero c b bl:
@@ -392,8 +392,8 @@ destruct i; congruence.
 Qed.
 
 
-Local Hint Resolve Mlabel_is_not_cfi.
-Local Hint Resolve MBbasic_is_not_cfi.
+Local Hint Resolve Mlabel_is_not_cfi: core.
+Local Hint Resolve MBbasic_is_not_cfi: core.
 
 Lemma add_to_new_block_is_label i:
   header (add_to_new_bblock (trans_inst i)) <> nil -> exists l, i = Mlabel l.
@@ -408,7 +408,7 @@ Proof.
   + unfold cfi_bblock in H; simpl in H; congruence.
 Qed.
 
-Local Hint Resolve Mlabel_is_not_basic.
+Local Hint Resolve Mlabel_is_not_basic: core.
 
 Lemma trans_code_decompose c: forall b bl,
   is_trans_code c (b::bl) ->
@@ -510,8 +510,8 @@ Proof.
     rewrite Hs2, Hb2; eauto.
     Qed. 
 
-Local Hint Resolve exec_MBcall exec_MBtailcall exec_MBbuiltin exec_MBgoto exec_MBcond_true exec_MBcond_false exec_MBjumptable exec_MBreturn exec_Some_exit exec_None_exit.
-Local Hint Resolve eval_builtin_args_preserved external_call_symbols_preserved find_funct_ptr_same.
+Local Hint Resolve exec_MBcall exec_MBtailcall exec_MBbuiltin exec_MBgoto exec_MBcond_true exec_MBcond_false exec_MBjumptable exec_MBreturn exec_Some_exit exec_None_exit: core.
+Local Hint Resolve eval_builtin_args_preserved external_call_symbols_preserved find_funct_ptr_same: core.
 
 
 Lemma match_states_concat_trans_code st f sp c rs m h: 
diff --git a/powerpc/Archi.v b/powerpc/Archi.v
index ab348c14..10f38391 100644
--- a/powerpc/Archi.v
+++ b/powerpc/Archi.v
@@ -30,6 +30,10 @@ Definition align_float64 := 8%Z.
 (** Can we use the 64-bit extensions to the PowerPC architecture? *)
 Parameter ppc64 : bool.
 
+(** Should single-precision FP arguments passed on stack be passed 
+    as singles or use double FP format. *)
+Parameter single_passed_as_single : bool.
+
 Definition splitlong := negb ppc64.
 
 Lemma splitlong_ptr32: splitlong = true -> ptr64 = false.
diff --git a/powerpc/CSE2deps.v b/powerpc/CSE2deps.v
new file mode 100644
index 00000000..9db51bbb
--- /dev/null
+++ b/powerpc/CSE2deps.v
@@ -0,0 +1,20 @@
+Require Import BoolEqual Coqlib.
+Require Import AST Integers Floats.
+Require Import Values Memory Globalenvs Events.
+Require Import Op.
+
+
+Definition can_swap_accesses_ofs ofsr chunkr ofsw chunkw :=
+     (0 <=? ofsw) && (ofsw <=? (Ptrofs.modulus - largest_size_chunk))
+  && (0 <=? ofsr) && (ofsr <=? (Ptrofs.modulus - largest_size_chunk))
+  && ((ofsw + size_chunk chunkw <=? ofsr) ||
+      (ofsr + size_chunk chunkr <=? ofsw)).
+
+Definition may_overlap chunk addr args chunk' addr' args' :=
+  match addr, addr', args, args' with
+  | (Aindexed ofs), (Aindexed ofs'),
+    (base :: nil), (base' :: nil) =>
+    if peq base base'
+    then negb (can_swap_accesses_ofs (Int.unsigned ofs') chunk' (Int.unsigned ofs) chunk)
+    else true  | _, _, _, _ => true
+  end.
diff --git a/powerpc/CSE2depsproof.v b/powerpc/CSE2depsproof.v
new file mode 100644
index 00000000..fdded9b6
--- /dev/null
+++ b/powerpc/CSE2depsproof.v
@@ -0,0 +1,135 @@
+Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
+Require Import AST Linking.
+Require Import Memory Registers Op RTL Maps.
+
+Require Import Globalenvs Values.
+Require Import Linking Values Memory Globalenvs Events Smallstep.
+Require Import Registers Op RTL.
+Require Import CSE2 CSE2deps.
+Require Import Lia.
+
+Lemma ptrofs_size :
+  Ptrofs.wordsize = if Archi.ptr64 then 64%nat else 32%nat.
+Proof.
+  unfold Ptrofs.wordsize.
+  unfold Wordsize_Ptrofs.wordsize.
+  trivial.
+Qed.
+
+Lemma ptrofs_modulus :
+  Ptrofs.modulus = if Archi.ptr64 then 18446744073709551616 else 4294967296.
+Proof.
+  unfold Ptrofs.modulus.
+  rewrite ptrofs_size.
+  destruct Archi.ptr64; reflexivity.
+Qed.
+
+Lemma ptrofs_max_unsigned :
+  Ptrofs.max_unsigned = if Archi.ptr64 then 18446744073709551615 else 4294967295.
+Proof.
+  unfold Ptrofs.max_unsigned.
+  rewrite ptrofs_modulus.
+  destruct Archi.ptr64; reflexivity.
+Qed.
+
+Section SOUNDNESS.
+  Variable F V : Type.
+  Variable genv: Genv.t F V.
+  Variable sp : val.
+
+Section MEMORY_WRITE.
+  Variable m m2 : mem.
+  Variable chunkw chunkr : memory_chunk.
+  Variable base : val.
+  
+  Variable addrw addrr valw : val.
+  Hypothesis STORE : Mem.storev chunkw m addrw valw = Some m2.
+
+  Section INDEXED_AWAY.
+  Variable ofsw ofsr : int.
+  Hypothesis ADDRW : eval_addressing genv sp
+                       (Aindexed ofsw) (base :: nil) = Some addrw. 
+  Hypothesis ADDRR : eval_addressing genv sp
+                       (Aindexed ofsr) (base :: nil) = Some addrr.
+
+  Lemma load_store_away1 :
+    forall RANGEW : 0 <= Int.unsigned ofsw <= Ptrofs.modulus - largest_size_chunk,
+    forall RANGER : 0 <= Int.unsigned ofsr <= Ptrofs.modulus - largest_size_chunk,
+    forall SWAPPABLE :    Int.unsigned ofsw + size_chunk chunkw <= Int.unsigned ofsr
+                       \/ Int.unsigned ofsr + size_chunk chunkr <= Int.unsigned ofsw,
+    Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+  Proof.
+    intros.
+    
+    pose proof (max_size_chunk chunkr) as size_chunkr_bounded.
+    pose proof (max_size_chunk chunkw) as size_chunkw_bounded.
+    unfold largest_size_chunk in *.
+
+    rewrite ptrofs_modulus in *.
+    simpl in *.
+    inv ADDRR.
+    inv ADDRW.
+    destruct base; try discriminate.
+    eapply Mem.load_store_other with (chunk := chunkw) (v := valw) (b := b).
+    exact STORE.
+    right.
+
+    all: try (destruct (Ptrofs.unsigned_add_either i (Ptrofs.of_int ofsr)) as [OFSR | OFSR];
+              rewrite OFSR).
+    all: try (destruct (Ptrofs.unsigned_add_either i (Ptrofs.of_int ofsw)) as [OFSW | OFSW];
+              rewrite OFSW).
+    all: unfold Ptrofs.of_int.
+
+    all: repeat rewrite Ptrofs.unsigned_repr by (unfold Ptrofs.max_unsigned; rewrite ptrofs_modulus; destruct Archi.ptr64; lia).
+    all: repeat rewrite ptrofs_modulus.
+    all: destruct Archi.ptr64; intuition lia.
+  Qed.
+  
+  Theorem load_store_away :
+    can_swap_accesses_ofs (Int.unsigned ofsr) chunkr (Int.unsigned ofsw) chunkw = true ->
+    Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+  Proof.
+    intro SWAP.
+    unfold can_swap_accesses_ofs in SWAP.
+    repeat rewrite andb_true_iff in SWAP.
+    repeat rewrite orb_true_iff in SWAP.
+    repeat rewrite Z.leb_le in SWAP.
+    apply load_store_away1.
+    all: tauto.
+  Qed.
+  End INDEXED_AWAY.
+End MEMORY_WRITE.
+End SOUNDNESS.
+
+
+Section SOUNDNESS.
+  Variable F V : Type.
+  Variable genv: Genv.t F V.
+  Variable sp : val.
+
+Lemma may_overlap_sound:
+  forall m m' : mem,
+  forall chunk addr args chunk' addr' args' v a a' rs,
+    (eval_addressing genv sp addr (rs ## args)) = Some a ->
+    (eval_addressing genv sp addr' (rs ## args')) = Some a' ->
+    (may_overlap chunk addr args chunk' addr' args') = false ->
+    (Mem.storev chunk m a v) = Some m' ->
+    (Mem.loadv chunk' m' a') = (Mem.loadv chunk' m  a').
+Proof.
+  intros until rs.
+  intros ADDR ADDR' OVERLAP STORE.
+  destruct addr; destruct addr'; try discriminate.
+  { (* Aindexed / Aindexed *)
+  destruct args as [ | base [ | ]]. 1,3: discriminate.
+  destruct args' as [ | base' [ | ]]. 1,3: discriminate.
+  simpl in OVERLAP.
+  destruct (peq base base'). 2: discriminate.
+  subst base'.
+  destruct (can_swap_accesses_ofs (Int.unsigned i0) chunk' (Int.unsigned i) chunk) eqn:SWAP.
+  2: discriminate.
+  simpl in *.
+  eapply load_store_away with (F:=F) (V:=V) (genv:=genv) (sp:=sp); eassumption.
+  }
+Qed.
+
+End SOUNDNESS.
diff --git a/powerpc/Conventions1.v b/powerpc/Conventions1.v
index 25d9c081..5c9cbd4f 100644
--- a/powerpc/Conventions1.v
+++ b/powerpc/Conventions1.v
@@ -208,7 +208,16 @@ Fixpoint loc_arguments_rec
       | Some ireg =>
           One (R ireg) :: loc_arguments_rec tys (ir + 1) fr ofs
       end
-  | (Tfloat | Tsingle | Tany64) as ty :: tys =>
+  | Tsingle as ty :: tys =>
+      match list_nth_z float_param_regs fr with
+      | None =>
+          let ty := if Archi.single_passed_as_single then Tsingle else Tany64 in
+          let ofs := align ofs (typesize ty) in
+          One (S Outgoing ofs ty) :: loc_arguments_rec tys ir fr (ofs + (typesize ty))
+      | Some freg =>
+          One (R freg) :: loc_arguments_rec tys ir (fr + 1) ofs
+      end
+  | (Tfloat | Tany64) as ty :: tys =>
       match list_nth_z float_param_regs fr with
       | None =>
           let ofs := align ofs 2 in
@@ -236,33 +245,6 @@ Fixpoint loc_arguments_rec
 Definition loc_arguments (s: signature) : list (rpair loc) :=
   loc_arguments_rec s.(sig_args) 0 0 0.
 
-(** [size_arguments s] returns the number of [Outgoing] slots used
-  to call a function with signature [s]. *)
-
-Fixpoint size_arguments_rec (tyl: list typ) (ir fr ofs: Z) {struct tyl} : Z :=
-  match tyl with
-  | nil => ofs
-  | (Tint | Tany32) :: tys =>
-      match list_nth_z int_param_regs ir with
-      | None => size_arguments_rec tys ir fr (ofs + 1)
-      | Some ireg => size_arguments_rec tys (ir + 1) fr ofs
-      end
-  | (Tfloat | Tsingle | Tany64) :: tys =>
-      match list_nth_z float_param_regs fr with
-      | None => size_arguments_rec tys ir fr (align ofs 2 + 2)
-      | Some freg => size_arguments_rec tys ir (fr + 1) ofs
-      end
-  | Tlong :: tys =>
-      let ir := align ir 2 in
-      match list_nth_z int_param_regs ir, list_nth_z int_param_regs (ir + 1) with
-      | Some r1, Some r2 => size_arguments_rec tys (ir + 2) fr ofs
-      | _, _ => size_arguments_rec tys ir fr (align ofs 2 + 2)
-      end
-  end.
-
-Definition size_arguments (s: signature) : Z :=
-  size_arguments_rec s.(sig_args) 0 0 0.
-
 (** Argument locations are either caller-save registers or [Outgoing]
   stack slots at nonnegative offsets. *)
 
@@ -322,12 +304,14 @@ Opaque list_nth_z.
   apply align_divides; omega. apply Z.divide_1_l. apply Z.divide_1_l.
   eapply Y; eauto. omega.
 - (* single *)
+  assert (ofs <= align ofs 1) by (apply align_le; omega).
   assert (ofs <= align ofs 2) by (apply align_le; omega).
   destruct (list_nth_z float_param_regs fr) as [r|] eqn:E; destruct H.
   subst. right. eapply list_nth_z_in; eauto.
   eapply IHtyl; eauto.
-  subst. split. omega. apply Z.divide_1_l.
-  eapply Y; eauto. omega.
+  subst. split. destruct Archi.single_passed_as_single; simpl; omega.
+  destruct Archi.single_passed_as_single; simpl; apply Z.divide_1_l.
+  eapply Y; eauto. destruct Archi.single_passed_as_single; simpl; omega.
 - (* any32 *)
   destruct (list_nth_z int_param_regs ir) as [r|] eqn:E; destruct H.
   subst. left. eapply list_nth_z_in; eauto.
@@ -359,105 +343,6 @@ Qed.
 
 Hint Resolve loc_arguments_acceptable: locs.
 
-(** The offsets of [Outgoing] arguments are below [size_arguments s]. *)
-
-Remark size_arguments_rec_above:
-  forall tyl ir fr ofs0,
-  ofs0 <= size_arguments_rec tyl ir fr ofs0.
-Proof.
-  induction tyl; simpl; intros.
-  omega.
-  destruct a.
-  destruct (list_nth_z int_param_regs ir); eauto. apply Z.le_trans with (ofs0 + 1); auto; omega.
-  destruct (list_nth_z float_param_regs fr); eauto.
-  apply Z.le_trans with (align ofs0 2). apply align_le; omega.
-  apply Z.le_trans with (align ofs0 2 + 2); auto; omega.
-  set (ir' := align ir 2).
-  destruct (list_nth_z int_param_regs ir'); eauto.
-  destruct (list_nth_z int_param_regs (ir' + 1)); eauto.
-  apply Z.le_trans with (align ofs0 2). apply align_le; omega.
-  apply Z.le_trans with (align ofs0 2 + 2); auto; omega.
-  apply Z.le_trans with (align ofs0 2). apply align_le; omega.
-  apply Z.le_trans with (align ofs0 2 + 2); auto; omega.
-  destruct (list_nth_z float_param_regs fr); eauto.
-  apply Z.le_trans with (align ofs0 2). apply align_le; omega.
-  apply Z.le_trans with (align ofs0 2 + 2); auto; omega.
-  destruct (list_nth_z int_param_regs ir); eauto. apply Z.le_trans with (ofs0 + 1); auto; omega.
-  destruct (list_nth_z float_param_regs fr); eauto.
-  apply Z.le_trans with (align ofs0 2). apply align_le; omega.
-  apply Z.le_trans with (align ofs0 2 + 2); auto; omega.
-Qed.
-
-Lemma size_arguments_above:
-  forall s, size_arguments s >= 0.
-Proof.
-  intros; unfold size_arguments. apply Z.le_ge.
-  apply size_arguments_rec_above.
-Qed.
-
-Lemma loc_arguments_bounded:
-  forall (s: signature) (ofs: Z) (ty: typ),
-  In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments s)) ->
-  ofs + typesize ty <= size_arguments s.
-Proof.
-  intros.
-  assert (forall tyl ir fr ofs0,
-    In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments_rec tyl ir fr ofs0)) ->
-    ofs + typesize ty <= size_arguments_rec tyl ir fr ofs0).
-{
-  induction tyl; simpl; intros.
-  elim H0.
-  destruct a.
-- (* int *)
-  destruct (list_nth_z int_param_regs ir); destruct H0.
-  congruence.
-  eauto.
-  inv H0. apply size_arguments_rec_above.
-  eauto.
-- (* float *)
-  destruct (list_nth_z float_param_regs fr); destruct H0.
-  congruence.
-  eauto.
-  inv H0. apply size_arguments_rec_above. eauto.
-- (* long *)
-  set (ir' := align ir 2) in *.
-  assert (DFL:
-    In (S Outgoing ofs ty) (regs_of_rpairs
-                             ((if Archi.ptr64
-                               then One (S Outgoing (align ofs0 2) Tlong)
-                               else Twolong (S Outgoing (align ofs0 2) Tint)
-                                            (S Outgoing (align ofs0 2 + 1) Tint))
-                          :: loc_arguments_rec tyl ir' fr (align ofs0 2 + 2))) ->
-    ofs + typesize ty <= size_arguments_rec tyl ir' fr (align ofs0 2 + 2)).
-  { destruct Archi.ptr64; intros IN.
-  - destruct IN. inv H1. apply size_arguments_rec_above. auto.
-  - destruct IN. inv H1. transitivity (align ofs0 2 + 2). simpl; omega. apply size_arguments_rec_above.
-    destruct H1. inv H1. transitivity (align ofs0 2 + 2). simpl; omega. apply size_arguments_rec_above.
-    auto. }
-  destruct (list_nth_z int_param_regs ir'); auto.
-  destruct (list_nth_z int_param_regs (ir' + 1)); auto.
-  destruct H0. congruence. destruct H0. congruence. eauto.
-- (* single *)
-  destruct (list_nth_z float_param_regs fr); destruct H0.
-  congruence.
-  eauto.
-  inv H0. transitivity (align ofs0 2 + 2). simpl; omega. apply size_arguments_rec_above.
-  eauto.
-- (* any32 *)
-  destruct (list_nth_z int_param_regs ir); destruct H0.
-  congruence.
-  eauto.
-  inv H0. apply size_arguments_rec_above.
-  eauto.
-- (* any64 *)
-  destruct (list_nth_z float_param_regs fr); destruct H0.
-  congruence.
-  eauto.
-  inv H0. apply size_arguments_rec_above. eauto.
-  }
-  eauto.
-Qed.
-
 Lemma loc_arguments_main:
   loc_arguments signature_main = nil.
 Proof.
diff --git a/powerpc/DuplicateOpcodeHeuristic.ml b/powerpc/DuplicateOpcodeHeuristic.ml
index 85505245..33be79e8 100644
--- a/powerpc/DuplicateOpcodeHeuristic.ml
+++ b/powerpc/DuplicateOpcodeHeuristic.ml
@@ -1,3 +1,27 @@
-exception HeuristicSucceeded
-        
-let opcode_heuristic code cond ifso ifnot preferred = ()
+(* open Camlcoq *)
+open Op
+open Integers
+
+let opcode_heuristic code cond ifso ifnot is_loop_header =
+  match cond with
+  | Ccompimm (c, n) | Ccompuimm (c, n) -> if n == Integers.Int.zero then (match c with
+      | Clt | Cle -> Some false
+      | Cgt | Cge -> Some true
+      | _ -> None
+      ) else None
+  | Ccomplimm (c, n) | Ccompluimm (c, n) -> if n == Integers.Int64.zero then (match c with
+      | Clt | Cle -> Some false
+      | Cgt | Cge -> Some true
+      | _ -> None
+      ) else None
+  | Ccompf c -> (match c with
+      | Ceq -> Some false
+      | Cne -> Some true
+      | _ -> None
+      )
+  | Cnotcompf c -> (match c with
+      | Ceq -> Some true
+      | Cne -> Some false
+      | _ -> None
+      )
+  | _ -> None
diff --git a/powerpc/extractionMachdep.v b/powerpc/extractionMachdep.v
index 7482435f..a3e945bf 100644
--- a/powerpc/extractionMachdep.v
+++ b/powerpc/extractionMachdep.v
@@ -34,3 +34,6 @@ Extract Constant Archi.ppc64 =>
    | ""e5500"" -> true
    | _ -> false
    end".
+
+(* Choice of passing of single *)
+Extract Constant Archi.single_passed_as_single => "Configuration.gnu_toolchain".
diff --git a/riscV/Asmexpand.ml b/riscV/Asmexpand.ml
index d36b6230..7e36abf8 100644
--- a/riscV/Asmexpand.ml
+++ b/riscV/Asmexpand.ml
@@ -63,44 +63,44 @@ let expand_storeind_ptr src base ofs =
 let int_param_regs   = [| X10; X11; X12; X13; X14; X15; X16; X17 |]
 let float_param_regs = [| F10; F11; F12; F13; F14; F15; F16; F17 |]
 
-let rec fixup_variadic_call pos tyl =
-  if pos < 8 then
+let rec fixup_variadic_call ri rf tyl =
+  if ri < 8 then
     match tyl with
     | [] ->
         ()
     | (Tint | Tany32) :: tyl ->
-        fixup_variadic_call (pos + 1) tyl
+        fixup_variadic_call (ri + 1) rf tyl
     | Tsingle :: tyl ->
-        let rs =float_param_regs.(pos)
-        and rd = int_param_regs.(pos) in
+        let rs = float_param_regs.(rf)
+        and rd = int_param_regs.(ri) in
         emit (Pfmvxs(rd, rs));
-        fixup_variadic_call (pos + 1) tyl
+        fixup_variadic_call (ri + 1) (rf + 1) tyl
     | Tlong :: tyl ->
-        let pos' = if Archi.ptr64 then pos + 1 else align pos 2 + 2 in
-        fixup_variadic_call pos' tyl
+        let ri' = if Archi.ptr64 then ri + 1 else align ri 2 + 2 in
+        fixup_variadic_call ri' rf tyl
     | (Tfloat | Tany64) :: tyl ->
         if Archi.ptr64 then begin
-          let rs = float_param_regs.(pos)
-          and rd = int_param_regs.(pos) in
+          let rs = float_param_regs.(rf)
+          and rd = int_param_regs.(ri) in
           emit (Pfmvxd(rd, rs));
-          fixup_variadic_call (pos + 1) tyl
+          fixup_variadic_call (ri + 1) (rf + 1) tyl
         end else begin
-          let pos = align pos 2 in
-          if pos < 8 then begin
-            let rs = float_param_regs.(pos)
-            and rd1 = int_param_regs.(pos)
-            and rd2 = int_param_regs.(pos + 1) in
+          let ri = align ri 2 in
+          if ri < 8 then begin
+            let rs = float_param_regs.(rf)
+            and rd1 = int_param_regs.(ri)
+            and rd2 = int_param_regs.(ri + 1) in
             emit (Paddiw(X2, X X2, Integers.Int.neg _16));
             emit (Pfsd(rs, X2, Ofsimm _0));
             emit (Plw(rd1, X2, Ofsimm _0));
             emit (Plw(rd2, X2, Ofsimm _4));
             emit (Paddiw(X2, X X2, _16));
-            fixup_variadic_call (pos + 2) tyl
+            fixup_variadic_call (ri + 2) (rf + 1) tyl
           end
         end
         
 let fixup_call sg =
-  if sg.sig_cc.cc_vararg then fixup_variadic_call 0 sg.sig_args
+  if sg.sig_cc.cc_vararg then fixup_variadic_call 0 0 sg.sig_args
 
 (* Handling of annotations *)
 
diff --git a/riscV/CSE2deps.v b/riscV/CSE2deps.v
new file mode 100644
index 00000000..8ab9242a
--- /dev/null
+++ b/riscV/CSE2deps.v
@@ -0,0 +1,20 @@
+Require Import BoolEqual Coqlib.
+Require Import AST Integers Floats.
+Require Import Values Memory Globalenvs Events.
+Require Import Op.
+
+
+Definition can_swap_accesses_ofs ofsr chunkr ofsw chunkw :=
+     (0 <=? ofsw) && (ofsw <=? (Ptrofs.modulus - largest_size_chunk))
+  && (0 <=? ofsr) && (ofsr <=? (Ptrofs.modulus - largest_size_chunk))
+  && ((ofsw + size_chunk chunkw <=? ofsr) ||
+      (ofsr + size_chunk chunkr <=? ofsw)).
+
+Definition may_overlap chunk addr args chunk' addr' args' :=
+  match addr, addr', args, args' with
+  | (Aindexed ofs), (Aindexed ofs'),
+    (base :: nil), (base' :: nil) =>
+    if peq base base'
+    then negb (can_swap_accesses_ofs (Ptrofs.unsigned ofs') chunk' (Ptrofs.unsigned ofs) chunk)
+    else true  | _, _, _, _ => true
+  end.
diff --git a/riscV/CSE2depsproof.v b/riscV/CSE2depsproof.v
new file mode 100644
index 00000000..a3811e78
--- /dev/null
+++ b/riscV/CSE2depsproof.v
@@ -0,0 +1,127 @@
+Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
+Require Import AST Linking.
+Require Import Memory Registers Op RTL Maps.
+
+Require Import Globalenvs Values.
+Require Import Linking Values Memory Globalenvs Events Smallstep.
+Require Import Registers Op RTL.
+Require Import CSE2 CSE2deps.
+Require Import Lia.
+
+Lemma ptrofs_size :
+  Ptrofs.wordsize = (if Archi.ptr64 then 64 else 32)%nat.
+Proof.
+  unfold Ptrofs.wordsize.
+  unfold Wordsize_Ptrofs.wordsize.
+  trivial.
+Qed.
+
+Lemma ptrofs_modulus :
+  Ptrofs.modulus = if Archi.ptr64 then 18446744073709551616 else 4294967296.
+Proof.
+  unfold Ptrofs.modulus.
+  rewrite ptrofs_size.
+  destruct Archi.ptr64; reflexivity.
+Qed.
+
+Section SOUNDNESS.
+  Variable F V : Type.
+  Variable genv: Genv.t F V.
+  Variable sp : val.
+
+Section MEMORY_WRITE.
+  Variable m m2 : mem.
+  Variable chunkw chunkr : memory_chunk.
+  Variable base : val.
+  
+  Variable addrw addrr valw : val.
+  Hypothesis STORE : Mem.storev chunkw m addrw valw = Some m2.
+
+  Section INDEXED_AWAY.
+  Variable ofsw ofsr : ptrofs.
+  Hypothesis ADDRW : eval_addressing genv sp
+                       (Aindexed ofsw) (base :: nil) = Some addrw. 
+  Hypothesis ADDRR : eval_addressing genv sp
+                       (Aindexed ofsr) (base :: nil) = Some addrr.
+
+  Lemma load_store_away1 :
+    forall RANGEW : 0 <= Ptrofs.unsigned ofsw <= Ptrofs.modulus - largest_size_chunk,
+    forall RANGER : 0 <= Ptrofs.unsigned ofsr <= Ptrofs.modulus - largest_size_chunk,
+    forall SWAPPABLE :    Ptrofs.unsigned ofsw + size_chunk chunkw <= Ptrofs.unsigned ofsr
+                          \/ Ptrofs.unsigned ofsr + size_chunk chunkr <= Ptrofs.unsigned ofsw,
+    Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+
+  Proof.
+    intros.
+    
+    pose proof (max_size_chunk chunkr) as size_chunkr_bounded.
+    pose proof (max_size_chunk chunkw) as size_chunkw_bounded.
+    unfold largest_size_chunk in *.
+
+    rewrite ptrofs_modulus in *.
+    simpl in *.
+    inv ADDRR.
+    inv ADDRW.
+    destruct base; try discriminate.
+    eapply Mem.load_store_other with (chunk := chunkw) (v := valw) (b := b).
+    exact STORE.
+    right.
+
+    all: try (destruct (Ptrofs.unsigned_add_either i ofsr) as [OFSR | OFSR];
+              rewrite OFSR).
+    all: try (destruct (Ptrofs.unsigned_add_either i ofsw) as [OFSW | OFSW];
+              rewrite OFSW).
+    all: try rewrite ptrofs_modulus in *.
+    all: destruct Archi.ptr64.
+
+    all: intuition lia.
+  Qed.
+  
+  Theorem load_store_away :
+    can_swap_accesses_ofs (Ptrofs.unsigned ofsr) chunkr (Ptrofs.unsigned ofsw) chunkw = true ->
+    Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+  Proof.
+    intro SWAP.
+    unfold can_swap_accesses_ofs in SWAP.
+    repeat rewrite andb_true_iff in SWAP.
+    repeat rewrite orb_true_iff in SWAP.
+    repeat rewrite Z.leb_le in SWAP.
+    apply load_store_away1.
+    all: tauto.
+  Qed.
+  End INDEXED_AWAY.
+End MEMORY_WRITE.
+End SOUNDNESS.
+
+
+Section SOUNDNESS.
+  Variable F V : Type.
+  Variable genv: Genv.t F V.
+  Variable sp : val.
+
+Lemma may_overlap_sound:
+  forall m m' : mem,
+  forall chunk addr args chunk' addr' args' v a a' rs,
+    (eval_addressing genv sp addr (rs ## args)) = Some a ->
+    (eval_addressing genv sp addr' (rs ## args')) = Some a' ->
+    (may_overlap chunk addr args chunk' addr' args') = false ->
+    (Mem.storev chunk m a v) = Some m' ->
+    (Mem.loadv chunk' m' a') = (Mem.loadv chunk' m  a').
+Proof.
+  intros until rs.
+  intros ADDR ADDR' OVERLAP STORE.
+  destruct addr; destruct addr'; try discriminate.
+  { (* Aindexed / Aindexed *)
+  destruct args as [ | base [ | ]]. 1,3: discriminate.
+  destruct args' as [ | base' [ | ]]. 1,3: discriminate.
+  simpl in OVERLAP.
+  destruct (peq base base'). 2: discriminate.
+  subst base'.
+  destruct (can_swap_accesses_ofs (Ptrofs.unsigned i0) chunk' (Ptrofs.unsigned i) chunk) eqn:SWAP.
+  2: discriminate.
+  simpl in *.
+  eapply load_store_away with (F:=F) (V:=V) (genv:=genv) (sp:=sp); eassumption.
+  }
+Qed.
+
+End SOUNDNESS.
diff --git a/riscV/Conventions1.v b/riscV/Conventions1.v
index 27d09d94..17326139 100644
--- a/riscV/Conventions1.v
+++ b/riscV/Conventions1.v
@@ -105,7 +105,9 @@ Definition is_float_reg (r: mreg) :=
   of function arguments), but this leaves much liberty in choosing actual
   locations.  To ensure binary interoperability of code generated by our
   compiler with libraries compiled by another compiler, we
-  implement the standard RISC-V conventions.  *)
+  implement the standard RISC-V conventions as found here:
+  https://github.com/riscv/riscv-elf-psabi-doc/blob/master/riscv-elf.md
+*)
 
 (** ** Location of function result *)
 
@@ -169,38 +171,32 @@ Qed.
 
 (** ** Location of function arguments *)
 
-(** The RISC-V ABI states the following convention for passing arguments
+(** The RISC-V ABI states the following conventions for passing arguments
   to a function:
 
-- Arguments are passed in registers when possible.
-
-- Up to eight integer registers (ai: int_param_regs) and up to eight
-  floating-point registers (fai: float_param_regs) are used for this
-  purpose.
-
-- If the arguments to a function are conceptualized as fields of a C
-  struct, each with pointer alignment, the argument registers are a
-  shadow of the first eight pointer-words of that struct. If argument
-  i < 8 is a floating-point type, it is passed in floating-point
-  register fa_i; otherwise, it is passed in integer register a_i.
-
-- When primitive arguments twice the size of a pointer-word are passed
-  on the stack, they are naturally aligned. When they are passed in the
-  integer registers, they reside in an aligned even-odd register pair,
-  with the even register holding the least-significant bits.
-
-- Floating-point arguments to variadic functions (except those that
-  are explicitly named in the parameter list) are passed in integer
-  registers.
-
-- The portion of the conceptual struct that is not passed in argument
-  registers is passed on the stack. The stack pointer sp points to the
-  first argument not passed in a register.
-
-The bit about variadic functions doesn't quite fit CompCert's model.
-We do our best by passing the FP arguments in registers, as usual,
-and reserving the corresponding integer registers, so that fixup
-code can be introduced in the Asmexpand pass.
+- RV64, not variadic: pass the first 8 integer arguments in
+  integer registers (a1...a8: int_param_regs), the first 8 FP arguments
+  in FP registers (fa1...fa8: float_param_regs), and the remaining
+  arguments on the stack, in 8-byte slots.
+
+- RV32, not variadic: same, but arguments of 64-bit integer type
+  are passed in two consecutive integer registers (a(i), a(i+1))
+  or in a(8) and on a 32-bit word on the stack.  Stack-allocated
+  arguments are aligned to their natural alignment.
+
+- RV64, variadic: pass the first 8 arguments in integer registers
+  (a1...a8), including FP arguments; pass the remaining arguments on
+  the stack, in 8-byte slots.
+
+- RV32, variadic: same, but arguments of 64-bit types (integers as well
+  as floats) are passed in two consecutive aligned integer registers
+  (a(2i), a(2i+1)).  
+
+The passing of FP arguments to variadic functions in integer registers
+doesn't quite fit CompCert's model.  We do our best by passing the FP
+arguments in registers, as usual, and reserving the corresponding
+integer registers, so that fixup code can be introduced in the
+Asmexpand pass.
 *)
 
 Definition int_param_regs :=
@@ -208,80 +204,84 @@ Definition int_param_regs :=
 Definition float_param_regs :=
   F10 :: F11 :: F12 :: F13 :: F14 :: F15 :: F16 :: F17 :: nil.
 
-Definition one_arg (regs: list mreg) (rn: Z) (ofs: Z) (ty: typ)
-                           (rec: Z -> Z -> list (rpair loc)) :=
-  match list_nth_z regs rn with
+Definition int_arg (ri rf ofs: Z) (ty: typ)
+                   (rec: Z -> Z -> Z -> list (rpair loc)) :=
+  match list_nth_z int_param_regs ri with
   | Some r =>
-      One(R r) :: rec (rn + 1) ofs
+      One(R r) :: rec (ri + 1) rf ofs
   | None   =>
-      let ofs := align ofs (typealign ty) in
-      One(S Outgoing ofs ty) :: rec rn (ofs + (if Archi.ptr64 then 2 else typesize ty))
+      let ofs := align ofs (typesize ty) in
+      One(S Outgoing ofs ty)
+      :: rec ri rf (ofs + (if Archi.ptr64 then 2 else typesize ty))
   end.
 
-Definition two_args (regs: list mreg) (rn: Z) (ofs: Z)
-                    (rec: Z -> Z -> list (rpair loc)) :=
-  let rn := align rn 2 in
-  match list_nth_z regs rn, list_nth_z regs (rn + 1) with
-  | Some r1, Some r2 =>
-      Twolong (R r2) (R r1) :: rec (rn + 2) ofs
-  | _, _ =>
-      let ofs := align ofs 2 in
-      Twolong (S Outgoing (ofs + 1) Tint) (S Outgoing ofs Tint) ::
-      rec rn (ofs + 2)
+Definition float_arg (va: bool) (ri rf ofs: Z) (ty: typ)
+                     (rec: Z -> Z -> Z -> list (rpair loc)) :=
+  match list_nth_z float_param_regs rf with
+  | Some r =>
+      if va then
+       (let ri' :=  (* reserve 1 or 2 aligned integer registers *)
+          if Archi.ptr64 || zeq (typesize ty) 1 then ri + 1 else align ri 2 + 2 in
+        if zle ri' 8 then
+            (* we have enough integer registers, put argument in FP reg
+               and fixup code will put it in one or two integer regs *)
+            One (R r) :: rec ri' (rf + 1) ofs
+        else
+            (* we are out of integer registers, pass argument on stack *)
+            let ofs := align ofs (typesize ty) in
+            One(S Outgoing ofs ty)
+            :: rec ri' rf (ofs + (if Archi.ptr64 then 2 else typesize ty)))
+      else
+        One (R r) :: rec ri (rf + 1) ofs
+  | None   =>
+      let ofs := align ofs (typesize ty) in
+      One(S Outgoing ofs ty)
+      :: rec ri rf (ofs + (if Archi.ptr64 then 2 else typesize ty))
   end.
 
-Definition hybrid_arg (regs: list mreg) (rn: Z) (ofs: Z) (ty: typ)
-                      (rec: Z -> Z -> list (rpair loc)) :=
-  let rn := align rn 2 in
-  match list_nth_z regs rn with
-  | Some r =>
-      One (R r) :: rec (rn + 2) ofs
-  | None =>
+Definition split_long_arg (va: bool) (ri rf ofs: Z)
+                          (rec: Z -> Z -> Z -> list (rpair loc)) :=
+  let ri := if va then align ri 2 else ri in
+  match list_nth_z int_param_regs ri, list_nth_z int_param_regs (ri + 1) with
+  | Some r1, Some r2 =>
+      Twolong (R r2) (R r1) :: rec (ri + 2) rf ofs
+  | Some r1, None =>
+      Twolong (S Outgoing ofs Tint) (R r1) :: rec (ri + 1) rf (ofs + 1)
+  | None, _ =>
       let ofs := align ofs 2 in
-      One (S Outgoing ofs ty) :: rec rn (ofs + 2)
+      Twolong (S Outgoing (ofs + 1) Tint) (S Outgoing ofs Tint) ::
+      rec ri rf (ofs + 2)
   end.
 
 Fixpoint loc_arguments_rec (va: bool)
-    (tyl: list typ) (r ofs: Z) {struct tyl} : list (rpair loc) :=
+    (tyl: list typ) (ri rf ofs: Z) {struct tyl} : list (rpair loc) :=
   match tyl with
   | nil => nil
   | (Tint | Tany32) as ty :: tys =>
-      one_arg int_param_regs r ofs ty (loc_arguments_rec va tys)
+      (* pass in one integer register or on stack *)
+      int_arg ri rf ofs ty (loc_arguments_rec va tys)
   | Tsingle as ty :: tys =>
-      one_arg float_param_regs r ofs ty (loc_arguments_rec va tys)
+      (* pass in one FP register or on stack.
+         If vararg, reserve 1 integer register. *)
+      float_arg va ri rf ofs ty (loc_arguments_rec va tys)
   | Tlong as ty :: tys =>
-      if Archi.ptr64
-      then one_arg int_param_regs r ofs ty (loc_arguments_rec va tys)
-      else two_args int_param_regs r ofs  (loc_arguments_rec va tys)
+      if Archi.ptr64 then
+        (* pass in one integer register or on stack *)
+        int_arg ri rf ofs ty (loc_arguments_rec va tys)
+      else
+        (* pass in register pair or on stack; align register pair if vararg *)
+        split_long_arg va ri rf ofs(loc_arguments_rec va tys)
   | (Tfloat | Tany64) as ty :: tys =>
-      if va && negb Archi.ptr64
-      then hybrid_arg float_param_regs r ofs ty (loc_arguments_rec va tys)
-      else one_arg float_param_regs r ofs ty (loc_arguments_rec va tys)
+      (* pass in one FP register or on stack.
+         If vararg, reserve 1 or 2 integer registers. *)
+      float_arg va ri rf ofs ty (loc_arguments_rec va tys)
   end.
 
 (** [loc_arguments s] returns the list of locations where to store arguments
   when calling a function with signature [s].  *)
 
 Definition loc_arguments (s: signature) : list (rpair loc) :=
-  loc_arguments_rec s.(sig_cc).(cc_vararg) s.(sig_args) 0 0.
-
-(** [size_arguments s] returns the number of [Outgoing] slots used
-  to call a function with signature [s]. *)
-
-Definition max_outgoing_1 (accu: Z) (l: loc) : Z :=
-  match l with
-  | S Outgoing ofs ty => Z.max accu (ofs + typesize ty)
-  | _ => accu
-  end.
-
-Definition max_outgoing_2 (accu: Z) (rl: rpair loc) : Z :=
-  match rl with
-  | One l => max_outgoing_1 accu l
-  | Twolong l1 l2 => max_outgoing_1 (max_outgoing_1 accu l1) l2
-  end.
-
-Definition size_arguments (s: signature) : Z :=
-  List.fold_left max_outgoing_2 (loc_arguments s) 0.
+  loc_arguments_rec s.(sig_cc).(cc_vararg) s.(sig_args) 0 0 0.
 
 (** Argument locations are either non-temporary registers or [Outgoing]
   stack slots at nonnegative offsets. *)
@@ -294,90 +294,87 @@ Definition loc_argument_acceptable (l: loc) : Prop :=
   end.
 
 Lemma loc_arguments_rec_charact:
-  forall va tyl rn ofs p,
+  forall va tyl ri rf ofs p,
   ofs >= 0 ->
-  In p (loc_arguments_rec va tyl rn ofs) -> forall_rpair loc_argument_acceptable p.
+  In p (loc_arguments_rec va tyl ri rf ofs) -> forall_rpair loc_argument_acceptable p.
 Proof.
   set (OK := fun (l: list (rpair loc)) =>
              forall p, In p l -> forall_rpair loc_argument_acceptable p).
-  set (OKF := fun (f: Z -> Z -> list (rpair loc)) =>
-              forall rn ofs, ofs >= 0 -> OK (f rn ofs)).
-  set (OKREGS := fun (l: list mreg) => forall r, In r l -> is_callee_save r = false).
-  assert (AL: forall ofs ty, ofs >= 0 -> align ofs (typealign ty) >= 0).
+  set (OKF := fun (f: Z -> Z -> Z -> list (rpair loc)) =>
+              forall ri rf ofs, ofs >= 0 -> OK (f ri rf ofs)).
+  assert (CSI: forall r, In r int_param_regs -> is_callee_save r = false).
+  { decide_goal. }
+  assert (CSF: forall r, In r float_param_regs -> is_callee_save r = false).
+  { decide_goal. }
+  assert (AL: forall ofs ty, ofs >= 0 -> align ofs (typesize ty) >= 0).
   { intros. 
-    assert (ofs <= align ofs (typealign ty)) by (apply align_le; apply typealign_pos).
+    assert (ofs <= align ofs (typesize ty)) by (apply align_le; apply typesize_pos).
     omega. }
+  assert (ALD: forall ofs ty, ofs >= 0 -> (typealign ty | align ofs (typesize ty))).
+  { intros. eapply Z.divide_trans. apply typealign_typesize.
+    apply align_divides. apply typesize_pos. }
   assert (SK: (if Archi.ptr64 then 2 else 1) > 0).
   { destruct Archi.ptr64; omega. }
   assert (SKK: forall ty, (if Archi.ptr64 then 2 else typesize ty) > 0).
   { intros. destruct Archi.ptr64. omega. apply typesize_pos.  }
-  assert (A: forall regs rn ofs ty f,
-             OKREGS regs -> OKF f -> ofs >= 0 -> OK (one_arg regs rn ofs ty f)).
-  { intros until f; intros OR OF OO; red; unfold one_arg; intros.
-    destruct (list_nth_z regs rn) as [r|] eqn:NTH; destruct H.
-  - subst p; simpl. apply OR. eapply list_nth_z_in; eauto. 
+  assert (A: forall ri rf ofs ty f,
+             OKF f -> ofs >= 0 -> OK (int_arg ri rf ofs ty f)).
+  { intros until f; intros OF OO; red; unfold int_arg; intros.
+    destruct (list_nth_z int_param_regs ri) as [r|] eqn:NTH; destruct H.
+  - subst p; simpl. apply CSI. eapply list_nth_z_in; eauto. 
   - eapply OF; eauto. 
   - subst p; simpl. auto using align_divides, typealign_pos.
   - eapply OF; [idtac|eauto].
     generalize (AL ofs ty OO) (SKK ty); omega.
   }
-  assert (B: forall regs rn ofs f,
-             OKREGS regs -> OKF f -> ofs >= 0 -> OK (two_args regs rn ofs f)).
-  { intros until f; intros OR OF OO; unfold two_args.
-    set (rn' := align rn 2).
+  assert (B: forall va ri rf ofs ty f,
+             OKF f -> ofs >= 0 -> OK (float_arg va ri rf ofs ty f)).
+  { intros until f; intros OF OO; red; unfold float_arg; intros.
+    destruct (list_nth_z float_param_regs rf) as [r|] eqn:NTH.
+  - set (ri' := if Archi.ptr64 || zeq (typesize ty) 1 then ri + 1 else align ri 2 + 2) in *.
+    destruct va; [destruct (zle ri' 8)|idtac]; destruct H.
+    + subst p; simpl. apply CSF. eapply list_nth_z_in; eauto.
+    + eapply OF; eauto.
+    + subst p; repeat split; auto.
+    + eapply OF; [idtac|eauto]. generalize (AL ofs ty OO) (SKK ty); omega.
+    + subst p; simpl. apply CSF. eapply list_nth_z_in; eauto.
+    + eapply OF; eauto.
+  - destruct H.
+    + subst p; repeat split; auto.
+    + eapply OF; [idtac|eauto]. generalize (AL ofs ty OO) (SKK ty); omega.
+  }
+  assert (C: forall va ri rf ofs f,
+             OKF f -> ofs >= 0 -> OK (split_long_arg va ri rf ofs f)).
+  { intros until f; intros OF OO; unfold split_long_arg.
+    set (ri' := if va then align ri 2 else ri).
     set (ofs' := align ofs 2).
     assert (OO': ofs' >= 0) by (apply (AL ofs Tlong); auto).
-    assert (DFL: OK (Twolong (S Outgoing (ofs' + 1) Tint) (S Outgoing ofs' Tint)
-                     :: f rn' (ofs' + 2))).
-    { red; simpl; intros. destruct H.
-    - subst p; simpl. 
-      repeat split; auto using Z.divide_1_l. omega.
-    - eapply OF; [idtac|eauto]. omega.
-    }
-    destruct (list_nth_z regs rn') as [r1|] eqn:NTH1;
-    destruct (list_nth_z regs (rn' + 1)) as [r2|] eqn:NTH2;
-    try apply DFL.
-    red; simpl; intros; destruct H.
-  - subst p; simpl. split; apply OR; eauto using list_nth_z_in.  
-  - eapply OF; [idtac|eauto]. auto.
-  }
-  assert (C: forall regs rn ofs ty f,
-             OKREGS regs -> OKF f -> ofs >= 0 -> typealign ty = 1 -> OK (hybrid_arg regs rn ofs ty f)).
-  { intros until f; intros OR OF OO OTY; unfold hybrid_arg; red; intros.
-    set (rn' := align rn 2) in *.
-    destruct (list_nth_z regs rn') as [r|] eqn:NTH; destruct H.
-  - subst p; simpl. apply OR. eapply list_nth_z_in; eauto. 
-  - eapply OF; eauto. 
-  - subst p; simpl. rewrite OTY. split. apply (AL ofs Tlong OO). apply Z.divide_1_l. 
-  - eapply OF; [idtac|eauto]. generalize (AL ofs Tlong OO); simpl; omega.
+    destruct (list_nth_z int_param_regs ri') as [r1|] eqn:NTH1;
+    [destruct (list_nth_z int_param_regs (ri'+1)) as [r2|] eqn:NTH2 | idtac].
+  - red; simpl; intros; destruct H.
+    + subst p; split; apply CSI; eauto using list_nth_z_in.
+    + eapply OF; [idtac|eauto]. omega.
+  - red; simpl; intros; destruct H.
+    + subst p; split. split; auto using Z.divide_1_l. apply CSI; eauto using list_nth_z_in.
+    + eapply OF; [idtac|eauto]. omega.
+  - red; simpl; intros; destruct H.
+    + subst p; repeat split; auto using Z.divide_1_l. omega. 
+    + eapply OF; [idtac|eauto]. omega.
   }
-  assert (D: OKREGS int_param_regs).
-  { red. decide_goal. }
-  assert (E: OKREGS float_param_regs).
-  { red. decide_goal. }
-
-  cut (forall va tyl rn ofs, ofs >= 0 -> OK (loc_arguments_rec va tyl rn ofs)).
+  cut (forall va tyl ri rf ofs, ofs >= 0 -> OK (loc_arguments_rec va tyl ri rf ofs)).
   unfold OK. eauto.
   induction tyl as [ | ty1 tyl]; intros until ofs; intros OO; simpl.
 - red; simpl; tauto.
 - destruct ty1.
 + (* int *) apply A; auto.
-+ (* float *) 
-  destruct (va && negb Archi.ptr64).
-  apply C; auto.
-  apply A; auto.
++ (* float *) apply B; auto.
 + (* long *)
   destruct Archi.ptr64.
   apply A; auto.
-  apply B; auto.
-+ (* single *)
-  apply A; auto.
-+ (* any32 *)
-  apply A; auto.
-+ (* any64 *)
-  destruct (va && negb Archi.ptr64).
   apply C; auto.
-  apply A; auto.
++ (* single *) apply B; auto.
++ (* any32 *) apply A; auto.
++ (* any64 *) apply B; auto.
 Qed.
 
 Lemma loc_arguments_acceptable:
@@ -387,52 +384,6 @@ Proof.
   unfold loc_arguments; intros. eapply loc_arguments_rec_charact; eauto. omega.
 Qed.
 
-(** The offsets of [Outgoing] arguments are below [size_arguments s]. *)
-
-Remark fold_max_outgoing_above:
-  forall l n, fold_left max_outgoing_2 l n >= n.
-Proof.
-  assert (A: forall n l, max_outgoing_1 n l >= n).
-  { intros; unfold max_outgoing_1. destruct l as [_ | []]; xomega. }
-  induction l; simpl; intros. 
-  - omega.
-  - eapply Zge_trans. eauto.
-    destruct a; simpl. apply A. eapply Zge_trans; eauto.
-Qed.
-
-Lemma size_arguments_above:
-  forall s, size_arguments s >= 0.
-Proof.
-  intros. apply fold_max_outgoing_above.
-Qed.
-
-Lemma loc_arguments_bounded:
-  forall (s: signature) (ofs: Z) (ty: typ),
-  In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments s)) ->
-  ofs + typesize ty <= size_arguments s.
-Proof.
-  intros until ty.
-  assert (A: forall n l, n <= max_outgoing_1 n l).
-  { intros; unfold max_outgoing_1. destruct l as [_ | []]; xomega. }
-  assert (B: forall p n,
-             In (S Outgoing ofs ty) (regs_of_rpair p) ->
-             ofs + typesize ty <= max_outgoing_2 n p).
-  { intros. destruct p; simpl in H; intuition; subst; simpl.
-  - xomega.
-  - eapply Z.le_trans. 2: apply A. xomega.
-  - xomega. }
-  assert (C: forall l n,
-             In (S Outgoing ofs ty) (regs_of_rpairs l) ->
-             ofs + typesize ty <= fold_left max_outgoing_2 l n).
-  { induction l; simpl; intros.
-  - contradiction.
-  - rewrite in_app_iff in H. destruct H.
-  + eapply Z.le_trans. eapply B; eauto. apply Z.ge_le. apply fold_max_outgoing_above.
-  + apply IHl; auto.
-  }
-  apply C. 
-Qed.
-
 Lemma loc_arguments_main:
   loc_arguments signature_main = nil.
 Proof.
diff --git a/riscV/DuplicateOpcodeHeuristic.ml b/riscV/DuplicateOpcodeHeuristic.ml
index 85505245..2ec314c1 100644
--- a/riscV/DuplicateOpcodeHeuristic.ml
+++ b/riscV/DuplicateOpcodeHeuristic.ml
@@ -1,3 +1,27 @@
-exception HeuristicSucceeded
-        
-let opcode_heuristic code cond ifso ifnot preferred = ()
+(* open Camlcoq *)
+open Op
+open Integers
+
+let opcode_heuristic code cond ifso ifnot is_loop_header =
+  match cond with
+  | Ccompimm (c, n) | Ccompuimm (c, n) -> if n == Integers.Int.zero then (match c with
+      | Clt | Cle -> Some false
+      | Cgt | Cge -> Some true
+      | _ -> None
+      ) else None
+  | Ccomplimm (c, n) | Ccompluimm (c, n) -> if n == Integers.Int64.zero then (match c with
+      | Clt | Cle -> Some false
+      | Cgt | Cge -> Some true
+      | _ -> None
+      ) else None
+  | Ccompf c | Ccompfs c -> (match c with
+      | Ceq -> Some false
+      | Cne -> Some true
+      | _ -> None
+      )
+  | Cnotcompf c | Cnotcompfs c -> (match c with
+      | Ceq -> Some true
+      | Cne -> Some false
+      | _ -> None
+      )
+  | _ -> None
diff --git a/runtime/arm/i64_stof.S b/runtime/arm/i64_stof.S
index bcfa471c..11e00a2a 100644
--- a/runtime/arm/i64_stof.S
+++ b/runtime/arm/i64_stof.S
@@ -39,12 +39,11 @@
 @@@ Conversion from signed 64-bit integer to single float
 
 FUNCTION(__compcert_i64_stof)
-    @ Check whether -2^53 <= X < 2^53
-        ASR r2, Reg0HI, #21
-        ASR r3, Reg0HI, #31     @ (r2,r3) = X >> 53
+   @ Check whether -2^53 <= X < 2^53
+        ASR r2, Reg0HI, #21     @ r2 = high 32 bits of X >> 53
+   @ -2^53 <= X < 2^53 iff r2 is -1 or 0, that is, iff r2 + 1 is 0 or 1
         adds r2, r2, #1
-        adc r3, r3, #0          @ (r2,r3) = X >> 53 + 1
-        cmp r3, #2
+        cmp r2, #2
         blo 1f
    @ X is large enough that double rounding can occur.
    @ Avoid it by nudging X away from the points where double rounding
diff --git a/runtime/include/math.h b/runtime/include/math.h
index d6475df1..01b8d8d8 100644
--- a/runtime/include/math.h
+++ b/runtime/include/math.h
@@ -1,6 +1,8 @@
 #ifndef _COMPCERT_MATH_H
 #define _COMPCERT_MATH_H
 
+#ifdef __K1C__
+
 #define isfinite(__y) (fpclassify((__y)) >= FP_ZERO)
 
 #include_next <math.h>
@@ -16,4 +18,9 @@
 #define fmaf(x, y, z) __builtin_fmaf((x),(y),(z))
 #endif
 
+#else
+
+#include_next <math.h>
+
+#endif
 #endif
diff --git a/runtime/powerpc/i64_stof.s b/runtime/powerpc/i64_stof.s
index 97fa6bb8..ea23a1c8 100644
--- a/runtime/powerpc/i64_stof.s
+++ b/runtime/powerpc/i64_stof.s
@@ -43,20 +43,19 @@
 __compcert_i64_stof:
 	mflr r9
    # Check whether -2^53 <= X < 2^53	
-        srawi r5, r3, 31
-        srawi r6, r3, 21        # (r5,r6) = X >> 53
-        addic r6, r6, 1
-        addze r5, r5            # (r5,r6) = (X >> 53) + 1
+        srawi r5, r3, 21        # r5 = high 32 bits of X >> 53
+   # -2^53 <= X < 2^53 iff r5 is -1 or 0, that is, iff r5 + 1 is 0 or 1
+        addi r5, r5, 1
         cmplwi r5, 2
         blt 1f
    # X is large enough that double rounding can occur.
    # Avoid it by nudging X away from the points where double rounding
    # occurs (the "round to odd" technique)
-        rlwinm r0, r4, 0, 21, 31 # extract bits 0 to 11 of X
-        addi r0, r0, 0x7FF      # r0 = (X & 0x7FF) + 0x7FF
-   # bit 12 of r0 is 0 if all low 12 bits of X are 0, 1 otherwise
-   # bits 13-31 of r0 are 0
-        or r4, r4, r0           # correct bit number 12 of X
+        rlwinm r5, r4, 0, 21, 31 # extract bits 0 to 11 of X
+        addi r5, r5, 0x7FF      # r5 = (X & 0x7FF) + 0x7FF
+   # bit 12 of r5 is 0 if all low 12 bits of X are 0, 1 otherwise
+   # bits 13-31 of r5 are 0
+        or r4, r4, r5           # correct bit number 12 of X
         rlwinm r4, r4, 0, 0, 20 # set to 0 bits 0 to 11 of X
    # Convert to double, then round to single	
 1:      bl __compcert_i64_stod
diff --git a/runtime/powerpc/i64_utof.s b/runtime/powerpc/i64_utof.s
index cdb2f867..4a2a172b 100644
--- a/runtime/powerpc/i64_utof.s
+++ b/runtime/powerpc/i64_utof.s
@@ -48,11 +48,11 @@ __compcert_i64_utof:
    # X is large enough that double rounding can occur.
    # Avoid it by nudging X away from the points where double rounding
    # occurs (the "round to odd" technique)
-        rlwinm r0, r4, 0, 21, 31 # extract bits 0 to 11 of X
-        addi r0, r0, 0x7FF      # r0 = (X & 0x7FF) + 0x7FF
-   # bit 12 of r0 is 0 if all low 12 bits of X are 0, 1 otherwise
-   # bits 13-31 of r0 are 0
-        or r4, r4, r0           # correct bit number 12 of X
+        rlwinm r5, r4, 0, 21, 31 # extract bits 0 to 11 of X
+        addi r5, r5, 0x7FF      # r5 = (X & 0x7FF) + 0x7FF
+   # bit 12 of r5 is 0 if all low 12 bits of X are 0, 1 otherwise
+   # bits 13-31 of r5 are 0
+        or r4, r4, r5           # correct bit number 12 of X
         rlwinm r4, r4, 0, 0, 20 # set to 0 bits 0 to 11 of X
    # Convert to double, then round to single	
 1:      bl __compcert_i64_utod
diff --git a/runtime/powerpc64/i64_utof.s b/runtime/powerpc64/i64_utof.s
index cdb2f867..4a2a172b 100644
--- a/runtime/powerpc64/i64_utof.s
+++ b/runtime/powerpc64/i64_utof.s
@@ -48,11 +48,11 @@ __compcert_i64_utof:
    # X is large enough that double rounding can occur.
    # Avoid it by nudging X away from the points where double rounding
    # occurs (the "round to odd" technique)
-        rlwinm r0, r4, 0, 21, 31 # extract bits 0 to 11 of X
-        addi r0, r0, 0x7FF      # r0 = (X & 0x7FF) + 0x7FF
-   # bit 12 of r0 is 0 if all low 12 bits of X are 0, 1 otherwise
-   # bits 13-31 of r0 are 0
-        or r4, r4, r0           # correct bit number 12 of X
+        rlwinm r5, r4, 0, 21, 31 # extract bits 0 to 11 of X
+        addi r5, r5, 0x7FF      # r5 = (X & 0x7FF) + 0x7FF
+   # bit 12 of r5 is 0 if all low 12 bits of X are 0, 1 otherwise
+   # bits 13-31 of r5 are 0
+        or r4, r4, r5           # correct bit number 12 of X
         rlwinm r4, r4, 0, 0, 20 # set to 0 bits 0 to 11 of X
    # Convert to double, then round to single	
 1:      bl __compcert_i64_utod
diff --git a/test/Makefile b/test/Makefile
index 7efcd8f1..e9c5d6a1 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -4,7 +4,9 @@ include ../Makefile.config
 
 # Kalray note - removing compression, raytracer and spass that cannot be executed by the simulator in reasonable time
 ifeq ($(ARCH),mppa_k1c)
-	DIRS:=c regression
+	DIRS=c regression
+else
+	DIRS=c compression raytracer spass regression
 endif
 
 ifeq ($(CLIGHTGEN),true)
diff --git a/test/c/mandelbrot.c b/test/c/mandelbrot.c
index fb8b929c..548c3ffa 100644
--- a/test/c/mandelbrot.c
+++ b/test/c/mandelbrot.c
@@ -59,7 +59,6 @@ int main (int argc, char **argv)
 
             if(bit_num == 8)
             {
-                printf("%c", byte_acc);
                 putc(byte_acc,stdout);
 #ifdef __K1C__ // stdout isn't flushed enough when --syscall=libstd_scalls.so is passed to the simulator k1-cluster
                 fflush(stdout);
@@ -70,7 +69,6 @@ int main (int argc, char **argv)
             else if(x == w-1)
             {
                 byte_acc <<= (8-w%8);
-                printf("%c", byte_acc);
                 putc(byte_acc,stdout);
 #ifdef __K1C__ // stdout isn't flushed enough when --syscall=libstd_scalls.so is passed to the simulator k1-cluster
                 fflush(stdout);
diff --git a/test/cse2/globals.c b/test/cse2/globals.c
new file mode 100644
index 00000000..c6dd59cd
--- /dev/null
+++ b/test/cse2/globals.c
@@ -0,0 +1,8 @@
+int glob1, glob2;
+
+void toto() {
+  if (glob1 > 4) {
+    glob2 ++;
+    glob1 --;
+  }
+}
diff --git a/test/cse2/indexed_addr.c b/test/cse2/indexed_addr.c
new file mode 100644
index 00000000..30a7c571
--- /dev/null
+++ b/test/cse2/indexed_addr.c
@@ -0,0 +1,6 @@
+void foo(int *t) {
+  if (t[0] > 4) {
+    t[1] ++;
+    t[0] --;
+  }
+}
diff --git a/test/monniaux/clock.c b/test/monniaux/clock.c
index fb636667..4ec679f6 100644
--- a/test/monniaux/clock.c
+++ b/test/monniaux/clock.c
@@ -24,9 +24,9 @@ cycle_t get_current_cycle(void) {
 }
 
 void print_total_clock(void) {
-  printf("time cycles: %lu\n", total_clock);
+  printf("time cycles: %" PRcycle "\n", total_clock);
 }
 
 void printerr_total_clock(void) {
-  fprintf(stderr, "time cycles: %lu\n", total_clock);
+  fprintf(stderr, "time cycles: %" PRcycle "\n", total_clock);
 }
diff --git a/test/monniaux/cycles.h b/test/monniaux/cycles.h
index 21541145..c7dc582b 100644
--- a/test/monniaux/cycles.h
+++ b/test/monniaux/cycles.h
@@ -1,13 +1,11 @@
+#include <stdint.h>
 #include <inttypes.h>
 #include <stdio.h>
-typedef unsigned long cycle_t;
-
-#ifdef MAX_MEASURES
-  static cycle_t _last_stop[MAX_MEASURES] = {0};
-  static cycle_t _total_cycles[MAX_MEASURES] = {0};
-#endif
 
 #ifdef __K1C__
+typedef uint64_t cycle_t;
+#define PRcycle PRId64
+
 #include <../../k1-cos/include/hal/cos_registers.h>
 
 static inline void cycle_count_config(void)
@@ -27,18 +25,57 @@ static inline cycle_t get_cycle(void)
 #else // not K1C
 static inline void cycle_count_config(void) { }
 
-#ifdef  __x86_64__
+#if defined(__i386__) || defined( __x86_64__)
+#define PRcycle PRId64
+typedef uint64_t cycle_t;
 #include <x86intrin.h>
 static inline cycle_t get_cycle(void) { return __rdtsc(); }
 
 #elif __riscv
+#ifdef __riscv32
+#define PRcycle PRId32
+typedef uint32_t cycle_t;
+#else
+#define PRcycle PRId64
+typedef uint64_t cycle_t;
+#endif
 static inline cycle_t get_cycle(void) {
   cycle_t cycles;
   asm volatile ("rdcycle %0" : "=r" (cycles));
   return cycles;
 }
 
+#elif defined (__ARM_ARCH) && (__ARM_ARCH >= 6)
+#if (__ARM_ARCH < 8)
+typedef uint32_t cycle_t;
+#define PRcycle PRId32
+
+/* need this kernel module
+https://github.com/zertyz/MTL/tree/master/cpp/time/kernel/arm */
+static inline cycle_t get_cycle(void) {
+  cycle_t cycles;
+  __asm__ volatile ("mrc p15, 0, %0, c9, c13, 0":"=r" (cycles));
+  return cycles;
+}
 #else
+#define PRcycle PRId64
+typedef uint64_t cycle_t;
+/* need this kernel module:
+https://github.com/jerinjacobk/armv8_pmu_cycle_counter_el0
+
+on 5+ kernels, remove first argument of access_ok macro */
+
+static inline cycle_t get_cycle(void)
+{
+  uint64_t val;
+  __asm__ volatile("mrs %0, pmccntr_el0" : "=r"(val));
+  return val;
+}
+#endif
+
+#else
+#define PRcycle PRId32
+typedef uint32_t cycle_t;
 static inline cycle_t get_cycle(void) { return 0; }
 #endif
 #endif
@@ -48,3 +85,9 @@ static inline cycle_t get_cycle(void) { return 0; }
   #define TIMESTOP(i) {cycle_t cur = get_cycle(); _total_cycles[i] += cur - _last_stop[i]; _last_stop[i] = cur;}
   #define TIMEPRINT(n) { for (int i = 0; i <= n; i++) printf("%d cycles: %" PRIu64 "\n", i, _total_cycles[i]); }
 #endif
+
+
+#ifdef MAX_MEASURES
+  static cycle_t _last_stop[MAX_MEASURES] = {0};
+  static cycle_t _total_cycles[MAX_MEASURES] = {0};
+#endif
diff --git a/test/monniaux/quicksort/quicksort_run.c b/test/monniaux/quicksort/quicksort_run.c
index c35d0752..3c640b24 100644
--- a/test/monniaux/quicksort/quicksort_run.c
+++ b/test/monniaux/quicksort/quicksort_run.c
@@ -13,7 +13,7 @@ int main (void) {
   quicksort(vec, len);
   quicksort_time = get_cycle() - quicksort_time;
   printf("sorted=%s\n"
-	 "time cycles:%" PRIu64 "\n",
+	 "time cycles:%" PRcycle "\n",
 	 data_vec_is_sorted(vec, len)?"true":"false",
 	 quicksort_time);
   free(vec);
diff --git a/test/monniaux/yarpgen/Makefile b/test/monniaux/yarpgen/Makefile
index 9da82deb..28bd5ae0 100644
--- a/test/monniaux/yarpgen/Makefile
+++ b/test/monniaux/yarpgen/Makefile
@@ -1,52 +1,98 @@
-YARPGEN=yarpgen
-MAX=300
+TARGET_CCOMP=../../../ccomp
+TARGET_CC=gcc
+
+all:
+
+.SECONDARY:
+
+ifndef YARPGEN
+YARPGEN=./yarpgen
+GENERATOR=yarpgen
+endif
+
+ifdef BITS
+YARPGEN+=-m $(BITS)
+CFLAGS+=-m$(BITS)
+endif
+
+MAX=129
 PREFIX=ran%06.f
-include ../rules.mk
-
-K1C_CCOMPFLAGS += -funprototyped -fbitfields
-CCOMPFLAGS += -funprototyped -fbitfields
-
-TARGETS_C=$(shell seq --format $(PREFIX)/func.c 0 $(MAX)) \
-	$(shell seq --format $(PREFIX)/driver.c 0 $(MAX)) \
-	$(shell seq --format $(PREFIX)/init.h 0 $(MAX))
-TARGETS_CCOMP_K1C_S=$(shell seq --format $(PREFIX)/func.ccomp.k1c.s 0 $(MAX)) \
-	$(shell seq --format $(PREFIX)/driver.ccomp.k1c.s 0 $(MAX))
-TARGETS_GCC_K1C_S=$(shell seq --format $(PREFIX)/func.gcc.k1c.s 0 $(MAX)) \
-	$(shell seq --format $(PREFIX)/driver.gcc.k1c.s 0 $(MAX))
-TARGETS_CCOMP_HOST_S=$(shell seq --format $(PREFIX)/func.ccomp.host.s 0 $(MAX)) \
-	$(shell seq --format $(PREFIX)/driver.ccomp.host.s 0 $(MAX))
-TARGETS_GCC_HOST_S=$(shell seq --format $(PREFIX)/func.gcc.host.s 0 $(MAX)) \
-	$(shell seq --format $(PREFIX)/driver.gcc.host.s 0 $(MAX))
-TARGETS_CCOMP_K1C_OUT=$(shell seq --format $(PREFIX)/example.ccomp.k1c.out 0 $(MAX))
-TARGETS_GCC_K1C_OUT=$(shell seq --format $(PREFIX)/example.gcc.k1c.out 0 $(MAX))
-TARGETS_GCC_HOST_OUT=$(shell seq --format $(PREFIX)/example.gcc.host.out 0 $(MAX))
-TARGETS_CCOMP_HOST_OUT=$(shell seq --format $(PREFIX)/example.ccomp.host.out 0 $(MAX))
-TARGETS_CMP=$(shell seq --format $(PREFIX)/example.k1c.cmp 0 $(MAX))
-
-all:	$(TARGETS_CCOMP_K1C_OUT) $(TARGETS_GCC_K1C_OUT) $(TARGETS_GCC_HOST_OUT) $(TARGETS_CCOMP_HOST_OUT) $(TARGETS_CCOMP_K1C_S) $(TARGETS_GCC_K1C_S) $(TARGETS_GCC_HOST_S) $(TARGETS_CCOMP_HOST_S) $(TARGETS_CMP) $(TARGETS_C)
-
-ran%/func.ccomp.k1c.s ran%/func.gcc.k1c.s ran%/func.ccomp.host.s ran%/func.gcc.host.s : ran%/init.h
-
-ran%/example.ccomp.k1c: ran%/func.ccomp.k1c.o ran%/driver.ccomp.k1c.o 
-	$(K1C_CCOMP) $(K1C_CCOMPFLAGS) $+ -o $@
-
-ran%/example.gcc.k1c: ran%/func.gcc.k1c.o ran%/driver.gcc.k1c.o 
-	$(K1C_CC) $(K1C_CFLAGS) $+ -o $@
-
-ran%/example.gcc.host: ran%/func.gcc.host.o ran%/driver.gcc.host.o 
-	$(CC) $(CFLAGS) $+ -o $@
 
-ran%/example.ccomp.host: ran%/func.ccomp.host.o ran%/driver.ccomp.host.o 
-	$(CCOMP) $(CCOMPFLAGS) $+ -o $@
+CCOMPOPTS=-static
+CCOMPFLAGS+=-funprototyped -fbitfields -fno-cse2 -stdlib ../../../runtime # FIXME
+
+TESTS_C=$(shell seq --format $(PREFIX)/func.c 1 $(MAX)) \
+	$(shell seq --format $(PREFIX)/driver.c 1 $(MAX)) \
+	$(shell seq --format $(PREFIX)/init.c 1 $(MAX)) \
+	$(shell seq --format $(PREFIX)/hash.c 1 $(MAX)) \
+	$(shell seq --format $(PREFIX)/check.c 1 $(MAX)) \
+	$(shell seq --format $(PREFIX)/init.h 1 $(MAX))
+
+$(TESTS_C): $(GENERATOR)
+
+TESTS_CCOMP_TARGET_S=$(TEST_C:.c=.ccomp.target.s)
+TESTS_GCC_TARGET_S=$(TEST_C:.c=.gcc.target.s)
+TESTS_GCC_HOST_S=$(TEST_C:.c=.gcc.host.s)
+TESTS_CCOMP_TARGET_OUT=$(shell seq --format $(PREFIX)/example.ccomp.target.out 1 $(MAX))
+TESTS_GCC_TARGET_OUT=$(shell seq --format $(PREFIX)/example.gcc.target.out 1 $(MAX))
+TESTS_GCC_HOST_OUT=$(shell seq --format $(PREFIX)/example.gcc.host.out 1 $(MAX))
+TESTS_CMP=$(shell seq --format $(PREFIX)/example.target.cmp 1 $(MAX)) # $(shell seq --format $(PREFIX)/example.host_target.cmp 1 $(MAX))
+
+all:	$(TESTS_CCOMP_TARGET_OUT) $(TESTS_GCC_TARGET_OUT) $(TESTS_CCOMP_TARGET_S) $(TESTS_GCC_TARGET_S) $(TESTS_CMP) $(TESTS_C)
+
+tests_c: $(TESTS_C)
+
+tests_s: $(TESTS_CCOMP_TARGET_S)
+
+%.ccomp.target.s : %.c
+	$(TARGET_CCOMP) $(CCOMPOPTS) $(CCOMPFLAGS) -S -o $@ $<
+
+%.gcc.target.s : %.c
+	$(TARGET_CC) $(CCOMPOPTS) -S -o $@ $<
 
-ran%/driver.c ran%/func.c ran%/init.h:
-	-mkdir ran$*
+%.gcc.host.s : %.c
+	$(CC) $(CFLAGS) -S -o $@ $<
+
+%.target.o : %.target.s
+	$(TARGET_CC) -c -o $@ $<
+
+%.host.o : %.host.s
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+%.target.out : %.target
+	$(EXECUTE) $< | tee $@
+
+%.host.out : %.host
+	./$< | tee $@
+
+ran%/func.ccomp.target.s ran%/func.gcc.target.s ran%/func.ccomp.host.s ran%/func.gcc.host.s ran%/init.gcc.host.s : ran%/init.h
+
+ran%/example.ccomp.target: ran%/func.ccomp.target.o ran%/driver.ccomp.target.o  ran%/init.ccomp.target.o ran%/check.ccomp.target.o ran%/hash.ccomp.target.o
+	$(TARGET_CCOMP) $(CCOMPOPTS) $(CCOMPFLAGS) $+ -o $@
+
+ran%/example.gcc.target: ran%/func.gcc.target.o ran%/driver.gcc.target.o ran%/init.gcc.target.o ran%/check.gcc.target.o ran%/hash.gcc.target.o
+	$(TARGET_CC) $(TARGET_CFLAGS) $+ -o $@
+
+ran%/example.gcc.host: ran%/func.gcc.host.o ran%/driver.gcc.host.o ran%/init.gcc.host.o ran%/check.gcc.host.o ran%/hash.gcc.host.o
+	$(CC) $(CFLAGS) $+ -o $@
+
+ran%/driver.c ran%/func.c ran%/init.c ran%/check.c ran%/hash.c ran%/init.h:
+	mkdir -p ran$*
 	$(YARPGEN) --seed=$* --out-dir=ran$*/ --std=c99
 
-ran%/example.k1c.cmp : ran%/example.gcc.k1c.out ran%/example.ccomp.k1c.out
+ran%/example.target.cmp : ran%/example.gcc.target.out ran%/example.ccomp.target.out
 	cmp $+ > $@
 
-.PHONY: all clean
+ran%/example.host_target.cmp : ran%/example.gcc.host.out ran%/example.ccomp.target.out
+	cmp $+ > $@
+
+yarpgen:
+	curl -L -o yarpgen_v1.1.tar.gz https://github.com/intel/yarpgen/archive/v1.1.tar.gz
+	tar xfz yarpgen_v1.1.tar.gz
+	$(MAKE) CXX=g++ -C yarpgen-1.1
+	cp yarpgen-1.1/yarpgen $@
+
+.PHONY: all clean tests_c tests_c
 
 clean:
 	-rm -rf ran*
diff --git a/test/monniaux/yarpgen/Makefile.old b/test/monniaux/yarpgen/Makefile.old
new file mode 100644
index 00000000..9da82deb
--- /dev/null
+++ b/test/monniaux/yarpgen/Makefile.old
@@ -0,0 +1,52 @@
+YARPGEN=yarpgen
+MAX=300
+PREFIX=ran%06.f
+include ../rules.mk
+
+K1C_CCOMPFLAGS += -funprototyped -fbitfields
+CCOMPFLAGS += -funprototyped -fbitfields
+
+TARGETS_C=$(shell seq --format $(PREFIX)/func.c 0 $(MAX)) \
+	$(shell seq --format $(PREFIX)/driver.c 0 $(MAX)) \
+	$(shell seq --format $(PREFIX)/init.h 0 $(MAX))
+TARGETS_CCOMP_K1C_S=$(shell seq --format $(PREFIX)/func.ccomp.k1c.s 0 $(MAX)) \
+	$(shell seq --format $(PREFIX)/driver.ccomp.k1c.s 0 $(MAX))
+TARGETS_GCC_K1C_S=$(shell seq --format $(PREFIX)/func.gcc.k1c.s 0 $(MAX)) \
+	$(shell seq --format $(PREFIX)/driver.gcc.k1c.s 0 $(MAX))
+TARGETS_CCOMP_HOST_S=$(shell seq --format $(PREFIX)/func.ccomp.host.s 0 $(MAX)) \
+	$(shell seq --format $(PREFIX)/driver.ccomp.host.s 0 $(MAX))
+TARGETS_GCC_HOST_S=$(shell seq --format $(PREFIX)/func.gcc.host.s 0 $(MAX)) \
+	$(shell seq --format $(PREFIX)/driver.gcc.host.s 0 $(MAX))
+TARGETS_CCOMP_K1C_OUT=$(shell seq --format $(PREFIX)/example.ccomp.k1c.out 0 $(MAX))
+TARGETS_GCC_K1C_OUT=$(shell seq --format $(PREFIX)/example.gcc.k1c.out 0 $(MAX))
+TARGETS_GCC_HOST_OUT=$(shell seq --format $(PREFIX)/example.gcc.host.out 0 $(MAX))
+TARGETS_CCOMP_HOST_OUT=$(shell seq --format $(PREFIX)/example.ccomp.host.out 0 $(MAX))
+TARGETS_CMP=$(shell seq --format $(PREFIX)/example.k1c.cmp 0 $(MAX))
+
+all:	$(TARGETS_CCOMP_K1C_OUT) $(TARGETS_GCC_K1C_OUT) $(TARGETS_GCC_HOST_OUT) $(TARGETS_CCOMP_HOST_OUT) $(TARGETS_CCOMP_K1C_S) $(TARGETS_GCC_K1C_S) $(TARGETS_GCC_HOST_S) $(TARGETS_CCOMP_HOST_S) $(TARGETS_CMP) $(TARGETS_C)
+
+ran%/func.ccomp.k1c.s ran%/func.gcc.k1c.s ran%/func.ccomp.host.s ran%/func.gcc.host.s : ran%/init.h
+
+ran%/example.ccomp.k1c: ran%/func.ccomp.k1c.o ran%/driver.ccomp.k1c.o 
+	$(K1C_CCOMP) $(K1C_CCOMPFLAGS) $+ -o $@
+
+ran%/example.gcc.k1c: ran%/func.gcc.k1c.o ran%/driver.gcc.k1c.o 
+	$(K1C_CC) $(K1C_CFLAGS) $+ -o $@
+
+ran%/example.gcc.host: ran%/func.gcc.host.o ran%/driver.gcc.host.o 
+	$(CC) $(CFLAGS) $+ -o $@
+
+ran%/example.ccomp.host: ran%/func.ccomp.host.o ran%/driver.ccomp.host.o 
+	$(CCOMP) $(CCOMPFLAGS) $+ -o $@
+
+ran%/driver.c ran%/func.c ran%/init.h:
+	-mkdir ran$*
+	$(YARPGEN) --seed=$* --out-dir=ran$*/ --std=c99
+
+ran%/example.k1c.cmp : ran%/example.gcc.k1c.out ran%/example.ccomp.k1c.out
+	cmp $+ > $@
+
+.PHONY: all clean
+
+clean:
+	-rm -rf ran*
diff --git a/test/regression/Makefile b/test/regression/Makefile
index 3447d6a5..97c25f6c 100644
--- a/test/regression/Makefile
+++ b/test/regression/Makefile
@@ -35,9 +35,9 @@ endif
 # but produce processor-dependent results, so no reference output in Results
 
 TESTS_DIFF=NaNs
-ifeq ($(ARCH),mppa_k1c)
+# FIXME ifeq ($(ARCH),mppa_k1c)
 	TESTS_DIFF:=$(filter-out NaNs,$(TESTS_DIFF))
-endif
+# endif
 
 # Other tests: should compile to .s without errors (but expect warnings)
 
diff --git a/test/regression/Results/int64 b/test/regression/Results/int64
index af444cf6..ae8a3cc1 100644
--- a/test/regression/Results/int64
+++ b/test/regression/Results/int64
@@ -335,6 +335,48 @@ utof x = 0
 stof x = 0
 
 x = 0
+y = 52ce6b4000000063
+-x = 0
+x + y = 52ce6b4000000063
+x - y = ad3194bfffffff9d
+x * y = 0
+x /u y = 0
+x %u y = 0
+x /s y = 0
+x %s y = 0
+x /u y2 = 0
+x %u y2 = 0
+x /s y3 = 0
+x %s y3 = 0
+x /u 3 = 0
+x %u 3 = 0
+x /s 3 = 0
+x %s 3 = 0
+x /u 5 = 0
+x %u 5 = 0
+x /s 5 = 0
+x %s 5 = 0
+x /u 11 = 0
+x %u 11 = 0
+x /s 11 = 0
+x %s 11 = 0
+~x = ffffffffffffffff
+x & y = 0
+x | y = 52ce6b4000000063
+x ^ y = 52ce6b4000000063
+x << i = 0
+x >>u i = 0
+x >>s i = 0
+x cmpu y = lt
+x cmps y = lt
+utod x = 0
+dtou f = 0
+stod x = 0
+dtos f = 0
+utof x = 0
+stof x = 0
+
+x = 0
 y = 14057b7ef767814f
 -x = 0
 x + y = 14057b7ef767814f
@@ -755,6 +797,48 @@ utof x = 3f800000
 stof x = 3f800000
 
 x = 1
+y = 52ce6b4000000063
+-x = ffffffffffffffff
+x + y = 52ce6b4000000064
+x - y = ad3194bfffffff9e
+x * y = 52ce6b4000000063
+x /u y = 0
+x %u y = 1
+x /s y = 0
+x %s y = 1
+x /u y2 = 0
+x %u y2 = 1
+x /s y3 = 0
+x %s y3 = 1
+x /u 3 = 0
+x %u 3 = 1
+x /s 3 = 0
+x %s 3 = 1
+x /u 5 = 0
+x %u 5 = 1
+x /s 5 = 0
+x %s 5 = 1
+x /u 11 = 0
+x %u 11 = 1
+x /s 11 = 0
+x %s 11 = 1
+~x = fffffffffffffffe
+x & y = 1
+x | y = 52ce6b4000000063
+x ^ y = 52ce6b4000000062
+x << i = 800000000
+x >>u i = 0
+x >>s i = 0
+x cmpu y = lt
+x cmps y = lt
+utod x = 3ff0000000000000
+dtou f = 0
+stod x = 3ff0000000000000
+dtos f = 0
+utof x = 3f800000
+stof x = 3f800000
+
+x = 1
 y = 9af678222e728119
 -x = ffffffffffffffff
 x + y = 9af678222e72811a
@@ -1175,6 +1259,48 @@ utof x = 5f800000
 stof x = bf800000
 
 x = ffffffffffffffff
+y = 52ce6b4000000063
+-x = 1
+x + y = 52ce6b4000000062
+x - y = ad3194bfffffff9c
+x * y = ad3194bfffffff9d
+x /u y = 3
+x %u y = 794be3ffffffed6
+x /s y = 0
+x %s y = ffffffffffffffff
+x /u y2 = 3176fe836
+x %u y2 = 3683607f
+x /s y3 = 0
+x %s y3 = ffffffffffffffff
+x /u 3 = 5555555555555555
+x %u 3 = 0
+x /s 3 = 0
+x %s 3 = ffffffffffffffff
+x /u 5 = 3333333333333333
+x %u 5 = 0
+x /s 5 = 0
+x %s 5 = ffffffffffffffff
+x /u 11 = 1745d1745d1745d1
+x %u 11 = 4
+x /s 11 = 0
+x %s 11 = ffffffffffffffff
+~x = 0
+x & y = 52ce6b4000000063
+x | y = ffffffffffffffff
+x ^ y = ad3194bfffffff9c
+x << i = fffffff800000000
+x >>u i = 1fffffff
+x >>s i = ffffffffffffffff
+x cmpu y = gt
+x cmps y = lt
+utod x = 43f0000000000000
+dtou f = 68db8bac710cb
+stod x = bff0000000000000
+dtos f = 0
+utof x = 5f800000
+stof x = bf800000
+
+x = ffffffffffffffff
 y = 62354cda6226d1f3
 -x = 1
 x + y = 62354cda6226d1f2
@@ -1595,6 +1721,48 @@ utof x = 4f000000
 stof x = 4f000000
 
 x = 7fffffff
+y = 52ce6b4000000063
+-x = ffffffff80000001
+x + y = 52ce6b4080000062
+x - y = ad3194c07fffff9c
+x * y = ad3194f17fffff9d
+x /u y = 0
+x %u y = 7fffffff
+x /s y = 0
+x %s y = 7fffffff
+x /u y2 = 1
+x %u y2 = 2d3194bf
+x /s y3 = 1
+x %s y3 = 2d3194bf
+x /u 3 = 2aaaaaaa
+x %u 3 = 1
+x /s 3 = 2aaaaaaa
+x %s 3 = 1
+x /u 5 = 19999999
+x %u 5 = 2
+x /s 5 = 19999999
+x %s 5 = 2
+x /u 11 = ba2e8ba
+x %u 11 = 1
+x /s 11 = ba2e8ba
+x %s 11 = 1
+~x = ffffffff80000000
+x & y = 63
+x | y = 52ce6b407fffffff
+x ^ y = 52ce6b407fffff9c
+x << i = fffffff800000000
+x >>u i = 0
+x >>s i = 0
+x cmpu y = lt
+x cmps y = lt
+utod x = 41dfffffffc00000
+dtou f = 346dc
+stod x = 41dfffffffc00000
+dtos f = 346dc
+utof x = 4f000000
+stof x = 4f000000
+
+x = 7fffffff
 y = 144093704fadba5d
 -x = ffffffff80000001
 x + y = 14409370cfadba5c
@@ -2015,6 +2183,48 @@ utof x = 4f000000
 stof x = 4f000000
 
 x = 80000000
+y = 52ce6b4000000063
+-x = ffffffff80000000
+x + y = 52ce6b4080000063
+x - y = ad3194c07fffff9d
+x * y = 3180000000
+x /u y = 0
+x %u y = 80000000
+x /s y = 0
+x %s y = 80000000
+x /u y2 = 1
+x %u y2 = 2d3194c0
+x /s y3 = 1
+x %s y3 = 2d3194c0
+x /u 3 = 2aaaaaaa
+x %u 3 = 2
+x /s 3 = 2aaaaaaa
+x %s 3 = 2
+x /u 5 = 19999999
+x %u 5 = 3
+x /s 5 = 19999999
+x %s 5 = 3
+x /u 11 = ba2e8ba
+x %u 11 = 2
+x /s 11 = ba2e8ba
+x %s 11 = 2
+~x = ffffffff7fffffff
+x & y = 0
+x | y = 52ce6b4080000063
+x ^ y = 52ce6b4080000063
+x << i = 0
+x >>u i = 0
+x >>s i = 0
+x cmpu y = lt
+x cmps y = lt
+utod x = 41e0000000000000
+dtou f = 346dc
+stod x = 41e0000000000000
+dtos f = 346dc
+utof x = 4f000000
+stof x = 4f000000
+
+x = 80000000
 y = 7b985bc1e7bce4d7
 -x = ffffffff80000000
 x + y = 7b985bc267bce4d7
@@ -2435,6 +2645,48 @@ utof x = 5f000000
 stof x = 5f000000
 
 x = 7fffffffffffffff
+y = 52ce6b4000000063
+-x = 8000000000000001
+x + y = d2ce6b4000000062
+x - y = 2d3194bfffffff9c
+x * y = 2d3194bfffffff9d
+x /u y = 1
+x %u y = 2d3194bfffffff9c
+x /s y = 1
+x %s y = 2d3194bfffffff9c
+x /u y2 = 18bb7f41b
+x %u y2 = 1b41b03f
+x /s y3 = 18bb7f41b
+x %s y3 = 1b41b03f
+x /u 3 = 2aaaaaaaaaaaaaaa
+x %u 3 = 1
+x /s 3 = 2aaaaaaaaaaaaaaa
+x %s 3 = 1
+x /u 5 = 1999999999999999
+x %u 5 = 2
+x /s 5 = 1999999999999999
+x %s 5 = 2
+x /u 11 = ba2e8ba2e8ba2e8
+x %u 11 = 7
+x /s 11 = ba2e8ba2e8ba2e8
+x %s 11 = 7
+~x = 8000000000000000
+x & y = 52ce6b4000000063
+x | y = 7fffffffffffffff
+x ^ y = 2d3194bfffffff9c
+x << i = fffffff800000000
+x >>u i = fffffff
+x >>s i = fffffff
+x cmpu y = gt
+x cmps y = gt
+utod x = 43e0000000000000
+dtou f = 346dc5d638865
+stod x = 43e0000000000000
+dtos f = 346dc5d638865
+utof x = 5f000000
+stof x = 5f000000
+
+x = 7fffffffffffffff
 y = a220229ec164ffe1
 -x = 8000000000000001
 x + y = 2220229ec164ffe0
@@ -2855,6 +3107,48 @@ utof x = 5f000000
 stof x = df000000
 
 x = 8000000000000000
+y = 52ce6b4000000063
+-x = 8000000000000000
+x + y = d2ce6b4000000063
+x - y = 2d3194bfffffff9d
+x * y = 8000000000000000
+x /u y = 1
+x %u y = 2d3194bfffffff9d
+x /s y = ffffffffffffffff
+x %s y = d2ce6b4000000063
+x /u y2 = 18bb7f41b
+x %u y2 = 1b41b040
+x /s y3 = fffffffe74480be5
+x %s y3 = ffffffffe4be4fc0
+x /u 3 = 2aaaaaaaaaaaaaaa
+x %u 3 = 2
+x /s 3 = d555555555555556
+x %s 3 = fffffffffffffffe
+x /u 5 = 1999999999999999
+x %u 5 = 3
+x /s 5 = e666666666666667
+x %s 5 = fffffffffffffffd
+x /u 11 = ba2e8ba2e8ba2e8
+x %u 11 = 8
+x /s 11 = f45d1745d1745d18
+x %s 11 = fffffffffffffff8
+~x = 7fffffffffffffff
+x & y = 0
+x | y = d2ce6b4000000063
+x ^ y = d2ce6b4000000063
+x << i = 0
+x >>u i = 10000000
+x >>s i = fffffffff0000000
+x cmpu y = gt
+x cmps y = lt
+utod x = 43e0000000000000
+dtou f = 346dc5d638865
+stod x = c3e0000000000000
+dtos f = fffcb923a29c779b
+utof x = 5f000000
+stof x = df000000
+
+x = 8000000000000000
 y = c73aa0d9a415dfb
 -x = 8000000000000000
 x + y = 8c73aa0d9a415dfb
@@ -3275,6 +3569,48 @@ utof x = 4f800000
 stof x = 4f800000
 
 x = 100000003
+y = 52ce6b4000000063
+-x = fffffffefffffffd
+x + y = 52ce6b4100000066
+x - y = ad3194c0ffffffa0
+x * y = f86b422300000129
+x /u y = 0
+x %u y = 100000003
+x /s y = 0
+x %s y = 100000003
+x /u y2 = 3
+x %u y2 = 794be43
+x /s y3 = 3
+x %s y3 = 794be43
+x /u 3 = 55555556
+x %u 3 = 1
+x /s 3 = 55555556
+x %s 3 = 1
+x /u 5 = 33333333
+x %u 5 = 4
+x /s 5 = 33333333
+x %s 5 = 4
+x /u 11 = 1745d174
+x %u 11 = 7
+x /s 11 = 1745d174
+x %s 11 = 7
+~x = fffffffefffffffc
+x & y = 3
+x | y = 52ce6b4100000063
+x ^ y = 52ce6b4100000060
+x << i = 1800000000
+x >>u i = 0
+x >>s i = 0
+x cmpu y = lt
+x cmps y = lt
+utod x = 41f0000000300000
+dtou f = 68db8
+stod x = 41f0000000300000
+dtos f = 68db8
+utof x = 4f800000
+stof x = 4f800000
+
+x = 100000003
 y = e9bcd26890f095a5
 -x = fffffffefffffffd
 x + y = e9bcd26990f095a8
@@ -3358,47 +3694,467 @@ dtos f = 14bb101261e18
 utof x = 5e4a72c9
 stof x = 5e4a72c9
 
-x = 8362aa9340fe215f
-y = f986342416ec8002
--x = 7c9d556cbf01dea1
-x + y = 7ce8deb757eaa161
-x - y = 89dc766f2a11a15d
-x * y = e4a2b426803fc2be
+x = 52ce6b4000000063
+y = 0
+-x = ad3194bfffffff9d
+x + y = 52ce6b4000000063
+x - y = 52ce6b4000000063
+x * y = 0
 x /u y = 0
-x %u y = 8362aa9340fe215f
-x /s y = 13
-x %s y = fe6ccbe58d70a139
-x /u y2 = 86cb918b
-x %u y2 = 910b6dd3
-x /s y3 = 133e437097
-x %s y3 = fffffffffe99a023
-x /u 3 = 2bcb8e3115aa0b1f
-x %u 3 = 2
-x /s 3 = d67638dbc054b5cb
-x %s 3 = fffffffffffffffe
-x /u 5 = 1a46eeea4032d379
-x %u 5 = 2
-x /s 5 = e713bbb70cffa047
-x %s 5 = fffffffffffffffc
-x /u 11 = bf1b26a7a45a5f1
-x %u 11 = 4
-x /s 11 = f4abe0f61d2e6020
-x %s 11 = ffffffffffffffff
-~x = 7c9d556cbf01dea0
-x & y = 8102200000ec0002
-x | y = fbe6beb756fea15f
-x ^ y = 7ae49eb75612a15d
-x << i = d8aaa4d03f8857c
-x >>u i = 20d8aaa4d03f8857
-x >>s i = e0d8aaa4d03f8857
+x %u y = 0
+x /s y = 0
+x %s y = 0
+x /u y2 = 0
+x %u y2 = 0
+x /s y3 = 0
+x %s y3 = 0
+x /u 3 = 1b9a23c000000021
+x %u 3 = 0
+x /s 3 = 1b9a23c000000021
+x %s 3 = 0
+x /u 5 = 108faf0ccccccce0
+x %u 5 = 3
+x /s 5 = 108faf0ccccccce0
+x %s 5 = 3
+x /u 11 = 7872105d1745d20
+x %u 11 = 3
+x /s 11 = 7872105d1745d20
+x %s 11 = 3
+~x = ad3194bfffffff9c
+x & y = 0
+x | y = 52ce6b4000000063
+x ^ y = 52ce6b4000000063
+x << i = 52ce6b4000000063
+x >>u i = 52ce6b4000000063
+x >>s i = 52ce6b4000000063
+x cmpu y = gt
+x cmps y = gt
+utod x = 43d4b39ad0000000
+dtou f = 21eadf559b3d0
+stod x = 43d4b39ad0000000
+dtos f = 21eadf559b3d0
+utof x = 5ea59cd7
+stof x = 5ea59cd7
+
+x = 52ce6b4000000063
+y = 1
+-x = ad3194bfffffff9d
+x + y = 52ce6b4000000064
+x - y = 52ce6b4000000062
+x * y = 52ce6b4000000063
+x /u y = 52ce6b4000000063
+x %u y = 0
+x /s y = 52ce6b4000000063
+x %s y = 0
+x /u y2 = 0
+x %u y2 = 0
+x /s y3 = 0
+x %s y3 = 0
+x /u 3 = 1b9a23c000000021
+x %u 3 = 0
+x /s 3 = 1b9a23c000000021
+x %s 3 = 0
+x /u 5 = 108faf0ccccccce0
+x %u 5 = 3
+x /s 5 = 108faf0ccccccce0
+x %s 5 = 3
+x /u 11 = 7872105d1745d20
+x %u 11 = 3
+x /s 11 = 7872105d1745d20
+x %s 11 = 3
+~x = ad3194bfffffff9c
+x & y = 1
+x | y = 52ce6b4000000063
+x ^ y = 52ce6b4000000062
+x << i = a59cd680000000c6
+x >>u i = 296735a000000031
+x >>s i = 296735a000000031
+x cmpu y = gt
+x cmps y = gt
+utod x = 43d4b39ad0000000
+dtou f = 21eadf559b3d0
+stod x = 43d4b39ad0000000
+dtos f = 21eadf559b3d0
+utof x = 5ea59cd7
+stof x = 5ea59cd7
+
+x = 52ce6b4000000063
+y = ffffffffffffffff
+-x = ad3194bfffffff9d
+x + y = 52ce6b4000000062
+x - y = 52ce6b4000000064
+x * y = ad3194bfffffff9d
+x /u y = 0
+x %u y = 52ce6b4000000063
+x /s y = ad3194bfffffff9d
+x %s y = 0
+x /u y2 = 52ce6b40
+x %u y2 = 52ce6ba3
+x /s y3 = ad3194bfffffff9d
+x %s y3 = 0
+x /u 3 = 1b9a23c000000021
+x %u 3 = 0
+x /s 3 = 1b9a23c000000021
+x %s 3 = 0
+x /u 5 = 108faf0ccccccce0
+x %u 5 = 3
+x /s 5 = 108faf0ccccccce0
+x %s 5 = 3
+x /u 11 = 7872105d1745d20
+x %u 11 = 3
+x /s 11 = 7872105d1745d20
+x %s 11 = 3
+~x = ad3194bfffffff9c
+x & y = 52ce6b4000000063
+x | y = ffffffffffffffff
+x ^ y = ad3194bfffffff9c
+x << i = 8000000000000000
+x >>u i = 0
+x >>s i = 0
+x cmpu y = lt
+x cmps y = gt
+utod x = 43d4b39ad0000000
+dtou f = 21eadf559b3d0
+stod x = 43d4b39ad0000000
+dtos f = 21eadf559b3d0
+utof x = 5ea59cd7
+stof x = 5ea59cd7
+
+x = 52ce6b4000000063
+y = 7fffffff
+-x = ad3194bfffffff9d
+x + y = 52ce6b4080000062
+x - y = 52ce6b3f80000064
+x * y = ad3194f17fffff9d
+x /u y = a59cd681
+x %u y = 259cd6e4
+x /s y = a59cd681
+x %s y = 259cd6e4
+x /u y2 = 0
+x %u y2 = 0
+x /s y3 = 0
+x %s y3 = 0
+x /u 3 = 1b9a23c000000021
+x %u 3 = 0
+x /s 3 = 1b9a23c000000021
+x %s 3 = 0
+x /u 5 = 108faf0ccccccce0
+x %u 5 = 3
+x /s 5 = 108faf0ccccccce0
+x %s 5 = 3
+x /u 11 = 7872105d1745d20
+x %u 11 = 3
+x /s 11 = 7872105d1745d20
+x %s 11 = 3
+~x = ad3194bfffffff9c
+x & y = 63
+x | y = 52ce6b407fffffff
+x ^ y = 52ce6b407fffff9c
+x << i = 8000000000000000
+x >>u i = 0
+x >>s i = 0
+x cmpu y = gt
+x cmps y = gt
+utod x = 43d4b39ad0000000
+dtou f = 21eadf559b3d0
+stod x = 43d4b39ad0000000
+dtos f = 21eadf559b3d0
+utof x = 5ea59cd7
+stof x = 5ea59cd7
+
+x = 52ce6b4000000063
+y = 80000000
+-x = ad3194bfffffff9d
+x + y = 52ce6b4080000063
+x - y = 52ce6b3f80000063
+x * y = 3180000000
+x /u y = a59cd680
+x %u y = 63
+x /s y = a59cd680
+x %s y = 63
+x /u y2 = 0
+x %u y2 = 0
+x /s y3 = 0
+x %s y3 = 0
+x /u 3 = 1b9a23c000000021
+x %u 3 = 0
+x /s 3 = 1b9a23c000000021
+x %s 3 = 0
+x /u 5 = 108faf0ccccccce0
+x %u 5 = 3
+x /s 5 = 108faf0ccccccce0
+x %s 5 = 3
+x /u 11 = 7872105d1745d20
+x %u 11 = 3
+x /s 11 = 7872105d1745d20
+x %s 11 = 3
+~x = ad3194bfffffff9c
+x & y = 0
+x | y = 52ce6b4080000063
+x ^ y = 52ce6b4080000063
+x << i = 52ce6b4000000063
+x >>u i = 52ce6b4000000063
+x >>s i = 52ce6b4000000063
+x cmpu y = gt
+x cmps y = gt
+utod x = 43d4b39ad0000000
+dtou f = 21eadf559b3d0
+stod x = 43d4b39ad0000000
+dtos f = 21eadf559b3d0
+utof x = 5ea59cd7
+stof x = 5ea59cd7
+
+x = 52ce6b4000000063
+y = 7fffffffffffffff
+-x = ad3194bfffffff9d
+x + y = d2ce6b4000000062
+x - y = d2ce6b4000000064
+x * y = 2d3194bfffffff9d
+x /u y = 0
+x %u y = 52ce6b4000000063
+x /s y = 0
+x %s y = 52ce6b4000000063
+x /u y2 = a59cd681
+x %u y2 = 259cd6e4
+x /s y3 = a59cd681
+x %s y3 = 259cd6e4
+x /u 3 = 1b9a23c000000021
+x %u 3 = 0
+x /s 3 = 1b9a23c000000021
+x %s 3 = 0
+x /u 5 = 108faf0ccccccce0
+x %u 5 = 3
+x /s 5 = 108faf0ccccccce0
+x %s 5 = 3
+x /u 11 = 7872105d1745d20
+x %u 11 = 3
+x /s 11 = 7872105d1745d20
+x %s 11 = 3
+~x = ad3194bfffffff9c
+x & y = 52ce6b4000000063
+x | y = 7fffffffffffffff
+x ^ y = 2d3194bfffffff9c
+x << i = 8000000000000000
+x >>u i = 0
+x >>s i = 0
 x cmpu y = lt
 x cmps y = lt
-utod x = 43e06c5552681fc4
-dtou f = 35d0c262d14d7
-stod x = c3df27555b2fc078
-dtos f = fffccf536b66040d
-utof x = 5f0362ab
-stof x = def93aab
+utod x = 43d4b39ad0000000
+dtou f = 21eadf559b3d0
+stod x = 43d4b39ad0000000
+dtos f = 21eadf559b3d0
+utof x = 5ea59cd7
+stof x = 5ea59cd7
+
+x = 52ce6b4000000063
+y = 8000000000000000
+-x = ad3194bfffffff9d
+x + y = d2ce6b4000000063
+x - y = d2ce6b4000000063
+x * y = 8000000000000000
+x /u y = 0
+x %u y = 52ce6b4000000063
+x /s y = 0
+x %s y = 52ce6b4000000063
+x /u y2 = a59cd680
+x %u y2 = 63
+x /s y3 = ffffffff5a632980
+x %s y3 = 63
+x /u 3 = 1b9a23c000000021
+x %u 3 = 0
+x /s 3 = 1b9a23c000000021
+x %s 3 = 0
+x /u 5 = 108faf0ccccccce0
+x %u 5 = 3
+x /s 5 = 108faf0ccccccce0
+x %s 5 = 3
+x /u 11 = 7872105d1745d20
+x %u 11 = 3
+x /s 11 = 7872105d1745d20
+x %s 11 = 3
+~x = ad3194bfffffff9c
+x & y = 0
+x | y = d2ce6b4000000063
+x ^ y = d2ce6b4000000063
+x << i = 52ce6b4000000063
+x >>u i = 52ce6b4000000063
+x >>s i = 52ce6b4000000063
+x cmpu y = lt
+x cmps y = gt
+utod x = 43d4b39ad0000000
+dtou f = 21eadf559b3d0
+stod x = 43d4b39ad0000000
+dtos f = 21eadf559b3d0
+utof x = 5ea59cd7
+stof x = 5ea59cd7
+
+x = 52ce6b4000000063
+y = 100000003
+-x = ad3194bfffffff9d
+x + y = 52ce6b4100000066
+x - y = 52ce6b3f00000060
+x * y = f86b422300000129
+x /u y = 52ce6b3f
+x %u y = 794bea6
+x /s y = 52ce6b3f
+x %s y = 794bea6
+x /u y2 = 52ce6b4000000063
+x %u y2 = 0
+x /s y3 = 52ce6b4000000063
+x %s y3 = 0
+x /u 3 = 1b9a23c000000021
+x %u 3 = 0
+x /s 3 = 1b9a23c000000021
+x %s 3 = 0
+x /u 5 = 108faf0ccccccce0
+x %u 5 = 3
+x /s 5 = 108faf0ccccccce0
+x %s 5 = 3
+x /u 11 = 7872105d1745d20
+x %u 11 = 3
+x /s 11 = 7872105d1745d20
+x %s 11 = 3
+~x = ad3194bfffffff9c
+x & y = 3
+x | y = 52ce6b4100000063
+x ^ y = 52ce6b4100000060
+x << i = 96735a0000000318
+x >>u i = a59cd680000000c
+x >>s i = a59cd680000000c
+x cmpu y = gt
+x cmps y = gt
+utod x = 43d4b39ad0000000
+dtou f = 21eadf559b3d0
+stod x = 43d4b39ad0000000
+dtos f = 21eadf559b3d0
+utof x = 5ea59cd7
+stof x = 5ea59cd7
+
+x = 52ce6b4000000063
+y = 52ce6b4000000063
+-x = ad3194bfffffff9d
+x + y = a59cd680000000c6
+x - y = 0
+x * y = ba6f38000002649
+x /u y = 1
+x %u y = 0
+x /s y = 1
+x %s y = 0
+x /u y2 = 100000000
+x %u y2 = 63
+x /s y3 = 100000000
+x %s y3 = 63
+x /u 3 = 1b9a23c000000021
+x %u 3 = 0
+x /s 3 = 1b9a23c000000021
+x %s 3 = 0
+x /u 5 = 108faf0ccccccce0
+x %u 5 = 3
+x /s 5 = 108faf0ccccccce0
+x %s 5 = 3
+x /u 11 = 7872105d1745d20
+x %u 11 = 3
+x /s 11 = 7872105d1745d20
+x %s 11 = 3
+~x = ad3194bfffffff9c
+x & y = 52ce6b4000000063
+x | y = 52ce6b4000000063
+x ^ y = 0
+x << i = 31800000000
+x >>u i = a59cd68
+x >>s i = a59cd68
+x cmpu y = eq
+x cmps y = eq
+utod x = 43d4b39ad0000000
+dtou f = 21eadf559b3d0
+stod x = 43d4b39ad0000000
+dtos f = 21eadf559b3d0
+utof x = 5ea59cd7
+stof x = 5ea59cd7
+
+x = 52ce6b4000000063
+y = 8362aa9340fe215f
+-x = ad3194bfffffff9d
+x + y = d63115d340fe21c2
+x - y = cf6bc0acbf01df04
+x * y = 8f1503b22246e7bd
+x /u y = 0
+x %u y = 52ce6b4000000063
+x /s y = 0
+x %s y = 52ce6b4000000063
+x /u y2 = a158656f
+x %u y2 = 5640ba6
+x /s y3 = ffffffff55e35d11
+x %s y3 = 5f2245a0
+x /u 3 = 1b9a23c000000021
+x %u 3 = 0
+x /s 3 = 1b9a23c000000021
+x %s 3 = 0
+x /u 5 = 108faf0ccccccce0
+x %u 5 = 3
+x /s 5 = 108faf0ccccccce0
+x %s 5 = 3
+x /u 11 = 7872105d1745d20
+x %u 11 = 3
+x /s 11 = 7872105d1745d20
+x %s 11 = 3
+~x = ad3194bfffffff9c
+x & y = 2422a0000000043
+x | y = d3eeebd340fe217f
+x ^ y = d1acc1d340fe213c
+x << i = 3180000000
+x >>u i = a59cd680
+x >>s i = a59cd680
+x cmpu y = lt
+x cmps y = gt
+utod x = 43d4b39ad0000000
+dtou f = 21eadf559b3d0
+stod x = 43d4b39ad0000000
+dtos f = 21eadf559b3d0
+utof x = 5ea59cd7
+stof x = 5ea59cd7
+
+x = f986342416ec8002
+y = 52ce6b4000000063
+-x = 679cbdbe9137ffe
+x + y = 4c549f6416ec8065
+x - y = a6b7c8e416ec7f9f
+x * y = b9230074dd7580c6
+x /u y = 3
+x %u y = 11af26416ec7ed9
+x /s y = 0
+x %s y = f986342416ec8002
+x /u y2 = 3036abea3
+x %u y2 = 164b642
+x /s y3 = ffffffffebfad66d
+x %s y3 = ffffffffcae155c2
+x /u 3 = 532cbc0c07a42aab
+x %u 3 = 1
+x /s 3 = fdd766b6b24ed556
+x %s 3 = 0
+x /u 5 = 31e7a40737c8e666
+x %u 5 = 4
+x /s 5 = feb470d40495b334
+x %s 5 = fffffffffffffffe
+x /u 11 = 16af1c0347e6f45d
+x %u 11 = 3
+x /s 11 = ff694a8eeacfae8c
+x %s 11 = fffffffffffffffe
+~x = 679cbdbe9137ffd
+x & y = 5086200000000002
+x | y = fbce7f6416ec8063
+x ^ y = ab485f6416ec8061
+x << i = b764001000000000
+x >>u i = 1f30c684
+x >>s i = ffffffffff30c684
+x cmpu y = gt
+x cmps y = lt
+utod x = 43ef30c68482dd90
+dtou f = 6634832136daf
+stod x = c399e72f6fa44e00
+dtos f = ffffd58f774c5ce4
+utof x = 5f798634
+stof x = dccf397b
 
 x = 368083376ba4ffa9
 y = 6912b247b79a4904
@@ -7558,3 +8314,45 @@ dtos f = b3fdf698d581
 utof x = 5ddbb784
 stof x = 5ddbb784
 
+x = ca9a47c1649d27a7
+y = d56d650045e652aa
+-x = 3565b83e9b62d859
+x + y = a007acc1aa837a51
+x - y = f52ce2c11eb6d4fd
+x * y = 630e3c88ca19d2e6
+x /u y = 0
+x %u y = ca9a47c1649d27a7
+x /s y = 1
+x %s y = f52ce2c11eb6d4fd
+x /u y2 = f3042098
+x %u y2 = 6b092fa7
+x /s y3 = 141176486
+x %s y3 = ffffffffdee649a7
+x /u 3 = 4388c295cc34628d
+x %u 3 = 0
+x /s 3 = ee336d4076df0d38
+x %s 3 = ffffffffffffffff
+x /u 5 = 2885418d141f6e54
+x %u 5 = 3
+x /s 5 = f5520e59e0ec3b22
+x %s 5 = fffffffffffffffd
+x /u 11 = 126b1dcbc3541ae0
+x %u 11 = 7
+x /s 11 = fb254c57663cd510
+x %s 11 = fffffffffffffff7
+~x = 3565b83e9b62d858
+x & y = c0084500448402a2
+x | y = dfff67c165ff77af
+x ^ y = 1ff722c1217b750d
+x << i = 749e9c0000000000
+x >>u i = 32a691
+x >>s i = fffffffffff2a691
+x cmpu y = lt
+x cmps y = lt
+utod x = 43e95348f82c93a5
+dtou f = 52fc6dac31674
+stod x = c3cab2dc1f4db16c
+dtos f = fffea20e1ffc05aa
+utof x = 5f4a9a48
+stof x = de5596e1
+
diff --git a/test/regression/int64.c b/test/regression/int64.c
index d9785e95..0da9602d 100644
--- a/test/regression/int64.c
+++ b/test/regression/int64.c
@@ -103,7 +103,8 @@ u64 special_values[] = {
   0x80000000LLU,
   0x7FFFFFFFFFFFFFFFLLU,
   0x8000000000000000LLU,
-  0x100000003LLU
+  0x100000003LLU,
+  0x52ce6b4000000063LLU
 };
 
 #define NUM_SPECIAL_VALUES (sizeof(special_values) / sizeof(u64))
diff --git a/test/regression/packedstruct1.c b/test/regression/packedstruct1.c
index ac68c698..b805c92a 100644
--- a/test/regression/packedstruct1.c
+++ b/test/regression/packedstruct1.c
@@ -23,9 +23,9 @@ void test1(void)
   struct s1 s1;
   printf("sizeof(struct s1) = %d\n", szof(s1));
   printf("precomputed sizeof(struct s1) = %d\n", bszof(s1));
-  printf("offsetOf(x) = %d, offsetOf(y) = %d, offsetOf(z) = %d\n",
+  printf("offsetof(x) = %d, offsetof(y) = %d, offsetof(z) = %d\n",
          offsetOf(s1,x), offsetOf(s1,y), offsetOf(s1,z));
-  printf("precomputed offsetOf(x) = %d, offsetOf(y) = %d, offsetOf(z) = %d\n",
+  printf("precomputed offsetof(x) = %d, offsetof(y) = %d, offsetof(z) = %d\n",
          boffsetof(s1,x), boffsetof(s1,y), boffsetof(s1,z));
   s1.x = 123; s1.y = -456; s1.z = 3.14159;
   printf("s1 = {x = %d, y = %d, z = %.5f}\n\n", s1.x, s1.y, s1.z);
@@ -44,9 +44,9 @@ void test2(void)
   printf("sizeof(struct s2) = %d\n", szof(s2));
   printf("precomputed sizeof(struct s2) = %d\n", bszof(s2));
   printf("&s2 mod 16 = %d\n", ((int) &s2) & 0xF);
-  printf("offsetOf(x) = %d, offsetOf(y) = %d, offsetOf(z) = %d\n",
+  printf("offsetof(x) = %d, offsetof(y) = %d, offsetof(z) = %d\n",
          offsetOf(s2,x), offsetOf(s2,y), offsetOf(s2,z));
-  printf("precomputed offsetOf(x) = %d, offsetOf(y) = %d, offsetOf(z) = %d\n",
+  printf("precomputed offsetof(x) = %d, offsetof(y) = %d, offsetof(z) = %d\n",
          boffsetof(s2,x), boffsetof(s2,y), boffsetof(s2,z));
   s2.x = 12345; s2.y = -456; s2.z = 3.14159;
   printf("s2 = {x = %d, y = %d, z = %.5f}\n\n", s2.x, s2.y, s2.z);
@@ -73,8 +73,8 @@ void test3(void)
 
   printf("sizeof(struct s3) = %d\n", szof(s3));
   printf("precomputed sizeof(struct s3) = %d\n", bszof(s3));
-  printf("offsetOf(s) = %d\n", offsetOf(s3,s));
-  printf("precomputed offsetOf(s) = %d\n", boffsetof(s3,s));
+  printf("offsetof(s) = %d\n", offsetOf(s3,s));
+  printf("precomputed offsetof(s) = %d\n", boffsetof(s3,s));
   s3.x = 123;
   s3.y = 45678;
   s3.z = 0x80000001U;
@@ -103,9 +103,9 @@ void test4(void)
 
   printf("sizeof(struct s4) = %d\n", szof(s4));
   printf("precomputed sizeof(struct s4) = %d\n", bszof(s4));
-  printf("offsetOf(x) = %d, offsetOf(y) = %d, offsetOf(z) = %d\n",
+  printf("offsetof(x) = %d, offsetof(y) = %d, offsetof(z) = %d\n",
          offsetOf(s4,x), offsetOf(s4,y), offsetOf(s4,z));
-  printf("precomputed offsetOf(x) = %d, offsetOf(y) = %d, offsetOf(z) = %d\n",
+  printf("precomputed offsetof(x) = %d, offsetof(y) = %d, offsetof(z) = %d\n",
          boffsetof(s4,x), boffsetof(s4,y), boffsetof(s4,z));
   s4.x = 123; s4.y = -456; s4.z = 3.14159;
   printf("s4 = {x = %d, y = %d, z = %.5f}\n\n", s4.x, s4.y, s4.z);
@@ -121,9 +121,9 @@ void test5(void)
 
   printf("sizeof(struct s5) = %d\n", szof(s5));
   printf("precomputed sizeof(struct s5) = %d\n", bszof(s5));
-  printf("offsetOf(x) = %d, offsetOf(y) = %d, offsetOf(z) = %d\n",
+  printf("offsetof(x) = %d, offsetof(y) = %d, offsetof(z) = %d\n",
          offsetOf(s5,x), offsetOf(s5,y), offsetOf(s5,z));
-  printf("precomputed offsetOf(x) = %d, offsetOf(y) = %d, offsetOf(z) = %d\n",
+  printf("precomputed offsetof(x) = %d, offsetof(y) = %d, offsetof(z) = %d\n",
          boffsetof(s5,x), boffsetof(s5,y), boffsetof(s5,z));
   s5.x = 123; s5.y = -456; s5.z = 3.14159;
   printf("s5 = {x = %d, y = %d, z = %.5f}\n\n", s5.x, s5.y, s5.z);
@@ -139,9 +139,9 @@ void test6(void)
 
   printf("sizeof(struct s6) = %d\n", szof(s6));
   printf("precomputed sizeof(struct s6) = %d\n", bszof(s6));
-  printf("offsetOf(x) = %d, offsetOf(y) = %d, offsetOf(z) = %d\n",
+  printf("offsetof(x) = %d, offsetof(y) = %d, offsetof(z) = %d\n",
          offsetOf(s6,x), offsetOf(s6,y), offsetOf(s6,z));
-  printf("precomputed offsetOf(x) = %d, offsetOf(y) = %d, offsetOf(z) = %d\n",
+  printf("precomputed offsetof(x) = %d, offsetof(y) = %d, offsetof(z) = %d\n",
          boffsetof(s6,x), boffsetof(s6,y), boffsetof(s6,z));
   s62.x = 123; s62.y = -456; s62.z = 3.14159;
   printf("s62 = {x = %d, y = %d, z = %.5f}\n\n", s62.x, s62.y, s62.z);
diff --git a/x86/Asmexpand.ml b/x86/Asmexpand.ml
index c82d406e..b8353046 100644
--- a/x86/Asmexpand.ml
+++ b/x86/Asmexpand.ml
@@ -251,7 +251,7 @@ let expand_builtin_va_start_32 r =
     invalid_arg "Fatal error: va_start used in non-vararg function";
   let ofs =
     Int32.(add (add !PrintAsmaux.current_function_stacksize 4l)
-               (mul 4l (Z.to_int32 (Conventions1.size_arguments
+               (mul 4l (Z.to_int32 (Conventions.size_arguments
                                       (get_current_function_sig ()))))) in
   emit (Pleal (RAX, linear_addr RSP (Z.of_uint32 ofs)));
   emit (Pmovl_mr (linear_addr r _0z, RAX))
diff --git a/x86/CSE2deps.v b/x86/CSE2deps.v
new file mode 100644
index 00000000..f4d9e254
--- /dev/null
+++ b/x86/CSE2deps.v
@@ -0,0 +1,24 @@
+Require Import BoolEqual Coqlib.
+Require Import AST Integers Floats.
+Require Import Values Memory Globalenvs Events.
+Require Import Op.
+
+Definition can_swap_accesses_ofs ofsr chunkr ofsw chunkw :=
+     (0 <=? ofsw) && (ofsw <=? (Ptrofs.modulus - largest_size_chunk))
+  && (0 <=? ofsr) && (ofsr <=? (Ptrofs.modulus - largest_size_chunk))
+  && ((ofsw + size_chunk chunkw <=? ofsr) ||
+      (ofsr + size_chunk chunkr <=? ofsw)).
+
+Definition may_overlap chunk addr args chunk' addr' args' :=
+  match addr, addr', args, args' with
+  | (Aindexed ofs), (Aindexed ofs'),
+    (base :: nil), (base' :: nil) =>
+    if peq base base'
+    then negb (can_swap_accesses_ofs ofs' chunk' ofs chunk)
+    else true
+  | (Aglobal symb ofs), (Aglobal symb' ofs'), nil, nil =>
+    if peq symb symb'
+    then negb (can_swap_accesses_ofs (Ptrofs.unsigned ofs') chunk' (Ptrofs.unsigned ofs) chunk)
+    else false
+  | _, _, _, _ => true
+  end.
diff --git a/x86/CSE2depsproof.v b/x86/CSE2depsproof.v
new file mode 100644
index 00000000..1e913254
--- /dev/null
+++ b/x86/CSE2depsproof.v
@@ -0,0 +1,253 @@
+Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
+Require Import AST Linking.
+Require Import Memory Registers Op RTL Maps.
+
+Require Import Globalenvs Values.
+Require Import Linking Values Memory Globalenvs Events Smallstep.
+Require Import Registers Op RTL.
+Require Import CSE2 CSE2deps.
+Require Import Lia.
+
+Section SOUNDNESS.
+  Variable F V : Type.
+  Variable genv: Genv.t F V.
+  Variable sp : val.
+
+Section MEMORY_WRITE.
+  Variable m m2 : mem.
+  Variable chunkw chunkr : memory_chunk.
+  Variable base : val.
+  
+  Variable addrw addrr valw : val.
+  Hypothesis STORE : Mem.storev chunkw m addrw valw = Some m2.
+
+  Section INDEXED_AWAY.
+  Variable ofsw ofsr : Z.
+  Hypothesis ADDRW : eval_addressing genv sp
+                       (Aindexed ofsw) (base :: nil) = Some addrw. 
+  Hypothesis ADDRR : eval_addressing genv sp
+                       (Aindexed ofsr) (base :: nil) = Some addrr.
+
+  Lemma load_store_away1 :
+    forall RANGEW : 0 <= ofsw <= Ptrofs.modulus - largest_size_chunk,
+    forall RANGER : 0 <= ofsr <= Ptrofs.modulus - largest_size_chunk,
+    forall SWAPPABLE :    ofsw + size_chunk chunkw <= ofsr
+                       \/ ofsr + size_chunk chunkr <= ofsw,
+    Mem.loadv chunkr m2 addrr =  Mem.loadv chunkr m addrr.
+  Proof.
+    intros.
+    
+    pose proof (max_size_chunk chunkr) as size_chunkr_bounded.
+    pose proof (max_size_chunk chunkw) as size_chunkw_bounded.
+    try change (Ptrofs.modulus - largest_size_chunk) with 4294967288 in *.
+    try change (Ptrofs.modulus - largest_size_chunk) with 18446744073709551608 in *.
+    destruct addrr ; simpl in * ; trivial.
+    unfold eval_addressing, eval_addressing32, eval_addressing64 in *.
+    destruct Archi.ptr64 eqn:PTR64; destruct base; simpl in *; try discriminate.
+    rewrite PTR64 in *.
+    
+    inv ADDRR.
+    inv ADDRW.
+    eapply Mem.load_store_other with (chunk := chunkw) (v := valw) (b := b).
+    exact STORE.
+    right.
+ 
+    all: try (destruct (Ptrofs.unsigned_add_either i0
+                (Ptrofs.of_int (Int.repr ofsr))) as [OFSR | OFSR];
+              rewrite OFSR).
+    all: try (destruct (Ptrofs.unsigned_add_either i0
+                (Ptrofs.of_int64 (Int64.repr ofsr))) as [OFSR | OFSR];
+              rewrite OFSR).
+    all: try (destruct (Ptrofs.unsigned_add_either i0
+                (Ptrofs.of_int (Int.repr ofsw))) as [OFSW | OFSW];
+              rewrite OFSW).
+    all: try (destruct (Ptrofs.unsigned_add_either i0
+                (Ptrofs.of_int64 (Int64.repr ofsw))) as [OFSW | OFSW];
+              rewrite OFSW).
+    
+    all: unfold Ptrofs.of_int64.
+    all: unfold Ptrofs.of_int.
+   
+
+    all: repeat rewrite Int.unsigned_repr by (change Int.max_unsigned with 4294967295; lia).
+    all: repeat rewrite Ptrofs.unsigned_repr by (change Ptrofs.max_unsigned with 4294967295; lia).
+    all: repeat rewrite Int64.unsigned_repr by (change Int64.max_unsigned with 18446744073709551615; lia).
+    all: repeat rewrite Ptrofs.unsigned_repr by (change Ptrofs.max_unsigned with 18446744073709551615; lia).
+
+    all: try change Ptrofs.modulus with 4294967296.
+    all: try change Ptrofs.modulus with 18446744073709551616.
+   
+    all: intuition lia.
+  Qed.
+  
+  Theorem load_store_away :
+    can_swap_accesses_ofs ofsr chunkr ofsw chunkw = true ->
+    Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+  Proof.
+    intro SWAP.
+    unfold can_swap_accesses_ofs in SWAP.
+    repeat rewrite andb_true_iff in SWAP.
+    repeat rewrite orb_true_iff in SWAP.
+    repeat rewrite Z.leb_le in SWAP.
+    apply load_store_away1.
+    all: tauto.
+  Qed.
+  End INDEXED_AWAY.
+
+  Section DIFFERENT_GLOBALS.
+    Variable ofsw ofsr : ptrofs.
+    Hypothesis symw symr : ident.
+    Hypothesis ADDRW : eval_addressing genv sp
+                                       (Aglobal symw ofsw) nil = Some addrw. 
+    Hypothesis ADDRR : eval_addressing genv sp
+                                       (Aglobal symr ofsr) nil = Some addrr.
+
+    Lemma ptr64_cases:
+      forall {T : Type},
+      forall b : bool,
+      forall x y : T,
+        (if b then (if b then x else y) else (if b then y else x)) = x.
+    Proof.
+      destruct b; reflexivity.
+    Qed.
+
+    (* not needed
+    Lemma bool_cases_same:
+      forall {T : Type},
+      forall b : bool,
+      forall x : T,
+        (if b then x else x) = x.
+    Proof.
+      destruct b; reflexivity.
+    Qed.
+     *)
+
+    Lemma load_store_diff_globals :
+      symw <> symr ->
+      Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+    Proof.
+      intros.
+      unfold eval_addressing in *.
+      simpl in *.
+      rewrite ptr64_cases in ADDRR.
+      rewrite ptr64_cases in ADDRW.
+      unfold Genv.symbol_address in *.
+      unfold Genv.find_symbol in *.
+      destruct ((Genv.genv_symb genv) ! symw) as [bw |] eqn:SYMW; inv ADDRW.
+      2: simpl in STORE; discriminate.
+      destruct ((Genv.genv_symb genv) ! symr) as [br |] eqn:SYMR; inv ADDRR.
+      2: reflexivity.
+      assert (br <> bw).
+      {
+        intro EQ.
+        subst br.
+        assert (symr = symw).
+        {
+          eapply Genv.genv_vars_inj; eauto.
+        }
+        congruence.
+      }
+      eapply Mem.load_store_other with (chunk := chunkw) (v := valw) (b := bw).
+      - exact STORE.
+      - left. assumption.
+    Qed.
+  End DIFFERENT_GLOBALS.
+
+  Section SAME_GLOBALS.
+    Variable ofsw ofsr : ptrofs.
+    Hypothesis sym : ident.
+    Hypothesis ADDRW : eval_addressing genv sp
+                                       (Aglobal sym ofsw) nil = Some addrw. 
+    Hypothesis ADDRR : eval_addressing genv sp
+                                       (Aglobal sym ofsr) nil = Some addrr.
+    
+  Lemma load_store_glob_away1 :
+    forall RANGEW : 0 <= (Ptrofs.unsigned ofsw) <= Ptrofs.modulus - largest_size_chunk,
+    forall RANGER : 0 <= (Ptrofs.unsigned ofsr) <= Ptrofs.modulus - largest_size_chunk,
+    forall SWAPPABLE : (Ptrofs.unsigned ofsw) + size_chunk chunkw <= (Ptrofs.unsigned ofsr)
+                    \/ (Ptrofs.unsigned ofsr) + size_chunk chunkr <= (Ptrofs.unsigned ofsw),
+    Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+  Proof.
+    intros.
+    
+    pose proof (max_size_chunk chunkr) as size_chunkr_bounded.
+    pose proof (max_size_chunk chunkw) as size_chunkw_bounded.
+    unfold largest_size_chunk in size_chunkr_bounded, size_chunkw_bounded.
+    try change (Ptrofs.modulus - largest_size_chunk) with 4294967288 in *.
+    try change (Ptrofs.modulus - largest_size_chunk) with 18446744073709551608 in *.
+    unfold eval_addressing, eval_addressing32, eval_addressing64 in *.
+
+    rewrite ptr64_cases in ADDRR.
+    rewrite ptr64_cases in ADDRW.
+    unfold Genv.symbol_address in *.
+    inv ADDRR.
+    inv ADDRW.
+    destruct (Genv.find_symbol genv sym).
+    2: discriminate.
+    
+    eapply Mem.load_store_other with (chunk := chunkw) (v := valw) (b := b).
+    exact STORE.
+    right.
+    tauto.
+  Qed.
+
+  Lemma load_store_glob_away :
+    (can_swap_accesses_ofs (Ptrofs.unsigned ofsr) chunkr (Ptrofs.unsigned ofsw) chunkw) = true ->
+    Mem.loadv chunkr m2 addrr = Mem.loadv chunkr m addrr.
+  Proof.
+    intro SWAP.
+    unfold can_swap_accesses_ofs in SWAP.
+    repeat rewrite andb_true_iff in SWAP.
+    repeat rewrite orb_true_iff in SWAP.
+    repeat rewrite Z.leb_le in SWAP.
+    apply load_store_glob_away1.
+    all: tauto.
+  Qed.
+  End SAME_GLOBALS.
+End MEMORY_WRITE.
+End SOUNDNESS.
+
+
+Section SOUNDNESS.
+  Variable F V : Type.
+  Variable genv: Genv.t F V.
+  Variable sp : val.
+
+Lemma may_overlap_sound:
+  forall m m' : mem,
+  forall chunk addr args chunk' addr' args' v a a' rs,
+    (eval_addressing genv sp addr (rs ## args)) = Some a ->
+    (eval_addressing genv sp addr' (rs ## args')) = Some a' ->
+    (may_overlap chunk addr args chunk' addr' args') = false ->
+    (Mem.storev chunk m a v) = Some m' ->
+    (Mem.loadv chunk' m' a') = (Mem.loadv chunk' m  a').
+Proof.
+  intros until rs.
+  intros ADDR ADDR' OVERLAP STORE.
+  destruct addr; destruct addr'; try discriminate.
+  { (* Aindexed / Aindexed *)
+  destruct args as [ | base [ | ]]. 1,3: discriminate.
+  destruct args' as [ | base' [ | ]]. 1,3: discriminate.
+  simpl in OVERLAP.
+  destruct (peq base base'). 2: discriminate.
+  subst base'.
+  destruct (can_swap_accesses_ofs z0 chunk' z chunk) eqn:SWAP.
+  2: discriminate.
+  simpl in *.
+  eapply load_store_away; eassumption.
+  }
+  { (* Aglobal / Aglobal *)
+    destruct args. 2: discriminate.
+    destruct args'. 2: discriminate.
+    simpl in *.
+    destruct (peq i i1).
+    {
+      subst i1.
+      rewrite negb_false_iff in OVERLAP.
+      eapply load_store_glob_away; eassumption.
+    }
+    eapply load_store_diff_globals; eassumption.
+  }
+Qed.
+
+End SOUNDNESS.
diff --git a/x86/Conventions1.v b/x86/Conventions1.v
index ab4c4b13..d9f5b8fa 100644
--- a/x86/Conventions1.v
+++ b/x86/Conventions1.v
@@ -221,36 +221,6 @@ Definition loc_arguments (s: signature) : list (rpair loc) :=
   then loc_arguments_64 s.(sig_args) 0 0 0
   else loc_arguments_32 s.(sig_args) 0.
 
-(** [size_arguments s] returns the number of [Outgoing] slots used
-  to call a function with signature [s]. *)
-
-Fixpoint size_arguments_32
-    (tyl: list typ) (ofs: Z) {struct tyl} : Z :=
-  match tyl with
-  | nil => ofs
-  | ty :: tys => size_arguments_32 tys (ofs + typesize ty)
-  end.
-
-Fixpoint size_arguments_64 (tyl: list typ) (ir fr ofs: Z) {struct tyl} : Z :=
-  match tyl with
-  | nil => ofs
-  | (Tint | Tlong | Tany32 | Tany64) :: tys =>
-      match list_nth_z int_param_regs ir with
-      | None => size_arguments_64 tys ir fr (ofs + 2)
-      | Some ireg => size_arguments_64 tys (ir + 1) fr ofs
-      end
-  | (Tfloat | Tsingle) :: tys =>
-      match list_nth_z float_param_regs fr with
-      | None => size_arguments_64 tys ir fr (ofs + 2)
-      | Some freg => size_arguments_64 tys ir (fr + 1) ofs
-      end
-  end.
-
-Definition size_arguments (s: signature) : Z :=
-  if Archi.ptr64
-  then size_arguments_64 s.(sig_args) 0 0 0
-  else size_arguments_32 s.(sig_args) 0.
-
 (** Argument locations are either caller-save registers or [Outgoing]
   stack slots at nonnegative offsets. *)
 
@@ -352,121 +322,6 @@ Qed.
 
 Hint Resolve loc_arguments_acceptable: locs.
 
-(** The offsets of [Outgoing] arguments are below [size_arguments s]. *)
-
-Remark size_arguments_32_above:
-  forall tyl ofs0, ofs0 <= size_arguments_32 tyl ofs0.
-Proof.
-  induction tyl; simpl; intros.
-  omega.
-  apply Z.le_trans with (ofs0 + typesize a); auto.
-  generalize (typesize_pos a); omega.
-Qed.
-
-Remark size_arguments_64_above:
-  forall tyl ir fr ofs0,
-  ofs0 <= size_arguments_64 tyl ir fr ofs0.
-Proof.
-  induction tyl; simpl; intros.
-  omega.
-  assert (A: ofs0 <=
-    match list_nth_z int_param_regs ir with
-    | Some _ => size_arguments_64 tyl (ir + 1) fr ofs0
-    | None => size_arguments_64 tyl ir fr (ofs0 + 2)
-    end).
-  { destruct (list_nth_z int_param_regs ir); eauto.
-    apply Z.le_trans with (ofs0 + 2); auto. omega. }
-  assert (B: ofs0 <=
-    match list_nth_z float_param_regs fr with
-    | Some _ => size_arguments_64 tyl ir (fr + 1) ofs0
-    | None => size_arguments_64 tyl ir fr (ofs0 + 2)
-    end).
-  { destruct (list_nth_z float_param_regs fr); eauto.
-    apply Z.le_trans with (ofs0 + 2); auto. omega. }
-  destruct a; auto.
-Qed.
-
-Lemma size_arguments_above:
-  forall s, size_arguments s >= 0.
-Proof.
-  intros; unfold size_arguments. apply Z.le_ge.
-  destruct Archi.ptr64; [apply size_arguments_64_above|apply size_arguments_32_above].
-Qed.
-
-Lemma loc_arguments_32_bounded:
-  forall ofs ty tyl ofs0,
-  In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments_32 tyl ofs0)) ->
-  ofs + typesize ty <= size_arguments_32 tyl ofs0.
-Proof.
-  induction tyl as [ | t l]; simpl; intros x IN.
-- contradiction.
-- rewrite in_app_iff in IN; destruct IN as [IN|IN].
-+ apply Z.le_trans with (x + typesize t); [|apply size_arguments_32_above].
-  Ltac decomp :=
-  match goal with
-  | [ H: _ \/ _ |- _ ] => destruct H; decomp
-  | [ H: S _ _ _ = S _ _ _ |- _ ] => inv H
-  | [ H: False |- _ ] => contradiction
-  end.
-  destruct t; simpl in IN; decomp; simpl; omega.
-+ apply IHl; auto.
-Qed.
-
-Lemma loc_arguments_64_bounded:
-  forall ofs ty tyl ir fr ofs0,
-  In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments_64 tyl ir fr ofs0)) ->
-  ofs + typesize ty <= size_arguments_64 tyl ir fr ofs0.
-Proof.
-  induction tyl; simpl; intros.
-  contradiction.
-  assert (T: forall ty0, typesize ty0 <= 2).
-  { destruct ty0; simpl; omega. }
-  assert (A: forall ty0,
-             In (S Outgoing ofs ty) (regs_of_rpairs
-              match list_nth_z int_param_regs ir with
-              | Some ireg =>
-                  One (R ireg) :: loc_arguments_64 tyl (ir + 1) fr ofs0
-              | None => One (S Outgoing ofs0 ty0) :: loc_arguments_64 tyl ir fr (ofs0 + 2)
-              end) ->
-             ofs + typesize ty <=
-             match list_nth_z int_param_regs ir with
-             | Some _ => size_arguments_64 tyl (ir + 1) fr ofs0
-             | None => size_arguments_64 tyl ir fr (ofs0 + 2)
-             end).
-  { intros. destruct (list_nth_z int_param_regs ir); simpl in H0; destruct H0.
-  - discriminate.
-  - eapply IHtyl; eauto.
-  - inv H0. apply Z.le_trans with (ofs + 2). specialize (T ty). omega. apply size_arguments_64_above.
-  - eapply IHtyl; eauto. }
-  assert (B: forall ty0,
-             In (S Outgoing ofs ty) (regs_of_rpairs
-              match list_nth_z float_param_regs fr with
-              | Some ireg =>
-                  One (R ireg) :: loc_arguments_64 tyl ir (fr + 1) ofs0
-              | None => One (S Outgoing ofs0 ty0) :: loc_arguments_64 tyl ir fr (ofs0 + 2)
-              end) ->
-             ofs + typesize ty <=
-             match list_nth_z float_param_regs fr with
-             | Some _ => size_arguments_64 tyl ir (fr + 1) ofs0
-             | None => size_arguments_64 tyl ir fr (ofs0 + 2)
-             end).
-  { intros. destruct (list_nth_z float_param_regs fr); simpl in H0; destruct H0.
-  - discriminate.
-  - eapply IHtyl; eauto.
-  - inv H0. apply Z.le_trans with (ofs + 2). specialize (T ty). omega. apply size_arguments_64_above.
-  - eapply IHtyl; eauto. }
-  destruct a; eauto.
-Qed.
-
-Lemma loc_arguments_bounded:
-  forall (s: signature) (ofs: Z) (ty: typ),
-  In (S Outgoing ofs ty) (regs_of_rpairs (loc_arguments s)) ->
-  ofs + typesize ty <= size_arguments s.
-Proof.
-  unfold loc_arguments, size_arguments; intros.
-  destruct Archi.ptr64; eauto using loc_arguments_32_bounded, loc_arguments_64_bounded.
-Qed.
-
 Lemma loc_arguments_main:
   loc_arguments signature_main = nil.
 Proof.
diff --git a/x86/DuplicateOpcodeHeuristic.ml b/x86/DuplicateOpcodeHeuristic.ml
index 85505245..2ec314c1 100644
--- a/x86/DuplicateOpcodeHeuristic.ml
+++ b/x86/DuplicateOpcodeHeuristic.ml
@@ -1,3 +1,27 @@
-exception HeuristicSucceeded
-        
-let opcode_heuristic code cond ifso ifnot preferred = ()
+(* open Camlcoq *)
+open Op
+open Integers
+
+let opcode_heuristic code cond ifso ifnot is_loop_header =
+  match cond with
+  | Ccompimm (c, n) | Ccompuimm (c, n) -> if n == Integers.Int.zero then (match c with
+      | Clt | Cle -> Some false
+      | Cgt | Cge -> Some true
+      | _ -> None
+      ) else None
+  | Ccomplimm (c, n) | Ccompluimm (c, n) -> if n == Integers.Int64.zero then (match c with
+      | Clt | Cle -> Some false
+      | Cgt | Cge -> Some true
+      | _ -> None
+      ) else None
+  | Ccompf c | Ccompfs c -> (match c with
+      | Ceq -> Some false
+      | Cne -> Some true
+      | _ -> None
+      )
+  | Cnotcompf c | Cnotcompfs c -> (match c with
+      | Ceq -> Some true
+      | Cne -> Some false
+      | _ -> None
+      )
+  | _ -> None
author	David Monniaux <david.monniaux@univ-grenoble-alpes.fr>	2020-04-08 21:45:42 +0200
committer	David Monniaux <david.monniaux@univ-grenoble-alpes.fr>	2020-04-08 21:45:42 +0200
commit	63915fbebe707cc1de7c0ed5a24148cac45a742c (patch)
tree	da503cba224f14281a2ee841930b8843459cb42b
parent	f78d61faf3db94ac1704ce0d11291211b5307629 (diff)
parent	e326ed9f28a2ed6869f0cb356ef9a8e189cb0a47 (diff)
download	compcert-kvx-63915fbebe707cc1de7c0ed5a24148cac45a742c.tar.gz compcert-kvx-63915fbebe707cc1de7c0ed5a24148cac45a742c.zip