aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Monniaux <david.monniaux@univ-grenoble-alpes.fr>2020-10-18 22:49:10 +0200
committerDavid Monniaux <david.monniaux@univ-grenoble-alpes.fr>2020-10-18 22:49:10 +0200
commit996a2e5bbc4826d95144b62f5218b6e3e1e7d881 (patch)
tree55a5136c68e7d0f7a304913f28d0a761d9facd9a
parent8c3a2bdb56eba8d8bc5e359b01a320916eac85f0 (diff)
parenta2f31f2b886ccb9656a019db1780aabc1789368a (diff)
downloadcompcert-kvx-996a2e5bbc4826d95144b62f5218b6e3e1e7d881.tar.gz
compcert-kvx-996a2e5bbc4826d95144b62f5218b6e3e1e7d881.zip
Merge remote-tracking branch 'origin/kvx-work' into kvx-test-prepass
-rw-r--r--.gitlab-ci.yml2
-rw-r--r--Makefile1
-rw-r--r--backend/CSE3analysis.v38
-rw-r--r--backend/CSE3analysisaux.ml4
-rw-r--r--backend/CSE3analysisproof.v29
-rw-r--r--backend/CSE3proof.v6
-rw-r--r--backend/Duplicateaux.ml450
-rw-r--r--backend/KillUselessMoves.v40
-rw-r--r--backend/KillUselessMovesproof.v361
-rw-r--r--backend/LICMaux.ml60
-rw-r--r--doc/index-kvx.html4
-rw-r--r--driver/Clflags.ml14
-rw-r--r--driver/Compopts.v3
-rw-r--r--driver/Driver.ml22
-rw-r--r--extraction/extraction.v2
-rw-r--r--kvx/SelectOp.vp14
-rw-r--r--kvx/SelectOpproof.v9
-rw-r--r--lib/Coqlib.v6
-rw-r--r--runtime/include/ccomp_kvx_fixes.h15
-rw-r--r--test/kvx/sort/Makefile2
-rw-r--r--test/monniaux/loop_nest/syrk.c28
-rw-r--r--tools/compiler_expand.ml5
22 files changed, 939 insertions, 176 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3b1a86fd..10008017 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -237,7 +237,7 @@ build_kvx:
- source /opt/kalray/accesscore/kalray.sh && ./config_kvx.sh
- source /opt/kalray/accesscore/kalray.sh && make -j "$NJOBS"
- source /opt/kalray/accesscore/kalray.sh && make -C test CCOMPOPTS=-static SIMU='kvx-cluster -- ' EXECUTE='kvx-cluster -- ' all test
- - source /opt/kalray/accesscore/kalray.sh && make -C test/monniaux/yarpgen TARGET_CC='kvx-cos-gcc' EXECUTE='kvx-cluster -- ' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
+ - source /opt/kalray/accesscore/kalray.sh && ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='kvx-cos-gcc' EXECUTE='kvx-cluster -- ' CCOMPOPTS='-static' TARGET_CFLAGS='-static'
rules:
- if: '$CI_COMMIT_BRANCH == "kvx-work"'
when: always
diff --git a/Makefile b/Makefile
index 73b3a446..6560a64b 100644
--- a/Makefile
+++ b/Makefile
@@ -95,6 +95,7 @@ BACKEND=\
CSE2deps.v CSE2depsproof.v \
CSE2.v CSE2proof.v \
CSE3analysis.v CSE3analysisproof.v CSE3.v CSE3proof.v \
+ KillUselessMoves.v KillUselessMovesproof.v \
LICM.v LICMproof.v \
NeedDomain.v NeedOp.v Deadcode.v Deadcodeproof.v \
Unusedglob.v Unusedglobproof.v \
diff --git a/backend/CSE3analysis.v b/backend/CSE3analysis.v
index ade79c28..7316c9a9 100644
--- a/backend/CSE3analysis.v
+++ b/backend/CSE3analysis.v
@@ -298,13 +298,22 @@ Section OPERATIONS.
Definition move (src dst : reg) (rel : RELATION.t) : RELATION.t :=
- match eq_find {| eq_lhs := dst;
+ if peq src dst
+ then rel
+ else
+ match eq_find {| eq_lhs := dst;
eq_op := SOp Omove;
eq_args:= src::nil |} with
- | Some eq_id => PSet.add eq_id (kill_reg dst rel)
- | None => kill_reg dst rel
- end.
+ | Some eq_id => PSet.add eq_id (kill_reg dst rel)
+ | None => kill_reg dst rel
+ end.
+ Definition is_trivial_sym_op sop :=
+ match sop with
+ | SOp op => is_trivial_op op
+ | SLoad _ _ => false
+ end.
+
Definition oper (dst : reg) (op: sym_op) (args : list reg)
(rel : RELATION.t) : RELATION.t :=
if is_smove op
@@ -315,15 +324,18 @@ Section OPERATIONS.
| _ => kill_reg dst rel
end
else
- let args' := forward_move_l rel args in
- match rhs_find op args' rel with
- | Some r =>
- if Compopts.optim_CSE3_glb tt
- then RELATION.glb (move r dst rel)
- (oper1 dst op args' rel)
- else oper1 dst op args' rel
- | None => oper1 dst op args' rel
- end.
+ if is_trivial_sym_op op
+ then kill_reg dst rel
+ else
+ let args' := forward_move_l rel args in
+ match rhs_find op args' rel with
+ | Some r =>
+ if Compopts.optim_CSE3_glb tt
+ then RELATION.glb (move r dst rel)
+ (oper1 dst op args' rel)
+ else oper1 dst op args' rel
+ | None => oper1 dst op args' rel
+ end.
Definition clever_kill_store
(chunk : memory_chunk) (addr: addressing) (args : list reg)
diff --git a/backend/CSE3analysisaux.ml b/backend/CSE3analysisaux.ml
index 3e4a6b9e..3990b765 100644
--- a/backend/CSE3analysisaux.ml
+++ b/backend/CSE3analysisaux.ml
@@ -67,6 +67,9 @@ let pp_option pp oc = function
| None -> output_string oc "none"
| Some x -> pp oc x;;
+let is_trivial eq =
+ (eq.eq_op = SOp Op.Omove) && (eq.eq_args = [eq.eq_lhs]);;
+
let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
let cur_eq_id = ref 0
and cur_catalog = ref PTree.empty
@@ -76,6 +79,7 @@ let preanalysis (tenv : typing_env) (f : RTL.coq_function) =
and cur_kill_mem = ref PSet.empty
and cur_moves = ref (PMap.init PSet.empty) in
let eq_find_oracle node eq =
+ assert (not (is_trivial eq));
let o = Hashtbl.find_opt eq_table (flatten_eq eq) in
(if !Clflags.option_debug_compcert > 1
then Printf.printf "@%d: eq_find %a -> %a\n" (P.to_int node)
diff --git a/backend/CSE3analysisproof.v b/backend/CSE3analysisproof.v
index f4e3672d..66b199cc 100644
--- a/backend/CSE3analysisproof.v
+++ b/backend/CSE3analysisproof.v
@@ -745,6 +745,25 @@ Section SOUNDNESS.
Hint Resolve oper1_sound : cse3.
+ Lemma rel_idem_replace:
+ forall rel rs r m,
+ sem_rel rel rs m ->
+ sem_rel rel rs # r <- (rs # r) m.
+ Proof.
+ intros until m.
+ intro REL.
+ unfold sem_rel, sem_eq, sem_rhs in *.
+ intros.
+ specialize REL with (i:=i) (eq0:=eq).
+ rewrite Regmap.gsident.
+ replace ((rs # r <- (rs # r)) ## (eq_args eq)) with
+ (rs ## (eq_args eq)).
+ { apply REL; auto. }
+ apply list_map_exten.
+ intros.
+ apply Regmap.gsident.
+ Qed.
+
Lemma move_sound :
forall no : node,
forall rel : RELATION.t,
@@ -756,6 +775,10 @@ Section SOUNDNESS.
unfold move.
intros until m.
intro REL.
+ destruct (peq src dst).
+ { subst dst.
+ apply rel_idem_replace; auto.
+ }
pose proof (eq_find_sound no {| eq_lhs := dst; eq_op := SOp Omove; eq_args := src :: nil |}) as EQ_FIND_SOUND.
destruct eq_find.
- intros i eq CONTAINS.
@@ -798,7 +821,11 @@ Section SOUNDNESS.
subst.
rewrite <- (forward_move_sound rel rs m r) by auto.
apply move_sound; auto.
- - destruct rhs_find as [src |] eqn:RHS_FIND.
+ - destruct (is_trivial_sym_op sop).
+ {
+ apply kill_reg_sound; auto.
+ }
+ destruct rhs_find as [src |] eqn:RHS_FIND.
+ destruct (Compopts.optim_CSE3_glb tt).
* apply sem_rel_glb; split.
** pose proof (rhs_find_sound no sop (forward_move_l (ctx:=ctx) rel args) rel src rs m REL RHS_FIND) as SOUND.
diff --git a/backend/CSE3proof.v b/backend/CSE3proof.v
index 6e489066..3fbc9912 100644
--- a/backend/CSE3proof.v
+++ b/backend/CSE3proof.v
@@ -443,12 +443,6 @@ Ltac IND_STEP :=
idtac mpc mpc' fn minstr *)
end.
-Lemma if_same : forall {T : Type} (b : bool) (x : T),
- (if b then x else x) = x.
-Proof.
- destruct b; trivial.
-Qed.
-
Lemma step_simulation:
forall S1 t S2, RTL.step ge S1 t S2 ->
forall S1', match_states S1 S1' ->
diff --git a/backend/Duplicateaux.ml b/backend/Duplicateaux.ml
index 1297ec90..eb9f42e0 100644
--- a/backend/Duplicateaux.ml
+++ b/backend/Duplicateaux.ml
@@ -15,6 +15,7 @@
(* Oracle for Duplicate pass.
* - Add static prediction information to Icond nodes
* - Performs tail duplication on interesting traces to form superblocks
+ * - Unrolls a single iteration of innermost loops
* - (TODO: perform partial loop unrolling inside innermost loops)
*)
@@ -22,23 +23,13 @@ open RTL
open Maps
open Camlcoq
-let debug_flag = ref false
-
-let debug fmt =
- if !debug_flag then Printf.eprintf fmt
- else Printf.ifprintf stderr fmt
-
-let get_some = function
-| None -> failwith "Did not get some"
-| Some thing -> thing
-
-let rtl_successors = function
-| Itailcall _ | Ireturn _ -> []
-| Icall(_,_,_,_,n) | Ibuiltin(_,_,_,n) | Inop n | Iop (_,_,_,n)
-| Iload (_,_,_,_,_,n) | Istore (_,_,_,_,n) -> [n]
-| Icond (_,_,n1,n2,_) -> [n1; n2]
-| Ijumptable (_,ln) -> ln
+let debug_flag = LICMaux.debug_flag
+let debug = LICMaux.debug
+let get_loop_headers = LICMaux.get_loop_headers
+let get_some = LICMaux.get_some
+let rtl_successors = LICMaux.rtl_successors
+(* Get list of nodes following a BFS of the code *)
let bfs code entrypoint = begin
debug "bfs\n";
let visited = ref (PTree.map (fun n i -> false) code)
@@ -67,6 +58,7 @@ let optbool o = match o with Some _ -> true | None -> false
let ptree_get_some n ptree = get_some @@ PTree.get n ptree
+(* Returns a PTree: node -> list of the predecessors of that node *)
let get_predecessors_rtl code = begin
debug "get_predecessors_rtl\n";
let preds = ref (PTree.map (fun n i -> []) code) in
@@ -89,15 +81,13 @@ end
module PSet = Set.Make(PInt)
-let print_intlist l =
- let rec f = function
+let print_intlist oc l =
+ let rec f oc = function
| [] -> ()
- | n::ln -> (Printf.printf "%d " (P.to_int n); f ln)
+ | n::ln -> (Printf.fprintf oc "%d %a" (P.to_int n) f ln)
in begin
if !debug_flag then begin
- Printf.printf "[";
- f l;
- Printf.printf "]"
+ Printf.fprintf oc "[%a]" f l
end
end
@@ -113,43 +103,6 @@ let print_intset s =
end
end
-type vstate = Unvisited | Processed | Visited
-
-(** Getting loop branches with a DFS visit :
- * Each node is either Unvisited, Visited, or Processed
- * pre-order: node becomes Processed
- * post-order: node becomes Visited
- *
- * If we come accross an edge to a Processed node, it's a loop!
- *)
-let get_loop_headers code entrypoint = begin
- debug "get_loop_headers\n";
- let visited = ref (PTree.map (fun n i -> Unvisited) code)
- and is_loop_header = ref (PTree.map (fun n i -> false) code)
- in let rec dfs_visit code = function
- | [] -> ()
- | node :: ln ->
- match (get_some @@ PTree.get node !visited) with
- | Visited -> ()
- | Processed -> begin
- debug "Node %d is a loop header\n" (P.to_int node);
- is_loop_header := PTree.set node true !is_loop_header;
- visited := PTree.set node Visited !visited
- end
- | Unvisited -> begin
- visited := PTree.set node Processed !visited;
- match PTree.get node code with
- | None -> failwith "No such node"
- | Some i -> let next_visits = rtl_successors i in dfs_visit code next_visits;
- visited := PTree.set node Visited !visited;
- dfs_visit code ln
- end
- in begin
- dfs_visit code [entrypoint];
- !is_loop_header
- end
-end
-
let ptree_printbool pt =
let elements = PTree.elements pt
in begin
@@ -174,6 +127,10 @@ let rec look_ahead code node is_loop_header predicate =
)
| _ -> false
+(**
+ * Heuristics mostly based on the paper Branch Prediction for Free
+ *)
+
let do_call_heuristic code cond ifso ifnot is_loop_header =
begin
debug "\tCall heuristic..\n";
@@ -302,7 +259,7 @@ let get_loop_info is_loop_header bfs_order code =
!loop_info
end
-(* Remark - compared to the original paper, we don't use the store heuristic *)
+(* Remark - compared to the original Branch Prediction for Free paper, we don't use the store heuristic *)
let get_directions code entrypoint = begin
debug "get_directions\n";
let bfs_order = bfs code entrypoint in
@@ -435,18 +392,29 @@ let best_predecessor_of node predecessors code order is_visited =
) order)
with Not_found -> None
-let print_trace t = print_intlist t
+let print_trace = print_intlist
-let print_traces traces =
- let rec f = function
+let print_traces oc traces =
+ let rec f oc = function
| [] -> ()
- | t::lt -> Printf.printf "\n\t"; print_trace t; Printf.printf ",\n"; f lt
+ | t::lt -> Printf.fprintf oc "\n\t%a,\n%a" print_trace t f lt
in begin
- if !debug_flag then begin
- Printf.printf "Traces: {";
- f traces;
- Printf.printf "}\n";
- end
+ if !debug_flag then
+ Printf.fprintf oc "Traces: {%a}\n" f traces
+ end
+
+(* Adapted from backend/PrintRTL.ml: print_function *)
+let print_code code = let open PrintRTL in let open Printf in
+ if (!debug_flag) then begin
+ fprintf stdout "{\n";
+ let instrs =
+ List.sort
+ (fun (pc1, _) (pc2, _) -> compare pc2 pc1)
+ (List.rev_map
+ (fun (pc, i) -> (P.to_int pc, i))
+ (PTree.elements code)) in
+ List.iter (print_instruction stdout) instrs;
+ fprintf stdout "}"
end
(* Dumb (but linear) trace selection *)
@@ -521,7 +489,7 @@ let select_traces_chang code entrypoint = begin
end
done;
(* debug "DFS: \t"; print_intlist order; debug "\n"; *)
- debug "Traces: "; print_traces !traces;
+ debug "Traces: %a" print_traces !traces;
!traces
end
end
@@ -537,26 +505,26 @@ let rec make_identity_ptree_rec = function
let make_identity_ptree code = make_identity_ptree_rec (PTree.elements code)
-(* Change the pointers of preds nodes to point to n' instead of n *)
+(* Change the pointers of nodes to point to n' instead of n *)
let rec change_pointers code n n' = function
| [] -> code
- | pred :: preds ->
- let new_pred_inst = match ptree_get_some pred code with
- | Icall(a, b, c, d, n0) -> assert (n0 == n); Icall(a, b, c, d, n')
- | Ibuiltin(a, b, c, n0) -> assert (n0 == n); Ibuiltin(a, b, c, n')
- | Ijumptable(a, ln) -> assert (optbool @@ List.find_opt (fun e -> e == n) ln);
- Ijumptable(a, List.map (fun e -> if (e == n) then n' else e) ln)
- | Icond(a, b, n1, n2, i) -> assert (n1 == n || n2 == n);
- let n1' = if (n1 == n) then n' else n1
- in let n2' = if (n2 == n) then n' else n2
+ | node :: nodes ->
+ let new_pred_inst = match ptree_get_some node code with
+ | Icall(a, b, c, d, n0) -> assert (n0 = n); Icall(a, b, c, d, n')
+ | Ibuiltin(a, b, c, n0) -> assert (n0 = n); Ibuiltin(a, b, c, n')
+ | Ijumptable(a, ln) -> assert (optbool @@ List.find_opt (fun e -> e = n) ln);
+ Ijumptable(a, List.map (fun e -> if (e = n) then n' else e) ln)
+ | Icond(a, b, n1, n2, i) -> assert (n1 = n || n2 = n);
+ let n1' = if (n1 = n) then n' else n1
+ in let n2' = if (n2 = n) then n' else n2
in Icond(a, b, n1', n2', i)
- | Inop n0 -> assert (n0 == n); Inop n'
- | Iop (a, b, c, n0) -> assert (n0 == n); Iop (a, b, c, n')
- | Iload (a, b, c, d, e, n0) -> assert (n0 == n); Iload (a, b, c, d, e, n')
- | Istore (a, b, c, d, n0) -> assert (n0 == n); Istore (a, b, c, d, n')
+ | Inop n0 -> assert (n0 = n); Inop n'
+ | Iop (a, b, c, n0) -> assert (n0 = n); Iop (a, b, c, n')
+ | Iload (a, b, c, d, e, n0) -> assert (n0 = n); Iload (a, b, c, d, e, n')
+ | Istore (a, b, c, d, n0) -> assert (n0 = n); Istore (a, b, c, d, n')
| Itailcall _ | Ireturn _ -> failwith "That instruction cannot be a predecessor"
- in let new_code = PTree.set pred new_pred_inst code
- in change_pointers new_code n n' preds
+ in let new_code = PTree.set node new_pred_inst code
+ in change_pointers new_code n n' nodes
(* parent: parent of n to keep as parent
* preds: all the other parents of n
@@ -580,13 +548,20 @@ let is_empty = function
| [] -> true
| _ -> false
+let next_free_pc code = maxint (List.map (fun e -> let (n, _) = e in P.to_int n) (PTree.elements code)) + 1
+
+let is_a_nop code n =
+ match get_some @@ PTree.get n code with
+ | Inop _ -> true
+ | _ -> false
+
(* code: RTL code
* preds: mapping node -> predecessors
* ptree: the revmap
* trace: the trace to follow tail duplication on *)
let tail_duplicate code preds ptree trace =
(* next_int: unused integer that can be used for the next duplication *)
- let next_int = ref (maxint (List.map (fun e -> let (n, _) = e in P.to_int n) (PTree.elements code)) + 1)
+ let next_int = ref (next_free_pc code)
(* last_node and last_duplicate store resp. the last processed node of the trace, and its duplication *)
in let last_node = ref None
in let last_duplicate = ref None
@@ -608,7 +583,7 @@ let tail_duplicate code preds ptree trace =
in let (newc, newp) = duplicate code ptree !last_node n final_node_preds (P.of_int n')
in begin
next_int := !next_int + 1;
- nb_duplicated := !nb_duplicated + 1;
+ (if not @@ is_a_nop code n then nb_duplicated := !nb_duplicated + 1);
last_duplicate := Some (P.of_int n');
(newc, newp)
end
@@ -620,9 +595,8 @@ let tail_duplicate code preds ptree trace =
in let new_code, new_ptree = f code ptree true trace
in (new_code, new_ptree, !nb_duplicated)
-let superblockify_traces code preds traces =
- let max_nb_duplicated = !Clflags.option_fduplicate (* FIXME - should be architecture dependent *)
- in let ptree = make_identity_ptree code
+let superblockify_traces code preds traces ptree =
+ let max_nb_duplicated = !Clflags.option_ftailduplicate (* FIXME - should be architecture dependent *)
in let rec f code ptree = function
| [] -> (code, ptree, 0)
| trace :: traces ->
@@ -633,37 +607,271 @@ let superblockify_traces code preds traces =
in let new_code, new_ptree, _ = f code ptree traces
in (new_code, new_ptree)
-let rec invert_iconds_trace code = function
- | [] -> code
- | n :: ln ->
- let code' = match ptree_get_some n code with
- | Icond (c, lr, ifso, ifnot, info) -> (match info with
- | Some true -> begin
- (* debug "Reversing ifso/ifnot for node %d\n" (P.to_int n); *)
- PTree.set n (Icond (Op.negate_condition c, lr, ifnot, ifso, Some false)) code
- end
- | _ -> code)
- | _ -> code
- in invert_iconds_trace code' ln
+let invert_iconds code =
+ PTree.map1 (fun i -> match i with
+ | Icond (c, lr, ifso, ifnot, info) -> (match info with
+ | Some true -> begin
+ (* debug "Reversing ifso/ifnot for node %d\n" (P.to_int n); *)
+ Icond (Op.negate_condition c, lr, ifnot, ifso, Some false)
+ end
+ | _ -> i)
+ | _ -> i
+ ) code
+
+(** Partial loop unrolling
+ *
+ * The following code seeks innermost loops, and unfolds the first iteration
+ * Most of the code has been moved from LICMaux.ml to Duplicateaux.ml to solve
+ * cyclic dependencies between LICMaux and Duplicateaux
+ *)
-let rec invert_iconds code = function
- | [] -> code
- | t :: ts ->
- let code' = if !Clflags.option_finvertcond then invert_iconds_trace code t
- else code
- in invert_iconds code' ts
+type innerLoop = {
+ preds: P.t list;
+ body: HashedSet.PSet.t;
+ head: P.t; (* head of the loop *)
+ final: P.t (* the final instruction, which loops back to the head *)
+}
+
+let print_pset = LICMaux.pp_pset
+
+let print_inner_loop iloop =
+ debug "{preds: %a, body: %a}" print_intlist iloop.preds print_pset iloop.body
+
+let rec print_inner_loops = function
+| [] -> ()
+| iloop :: iloops -> begin
+ print_inner_loop iloop;
+ debug "\n";
+ print_inner_loops iloops
+ end
+
+let print_ptree printer pt =
+ let elements = PTree.elements pt in
+ begin
+ debug "[\n";
+ List.iter (fun (n, elt) ->
+ debug "\t%d: %a\n" (P.to_int n) printer elt
+ ) elements;
+ debug "]\n"
+ end
+
+let print_pint oc i = if !debug_flag then Printf.fprintf oc "%d" (P.to_int i) else ()
+
+let get_inner_loops f code is_loop_header =
+ let fake_f = { fn_sig = f.fn_sig; fn_params = f.fn_params;
+ fn_stacksize = f.fn_stacksize; fn_code = code; fn_entrypoint = f.fn_entrypoint } in
+ let (_, predmap, loopmap) = LICMaux.inner_loops fake_f in
+ begin
+ debug "PREDMAP: "; print_ptree print_intlist predmap;
+ debug "LOOPMAP: "; print_ptree print_pset loopmap;
+ List.map (fun (n, body) ->
+ let preds = List.filter (fun p -> not @@ HashedSet.PSet.contains body p)
+ @@ get_some @@ PTree.get n predmap in
+ let head = (* the instruction from body which is a loop header *)
+ let heads = HashedSet.PSet.elements @@ HashedSet.PSet.filter
+ (fun n -> ptree_get_some n is_loop_header) body in
+ begin
+ assert (List.length heads == 1);
+ List.hd heads
+ end in
+ let final = (* the predecessors from head that are in the body *)
+ let head_preds = ptree_get_some head predmap in
+ let filtered = List.filter (fun n -> HashedSet.PSet.contains body n) head_preds in
+ begin
+ debug "HEAD: %d\n" (P.to_int head);
+ debug "BODY: %a\n" print_pset body;
+ debug "HEADPREDS: %a\n" print_intlist head_preds;
+ assert (List.length filtered == 1);
+ List.hd filtered
+ end in
+ { preds = preds; body = body; head = head; final = final }
+ )
+ (* LICMaux.inner_loops also returns non-inner loops, but with a body of 1 instruction
+ * We remove those to get just the inner loops *)
+ @@ List.filter (fun (n, body) ->
+ let count = List.length @@ HashedSet.PSet.elements body in count != 1
+ ) (PTree.elements loopmap)
+ end
+
+let rec generate_fwmap ln ln' fwmap =
+ match ln with
+ | [] -> begin
+ match ln' with
+ | [] -> fwmap
+ | _ -> failwith "ln and ln' have different lengths"
+ end
+ | n :: ln -> begin
+ match ln' with
+ | n' :: ln' -> generate_fwmap ln ln' (PTree.set n n' fwmap)
+ | _ -> failwith "ln and ln' have different lengths"
+ end
+
+let generate_revmap ln ln' revmap = generate_fwmap ln' ln revmap
+
+let apply_map fw n = P.of_int @@ ptree_get_some n fw
+
+let apply_map_opt fw n =
+ match PTree.get n fw with
+ | Some n' -> P.of_int n'
+ | None -> n
+
+let change_nexts fwmap = function
+ | Icall (a, b, c, d, n) -> Icall (a, b, c, d, apply_map fwmap n)
+ | Ibuiltin (a, b, c, n) -> Ibuiltin (a, b, c, apply_map fwmap n)
+ | Ijumptable (a, ln) -> Ijumptable (a, List.map (apply_map_opt fwmap) ln)
+ | Icond (a, b, n1, n2, i) -> Icond (a, b, apply_map_opt fwmap n1, apply_map_opt fwmap n2, i)
+ | Inop n -> Inop (apply_map fwmap n)
+ | Iop (a, b, c, n) -> Iop (a, b, c, apply_map fwmap n)
+ | Iload (a, b, c, d, e, n) -> Iload (a, b, c, d, e, apply_map fwmap n)
+ | Istore (a, b, c, d, n) -> Istore (a, b, c, d, apply_map fwmap n)
+ | Itailcall (a, b, c) -> Itailcall (a, b, c)
+ | Ireturn o -> Ireturn o
+
+(** Clone a list of instructions into free pc indexes
+ *
+ * The list of instructions should be contiguous, and not include any loop.
+ * It is assumed that the first instruction of the list is the head.
+ * Also, the last instruction of the list should be the loop backedge.
+ *
+ * Returns: (code', revmap', ln', fwmap)
+ * code' is the updated code, after cloning
+ * revmap' is the updated revmap
+ * ln' is the list of the new indexes used to reference the cloned instructions
+ * fwmap is a map from ln to ln'
+ *)
+let clone code revmap ln = begin
+ assert (List.length ln > 0);
+ let head' = next_free_pc code in
+ (* +head' to ensure we never overlap with the existing code *)
+ let ln' = List.map (fun n -> n + head') @@ List.map P.to_int ln in
+ let fwmap = generate_fwmap ln ln' PTree.empty in
+ let revmap' = generate_revmap ln (List.map P.of_int ln') revmap in
+ let code' = ref code in
+ List.iter (fun n ->
+ let instr = get_some @@ PTree.get n code in
+ let instr' = change_nexts fwmap instr in
+ code' := PTree.set (apply_map fwmap n) instr' !code'
+ ) ln;
+ (!code', revmap', ln', fwmap)
+end
+
+let rec count_ignore_nops code = function
+ | [] -> 0
+ | n::ln ->
+ let inst = get_some @@ PTree.get n code in
+ match inst with
+ | Inop _ -> count_ignore_nops code ln
+ | _ -> 1 + count_ignore_nops code ln
+
+(* Unrolls a single interation of the inner loop
+ * 1) Clones the body into body'
+ * 2) Links the preds to the first instruction of body'
+ * 3) Links the last instruction of body' into the first instruction of body
+ *)
+let unroll_inner_loop_single code revmap iloop =
+ let body = HashedSet.PSet.elements (iloop.body) in
+ if count_ignore_nops code body > !Clflags.option_funrollsingle then begin
+ debug "Too many nodes in the loop body (%d > %d)" (List.length body) !Clflags.option_funrollsingle;
+ (code, revmap)
+ end else
+ let (code2, revmap2, dupbody, fwmap) = clone code revmap body in
+ let code' = ref code2 in
+ let head' = apply_map fwmap (iloop.head) in
+ let final' = apply_map fwmap (iloop.final) in
+ begin
+ debug "PREDS: %a\n" print_intlist iloop.preds;
+ debug "IHEAD: %d\n" (P.to_int iloop.head);
+ code' := change_pointers !code' (iloop.head) head' (iloop.preds);
+ code' := change_pointers !code' head' (iloop.head) [final'];
+ (!code', revmap2)
+ end
+
+let unroll_inner_loops_single f code revmap =
+ let is_loop_header = get_loop_headers code (f.fn_entrypoint) in
+ let inner_loops = get_inner_loops f code is_loop_header in
+ let code' = ref code in
+ let revmap' = ref revmap in
+ begin
+ print_inner_loops inner_loops;
+ List.iter (fun iloop ->
+ let (new_code, new_revmap) = unroll_inner_loop_single !code' !revmap' iloop in
+ code' := new_code; revmap' := new_revmap
+ ) inner_loops;
+ (!code', !revmap')
+ end
+
+(* Unrolls the body of the inner loop once - duplicating the exit condition as well
+ * 1) Clones body into body'
+ * 2) Links the last instruction of body into the first of body'
+ * 3) Links the last instruction of body' into the first of body
+ *)
+let unroll_inner_loop_body code revmap iloop =
+ let body = HashedSet.PSet.elements (iloop.body) in
+ let limit = !Clflags.option_funrollbody in
+ if count_ignore_nops code body > limit then begin
+ debug "Too many nodes in the loop body (%d > %d)" (List.length body) limit;
+ (code, revmap)
+ end else
+ let (code2, revmap2, dupbody, fwmap) = clone code revmap body in
+ let code' = ref code2 in
+ let head' = apply_map fwmap (iloop.head) in
+ let final' = apply_map fwmap (iloop.final) in
+ begin
+ code' := change_pointers !code' iloop.head head' [iloop.final];
+ code' := change_pointers !code' head' iloop.head [final'];
+ (!code', revmap2)
+ end
+
+let unroll_inner_loops_body f code revmap =
+ let is_loop_header = get_loop_headers code (f.fn_entrypoint) in
+ let inner_loops = get_inner_loops f code is_loop_header in
+ let code' = ref code in
+ let revmap' = ref revmap in
+ begin
+ print_inner_loops inner_loops;
+ List.iter (fun iloop ->
+ let (new_code, new_revmap) = unroll_inner_loop_body !code' !revmap' iloop in
+ code' := new_code; revmap' := new_revmap
+ ) inner_loops;
+ (!code', !revmap')
+ end
let duplicate_aux f =
+ (* initializing *)
let entrypoint = f.fn_entrypoint in
- if !Clflags.option_fduplicate < 0 then
- ((f.fn_code, entrypoint), make_identity_ptree f.fn_code)
- else
- let code = update_directions (f.fn_code) entrypoint in
- let traces = select_traces code entrypoint in
- let icond_code = invert_iconds code traces in
- let preds = get_predecessors_rtl icond_code in
- if !Clflags.option_fduplicate >= 1 then
- let (new_code, pTreeId) = ((* print_traces traces; *) superblockify_traces icond_code preds traces) in
- ((new_code, f.fn_entrypoint), pTreeId)
- else
- ((icond_code, entrypoint), make_identity_ptree code)
+ let code = f.fn_code in
+ let revmap = make_identity_ptree code in
+
+ (* static prediction *)
+ let code =
+ if !Clflags.option_fpredict then
+ update_directions code entrypoint
+ else code in
+
+ (* unroll single *)
+ let (code, revmap) =
+ if !Clflags.option_funrollsingle > 0 then
+ unroll_inner_loops_single f code revmap
+ else (code, revmap) in
+
+ (* unroll body *)
+ let (code, revmap) =
+ if !Clflags.option_funrollbody > 0 then
+ unroll_inner_loops_body f code revmap
+ else (code, revmap) in
+
+ (* static prediction bis *)
+ let code =
+ if !Clflags.option_fpredict then
+ invert_iconds code
+ else code in
+
+ (* tail duplication *)
+ let (code, revmap) =
+ if !Clflags.option_ftailduplicate > 0 then
+ let traces = select_traces code entrypoint in
+ let preds = get_predecessors_rtl code in
+ superblockify_traces code preds traces revmap
+ else (code, revmap) in
+
+ ((code, entrypoint), revmap)
diff --git a/backend/KillUselessMoves.v b/backend/KillUselessMoves.v
new file mode 100644
index 00000000..bdd7ec60
--- /dev/null
+++ b/backend/KillUselessMoves.v
@@ -0,0 +1,40 @@
+(* *************************************************************)
+(* *)
+(* The Compcert verified compiler *)
+(* *)
+(* David Monniaux CNRS, VERIMAG *)
+(* *)
+(* Copyright VERIMAG. All rights reserved. *)
+(* This file is distributed under the terms of the INRIA *)
+(* Non-Commercial License Agreement. *)
+(* *)
+(* *************************************************************)
+
+Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
+Require Import AST Linking.
+Require Import Memory Registers Op RTL.
+Require List.
+
+Definition transf_ros (ros: reg + ident) : reg + ident := ros.
+
+Definition transf_instr (pc: node) (instr: instruction) :=
+ match instr with
+ | Iop op args res s =>
+ if (eq_operation op Omove) && (List.list_eq_dec peq args (res :: nil))
+ then Inop s
+ else instr
+ | _ => instr
+ end.
+
+Definition transf_function (f: function) : function :=
+ {| fn_sig := f.(fn_sig);
+ fn_params := f.(fn_params);
+ fn_stacksize := f.(fn_stacksize);
+ fn_code := PTree.map transf_instr f.(fn_code);
+ fn_entrypoint := f.(fn_entrypoint) |}.
+
+Definition transf_fundef (fd: fundef) : fundef :=
+ AST.transf_fundef transf_function fd.
+
+Definition transf_program (p: program) : program :=
+ transform_program transf_fundef p.
diff --git a/backend/KillUselessMovesproof.v b/backend/KillUselessMovesproof.v
new file mode 100644
index 00000000..629aa6aa
--- /dev/null
+++ b/backend/KillUselessMovesproof.v
@@ -0,0 +1,361 @@
+(* *************************************************************)
+(* *)
+(* The Compcert verified compiler *)
+(* *)
+(* David Monniaux CNRS, VERIMAG *)
+(* *)
+(* Copyright VERIMAG. All rights reserved. *)
+(* This file is distributed under the terms of the INRIA *)
+(* Non-Commercial License Agreement. *)
+(* *)
+(* *************************************************************)
+
+Require Import Axioms.
+Require Import FunInd.
+Require Import Coqlib Maps Errors Integers Floats Lattice Kildall.
+Require Import AST Linking.
+Require Import Values Memory Globalenvs Events Smallstep.
+Require Import Registers Op RTL.
+Require Import KillUselessMoves.
+
+
+Definition match_prog (p tp: RTL.program) :=
+ match_program (fun ctx f tf => tf = transf_fundef f) eq p tp.
+
+Lemma transf_program_match:
+ forall p, match_prog p (transf_program p).
+Proof.
+ intros. eapply match_transform_program; eauto.
+Qed.
+
+Section PRESERVATION.
+
+Variables prog tprog: program.
+Hypothesis TRANSL: match_prog prog tprog.
+Let ge := Genv.globalenv prog.
+Let tge := Genv.globalenv tprog.
+
+Lemma functions_translated:
+ forall v f,
+ Genv.find_funct ge v = Some f ->
+ Genv.find_funct tge v = Some (transf_fundef f).
+Proof (Genv.find_funct_transf TRANSL).
+
+Lemma function_ptr_translated:
+ forall v f,
+ Genv.find_funct_ptr ge v = Some f ->
+ Genv.find_funct_ptr tge v = Some (transf_fundef f).
+Proof (Genv.find_funct_ptr_transf TRANSL).
+
+Lemma symbols_preserved:
+ forall id,
+ Genv.find_symbol tge id = Genv.find_symbol ge id.
+Proof (Genv.find_symbol_transf TRANSL).
+
+Lemma senv_preserved:
+ Senv.equiv ge tge.
+Proof (Genv.senv_transf TRANSL).
+
+Lemma sig_preserved:
+ forall f, funsig (transf_fundef f) = funsig f.
+Proof.
+ destruct f; reflexivity.
+Qed.
+
+Lemma find_function_translated:
+ forall ros rs fd,
+ find_function ge ros rs = Some fd ->
+ find_function tge ros rs = Some (transf_fundef fd).
+Proof.
+ unfold find_function; intros. destruct ros as [r|id].
+ eapply functions_translated; eauto.
+ rewrite symbols_preserved. destruct (Genv.find_symbol ge id); try congruence.
+ eapply function_ptr_translated; eauto.
+Qed.
+
+Lemma transf_function_at:
+ forall f pc i,
+ f.(fn_code)!pc = Some i ->
+ (transf_function f).(fn_code)!pc = Some(transf_instr pc i).
+Proof.
+ intros until i. intro Hcode.
+ unfold transf_function; simpl.
+ rewrite PTree.gmap.
+ unfold option_map.
+ rewrite Hcode.
+ reflexivity.
+Qed.
+
+Ltac TR_AT :=
+ match goal with
+ | [ A: (fn_code _)!_ = Some _ |- _ ] =>
+ generalize (transf_function_at _ _ _ A); intros
+ end.
+
+Section SAME_RS.
+ Context {A : Type}.
+
+ Definition same_rs (rs rs' : Regmap.t A) :=
+ forall x, rs # x = rs' # x.
+
+ Lemma same_rs_refl : forall rs, same_rs rs rs.
+ Proof.
+ unfold same_rs.
+ reflexivity.
+ Qed.
+
+ Lemma same_rs_comm : forall rs rs', (same_rs rs rs') -> (same_rs rs' rs).
+ Proof.
+ unfold same_rs.
+ congruence.
+ Qed.
+
+ Lemma same_rs_trans : forall rs1 rs2 rs3,
+ (same_rs rs1 rs2) -> (same_rs rs2 rs3) -> (same_rs rs1 rs3).
+ Proof.
+ unfold same_rs.
+ congruence.
+ Qed.
+
+ Lemma same_rs_idem_write : forall rs r,
+ (same_rs rs (rs # r <- (rs # r))).
+ Proof.
+ unfold same_rs.
+ intros.
+ rewrite Regmap.gsident.
+ reflexivity.
+ Qed.
+
+ Lemma same_rs_read:
+ forall rs rs' r, (same_rs rs rs') -> rs # r = rs' # r.
+ Proof.
+ unfold same_rs.
+ auto.
+ Qed.
+
+ Lemma same_rs_subst:
+ forall rs rs' l, (same_rs rs rs') -> rs ## l = rs' ## l.
+ Proof.
+ induction l; cbn; intuition congruence.
+ Qed.
+
+ Lemma same_rs_write: forall rs rs' r x,
+ (same_rs rs rs') -> (same_rs (rs # r <- x) (rs' # r <- x)).
+ Proof.
+ unfold same_rs.
+ intros.
+ destruct (peq r x0).
+ { subst x0.
+ rewrite Regmap.gss. rewrite Regmap.gss.
+ reflexivity.
+ }
+ rewrite Regmap.gso by congruence.
+ rewrite Regmap.gso by congruence.
+ auto.
+ Qed.
+
+ Lemma same_rs_setres:
+ forall rs rs' (SAME: same_rs rs rs') res vres,
+ same_rs (regmap_setres res vres rs) (regmap_setres res vres rs').
+ Proof.
+ induction res; cbn; auto using same_rs_write.
+ Qed.
+End SAME_RS.
+
+Lemma same_find_function: forall tge rs rs' (SAME: same_rs rs rs') ros,
+ find_function tge ros rs = find_function tge ros rs'.
+Proof.
+ destruct ros; cbn.
+ { rewrite (same_rs_read rs rs' r SAME).
+ reflexivity. }
+ reflexivity.
+Qed.
+
+Inductive match_frames: RTL.stackframe -> RTL.stackframe -> Prop :=
+| match_frames_intro: forall res f sp pc rs rs' (SAME : same_rs rs rs'),
+ match_frames (Stackframe res f sp pc rs)
+ (Stackframe res (transf_function f) sp pc rs').
+
+Inductive match_states: RTL.state -> RTL.state -> Prop :=
+ | match_regular_states: forall stk f sp pc rs rs' m stk'
+ (SAME: same_rs rs rs')
+ (STACKS: list_forall2 match_frames stk stk'),
+ match_states (State stk f sp pc rs m)
+ (State stk' (transf_function f) sp pc rs' m)
+ | match_callstates: forall stk f args m stk'
+ (STACKS: list_forall2 match_frames stk stk'),
+ match_states (Callstate stk f args m)
+ (Callstate stk' (transf_fundef f) args m)
+ | match_returnstates: forall stk v m stk'
+ (STACKS: list_forall2 match_frames stk stk'),
+ match_states (Returnstate stk v m)
+ (Returnstate stk' v m).
+
+Lemma step_simulation:
+ forall S1 t S2, RTL.step ge S1 t S2 ->
+ forall S1', match_states S1 S1' ->
+ exists S2', RTL.step tge S1' t S2' /\ match_states S2 S2'.
+Proof.
+ induction 1; intros S1' MS; inv MS; try TR_AT.
+- (* nop *)
+ econstructor; split. eapply exec_Inop; eauto.
+ constructor; auto.
+- (* op *)
+ cbn in H1.
+ destruct (_ && _) eqn:IS_MOVE in H1.
+ {
+ destruct eq_operation in IS_MOVE. 2: discriminate.
+ destruct list_eq_dec in IS_MOVE. 2: discriminate.
+ subst op. subst args.
+ clear IS_MOVE.
+ cbn in H0.
+ inv H0.
+ econstructor; split.
+ { eapply exec_Inop; eauto. }
+ constructor.
+ 2: assumption.
+ eapply same_rs_trans.
+ { apply same_rs_comm.
+ apply same_rs_idem_write.
+ }
+ assumption.
+ }
+ econstructor; split.
+ eapply exec_Iop with (v := v); eauto.
+ rewrite <- H0.
+ rewrite (same_rs_subst rs rs' args SAME).
+ apply eval_operation_preserved. exact symbols_preserved.
+ constructor; auto using same_rs_write.
+(* load *)
+- econstructor; split.
+ assert (eval_addressing tge sp addr rs' ## args = Some a).
+ { rewrite <- H0.
+ rewrite (same_rs_subst rs rs' args SAME).
+ apply eval_addressing_preserved. exact symbols_preserved.
+ }
+ eapply exec_Iload; eauto.
+ constructor; auto using same_rs_write.
+- (* load notrap1 *)
+ econstructor; split.
+ assert (eval_addressing tge sp addr rs' ## args = None).
+ { rewrite <- H0.
+ rewrite (same_rs_subst rs rs' args SAME).
+ apply eval_addressing_preserved. exact symbols_preserved.
+ }
+ eapply exec_Iload_notrap1; eauto.
+ constructor; auto using same_rs_write.
+- (* load notrap2 *)
+ econstructor; split.
+ assert (eval_addressing tge sp addr rs' ## args = Some a).
+ { rewrite <- H0.
+ rewrite (same_rs_subst rs rs' args SAME).
+ apply eval_addressing_preserved. exact symbols_preserved.
+ }
+ eapply exec_Iload_notrap2; eauto.
+ constructor; auto using same_rs_write.
+- (* store *)
+ econstructor; split.
+ assert (eval_addressing tge sp addr rs' ## args = Some a).
+ { rewrite <- H0.
+ rewrite (same_rs_subst rs rs' args SAME).
+ apply eval_addressing_preserved. exact symbols_preserved.
+ }
+ rewrite (same_rs_read rs rs' src SAME) in H1.
+ eapply exec_Istore; eauto.
+ constructor; auto.
+(* call *)
+- econstructor; split.
+ eapply exec_Icall with (fd := transf_fundef fd); eauto.
+ eapply find_function_translated; eauto.
+ { rewrite <- (same_find_function ge rs rs') by assumption.
+ assumption. }
+ apply sig_preserved.
+ rewrite (same_rs_subst rs rs' args SAME).
+ constructor. constructor; auto. constructor; auto.
+(* tailcall *)
+- econstructor; split.
+ eapply exec_Itailcall with (fd := transf_fundef fd); eauto.
+ eapply find_function_translated; eauto.
+ { rewrite <- (same_find_function ge rs rs') by assumption.
+ assumption. }
+ apply sig_preserved.
+ rewrite (same_rs_subst rs rs' args SAME).
+ constructor. auto.
+(* builtin *)
+- econstructor; split.
+ eapply exec_Ibuiltin; eauto.
+ eapply eval_builtin_args_preserved with (ge1 := ge); eauto. exact symbols_preserved.
+ {
+ replace (fun r : positive => rs' # r) with (fun r : positive => rs # r).
+ eassumption.
+ apply functional_extensionality.
+ auto using same_rs_read.
+ }
+ eapply external_call_symbols_preserved; eauto. apply senv_preserved.
+ constructor; auto.
+ auto using same_rs_setres.
+(* cond *)
+- econstructor; split.
+ eapply exec_Icond; eauto.
+ rewrite <- (same_rs_subst rs rs' args SAME); eassumption.
+ constructor; auto.
+(* jumptbl *)
+- econstructor; split.
+ eapply exec_Ijumptable; eauto.
+ rewrite <- (same_rs_read rs rs' arg SAME); eassumption.
+ constructor; auto.
+(* return *)
+- econstructor; split.
+ eapply exec_Ireturn; eauto.
+ destruct or; cbn.
+ + rewrite <- (same_rs_read rs rs' r SAME) by auto.
+ constructor; auto.
+ + constructor; auto.
+(* internal function *)
+- simpl. econstructor; split.
+ eapply exec_function_internal; eauto.
+ constructor; auto.
+ cbn.
+ apply same_rs_refl.
+(* external function *)
+- econstructor; split.
+ eapply exec_function_external; eauto.
+ eapply external_call_symbols_preserved; eauto. apply senv_preserved.
+ constructor; auto.
+(* return *)
+- inv STACKS. inv H1.
+ econstructor; split.
+ eapply exec_return; eauto.
+ constructor; auto using same_rs_write.
+Qed.
+
+Lemma transf_initial_states:
+ forall S1, RTL.initial_state prog S1 ->
+ exists S2, RTL.initial_state tprog S2 /\ match_states S1 S2.
+Proof.
+ intros. inv H. econstructor; split.
+ econstructor.
+ eapply (Genv.init_mem_transf TRANSL); eauto.
+ rewrite symbols_preserved. rewrite (match_program_main TRANSL). eauto.
+ eapply function_ptr_translated; eauto.
+ rewrite <- H3; apply sig_preserved.
+ constructor. constructor.
+Qed.
+
+Lemma transf_final_states:
+ forall S1 S2 r, match_states S1 S2 -> RTL.final_state S1 r -> RTL.final_state S2 r.
+Proof.
+ intros. inv H0. inv H. inv STACKS. constructor.
+Qed.
+
+Theorem transf_program_correct:
+ forward_simulation (RTL.semantics prog) (RTL.semantics tprog).
+Proof.
+ eapply forward_simulation_step.
+ apply senv_preserved.
+ eexact transf_initial_states.
+ eexact transf_final_states.
+ exact step_simulation.
+Qed.
+
+End PRESERVATION.
diff --git a/backend/LICMaux.ml b/backend/LICMaux.ml
index c3907809..0ca4418b 100644
--- a/backend/LICMaux.ml
+++ b/backend/LICMaux.ml
@@ -19,6 +19,62 @@ open Inject;;
type reg = P.t;;
+(** get_loop_headers moved from Duplicateaux.ml to LICMaux.ml to prevent cycle dependencies *)
+let debug_flag = ref false
+
+let debug fmt =
+ if !debug_flag then Printf.eprintf fmt
+ else Printf.ifprintf stderr fmt
+
+type vstate = Unvisited | Processed | Visited
+
+let get_some = function
+| None -> failwith "Did not get some"
+| Some thing -> thing
+
+let rtl_successors = function
+| Itailcall _ | Ireturn _ -> []
+| Icall(_,_,_,_,n) | Ibuiltin(_,_,_,n) | Inop n | Iop (_,_,_,n)
+| Iload (_,_,_,_,_,n) | Istore (_,_,_,_,n) -> [n]
+| Icond (_,_,n1,n2,_) -> [n1; n2]
+| Ijumptable (_,ln) -> ln
+
+(** Getting loop branches with a DFS visit :
+ * Each node is either Unvisited, Visited, or Processed
+ * pre-order: node becomes Processed
+ * post-order: node becomes Visited
+ *
+ * If we come accross an edge to a Processed node, it's a loop!
+ *)
+let get_loop_headers code entrypoint = begin
+ debug "get_loop_headers\n";
+ let visited = ref (PTree.map (fun n i -> Unvisited) code)
+ and is_loop_header = ref (PTree.map (fun n i -> false) code)
+ in let rec dfs_visit code = function
+ | [] -> ()
+ | node :: ln ->
+ match (get_some @@ PTree.get node !visited) with
+ | Visited -> ()
+ | Processed -> begin
+ debug "Node %d is a loop header\n" (P.to_int node);
+ is_loop_header := PTree.set node true !is_loop_header;
+ visited := PTree.set node Visited !visited
+ end
+ | Unvisited -> begin
+ visited := PTree.set node Processed !visited;
+ match PTree.get node code with
+ | None -> failwith "No such node"
+ | Some i -> let next_visits = rtl_successors i in dfs_visit code next_visits;
+ visited := PTree.set node Visited !visited;
+ dfs_visit code ln
+ end
+ in begin
+ dfs_visit code [entrypoint];
+ !is_loop_header
+ end
+end
+
+
module Dominator =
struct
type t = Unreachable | Dominated of int | Multiple
@@ -57,7 +113,7 @@ let apply_dominator (is_marked : node -> bool) (pc : node)
let dominated_parts1 (f : coq_function) :
(bool PTree.t) * (Dominator.t PMap.t option) =
- let headers = Duplicateaux.get_loop_headers f.fn_code f.fn_entrypoint in
+ let headers = get_loop_headers f.fn_code f.fn_entrypoint in
let dominated = Dominator_Solver.fixpoint f.fn_code RTL.successors_instr
(apply_dominator (fun pc -> match PTree.get pc headers with
| Some x -> x
@@ -248,7 +304,7 @@ let print_dominated_parts1 oc f =
(PTree.elements f.fn_code);;
let loop_headers (f : coq_function) : RTL.node list =
- List.map fst (List.filter snd (PTree.elements (Duplicateaux.get_loop_headers f.fn_code f.fn_entrypoint)));;
+ List.map fst (List.filter snd (PTree.elements (get_loop_headers f.fn_code f.fn_entrypoint)));;
let print_loop_headers f =
print_endline "Loop headers";
diff --git a/doc/index-kvx.html b/doc/index-kvx.html
index 97eefc24..6906c212 100644
--- a/doc/index-kvx.html
+++ b/doc/index-kvx.html
@@ -34,8 +34,8 @@ a:active {color : Red; text-decoration : underline; }
The unmodified parts of this table appear in <font color=gray>gray</font>.
<br>
<br>
- A high-level view of this backend of CompCert is provided by this HAL preprint of Six, Boulm&eacute; and Monniaux (2019):
- <div><a href=https://hal.archives-ouvertes.fr/hal-02185883>Certified Compiler Backends for VLIW Processors (Highly Modular Postpass-Scheduling in the CompCert Certified Compiler)</a></div>
+ A high-level view of this CompCert backend is provided by this OOSPLA'20 paper (of Six, Boulm&eacute; and Monniaux):
+ <div><a href=https://hal.archives-ouvertes.fr/hal-02185883>Certified and Efficient Instruction Scheduling. Application to Interlocked VLIW Processors.</a></div>
<br>
Our source code is available on our <a href=https://gricad-gitlab.univ-grenoble-alpes.fr/certicompil/compcert-kvx>GitLab public repository</a> (see conditions in the LICENSE file).
</p>
diff --git a/driver/Clflags.ml b/driver/Clflags.ml
index 829af76a..d5f3aca5 100644
--- a/driver/Clflags.ml
+++ b/driver/Clflags.ml
@@ -33,13 +33,21 @@ let option_fcse3_across_calls = ref false
let option_fcse3_across_merges = ref true
let option_fcse3_glb = ref true
let option_fredundancy = ref true
-let option_fduplicate = ref (-1)
-let option_finvertcond = ref true
-let option_ftracelinearize = ref false
+
+(** Options relative to superblock scheduling *)
+let option_fpredict = ref true (* insert static branch prediction information, and swaps ifso/ifnot branches accordingly *)
+let option_ftailduplicate = ref 0 (* perform tail duplication for blocks of size n *)
+let option_ftracelinearize = ref true (* uses branch prediction information to improve the linearization *)
+let option_funrollsingle = ref 0 (* unroll a single iteration of innermost loops of size n *)
+let option_funrollbody = ref 0 (* unroll the body of innermost loops of size n *)
+
+(* Scheduling *)
let option_fprepass = ref false
let option_fprepass_sched = ref "list"
+
let option_fpostpass = ref true
let option_fpostpass_sched = ref "list"
+
let option_fifconversion = ref true
let option_Obranchless = ref false
let option_falignfunctions = ref (None: int option)
diff --git a/driver/Compopts.v b/driver/Compopts.v
index d576ede6..540e8922 100644
--- a/driver/Compopts.v
+++ b/driver/Compopts.v
@@ -27,9 +27,6 @@ Parameter generate_float_constants: unit -> bool.
(** For value analysis. Currently always false. *)
Parameter va_strict: unit -> bool.
-(** Flag -fduplicate. Branch prediction annotation + tail duplication *)
-Parameter optim_duplicate: unit -> bool.
-
(** Flag -ftailcalls. For tail call optimization. *)
Parameter optim_tailcalls: unit -> bool.
diff --git a/driver/Driver.ml b/driver/Driver.ml
index fef9c166..e5fc78f8 100644
--- a/driver/Driver.ml
+++ b/driver/Driver.ml
@@ -210,15 +210,12 @@ Processing options:
-fpostpass Perform postpass scheduling (only for K1 architecture) [on]
-fpostpass= <optim> Perform postpass scheduling with the specified optimization [list]
(<optim>=list: list scheduling, <optim>=ilp: ILP, <optim>=greedy: just packing bundles)
- -fduplicate <nb_nodes> Perform tail duplication to form superblocks on predicted traces
- nb_nodes control the heuristic deciding to duplicate or not
- A value of -1 desactivates the entire pass (including branch prediction)
- A value of 0 desactivates the duplication (but activates the branch prediction)
- FIXME : this is desactivated by default for now
- -finvertcond Invert conditions based on predicted paths (to prefer fallthrough).
- Requires -fduplicate to be also activated [on]
- -ftracelinearize Linearizes based on the traces identified by duplicate phase
- It is heavily recommended to activate -finvertcond with this pass [off]
+ -fpredict Insert static branch prediction information [on]
+ Also swaps ifso/ifnot branches accordingly at RTL level
+ -ftailduplicate n Perform tail duplication for RTL code blocks of size n (not counting Inops) [0]
+ -ftracelinearize Uses branch prediction information to improve the Linearize [on]
+ -funrollsingle n Unrolls a single iteration of innermost loops of size n (not counting Inops) [0]
+ -funrollbody n Unrolls once the body of innermost loops of size n (not counting Inops) [0]
-fforward-moves Forward moves after CSE
-finline Perform inlining of functions [on]
-finline-functions-called-once Integrate functions only required by their
@@ -283,6 +280,7 @@ let dump_mnemonics destfile =
let optimization_options = [
option_ftailcalls; option_fifconversion; option_fconstprop;
option_fcse; option_fcse2; option_fcse3;
+ option_fpredict; option_ftracelinearize;
option_fpostpass;
option_fredundancy; option_finline; option_finline_functions_called_once;
]
@@ -424,8 +422,10 @@ let cmdline_actions =
@ f_opt "redundancy" option_fredundancy
@ f_opt "prepass" option_fprepass
@ f_opt "postpass" option_fpostpass
- @ [ Exact "-fduplicate", Integer (fun n -> option_fduplicate := n) ]
- @ f_opt "invertcond" option_finvertcond
+ @ [ Exact "-ftailduplicate", Integer (fun n -> option_ftailduplicate := n) ]
+ @ f_opt "predict" option_fpredict
+ @ [ Exact "-funrollsingle", Integer (fun n -> option_funrollsingle := n) ]
+ @ [ Exact "-funrollbody", Integer (fun n -> option_funrollbody := n) ]
@ f_opt "tracelinearize" option_ftracelinearize
@ f_opt_str "prepass" option_fprepass option_fprepass_sched
@ f_opt_str "postpass" option_fpostpass option_fpostpass_sched
diff --git a/extraction/extraction.v b/extraction/extraction.v
index e43594fc..bd396cd8 100644
--- a/extraction/extraction.v
+++ b/extraction/extraction.v
@@ -111,8 +111,6 @@ Extract Constant Compopts.generate_float_constants =>
"fun _ -> !Clflags.option_ffloatconstprop >= 2".
Extract Constant Compopts.optim_tailcalls =>
"fun _ -> !Clflags.option_ftailcalls".
-Extract Constant Compopts.optim_duplicate =>
- "fun _ -> (if !Clflags.option_fduplicate = -1 then false else true)".
Extract Constant Compopts.optim_constprop =>
"fun _ -> !Clflags.option_fconstprop".
Extract Constant Compopts.optim_CSE =>
diff --git a/kvx/SelectOp.vp b/kvx/SelectOp.vp
index 9e5d45a0..65dba3ac 100644
--- a/kvx/SelectOp.vp
+++ b/kvx/SelectOp.vp
@@ -103,8 +103,14 @@ Nondetfunction select0 (ty : typ) (cond0 : condition0) (e1 e2 e3: expr) :=
| _, _, _ => (Eop (Osel cond0 ty) (e1 ::: e2 ::: e3 ::: Enil))
end.
+Definition same_expr_pure (e1 e2: expr) :=
+ match e1, e2 with
+ | Evar v1, Evar v2 => if ident_eq v1 v2 then true else false
+ | _, _ => false
+ end.
+
Definition select (ty : typ) (cond : condition) (args : exprlist) (e1 e2: expr) : option expr :=
- Some(
+ Some (if same_expr_pure e1 e2 then e1 else
match cond_to_condition0 cond args with
| None => select0 ty (Ccomp0 Cne) e1 e2 (Eop (Ocmp cond) args)
| Some(cond0, ec) => select0 ty cond0 e1 e2 ec
@@ -356,12 +362,6 @@ Nondetfunction orimm (n1: int) (e2: expr) :=
| _ => Eop (Oorimm n1) (e2:::Enil)
end.
-Definition same_expr_pure (e1 e2: expr) :=
- match e1, e2 with
- | Evar v1, Evar v2 => if ident_eq v1 v2 then true else false
- | _, _ => false
- end.
-
Nondetfunction or (e1: expr) (e2: expr) :=
match e1, e2 with
| Eop (Ointconst n1) Enil, t2 => orimm n1 t2
diff --git a/kvx/SelectOpproof.v b/kvx/SelectOpproof.v
index c23ed601..7a301929 100644
--- a/kvx/SelectOpproof.v
+++ b/kvx/SelectOpproof.v
@@ -1547,6 +1547,15 @@ Proof.
intros until b.
intro Hop; injection Hop; clear Hop; intro; subst a.
intros HeL He1 He2 HeC.
+ destruct same_expr_pure eqn:SAME.
+ {
+ destruct (eval_same_expr a1 a2 le v1 v2 SAME He1 He2) as [EQ1 EQ2].
+ subst a2. subst v2.
+ exists v1; split; trivial.
+ cbn.
+ rewrite if_same.
+ apply Val.lessdef_normalize.
+ }
unfold cond_to_condition0.
destruct (cond_to_condition0_match cond al).
{
diff --git a/lib/Coqlib.v b/lib/Coqlib.v
index 02c5d07f..16d880fa 100644
--- a/lib/Coqlib.v
+++ b/lib/Coqlib.v
@@ -1325,3 +1325,9 @@ Lemma nlist_forall2_imply:
Proof.
induction 1; simpl; intros; constructor; auto.
Qed.
+
+Lemma if_same : forall {T : Type} (b : bool) (x : T),
+ (if b then x else x) = x.
+Proof.
+ destruct b; trivial.
+Qed.
diff --git a/runtime/include/ccomp_kvx_fixes.h b/runtime/include/ccomp_kvx_fixes.h
index 65d65e7b..a518a069 100644
--- a/runtime/include/ccomp_kvx_fixes.h
+++ b/runtime/include/ccomp_kvx_fixes.h
@@ -33,13 +33,26 @@ extern __int128 __compcert_acswapd(void *address, unsigned long long new_value,
#define __builtin_kvx_acswapw __compcert_acswapw
extern __int128 __compcert_acswapw(void *address, unsigned long long new_value, unsigned long long old_value);
+#define __builtin_kvx_aladdd __compcert_aladdd
+extern long long __compcert_aladdd(void *address, unsigned long long incr);
+
+#define __builtin_kvx_aladdw __compcert_aladdw
+extern int __compcert_aladdw(void *address, unsigned int incr);
+
#define __builtin_kvx_afaddd __compcert_afaddd
extern long long __compcert_afaddd(void *address, unsigned long long incr);
#define __builtin_kvx_afaddw __compcert_afaddw
extern int __compcert_afaddw(void *address, unsigned int incr);
-#endif
+
+#define __builtin_kvx_ld __compcert_ld
+extern int __compcert_ld(void *address, const char *str, const int b);
+
+#define __builtin_kvx_lwz __compcert_lwz
+extern int __compcert_lwz(void *address, const char *str, const int b);
/* #define __builtin_expect(x, y) (x) */
#define __builtin_ctz(x) __builtin_kvx_ctzw(x)
#define __builtin_clz(x) __builtin_kvx_clzw(x)
+
+#endif
diff --git a/test/kvx/sort/Makefile b/test/kvx/sort/Makefile
index 1afab6e9..46a8f025 100644
--- a/test/kvx/sort/Makefile
+++ b/test/kvx/sort/Makefile
@@ -3,7 +3,7 @@ CC ?= gcc
CCOMP ?= ccomp
CFLAGS ?= -O2
SIMU ?= kvx-mppa
-TIMEOUT ?= 10s
+TIMEOUT ?= 20s
KVXCPATH=$(shell which $(KVXC))
CCPATH=$(shell which $(CC))
diff --git a/test/monniaux/loop_nest/syrk.c b/test/monniaux/loop_nest/syrk.c
new file mode 100644
index 00000000..490d0a01
--- /dev/null
+++ b/test/monniaux/loop_nest/syrk.c
@@ -0,0 +1,28 @@
+/* Include polybench common header. */
+#include "polybench.h"
+
+/* Include benchmark-specific header. */
+/* Default data type is double, default size is 4000. */
+#include "syrk.h"
+
+/* Main computational kernel. The whole function will be timed,
+ including the call and return. */
+void kernel_syrk(int ni, int nj,
+ DATA_TYPE alpha,
+ DATA_TYPE beta,
+ DATA_TYPE POLYBENCH_2D(C,NI,NI,ni,ni),
+ DATA_TYPE POLYBENCH_2D(A,NI,NJ,ni,nj))
+{
+ int i, j, k;
+
+ /* C := alpha*A*A' + beta*C */
+#if 0
+ for (i = 0; i < _PB_NI; i++)
+ for (j = 0; j < _PB_NI; j++)
+ C[i][j] *= beta;
+#endif
+ for (i = 0; i < _PB_NI; i++)
+ for (j = 0; j < _PB_NI; j++)
+ for (k = 0; k < _PB_NJ; k++)
+ C[i][j] += alpha * A[i][k] * A[j][k];
+}
diff --git a/tools/compiler_expand.ml b/tools/compiler_expand.ml
index c37880d5..7e1e0181 100644
--- a/tools/compiler_expand.ml
+++ b/tools/compiler_expand.ml
@@ -20,13 +20,14 @@ TOTAL, (Option "profile_arcs"), (Some "Profiling insertion"), "Profiling";
TOTAL, (Option "branch_probabilities"), (Some "Profiling use"), "ProfilingExploit";
TOTAL, (Option "optim_move_loop_invariants"), (Some "Inserting initial nop"), "FirstNop";
TOTAL, Always, (Some "Renumbering"), "Renumber";
-PARTIAL, (Option "optim_duplicate"), (Some "Tail-duplicating"), "Duplicate";
+PARTIAL, (Option "optim_CSE"), (Some "CSE"), "CSE";
+PARTIAL, Always, (Some "Duplicating blocks"), "Duplicate";
TOTAL, Always, (Some "Renumbering pre constprop"), "Renumber";
TOTAL, (Option "optim_constprop"), (Some "Constant propagation"), "Constprop";
TOTAL, Always, (Some "Renumbering pre CSE"), "Renumber";
-PARTIAL, (Option "optim_CSE"), (Some "CSE"), "CSE";
TOTAL, (Option "optim_CSE2"), (Some "CSE2"), "CSE2";
PARTIAL, (Option "optim_CSE3"), (Some "CSE3"), "CSE3";
+TOTAL, (Option "optim_CSE3"), (Some "Kill useless moves after CSE3"), "KillUselessMoves";
TOTAL, (Option "optim_forward_moves"), (Some "Forwarding moves"), "ForwardMoves";
PARTIAL, (Option "optim_redundancy"), (Some "Redundancy elimination"), "Deadcode";
PARTIAL, (Option "optim_move_loop_invariants"), (Some "LICM"), "LICM";