diff options
author | David Monniaux <david.monniaux@univ-grenoble-alpes.fr> | 2020-10-18 22:49:10 +0200 |
---|---|---|
committer | David Monniaux <david.monniaux@univ-grenoble-alpes.fr> | 2020-10-18 22:49:10 +0200 |
commit | 996a2e5bbc4826d95144b62f5218b6e3e1e7d881 (patch) | |
tree | 55a5136c68e7d0f7a304913f28d0a761d9facd9a | |
parent | 8c3a2bdb56eba8d8bc5e359b01a320916eac85f0 (diff) | |
parent | a2f31f2b886ccb9656a019db1780aabc1789368a (diff) | |
download | compcert-kvx-996a2e5bbc4826d95144b62f5218b6e3e1e7d881.tar.gz compcert-kvx-996a2e5bbc4826d95144b62f5218b6e3e1e7d881.zip |
Merge remote-tracking branch 'origin/kvx-work' into kvx-test-prepass
-rw-r--r-- | .gitlab-ci.yml | 2 | ||||
-rw-r--r-- | Makefile | 1 | ||||
-rw-r--r-- | backend/CSE3analysis.v | 38 | ||||
-rw-r--r-- | backend/CSE3analysisaux.ml | 4 | ||||
-rw-r--r-- | backend/CSE3analysisproof.v | 29 | ||||
-rw-r--r-- | backend/CSE3proof.v | 6 | ||||
-rw-r--r-- | backend/Duplicateaux.ml | 450 | ||||
-rw-r--r-- | backend/KillUselessMoves.v | 40 | ||||
-rw-r--r-- | backend/KillUselessMovesproof.v | 361 | ||||
-rw-r--r-- | backend/LICMaux.ml | 60 | ||||
-rw-r--r-- | doc/index-kvx.html | 4 | ||||
-rw-r--r-- | driver/Clflags.ml | 14 | ||||
-rw-r--r-- | driver/Compopts.v | 3 | ||||
-rw-r--r-- | driver/Driver.ml | 22 | ||||
-rw-r--r-- | extraction/extraction.v | 2 | ||||
-rw-r--r-- | kvx/SelectOp.vp | 14 | ||||
-rw-r--r-- | kvx/SelectOpproof.v | 9 | ||||
-rw-r--r-- | lib/Coqlib.v | 6 | ||||
-rw-r--r-- | runtime/include/ccomp_kvx_fixes.h | 15 | ||||
-rw-r--r-- | test/kvx/sort/Makefile | 2 | ||||
-rw-r--r-- | test/monniaux/loop_nest/syrk.c | 28 | ||||
-rw-r--r-- | tools/compiler_expand.ml | 5 |
22 files changed, 939 insertions, 176 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3b1a86fd..10008017 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -237,7 +237,7 @@ build_kvx: - source /opt/kalray/accesscore/kalray.sh && ./config_kvx.sh - source /opt/kalray/accesscore/kalray.sh && make -j "$NJOBS" - source /opt/kalray/accesscore/kalray.sh && make -C test CCOMPOPTS=-static SIMU='kvx-cluster -- ' EXECUTE='kvx-cluster -- ' all test - - source /opt/kalray/accesscore/kalray.sh && make -C test/monniaux/yarpgen TARGET_CC='kvx-cos-gcc' EXECUTE='kvx-cluster -- ' CCOMPOPTS='-static' TARGET_CFLAGS='-static' + - source /opt/kalray/accesscore/kalray.sh && ulimit -s65536 && make -C test/monniaux/yarpgen TARGET_CC='kvx-cos-gcc' EXECUTE='kvx-cluster -- ' CCOMPOPTS='-static' TARGET_CFLAGS='-static' rules: - if: '$CI_COMMIT_BRANCH == "kvx-work"' when: always @@ -95,6 +95,7 @@ BACKEND=\ CSE2deps.v CSE2depsproof.v \ CSE2.v CSE2proof.v \ CSE3analysis.v CSE3analysisproof.v CSE3.v CSE3proof.v \ + KillUselessMoves.v KillUselessMovesproof.v \ LICM.v LICMproof.v \ NeedDomain.v NeedOp.v Deadcode.v Deadcodeproof.v \ Unusedglob.v Unusedglobproof.v \ diff --git a/backend/CSE3analysis.v b/backend/CSE3analysis.v index ade79c28..7316c9a9 100644 --- a/backend/CSE3analysis.v +++ b/backend/CSE3analysis.v @@ -298,13 +298,22 @@ Section OPERATIONS. Definition move (src dst : reg) (rel : RELATION.t) : RELATION.t := - match eq_find {| eq_lhs := dst; + if peq src dst + then rel + else + match eq_find {| eq_lhs := dst; eq_op := SOp Omove; eq_args:= src::nil |} with - | Some eq_id => PSet.add eq_id (kill_reg dst rel) - | None => kill_reg dst rel - end. + | Some eq_id => PSet.add eq_id (kill_reg dst rel) + | None => kill_reg dst rel + end. + Definition is_trivial_sym_op sop := + match sop with + | SOp op => is_trivial_op op + | SLoad _ _ => false + end. + Definition oper (dst : reg) (op: sym_op) (args : list reg) (rel : RELATION.t) : RELATION.t := if is_smove op @@ -315,15 +324,18 @@ Section OPERATIONS. | _ => kill_reg dst rel end else - let args' := forward_move_l rel args in - match rhs_find op args' rel with - | Some r => - if Compopts.optim_CSE3_glb tt - then RELATION.glb (move r dst rel) - (oper1 dst op args' rel) - else oper1 dst op args' rel - | None => oper1 dst op args' rel - end. + if is_trivial_sym_op op + then kill_reg dst rel + else + let args' := forward_move_l rel args in + match rhs_find op args' rel with + | Some r => + if Compopts.optim_CSE3_glb tt + then RELATION.glb (move r dst rel) + (oper1 dst op args' rel) + else oper1 dst op args' rel + | None => oper1 dst op args' rel + end. Definition clever_kill_store (chunk : memory_chunk) (addr: addressing) (args : list reg) diff --git a/backend/CSE3analysisaux.ml b/backend/CSE3analysisaux.ml index 3e4a6b9e..3990b765 100644 --- a/backend/CSE3analysisaux.ml +++ b/backend/CSE3analysisaux.ml @@ -67,6 +67,9 @@ let pp_option pp oc = function | None -> output_string oc "none" | Some x -> pp oc x;; +let is_trivial eq = + (eq.eq_op = SOp Op.Omove) && (eq.eq_args = [eq.eq_lhs]);; + let preanalysis (tenv : typing_env) (f : RTL.coq_function) = let cur_eq_id = ref 0 and cur_catalog = ref PTree.empty @@ -76,6 +79,7 @@ let preanalysis (tenv : typing_env) (f : RTL.coq_function) = and cur_kill_mem = ref PSet.empty and cur_moves = ref (PMap.init PSet.empty) in let eq_find_oracle node eq = + assert (not (is_trivial eq)); let o = Hashtbl.find_opt eq_table (flatten_eq eq) in (if !Clflags.option_debug_compcert > 1 then Printf.printf "@%d: eq_find %a -> %a\n" (P.to_int node) diff --git a/backend/CSE3analysisproof.v b/backend/CSE3analysisproof.v index f4e3672d..66b199cc 100644 --- a/backend/CSE3analysisproof.v +++ b/backend/CSE3analysisproof.v @@ -745,6 +745,25 @@ Section SOUNDNESS. Hint Resolve oper1_sound : cse3. + Lemma rel_idem_replace: + forall rel rs r m, + sem_rel rel rs m -> + sem_rel rel rs # r <- (rs # r) m. + Proof. + intros until m. + intro REL. + unfold sem_rel, sem_eq, sem_rhs in *. + intros. + specialize REL with (i:=i) (eq0:=eq). + rewrite Regmap.gsident. + replace ((rs # r <- (rs # r)) ## (eq_args eq)) with + (rs ## (eq_args eq)). + { apply REL; auto. } + apply list_map_exten. + intros. + apply Regmap.gsident. + Qed. + Lemma move_sound : forall no : node, forall rel : RELATION.t, @@ -756,6 +775,10 @@ Section SOUNDNESS. unfold move. intros until m. intro REL. + destruct (peq src dst). + { subst dst. + apply rel_idem_replace; auto. + } pose proof (eq_find_sound no {| eq_lhs := dst; eq_op := SOp Omove; eq_args := src :: nil |}) as EQ_FIND_SOUND. destruct eq_find. - intros i eq CONTAINS. @@ -798,7 +821,11 @@ Section SOUNDNESS. subst. rewrite <- (forward_move_sound rel rs m r) by auto. apply move_sound; auto. - - destruct rhs_find as [src |] eqn:RHS_FIND. + - destruct (is_trivial_sym_op sop). + { + apply kill_reg_sound; auto. + } + destruct rhs_find as [src |] eqn:RHS_FIND. + destruct (Compopts.optim_CSE3_glb tt). * apply sem_rel_glb; split. ** pose proof (rhs_find_sound no sop (forward_move_l (ctx:=ctx) rel args) rel src rs m REL RHS_FIND) as SOUND. diff --git a/backend/CSE3proof.v b/backend/CSE3proof.v index 6e489066..3fbc9912 100644 --- a/backend/CSE3proof.v +++ b/backend/CSE3proof.v @@ -443,12 +443,6 @@ Ltac IND_STEP := idtac mpc mpc' fn minstr *) end. -Lemma if_same : forall {T : Type} (b : bool) (x : T), - (if b then x else x) = x. -Proof. - destruct b; trivial. -Qed. - Lemma step_simulation: forall S1 t S2, RTL.step ge S1 t S2 -> forall S1', match_states S1 S1' -> diff --git a/backend/Duplicateaux.ml b/backend/Duplicateaux.ml index 1297ec90..eb9f42e0 100644 --- a/backend/Duplicateaux.ml +++ b/backend/Duplicateaux.ml @@ -15,6 +15,7 @@ (* Oracle for Duplicate pass. * - Add static prediction information to Icond nodes * - Performs tail duplication on interesting traces to form superblocks + * - Unrolls a single iteration of innermost loops * - (TODO: perform partial loop unrolling inside innermost loops) *) @@ -22,23 +23,13 @@ open RTL open Maps open Camlcoq -let debug_flag = ref false - -let debug fmt = - if !debug_flag then Printf.eprintf fmt - else Printf.ifprintf stderr fmt - -let get_some = function -| None -> failwith "Did not get some" -| Some thing -> thing - -let rtl_successors = function -| Itailcall _ | Ireturn _ -> [] -| Icall(_,_,_,_,n) | Ibuiltin(_,_,_,n) | Inop n | Iop (_,_,_,n) -| Iload (_,_,_,_,_,n) | Istore (_,_,_,_,n) -> [n] -| Icond (_,_,n1,n2,_) -> [n1; n2] -| Ijumptable (_,ln) -> ln +let debug_flag = LICMaux.debug_flag +let debug = LICMaux.debug +let get_loop_headers = LICMaux.get_loop_headers +let get_some = LICMaux.get_some +let rtl_successors = LICMaux.rtl_successors +(* Get list of nodes following a BFS of the code *) let bfs code entrypoint = begin debug "bfs\n"; let visited = ref (PTree.map (fun n i -> false) code) @@ -67,6 +58,7 @@ let optbool o = match o with Some _ -> true | None -> false let ptree_get_some n ptree = get_some @@ PTree.get n ptree +(* Returns a PTree: node -> list of the predecessors of that node *) let get_predecessors_rtl code = begin debug "get_predecessors_rtl\n"; let preds = ref (PTree.map (fun n i -> []) code) in @@ -89,15 +81,13 @@ end module PSet = Set.Make(PInt) -let print_intlist l = - let rec f = function +let print_intlist oc l = + let rec f oc = function | [] -> () - | n::ln -> (Printf.printf "%d " (P.to_int n); f ln) + | n::ln -> (Printf.fprintf oc "%d %a" (P.to_int n) f ln) in begin if !debug_flag then begin - Printf.printf "["; - f l; - Printf.printf "]" + Printf.fprintf oc "[%a]" f l end end @@ -113,43 +103,6 @@ let print_intset s = end end -type vstate = Unvisited | Processed | Visited - -(** Getting loop branches with a DFS visit : - * Each node is either Unvisited, Visited, or Processed - * pre-order: node becomes Processed - * post-order: node becomes Visited - * - * If we come accross an edge to a Processed node, it's a loop! - *) -let get_loop_headers code entrypoint = begin - debug "get_loop_headers\n"; - let visited = ref (PTree.map (fun n i -> Unvisited) code) - and is_loop_header = ref (PTree.map (fun n i -> false) code) - in let rec dfs_visit code = function - | [] -> () - | node :: ln -> - match (get_some @@ PTree.get node !visited) with - | Visited -> () - | Processed -> begin - debug "Node %d is a loop header\n" (P.to_int node); - is_loop_header := PTree.set node true !is_loop_header; - visited := PTree.set node Visited !visited - end - | Unvisited -> begin - visited := PTree.set node Processed !visited; - match PTree.get node code with - | None -> failwith "No such node" - | Some i -> let next_visits = rtl_successors i in dfs_visit code next_visits; - visited := PTree.set node Visited !visited; - dfs_visit code ln - end - in begin - dfs_visit code [entrypoint]; - !is_loop_header - end -end - let ptree_printbool pt = let elements = PTree.elements pt in begin @@ -174,6 +127,10 @@ let rec look_ahead code node is_loop_header predicate = ) | _ -> false +(** + * Heuristics mostly based on the paper Branch Prediction for Free + *) + let do_call_heuristic code cond ifso ifnot is_loop_header = begin debug "\tCall heuristic..\n"; @@ -302,7 +259,7 @@ let get_loop_info is_loop_header bfs_order code = !loop_info end -(* Remark - compared to the original paper, we don't use the store heuristic *) +(* Remark - compared to the original Branch Prediction for Free paper, we don't use the store heuristic *) let get_directions code entrypoint = begin debug "get_directions\n"; let bfs_order = bfs code entrypoint in @@ -435,18 +392,29 @@ let best_predecessor_of node predecessors code order is_visited = ) order) with Not_found -> None -let print_trace t = print_intlist t +let print_trace = print_intlist -let print_traces traces = - let rec f = function +let print_traces oc traces = + let rec f oc = function | [] -> () - | t::lt -> Printf.printf "\n\t"; print_trace t; Printf.printf ",\n"; f lt + | t::lt -> Printf.fprintf oc "\n\t%a,\n%a" print_trace t f lt in begin - if !debug_flag then begin - Printf.printf "Traces: {"; - f traces; - Printf.printf "}\n"; - end + if !debug_flag then + Printf.fprintf oc "Traces: {%a}\n" f traces + end + +(* Adapted from backend/PrintRTL.ml: print_function *) +let print_code code = let open PrintRTL in let open Printf in + if (!debug_flag) then begin + fprintf stdout "{\n"; + let instrs = + List.sort + (fun (pc1, _) (pc2, _) -> compare pc2 pc1) + (List.rev_map + (fun (pc, i) -> (P.to_int pc, i)) + (PTree.elements code)) in + List.iter (print_instruction stdout) instrs; + fprintf stdout "}" end (* Dumb (but linear) trace selection *) @@ -521,7 +489,7 @@ let select_traces_chang code entrypoint = begin end done; (* debug "DFS: \t"; print_intlist order; debug "\n"; *) - debug "Traces: "; print_traces !traces; + debug "Traces: %a" print_traces !traces; !traces end end @@ -537,26 +505,26 @@ let rec make_identity_ptree_rec = function let make_identity_ptree code = make_identity_ptree_rec (PTree.elements code) -(* Change the pointers of preds nodes to point to n' instead of n *) +(* Change the pointers of nodes to point to n' instead of n *) let rec change_pointers code n n' = function | [] -> code - | pred :: preds -> - let new_pred_inst = match ptree_get_some pred code with - | Icall(a, b, c, d, n0) -> assert (n0 == n); Icall(a, b, c, d, n') - | Ibuiltin(a, b, c, n0) -> assert (n0 == n); Ibuiltin(a, b, c, n') - | Ijumptable(a, ln) -> assert (optbool @@ List.find_opt (fun e -> e == n) ln); - Ijumptable(a, List.map (fun e -> if (e == n) then n' else e) ln) - | Icond(a, b, n1, n2, i) -> assert (n1 == n || n2 == n); - let n1' = if (n1 == n) then n' else n1 - in let n2' = if (n2 == n) then n' else n2 + | node :: nodes -> + let new_pred_inst = match ptree_get_some node code with + | Icall(a, b, c, d, n0) -> assert (n0 = n); Icall(a, b, c, d, n') + | Ibuiltin(a, b, c, n0) -> assert (n0 = n); Ibuiltin(a, b, c, n') + | Ijumptable(a, ln) -> assert (optbool @@ List.find_opt (fun e -> e = n) ln); + Ijumptable(a, List.map (fun e -> if (e = n) then n' else e) ln) + | Icond(a, b, n1, n2, i) -> assert (n1 = n || n2 = n); + let n1' = if (n1 = n) then n' else n1 + in let n2' = if (n2 = n) then n' else n2 in Icond(a, b, n1', n2', i) - | Inop n0 -> assert (n0 == n); Inop n' - | Iop (a, b, c, n0) -> assert (n0 == n); Iop (a, b, c, n') - | Iload (a, b, c, d, e, n0) -> assert (n0 == n); Iload (a, b, c, d, e, n') - | Istore (a, b, c, d, n0) -> assert (n0 == n); Istore (a, b, c, d, n') + | Inop n0 -> assert (n0 = n); Inop n' + | Iop (a, b, c, n0) -> assert (n0 = n); Iop (a, b, c, n') + | Iload (a, b, c, d, e, n0) -> assert (n0 = n); Iload (a, b, c, d, e, n') + | Istore (a, b, c, d, n0) -> assert (n0 = n); Istore (a, b, c, d, n') | Itailcall _ | Ireturn _ -> failwith "That instruction cannot be a predecessor" - in let new_code = PTree.set pred new_pred_inst code - in change_pointers new_code n n' preds + in let new_code = PTree.set node new_pred_inst code + in change_pointers new_code n n' nodes (* parent: parent of n to keep as parent * preds: all the other parents of n @@ -580,13 +548,20 @@ let is_empty = function | [] -> true | _ -> false +let next_free_pc code = maxint (List.map (fun e -> let (n, _) = e in P.to_int n) (PTree.elements code)) + 1 + +let is_a_nop code n = + match get_some @@ PTree.get n code with + | Inop _ -> true + | _ -> false + (* code: RTL code * preds: mapping node -> predecessors * ptree: the revmap * trace: the trace to follow tail duplication on *) let tail_duplicate code preds ptree trace = (* next_int: unused integer that can be used for the next duplication *) - let next_int = ref (maxint (List.map (fun e -> let (n, _) = e in P.to_int n) (PTree.elements code)) + 1) + let next_int = ref (next_free_pc code) (* last_node and last_duplicate store resp. the last processed node of the trace, and its duplication *) in let last_node = ref None in let last_duplicate = ref None @@ -608,7 +583,7 @@ let tail_duplicate code preds ptree trace = in let (newc, newp) = duplicate code ptree !last_node n final_node_preds (P.of_int n') in begin next_int := !next_int + 1; - nb_duplicated := !nb_duplicated + 1; + (if not @@ is_a_nop code n then nb_duplicated := !nb_duplicated + 1); last_duplicate := Some (P.of_int n'); (newc, newp) end @@ -620,9 +595,8 @@ let tail_duplicate code preds ptree trace = in let new_code, new_ptree = f code ptree true trace in (new_code, new_ptree, !nb_duplicated) -let superblockify_traces code preds traces = - let max_nb_duplicated = !Clflags.option_fduplicate (* FIXME - should be architecture dependent *) - in let ptree = make_identity_ptree code +let superblockify_traces code preds traces ptree = + let max_nb_duplicated = !Clflags.option_ftailduplicate (* FIXME - should be architecture dependent *) in let rec f code ptree = function | [] -> (code, ptree, 0) | trace :: traces -> @@ -633,37 +607,271 @@ let superblockify_traces code preds traces = in let new_code, new_ptree, _ = f code ptree traces in (new_code, new_ptree) -let rec invert_iconds_trace code = function - | [] -> code - | n :: ln -> - let code' = match ptree_get_some n code with - | Icond (c, lr, ifso, ifnot, info) -> (match info with - | Some true -> begin - (* debug "Reversing ifso/ifnot for node %d\n" (P.to_int n); *) - PTree.set n (Icond (Op.negate_condition c, lr, ifnot, ifso, Some false)) code - end - | _ -> code) - | _ -> code - in invert_iconds_trace code' ln +let invert_iconds code = + PTree.map1 (fun i -> match i with + | Icond (c, lr, ifso, ifnot, info) -> (match info with + | Some true -> begin + (* debug "Reversing ifso/ifnot for node %d\n" (P.to_int n); *) + Icond (Op.negate_condition c, lr, ifnot, ifso, Some false) + end + | _ -> i) + | _ -> i + ) code + +(** Partial loop unrolling + * + * The following code seeks innermost loops, and unfolds the first iteration + * Most of the code has been moved from LICMaux.ml to Duplicateaux.ml to solve + * cyclic dependencies between LICMaux and Duplicateaux + *) -let rec invert_iconds code = function - | [] -> code - | t :: ts -> - let code' = if !Clflags.option_finvertcond then invert_iconds_trace code t - else code - in invert_iconds code' ts +type innerLoop = { + preds: P.t list; + body: HashedSet.PSet.t; + head: P.t; (* head of the loop *) + final: P.t (* the final instruction, which loops back to the head *) +} + +let print_pset = LICMaux.pp_pset + +let print_inner_loop iloop = + debug "{preds: %a, body: %a}" print_intlist iloop.preds print_pset iloop.body + +let rec print_inner_loops = function +| [] -> () +| iloop :: iloops -> begin + print_inner_loop iloop; + debug "\n"; + print_inner_loops iloops + end + +let print_ptree printer pt = + let elements = PTree.elements pt in + begin + debug "[\n"; + List.iter (fun (n, elt) -> + debug "\t%d: %a\n" (P.to_int n) printer elt + ) elements; + debug "]\n" + end + +let print_pint oc i = if !debug_flag then Printf.fprintf oc "%d" (P.to_int i) else () + +let get_inner_loops f code is_loop_header = + let fake_f = { fn_sig = f.fn_sig; fn_params = f.fn_params; + fn_stacksize = f.fn_stacksize; fn_code = code; fn_entrypoint = f.fn_entrypoint } in + let (_, predmap, loopmap) = LICMaux.inner_loops fake_f in + begin + debug "PREDMAP: "; print_ptree print_intlist predmap; + debug "LOOPMAP: "; print_ptree print_pset loopmap; + List.map (fun (n, body) -> + let preds = List.filter (fun p -> not @@ HashedSet.PSet.contains body p) + @@ get_some @@ PTree.get n predmap in + let head = (* the instruction from body which is a loop header *) + let heads = HashedSet.PSet.elements @@ HashedSet.PSet.filter + (fun n -> ptree_get_some n is_loop_header) body in + begin + assert (List.length heads == 1); + List.hd heads + end in + let final = (* the predecessors from head that are in the body *) + let head_preds = ptree_get_some head predmap in + let filtered = List.filter (fun n -> HashedSet.PSet.contains body n) head_preds in + begin + debug "HEAD: %d\n" (P.to_int head); + debug "BODY: %a\n" print_pset body; + debug "HEADPREDS: %a\n" print_intlist head_preds; + assert (List.length filtered == 1); + List.hd filtered + end in + { preds = preds; body = body; head = head; final = final } + ) + (* LICMaux.inner_loops also returns non-inner loops, but with a body of 1 instruction + * We remove those to get just the inner loops *) + @@ List.filter (fun (n, body) -> + let count = List.length @@ HashedSet.PSet.elements body in count != 1 + ) (PTree.elements loopmap) + end + +let rec generate_fwmap ln ln' fwmap = + match ln with + | [] -> begin + match ln' with + | [] -> fwmap + | _ -> failwith "ln and ln' have different lengths" + end + | n :: ln -> begin + match ln' with + | n' :: ln' -> generate_fwmap ln ln' (PTree.set n n' fwmap) + | _ -> failwith "ln and ln' have different lengths" + end + +let generate_revmap ln ln' revmap = generate_fwmap ln' ln revmap + +let apply_map fw n = P.of_int @@ ptree_get_some n fw + +let apply_map_opt fw n = + match PTree.get n fw with + | Some n' -> P.of_int n' + | None -> n + +let change_nexts fwmap = function + | Icall (a, b, c, d, n) -> Icall (a, b, c, d, apply_map fwmap n) + | Ibuiltin (a, b, c, n) -> Ibuiltin (a, b, c, apply_map fwmap n) + | Ijumptable (a, ln) -> Ijumptable (a, List.map (apply_map_opt fwmap) ln) + | Icond (a, b, n1, n2, i) -> Icond (a, b, apply_map_opt fwmap n1, apply_map_opt fwmap n2, i) + | Inop n -> Inop (apply_map fwmap n) + | Iop (a, b, c, n) -> Iop (a, b, c, apply_map fwmap n) + | Iload (a, b, c, d, e, n) -> Iload (a, b, c, d, e, apply_map fwmap n) + | Istore (a, b, c, d, n) -> Istore (a, b, c, d, apply_map fwmap n) + | Itailcall (a, b, c) -> Itailcall (a, b, c) + | Ireturn o -> Ireturn o + +(** Clone a list of instructions into free pc indexes + * + * The list of instructions should be contiguous, and not include any loop. + * It is assumed that the first instruction of the list is the head. + * Also, the last instruction of the list should be the loop backedge. + * + * Returns: (code', revmap', ln', fwmap) + * code' is the updated code, after cloning + * revmap' is the updated revmap + * ln' is the list of the new indexes used to reference the cloned instructions + * fwmap is a map from ln to ln' + *) +let clone code revmap ln = begin + assert (List.length ln > 0); + let head' = next_free_pc code in + (* +head' to ensure we never overlap with the existing code *) + let ln' = List.map (fun n -> n + head') @@ List.map P.to_int ln in + let fwmap = generate_fwmap ln ln' PTree.empty in + let revmap' = generate_revmap ln (List.map P.of_int ln') revmap in + let code' = ref code in + List.iter (fun n -> + let instr = get_some @@ PTree.get n code in + let instr' = change_nexts fwmap instr in + code' := PTree.set (apply_map fwmap n) instr' !code' + ) ln; + (!code', revmap', ln', fwmap) +end + +let rec count_ignore_nops code = function + | [] -> 0 + | n::ln -> + let inst = get_some @@ PTree.get n code in + match inst with + | Inop _ -> count_ignore_nops code ln + | _ -> 1 + count_ignore_nops code ln + +(* Unrolls a single interation of the inner loop + * 1) Clones the body into body' + * 2) Links the preds to the first instruction of body' + * 3) Links the last instruction of body' into the first instruction of body + *) +let unroll_inner_loop_single code revmap iloop = + let body = HashedSet.PSet.elements (iloop.body) in + if count_ignore_nops code body > !Clflags.option_funrollsingle then begin + debug "Too many nodes in the loop body (%d > %d)" (List.length body) !Clflags.option_funrollsingle; + (code, revmap) + end else + let (code2, revmap2, dupbody, fwmap) = clone code revmap body in + let code' = ref code2 in + let head' = apply_map fwmap (iloop.head) in + let final' = apply_map fwmap (iloop.final) in + begin + debug "PREDS: %a\n" print_intlist iloop.preds; + debug "IHEAD: %d\n" (P.to_int iloop.head); + code' := change_pointers !code' (iloop.head) head' (iloop.preds); + code' := change_pointers !code' head' (iloop.head) [final']; + (!code', revmap2) + end + +let unroll_inner_loops_single f code revmap = + let is_loop_header = get_loop_headers code (f.fn_entrypoint) in + let inner_loops = get_inner_loops f code is_loop_header in + let code' = ref code in + let revmap' = ref revmap in + begin + print_inner_loops inner_loops; + List.iter (fun iloop -> + let (new_code, new_revmap) = unroll_inner_loop_single !code' !revmap' iloop in + code' := new_code; revmap' := new_revmap + ) inner_loops; + (!code', !revmap') + end + +(* Unrolls the body of the inner loop once - duplicating the exit condition as well + * 1) Clones body into body' + * 2) Links the last instruction of body into the first of body' + * 3) Links the last instruction of body' into the first of body + *) +let unroll_inner_loop_body code revmap iloop = + let body = HashedSet.PSet.elements (iloop.body) in + let limit = !Clflags.option_funrollbody in + if count_ignore_nops code body > limit then begin + debug "Too many nodes in the loop body (%d > %d)" (List.length body) limit; + (code, revmap) + end else + let (code2, revmap2, dupbody, fwmap) = clone code revmap body in + let code' = ref code2 in + let head' = apply_map fwmap (iloop.head) in + let final' = apply_map fwmap (iloop.final) in + begin + code' := change_pointers !code' iloop.head head' [iloop.final]; + code' := change_pointers !code' head' iloop.head [final']; + (!code', revmap2) + end + +let unroll_inner_loops_body f code revmap = + let is_loop_header = get_loop_headers code (f.fn_entrypoint) in + let inner_loops = get_inner_loops f code is_loop_header in + let code' = ref code in + let revmap' = ref revmap in + begin + print_inner_loops inner_loops; + List.iter (fun iloop -> + let (new_code, new_revmap) = unroll_inner_loop_body !code' !revmap' iloop in + code' := new_code; revmap' := new_revmap + ) inner_loops; + (!code', !revmap') + end let duplicate_aux f = + (* initializing *) let entrypoint = f.fn_entrypoint in - if !Clflags.option_fduplicate < 0 then - ((f.fn_code, entrypoint), make_identity_ptree f.fn_code) - else - let code = update_directions (f.fn_code) entrypoint in - let traces = select_traces code entrypoint in - let icond_code = invert_iconds code traces in - let preds = get_predecessors_rtl icond_code in - if !Clflags.option_fduplicate >= 1 then - let (new_code, pTreeId) = ((* print_traces traces; *) superblockify_traces icond_code preds traces) in - ((new_code, f.fn_entrypoint), pTreeId) - else - ((icond_code, entrypoint), make_identity_ptree code) + let code = f.fn_code in + let revmap = make_identity_ptree code in + + (* static prediction *) + let code = + if !Clflags.option_fpredict then + update_directions code entrypoint + else code in + + (* unroll single *) + let (code, revmap) = + if !Clflags.option_funrollsingle > 0 then + unroll_inner_loops_single f code revmap + else (code, revmap) in + + (* unroll body *) + let (code, revmap) = + if !Clflags.option_funrollbody > 0 then + unroll_inner_loops_body f code revmap + else (code, revmap) in + + (* static prediction bis *) + let code = + if !Clflags.option_fpredict then + invert_iconds code + else code in + + (* tail duplication *) + let (code, revmap) = + if !Clflags.option_ftailduplicate > 0 then + let traces = select_traces code entrypoint in + let preds = get_predecessors_rtl code in + superblockify_traces code preds traces revmap + else (code, revmap) in + + ((code, entrypoint), revmap) diff --git a/backend/KillUselessMoves.v b/backend/KillUselessMoves.v new file mode 100644 index 00000000..bdd7ec60 --- /dev/null +++ b/backend/KillUselessMoves.v @@ -0,0 +1,40 @@ +(* *************************************************************) +(* *) +(* The Compcert verified compiler *) +(* *) +(* David Monniaux CNRS, VERIMAG *) +(* *) +(* Copyright VERIMAG. All rights reserved. *) +(* This file is distributed under the terms of the INRIA *) +(* Non-Commercial License Agreement. *) +(* *) +(* *************************************************************) + +Require Import Coqlib Maps Errors Integers Floats Lattice Kildall. +Require Import AST Linking. +Require Import Memory Registers Op RTL. +Require List. + +Definition transf_ros (ros: reg + ident) : reg + ident := ros. + +Definition transf_instr (pc: node) (instr: instruction) := + match instr with + | Iop op args res s => + if (eq_operation op Omove) && (List.list_eq_dec peq args (res :: nil)) + then Inop s + else instr + | _ => instr + end. + +Definition transf_function (f: function) : function := + {| fn_sig := f.(fn_sig); + fn_params := f.(fn_params); + fn_stacksize := f.(fn_stacksize); + fn_code := PTree.map transf_instr f.(fn_code); + fn_entrypoint := f.(fn_entrypoint) |}. + +Definition transf_fundef (fd: fundef) : fundef := + AST.transf_fundef transf_function fd. + +Definition transf_program (p: program) : program := + transform_program transf_fundef p. diff --git a/backend/KillUselessMovesproof.v b/backend/KillUselessMovesproof.v new file mode 100644 index 00000000..629aa6aa --- /dev/null +++ b/backend/KillUselessMovesproof.v @@ -0,0 +1,361 @@ +(* *************************************************************) +(* *) +(* The Compcert verified compiler *) +(* *) +(* David Monniaux CNRS, VERIMAG *) +(* *) +(* Copyright VERIMAG. All rights reserved. *) +(* This file is distributed under the terms of the INRIA *) +(* Non-Commercial License Agreement. *) +(* *) +(* *************************************************************) + +Require Import Axioms. +Require Import FunInd. +Require Import Coqlib Maps Errors Integers Floats Lattice Kildall. +Require Import AST Linking. +Require Import Values Memory Globalenvs Events Smallstep. +Require Import Registers Op RTL. +Require Import KillUselessMoves. + + +Definition match_prog (p tp: RTL.program) := + match_program (fun ctx f tf => tf = transf_fundef f) eq p tp. + +Lemma transf_program_match: + forall p, match_prog p (transf_program p). +Proof. + intros. eapply match_transform_program; eauto. +Qed. + +Section PRESERVATION. + +Variables prog tprog: program. +Hypothesis TRANSL: match_prog prog tprog. +Let ge := Genv.globalenv prog. +Let tge := Genv.globalenv tprog. + +Lemma functions_translated: + forall v f, + Genv.find_funct ge v = Some f -> + Genv.find_funct tge v = Some (transf_fundef f). +Proof (Genv.find_funct_transf TRANSL). + +Lemma function_ptr_translated: + forall v f, + Genv.find_funct_ptr ge v = Some f -> + Genv.find_funct_ptr tge v = Some (transf_fundef f). +Proof (Genv.find_funct_ptr_transf TRANSL). + +Lemma symbols_preserved: + forall id, + Genv.find_symbol tge id = Genv.find_symbol ge id. +Proof (Genv.find_symbol_transf TRANSL). + +Lemma senv_preserved: + Senv.equiv ge tge. +Proof (Genv.senv_transf TRANSL). + +Lemma sig_preserved: + forall f, funsig (transf_fundef f) = funsig f. +Proof. + destruct f; reflexivity. +Qed. + +Lemma find_function_translated: + forall ros rs fd, + find_function ge ros rs = Some fd -> + find_function tge ros rs = Some (transf_fundef fd). +Proof. + unfold find_function; intros. destruct ros as [r|id]. + eapply functions_translated; eauto. + rewrite symbols_preserved. destruct (Genv.find_symbol ge id); try congruence. + eapply function_ptr_translated; eauto. +Qed. + +Lemma transf_function_at: + forall f pc i, + f.(fn_code)!pc = Some i -> + (transf_function f).(fn_code)!pc = Some(transf_instr pc i). +Proof. + intros until i. intro Hcode. + unfold transf_function; simpl. + rewrite PTree.gmap. + unfold option_map. + rewrite Hcode. + reflexivity. +Qed. + +Ltac TR_AT := + match goal with + | [ A: (fn_code _)!_ = Some _ |- _ ] => + generalize (transf_function_at _ _ _ A); intros + end. + +Section SAME_RS. + Context {A : Type}. + + Definition same_rs (rs rs' : Regmap.t A) := + forall x, rs # x = rs' # x. + + Lemma same_rs_refl : forall rs, same_rs rs rs. + Proof. + unfold same_rs. + reflexivity. + Qed. + + Lemma same_rs_comm : forall rs rs', (same_rs rs rs') -> (same_rs rs' rs). + Proof. + unfold same_rs. + congruence. + Qed. + + Lemma same_rs_trans : forall rs1 rs2 rs3, + (same_rs rs1 rs2) -> (same_rs rs2 rs3) -> (same_rs rs1 rs3). + Proof. + unfold same_rs. + congruence. + Qed. + + Lemma same_rs_idem_write : forall rs r, + (same_rs rs (rs # r <- (rs # r))). + Proof. + unfold same_rs. + intros. + rewrite Regmap.gsident. + reflexivity. + Qed. + + Lemma same_rs_read: + forall rs rs' r, (same_rs rs rs') -> rs # r = rs' # r. + Proof. + unfold same_rs. + auto. + Qed. + + Lemma same_rs_subst: + forall rs rs' l, (same_rs rs rs') -> rs ## l = rs' ## l. + Proof. + induction l; cbn; intuition congruence. + Qed. + + Lemma same_rs_write: forall rs rs' r x, + (same_rs rs rs') -> (same_rs (rs # r <- x) (rs' # r <- x)). + Proof. + unfold same_rs. + intros. + destruct (peq r x0). + { subst x0. + rewrite Regmap.gss. rewrite Regmap.gss. + reflexivity. + } + rewrite Regmap.gso by congruence. + rewrite Regmap.gso by congruence. + auto. + Qed. + + Lemma same_rs_setres: + forall rs rs' (SAME: same_rs rs rs') res vres, + same_rs (regmap_setres res vres rs) (regmap_setres res vres rs'). + Proof. + induction res; cbn; auto using same_rs_write. + Qed. +End SAME_RS. + +Lemma same_find_function: forall tge rs rs' (SAME: same_rs rs rs') ros, + find_function tge ros rs = find_function tge ros rs'. +Proof. + destruct ros; cbn. + { rewrite (same_rs_read rs rs' r SAME). + reflexivity. } + reflexivity. +Qed. + +Inductive match_frames: RTL.stackframe -> RTL.stackframe -> Prop := +| match_frames_intro: forall res f sp pc rs rs' (SAME : same_rs rs rs'), + match_frames (Stackframe res f sp pc rs) + (Stackframe res (transf_function f) sp pc rs'). + +Inductive match_states: RTL.state -> RTL.state -> Prop := + | match_regular_states: forall stk f sp pc rs rs' m stk' + (SAME: same_rs rs rs') + (STACKS: list_forall2 match_frames stk stk'), + match_states (State stk f sp pc rs m) + (State stk' (transf_function f) sp pc rs' m) + | match_callstates: forall stk f args m stk' + (STACKS: list_forall2 match_frames stk stk'), + match_states (Callstate stk f args m) + (Callstate stk' (transf_fundef f) args m) + | match_returnstates: forall stk v m stk' + (STACKS: list_forall2 match_frames stk stk'), + match_states (Returnstate stk v m) + (Returnstate stk' v m). + +Lemma step_simulation: + forall S1 t S2, RTL.step ge S1 t S2 -> + forall S1', match_states S1 S1' -> + exists S2', RTL.step tge S1' t S2' /\ match_states S2 S2'. +Proof. + induction 1; intros S1' MS; inv MS; try TR_AT. +- (* nop *) + econstructor; split. eapply exec_Inop; eauto. + constructor; auto. +- (* op *) + cbn in H1. + destruct (_ && _) eqn:IS_MOVE in H1. + { + destruct eq_operation in IS_MOVE. 2: discriminate. + destruct list_eq_dec in IS_MOVE. 2: discriminate. + subst op. subst args. + clear IS_MOVE. + cbn in H0. + inv H0. + econstructor; split. + { eapply exec_Inop; eauto. } + constructor. + 2: assumption. + eapply same_rs_trans. + { apply same_rs_comm. + apply same_rs_idem_write. + } + assumption. + } + econstructor; split. + eapply exec_Iop with (v := v); eauto. + rewrite <- H0. + rewrite (same_rs_subst rs rs' args SAME). + apply eval_operation_preserved. exact symbols_preserved. + constructor; auto using same_rs_write. +(* load *) +- econstructor; split. + assert (eval_addressing tge sp addr rs' ## args = Some a). + { rewrite <- H0. + rewrite (same_rs_subst rs rs' args SAME). + apply eval_addressing_preserved. exact symbols_preserved. + } + eapply exec_Iload; eauto. + constructor; auto using same_rs_write. +- (* load notrap1 *) + econstructor; split. + assert (eval_addressing tge sp addr rs' ## args = None). + { rewrite <- H0. + rewrite (same_rs_subst rs rs' args SAME). + apply eval_addressing_preserved. exact symbols_preserved. + } + eapply exec_Iload_notrap1; eauto. + constructor; auto using same_rs_write. +- (* load notrap2 *) + econstructor; split. + assert (eval_addressing tge sp addr rs' ## args = Some a). + { rewrite <- H0. + rewrite (same_rs_subst rs rs' args SAME). + apply eval_addressing_preserved. exact symbols_preserved. + } + eapply exec_Iload_notrap2; eauto. + constructor; auto using same_rs_write. +- (* store *) + econstructor; split. + assert (eval_addressing tge sp addr rs' ## args = Some a). + { rewrite <- H0. + rewrite (same_rs_subst rs rs' args SAME). + apply eval_addressing_preserved. exact symbols_preserved. + } + rewrite (same_rs_read rs rs' src SAME) in H1. + eapply exec_Istore; eauto. + constructor; auto. +(* call *) +- econstructor; split. + eapply exec_Icall with (fd := transf_fundef fd); eauto. + eapply find_function_translated; eauto. + { rewrite <- (same_find_function ge rs rs') by assumption. + assumption. } + apply sig_preserved. + rewrite (same_rs_subst rs rs' args SAME). + constructor. constructor; auto. constructor; auto. +(* tailcall *) +- econstructor; split. + eapply exec_Itailcall with (fd := transf_fundef fd); eauto. + eapply find_function_translated; eauto. + { rewrite <- (same_find_function ge rs rs') by assumption. + assumption. } + apply sig_preserved. + rewrite (same_rs_subst rs rs' args SAME). + constructor. auto. +(* builtin *) +- econstructor; split. + eapply exec_Ibuiltin; eauto. + eapply eval_builtin_args_preserved with (ge1 := ge); eauto. exact symbols_preserved. + { + replace (fun r : positive => rs' # r) with (fun r : positive => rs # r). + eassumption. + apply functional_extensionality. + auto using same_rs_read. + } + eapply external_call_symbols_preserved; eauto. apply senv_preserved. + constructor; auto. + auto using same_rs_setres. +(* cond *) +- econstructor; split. + eapply exec_Icond; eauto. + rewrite <- (same_rs_subst rs rs' args SAME); eassumption. + constructor; auto. +(* jumptbl *) +- econstructor; split. + eapply exec_Ijumptable; eauto. + rewrite <- (same_rs_read rs rs' arg SAME); eassumption. + constructor; auto. +(* return *) +- econstructor; split. + eapply exec_Ireturn; eauto. + destruct or; cbn. + + rewrite <- (same_rs_read rs rs' r SAME) by auto. + constructor; auto. + + constructor; auto. +(* internal function *) +- simpl. econstructor; split. + eapply exec_function_internal; eauto. + constructor; auto. + cbn. + apply same_rs_refl. +(* external function *) +- econstructor; split. + eapply exec_function_external; eauto. + eapply external_call_symbols_preserved; eauto. apply senv_preserved. + constructor; auto. +(* return *) +- inv STACKS. inv H1. + econstructor; split. + eapply exec_return; eauto. + constructor; auto using same_rs_write. +Qed. + +Lemma transf_initial_states: + forall S1, RTL.initial_state prog S1 -> + exists S2, RTL.initial_state tprog S2 /\ match_states S1 S2. +Proof. + intros. inv H. econstructor; split. + econstructor. + eapply (Genv.init_mem_transf TRANSL); eauto. + rewrite symbols_preserved. rewrite (match_program_main TRANSL). eauto. + eapply function_ptr_translated; eauto. + rewrite <- H3; apply sig_preserved. + constructor. constructor. +Qed. + +Lemma transf_final_states: + forall S1 S2 r, match_states S1 S2 -> RTL.final_state S1 r -> RTL.final_state S2 r. +Proof. + intros. inv H0. inv H. inv STACKS. constructor. +Qed. + +Theorem transf_program_correct: + forward_simulation (RTL.semantics prog) (RTL.semantics tprog). +Proof. + eapply forward_simulation_step. + apply senv_preserved. + eexact transf_initial_states. + eexact transf_final_states. + exact step_simulation. +Qed. + +End PRESERVATION. diff --git a/backend/LICMaux.ml b/backend/LICMaux.ml index c3907809..0ca4418b 100644 --- a/backend/LICMaux.ml +++ b/backend/LICMaux.ml @@ -19,6 +19,62 @@ open Inject;; type reg = P.t;; +(** get_loop_headers moved from Duplicateaux.ml to LICMaux.ml to prevent cycle dependencies *) +let debug_flag = ref false + +let debug fmt = + if !debug_flag then Printf.eprintf fmt + else Printf.ifprintf stderr fmt + +type vstate = Unvisited | Processed | Visited + +let get_some = function +| None -> failwith "Did not get some" +| Some thing -> thing + +let rtl_successors = function +| Itailcall _ | Ireturn _ -> [] +| Icall(_,_,_,_,n) | Ibuiltin(_,_,_,n) | Inop n | Iop (_,_,_,n) +| Iload (_,_,_,_,_,n) | Istore (_,_,_,_,n) -> [n] +| Icond (_,_,n1,n2,_) -> [n1; n2] +| Ijumptable (_,ln) -> ln + +(** Getting loop branches with a DFS visit : + * Each node is either Unvisited, Visited, or Processed + * pre-order: node becomes Processed + * post-order: node becomes Visited + * + * If we come accross an edge to a Processed node, it's a loop! + *) +let get_loop_headers code entrypoint = begin + debug "get_loop_headers\n"; + let visited = ref (PTree.map (fun n i -> Unvisited) code) + and is_loop_header = ref (PTree.map (fun n i -> false) code) + in let rec dfs_visit code = function + | [] -> () + | node :: ln -> + match (get_some @@ PTree.get node !visited) with + | Visited -> () + | Processed -> begin + debug "Node %d is a loop header\n" (P.to_int node); + is_loop_header := PTree.set node true !is_loop_header; + visited := PTree.set node Visited !visited + end + | Unvisited -> begin + visited := PTree.set node Processed !visited; + match PTree.get node code with + | None -> failwith "No such node" + | Some i -> let next_visits = rtl_successors i in dfs_visit code next_visits; + visited := PTree.set node Visited !visited; + dfs_visit code ln + end + in begin + dfs_visit code [entrypoint]; + !is_loop_header + end +end + + module Dominator = struct type t = Unreachable | Dominated of int | Multiple @@ -57,7 +113,7 @@ let apply_dominator (is_marked : node -> bool) (pc : node) let dominated_parts1 (f : coq_function) : (bool PTree.t) * (Dominator.t PMap.t option) = - let headers = Duplicateaux.get_loop_headers f.fn_code f.fn_entrypoint in + let headers = get_loop_headers f.fn_code f.fn_entrypoint in let dominated = Dominator_Solver.fixpoint f.fn_code RTL.successors_instr (apply_dominator (fun pc -> match PTree.get pc headers with | Some x -> x @@ -248,7 +304,7 @@ let print_dominated_parts1 oc f = (PTree.elements f.fn_code);; let loop_headers (f : coq_function) : RTL.node list = - List.map fst (List.filter snd (PTree.elements (Duplicateaux.get_loop_headers f.fn_code f.fn_entrypoint)));; + List.map fst (List.filter snd (PTree.elements (get_loop_headers f.fn_code f.fn_entrypoint)));; let print_loop_headers f = print_endline "Loop headers"; diff --git a/doc/index-kvx.html b/doc/index-kvx.html index 97eefc24..6906c212 100644 --- a/doc/index-kvx.html +++ b/doc/index-kvx.html @@ -34,8 +34,8 @@ a:active {color : Red; text-decoration : underline; } The unmodified parts of this table appear in <font color=gray>gray</font>. <br> <br> - A high-level view of this backend of CompCert is provided by this HAL preprint of Six, Boulmé and Monniaux (2019): - <div><a href=https://hal.archives-ouvertes.fr/hal-02185883>Certified Compiler Backends for VLIW Processors (Highly Modular Postpass-Scheduling in the CompCert Certified Compiler)</a></div> + A high-level view of this CompCert backend is provided by this OOSPLA'20 paper (of Six, Boulmé and Monniaux): + <div><a href=https://hal.archives-ouvertes.fr/hal-02185883>Certified and Efficient Instruction Scheduling. Application to Interlocked VLIW Processors.</a></div> <br> Our source code is available on our <a href=https://gricad-gitlab.univ-grenoble-alpes.fr/certicompil/compcert-kvx>GitLab public repository</a> (see conditions in the LICENSE file). </p> diff --git a/driver/Clflags.ml b/driver/Clflags.ml index 829af76a..d5f3aca5 100644 --- a/driver/Clflags.ml +++ b/driver/Clflags.ml @@ -33,13 +33,21 @@ let option_fcse3_across_calls = ref false let option_fcse3_across_merges = ref true let option_fcse3_glb = ref true let option_fredundancy = ref true -let option_fduplicate = ref (-1) -let option_finvertcond = ref true -let option_ftracelinearize = ref false + +(** Options relative to superblock scheduling *) +let option_fpredict = ref true (* insert static branch prediction information, and swaps ifso/ifnot branches accordingly *) +let option_ftailduplicate = ref 0 (* perform tail duplication for blocks of size n *) +let option_ftracelinearize = ref true (* uses branch prediction information to improve the linearization *) +let option_funrollsingle = ref 0 (* unroll a single iteration of innermost loops of size n *) +let option_funrollbody = ref 0 (* unroll the body of innermost loops of size n *) + +(* Scheduling *) let option_fprepass = ref false let option_fprepass_sched = ref "list" + let option_fpostpass = ref true let option_fpostpass_sched = ref "list" + let option_fifconversion = ref true let option_Obranchless = ref false let option_falignfunctions = ref (None: int option) diff --git a/driver/Compopts.v b/driver/Compopts.v index d576ede6..540e8922 100644 --- a/driver/Compopts.v +++ b/driver/Compopts.v @@ -27,9 +27,6 @@ Parameter generate_float_constants: unit -> bool. (** For value analysis. Currently always false. *) Parameter va_strict: unit -> bool. -(** Flag -fduplicate. Branch prediction annotation + tail duplication *) -Parameter optim_duplicate: unit -> bool. - (** Flag -ftailcalls. For tail call optimization. *) Parameter optim_tailcalls: unit -> bool. diff --git a/driver/Driver.ml b/driver/Driver.ml index fef9c166..e5fc78f8 100644 --- a/driver/Driver.ml +++ b/driver/Driver.ml @@ -210,15 +210,12 @@ Processing options: -fpostpass Perform postpass scheduling (only for K1 architecture) [on] -fpostpass= <optim> Perform postpass scheduling with the specified optimization [list] (<optim>=list: list scheduling, <optim>=ilp: ILP, <optim>=greedy: just packing bundles) - -fduplicate <nb_nodes> Perform tail duplication to form superblocks on predicted traces - nb_nodes control the heuristic deciding to duplicate or not - A value of -1 desactivates the entire pass (including branch prediction) - A value of 0 desactivates the duplication (but activates the branch prediction) - FIXME : this is desactivated by default for now - -finvertcond Invert conditions based on predicted paths (to prefer fallthrough). - Requires -fduplicate to be also activated [on] - -ftracelinearize Linearizes based on the traces identified by duplicate phase - It is heavily recommended to activate -finvertcond with this pass [off] + -fpredict Insert static branch prediction information [on] + Also swaps ifso/ifnot branches accordingly at RTL level + -ftailduplicate n Perform tail duplication for RTL code blocks of size n (not counting Inops) [0] + -ftracelinearize Uses branch prediction information to improve the Linearize [on] + -funrollsingle n Unrolls a single iteration of innermost loops of size n (not counting Inops) [0] + -funrollbody n Unrolls once the body of innermost loops of size n (not counting Inops) [0] -fforward-moves Forward moves after CSE -finline Perform inlining of functions [on] -finline-functions-called-once Integrate functions only required by their @@ -283,6 +280,7 @@ let dump_mnemonics destfile = let optimization_options = [ option_ftailcalls; option_fifconversion; option_fconstprop; option_fcse; option_fcse2; option_fcse3; + option_fpredict; option_ftracelinearize; option_fpostpass; option_fredundancy; option_finline; option_finline_functions_called_once; ] @@ -424,8 +422,10 @@ let cmdline_actions = @ f_opt "redundancy" option_fredundancy @ f_opt "prepass" option_fprepass @ f_opt "postpass" option_fpostpass - @ [ Exact "-fduplicate", Integer (fun n -> option_fduplicate := n) ] - @ f_opt "invertcond" option_finvertcond + @ [ Exact "-ftailduplicate", Integer (fun n -> option_ftailduplicate := n) ] + @ f_opt "predict" option_fpredict + @ [ Exact "-funrollsingle", Integer (fun n -> option_funrollsingle := n) ] + @ [ Exact "-funrollbody", Integer (fun n -> option_funrollbody := n) ] @ f_opt "tracelinearize" option_ftracelinearize @ f_opt_str "prepass" option_fprepass option_fprepass_sched @ f_opt_str "postpass" option_fpostpass option_fpostpass_sched diff --git a/extraction/extraction.v b/extraction/extraction.v index e43594fc..bd396cd8 100644 --- a/extraction/extraction.v +++ b/extraction/extraction.v @@ -111,8 +111,6 @@ Extract Constant Compopts.generate_float_constants => "fun _ -> !Clflags.option_ffloatconstprop >= 2". Extract Constant Compopts.optim_tailcalls => "fun _ -> !Clflags.option_ftailcalls". -Extract Constant Compopts.optim_duplicate => - "fun _ -> (if !Clflags.option_fduplicate = -1 then false else true)". Extract Constant Compopts.optim_constprop => "fun _ -> !Clflags.option_fconstprop". Extract Constant Compopts.optim_CSE => diff --git a/kvx/SelectOp.vp b/kvx/SelectOp.vp index 9e5d45a0..65dba3ac 100644 --- a/kvx/SelectOp.vp +++ b/kvx/SelectOp.vp @@ -103,8 +103,14 @@ Nondetfunction select0 (ty : typ) (cond0 : condition0) (e1 e2 e3: expr) := | _, _, _ => (Eop (Osel cond0 ty) (e1 ::: e2 ::: e3 ::: Enil)) end. +Definition same_expr_pure (e1 e2: expr) := + match e1, e2 with + | Evar v1, Evar v2 => if ident_eq v1 v2 then true else false + | _, _ => false + end. + Definition select (ty : typ) (cond : condition) (args : exprlist) (e1 e2: expr) : option expr := - Some( + Some (if same_expr_pure e1 e2 then e1 else match cond_to_condition0 cond args with | None => select0 ty (Ccomp0 Cne) e1 e2 (Eop (Ocmp cond) args) | Some(cond0, ec) => select0 ty cond0 e1 e2 ec @@ -356,12 +362,6 @@ Nondetfunction orimm (n1: int) (e2: expr) := | _ => Eop (Oorimm n1) (e2:::Enil) end. -Definition same_expr_pure (e1 e2: expr) := - match e1, e2 with - | Evar v1, Evar v2 => if ident_eq v1 v2 then true else false - | _, _ => false - end. - Nondetfunction or (e1: expr) (e2: expr) := match e1, e2 with | Eop (Ointconst n1) Enil, t2 => orimm n1 t2 diff --git a/kvx/SelectOpproof.v b/kvx/SelectOpproof.v index c23ed601..7a301929 100644 --- a/kvx/SelectOpproof.v +++ b/kvx/SelectOpproof.v @@ -1547,6 +1547,15 @@ Proof. intros until b. intro Hop; injection Hop; clear Hop; intro; subst a. intros HeL He1 He2 HeC. + destruct same_expr_pure eqn:SAME. + { + destruct (eval_same_expr a1 a2 le v1 v2 SAME He1 He2) as [EQ1 EQ2]. + subst a2. subst v2. + exists v1; split; trivial. + cbn. + rewrite if_same. + apply Val.lessdef_normalize. + } unfold cond_to_condition0. destruct (cond_to_condition0_match cond al). { diff --git a/lib/Coqlib.v b/lib/Coqlib.v index 02c5d07f..16d880fa 100644 --- a/lib/Coqlib.v +++ b/lib/Coqlib.v @@ -1325,3 +1325,9 @@ Lemma nlist_forall2_imply: Proof. induction 1; simpl; intros; constructor; auto. Qed. + +Lemma if_same : forall {T : Type} (b : bool) (x : T), + (if b then x else x) = x. +Proof. + destruct b; trivial. +Qed. diff --git a/runtime/include/ccomp_kvx_fixes.h b/runtime/include/ccomp_kvx_fixes.h index 65d65e7b..a518a069 100644 --- a/runtime/include/ccomp_kvx_fixes.h +++ b/runtime/include/ccomp_kvx_fixes.h @@ -33,13 +33,26 @@ extern __int128 __compcert_acswapd(void *address, unsigned long long new_value, #define __builtin_kvx_acswapw __compcert_acswapw extern __int128 __compcert_acswapw(void *address, unsigned long long new_value, unsigned long long old_value); +#define __builtin_kvx_aladdd __compcert_aladdd +extern long long __compcert_aladdd(void *address, unsigned long long incr); + +#define __builtin_kvx_aladdw __compcert_aladdw +extern int __compcert_aladdw(void *address, unsigned int incr); + #define __builtin_kvx_afaddd __compcert_afaddd extern long long __compcert_afaddd(void *address, unsigned long long incr); #define __builtin_kvx_afaddw __compcert_afaddw extern int __compcert_afaddw(void *address, unsigned int incr); -#endif + +#define __builtin_kvx_ld __compcert_ld +extern int __compcert_ld(void *address, const char *str, const int b); + +#define __builtin_kvx_lwz __compcert_lwz +extern int __compcert_lwz(void *address, const char *str, const int b); /* #define __builtin_expect(x, y) (x) */ #define __builtin_ctz(x) __builtin_kvx_ctzw(x) #define __builtin_clz(x) __builtin_kvx_clzw(x) + +#endif diff --git a/test/kvx/sort/Makefile b/test/kvx/sort/Makefile index 1afab6e9..46a8f025 100644 --- a/test/kvx/sort/Makefile +++ b/test/kvx/sort/Makefile @@ -3,7 +3,7 @@ CC ?= gcc CCOMP ?= ccomp CFLAGS ?= -O2 SIMU ?= kvx-mppa -TIMEOUT ?= 10s +TIMEOUT ?= 20s KVXCPATH=$(shell which $(KVXC)) CCPATH=$(shell which $(CC)) diff --git a/test/monniaux/loop_nest/syrk.c b/test/monniaux/loop_nest/syrk.c new file mode 100644 index 00000000..490d0a01 --- /dev/null +++ b/test/monniaux/loop_nest/syrk.c @@ -0,0 +1,28 @@ +/* Include polybench common header. */ +#include "polybench.h" + +/* Include benchmark-specific header. */ +/* Default data type is double, default size is 4000. */ +#include "syrk.h" + +/* Main computational kernel. The whole function will be timed, + including the call and return. */ +void kernel_syrk(int ni, int nj, + DATA_TYPE alpha, + DATA_TYPE beta, + DATA_TYPE POLYBENCH_2D(C,NI,NI,ni,ni), + DATA_TYPE POLYBENCH_2D(A,NI,NJ,ni,nj)) +{ + int i, j, k; + + /* C := alpha*A*A' + beta*C */ +#if 0 + for (i = 0; i < _PB_NI; i++) + for (j = 0; j < _PB_NI; j++) + C[i][j] *= beta; +#endif + for (i = 0; i < _PB_NI; i++) + for (j = 0; j < _PB_NI; j++) + for (k = 0; k < _PB_NJ; k++) + C[i][j] += alpha * A[i][k] * A[j][k]; +} diff --git a/tools/compiler_expand.ml b/tools/compiler_expand.ml index c37880d5..7e1e0181 100644 --- a/tools/compiler_expand.ml +++ b/tools/compiler_expand.ml @@ -20,13 +20,14 @@ TOTAL, (Option "profile_arcs"), (Some "Profiling insertion"), "Profiling"; TOTAL, (Option "branch_probabilities"), (Some "Profiling use"), "ProfilingExploit"; TOTAL, (Option "optim_move_loop_invariants"), (Some "Inserting initial nop"), "FirstNop"; TOTAL, Always, (Some "Renumbering"), "Renumber"; -PARTIAL, (Option "optim_duplicate"), (Some "Tail-duplicating"), "Duplicate"; +PARTIAL, (Option "optim_CSE"), (Some "CSE"), "CSE"; +PARTIAL, Always, (Some "Duplicating blocks"), "Duplicate"; TOTAL, Always, (Some "Renumbering pre constprop"), "Renumber"; TOTAL, (Option "optim_constprop"), (Some "Constant propagation"), "Constprop"; TOTAL, Always, (Some "Renumbering pre CSE"), "Renumber"; -PARTIAL, (Option "optim_CSE"), (Some "CSE"), "CSE"; TOTAL, (Option "optim_CSE2"), (Some "CSE2"), "CSE2"; PARTIAL, (Option "optim_CSE3"), (Some "CSE3"), "CSE3"; +TOTAL, (Option "optim_CSE3"), (Some "Kill useless moves after CSE3"), "KillUselessMoves"; TOTAL, (Option "optim_forward_moves"), (Some "Forwarding moves"), "ForwardMoves"; PARTIAL, (Option "optim_redundancy"), (Some "Redundancy elimination"), "Deadcode"; PARTIAL, (Option "optim_move_loop_invariants"), (Some "LICM"), "LICM"; |