diff options
Diffstat (limited to 'backend/Duplicateaux.ml')
-rw-r--r-- | backend/Duplicateaux.ml | 450 |
1 files changed, 329 insertions, 121 deletions
diff --git a/backend/Duplicateaux.ml b/backend/Duplicateaux.ml index 1297ec90..eb9f42e0 100644 --- a/backend/Duplicateaux.ml +++ b/backend/Duplicateaux.ml @@ -15,6 +15,7 @@ (* Oracle for Duplicate pass. * - Add static prediction information to Icond nodes * - Performs tail duplication on interesting traces to form superblocks + * - Unrolls a single iteration of innermost loops * - (TODO: perform partial loop unrolling inside innermost loops) *) @@ -22,23 +23,13 @@ open RTL open Maps open Camlcoq -let debug_flag = ref false - -let debug fmt = - if !debug_flag then Printf.eprintf fmt - else Printf.ifprintf stderr fmt - -let get_some = function -| None -> failwith "Did not get some" -| Some thing -> thing - -let rtl_successors = function -| Itailcall _ | Ireturn _ -> [] -| Icall(_,_,_,_,n) | Ibuiltin(_,_,_,n) | Inop n | Iop (_,_,_,n) -| Iload (_,_,_,_,_,n) | Istore (_,_,_,_,n) -> [n] -| Icond (_,_,n1,n2,_) -> [n1; n2] -| Ijumptable (_,ln) -> ln +let debug_flag = LICMaux.debug_flag +let debug = LICMaux.debug +let get_loop_headers = LICMaux.get_loop_headers +let get_some = LICMaux.get_some +let rtl_successors = LICMaux.rtl_successors +(* Get list of nodes following a BFS of the code *) let bfs code entrypoint = begin debug "bfs\n"; let visited = ref (PTree.map (fun n i -> false) code) @@ -67,6 +58,7 @@ let optbool o = match o with Some _ -> true | None -> false let ptree_get_some n ptree = get_some @@ PTree.get n ptree +(* Returns a PTree: node -> list of the predecessors of that node *) let get_predecessors_rtl code = begin debug "get_predecessors_rtl\n"; let preds = ref (PTree.map (fun n i -> []) code) in @@ -89,15 +81,13 @@ end module PSet = Set.Make(PInt) -let print_intlist l = - let rec f = function +let print_intlist oc l = + let rec f oc = function | [] -> () - | n::ln -> (Printf.printf "%d " (P.to_int n); f ln) + | n::ln -> (Printf.fprintf oc "%d %a" (P.to_int n) f ln) in begin if !debug_flag then begin - Printf.printf "["; - f l; - Printf.printf "]" + Printf.fprintf oc "[%a]" f l end end @@ -113,43 +103,6 @@ let print_intset s = end end -type vstate = Unvisited | Processed | Visited - -(** Getting loop branches with a DFS visit : - * Each node is either Unvisited, Visited, or Processed - * pre-order: node becomes Processed - * post-order: node becomes Visited - * - * If we come accross an edge to a Processed node, it's a loop! - *) -let get_loop_headers code entrypoint = begin - debug "get_loop_headers\n"; - let visited = ref (PTree.map (fun n i -> Unvisited) code) - and is_loop_header = ref (PTree.map (fun n i -> false) code) - in let rec dfs_visit code = function - | [] -> () - | node :: ln -> - match (get_some @@ PTree.get node !visited) with - | Visited -> () - | Processed -> begin - debug "Node %d is a loop header\n" (P.to_int node); - is_loop_header := PTree.set node true !is_loop_header; - visited := PTree.set node Visited !visited - end - | Unvisited -> begin - visited := PTree.set node Processed !visited; - match PTree.get node code with - | None -> failwith "No such node" - | Some i -> let next_visits = rtl_successors i in dfs_visit code next_visits; - visited := PTree.set node Visited !visited; - dfs_visit code ln - end - in begin - dfs_visit code [entrypoint]; - !is_loop_header - end -end - let ptree_printbool pt = let elements = PTree.elements pt in begin @@ -174,6 +127,10 @@ let rec look_ahead code node is_loop_header predicate = ) | _ -> false +(** + * Heuristics mostly based on the paper Branch Prediction for Free + *) + let do_call_heuristic code cond ifso ifnot is_loop_header = begin debug "\tCall heuristic..\n"; @@ -302,7 +259,7 @@ let get_loop_info is_loop_header bfs_order code = !loop_info end -(* Remark - compared to the original paper, we don't use the store heuristic *) +(* Remark - compared to the original Branch Prediction for Free paper, we don't use the store heuristic *) let get_directions code entrypoint = begin debug "get_directions\n"; let bfs_order = bfs code entrypoint in @@ -435,18 +392,29 @@ let best_predecessor_of node predecessors code order is_visited = ) order) with Not_found -> None -let print_trace t = print_intlist t +let print_trace = print_intlist -let print_traces traces = - let rec f = function +let print_traces oc traces = + let rec f oc = function | [] -> () - | t::lt -> Printf.printf "\n\t"; print_trace t; Printf.printf ",\n"; f lt + | t::lt -> Printf.fprintf oc "\n\t%a,\n%a" print_trace t f lt in begin - if !debug_flag then begin - Printf.printf "Traces: {"; - f traces; - Printf.printf "}\n"; - end + if !debug_flag then + Printf.fprintf oc "Traces: {%a}\n" f traces + end + +(* Adapted from backend/PrintRTL.ml: print_function *) +let print_code code = let open PrintRTL in let open Printf in + if (!debug_flag) then begin + fprintf stdout "{\n"; + let instrs = + List.sort + (fun (pc1, _) (pc2, _) -> compare pc2 pc1) + (List.rev_map + (fun (pc, i) -> (P.to_int pc, i)) + (PTree.elements code)) in + List.iter (print_instruction stdout) instrs; + fprintf stdout "}" end (* Dumb (but linear) trace selection *) @@ -521,7 +489,7 @@ let select_traces_chang code entrypoint = begin end done; (* debug "DFS: \t"; print_intlist order; debug "\n"; *) - debug "Traces: "; print_traces !traces; + debug "Traces: %a" print_traces !traces; !traces end end @@ -537,26 +505,26 @@ let rec make_identity_ptree_rec = function let make_identity_ptree code = make_identity_ptree_rec (PTree.elements code) -(* Change the pointers of preds nodes to point to n' instead of n *) +(* Change the pointers of nodes to point to n' instead of n *) let rec change_pointers code n n' = function | [] -> code - | pred :: preds -> - let new_pred_inst = match ptree_get_some pred code with - | Icall(a, b, c, d, n0) -> assert (n0 == n); Icall(a, b, c, d, n') - | Ibuiltin(a, b, c, n0) -> assert (n0 == n); Ibuiltin(a, b, c, n') - | Ijumptable(a, ln) -> assert (optbool @@ List.find_opt (fun e -> e == n) ln); - Ijumptable(a, List.map (fun e -> if (e == n) then n' else e) ln) - | Icond(a, b, n1, n2, i) -> assert (n1 == n || n2 == n); - let n1' = if (n1 == n) then n' else n1 - in let n2' = if (n2 == n) then n' else n2 + | node :: nodes -> + let new_pred_inst = match ptree_get_some node code with + | Icall(a, b, c, d, n0) -> assert (n0 = n); Icall(a, b, c, d, n') + | Ibuiltin(a, b, c, n0) -> assert (n0 = n); Ibuiltin(a, b, c, n') + | Ijumptable(a, ln) -> assert (optbool @@ List.find_opt (fun e -> e = n) ln); + Ijumptable(a, List.map (fun e -> if (e = n) then n' else e) ln) + | Icond(a, b, n1, n2, i) -> assert (n1 = n || n2 = n); + let n1' = if (n1 = n) then n' else n1 + in let n2' = if (n2 = n) then n' else n2 in Icond(a, b, n1', n2', i) - | Inop n0 -> assert (n0 == n); Inop n' - | Iop (a, b, c, n0) -> assert (n0 == n); Iop (a, b, c, n') - | Iload (a, b, c, d, e, n0) -> assert (n0 == n); Iload (a, b, c, d, e, n') - | Istore (a, b, c, d, n0) -> assert (n0 == n); Istore (a, b, c, d, n') + | Inop n0 -> assert (n0 = n); Inop n' + | Iop (a, b, c, n0) -> assert (n0 = n); Iop (a, b, c, n') + | Iload (a, b, c, d, e, n0) -> assert (n0 = n); Iload (a, b, c, d, e, n') + | Istore (a, b, c, d, n0) -> assert (n0 = n); Istore (a, b, c, d, n') | Itailcall _ | Ireturn _ -> failwith "That instruction cannot be a predecessor" - in let new_code = PTree.set pred new_pred_inst code - in change_pointers new_code n n' preds + in let new_code = PTree.set node new_pred_inst code + in change_pointers new_code n n' nodes (* parent: parent of n to keep as parent * preds: all the other parents of n @@ -580,13 +548,20 @@ let is_empty = function | [] -> true | _ -> false +let next_free_pc code = maxint (List.map (fun e -> let (n, _) = e in P.to_int n) (PTree.elements code)) + 1 + +let is_a_nop code n = + match get_some @@ PTree.get n code with + | Inop _ -> true + | _ -> false + (* code: RTL code * preds: mapping node -> predecessors * ptree: the revmap * trace: the trace to follow tail duplication on *) let tail_duplicate code preds ptree trace = (* next_int: unused integer that can be used for the next duplication *) - let next_int = ref (maxint (List.map (fun e -> let (n, _) = e in P.to_int n) (PTree.elements code)) + 1) + let next_int = ref (next_free_pc code) (* last_node and last_duplicate store resp. the last processed node of the trace, and its duplication *) in let last_node = ref None in let last_duplicate = ref None @@ -608,7 +583,7 @@ let tail_duplicate code preds ptree trace = in let (newc, newp) = duplicate code ptree !last_node n final_node_preds (P.of_int n') in begin next_int := !next_int + 1; - nb_duplicated := !nb_duplicated + 1; + (if not @@ is_a_nop code n then nb_duplicated := !nb_duplicated + 1); last_duplicate := Some (P.of_int n'); (newc, newp) end @@ -620,9 +595,8 @@ let tail_duplicate code preds ptree trace = in let new_code, new_ptree = f code ptree true trace in (new_code, new_ptree, !nb_duplicated) -let superblockify_traces code preds traces = - let max_nb_duplicated = !Clflags.option_fduplicate (* FIXME - should be architecture dependent *) - in let ptree = make_identity_ptree code +let superblockify_traces code preds traces ptree = + let max_nb_duplicated = !Clflags.option_ftailduplicate (* FIXME - should be architecture dependent *) in let rec f code ptree = function | [] -> (code, ptree, 0) | trace :: traces -> @@ -633,37 +607,271 @@ let superblockify_traces code preds traces = in let new_code, new_ptree, _ = f code ptree traces in (new_code, new_ptree) -let rec invert_iconds_trace code = function - | [] -> code - | n :: ln -> - let code' = match ptree_get_some n code with - | Icond (c, lr, ifso, ifnot, info) -> (match info with - | Some true -> begin - (* debug "Reversing ifso/ifnot for node %d\n" (P.to_int n); *) - PTree.set n (Icond (Op.negate_condition c, lr, ifnot, ifso, Some false)) code - end - | _ -> code) - | _ -> code - in invert_iconds_trace code' ln +let invert_iconds code = + PTree.map1 (fun i -> match i with + | Icond (c, lr, ifso, ifnot, info) -> (match info with + | Some true -> begin + (* debug "Reversing ifso/ifnot for node %d\n" (P.to_int n); *) + Icond (Op.negate_condition c, lr, ifnot, ifso, Some false) + end + | _ -> i) + | _ -> i + ) code + +(** Partial loop unrolling + * + * The following code seeks innermost loops, and unfolds the first iteration + * Most of the code has been moved from LICMaux.ml to Duplicateaux.ml to solve + * cyclic dependencies between LICMaux and Duplicateaux + *) -let rec invert_iconds code = function - | [] -> code - | t :: ts -> - let code' = if !Clflags.option_finvertcond then invert_iconds_trace code t - else code - in invert_iconds code' ts +type innerLoop = { + preds: P.t list; + body: HashedSet.PSet.t; + head: P.t; (* head of the loop *) + final: P.t (* the final instruction, which loops back to the head *) +} + +let print_pset = LICMaux.pp_pset + +let print_inner_loop iloop = + debug "{preds: %a, body: %a}" print_intlist iloop.preds print_pset iloop.body + +let rec print_inner_loops = function +| [] -> () +| iloop :: iloops -> begin + print_inner_loop iloop; + debug "\n"; + print_inner_loops iloops + end + +let print_ptree printer pt = + let elements = PTree.elements pt in + begin + debug "[\n"; + List.iter (fun (n, elt) -> + debug "\t%d: %a\n" (P.to_int n) printer elt + ) elements; + debug "]\n" + end + +let print_pint oc i = if !debug_flag then Printf.fprintf oc "%d" (P.to_int i) else () + +let get_inner_loops f code is_loop_header = + let fake_f = { fn_sig = f.fn_sig; fn_params = f.fn_params; + fn_stacksize = f.fn_stacksize; fn_code = code; fn_entrypoint = f.fn_entrypoint } in + let (_, predmap, loopmap) = LICMaux.inner_loops fake_f in + begin + debug "PREDMAP: "; print_ptree print_intlist predmap; + debug "LOOPMAP: "; print_ptree print_pset loopmap; + List.map (fun (n, body) -> + let preds = List.filter (fun p -> not @@ HashedSet.PSet.contains body p) + @@ get_some @@ PTree.get n predmap in + let head = (* the instruction from body which is a loop header *) + let heads = HashedSet.PSet.elements @@ HashedSet.PSet.filter + (fun n -> ptree_get_some n is_loop_header) body in + begin + assert (List.length heads == 1); + List.hd heads + end in + let final = (* the predecessors from head that are in the body *) + let head_preds = ptree_get_some head predmap in + let filtered = List.filter (fun n -> HashedSet.PSet.contains body n) head_preds in + begin + debug "HEAD: %d\n" (P.to_int head); + debug "BODY: %a\n" print_pset body; + debug "HEADPREDS: %a\n" print_intlist head_preds; + assert (List.length filtered == 1); + List.hd filtered + end in + { preds = preds; body = body; head = head; final = final } + ) + (* LICMaux.inner_loops also returns non-inner loops, but with a body of 1 instruction + * We remove those to get just the inner loops *) + @@ List.filter (fun (n, body) -> + let count = List.length @@ HashedSet.PSet.elements body in count != 1 + ) (PTree.elements loopmap) + end + +let rec generate_fwmap ln ln' fwmap = + match ln with + | [] -> begin + match ln' with + | [] -> fwmap + | _ -> failwith "ln and ln' have different lengths" + end + | n :: ln -> begin + match ln' with + | n' :: ln' -> generate_fwmap ln ln' (PTree.set n n' fwmap) + | _ -> failwith "ln and ln' have different lengths" + end + +let generate_revmap ln ln' revmap = generate_fwmap ln' ln revmap + +let apply_map fw n = P.of_int @@ ptree_get_some n fw + +let apply_map_opt fw n = + match PTree.get n fw with + | Some n' -> P.of_int n' + | None -> n + +let change_nexts fwmap = function + | Icall (a, b, c, d, n) -> Icall (a, b, c, d, apply_map fwmap n) + | Ibuiltin (a, b, c, n) -> Ibuiltin (a, b, c, apply_map fwmap n) + | Ijumptable (a, ln) -> Ijumptable (a, List.map (apply_map_opt fwmap) ln) + | Icond (a, b, n1, n2, i) -> Icond (a, b, apply_map_opt fwmap n1, apply_map_opt fwmap n2, i) + | Inop n -> Inop (apply_map fwmap n) + | Iop (a, b, c, n) -> Iop (a, b, c, apply_map fwmap n) + | Iload (a, b, c, d, e, n) -> Iload (a, b, c, d, e, apply_map fwmap n) + | Istore (a, b, c, d, n) -> Istore (a, b, c, d, apply_map fwmap n) + | Itailcall (a, b, c) -> Itailcall (a, b, c) + | Ireturn o -> Ireturn o + +(** Clone a list of instructions into free pc indexes + * + * The list of instructions should be contiguous, and not include any loop. + * It is assumed that the first instruction of the list is the head. + * Also, the last instruction of the list should be the loop backedge. + * + * Returns: (code', revmap', ln', fwmap) + * code' is the updated code, after cloning + * revmap' is the updated revmap + * ln' is the list of the new indexes used to reference the cloned instructions + * fwmap is a map from ln to ln' + *) +let clone code revmap ln = begin + assert (List.length ln > 0); + let head' = next_free_pc code in + (* +head' to ensure we never overlap with the existing code *) + let ln' = List.map (fun n -> n + head') @@ List.map P.to_int ln in + let fwmap = generate_fwmap ln ln' PTree.empty in + let revmap' = generate_revmap ln (List.map P.of_int ln') revmap in + let code' = ref code in + List.iter (fun n -> + let instr = get_some @@ PTree.get n code in + let instr' = change_nexts fwmap instr in + code' := PTree.set (apply_map fwmap n) instr' !code' + ) ln; + (!code', revmap', ln', fwmap) +end + +let rec count_ignore_nops code = function + | [] -> 0 + | n::ln -> + let inst = get_some @@ PTree.get n code in + match inst with + | Inop _ -> count_ignore_nops code ln + | _ -> 1 + count_ignore_nops code ln + +(* Unrolls a single interation of the inner loop + * 1) Clones the body into body' + * 2) Links the preds to the first instruction of body' + * 3) Links the last instruction of body' into the first instruction of body + *) +let unroll_inner_loop_single code revmap iloop = + let body = HashedSet.PSet.elements (iloop.body) in + if count_ignore_nops code body > !Clflags.option_funrollsingle then begin + debug "Too many nodes in the loop body (%d > %d)" (List.length body) !Clflags.option_funrollsingle; + (code, revmap) + end else + let (code2, revmap2, dupbody, fwmap) = clone code revmap body in + let code' = ref code2 in + let head' = apply_map fwmap (iloop.head) in + let final' = apply_map fwmap (iloop.final) in + begin + debug "PREDS: %a\n" print_intlist iloop.preds; + debug "IHEAD: %d\n" (P.to_int iloop.head); + code' := change_pointers !code' (iloop.head) head' (iloop.preds); + code' := change_pointers !code' head' (iloop.head) [final']; + (!code', revmap2) + end + +let unroll_inner_loops_single f code revmap = + let is_loop_header = get_loop_headers code (f.fn_entrypoint) in + let inner_loops = get_inner_loops f code is_loop_header in + let code' = ref code in + let revmap' = ref revmap in + begin + print_inner_loops inner_loops; + List.iter (fun iloop -> + let (new_code, new_revmap) = unroll_inner_loop_single !code' !revmap' iloop in + code' := new_code; revmap' := new_revmap + ) inner_loops; + (!code', !revmap') + end + +(* Unrolls the body of the inner loop once - duplicating the exit condition as well + * 1) Clones body into body' + * 2) Links the last instruction of body into the first of body' + * 3) Links the last instruction of body' into the first of body + *) +let unroll_inner_loop_body code revmap iloop = + let body = HashedSet.PSet.elements (iloop.body) in + let limit = !Clflags.option_funrollbody in + if count_ignore_nops code body > limit then begin + debug "Too many nodes in the loop body (%d > %d)" (List.length body) limit; + (code, revmap) + end else + let (code2, revmap2, dupbody, fwmap) = clone code revmap body in + let code' = ref code2 in + let head' = apply_map fwmap (iloop.head) in + let final' = apply_map fwmap (iloop.final) in + begin + code' := change_pointers !code' iloop.head head' [iloop.final]; + code' := change_pointers !code' head' iloop.head [final']; + (!code', revmap2) + end + +let unroll_inner_loops_body f code revmap = + let is_loop_header = get_loop_headers code (f.fn_entrypoint) in + let inner_loops = get_inner_loops f code is_loop_header in + let code' = ref code in + let revmap' = ref revmap in + begin + print_inner_loops inner_loops; + List.iter (fun iloop -> + let (new_code, new_revmap) = unroll_inner_loop_body !code' !revmap' iloop in + code' := new_code; revmap' := new_revmap + ) inner_loops; + (!code', !revmap') + end let duplicate_aux f = + (* initializing *) let entrypoint = f.fn_entrypoint in - if !Clflags.option_fduplicate < 0 then - ((f.fn_code, entrypoint), make_identity_ptree f.fn_code) - else - let code = update_directions (f.fn_code) entrypoint in - let traces = select_traces code entrypoint in - let icond_code = invert_iconds code traces in - let preds = get_predecessors_rtl icond_code in - if !Clflags.option_fduplicate >= 1 then - let (new_code, pTreeId) = ((* print_traces traces; *) superblockify_traces icond_code preds traces) in - ((new_code, f.fn_entrypoint), pTreeId) - else - ((icond_code, entrypoint), make_identity_ptree code) + let code = f.fn_code in + let revmap = make_identity_ptree code in + + (* static prediction *) + let code = + if !Clflags.option_fpredict then + update_directions code entrypoint + else code in + + (* unroll single *) + let (code, revmap) = + if !Clflags.option_funrollsingle > 0 then + unroll_inner_loops_single f code revmap + else (code, revmap) in + + (* unroll body *) + let (code, revmap) = + if !Clflags.option_funrollbody > 0 then + unroll_inner_loops_body f code revmap + else (code, revmap) in + + (* static prediction bis *) + let code = + if !Clflags.option_fpredict then + invert_iconds code + else code in + + (* tail duplication *) + let (code, revmap) = + if !Clflags.option_ftailduplicate > 0 then + let traces = select_traces code entrypoint in + let preds = get_predecessors_rtl code in + superblockify_traces code preds traces revmap + else (code, revmap) in + + ((code, entrypoint), revmap) |