From 67cfb5b65007aedbcadbdc92d1bc6507c7187858 Mon Sep 17 00:00:00 2001 From: Cyril SIX Date: Mon, 29 Mar 2021 16:17:31 +0200 Subject: Simplifications on Linearize - details below While I was developing the new trace linearize, I started off with implementing a big algorithm reasoning on dependencies etc.., but I realized later that it was giving a too different performance (sometimes better, sometimes worst) than the original CompCert. So I stripped it off gradually until its performance (on regular code with just branch prediction) was on par with the base Linearize of CompCert. I was aiming here for something that is either equal, or better, in terms of performance. My (then and current) theory is that I have stripped it out so much that now it's just like the algorithm of CompCert, but with a modification for Lcond instructions (see the new linearize_aux_cb). However, I never tested that theory: the code worked, so I left it as is, without any simplification. But now that I need to get a clear version for my manuscript, I'm digging into it. It turns out my theory is not really exact. A difference is that instead of taking the minpc across the chain, I take the pc of the very first block of the chain I create. This was (I think) out of laziness in the middle of two iterations, except that I forgot about it. I tested my new theory by deleting all the stuff about dependencies calculation (commited), and also computing a minpc just like original compcert (not commited): I get the same exact Mach code than linearize_aux_cb. So right now, the only difference between linearize_aux_cb and linearize_aux_trace is that slightly different minpc computation. I think transitionning to linearize_aux_cb will be 1) much clearer than this Frankenstein monster of linearize_aux_trace that I made, and 2) might be better performing too. I don't have access to Kalray machines today so i'm leaving this on hold for now, but tomorrow I will test performance wise to see if there is a regression. If there isn't, I will commit this (and it will be the version narrated by my manuscript). If there is a regression, it would mean selecting the pc of the first node (in opposition to the minpc) is more performant, so i'd backtrack the change to linearize_aux_cb anyway and there should then be 0 difference in the generated code. --- backend/Linearizeaux.ml | 284 ++++++++++++++---------------------------------- 1 file changed, 79 insertions(+), 205 deletions(-) (limited to 'backend') diff --git a/backend/Linearizeaux.ml b/backend/Linearizeaux.ml index 3f1a8b6e..402e376d 100644 --- a/backend/Linearizeaux.ml +++ b/backend/Linearizeaux.ml @@ -128,6 +128,68 @@ let enumerate_aux_flat f reach = * heuristic *) +let super_blocks f joins = + let blocks = ref [] in + let visited = ref IntSet.empty in + (* start_block: + pc is the function entry point + or a join point + or the successor of a conditional test *) + let rec start_block pc = + let npc = P.to_int pc in + if not (IntSet.mem npc !visited) then begin + visited := IntSet.add npc !visited; + in_block [] max_int pc + end + (* in_block: add pc to block and check successors *) + and in_block blk minpc pc = + let npc = P.to_int pc in + let blk = pc :: blk in + let minpc = min npc minpc in + match PTree.get pc f.fn_code with + | None -> assert false + | Some b -> + let rec do_instr_list = function + | [] -> assert false + | Lbranch s :: _ -> next_in_block blk minpc s + | Ltailcall (sig0, ros) :: _ -> end_block blk minpc + | Lcond (cond, args, ifso, ifnot, pred) :: _ -> begin + match pred with + | None -> (end_block blk minpc; start_block ifso; start_block ifnot) + | Some true -> (next_in_block blk minpc ifso; start_block ifnot) + | Some false -> (next_in_block blk minpc ifnot; start_block ifso) + end + | Ljumptable(arg, tbl) :: _ -> + end_block blk minpc; List.iter start_block tbl + | Lreturn :: _ -> end_block blk minpc + | instr :: b' -> do_instr_list b' in + do_instr_list b + (* next_in_block: check if join point and either extend block + or start block *) + and next_in_block blk minpc pc = + let npc = P.to_int pc in + if IntSet.mem npc joins + then (end_block blk minpc; start_block pc) + else in_block blk minpc pc + (* end_block: record block that we just discovered *) + and end_block blk minpc = + blocks := (minpc, List.rev blk) :: !blocks + in + start_block f.fn_entrypoint; !blocks + +(* Build the enumeration *) + +let enumerate_aux_sb f reach = + flatten_blocks (super_blocks f (join_points f)) + +(** + * Alternate enumeration based on traces as identified by Duplicate.v + * + * This is a slight alteration to the above heuristic, ensuring that any + * superblock will be contiguous in memory, while still following the original + * heuristic + *) + let get_some = function | None -> failwith "Did not get some" | Some thing -> thing @@ -207,98 +269,6 @@ let forward_sequences code entry = in [fs] @ ((f code rem_from_node) @ (f code ln)) in (f code [entry]) -(** Unused code -module PInt = struct - type t = P.t - let compare x y = compare (P.to_int x) (P.to_int y) -end - -module PSet = Set.Make(PInt) - -module LPInt = struct - type t = P.t list - let rec compare x y = - match x with - | [] -> ( match y with - | [] -> 0 - | _ -> 1 ) - | e :: l -> match y with - | [] -> -1 - | e' :: l' -> - let e_cmp = PInt.compare e e' in - if e_cmp == 0 then compare l l' else e_cmp -end - -module LPSet = Set.Make(LPInt) - -let iter_lpset f s = Seq.iter f (LPSet.to_seq s) - -let first_of = function - | [] -> None - | e :: l -> Some e - -let rec last_of = function - | [] -> None - | e :: l -> (match l with [] -> Some e | e :: l -> last_of l) - -let can_be_merged code s s' = - let last_s = get_some @@ last_of s in - let first_s' = get_some @@ first_of s' in - match get_some @@ PTree.get last_s code with - | Lop _ | Lload _ | Lgetstack _ | Lsetstack _ | Lstore _ | Lcall _ - | Lbuiltin _ | Ltailcall _ | Lreturn -> false - | Lbranch n -> n == first_s' - | Lcond (_, _, ifso, ifnot, info) -> (match info with - | None -> false - | Some false -> ifnot == first_s' - | Some true -> failwith "Inconsistency detected - ifnot is not the preferred branch") - | Ljumptable (_, ln) -> - match ln with - | [] -> false - | n :: ln -> n == first_s' - -let merge s s' = Some s - -let try_merge code (fs: (BinNums.positive list) list) = - let seqs = ref (LPSet.of_list fs) in - let oldLength = ref (LPSet.cardinal !seqs) in - let continue = ref true in - let found = ref false in - while !continue do - begin - found := false; - iter_lpset (fun s -> - if !found then () - else iter_lpset (fun s' -> - if (!found || s == s') then () - else if (can_be_merged code s s') then - begin - seqs := LPSet.remove s !seqs; - seqs := LPSet.remove s' !seqs; - seqs := LPSet.add (get_some (merge s s')) !seqs; - found := true; - end - else () - ) !seqs - ) !seqs; - if !oldLength == LPSet.cardinal !seqs then - continue := false - else - oldLength := LPSet.cardinal !seqs - end - done; - !seqs -*) - -(** Code adapted from Duplicateaux.get_loop_headers - * - * Getting loop branches with a DFS visit : - * Each node is either Unvisited, Visited, or Processed - * pre-order: node becomes Processed - * post-order: node becomes Visited - * - * If we come accross an edge to a Processed node, it's a loop! - *) type pos = BinNums.positive module PP = struct @@ -315,38 +285,6 @@ module PPMap = Map.Make(PP) type vstate = Unvisited | Processed | Visited -let get_loop_edges code entry = - let visited = ref (PTree.map (fun n i -> Unvisited) code) in - let is_loop_edge = ref PPMap.empty - in let rec dfs_visit code from = function - | [] -> () - | node :: ln -> - match (get_some @@ PTree.get node !visited) with - | Visited -> () - | Processed -> begin - let from_node = get_some from in - is_loop_edge := PPMap.add (from_node, node) true !is_loop_edge; - visited := PTree.set node Visited !visited - end - | Unvisited -> begin - visited := PTree.set node Processed !visited; - let bb = get_some @@ PTree.get node code in - let next_visits = (match (last_element bb) with - | Lop _ | Lload _ | Lgetstack _ | Lsetstack _ | Lstore _ | Lcall _ - | Lbuiltin _ -> assert false - | Ltailcall _ | Lreturn -> [] - | Lbranch n -> [n] - | Lcond (_, _, ifso, ifnot, _) -> [ifso; ifnot] - | Ljumptable(_, ln) -> ln - ) in dfs_visit code (Some node) next_visits; - visited := PTree.set node Visited !visited; - dfs_visit code from ln - end - in begin - dfs_visit code None [entry]; - !is_loop_edge - end - let ppmap_is_true pp ppmap = PPMap.mem pp ppmap && PPMap.find pp ppmap module Int = struct @@ -364,70 +302,6 @@ let print_iset s = begin end end -let print_depmap dm = begin - if !debug_flag then begin - Printf.printf "[|"; - Array.iter (fun s -> print_iset s; Printf.printf ", ") dm; - Printf.printf "|]\n" - end -end - -let construct_depmap code entry fs = - let is_loop_edge = get_loop_edges code entry in - let visited = ref (PTree.map (fun n i -> false) code) in - let depmap = Array.map (fun e -> ISet.empty) fs in - let find_index_of_node n = - let index = ref 0 in - begin - Array.iteri (fun i s -> - match List.find_opt (fun e -> e == n) s with - | Some _ -> index := i - | None -> () - ) fs; - !index - end - in let check_and_update_depmap from target = - (* debug "From %d to %d\n" (P.to_int from) (P.to_int target); *) - if not (ppmap_is_true (from, target) is_loop_edge) then - let in_index_fs = find_index_of_node from in - let out_index_fs = find_index_of_node target in - if out_index_fs != in_index_fs then - depmap.(out_index_fs) <- ISet.add in_index_fs depmap.(out_index_fs) - else () - else () - in let rec dfs_visit code = function - | [] -> () - | node :: ln -> - begin - match (get_some @@ PTree.get node !visited) with - | true -> () - | false -> begin - visited := PTree.set node true !visited; - let bb = get_some @@ PTree.get node code in - let next_visits = - match (last_element bb) with - | Ltailcall _ | Lreturn -> [] - | Lbranch n -> (check_and_update_depmap node n; [n]) - | Lcond (_, _, ifso, ifnot, _) -> begin - check_and_update_depmap node ifso; - check_and_update_depmap node ifnot; - [ifso; ifnot] - end - | Ljumptable(_, ln) -> begin - List.iter (fun n -> check_and_update_depmap node n) ln; - ln - end - (* end of bblocks should not be another value than one of the above *) - | _ -> failwith "last_element gave an invalid output" - in dfs_visit code next_visits - end; - dfs_visit code ln - end - in begin - dfs_visit code [entry]; - depmap - end - let print_sequence s = if !debug_flag then begin Printf.printf "["; @@ -442,23 +316,26 @@ let print_ssequence ofs = Printf.printf "]\n" end +let rec minpc_of l = + match l with + | [] -> None + | e::l -> begin + let e_score = P.to_int e in + let mpc = minpc_of l in + match mpc with + | None -> Some e_score + | Some e_score' -> if e_score < e_score' then Some e_score else Some e_score' + end + let order_sequences code entry fs = let fs_a = Array.of_list fs in - let depmap = construct_depmap code entry fs_a in let fs_evaluated = Array.map (fun e -> false) fs_a in let ordered_fs = ref [] in let evaluate s_id = begin assert (not fs_evaluated.(s_id)); ordered_fs := fs_a.(s_id) :: !ordered_fs; - fs_evaluated.(s_id) <- true; - (* debug "++++++\n"; - debug "Scheduling %d\n" s_id; - debug "Initial depmap: "; print_depmap depmap; *) - Array.iteri (fun i deps -> - depmap.(i) <- ISet.remove s_id deps - ) depmap; - (* debug "Final depmap: "; print_depmap depmap; *) + fs_evaluated.(s_id) <- true end in let choose_best_of candidates = let current_best_id = ref None in @@ -486,24 +363,21 @@ let order_sequences code entry fs = in let select_next () = let candidates = ref [] in begin - Array.iteri (fun i deps -> + Array.iteri (fun i _ -> begin - (* debug "Deps of %d: " i; print_iset deps; debug "\n"; *) - (* FIXME - if we keep it that way (no dependency check), remove all the unneeded stuff *) - if ((* deps == ISet.empty && *) not fs_evaluated.(i)) then + if (not fs_evaluated.(i)) then candidates := i :: !candidates end - ) depmap; + ) fs_a; if not (List.length !candidates > 0) then begin - Array.iteri (fun i deps -> + Array.iteri (fun i _ -> if (not fs_evaluated.(i)) then candidates := i :: !candidates - ) depmap; + ) fs_a; end; get_some (choose_best_of !candidates) end in begin debug "-------------------------------\n"; - debug "depmap: "; print_depmap depmap; debug "forward sequences identified: "; print_ssequence fs; while List.length !ordered_fs != List.length fs do let next_id = select_next () in -- cgit