Simplifications on Linearize - details below

While I was developing the new trace linearize, I started off with implementing a big algorithm reasoning on dependencies etc.., but I realized later that it was giving a too different performance (sometimes better, sometimes worst) than the original CompCert. So I stripped it off gradually until its performance (on regular code with just branch prediction) was on par with the base Linearize of CompCert. I was aiming here for something that is either equal, or better, in terms of performance. My (then and current) theory is that I have stripped it out so much that now it's just like the algorithm of CompCert, but with a modification for Lcond instructions (see the new linearize_aux_cb). However, I never tested that theory: the code worked, so I left it as is, without any simplification. But now that I need to get a clear version for my manuscript, I'm digging into it. It turns out my theory is not really exact. A difference is that instead of taking the minpc across the chain, I take the pc of the very first block of the chain I create. This was (I think) out of laziness in the middle of two iterations, except that I forgot about it. I tested my new theory by deleting all the stuff about dependencies calculation (commited), and also computing a minpc just like original compcert (not commited): I get the same exact Mach code than linearize_aux_cb. So right now, the only difference between linearize_aux_cb and linearize_aux_trace is that slightly different minpc computation. I think transitionning to linearize_aux_cb will be 1) much clearer than this Frankenstein monster of linearize_aux_trace that I made, and 2) might be better performing too. I don't have access to Kalray machines today so i'm leaving this on hold for now, but tomorrow I will test performance wise to see if there is a regression. If there isn't, I will commit this (and it will be the version narrated by my manuscript). If there is a regression, it would mean selecting the pc of the first node (in opposition to the minpc) is more performant, so i'd backtrack the change to linearize_aux_cb anyway and there should then be 0 difference in the generated code.
author: Cyril SIX <cyril.six@kalray.eu> 2021-03-29 16:17:31 +0200
committer: Cyril SIX <cyril.six@kalray.eu> 2021-03-29 16:17:31 +0200
commit: 67cfb5b65007aedbcadbdc92d1bc6507c7187858 (patch)
tree: b8a22ac1a95fdd52f57c657d3cf06ee6bf508972 /backend
parent: d85a8eb3d89ecb0ff5d7894b26a3497cd9fd7155 (diff)
download: compcert-kvx-67cfb5b65007aedbcadbdc92d1bc6507c7187858.tar.gz
compcert-kvx-67cfb5b65007aedbcadbdc92d1bc6507c7187858.zip
1 files changed, 79 insertions, 205 deletions
diff --git a/backend/Linearizeaux.ml b/backend/Linearizeaux.ml
index 3f1a8b6e..402e376d 100644
--- a/backend/Linearizeaux.ml
+++ b/backend/Linearizeaux.ml
@@ -128,6 +128,68 @@ let enumerate_aux_flat f reach =
  * heuristic
  *)
 
+let super_blocks f joins =
+  let blocks = ref [] in
+  let visited = ref IntSet.empty in
+  (* start_block:
+       pc is the function entry point
+          or a join point
+          or the successor of a conditional test *)
+  let rec start_block pc =
+    let npc = P.to_int pc in
+    if not (IntSet.mem npc !visited) then begin
+      visited := IntSet.add npc !visited;
+      in_block [] max_int pc
+    end
+  (* in_block: add pc to block and check successors *)
+  and in_block blk minpc pc =
+    let npc = P.to_int pc in
+    let blk = pc :: blk in
+    let minpc = min npc minpc in
+    match PTree.get pc f.fn_code with
+    | None -> assert false
+    | Some b ->
+       let rec do_instr_list = function
+       | [] -> assert false
+       | Lbranch s :: _ -> next_in_block blk minpc s
+       | Ltailcall (sig0, ros) :: _ -> end_block blk minpc
+       | Lcond (cond, args, ifso, ifnot, pred) :: _ -> begin
+            match pred with
+            | None -> (end_block blk minpc; start_block ifso; start_block ifnot)
+            | Some true -> (next_in_block blk minpc ifso; start_block ifnot)
+            | Some false -> (next_in_block blk minpc ifnot; start_block ifso)
+          end
+       | Ljumptable(arg, tbl) :: _ ->
+             end_block blk minpc; List.iter start_block tbl
+       | Lreturn :: _ -> end_block blk minpc
+       | instr :: b' -> do_instr_list b' in
+       do_instr_list b
+  (* next_in_block: check if join point and either extend block
+     or start block *)
+  and next_in_block blk minpc pc =
+    let npc = P.to_int pc in
+    if IntSet.mem npc joins
+    then (end_block blk minpc; start_block pc)
+    else in_block blk minpc pc
+  (* end_block: record block that we just discovered *)
+  and end_block blk minpc =
+    blocks := (minpc, List.rev blk) :: !blocks
+  in
+    start_block f.fn_entrypoint; !blocks
+
+(* Build the enumeration *)
+
+let enumerate_aux_sb f reach =
+  flatten_blocks (super_blocks f (join_points f))
+
+(**
+ * Alternate enumeration based on traces as identified by Duplicate.v 
+ *
+ * This is a slight alteration to the above heuristic, ensuring that any
+ * superblock will be contiguous in memory, while still following the original
+ * heuristic
+ *)
+
 let get_some = function
 | None -> failwith "Did not get some"
 | Some thing -> thing
@@ -207,98 +269,6 @@ let forward_sequences code entry =
       in [fs] @ ((f code rem_from_node) @ (f code ln))
   in (f code [entry])
 
-(** Unused code
-module PInt = struct
-  type t = P.t
-  let compare x y = compare (P.to_int x) (P.to_int y)
-end
-
-module PSet = Set.Make(PInt)
-
-module LPInt = struct
-  type t = P.t list
-  let rec compare x y =
-    match x with
-    | [] -> ( match y with
-      | [] -> 0
-      | _ -> 1 )
-    | e :: l -> match y with
-      | [] -> -1
-      | e' :: l' ->
-          let e_cmp = PInt.compare e e' in
-          if e_cmp == 0 then compare l l' else e_cmp
-end
-
-module LPSet = Set.Make(LPInt)
-
-let iter_lpset f s = Seq.iter f (LPSet.to_seq s)
-
-let first_of = function
-  | [] -> None
-  | e :: l -> Some e
-
-let rec last_of = function
-  | [] -> None
-  | e :: l -> (match l with [] -> Some e | e :: l -> last_of l)
-
-let can_be_merged code s s' =
-  let last_s = get_some @@ last_of s in
-  let first_s' = get_some @@ first_of s' in
-  match get_some @@ PTree.get last_s code with
-  | Lop _ | Lload _ | Lgetstack _ | Lsetstack _ | Lstore _ | Lcall _
-  | Lbuiltin _ | Ltailcall _ | Lreturn -> false
-  | Lbranch n -> n == first_s'
-  | Lcond (_, _, ifso, ifnot, info) -> (match info with
-    | None -> false
-    | Some false -> ifnot == first_s'
-    | Some true -> failwith "Inconsistency detected - ifnot is not the preferred branch")
-  | Ljumptable (_, ln) ->
-      match ln with
-      | [] -> false
-      | n :: ln -> n == first_s'
-
-let merge s s' = Some s
-
-let try_merge code (fs: (BinNums.positive list) list) =
-  let seqs = ref (LPSet.of_list fs) in
-  let oldLength = ref (LPSet.cardinal !seqs) in
-  let continue = ref true in
-  let found = ref false in
-  while !continue do
-    begin
-      found := false;
-      iter_lpset (fun s ->
-        if !found then ()
-        else iter_lpset (fun s' ->
-          if (!found || s == s') then ()
-          else if (can_be_merged code s s') then
-            begin
-              seqs := LPSet.remove s !seqs;
-              seqs := LPSet.remove s' !seqs;
-              seqs := LPSet.add (get_some (merge s s')) !seqs;
-              found := true;
-            end
-          else ()
-        ) !seqs
-      ) !seqs;
-      if !oldLength == LPSet.cardinal !seqs then
-        continue := false
-      else
-        oldLength := LPSet.cardinal !seqs
-    end
-  done;
-  !seqs
-*)
-
-(** Code adapted from Duplicateaux.get_loop_headers
-  *
-  * Getting loop branches with a DFS visit :
-  * Each node is either Unvisited, Visited, or Processed
-  * pre-order: node becomes Processed
-  * post-order: node becomes Visited
-  *
-  * If we come accross an edge to a Processed node, it's a loop!
-  *)
 type pos = BinNums.positive
 
 module PP = struct
@@ -315,38 +285,6 @@ module PPMap = Map.Make(PP)
 
 type vstate = Unvisited | Processed | Visited
 
-let get_loop_edges code entry =
-  let visited = ref (PTree.map (fun n i -> Unvisited) code) in
-  let is_loop_edge = ref PPMap.empty
-  in let rec dfs_visit code from = function
-  | [] -> ()
-  | node :: ln ->
-      match (get_some @@ PTree.get node !visited) with
-      | Visited -> ()
-      | Processed -> begin
-          let from_node = get_some from in
-          is_loop_edge := PPMap.add (from_node, node) true !is_loop_edge;
-          visited := PTree.set node Visited !visited
-        end
-      | Unvisited -> begin
-          visited := PTree.set node Processed !visited;
-          let bb = get_some @@ PTree.get node code in
-          let next_visits = (match (last_element bb) with
-          | Lop _ | Lload _ | Lgetstack _ | Lsetstack _ | Lstore _ | Lcall _
-          | Lbuiltin _ -> assert false
-          | Ltailcall _ | Lreturn -> []
-          | Lbranch n -> [n]
-          | Lcond (_, _, ifso, ifnot, _) -> [ifso; ifnot]
-          | Ljumptable(_, ln) -> ln
-          ) in dfs_visit code (Some node) next_visits;
-          visited := PTree.set node Visited !visited;
-          dfs_visit code from ln
-        end
-  in begin
-    dfs_visit code None [entry];
-    !is_loop_edge
-  end
-
 let ppmap_is_true pp ppmap = PPMap.mem pp ppmap && PPMap.find pp ppmap
 
 module Int = struct
@@ -364,70 +302,6 @@ let print_iset s = begin
   end
 end
 
-let print_depmap dm = begin
-  if !debug_flag then begin
-    Printf.printf "[|";
-    Array.iter (fun s -> print_iset s; Printf.printf ", ") dm;
-    Printf.printf "|]\n"
-  end
-end
-
-let construct_depmap code entry fs =
-  let is_loop_edge = get_loop_edges code entry in
-  let visited = ref (PTree.map (fun n i -> false) code) in
-  let depmap = Array.map (fun e -> ISet.empty) fs in
-  let find_index_of_node n =
-    let index = ref 0 in
-    begin
-      Array.iteri (fun i s ->
-        match List.find_opt (fun e -> e == n) s with
-        | Some _ -> index := i
-        | None -> ()
-      ) fs;
-      !index
-    end
-  in let check_and_update_depmap from target =
-    (* debug "From %d to %d\n" (P.to_int from) (P.to_int target); *)
-    if not (ppmap_is_true (from, target) is_loop_edge) then
-      let in_index_fs = find_index_of_node from in
-      let out_index_fs = find_index_of_node target in
-      if out_index_fs != in_index_fs then
-        depmap.(out_index_fs) <- ISet.add in_index_fs depmap.(out_index_fs)
-      else ()
-    else ()
-  in let rec dfs_visit code = function
-  | [] -> ()
-  | node :: ln ->
-      begin
-        match (get_some @@ PTree.get node !visited) with
-        | true -> ()
-        | false -> begin
-            visited := PTree.set node true !visited;
-            let bb = get_some @@ PTree.get node code in
-            let next_visits =
-              match (last_element bb) with
-              | Ltailcall _ | Lreturn -> []
-              | Lbranch n -> (check_and_update_depmap node n; [n])
-              | Lcond (_, _, ifso, ifnot, _) -> begin
-                  check_and_update_depmap node ifso;
-                  check_and_update_depmap node ifnot;
-                  [ifso; ifnot]
-                end
-              | Ljumptable(_, ln) -> begin
-                  List.iter (fun n -> check_and_update_depmap node n) ln;
-                  ln
-                end
-              (* end of bblocks should not be another value than one of the above *)
-              | _ -> failwith "last_element gave an invalid output"
-            in dfs_visit code next_visits
-          end;
-        dfs_visit code ln
-      end
-  in begin
-    dfs_visit code [entry];
-    depmap
-  end
-
 let print_sequence s =
   if !debug_flag then begin
     Printf.printf "[";
@@ -442,23 +316,26 @@ let print_ssequence ofs =
     Printf.printf "]\n"
   end
 
+let rec minpc_of l =
+  match l with
+  | [] -> None
+  | e::l -> begin
+        let e_score = P.to_int e in
+        let mpc = minpc_of l in 
+        match mpc with
+        | None -> Some e_score
+        | Some e_score' -> if e_score < e_score' then Some e_score else Some e_score'
+      end
+
 let order_sequences code entry fs =
   let fs_a = Array.of_list fs in
-  let depmap = construct_depmap code entry fs_a in
   let fs_evaluated = Array.map (fun e -> false) fs_a in
   let ordered_fs = ref [] in
   let evaluate s_id =
     begin
       assert (not fs_evaluated.(s_id));
       ordered_fs := fs_a.(s_id) :: !ordered_fs;
-      fs_evaluated.(s_id) <- true;
-      (* debug "++++++\n";
-      debug "Scheduling %d\n" s_id;
-      debug "Initial depmap: "; print_depmap depmap; *)
-      Array.iteri (fun i deps ->
-        depmap.(i) <- ISet.remove s_id deps
-      ) depmap;
-      (* debug "Final depmap: "; print_depmap depmap; *)
+      fs_evaluated.(s_id) <- true
     end
   in let choose_best_of candidates =
     let current_best_id = ref None in
@@ -486,24 +363,21 @@ let order_sequences code entry fs =
   in let select_next () =
     let candidates = ref [] in
     begin
-      Array.iteri (fun i deps ->
+      Array.iteri (fun i _ ->
         begin
-          (* debug "Deps of %d: " i; print_iset deps; debug "\n"; *)
-          (* FIXME - if we keep it that way (no dependency check), remove all the unneeded stuff *)
-          if ((* deps == ISet.empty && *) not fs_evaluated.(i)) then
+          if (not fs_evaluated.(i)) then
             candidates := i :: !candidates
         end
-      ) depmap;
+      ) fs_a;
       if not (List.length !candidates > 0) then begin
-        Array.iteri (fun i deps ->
+        Array.iteri (fun i _ ->
           if (not fs_evaluated.(i)) then candidates := i :: !candidates
-        ) depmap;
+        ) fs_a;
       end;
       get_some (choose_best_of !candidates)
     end
   in begin
     debug "-------------------------------\n";
-    debug "depmap: "; print_depmap depmap;
     debug "forward sequences identified: "; print_ssequence fs;
     while List.length !ordered_fs != List.length fs do
       let next_id = select_next () in
author	Cyril SIX <cyril.six@kalray.eu>	2021-03-29 16:17:31 +0200
committer	Cyril SIX <cyril.six@kalray.eu>	2021-03-29 16:17:31 +0200
commit	67cfb5b65007aedbcadbdc92d1bc6507c7187858 (patch)
tree	b8a22ac1a95fdd52f57c657d3cf06ee6bf508972 /backend
parent	d85a8eb3d89ecb0ff5d7894b26a3497cd9fd7155 (diff)
download	compcert-kvx-67cfb5b65007aedbcadbdc92d1bc6507c7187858.tar.gz compcert-kvx-67cfb5b65007aedbcadbdc92d1bc6507c7187858.zip