aboutsummaryrefslogtreecommitdiffstats
path: root/scheduling/MyRTLpathScheduleraux.ml
diff options
context:
space:
mode:
Diffstat (limited to 'scheduling/MyRTLpathScheduleraux.ml')
-rw-r--r--scheduling/MyRTLpathScheduleraux.ml1619
1 files changed, 1619 insertions, 0 deletions
diff --git a/scheduling/MyRTLpathScheduleraux.ml b/scheduling/MyRTLpathScheduleraux.ml
new file mode 100644
index 00000000..0375d26c
--- /dev/null
+++ b/scheduling/MyRTLpathScheduleraux.ml
@@ -0,0 +1,1619 @@
+open DebugPrint
+open RTLpathLivegenaux
+open RTLpath
+open RTLpathCommon
+open RTL
+open Maps
+open Registers
+
+let print_superblock sb code =
+ let insts = sb.instructions in
+ let li = sb.liveins in
+ let outs = sb.s_output_regs in
+ begin
+ debug "{ instructions = "; print_instructions (Array.to_list insts) code; debug "\n";
+ debug " liveins = "; print_ptree_regset li; debug "\n";
+ debug " output_regs = "; print_regset outs; debug "}"
+ end
+
+let print_superblocks lsb code =
+ let rec f = function
+ | [] -> ()
+ | sb :: lsb -> (print_superblock sb code; debug ",\n"; f lsb)
+ in begin
+ debug "[\n";
+ f lsb;
+ debug "]"
+ end
+
+let get_superblocks = RTLpathScheduleraux.get_superblocks
+
+let get_ok = RTLpathScheduleraux.get_ok
+
+let apply_map' fw n = Duplicateaux.ptree_get_some n fw
+
+let apply_map_opt fw n =
+ match PTree.get n fw with
+ | Some n' -> n'
+ | None -> n
+
+let change_arg_regs inst fwmap =
+ let open Datatypes in
+ match inst with
+ | Icall (sgn, fn, args, dest, succ) ->
+ let fn' =
+ ( match fn with
+ | Coq_inl r -> Datatypes.Coq_inl (apply_map_opt fwmap r)
+ | Coq_inr _ as ident -> (* function name *) ident )
+ in
+ let args' = List.map (apply_map_opt fwmap) args in
+ Icall (sgn, fn', args', dest, succ)
+ | Ibuiltin (ef, args, dest, succ) ->
+ let args' = List.map (AST.map_builtin_arg (apply_map_opt fwmap)) args in
+ Ibuiltin (ef, args', dest, succ)
+ | Ijumptable (arg, tbl) -> Ijumptable (apply_map_opt fwmap arg, tbl)
+ | Itailcall (sgn, fn, args) ->
+ let fn' =
+ ( match fn with
+ | Coq_inl r -> Datatypes.Coq_inl (apply_map_opt fwmap r)
+ | Coq_inr _ as ident -> (* function name *) ident )
+ in
+ let args' = List.map (apply_map_opt fwmap) args in
+ Itailcall (sgn, fn', args')
+ | Ireturn (reg_opt) ->
+ ( match reg_opt with
+ | None -> Ireturn None
+ | Some(reg) -> Ireturn (Some (apply_map_opt fwmap reg)) )
+ | Icond (a, b, n1, n2, i) -> Icond (a, List.map (apply_map_opt fwmap) b, n1, n2, i)
+ | Inop n -> Inop n
+ | Iop (a, b, c, n) -> Iop (a, List.map (apply_map_opt fwmap) b, c, n)
+ | Iload (a, b, c, d, e, n) ->
+ Iload (a, b, c, List.map (apply_map_opt fwmap) d, e, n)
+ | Istore (a, b, c, d, n) -> Istore (a, b, List.map (apply_map_opt fwmap) c, apply_map_opt fwmap d, n)
+
+let change_dest_reg inst fwmap =
+ match inst with
+ | Inop _
+ | Istore _
+ | Itailcall _
+ (* XXX builtin is a special case?! *)
+ | Ibuiltin _
+ | Icond _
+ | Ijumptable _
+ | Ireturn _
+ | Icall _ -> failwith "unexpectedly asked to change dest reg"
+ | Iop(op, args, res, s) -> Iop(op, args, (apply_map' fwmap res), s)
+ | Iload(trap, chunk, addr, args, dst, s) -> Iload(trap, chunk, addr, args, (apply_map' fwmap dst), s)
+
+let maybe_change_dest_reg ?only_rename inst fwmap ~next_free_reg =
+ let do_nothing = (inst, fwmap, next_free_reg) in
+ match inst with
+ | Icall _
+ | Ibuiltin _
+ | Ijumptable _
+ | Itailcall _
+ | Ireturn _ ->
+ (* Do not rename registers if the instructions MUST end a path because we cannot add
+ * restoration code afterwards. *)
+ do_nothing
+ | _ as i ->
+ match RTL.instr_defs i with
+ | None -> do_nothing
+ | Some r ->
+ if Option.is_some only_rename && not (Regset.mem r (get_some only_rename)) then
+ do_nothing
+ else
+ (match PTree.get r fwmap with
+ | None -> (i, PTree.set r r fwmap, next_free_reg)
+ | Some _previous_name ->
+ let new_name = next_free_reg in
+ let fwmap = PTree.set r new_name fwmap in
+ (change_dest_reg i fwmap, fwmap, Camlcoq.P.succ next_free_reg) )
+
+let ptree_get_or_default ptree key default =
+ match PTree.get key ptree with
+ | None -> default
+ | Some value -> value
+
+let is_icond = function
+ | Icond _ -> true
+ | _ -> false
+
+let side_exit_idxs sb code =
+ Array.to_list sb.instructions
+ |> List.map (fun pc -> get_some @@ PTree.get pc code)
+ |> List.mapi (fun i inst -> (i, inst))
+ |> List.filter (fun (_i, inst) -> is_icond inst
+ && RTLpathLivegenaux.predicted_successor inst
+ |> Option.is_some)
+ |> List.map (fun (i, _inst) -> i)
+
+let side_exit_pcs sb code =
+ side_exit_idxs sb code |> List.map (fun i -> sb.instructions.(i))
+
+module InsertPosition = struct
+ type t =
+ | Above of Camlcoq.P.t
+ | Below of Camlcoq.P.t
+ let anchor = function
+ | Above x | Below x -> x
+
+ let pseudo_map pos ~f = match pos with
+ | Above x -> Above (f x)
+ | Below x -> Below (f x)
+ let compare x y =
+ match Camlcoq.P.compare (anchor x) (anchor y) with
+ | 0 -> (match x, y with
+ | Above _, Above _ | Below _, Below _ -> 0
+ | Above _, Below _ -> 1
+ | Below _, Above _ -> -1 )
+ | c -> c
+end
+
+module InsertPositionMap = Map.Make(InsertPosition)
+
+ let insert_code sb code pm (to_insert : RTL.instruction list InsertPositionMap.t) ~next_free_pc =
+ let old_debug_flag = !debug_flag in
+ debug_flag := false;
+ debug "Before code insertion:\n";
+ print_superblock sb code;
+ debug "\n"; flush_all ();
+
+ debug "Code to insert:\n";
+ InsertPositionMap.iter
+ (fun pos insts ->
+ debug "%s %d: "
+ (match pos with
+ | InsertPosition.Above _ -> "Above"
+ | InsertPosition.Below _ -> "Below")
+ (Camlcoq.P.to_int @@ InsertPosition.anchor pos)
+ ;
+ List.iter (fun inst ->
+ if !debug_flag then PrintRTL.print_instruction stdout (0, inst)) insts;
+ debug "\n"; flush_all ()
+ )
+ to_insert
+ ;
+
+ let next_free_pc =
+ let next_free_pc = ref next_free_pc in
+ (fun () ->
+ let pc = !next_free_pc in
+ next_free_pc := Camlcoq.P.succ !next_free_pc;
+ pc )
+ in
+ let original_length = Array.length sb.instructions in
+ let orig_sort_keys =
+ Duplicateaux.generate_fwmap
+ (Array.to_list sb.instructions)
+ (List.init original_length (fun i -> i * 2))
+ PTree.empty
+ in
+ let new_key pos =
+ let open InsertPosition in
+ let anchor_key =
+ InsertPosition.anchor pos
+ |> apply_map' orig_sort_keys
+ in
+ match pos with
+ | Above _ -> anchor_key - 1
+ | Below _ -> anchor_key + 1
+ in
+ let (code, pc_lists, sort_keys) =
+ InsertPositionMap.fold
+ (fun (pos : InsertPosition.t) insts (code, pc_lists, sort_keys) ->
+ let insts_length = List.length insts in
+ let key = new_key pos in
+ let pcs = List.init insts_length (fun _ -> next_free_pc ()) in
+ let new_sort_keys = List.init insts_length (fun _ -> key) in
+ let code =
+ ListLabels.fold_left2 pcs insts
+ ~init:code
+ ~f:(fun code pc inst -> PTree.set pc inst code)
+ in
+ let sort_keys = Duplicateaux.generate_fwmap pcs new_sort_keys sort_keys in
+ (code, pcs::pc_lists, sort_keys) )
+ to_insert
+ (code, [], orig_sort_keys)
+ in
+ let new_pcs = List.flatten pc_lists |> Array.of_list in
+ let last_instruction = [| sb.instructions.(original_length - 1) |] in
+ let upto_last =
+ if original_length > 1 then
+ Array.sub sb.instructions 0 (original_length - 1)
+ else [| |]
+ in
+ let instructions = Array.concat [upto_last; new_pcs; last_instruction ] in
+ let instructions_order = Array.copy instructions in
+ ArrayLabels.stable_sort instructions_order
+ ~cmp:(fun x y -> Int.compare (apply_map' sort_keys x) (apply_map' sort_keys y));
+
+ let new_length = Array.length instructions in
+ let fwmap = Duplicateaux.generate_fwmap (Array.to_list instructions_order) (Array.init new_length (fun i -> i) |> Array.to_list) PTree.empty in
+ let fwmap_pc =
+ Duplicateaux.generate_fwmap
+ (Array.to_list sb.instructions)
+ (Array.to_list @@ Array.map (fun pc -> instructions.(apply_map' fwmap pc)) sb.instructions)
+ PTree.empty
+ in
+ let liveins' =
+ PTree.fold
+ (fun liveins' pc live_regs ->
+ PTree.set (apply_map' fwmap_pc pc) live_regs liveins')
+ sb.liveins PTree.empty
+ in
+
+ let sb' = {sb with instructions = instructions; liveins = liveins'} in
+ let code = RTLpathScheduleraux.apply_schedule code sb' instructions_order in
+
+ let num_added = new_length - original_length in
+ let first_pc = sb.instructions.(0) in
+ let pi = get_some @@ PTree.get first_pc pm in
+ let module N = Camlcoq.Nat in
+ let new_size = N.to_int pi.psize + num_added |> N.of_int in
+ let pm = PTree.set first_pc {pi with psize = new_size} pm in
+
+ debug_flag := old_debug_flag;
+ (sb', code, pm, next_free_pc (), fwmap_pc)
+
+let prepend_nops_before_iconds sb code =
+ (* We need the a first and last instruction so that
+ * a) the pc of the superblock entry stays the same and
+ * b) apply_schedule correclty preserved the successors of the last instruction *)
+ if Array.length sb.instructions < 2 then
+ (* Early exit, this should probably be replaced by a more general exclusion of
+ * superblock with just one instruction. *)
+ InsertPositionMap.empty
+ else
+
+ (* TODO, probably only need it before side exits *)
+ let icond_pcs =
+ Array.to_list sb.instructions
+ |> List.filter (fun pc ->
+ let inst = get_some @@ PTree.get pc code in
+ is_icond inst )
+ in
+ let to_insert =
+ ListLabels.fold_left icond_pcs
+ ~init:InsertPositionMap.empty
+ ~f:(fun acc icond_pc -> InsertPositionMap.add (InsertPosition.Above icond_pc) [Inop Camlcoq.P.one] acc)
+ in
+ to_insert
+
+type renamed =
+ { old_name : reg
+ ; new_name : reg }
+
+let update_live_renames pc live_renames fwmap regs =
+ Regset.fold
+ (fun live_reg renames ->
+ match PTree.get live_reg fwmap with
+ | None -> renames
+ | Some(r) when r = live_reg -> renames
+ | Some(new_name) ->
+ let old = ptree_get_or_default renames pc [] in
+ let upd = {old_name = live_reg; new_name} :: old in
+ PTree.set pc upd renames )
+ regs
+ live_renames
+
+let my_merge_overwrite m1 m2 =
+ PTree.combine (fun x y -> match (x, y) with
+ | None, None -> None
+ | Some x, None
+ | None, Some x -> Some x
+ | Some _, Some y -> Some y
+ ) m1 m2
+
+let rename_regs ?only_rename sb code ~liveatentry ~next_free_reg =
+ let old_debug_flag = !debug_flag in
+
+ let length = Array.length sb.instructions in
+ assert (length > 0);
+ (* Early exit *)
+ if length = 1 then (code, PTree.empty, next_free_reg) else
+ (* The last instruction is treated in a special way because if it defines a register,
+ * that register cannot possibly be used afterwards in the path AND often we cannot
+ * insert restoration code later after it since it must remain at the end of the path.
+ * In the future, this may be resolved by only renaming registers which are used
+ * afterwards in path, which would exclude the register possibly assigned by the last
+ * instruction. *)
+ let last_pc = sb.instructions.(length - 1) in
+ let upto_last = Array.init (length - 1) (fun i -> sb.instructions.(i)) in
+ let liveatentry = Regset.elements liveatentry in
+ let fwmap = Duplicateaux.generate_fwmap liveatentry liveatentry PTree.empty in
+
+ let (code, fwmap, live_renames, next_free_reg) =
+ ArrayLabels.fold_left upto_last
+ ~init:(code, fwmap, PTree.empty, next_free_reg)
+ ~f:(fun (code, fwmap, live_renames, next_free_reg) pc ->
+ (* Rewrite instruction to use potentially renamed registers *)
+ let inst = get_some @@ PTree.get pc code in
+ let inst = change_arg_regs inst fwmap in
+ let (inst, fwmap, next_free_reg) =
+ maybe_change_dest_reg ?only_rename inst fwmap ~next_free_reg
+ in
+ let code = PTree.set pc inst code in
+
+ let (live_renames, fwmap)=
+ if is_icond inst then (
+ (* Pretend that registers that are live at an exit already have a definition, so
+ * this catches a couple of edge cases where an instruction was not renamed and
+ * could thus not be moved up. *)
+ let live_regs = Regset.elements @@ get_some @@ PTree.get pc sb.liveins in
+ let fwmap' = Duplicateaux.generate_fwmap live_regs live_regs PTree.empty in
+ let fwmap = my_merge_overwrite fwmap' fwmap in
+ (update_live_renames pc live_renames fwmap (get_some @@ PTree.get pc sb.liveins)
+ , fwmap)
+ ) else
+ (live_renames, fwmap)
+ in
+ (code, fwmap, live_renames, next_free_reg)
+ )
+ in
+
+ let last_inst = get_some @@ PTree.get last_pc code in
+ let last_inst = change_arg_regs last_inst fwmap in
+ let (last_inst, fwmap, next_free_reg) =
+ maybe_change_dest_reg ?only_rename last_inst fwmap ~next_free_reg
+ in
+ let code = PTree.set last_pc last_inst code in
+
+ let live_renames = update_live_renames last_pc live_renames fwmap sb.s_output_regs in
+
+ debug_flag := old_debug_flag;
+
+ (code, live_renames, next_free_reg)
+
+(* Pass over the superblock instruction in-order
+ * For each redefinition of a register, create a new register name and use that one from
+ * then on. There is one exception, if the last instruction defines a register, it will be
+ * left unchanged since the rest of the path cannot possibly use it.
+ * WARNING: This invalidates the superblock, it will need to be repaired with the
+ * information returned. *)
+let local_single_assignment sb code liveatentry ~next_free_reg =
+ let old_debug_flag = !debug_flag in
+
+ let (code, live_renames, next_free_reg) =
+ rename_regs sb code ~liveatentry ~next_free_reg
+ in
+
+ debug_flag := old_debug_flag;
+ (code, live_renames, next_free_reg)
+
+let final_restoration_code sb code live_renames =
+ let last_inst_pc = sb.instructions.(Array.length sb.instructions - 1) in
+ let last_inst = get_some @@ PTree.get last_inst_pc code in
+ let last_inst_is_basic = match last_inst with
+ | Icall _
+ | Ibuiltin _
+ | Ijumptable _
+ | Itailcall _
+ | Ireturn _
+ | Icond _ -> false
+ | _ -> true
+ in
+ let final_renames = ptree_get_or_default live_renames last_inst_pc [] in
+ let live_regs_opt = PTree.get last_inst_pc sb.liveins in
+ let (above, below) = let open Either in
+ ListLabels.partition_map final_renames
+ ~f:(fun {old_name; new_name} ->
+ let inst = Iop (Op.Omove, [new_name], old_name, Camlcoq.P.one) in
+ match last_inst_is_basic, live_regs_opt with
+ | true, _ ->
+ (* Printf.eprintf "Putting %d below basic inst\n" (Camlcoq.P.to_int old_name); *)
+ Right inst
+ | false, None ->
+ (* Printf.eprintf "Putting %d above unpredicted icond.\n" (Camlcoq.P.to_int old_name); *)
+ Left inst
+ | false, Some live_regs ->
+ if Regset.mem old_name live_regs then (
+ (* Printf.eprintf "Putting %d above icond\n" (Camlcoq.P.to_int old_name); *)
+ Left inst
+ ) else (
+ (* Printf.eprintf "Putting %d below icond\n" (Camlcoq.P.to_int old_name); *)
+ Right inst))
+ in
+ InsertPositionMap.empty
+ |> InsertPositionMap.add (InsertPosition.Above last_inst_pc) above
+ |> InsertPositionMap.add (InsertPosition.Below last_inst_pc) below
+
+let used_before_redefinition sb code ~offset ~reg =
+ let length = Array.length sb.instructions in
+ if not (offset < length) then
+ raise (Invalid_argument (Printf.sprintf "offset must be less than the superblock's length: %d is not less than %d" offset length))
+ ;
+ let finished = ref false in
+ let i = ref offset in
+ let res = ref false in
+ while (!i < length && not !finished) do
+ let inst = get_some @@ PTree.get sb.instructions.(!i) code in
+ if List.mem reg (RTL.instr_uses inst) then (
+ res := true;
+ finished := true;
+ ) else
+ ()
+ ;
+ let defined_reg = RTL.instr_defs inst in
+ (match defined_reg with
+ | None -> ()
+ | Some r -> if r = reg then finished := true else ());
+ i := !i + 1;
+ done;
+ !res
+
+type restoration_actions =
+ | Just_restore of renamed
+ | Restore_and_alias of renamed
+
+let restoration_instructions' sb code live_renames ~next_free_reg =
+ let next_free_reg =
+ let next_free_reg = ref next_free_reg in
+ (fun () ->
+ let r = !next_free_reg in
+ next_free_reg := Camlcoq.P.succ !next_free_reg;
+ r )
+ in
+
+ let length = Array.length sb.instructions in
+ let pc_to_idx =
+ Duplicateaux.generate_fwmap
+ (Array.to_list sb.instructions)
+ (List.init length (fun i -> i))
+ PTree.empty
+ in
+
+ let live_renames = PTree.map
+ (fun pc renames ->
+ let offset = apply_map' pc_to_idx pc in
+ List.map
+ (fun rename ->
+ let {old_name; new_name} = rename in
+ if used_before_redefinition sb code ~offset ~reg:old_name then
+ Restore_and_alias rename
+ else
+ Just_restore rename)
+ renames)
+ live_renames
+ in
+
+ let to_rename = PTree.map1
+ (fun renames ->
+ let old_names = List.filter_map (fun rename ->
+ match rename with
+ | Just_restore _ -> None
+ | Restore_and_alias {old_name; new_name} -> Some old_name) renames
+ in
+ let aliases = List.init (List.length old_names) (fun _ -> next_free_reg ()) in
+ let fwmap = Duplicateaux.generate_fwmap old_names aliases PTree.empty in
+ fwmap
+ )
+ live_renames
+ in
+
+ let to_insert_restore =
+ PTree.fold
+ (fun to_insert side_exit_pc renames ->
+ let alias_map = get_some @@ PTree.get side_exit_pc to_rename in
+ let insts = ListLabels.map renames
+ ~f:(fun rename ->
+ match rename with
+ | Just_restore {old_name; new_name} -> [Iop (Op.Omove, [new_name], old_name, Camlcoq.P.one)]
+ | Restore_and_alias {old_name; new_name} ->
+ [ Iop (Op.Omove, [old_name], apply_map' alias_map old_name, Camlcoq.P.one)
+ ; Iop (Op.Omove, [new_name], old_name, Camlcoq.P.one)] )
+ in
+ let insts = List.flatten insts in
+ InsertPositionMap.add (InsertPosition.Above side_exit_pc) insts to_insert)
+ live_renames
+ InsertPositionMap.empty
+ in
+ (to_insert_restore, to_rename, next_free_reg ())
+
+(* Assumes the path is in local single assignment form *)
+let intra_path_dependencies (sb : superblock) (code : code) =
+ let old_debug_flag = !debug_flag in
+ debug_flag := false;
+
+ (* Directly taken from RTLpathScheduleraux *)
+ let nr_instr = Array.length sb.instructions in
+ let trailer_length =
+ match PTree.get (sb.instructions.(nr_instr-1)) code with
+ | None -> 0
+ | Some ii ->
+ (match predicted_successor ii with
+ | Some _ -> 0
+ | None -> 1 )
+ in
+
+ let module IS = InstructionScheduler in
+ let opweights = OpWeights.get_opweights () in
+ let seqa =
+ ArrayLabels.map (Array.sub sb.instructions 0 (Array.length sb.instructions - trailer_length))
+ ~f:(fun i ->
+ (match PTree.get i code with
+ | Some ii -> ii
+ | None -> failwith "MyRTLpathScheduleraux.intra_path_dependencies"),
+ (match PTree.get i sb.liveins with
+ | Some s -> s
+ | None -> Regset.empty) )
+ in
+ let latency_constraints = PrepassSchedulingOracle.get_simple_dependencies opweights seqa in
+ debug "intra_path_dependencies for superblock:\n";
+ print_superblock sb code;
+ debug "\nlatency_constraints:\n\n";
+ ListLabels.iter latency_constraints
+ ~f:(fun {IS.instr_from; instr_to; latency} ->
+ debug "instr_from: %2d\ninstr_to: %4d\nlatency: %3d\n\n" instr_from instr_to latency );
+ flush_all ();
+
+ let deps =
+ ListLabels.fold_left latency_constraints
+ ~init:PTree.empty
+ ~f:(fun deps {IS.instr_from; instr_to; latency = _} ->
+ let to_pc = sb.instructions.(instr_to) in
+ let from_pc = sb.instructions.(instr_from) in
+ let inst_deps = ptree_get_or_default deps to_pc HashedSet.PSet.empty in
+ let inst_deps' = HashedSet.PSet.add from_pc inst_deps in
+ PTree.set to_pc inst_deps' deps )
+ in
+ debug_flag := old_debug_flag;
+ deps
+
+let transitive_dependencies deps pc =
+ let old_debug_flag = !debug_flag in
+ let immediate_deps = ptree_get_or_default deps pc HashedSet.PSet.empty in
+
+ let rec iter ~acc ~todo ~seen =
+ match todo with
+ | [] -> acc
+ | pc'::todo -> (* NB: todo has just shrunk *)
+ if HashedSet.PSet.contains seen pc' then
+ iter ~acc ~todo ~seen
+ else
+ let deps_of_dep = ptree_get_or_default deps pc' HashedSet.PSet.empty in
+ let new_todo = HashedSet.PSet.subtract deps_of_dep acc |> HashedSet.PSet.elements in
+ iter
+ ~acc:(HashedSet.PSet.add pc' acc)
+ ~todo:(new_todo @ todo)
+ ~seen:(HashedSet.PSet.add pc' seen)
+ in
+ let transitive_dependencies =
+ iter
+ ~acc:HashedSet.PSet.empty
+ ~todo:(HashedSet.PSet.elements immediate_deps)
+ ~seen:HashedSet.PSet.empty
+ in
+ debug_flag := old_debug_flag;
+ transitive_dependencies
+
+let moved_dependencies deps order side_exit_pc =
+ let old_debug_flag = !debug_flag in
+
+ let dependencies = transitive_dependencies deps side_exit_pc in
+ let side_exit_pc_idx = apply_map' order side_exit_pc in
+ let moved_dependencies =
+ HashedSet.PSet.filter
+ (fun pc ->
+ let dep_idx = apply_map' order pc in
+ dep_idx > side_exit_pc_idx )
+ dependencies
+ in
+ debug_flag := old_debug_flag;
+ moved_dependencies
+
+let update_liveins liveins live_renames =
+ PTree.map
+ (fun pc liveregs ->
+ match PTree.get pc live_renames with
+ | None -> liveregs
+ | Some renames ->
+ let old_to_new = ListLabels.fold_left renames
+ ~init:PTree.empty
+ ~f:(fun acc {old_name; new_name} -> PTree.set old_name new_name acc)
+ in
+ (* There doesn't seem to be a proper map function for Regset.t *)
+ let liveregs' =
+ Regset.fold
+ (fun r acc ->
+ let r' = ptree_get_or_default old_to_new r r in
+ Regset.add r' acc)
+ liveregs
+ Regset.empty
+ in
+ liveregs'
+ )
+ liveins
+
+let replace_iconds_by_ocmps sb code ~next_free_reg =
+ let module P = Camlcoq.P in
+ let (code, _previous_icond, next_free_reg) =
+ ArrayLabels.fold_left sb.instructions
+ ~init:(code, None, next_free_reg)
+ ~f:(fun (code, previous_icond_proxy_reg, next_free_reg) pc ->
+ let inst = get_some @@ PTree.get pc code in
+ match inst with
+ | Istore(chunk, addr, args, src, succ) ->
+ ( match previous_icond_proxy_reg with
+ | None -> (code, previous_icond_proxy_reg, next_free_reg)
+ | Some r ->
+ let istore' = Istore(chunk, addr, r::args, src, succ) in
+ let code' = PTree.set pc istore' code in
+ (code', previous_icond_proxy_reg, next_free_reg) )
+ | Icond(cond, args, ifso, ifnot, prediction) ->
+ (match prediction with
+ | None ->
+ (* Case only happens at the very end of the path; no transformation necessary *)
+ assert(sb.instructions.(Array.length sb.instructions - 1) = pc);
+ (code, None, next_free_reg)
+ | Some true -> (
+ let ocmp = match previous_icond_proxy_reg with
+ | None -> Iop((Op.Ocmp cond), args, next_free_reg, ifso)
+ | Some r -> Iop((Op.Ocmp cond), r::args, next_free_reg, ifso)
+ in
+ let code' = PTree.set pc ocmp code in
+ (code', Some next_free_reg, P.succ next_free_reg) )
+ | Some false -> (
+ let ocmp = match previous_icond_proxy_reg with
+ | None -> Iop((Op.Ocmp cond), args, next_free_reg, ifnot)
+ | Some r -> Iop((Op.Ocmp cond), r::args, next_free_reg, ifnot)
+ in
+ let code' = PTree.set pc ocmp code in
+ (code', Some next_free_reg, P.succ next_free_reg) ))
+ | Iload(trap, chunk, addr, args, dest, succ) -> (
+ if !Machine.config.Machine.has_non_trapping_loads then
+ (code, previous_icond_proxy_reg, next_free_reg)
+ else
+ match previous_icond_proxy_reg with
+ | None -> (code, previous_icond_proxy_reg, next_free_reg)
+ | Some r ->
+ let load' = Iload(trap, chunk, addr, r::args, dest, succ) in
+ let code' = PTree.set pc load' code in
+ (code', previous_icond_proxy_reg, next_free_reg) )
+ | _ -> (code, previous_icond_proxy_reg, next_free_reg)
+ )
+ in
+ (code, next_free_reg)
+
+let is_store = function
+ | Istore _ -> true
+ | _ -> false
+
+type heuristic_mode =
+ | Default
+ | Ignore_liveness
+ | Move_stores
+
+let ideal_schedule'' sb code mode =
+ let old_debug_flag = !debug_flag in
+
+ let dep_function = match mode with
+ | Default -> PrepassSchedulingOracle.get_simple_dependencies'
+ | Ignore_liveness -> PrepassSchedulingOracle.get_fake_deps_liveness
+ | Move_stores -> PrepassSchedulingOracle.get_fake_deps_liveness_stores
+ in
+
+ (* copy-paste from RTLpathScheduleraux.schedule_superblock *)
+ let nr_instr = Array.length sb.instructions in
+ let trailer_length =
+ match PTree.get (sb.instructions.(nr_instr-1)) code with
+ | None -> 0
+ | Some ii ->
+ match predicted_successor ii with
+ | Some _ -> 0
+ | None -> 1 in
+ let live_regs_entry = RTLpathScheduleraux.get_live_regs_entry sb code in
+ let seqa =
+ Array.map (fun i ->
+ (match PTree.get i code with
+ | Some ii -> ii
+ | None -> failwith "RTLpathScheduleraux.schedule_superblock"),
+ (match PTree.get i sb.liveins with
+ | Some s -> s
+ | None -> Regset.empty))
+ (Array.sub sb.instructions 0 (nr_instr-trailer_length))
+ in
+ let nr_scheduled_instr = nr_instr - trailer_length in
+ (* Copy-pasted from PrepassSchedulingOracle.schedule_sequence *)
+ let opweights = OpWeights.get_opweights () in
+ (* NB: Early exit *)
+ if (Array.length seqa) <= 1 then None else
+ (* Copy-pasted from PrepassSchedulingOracle.define_problem *)
+ let problem =
+ let deps = dep_function opweights seqa in
+ debug_flag := false;
+ debug "Fake deps:\n";
+ if !debug_flag then (
+ deps
+ |> List.iter (fun {InstructionScheduler.instr_from; instr_to; latency} ->
+ debug "%2d depends on %2d\n" (Camlcoq.P.to_int sb.instructions.(instr_to)) (Camlcoq.P.to_int sb.instructions.(instr_from)));
+ flush_all ();
+ );
+ { InstructionScheduler.max_latency = -1
+ ; resource_bounds = opweights.PrepassSchedulingOracleDeps.pipelined_resource_bounds
+ ; live_regs_entry = live_regs_entry
+ ; typing = sb.typing
+ ; reference_counting = Option.some @@ RTLpathScheduleraux.reference_counting seqa sb.s_output_regs sb.typing
+ ; instruction_usages = Array.map (PrepassSchedulingOracle.resources_of_instruction opweights) (Array.map fst seqa)
+ ; latency_constraints = deps }
+ in
+ match PrepassSchedulingOracle.prepass_scheduler_by_name
+ (!Clflags.option_fprepass_sched)
+ problem
+ (Array.map (fun (ins, _) ->
+ match ins with
+ | Icond _ -> true
+ | _ -> false) seqa)
+ with
+ | None ->
+ debug_flag := old_debug_flag;
+ failwith "no solution in prepass scheduling\n"
+ | Some solution ->
+ let positions = Array.init nr_scheduled_instr (fun i -> i) in
+ let final_time = solution.(nr_scheduled_instr) in
+ Array.sort (fun i j ->
+ let si = solution.(i) and sj = solution.(j) in
+ if si < sj then -1
+ else if si > sj then 1
+ else i - j) positions;
+
+ let ins' =
+ Array.append
+ (Array.map (fun i -> sb.instructions.(i)) positions)
+ (Array.sub sb.instructions (nr_instr - trailer_length) trailer_length) in
+ Some (ins', final_time)
+
+(* Improved scheduling heuristic which allows moving memory writes downwards by turning
+ * Iconds into Ocmps for the purpose of dependency calculations. *)
+let ideal_schedule' sb code ~next_free_reg =
+ let old_debug_flag = !debug_flag in
+ let (fake_code, _next_free_reg) = replace_iconds_by_ocmps sb code ~next_free_reg in
+ (* Does PTree.empty work or do I need to map the entries to Regset.empty *)
+ let fake_sb = { sb with liveins = PTree.empty } in
+ (* Copied from RTLpathScheduleraux.schedule_superblock *)
+ let nr_instr_sb = Array.length sb.instructions in
+ assert (nr_instr_sb = Array.length fake_sb.instructions);
+
+ let trailer_length =
+ match PTree.get (sb.instructions.(nr_instr_sb - 1)) code with
+ | None -> 0
+ | Some ii ->
+ match predicted_successor ii with
+ | Some _ -> 0
+ | None -> 1
+ in
+ let seqa =
+ (Array.map (fun i ->
+ (match PTree.get i code with
+ | Some ii -> ii
+ | None -> failwith "MyRTLpathScheduleraux.ideal_schedule'"),
+ (match PTree.get i sb.liveins with
+ | Some s -> s
+ | None -> Regset.empty))
+ (Array.sub sb.instructions 0 (nr_instr_sb - trailer_length)))
+ in
+ let fake_seqa =
+ (Array.map (fun i ->
+ (match PTree.get i fake_code with
+ | Some ii -> ii
+ | None -> failwith "MyRTLpathScheduleraux.ideal_schedule'"),
+ (match PTree.get i fake_sb.liveins with
+ | Some s -> s
+ | None -> Regset.empty))
+ (Array.sub fake_sb.instructions 0 (nr_instr_sb - trailer_length)))
+ in
+ (* Copied from PrepassSchedulingOracle.schedule_sequence *)
+ let opweights = OpWeights.get_opweights () in
+ (* WARNING: Early exit in case there is only on instruction to schedule *)
+ if (Array.length fake_seqa) <= 1 then None else
+ let nr_instr_fake_seqa = Array.length fake_seqa in
+ assert (nr_instr_fake_seqa = Array.length seqa);
+ let nr_instr_seqa = nr_instr_fake_seqa in
+ let store_idxs =
+ Array.to_list sb.instructions
+ |> List.mapi (fun i pc -> (i, get_some @@ PTree.get pc code))
+ |> List.filter (fun (_i, inst) -> is_store inst)
+ |> List.map (fun (i, _inst) -> i)
+ in
+ let side_exit_idxs = side_exit_idxs sb code in
+ let store_side_exit_limit =
+ ListLabels.map store_idxs
+ ~f:(fun st_idx ->
+ let first_se = List.find_opt (fun se_idx -> se_idx > st_idx) side_exit_idxs in
+ let second_se =
+ Option.bind
+ first_se
+ (fun se_idx -> List.find_opt (fun se_idx' -> se_idx' > se_idx) side_exit_idxs)
+ in
+ second_se )
+ in
+ let store_side_exit_deps =
+ ListLabels.map2 store_idxs store_side_exit_limit
+ ~f:(fun st_idx se_idx_opt ->
+ let module IS = InstructionScheduler in
+ match se_idx_opt with
+ | None -> None
+ | Some se_idx -> Some {IS.instr_to = se_idx; instr_from = st_idx; latency = 0} )
+ |> List.filter_map (fun x -> x)
+ in
+ (* Copied from PrepassSchedulingOracle.define_problem *)
+ let fake_deps = PrepassSchedulingOracle.get_simple_dependencies opweights fake_seqa in
+ let fake_deps = store_side_exit_deps @ fake_deps in
+
+ let problem =
+ { InstructionScheduler.max_latency = -1
+ ; live_regs_entry = RTLpathScheduleraux.get_live_regs_entry fake_sb fake_code
+ ; typing = fake_sb.typing
+ ; reference_counting = Some (RTLpathScheduleraux.reference_counting fake_seqa fake_sb.s_output_regs fake_sb.typing)
+ ; resource_bounds = opweights.PrepassSchedulingOracleDeps.pipelined_resource_bounds
+ ; instruction_usages = Array.map (PrepassSchedulingOracle.resources_of_instruction opweights) (Array.map fst seqa)
+ ; latency_constraints = fake_deps }
+
+ in
+ let scheduled_sequence =
+ match
+ PrepassSchedulingOracle.prepass_scheduler_by_name
+ (!Clflags.option_fprepass_sched)
+ problem
+ (Array.map (fun (ins, _) ->
+ match ins with
+ | Icond _ -> true
+ | _ -> false)
+ seqa)
+ with
+ | None -> None
+ | Some solution ->
+ (* Printf.eprintf "Scheduling instruction sequence of length: %d\n" nr_instructions; flush_all ();
+ Printf.eprintf "Result: %d\n" solution.(nr_instructions); flush_all (); *)
+ let positions = Array.init nr_instr_seqa (fun i -> i) in
+ let final_time = solution.(nr_instr_seqa) in
+ Array.sort (fun i j ->
+ let si = solution.(i) and sj = solution.(j) in
+ if si < sj then -1
+ else if si > sj then 1
+ else i - j) positions;
+ Some (positions, final_time)
+ in
+
+ debug_flag := old_debug_flag;
+
+ match scheduled_sequence with
+ | None -> None
+ | Some (order, final_time) ->
+ let ins' =
+ Array.append
+ (Array.map (fun i -> sb.instructions.(i)) order)
+ (Array.sub sb.instructions (nr_instr_sb - trailer_length) trailer_length) in
+ Some (ins', final_time)
+
+(* "ideal" *)
+let ideal_schedule sb code =
+ let schedule =
+ RTLpathScheduleraux.schedule_superblock
+ {sb with liveins = PTree.map (fun n _regs -> Regset.empty) sb.liveins}
+ code
+ in
+ schedule
+
+let merge_append _ x y = match x, y with
+ | None, None -> None
+ | Some x, None | None, Some x -> Some x
+ | Some x, Some y -> Some (x @ y)
+
+(* Turns a tree of dependencies (pc -> [pcs; that; depend; on pc]) into a tree of uses by
+ * "inverting" tree.
+ * Now, each pc has the pcs associated to it that depend on it, according to the original
+ * tree. *)
+let uses_of_deps p_ptree =
+ PTree.fold
+ (fun acc p vs ->
+ let acc = HashedSet.PSet.fold
+ (fun acc v ->
+ let old = ptree_get_or_default acc v HashedSet.PSet.empty in
+ let upd = HashedSet.PSet.add p old in
+ PTree.set v upd acc)
+ vs
+ acc
+ in
+ acc
+ )
+ p_ptree
+ PTree.empty
+
+(* Returns for every side-exit pc, a list of instructions that should be executed beforehand *)
+let downschedule_compensation_code sb code pm live_renames ~next_free_pc ~next_free_reg =
+ (* TODO: Right now we are copying the instructions even if there are duplicates per
+ * basic block. This leads to an issue where two identical memory writes lead to
+ * non-matching symbolic memory states.
+ * As a work-around we could eliminate the first/original memory write, this would
+ * allow moving memory writes at least one side-exit further down, but not
+ * farther. For that we would need to refine the symbolic memory model/evaluation
+ * which might be related to memory aliasing in general?!
+ * For now however, we simply use a more restrictive ideal_schedule function which
+ * does not propose moving memory writes below side-exits. *)
+ let old_debug_flag = !debug_flag in
+
+ let mode = match !Clflags.option_prepass_past_side_exits_sched with
+ | "move_stores" -> Move_stores
+ | "no_move_stores" -> Ignore_liveness
+ | _ -> failwith "Unsupported option for scheduling code past side exits"
+ in
+ let sb_with_liveins = {sb with liveins = update_liveins sb.liveins live_renames} in
+ match ideal_schedule'' sb_with_liveins code mode, ideal_schedule'' sb_with_liveins code Default with
+ | None, None -> InsertPositionMap.empty (* Early Exit*)
+ | None, Some _ | Some _, None -> failwith "downschedule_compensation_code: Scheduling procedure failed."
+ | Some (idealized_schedule, idealized_final_time)
+ , Some (default_schedule, default_final_time) ->
+ if idealized_final_time >= default_final_time then (
+ if (!debug_flag && idealized_final_time > default_final_time) then (
+ debug "Unexpectedly, idealized dependencies lead to a worse expected final time.\n";
+ debug "idealized_final_time = %d, default_final_time = %d\n" idealized_final_time default_final_time;
+ debug "For superblock %d" (Camlcoq.P.to_int sb.instructions.(0));
+ failwith "Unexpectedly bad final time for idealized schedule";
+ );
+ (* Early exit *)
+ InsertPositionMap.empty
+ ) else (
+ let sb_length = Array.length sb.instructions in
+ let pc_to_idx =
+ Duplicateaux.generate_fwmap
+ (Array.to_list sb.instructions)
+ (List.init sb_length (fun i -> i))
+ PTree.empty
+ in
+ let pc_to_idx' =
+ Duplicateaux.generate_fwmap
+ (Array.to_list idealized_schedule)
+ (List.init sb_length (fun i -> i))
+ PTree.empty
+ in
+
+ let side_exit_pcs = side_exit_pcs sb code in
+
+ (* NB: for the purpose of this heuristic we consider the superblock to include the final
+ * restoration code. sb has changed. *)
+ let liveins' = update_liveins sb.liveins live_renames in
+ (* Use the new names to calculate proper dependencies *)
+ let path_deps = intra_path_dependencies {sb with liveins = liveins'} code in
+ let path_deps_without_iconds = PTree.map
+ (fun _pc deps ->
+ HashedSet.PSet.filter
+ (fun dep_pc -> not @@ is_icond @@ get_some @@ PTree.get dep_pc code)
+ deps)
+ path_deps
+ in
+ let transitive_path_deps_without_iconds =
+ PTree.map (fun pc _deps -> transitive_dependencies path_deps_without_iconds pc)
+ path_deps
+ in
+ let transitive_uses_without_iconds = uses_of_deps transitive_path_deps_without_iconds in
+
+ (* For each side-exit, check if all the dependencies are still above it
+ * if not, remember the pc and transitively consider its dependencies until
+ * no further insts to be covered are discovered *)
+ let side_exit_and_compensation =
+ ListLabels.map side_exit_pcs
+ ~f:(fun side_exit_pc ->
+ let moved_deps = moved_dependencies path_deps pc_to_idx' side_exit_pc in
+ let moved_deps_sorted =
+ ListLabels.sort (HashedSet.PSet.elements moved_deps)
+ ~cmp:(fun pc1 pc2 -> Int.compare (apply_map' pc_to_idx pc1) (apply_map' pc_to_idx pc2))
+ in
+ (* The sucessors are *incorrect* at this point *)
+ (side_exit_pc, moved_deps_sorted) )
+ in
+ let (side_exit_pcs, insts_pcs) = List.split side_exit_and_compensation in
+
+ let to_insert_pcs =
+ ListLabels.fold_left2 side_exit_pcs insts_pcs
+ ~init:InsertPositionMap.empty
+ ~f:(fun acc side_exit_pc pcs ->
+ InsertPositionMap.add (InsertPosition.Above side_exit_pc) pcs acc) in
+ let (to_insert_as_well : Camlcoq.P.t list InsertPositionMap.t)=
+ InsertPositionMap.fold
+ (fun side_exit_pc pcs_to_insert acc ->
+ let side_exit_pc = InsertPosition.anchor side_exit_pc in
+ let ( let* ) = Option.bind in
+ let collateral_moves =
+ pcs_to_insert
+ |> List.filter_map (fun pc ->
+ let* uses = PTree.get pc transitive_uses_without_iconds in
+ HashedSet.PSet.filter
+ (fun pc ->
+ not @@ List.mem pc pcs_to_insert
+ && apply_map' pc_to_idx pc < apply_map' pc_to_idx side_exit_pc
+ && (not @@ is_icond @@ get_some @@ PTree.get pc code))
+ uses
+ |> Option.some)
+ |> ListLabels.fold_left ~f:HashedSet.PSet.union ~init:HashedSet.PSet.empty
+ in
+ let collateral_moves = ListLabels.sort (HashedSet.PSet.elements collateral_moves)
+ ~cmp:(fun pc1 pc2 -> Int.compare (apply_map' pc_to_idx pc1) (apply_map' pc_to_idx pc2))
+ in
+ (* TODO?
+ * In principle we should be able to move the instructions Below, unless they are
+ * live (liveins') at the side exit.
+ * But adding them above is simpler and unnecessary instructions should be removed
+ * by DCE. *)
+ let acc = InsertPositionMap.add (InsertPosition.Above side_exit_pc) collateral_moves acc in
+ acc)
+ to_insert_pcs
+ InsertPositionMap.empty
+ in
+ let to_insert_pcs = InsertPositionMap.merge merge_append to_insert_pcs to_insert_as_well in
+ let num_probably_duplicated = InsertPositionMap.fold (fun _pos pcs n ->
+ n + List.length pcs)
+ to_insert_pcs
+ 0
+ in
+ let gain = default_final_time - idealized_final_time in
+ assert(gain > 0);
+ if gain * !Clflags.option_fliftif < num_probably_duplicated then (
+ debug_flag := false;
+ debug "Expected number of cycles gained, %d, not considered worth the code duplication, expected at %d instructions.\n"
+ gain num_probably_duplicated;
+ debug_flag := old_debug_flag;
+ InsertPositionMap.empty
+ ) else (
+ debug_flag := old_debug_flag;
+ to_insert_pcs
+ ))
+
+let my_merge_no_overwrite m1 m2 =
+ PTree.combine (fun x y -> match (x, y) with
+ | None, None -> None
+ | Some x, None
+ | None, Some x -> Some x
+ | Some x, Some y ->
+ if x = y then Some x
+ else failwith "Merge conflict."
+ ) m1 m2
+
+let print_schedule schedule =
+ debug "Schedule\n";
+ Array.iter (fun pos -> debug "%d\n" (Camlcoq.P.to_int pos)) schedule;
+ debug "\n";
+ flush_all ();
+;;
+
+(* Walk through sb and find those register which possibly take on different values
+ * i.e. which are written to twice. *)
+let find_mutated_registers (sb : superblock) code input_regs : Regset.t =
+ let (defined, defined_multiple) =
+ ArrayLabels.fold_left sb.instructions
+ ~init:(input_regs, Regset.empty)
+ ~f:(fun (defined, defined_multiple) pc ->
+ let inst = get_some @@ PTree.get pc code in
+ match RTL.instr_defs inst with
+ | None -> (defined, defined_multiple)
+ | Some(r) ->
+ if Regset.mem r defined then
+ let defined_multiple = Regset.add r defined_multiple in
+ (defined, defined_multiple)
+ else
+ let defined = Regset.add r defined in
+ (defined, defined_multiple)
+ )
+ in
+ defined_multiple
+
+(* Map each register in regs to the index of its first definition in the superblock
+ * Returns: mapping from a register to the index of the definition that first defines it
+ * in the superblock *)
+let find_first_definition (sb : superblock) code (regs : Regset.t) : int PTree.t =
+ let (regs, first_defs, _index) =
+ ArrayLabels.fold_left sb.instructions
+ ~init:(regs, PTree.empty, 0)
+ ~f:(fun (regs, first_defs, index) pc ->
+ let inst = get_some @@ PTree.get pc code in
+ match RTL.instr_defs inst with
+ | None -> (regs, first_defs, index + 1)
+ | Some(r) ->
+ if Regset.mem r regs then
+ let regs = Regset.remove r regs in
+ let first_defs = PTree.set r index first_defs in
+ (regs, first_defs, index + 1)
+ else
+ (regs, first_defs, index + 1) )
+ in
+ assert (Regset.empty = regs);
+ first_defs
+
+let print_int_ptree pt =
+ let module P = Camlcoq.P in
+ if not !debug_flag then () else
+ debug "Mappings, P.t -> Int.t:\n";
+ List.iter (fun (p, i) -> debug "%d |-> %d\n" (P.to_int p) i) (PTree.elements pt)
+
+(* [def_index], the index in the superblock of the instruction defining [r] is set to -1
+ * if the register is first defined outside of the path *)
+let is_read_after_definition (sb : superblock) code (r : reg) (def_index) : bool =
+ let start = if def_index < 0 then 0 else def_index + 1 in
+ let stop = Array.length sb.instructions in
+ let rec aux n =
+ if n > stop then false else
+
+ let pc = sb.instructions.(n) in
+ let inst = get_some @@ PTree.get pc code in
+ if List.mem r @@ RTL.instr_uses inst then
+ true
+ else
+ aux (n + 1)
+ in
+ aux start
+
+let registers_to_alias (sb : superblock) code pm =
+ let first_pc = sb.instructions.(0) in
+ let pi = get_some @@ PTree.get first_pc pm in
+ let sb_length = Array.length sb.instructions in
+ let mutated_registers = find_mutated_registers sb code pi.input_regs in
+ (* Registers are of interest if they would need to be restored after renaming *)
+ let registers_of_interest = Regset.inter sb.s_output_regs mutated_registers in
+ let first_defs_in_sb = find_first_definition sb code registers_of_interest in
+ (* If the register is first defined by the last instruction of the path, it does not
+ * need to be aliased, since it cannot possibly be used afterwards in the superblock.
+ * Furthermore, the renaming pass won't rename this register, in case the definition
+ * happens by a path-ending instruction in which case it would be impossible to place
+ * restoration code afterwards *)
+ let registers_to_alias =
+ registers_of_interest
+ |> Regset.filter (fun r -> if apply_map' first_defs_in_sb r = sb_length - 1 then false else true)
+ in
+ registers_to_alias
+
+let add_aliasing_code sb code pm : RTL.instruction list InsertPositionMap.t =
+ let first_pc = sb.instructions.(0) in
+ let pi = get_some @@ PTree.get first_pc pm in
+ let to_alias = registers_to_alias sb code pm in
+ let (initial, other) = Regset.partition (fun r -> Regset.mem r pi.input_regs) to_alias in
+ let to_insert =
+ (* If registers that are already live at the beginning of the superblock need to be
+ * aliased, they need to be inserted before (above) the first instruction of the
+ * superblock. *)
+ InsertPositionMap.singleton
+ (InsertPosition.Above sb.instructions.(0))
+ (Regset.elements initial |> List.map (fun r -> Iop (Op.Omove, [r], r, Camlcoq.P.one)))
+ in
+
+ let first_def_other = find_first_definition sb code other in
+ let to_insert = PTree.fold
+ (fun to_insert r idx ->
+ let pos = (InsertPosition.Below sb.instructions.(idx)) in
+ let old =
+ match InsertPositionMap.find_opt pos to_insert with
+ | None -> []
+ | Some v -> v
+ in
+ let upd = (Iop (Op.Omove, [r], r, Camlcoq.P.one)) :: old in
+ InsertPositionMap.add pos upd to_insert)
+ first_def_other
+ to_insert
+ in
+ to_insert
+
+type icond_frame =
+ { inop_idx : int
+ ; icond_idx : int }
+
+let find_icond_frames (sb : superblock) code =
+ let (_last_inop_idx, frames, _i) =
+ ArrayLabels.fold_left sb.instructions
+ ~init:(None, [], 0)
+ ~f:(fun (last_inop_idx, frames, i) pc ->
+ let inst = get_some @@ PTree.get pc code in
+ match inst with
+ | Inop _ -> (Some i, frames, i + 1)
+ | Icond _ ->
+ ( match last_inop_idx with
+ | None -> (None, frames, i + 1)
+ | Some inop_idx -> (None, {inop_idx; icond_idx = i} :: frames, i + 1) )
+ | _ -> (last_inop_idx, frames, i + 1) )
+ in
+ frames
+
+let stage_duplication sb code staged_dupcode staged_revmap ~next_free_pc =
+ let module D = Duplicateaux in
+ (* let module P = Camlcoq.P in *)
+ let icond_frames = find_icond_frames sb code in
+ let (code, staged_dupcode, staged_revmap, next_free_pc) =
+ ListLabels.fold_left icond_frames
+ ~init:(code, staged_dupcode, staged_revmap, next_free_pc)
+ ~f:(fun (code, staged_dupcode, staged_revmap, next_free_pc) {inop_idx; icond_idx} ->
+ if (* icond_idx = Array.length sb.instructions - 1 (* Do not lift code before end of path *)
+ || *) icond_idx = inop_idx + 1 then (* Sentinel value that no code needs to be duplicated *)
+ (* do nothing *)
+ (code, staged_dupcode, staged_revmap, next_free_pc)
+ else
+ let pcs_to_copy = Array.sub sb.instructions (inop_idx + 1) (icond_idx - inop_idx) in
+ let (staged_dupcode', staged_revmap', dupcode, fwmap, next_free_pc) = D.clone_only_new code next_free_pc (Array.to_list pcs_to_copy) in
+ let staged_dupcode = my_merge_no_overwrite staged_dupcode staged_dupcode' in
+ let staged_revmap = my_merge_no_overwrite staged_revmap staged_revmap' in
+ let parental_icond = get_some @@ PTree.get sb.instructions.(icond_idx) code in
+ let (useless_icond, staged_icond) =
+ match parental_icond with
+ | Icond(cond, args, ifso, ifnot, info) ->
+ let staged_icond = Icond(cond, args, Camlcoq.P.of_int @@ List.hd dupcode, pcs_to_copy.(0), info) in
+ let useless_icond = Icond(cond, args, pcs_to_copy.(0), pcs_to_copy.(0), info) in
+ (useless_icond, staged_icond)
+ | _ -> failwith "Instruction was expected to be Icond, but is not"
+ in
+ let staged_dupcode = PTree.set sb.instructions.(inop_idx) staged_icond staged_dupcode in
+
+ let code = PTree.set sb.instructions.(inop_idx) useless_icond code in
+
+ (code, staged_dupcode, staged_revmap, next_free_pc) )
+ in
+ (code, staged_dupcode, staged_revmap, next_free_pc)
+
+(* TODO? better name *)
+let apply_aliases sb code name_map ~offset =
+ let code = ref code in
+ let name_map = ref name_map in
+ let length = Array.length sb.instructions in
+
+ (* TODO: preferably there was an early exit condition when there is nothing left to do *)
+ for i = offset to length - 1 do
+ let pc = sb.instructions.(i) in
+ let inst = get_some @@ PTree.get pc !code in
+ let inst' = change_arg_regs inst !name_map in
+ code := PTree.set pc inst' !code;
+ name_map :=
+ (match RTL.instr_defs inst with
+ | None -> !name_map
+ | Some r ->
+ if Option.is_some @@ PTree.get r !name_map then (
+ (* The restoration code, is no longer incorrectly applicable *)
+ if not !Clflags.option_fpoormansssa then
+ PTree.remove r !name_map
+ else
+ !name_map
+ ) else
+ !name_map)
+ ;
+ done;
+ !code
+
+let scheduler f =
+ (* let module D = Duplicateaux in
+ let module P = Camlcoq.P in
+ let module N = Camlcoq.Nat in *)
+ (* TODO: - control for amount of code duplication *)
+ let open! Duplicateaux in
+ let f_rtl = f.fn_RTL in
+ let code = f_rtl.fn_code in
+ let _orig_code = code in
+ let id_ptree = PTree.map (fun n i -> n) (f.fn_path) in
+ let entry = f.fn_RTL.fn_entrypoint in
+ let pm = f.fn_path in
+ let do_nothing = ((((code, entry), pm), id_ptree), None) in
+ (* TODO: Add flag to select between "conserative" downard scheduling and the one that
+ * allows moving memory stores below the next side exit (singular). *)
+ if not !Clflags.option_fpoormansssa && !Clflags.option_fliftif < 1 then do_nothing
+ else (* NB: Early exit above *)
+ let _orig_pm = pm in
+ let typing = get_ok @@ RTLtyping.type_function f.fn_RTL in
+ let is_loop_header = Duplicateaux.get_loop_headers code (f_rtl.fn_entrypoint) in
+ let inner_loops = Duplicateaux.get_inner_loops f_rtl code is_loop_header in
+ (* inner loop map: map loop headers to inner loops *)
+ let ilmap = generate_fwmap (List.map (fun il -> il.head) inner_loops) inner_loops PTree.empty in
+ let superblocks = get_superblocks code entry pm typing in
+
+ (* Get only those superblocks that span an inner loop *)
+ let superblocks =
+ if not !Clflags.option_ftargetinnerloops then
+ superblocks
+ else
+ List.filter (fun sb ->
+ (* sanity check; a superblock contains at least one instruction *)
+ assert (Array.length sb.instructions >= 1);
+ (* Check if first instruction of a superblock is the beginning of an inner loop*)
+ let first_pc = sb.instructions.(0) in
+ match PTree.get first_pc ilmap with
+ | None -> false
+ | Some(il) -> (* List.length il.body = Array.length sb.instructions *)
+ (match il.sb_final with
+ (* don't bother if the loop is not predicted to loop *)
+ | None -> false
+ | Some pc ->
+ pc == first_pc (* loop is spanned by the superblock *)
+ (* Make sure the loop does not exceed the superblock *)
+ && List.length il.body = Array.length sb.instructions
+ )
+ ) superblocks
+ in
+
+ let next_free_pc = next_free_pc code |> Camlcoq.P.of_int in
+ let next_free_reg = max_reg_function f.fn_RTL |> Camlcoq.P.succ in
+
+ (* Apply aliasing code *)
+ let old_debug_flag = !debug_flag in
+ debug_flag := false;
+ debug "Initial code.\n";
+ print_code code;
+ print_path_map pm;
+ print_superblocks superblocks code;
+ debug "\n";
+ flush_all ();
+
+ (* TODO: Is this extra aliasing logic really useless? *)
+ (* let (superblocks, code, pm, next_free_pc) =
+ ListLabels.fold_left superblocks
+ ~init:([], code, pm, next_free_pc)
+ ~f:(fun (superblocks, code, pm, next_free_pc) sb ->
+ let to_insert = add_aliasing_code sb code pm in
+ let (sb', code', pm', next_free_pc') = insert_code sb code pm to_insert ~next_free_pc in
+ (sb'::superblocks, code', pm', next_free_pc') )
+ in
+ debug "After adding aliasing code.\n";
+ print_code code;
+ print_path_map pm;
+ print_superblocks superblocks code;
+ debug "\n";
+ flush_all (); *)
+
+ let (superblocks, code, pm, next_free_pc) =
+ ListLabels.fold_left superblocks
+ ~init:([], code, pm, next_free_pc)
+ ~f:(fun (superblocks, code, pm, next_free_pc) sb ->
+ let to_insert = prepend_nops_before_iconds sb code in
+ let (sb', code', pm', next_free_pc', _) = insert_code sb code pm to_insert ~next_free_pc in
+ (sb'::superblocks, code', pm', next_free_pc') )
+ in
+ debug "After adding nops before Iconds.\n";
+ print_code code;
+ print_path_map pm;
+ print_superblocks superblocks code;
+ debug "\n";
+ flush_all ();
+
+ let (code, sb_renamings, next_free_reg) =
+ if not !Clflags.option_fpoormansssa then
+ (code, List.map (fun sb -> (sb, PTree.empty)) superblocks, next_free_reg)
+ else (
+ debug "pmSSA path\n"; flush_all ();
+ ListLabels.fold_left superblocks
+ ~init:(code, [], next_free_reg)
+ ~f:(fun (code, sb_renamings, next_free_reg) sb ->
+ let first_pc = sb.instructions.(0) in
+ let pi = get_some @@ PTree.get first_pc pm in
+ let (code, live_renames, next_free_reg) = local_single_assignment sb code pi.input_regs ~next_free_reg in
+ (code, (sb, live_renames)::sb_renamings, next_free_reg)
+ ) )
+ in
+
+ debug "After renaming :\n";
+ print_code code;
+ print_path_map pm;
+ print_superblocks (fst @@ List.split sb_renamings) code;
+ debug "\n";
+ flush_all ();
+
+
+ let (sb_renamings, code, pm, next_free_pc) = ListLabels.fold_left sb_renamings
+ ~init:([], code, pm, next_free_pc)
+ ~f:(fun (sbs, code, pm, next_free_pc) (sb, live_renames) ->
+ let final_restoration = final_restoration_code sb code live_renames in
+ let (sb, code, pm, next_free_pc, fwmap) = insert_code sb code pm final_restoration ~next_free_pc in
+ let live_renames =
+ PTree.fold
+ (fun acc pc insts ->
+ let pc' = apply_map' fwmap pc in
+ (* Remove final renames, which were just inserted *)
+ let insts = if Camlcoq.P.eq pc sb.instructions.(Array.length sb.instructions - 1) then [] else insts in
+ PTree.set pc' insts acc)
+ live_renames
+ PTree.empty
+ in
+ ((sb, live_renames)::sbs, code, pm, next_free_pc)
+ )
+ in
+
+ debug "After inserting the final restoration code:\n";
+ (* print_code code; *)
+ (* print_path_map pm; *)
+ print_superblocks (fst @@ List.split sb_renamings) code;
+ debug "\n";
+ flush_all ();
+
+ let sb_tocompensatepcs_liverenames =
+ ListLabels.map sb_renamings
+ ~f:(fun (sb, live_renames) ->
+ let (to_insert_compensation_pcs, live_renames) =
+ if !Clflags.option_prepass_past_side_exits then
+ (downschedule_compensation_code sb code pm live_renames ~next_free_pc ~next_free_reg
+ , live_renames )
+ else
+ (InsertPositionMap.empty, live_renames)
+ in
+ (sb, to_insert_compensation_pcs, live_renames ) )
+ in
+
+ let (sb_tocompensate_liverenames, code, next_free_reg) = ListLabels.fold_left sb_tocompensatepcs_liverenames
+ ~init:([], code, next_free_reg)
+ ~f:(fun (sbs, code, next_free_reg) (sb, to_compensate_pcs, live_renames) ->
+ if !Clflags.option_fpoormansssa then (
+ let to_compensate = InsertPositionMap.map (fun pcs ->
+ let insts = List.map (fun pc -> get_some @@ PTree.get pc code) pcs in
+ insts)
+ to_compensate_pcs
+ in
+ let code = InsertPositionMap.fold
+ (fun _pos pcs code->
+ List.fold_left (fun code pc -> PTree.set pc (Inop Camlcoq.P.one) code)
+ code
+ pcs)
+ to_compensate_pcs
+ code
+ in
+ ((sb, to_compensate, live_renames)::sbs, code, next_free_reg)
+ ) else (
+ assert (PTree.elements live_renames |> List.for_all (fun (_, l) -> l = []));
+ let dup_count = InsertPositionMap.fold
+ (fun _pos (pcs : Camlcoq.P.t list) acc ->
+ let acc = ListLabels.fold_left pcs
+ ~init:acc
+ ~f:(fun acc pc ->
+ let old = ptree_get_or_default acc pc 0 in
+ PTree.set pc (old + 1) acc)
+ in
+ acc )
+ to_compensate_pcs
+ PTree.empty
+ in
+ let pcs_dupd_twice_or_more =
+ PTree.filter1 (fun n -> n > 1) dup_count
+ |> PTree.elements
+ |> List.map fst
+ in
+ let arg_regs = ListLabels.fold_left pcs_dupd_twice_or_more
+ ~init:(Regset.empty)
+ ~f:(fun acc pc ->
+ let inst = get_some @@ PTree.get pc code in
+ RTL.instr_uses inst
+ |> List.fold_left
+ (fun acc reg -> Regset.add reg acc)
+ acc
+ )
+ in
+ let pi = get_some @@ PTree.get sb.instructions.(0) pm in
+ let (code, live_renames, next_free_reg) = rename_regs ~only_rename:arg_regs sb code ~liveatentry:pi.input_regs ~next_free_reg in
+ let to_compensate = InsertPositionMap.map (fun pcs ->
+ let insts = List.map (fun pc -> get_some @@ PTree.get pc code) pcs in
+ insts)
+ to_compensate_pcs
+ in
+ let code = InsertPositionMap.fold
+ (fun _pos pcs code->
+ List.fold_left (fun code pc -> PTree.set pc (Inop Camlcoq.P.one) code)
+ code
+ pcs)
+ to_compensate_pcs
+ code
+ in
+ ((sb, to_compensate, live_renames)::sbs, code, next_free_reg) ))
+ in
+
+ (* Insert the compensation code (downward scheduling) and update the restoration code
+ * information to reflect the new pcs. *)
+ let (superblocks_liverenames, code, pm, next_free_pc) = ListLabels.fold_left sb_tocompensate_liverenames
+ ~init:([], code, pm, next_free_pc)
+ ~f:(fun (sbs, code, pm, next_free_pc) (sb, to_insert_compensation, live_renames) ->
+ let (sb, code, pm, next_free_pc, fwmap) = insert_code sb code pm to_insert_compensation ~next_free_pc in
+ let live_renames =
+ PTree.fold
+ (fun acc pc insts ->
+ let pc' = apply_map' fwmap pc in
+ PTree.set pc' insts acc)
+ live_renames
+ PTree.empty
+ in
+ ((sb, live_renames)::sbs, code, pm, next_free_pc) )
+ in
+
+ debug "After inserting the compensation code:\n";
+ (* print_code code; *)
+ (* print_path_map pm; *)
+ print_superblocks (fst @@ List.split superblocks_liverenames) code;
+ debug "\n"; flush_all ();
+
+ (* Insert the restoration code *)
+ let (superblocks, code, pm, next_free_pc, next_free_reg) = ListLabels.fold_left superblocks_liverenames
+ ~init:([], code, pm, next_free_pc, next_free_reg)
+ ~f:(fun (sbs, code, pm, next_free_pc, next_free_reg) (sb, live_renames) ->
+ let (sb, code, pm, next_free_pc, live_renames) =
+ if !Clflags.option_fpoormansssa then
+ (* Final restoration code was already inserted. *)
+ (sb, code, pm, next_free_pc, live_renames)
+ else
+ (* Ther combination of code motion below side exits WITHOUT register renaming may
+ * cause some restoration code to be necessary. Otherwise it is not safe to
+ * duplicate instructions.
+ * The final restoration code is special since it may insert below. *)
+ let final_restoration = final_restoration_code sb code live_renames in
+ let (sb, code, pm, next_free_pc, fwmap) = insert_code sb code pm final_restoration ~next_free_pc in
+ let live_renames =
+ PTree.fold
+ (fun acc pc insts ->
+ let pc' = apply_map' fwmap pc in
+ (* Remove final renames, which were just inserted *)
+ let insts = if Camlcoq.P.eq pc sb.instructions.(Array.length sb.instructions - 1) then [] else insts in
+ PTree.set pc' insts acc)
+ live_renames
+ PTree.empty
+ in
+ (sb, code, pm, next_free_pc, live_renames)
+ in
+ let pc_to_idx = Duplicateaux.generate_fwmap
+ (Array.to_list sb.instructions)
+ (List.init (Array.length sb.instructions) (fun i -> i))
+ PTree.empty
+ in
+ let (to_insert_restoration, to_rename, next_free_reg) = restoration_instructions' sb code live_renames ~next_free_reg in
+ let side_exit_pcs = side_exit_pcs sb code in
+ let code = ListLabels.fold_left side_exit_pcs
+ ~init:code
+ ~f:(fun code side_exit_pc ->
+ match PTree.get side_exit_pc to_rename with
+ | None -> code
+ | Some aliases ->
+ let idx = apply_map' pc_to_idx side_exit_pc in
+ let code = apply_aliases sb code aliases ~offset:idx in
+ code)
+ in
+ let (sb, code, pm, next_free_pc, _fwmap) = insert_code sb code pm to_insert_restoration ~next_free_pc in
+ (sb::sbs, code, pm, next_free_pc, next_free_reg) )
+ in
+
+ debug "After inserting the restoration code:\n";
+ (* print_code code; *)
+ (* print_path_map pm; *)
+ print_superblocks superblocks code;
+ debug "\n"; flush_all ();
+
+ (* let (superblocks, code, pm, next_free_pc, next_free_reg) =
+ if not !Clflags.option_fpoormansssa then (
+ (* In principle we need to do something like this because if we do not systematically
+ * rename registers, the downward scheduling might copy a series of instructions like
+ * this: i = i + 1, which when copied twice (to move below two side exits) is
+ * incorrect unless we rename the register i (or at least its second redefinition).
+ * However, at least the benchmarks do not seem to trigger this special case.
+ ListLabels.fold_left superblocks_torename
+ ~init:([], code, pm, next_free_pc, next_free_reg)
+ ~f:(fun (sbs, code, pm, next_free_pc, next_free_reg) (sb, to_rename) ->
+ let pi = get_some @@ PTree.get sb.instructions.(0) pm in
+ let liveatentry = pi.input_regs in
+ let (code, live_renames, next_free_reg) = rename_regs ~liveatentry ~only_rename:to_rename sb code ~next_free_reg in
+ let restoration_insts = restoration_instructions live_renames in
+ let (sb, code, pm, next_free_pc) = insert_code sb code pm restoration_insts ~next_free_pc in
+ (sb::sbs, code, pm, next_free_pc, next_free_reg) ) *)
+ (fst @@ List.split superblocks_torestore, code, pm, next_free_pc, next_free_reg)
+ ) else
+ (fst @@ List.split superblocks_torestore, code, pm, next_free_pc, next_free_reg)
+ in *)
+
+ let (code, to_lift) =
+ if !Clflags.option_fliftif > 0 then
+ (* TODO: Use this flag to control for the amount of duplicated code.
+ * However, this is probably best controlled at the "downscheduling" level since only
+ * those instructions need to be actually duplicated, i.e. restoration code writing
+ * back the current value to renamed registers is not actually duplicated. *)
+ let (code, staged_dupcode, staged_revmap, next_free_pc) = ListLabels.fold_left superblocks
+ ~init:(code, PTree.empty, PTree.empty, (Camlcoq.P.to_int next_free_pc))
+ ~f:(fun (code, staged_dupcode, staged_revmap, next_free_pc) sb ->
+ stage_duplication sb code staged_dupcode staged_revmap ~next_free_pc )
+ in
+ (code, Some(staged_revmap, staged_dupcode))
+ else
+ (code, None)
+ in
+
+ debug "After staging the duplication (\"if-lifting\") :\n";
+ print_code code;
+ print_path_map pm;
+ print_superblocks superblocks code;
+ debug "\n";
+ flush_all ();
+
+ debug_flag := old_debug_flag;
+
+ ((((code, entry), pm), id_ptree), to_lift)