diff options
Diffstat (limited to 'scheduling/MyRTLpathScheduleraux.ml')
-rw-r--r-- | scheduling/MyRTLpathScheduleraux.ml | 1619 |
1 files changed, 1619 insertions, 0 deletions
diff --git a/scheduling/MyRTLpathScheduleraux.ml b/scheduling/MyRTLpathScheduleraux.ml new file mode 100644 index 00000000..0375d26c --- /dev/null +++ b/scheduling/MyRTLpathScheduleraux.ml @@ -0,0 +1,1619 @@ +open DebugPrint +open RTLpathLivegenaux +open RTLpath +open RTLpathCommon +open RTL +open Maps +open Registers + +let print_superblock sb code = + let insts = sb.instructions in + let li = sb.liveins in + let outs = sb.s_output_regs in + begin + debug "{ instructions = "; print_instructions (Array.to_list insts) code; debug "\n"; + debug " liveins = "; print_ptree_regset li; debug "\n"; + debug " output_regs = "; print_regset outs; debug "}" + end + +let print_superblocks lsb code = + let rec f = function + | [] -> () + | sb :: lsb -> (print_superblock sb code; debug ",\n"; f lsb) + in begin + debug "[\n"; + f lsb; + debug "]" + end + +let get_superblocks = RTLpathScheduleraux.get_superblocks + +let get_ok = RTLpathScheduleraux.get_ok + +let apply_map' fw n = Duplicateaux.ptree_get_some n fw + +let apply_map_opt fw n = + match PTree.get n fw with + | Some n' -> n' + | None -> n + +let change_arg_regs inst fwmap = + let open Datatypes in + match inst with + | Icall (sgn, fn, args, dest, succ) -> + let fn' = + ( match fn with + | Coq_inl r -> Datatypes.Coq_inl (apply_map_opt fwmap r) + | Coq_inr _ as ident -> (* function name *) ident ) + in + let args' = List.map (apply_map_opt fwmap) args in + Icall (sgn, fn', args', dest, succ) + | Ibuiltin (ef, args, dest, succ) -> + let args' = List.map (AST.map_builtin_arg (apply_map_opt fwmap)) args in + Ibuiltin (ef, args', dest, succ) + | Ijumptable (arg, tbl) -> Ijumptable (apply_map_opt fwmap arg, tbl) + | Itailcall (sgn, fn, args) -> + let fn' = + ( match fn with + | Coq_inl r -> Datatypes.Coq_inl (apply_map_opt fwmap r) + | Coq_inr _ as ident -> (* function name *) ident ) + in + let args' = List.map (apply_map_opt fwmap) args in + Itailcall (sgn, fn', args') + | Ireturn (reg_opt) -> + ( match reg_opt with + | None -> Ireturn None + | Some(reg) -> Ireturn (Some (apply_map_opt fwmap reg)) ) + | Icond (a, b, n1, n2, i) -> Icond (a, List.map (apply_map_opt fwmap) b, n1, n2, i) + | Inop n -> Inop n + | Iop (a, b, c, n) -> Iop (a, List.map (apply_map_opt fwmap) b, c, n) + | Iload (a, b, c, d, e, n) -> + Iload (a, b, c, List.map (apply_map_opt fwmap) d, e, n) + | Istore (a, b, c, d, n) -> Istore (a, b, List.map (apply_map_opt fwmap) c, apply_map_opt fwmap d, n) + +let change_dest_reg inst fwmap = + match inst with + | Inop _ + | Istore _ + | Itailcall _ + (* XXX builtin is a special case?! *) + | Ibuiltin _ + | Icond _ + | Ijumptable _ + | Ireturn _ + | Icall _ -> failwith "unexpectedly asked to change dest reg" + | Iop(op, args, res, s) -> Iop(op, args, (apply_map' fwmap res), s) + | Iload(trap, chunk, addr, args, dst, s) -> Iload(trap, chunk, addr, args, (apply_map' fwmap dst), s) + +let maybe_change_dest_reg ?only_rename inst fwmap ~next_free_reg = + let do_nothing = (inst, fwmap, next_free_reg) in + match inst with + | Icall _ + | Ibuiltin _ + | Ijumptable _ + | Itailcall _ + | Ireturn _ -> + (* Do not rename registers if the instructions MUST end a path because we cannot add + * restoration code afterwards. *) + do_nothing + | _ as i -> + match RTL.instr_defs i with + | None -> do_nothing + | Some r -> + if Option.is_some only_rename && not (Regset.mem r (get_some only_rename)) then + do_nothing + else + (match PTree.get r fwmap with + | None -> (i, PTree.set r r fwmap, next_free_reg) + | Some _previous_name -> + let new_name = next_free_reg in + let fwmap = PTree.set r new_name fwmap in + (change_dest_reg i fwmap, fwmap, Camlcoq.P.succ next_free_reg) ) + +let ptree_get_or_default ptree key default = + match PTree.get key ptree with + | None -> default + | Some value -> value + +let is_icond = function + | Icond _ -> true + | _ -> false + +let side_exit_idxs sb code = + Array.to_list sb.instructions + |> List.map (fun pc -> get_some @@ PTree.get pc code) + |> List.mapi (fun i inst -> (i, inst)) + |> List.filter (fun (_i, inst) -> is_icond inst + && RTLpathLivegenaux.predicted_successor inst + |> Option.is_some) + |> List.map (fun (i, _inst) -> i) + +let side_exit_pcs sb code = + side_exit_idxs sb code |> List.map (fun i -> sb.instructions.(i)) + +module InsertPosition = struct + type t = + | Above of Camlcoq.P.t + | Below of Camlcoq.P.t + let anchor = function + | Above x | Below x -> x + + let pseudo_map pos ~f = match pos with + | Above x -> Above (f x) + | Below x -> Below (f x) + let compare x y = + match Camlcoq.P.compare (anchor x) (anchor y) with + | 0 -> (match x, y with + | Above _, Above _ | Below _, Below _ -> 0 + | Above _, Below _ -> 1 + | Below _, Above _ -> -1 ) + | c -> c +end + +module InsertPositionMap = Map.Make(InsertPosition) + + let insert_code sb code pm (to_insert : RTL.instruction list InsertPositionMap.t) ~next_free_pc = + let old_debug_flag = !debug_flag in + debug_flag := false; + debug "Before code insertion:\n"; + print_superblock sb code; + debug "\n"; flush_all (); + + debug "Code to insert:\n"; + InsertPositionMap.iter + (fun pos insts -> + debug "%s %d: " + (match pos with + | InsertPosition.Above _ -> "Above" + | InsertPosition.Below _ -> "Below") + (Camlcoq.P.to_int @@ InsertPosition.anchor pos) + ; + List.iter (fun inst -> + if !debug_flag then PrintRTL.print_instruction stdout (0, inst)) insts; + debug "\n"; flush_all () + ) + to_insert + ; + + let next_free_pc = + let next_free_pc = ref next_free_pc in + (fun () -> + let pc = !next_free_pc in + next_free_pc := Camlcoq.P.succ !next_free_pc; + pc ) + in + let original_length = Array.length sb.instructions in + let orig_sort_keys = + Duplicateaux.generate_fwmap + (Array.to_list sb.instructions) + (List.init original_length (fun i -> i * 2)) + PTree.empty + in + let new_key pos = + let open InsertPosition in + let anchor_key = + InsertPosition.anchor pos + |> apply_map' orig_sort_keys + in + match pos with + | Above _ -> anchor_key - 1 + | Below _ -> anchor_key + 1 + in + let (code, pc_lists, sort_keys) = + InsertPositionMap.fold + (fun (pos : InsertPosition.t) insts (code, pc_lists, sort_keys) -> + let insts_length = List.length insts in + let key = new_key pos in + let pcs = List.init insts_length (fun _ -> next_free_pc ()) in + let new_sort_keys = List.init insts_length (fun _ -> key) in + let code = + ListLabels.fold_left2 pcs insts + ~init:code + ~f:(fun code pc inst -> PTree.set pc inst code) + in + let sort_keys = Duplicateaux.generate_fwmap pcs new_sort_keys sort_keys in + (code, pcs::pc_lists, sort_keys) ) + to_insert + (code, [], orig_sort_keys) + in + let new_pcs = List.flatten pc_lists |> Array.of_list in + let last_instruction = [| sb.instructions.(original_length - 1) |] in + let upto_last = + if original_length > 1 then + Array.sub sb.instructions 0 (original_length - 1) + else [| |] + in + let instructions = Array.concat [upto_last; new_pcs; last_instruction ] in + let instructions_order = Array.copy instructions in + ArrayLabels.stable_sort instructions_order + ~cmp:(fun x y -> Int.compare (apply_map' sort_keys x) (apply_map' sort_keys y)); + + let new_length = Array.length instructions in + let fwmap = Duplicateaux.generate_fwmap (Array.to_list instructions_order) (Array.init new_length (fun i -> i) |> Array.to_list) PTree.empty in + let fwmap_pc = + Duplicateaux.generate_fwmap + (Array.to_list sb.instructions) + (Array.to_list @@ Array.map (fun pc -> instructions.(apply_map' fwmap pc)) sb.instructions) + PTree.empty + in + let liveins' = + PTree.fold + (fun liveins' pc live_regs -> + PTree.set (apply_map' fwmap_pc pc) live_regs liveins') + sb.liveins PTree.empty + in + + let sb' = {sb with instructions = instructions; liveins = liveins'} in + let code = RTLpathScheduleraux.apply_schedule code sb' instructions_order in + + let num_added = new_length - original_length in + let first_pc = sb.instructions.(0) in + let pi = get_some @@ PTree.get first_pc pm in + let module N = Camlcoq.Nat in + let new_size = N.to_int pi.psize + num_added |> N.of_int in + let pm = PTree.set first_pc {pi with psize = new_size} pm in + + debug_flag := old_debug_flag; + (sb', code, pm, next_free_pc (), fwmap_pc) + +let prepend_nops_before_iconds sb code = + (* We need the a first and last instruction so that + * a) the pc of the superblock entry stays the same and + * b) apply_schedule correclty preserved the successors of the last instruction *) + if Array.length sb.instructions < 2 then + (* Early exit, this should probably be replaced by a more general exclusion of + * superblock with just one instruction. *) + InsertPositionMap.empty + else + + (* TODO, probably only need it before side exits *) + let icond_pcs = + Array.to_list sb.instructions + |> List.filter (fun pc -> + let inst = get_some @@ PTree.get pc code in + is_icond inst ) + in + let to_insert = + ListLabels.fold_left icond_pcs + ~init:InsertPositionMap.empty + ~f:(fun acc icond_pc -> InsertPositionMap.add (InsertPosition.Above icond_pc) [Inop Camlcoq.P.one] acc) + in + to_insert + +type renamed = + { old_name : reg + ; new_name : reg } + +let update_live_renames pc live_renames fwmap regs = + Regset.fold + (fun live_reg renames -> + match PTree.get live_reg fwmap with + | None -> renames + | Some(r) when r = live_reg -> renames + | Some(new_name) -> + let old = ptree_get_or_default renames pc [] in + let upd = {old_name = live_reg; new_name} :: old in + PTree.set pc upd renames ) + regs + live_renames + +let my_merge_overwrite m1 m2 = + PTree.combine (fun x y -> match (x, y) with + | None, None -> None + | Some x, None + | None, Some x -> Some x + | Some _, Some y -> Some y + ) m1 m2 + +let rename_regs ?only_rename sb code ~liveatentry ~next_free_reg = + let old_debug_flag = !debug_flag in + + let length = Array.length sb.instructions in + assert (length > 0); + (* Early exit *) + if length = 1 then (code, PTree.empty, next_free_reg) else + (* The last instruction is treated in a special way because if it defines a register, + * that register cannot possibly be used afterwards in the path AND often we cannot + * insert restoration code later after it since it must remain at the end of the path. + * In the future, this may be resolved by only renaming registers which are used + * afterwards in path, which would exclude the register possibly assigned by the last + * instruction. *) + let last_pc = sb.instructions.(length - 1) in + let upto_last = Array.init (length - 1) (fun i -> sb.instructions.(i)) in + let liveatentry = Regset.elements liveatentry in + let fwmap = Duplicateaux.generate_fwmap liveatentry liveatentry PTree.empty in + + let (code, fwmap, live_renames, next_free_reg) = + ArrayLabels.fold_left upto_last + ~init:(code, fwmap, PTree.empty, next_free_reg) + ~f:(fun (code, fwmap, live_renames, next_free_reg) pc -> + (* Rewrite instruction to use potentially renamed registers *) + let inst = get_some @@ PTree.get pc code in + let inst = change_arg_regs inst fwmap in + let (inst, fwmap, next_free_reg) = + maybe_change_dest_reg ?only_rename inst fwmap ~next_free_reg + in + let code = PTree.set pc inst code in + + let (live_renames, fwmap)= + if is_icond inst then ( + (* Pretend that registers that are live at an exit already have a definition, so + * this catches a couple of edge cases where an instruction was not renamed and + * could thus not be moved up. *) + let live_regs = Regset.elements @@ get_some @@ PTree.get pc sb.liveins in + let fwmap' = Duplicateaux.generate_fwmap live_regs live_regs PTree.empty in + let fwmap = my_merge_overwrite fwmap' fwmap in + (update_live_renames pc live_renames fwmap (get_some @@ PTree.get pc sb.liveins) + , fwmap) + ) else + (live_renames, fwmap) + in + (code, fwmap, live_renames, next_free_reg) + ) + in + + let last_inst = get_some @@ PTree.get last_pc code in + let last_inst = change_arg_regs last_inst fwmap in + let (last_inst, fwmap, next_free_reg) = + maybe_change_dest_reg ?only_rename last_inst fwmap ~next_free_reg + in + let code = PTree.set last_pc last_inst code in + + let live_renames = update_live_renames last_pc live_renames fwmap sb.s_output_regs in + + debug_flag := old_debug_flag; + + (code, live_renames, next_free_reg) + +(* Pass over the superblock instruction in-order + * For each redefinition of a register, create a new register name and use that one from + * then on. There is one exception, if the last instruction defines a register, it will be + * left unchanged since the rest of the path cannot possibly use it. + * WARNING: This invalidates the superblock, it will need to be repaired with the + * information returned. *) +let local_single_assignment sb code liveatentry ~next_free_reg = + let old_debug_flag = !debug_flag in + + let (code, live_renames, next_free_reg) = + rename_regs sb code ~liveatentry ~next_free_reg + in + + debug_flag := old_debug_flag; + (code, live_renames, next_free_reg) + +let final_restoration_code sb code live_renames = + let last_inst_pc = sb.instructions.(Array.length sb.instructions - 1) in + let last_inst = get_some @@ PTree.get last_inst_pc code in + let last_inst_is_basic = match last_inst with + | Icall _ + | Ibuiltin _ + | Ijumptable _ + | Itailcall _ + | Ireturn _ + | Icond _ -> false + | _ -> true + in + let final_renames = ptree_get_or_default live_renames last_inst_pc [] in + let live_regs_opt = PTree.get last_inst_pc sb.liveins in + let (above, below) = let open Either in + ListLabels.partition_map final_renames + ~f:(fun {old_name; new_name} -> + let inst = Iop (Op.Omove, [new_name], old_name, Camlcoq.P.one) in + match last_inst_is_basic, live_regs_opt with + | true, _ -> + (* Printf.eprintf "Putting %d below basic inst\n" (Camlcoq.P.to_int old_name); *) + Right inst + | false, None -> + (* Printf.eprintf "Putting %d above unpredicted icond.\n" (Camlcoq.P.to_int old_name); *) + Left inst + | false, Some live_regs -> + if Regset.mem old_name live_regs then ( + (* Printf.eprintf "Putting %d above icond\n" (Camlcoq.P.to_int old_name); *) + Left inst + ) else ( + (* Printf.eprintf "Putting %d below icond\n" (Camlcoq.P.to_int old_name); *) + Right inst)) + in + InsertPositionMap.empty + |> InsertPositionMap.add (InsertPosition.Above last_inst_pc) above + |> InsertPositionMap.add (InsertPosition.Below last_inst_pc) below + +let used_before_redefinition sb code ~offset ~reg = + let length = Array.length sb.instructions in + if not (offset < length) then + raise (Invalid_argument (Printf.sprintf "offset must be less than the superblock's length: %d is not less than %d" offset length)) + ; + let finished = ref false in + let i = ref offset in + let res = ref false in + while (!i < length && not !finished) do + let inst = get_some @@ PTree.get sb.instructions.(!i) code in + if List.mem reg (RTL.instr_uses inst) then ( + res := true; + finished := true; + ) else + () + ; + let defined_reg = RTL.instr_defs inst in + (match defined_reg with + | None -> () + | Some r -> if r = reg then finished := true else ()); + i := !i + 1; + done; + !res + +type restoration_actions = + | Just_restore of renamed + | Restore_and_alias of renamed + +let restoration_instructions' sb code live_renames ~next_free_reg = + let next_free_reg = + let next_free_reg = ref next_free_reg in + (fun () -> + let r = !next_free_reg in + next_free_reg := Camlcoq.P.succ !next_free_reg; + r ) + in + + let length = Array.length sb.instructions in + let pc_to_idx = + Duplicateaux.generate_fwmap + (Array.to_list sb.instructions) + (List.init length (fun i -> i)) + PTree.empty + in + + let live_renames = PTree.map + (fun pc renames -> + let offset = apply_map' pc_to_idx pc in + List.map + (fun rename -> + let {old_name; new_name} = rename in + if used_before_redefinition sb code ~offset ~reg:old_name then + Restore_and_alias rename + else + Just_restore rename) + renames) + live_renames + in + + let to_rename = PTree.map1 + (fun renames -> + let old_names = List.filter_map (fun rename -> + match rename with + | Just_restore _ -> None + | Restore_and_alias {old_name; new_name} -> Some old_name) renames + in + let aliases = List.init (List.length old_names) (fun _ -> next_free_reg ()) in + let fwmap = Duplicateaux.generate_fwmap old_names aliases PTree.empty in + fwmap + ) + live_renames + in + + let to_insert_restore = + PTree.fold + (fun to_insert side_exit_pc renames -> + let alias_map = get_some @@ PTree.get side_exit_pc to_rename in + let insts = ListLabels.map renames + ~f:(fun rename -> + match rename with + | Just_restore {old_name; new_name} -> [Iop (Op.Omove, [new_name], old_name, Camlcoq.P.one)] + | Restore_and_alias {old_name; new_name} -> + [ Iop (Op.Omove, [old_name], apply_map' alias_map old_name, Camlcoq.P.one) + ; Iop (Op.Omove, [new_name], old_name, Camlcoq.P.one)] ) + in + let insts = List.flatten insts in + InsertPositionMap.add (InsertPosition.Above side_exit_pc) insts to_insert) + live_renames + InsertPositionMap.empty + in + (to_insert_restore, to_rename, next_free_reg ()) + +(* Assumes the path is in local single assignment form *) +let intra_path_dependencies (sb : superblock) (code : code) = + let old_debug_flag = !debug_flag in + debug_flag := false; + + (* Directly taken from RTLpathScheduleraux *) + let nr_instr = Array.length sb.instructions in + let trailer_length = + match PTree.get (sb.instructions.(nr_instr-1)) code with + | None -> 0 + | Some ii -> + (match predicted_successor ii with + | Some _ -> 0 + | None -> 1 ) + in + + let module IS = InstructionScheduler in + let opweights = OpWeights.get_opweights () in + let seqa = + ArrayLabels.map (Array.sub sb.instructions 0 (Array.length sb.instructions - trailer_length)) + ~f:(fun i -> + (match PTree.get i code with + | Some ii -> ii + | None -> failwith "MyRTLpathScheduleraux.intra_path_dependencies"), + (match PTree.get i sb.liveins with + | Some s -> s + | None -> Regset.empty) ) + in + let latency_constraints = PrepassSchedulingOracle.get_simple_dependencies opweights seqa in + debug "intra_path_dependencies for superblock:\n"; + print_superblock sb code; + debug "\nlatency_constraints:\n\n"; + ListLabels.iter latency_constraints + ~f:(fun {IS.instr_from; instr_to; latency} -> + debug "instr_from: %2d\ninstr_to: %4d\nlatency: %3d\n\n" instr_from instr_to latency ); + flush_all (); + + let deps = + ListLabels.fold_left latency_constraints + ~init:PTree.empty + ~f:(fun deps {IS.instr_from; instr_to; latency = _} -> + let to_pc = sb.instructions.(instr_to) in + let from_pc = sb.instructions.(instr_from) in + let inst_deps = ptree_get_or_default deps to_pc HashedSet.PSet.empty in + let inst_deps' = HashedSet.PSet.add from_pc inst_deps in + PTree.set to_pc inst_deps' deps ) + in + debug_flag := old_debug_flag; + deps + +let transitive_dependencies deps pc = + let old_debug_flag = !debug_flag in + let immediate_deps = ptree_get_or_default deps pc HashedSet.PSet.empty in + + let rec iter ~acc ~todo ~seen = + match todo with + | [] -> acc + | pc'::todo -> (* NB: todo has just shrunk *) + if HashedSet.PSet.contains seen pc' then + iter ~acc ~todo ~seen + else + let deps_of_dep = ptree_get_or_default deps pc' HashedSet.PSet.empty in + let new_todo = HashedSet.PSet.subtract deps_of_dep acc |> HashedSet.PSet.elements in + iter + ~acc:(HashedSet.PSet.add pc' acc) + ~todo:(new_todo @ todo) + ~seen:(HashedSet.PSet.add pc' seen) + in + let transitive_dependencies = + iter + ~acc:HashedSet.PSet.empty + ~todo:(HashedSet.PSet.elements immediate_deps) + ~seen:HashedSet.PSet.empty + in + debug_flag := old_debug_flag; + transitive_dependencies + +let moved_dependencies deps order side_exit_pc = + let old_debug_flag = !debug_flag in + + let dependencies = transitive_dependencies deps side_exit_pc in + let side_exit_pc_idx = apply_map' order side_exit_pc in + let moved_dependencies = + HashedSet.PSet.filter + (fun pc -> + let dep_idx = apply_map' order pc in + dep_idx > side_exit_pc_idx ) + dependencies + in + debug_flag := old_debug_flag; + moved_dependencies + +let update_liveins liveins live_renames = + PTree.map + (fun pc liveregs -> + match PTree.get pc live_renames with + | None -> liveregs + | Some renames -> + let old_to_new = ListLabels.fold_left renames + ~init:PTree.empty + ~f:(fun acc {old_name; new_name} -> PTree.set old_name new_name acc) + in + (* There doesn't seem to be a proper map function for Regset.t *) + let liveregs' = + Regset.fold + (fun r acc -> + let r' = ptree_get_or_default old_to_new r r in + Regset.add r' acc) + liveregs + Regset.empty + in + liveregs' + ) + liveins + +let replace_iconds_by_ocmps sb code ~next_free_reg = + let module P = Camlcoq.P in + let (code, _previous_icond, next_free_reg) = + ArrayLabels.fold_left sb.instructions + ~init:(code, None, next_free_reg) + ~f:(fun (code, previous_icond_proxy_reg, next_free_reg) pc -> + let inst = get_some @@ PTree.get pc code in + match inst with + | Istore(chunk, addr, args, src, succ) -> + ( match previous_icond_proxy_reg with + | None -> (code, previous_icond_proxy_reg, next_free_reg) + | Some r -> + let istore' = Istore(chunk, addr, r::args, src, succ) in + let code' = PTree.set pc istore' code in + (code', previous_icond_proxy_reg, next_free_reg) ) + | Icond(cond, args, ifso, ifnot, prediction) -> + (match prediction with + | None -> + (* Case only happens at the very end of the path; no transformation necessary *) + assert(sb.instructions.(Array.length sb.instructions - 1) = pc); + (code, None, next_free_reg) + | Some true -> ( + let ocmp = match previous_icond_proxy_reg with + | None -> Iop((Op.Ocmp cond), args, next_free_reg, ifso) + | Some r -> Iop((Op.Ocmp cond), r::args, next_free_reg, ifso) + in + let code' = PTree.set pc ocmp code in + (code', Some next_free_reg, P.succ next_free_reg) ) + | Some false -> ( + let ocmp = match previous_icond_proxy_reg with + | None -> Iop((Op.Ocmp cond), args, next_free_reg, ifnot) + | Some r -> Iop((Op.Ocmp cond), r::args, next_free_reg, ifnot) + in + let code' = PTree.set pc ocmp code in + (code', Some next_free_reg, P.succ next_free_reg) )) + | Iload(trap, chunk, addr, args, dest, succ) -> ( + if !Machine.config.Machine.has_non_trapping_loads then + (code, previous_icond_proxy_reg, next_free_reg) + else + match previous_icond_proxy_reg with + | None -> (code, previous_icond_proxy_reg, next_free_reg) + | Some r -> + let load' = Iload(trap, chunk, addr, r::args, dest, succ) in + let code' = PTree.set pc load' code in + (code', previous_icond_proxy_reg, next_free_reg) ) + | _ -> (code, previous_icond_proxy_reg, next_free_reg) + ) + in + (code, next_free_reg) + +let is_store = function + | Istore _ -> true + | _ -> false + +type heuristic_mode = + | Default + | Ignore_liveness + | Move_stores + +let ideal_schedule'' sb code mode = + let old_debug_flag = !debug_flag in + + let dep_function = match mode with + | Default -> PrepassSchedulingOracle.get_simple_dependencies' + | Ignore_liveness -> PrepassSchedulingOracle.get_fake_deps_liveness + | Move_stores -> PrepassSchedulingOracle.get_fake_deps_liveness_stores + in + + (* copy-paste from RTLpathScheduleraux.schedule_superblock *) + let nr_instr = Array.length sb.instructions in + let trailer_length = + match PTree.get (sb.instructions.(nr_instr-1)) code with + | None -> 0 + | Some ii -> + match predicted_successor ii with + | Some _ -> 0 + | None -> 1 in + let live_regs_entry = RTLpathScheduleraux.get_live_regs_entry sb code in + let seqa = + Array.map (fun i -> + (match PTree.get i code with + | Some ii -> ii + | None -> failwith "RTLpathScheduleraux.schedule_superblock"), + (match PTree.get i sb.liveins with + | Some s -> s + | None -> Regset.empty)) + (Array.sub sb.instructions 0 (nr_instr-trailer_length)) + in + let nr_scheduled_instr = nr_instr - trailer_length in + (* Copy-pasted from PrepassSchedulingOracle.schedule_sequence *) + let opweights = OpWeights.get_opweights () in + (* NB: Early exit *) + if (Array.length seqa) <= 1 then None else + (* Copy-pasted from PrepassSchedulingOracle.define_problem *) + let problem = + let deps = dep_function opweights seqa in + debug_flag := false; + debug "Fake deps:\n"; + if !debug_flag then ( + deps + |> List.iter (fun {InstructionScheduler.instr_from; instr_to; latency} -> + debug "%2d depends on %2d\n" (Camlcoq.P.to_int sb.instructions.(instr_to)) (Camlcoq.P.to_int sb.instructions.(instr_from))); + flush_all (); + ); + { InstructionScheduler.max_latency = -1 + ; resource_bounds = opweights.PrepassSchedulingOracleDeps.pipelined_resource_bounds + ; live_regs_entry = live_regs_entry + ; typing = sb.typing + ; reference_counting = Option.some @@ RTLpathScheduleraux.reference_counting seqa sb.s_output_regs sb.typing + ; instruction_usages = Array.map (PrepassSchedulingOracle.resources_of_instruction opweights) (Array.map fst seqa) + ; latency_constraints = deps } + in + match PrepassSchedulingOracle.prepass_scheduler_by_name + (!Clflags.option_fprepass_sched) + problem + (Array.map (fun (ins, _) -> + match ins with + | Icond _ -> true + | _ -> false) seqa) + with + | None -> + debug_flag := old_debug_flag; + failwith "no solution in prepass scheduling\n" + | Some solution -> + let positions = Array.init nr_scheduled_instr (fun i -> i) in + let final_time = solution.(nr_scheduled_instr) in + Array.sort (fun i j -> + let si = solution.(i) and sj = solution.(j) in + if si < sj then -1 + else if si > sj then 1 + else i - j) positions; + + let ins' = + Array.append + (Array.map (fun i -> sb.instructions.(i)) positions) + (Array.sub sb.instructions (nr_instr - trailer_length) trailer_length) in + Some (ins', final_time) + +(* Improved scheduling heuristic which allows moving memory writes downwards by turning + * Iconds into Ocmps for the purpose of dependency calculations. *) +let ideal_schedule' sb code ~next_free_reg = + let old_debug_flag = !debug_flag in + let (fake_code, _next_free_reg) = replace_iconds_by_ocmps sb code ~next_free_reg in + (* Does PTree.empty work or do I need to map the entries to Regset.empty *) + let fake_sb = { sb with liveins = PTree.empty } in + (* Copied from RTLpathScheduleraux.schedule_superblock *) + let nr_instr_sb = Array.length sb.instructions in + assert (nr_instr_sb = Array.length fake_sb.instructions); + + let trailer_length = + match PTree.get (sb.instructions.(nr_instr_sb - 1)) code with + | None -> 0 + | Some ii -> + match predicted_successor ii with + | Some _ -> 0 + | None -> 1 + in + let seqa = + (Array.map (fun i -> + (match PTree.get i code with + | Some ii -> ii + | None -> failwith "MyRTLpathScheduleraux.ideal_schedule'"), + (match PTree.get i sb.liveins with + | Some s -> s + | None -> Regset.empty)) + (Array.sub sb.instructions 0 (nr_instr_sb - trailer_length))) + in + let fake_seqa = + (Array.map (fun i -> + (match PTree.get i fake_code with + | Some ii -> ii + | None -> failwith "MyRTLpathScheduleraux.ideal_schedule'"), + (match PTree.get i fake_sb.liveins with + | Some s -> s + | None -> Regset.empty)) + (Array.sub fake_sb.instructions 0 (nr_instr_sb - trailer_length))) + in + (* Copied from PrepassSchedulingOracle.schedule_sequence *) + let opweights = OpWeights.get_opweights () in + (* WARNING: Early exit in case there is only on instruction to schedule *) + if (Array.length fake_seqa) <= 1 then None else + let nr_instr_fake_seqa = Array.length fake_seqa in + assert (nr_instr_fake_seqa = Array.length seqa); + let nr_instr_seqa = nr_instr_fake_seqa in + let store_idxs = + Array.to_list sb.instructions + |> List.mapi (fun i pc -> (i, get_some @@ PTree.get pc code)) + |> List.filter (fun (_i, inst) -> is_store inst) + |> List.map (fun (i, _inst) -> i) + in + let side_exit_idxs = side_exit_idxs sb code in + let store_side_exit_limit = + ListLabels.map store_idxs + ~f:(fun st_idx -> + let first_se = List.find_opt (fun se_idx -> se_idx > st_idx) side_exit_idxs in + let second_se = + Option.bind + first_se + (fun se_idx -> List.find_opt (fun se_idx' -> se_idx' > se_idx) side_exit_idxs) + in + second_se ) + in + let store_side_exit_deps = + ListLabels.map2 store_idxs store_side_exit_limit + ~f:(fun st_idx se_idx_opt -> + let module IS = InstructionScheduler in + match se_idx_opt with + | None -> None + | Some se_idx -> Some {IS.instr_to = se_idx; instr_from = st_idx; latency = 0} ) + |> List.filter_map (fun x -> x) + in + (* Copied from PrepassSchedulingOracle.define_problem *) + let fake_deps = PrepassSchedulingOracle.get_simple_dependencies opweights fake_seqa in + let fake_deps = store_side_exit_deps @ fake_deps in + + let problem = + { InstructionScheduler.max_latency = -1 + ; live_regs_entry = RTLpathScheduleraux.get_live_regs_entry fake_sb fake_code + ; typing = fake_sb.typing + ; reference_counting = Some (RTLpathScheduleraux.reference_counting fake_seqa fake_sb.s_output_regs fake_sb.typing) + ; resource_bounds = opweights.PrepassSchedulingOracleDeps.pipelined_resource_bounds + ; instruction_usages = Array.map (PrepassSchedulingOracle.resources_of_instruction opweights) (Array.map fst seqa) + ; latency_constraints = fake_deps } + + in + let scheduled_sequence = + match + PrepassSchedulingOracle.prepass_scheduler_by_name + (!Clflags.option_fprepass_sched) + problem + (Array.map (fun (ins, _) -> + match ins with + | Icond _ -> true + | _ -> false) + seqa) + with + | None -> None + | Some solution -> + (* Printf.eprintf "Scheduling instruction sequence of length: %d\n" nr_instructions; flush_all (); + Printf.eprintf "Result: %d\n" solution.(nr_instructions); flush_all (); *) + let positions = Array.init nr_instr_seqa (fun i -> i) in + let final_time = solution.(nr_instr_seqa) in + Array.sort (fun i j -> + let si = solution.(i) and sj = solution.(j) in + if si < sj then -1 + else if si > sj then 1 + else i - j) positions; + Some (positions, final_time) + in + + debug_flag := old_debug_flag; + + match scheduled_sequence with + | None -> None + | Some (order, final_time) -> + let ins' = + Array.append + (Array.map (fun i -> sb.instructions.(i)) order) + (Array.sub sb.instructions (nr_instr_sb - trailer_length) trailer_length) in + Some (ins', final_time) + +(* "ideal" *) +let ideal_schedule sb code = + let schedule = + RTLpathScheduleraux.schedule_superblock + {sb with liveins = PTree.map (fun n _regs -> Regset.empty) sb.liveins} + code + in + schedule + +let merge_append _ x y = match x, y with + | None, None -> None + | Some x, None | None, Some x -> Some x + | Some x, Some y -> Some (x @ y) + +(* Turns a tree of dependencies (pc -> [pcs; that; depend; on pc]) into a tree of uses by + * "inverting" tree. + * Now, each pc has the pcs associated to it that depend on it, according to the original + * tree. *) +let uses_of_deps p_ptree = + PTree.fold + (fun acc p vs -> + let acc = HashedSet.PSet.fold + (fun acc v -> + let old = ptree_get_or_default acc v HashedSet.PSet.empty in + let upd = HashedSet.PSet.add p old in + PTree.set v upd acc) + vs + acc + in + acc + ) + p_ptree + PTree.empty + +(* Returns for every side-exit pc, a list of instructions that should be executed beforehand *) +let downschedule_compensation_code sb code pm live_renames ~next_free_pc ~next_free_reg = + (* TODO: Right now we are copying the instructions even if there are duplicates per + * basic block. This leads to an issue where two identical memory writes lead to + * non-matching symbolic memory states. + * As a work-around we could eliminate the first/original memory write, this would + * allow moving memory writes at least one side-exit further down, but not + * farther. For that we would need to refine the symbolic memory model/evaluation + * which might be related to memory aliasing in general?! + * For now however, we simply use a more restrictive ideal_schedule function which + * does not propose moving memory writes below side-exits. *) + let old_debug_flag = !debug_flag in + + let mode = match !Clflags.option_prepass_past_side_exits_sched with + | "move_stores" -> Move_stores + | "no_move_stores" -> Ignore_liveness + | _ -> failwith "Unsupported option for scheduling code past side exits" + in + let sb_with_liveins = {sb with liveins = update_liveins sb.liveins live_renames} in + match ideal_schedule'' sb_with_liveins code mode, ideal_schedule'' sb_with_liveins code Default with + | None, None -> InsertPositionMap.empty (* Early Exit*) + | None, Some _ | Some _, None -> failwith "downschedule_compensation_code: Scheduling procedure failed." + | Some (idealized_schedule, idealized_final_time) + , Some (default_schedule, default_final_time) -> + if idealized_final_time >= default_final_time then ( + if (!debug_flag && idealized_final_time > default_final_time) then ( + debug "Unexpectedly, idealized dependencies lead to a worse expected final time.\n"; + debug "idealized_final_time = %d, default_final_time = %d\n" idealized_final_time default_final_time; + debug "For superblock %d" (Camlcoq.P.to_int sb.instructions.(0)); + failwith "Unexpectedly bad final time for idealized schedule"; + ); + (* Early exit *) + InsertPositionMap.empty + ) else ( + let sb_length = Array.length sb.instructions in + let pc_to_idx = + Duplicateaux.generate_fwmap + (Array.to_list sb.instructions) + (List.init sb_length (fun i -> i)) + PTree.empty + in + let pc_to_idx' = + Duplicateaux.generate_fwmap + (Array.to_list idealized_schedule) + (List.init sb_length (fun i -> i)) + PTree.empty + in + + let side_exit_pcs = side_exit_pcs sb code in + + (* NB: for the purpose of this heuristic we consider the superblock to include the final + * restoration code. sb has changed. *) + let liveins' = update_liveins sb.liveins live_renames in + (* Use the new names to calculate proper dependencies *) + let path_deps = intra_path_dependencies {sb with liveins = liveins'} code in + let path_deps_without_iconds = PTree.map + (fun _pc deps -> + HashedSet.PSet.filter + (fun dep_pc -> not @@ is_icond @@ get_some @@ PTree.get dep_pc code) + deps) + path_deps + in + let transitive_path_deps_without_iconds = + PTree.map (fun pc _deps -> transitive_dependencies path_deps_without_iconds pc) + path_deps + in + let transitive_uses_without_iconds = uses_of_deps transitive_path_deps_without_iconds in + + (* For each side-exit, check if all the dependencies are still above it + * if not, remember the pc and transitively consider its dependencies until + * no further insts to be covered are discovered *) + let side_exit_and_compensation = + ListLabels.map side_exit_pcs + ~f:(fun side_exit_pc -> + let moved_deps = moved_dependencies path_deps pc_to_idx' side_exit_pc in + let moved_deps_sorted = + ListLabels.sort (HashedSet.PSet.elements moved_deps) + ~cmp:(fun pc1 pc2 -> Int.compare (apply_map' pc_to_idx pc1) (apply_map' pc_to_idx pc2)) + in + (* The sucessors are *incorrect* at this point *) + (side_exit_pc, moved_deps_sorted) ) + in + let (side_exit_pcs, insts_pcs) = List.split side_exit_and_compensation in + + let to_insert_pcs = + ListLabels.fold_left2 side_exit_pcs insts_pcs + ~init:InsertPositionMap.empty + ~f:(fun acc side_exit_pc pcs -> + InsertPositionMap.add (InsertPosition.Above side_exit_pc) pcs acc) in + let (to_insert_as_well : Camlcoq.P.t list InsertPositionMap.t)= + InsertPositionMap.fold + (fun side_exit_pc pcs_to_insert acc -> + let side_exit_pc = InsertPosition.anchor side_exit_pc in + let ( let* ) = Option.bind in + let collateral_moves = + pcs_to_insert + |> List.filter_map (fun pc -> + let* uses = PTree.get pc transitive_uses_without_iconds in + HashedSet.PSet.filter + (fun pc -> + not @@ List.mem pc pcs_to_insert + && apply_map' pc_to_idx pc < apply_map' pc_to_idx side_exit_pc + && (not @@ is_icond @@ get_some @@ PTree.get pc code)) + uses + |> Option.some) + |> ListLabels.fold_left ~f:HashedSet.PSet.union ~init:HashedSet.PSet.empty + in + let collateral_moves = ListLabels.sort (HashedSet.PSet.elements collateral_moves) + ~cmp:(fun pc1 pc2 -> Int.compare (apply_map' pc_to_idx pc1) (apply_map' pc_to_idx pc2)) + in + (* TODO? + * In principle we should be able to move the instructions Below, unless they are + * live (liveins') at the side exit. + * But adding them above is simpler and unnecessary instructions should be removed + * by DCE. *) + let acc = InsertPositionMap.add (InsertPosition.Above side_exit_pc) collateral_moves acc in + acc) + to_insert_pcs + InsertPositionMap.empty + in + let to_insert_pcs = InsertPositionMap.merge merge_append to_insert_pcs to_insert_as_well in + let num_probably_duplicated = InsertPositionMap.fold (fun _pos pcs n -> + n + List.length pcs) + to_insert_pcs + 0 + in + let gain = default_final_time - idealized_final_time in + assert(gain > 0); + if gain * !Clflags.option_fliftif < num_probably_duplicated then ( + debug_flag := false; + debug "Expected number of cycles gained, %d, not considered worth the code duplication, expected at %d instructions.\n" + gain num_probably_duplicated; + debug_flag := old_debug_flag; + InsertPositionMap.empty + ) else ( + debug_flag := old_debug_flag; + to_insert_pcs + )) + +let my_merge_no_overwrite m1 m2 = + PTree.combine (fun x y -> match (x, y) with + | None, None -> None + | Some x, None + | None, Some x -> Some x + | Some x, Some y -> + if x = y then Some x + else failwith "Merge conflict." + ) m1 m2 + +let print_schedule schedule = + debug "Schedule\n"; + Array.iter (fun pos -> debug "%d\n" (Camlcoq.P.to_int pos)) schedule; + debug "\n"; + flush_all (); +;; + +(* Walk through sb and find those register which possibly take on different values + * i.e. which are written to twice. *) +let find_mutated_registers (sb : superblock) code input_regs : Regset.t = + let (defined, defined_multiple) = + ArrayLabels.fold_left sb.instructions + ~init:(input_regs, Regset.empty) + ~f:(fun (defined, defined_multiple) pc -> + let inst = get_some @@ PTree.get pc code in + match RTL.instr_defs inst with + | None -> (defined, defined_multiple) + | Some(r) -> + if Regset.mem r defined then + let defined_multiple = Regset.add r defined_multiple in + (defined, defined_multiple) + else + let defined = Regset.add r defined in + (defined, defined_multiple) + ) + in + defined_multiple + +(* Map each register in regs to the index of its first definition in the superblock + * Returns: mapping from a register to the index of the definition that first defines it + * in the superblock *) +let find_first_definition (sb : superblock) code (regs : Regset.t) : int PTree.t = + let (regs, first_defs, _index) = + ArrayLabels.fold_left sb.instructions + ~init:(regs, PTree.empty, 0) + ~f:(fun (regs, first_defs, index) pc -> + let inst = get_some @@ PTree.get pc code in + match RTL.instr_defs inst with + | None -> (regs, first_defs, index + 1) + | Some(r) -> + if Regset.mem r regs then + let regs = Regset.remove r regs in + let first_defs = PTree.set r index first_defs in + (regs, first_defs, index + 1) + else + (regs, first_defs, index + 1) ) + in + assert (Regset.empty = regs); + first_defs + +let print_int_ptree pt = + let module P = Camlcoq.P in + if not !debug_flag then () else + debug "Mappings, P.t -> Int.t:\n"; + List.iter (fun (p, i) -> debug "%d |-> %d\n" (P.to_int p) i) (PTree.elements pt) + +(* [def_index], the index in the superblock of the instruction defining [r] is set to -1 + * if the register is first defined outside of the path *) +let is_read_after_definition (sb : superblock) code (r : reg) (def_index) : bool = + let start = if def_index < 0 then 0 else def_index + 1 in + let stop = Array.length sb.instructions in + let rec aux n = + if n > stop then false else + + let pc = sb.instructions.(n) in + let inst = get_some @@ PTree.get pc code in + if List.mem r @@ RTL.instr_uses inst then + true + else + aux (n + 1) + in + aux start + +let registers_to_alias (sb : superblock) code pm = + let first_pc = sb.instructions.(0) in + let pi = get_some @@ PTree.get first_pc pm in + let sb_length = Array.length sb.instructions in + let mutated_registers = find_mutated_registers sb code pi.input_regs in + (* Registers are of interest if they would need to be restored after renaming *) + let registers_of_interest = Regset.inter sb.s_output_regs mutated_registers in + let first_defs_in_sb = find_first_definition sb code registers_of_interest in + (* If the register is first defined by the last instruction of the path, it does not + * need to be aliased, since it cannot possibly be used afterwards in the superblock. + * Furthermore, the renaming pass won't rename this register, in case the definition + * happens by a path-ending instruction in which case it would be impossible to place + * restoration code afterwards *) + let registers_to_alias = + registers_of_interest + |> Regset.filter (fun r -> if apply_map' first_defs_in_sb r = sb_length - 1 then false else true) + in + registers_to_alias + +let add_aliasing_code sb code pm : RTL.instruction list InsertPositionMap.t = + let first_pc = sb.instructions.(0) in + let pi = get_some @@ PTree.get first_pc pm in + let to_alias = registers_to_alias sb code pm in + let (initial, other) = Regset.partition (fun r -> Regset.mem r pi.input_regs) to_alias in + let to_insert = + (* If registers that are already live at the beginning of the superblock need to be + * aliased, they need to be inserted before (above) the first instruction of the + * superblock. *) + InsertPositionMap.singleton + (InsertPosition.Above sb.instructions.(0)) + (Regset.elements initial |> List.map (fun r -> Iop (Op.Omove, [r], r, Camlcoq.P.one))) + in + + let first_def_other = find_first_definition sb code other in + let to_insert = PTree.fold + (fun to_insert r idx -> + let pos = (InsertPosition.Below sb.instructions.(idx)) in + let old = + match InsertPositionMap.find_opt pos to_insert with + | None -> [] + | Some v -> v + in + let upd = (Iop (Op.Omove, [r], r, Camlcoq.P.one)) :: old in + InsertPositionMap.add pos upd to_insert) + first_def_other + to_insert + in + to_insert + +type icond_frame = + { inop_idx : int + ; icond_idx : int } + +let find_icond_frames (sb : superblock) code = + let (_last_inop_idx, frames, _i) = + ArrayLabels.fold_left sb.instructions + ~init:(None, [], 0) + ~f:(fun (last_inop_idx, frames, i) pc -> + let inst = get_some @@ PTree.get pc code in + match inst with + | Inop _ -> (Some i, frames, i + 1) + | Icond _ -> + ( match last_inop_idx with + | None -> (None, frames, i + 1) + | Some inop_idx -> (None, {inop_idx; icond_idx = i} :: frames, i + 1) ) + | _ -> (last_inop_idx, frames, i + 1) ) + in + frames + +let stage_duplication sb code staged_dupcode staged_revmap ~next_free_pc = + let module D = Duplicateaux in + (* let module P = Camlcoq.P in *) + let icond_frames = find_icond_frames sb code in + let (code, staged_dupcode, staged_revmap, next_free_pc) = + ListLabels.fold_left icond_frames + ~init:(code, staged_dupcode, staged_revmap, next_free_pc) + ~f:(fun (code, staged_dupcode, staged_revmap, next_free_pc) {inop_idx; icond_idx} -> + if (* icond_idx = Array.length sb.instructions - 1 (* Do not lift code before end of path *) + || *) icond_idx = inop_idx + 1 then (* Sentinel value that no code needs to be duplicated *) + (* do nothing *) + (code, staged_dupcode, staged_revmap, next_free_pc) + else + let pcs_to_copy = Array.sub sb.instructions (inop_idx + 1) (icond_idx - inop_idx) in + let (staged_dupcode', staged_revmap', dupcode, fwmap, next_free_pc) = D.clone_only_new code next_free_pc (Array.to_list pcs_to_copy) in + let staged_dupcode = my_merge_no_overwrite staged_dupcode staged_dupcode' in + let staged_revmap = my_merge_no_overwrite staged_revmap staged_revmap' in + let parental_icond = get_some @@ PTree.get sb.instructions.(icond_idx) code in + let (useless_icond, staged_icond) = + match parental_icond with + | Icond(cond, args, ifso, ifnot, info) -> + let staged_icond = Icond(cond, args, Camlcoq.P.of_int @@ List.hd dupcode, pcs_to_copy.(0), info) in + let useless_icond = Icond(cond, args, pcs_to_copy.(0), pcs_to_copy.(0), info) in + (useless_icond, staged_icond) + | _ -> failwith "Instruction was expected to be Icond, but is not" + in + let staged_dupcode = PTree.set sb.instructions.(inop_idx) staged_icond staged_dupcode in + + let code = PTree.set sb.instructions.(inop_idx) useless_icond code in + + (code, staged_dupcode, staged_revmap, next_free_pc) ) + in + (code, staged_dupcode, staged_revmap, next_free_pc) + +(* TODO? better name *) +let apply_aliases sb code name_map ~offset = + let code = ref code in + let name_map = ref name_map in + let length = Array.length sb.instructions in + + (* TODO: preferably there was an early exit condition when there is nothing left to do *) + for i = offset to length - 1 do + let pc = sb.instructions.(i) in + let inst = get_some @@ PTree.get pc !code in + let inst' = change_arg_regs inst !name_map in + code := PTree.set pc inst' !code; + name_map := + (match RTL.instr_defs inst with + | None -> !name_map + | Some r -> + if Option.is_some @@ PTree.get r !name_map then ( + (* The restoration code, is no longer incorrectly applicable *) + if not !Clflags.option_fpoormansssa then + PTree.remove r !name_map + else + !name_map + ) else + !name_map) + ; + done; + !code + +let scheduler f = + (* let module D = Duplicateaux in + let module P = Camlcoq.P in + let module N = Camlcoq.Nat in *) + (* TODO: - control for amount of code duplication *) + let open! Duplicateaux in + let f_rtl = f.fn_RTL in + let code = f_rtl.fn_code in + let _orig_code = code in + let id_ptree = PTree.map (fun n i -> n) (f.fn_path) in + let entry = f.fn_RTL.fn_entrypoint in + let pm = f.fn_path in + let do_nothing = ((((code, entry), pm), id_ptree), None) in + (* TODO: Add flag to select between "conserative" downard scheduling and the one that + * allows moving memory stores below the next side exit (singular). *) + if not !Clflags.option_fpoormansssa && !Clflags.option_fliftif < 1 then do_nothing + else (* NB: Early exit above *) + let _orig_pm = pm in + let typing = get_ok @@ RTLtyping.type_function f.fn_RTL in + let is_loop_header = Duplicateaux.get_loop_headers code (f_rtl.fn_entrypoint) in + let inner_loops = Duplicateaux.get_inner_loops f_rtl code is_loop_header in + (* inner loop map: map loop headers to inner loops *) + let ilmap = generate_fwmap (List.map (fun il -> il.head) inner_loops) inner_loops PTree.empty in + let superblocks = get_superblocks code entry pm typing in + + (* Get only those superblocks that span an inner loop *) + let superblocks = + if not !Clflags.option_ftargetinnerloops then + superblocks + else + List.filter (fun sb -> + (* sanity check; a superblock contains at least one instruction *) + assert (Array.length sb.instructions >= 1); + (* Check if first instruction of a superblock is the beginning of an inner loop*) + let first_pc = sb.instructions.(0) in + match PTree.get first_pc ilmap with + | None -> false + | Some(il) -> (* List.length il.body = Array.length sb.instructions *) + (match il.sb_final with + (* don't bother if the loop is not predicted to loop *) + | None -> false + | Some pc -> + pc == first_pc (* loop is spanned by the superblock *) + (* Make sure the loop does not exceed the superblock *) + && List.length il.body = Array.length sb.instructions + ) + ) superblocks + in + + let next_free_pc = next_free_pc code |> Camlcoq.P.of_int in + let next_free_reg = max_reg_function f.fn_RTL |> Camlcoq.P.succ in + + (* Apply aliasing code *) + let old_debug_flag = !debug_flag in + debug_flag := false; + debug "Initial code.\n"; + print_code code; + print_path_map pm; + print_superblocks superblocks code; + debug "\n"; + flush_all (); + + (* TODO: Is this extra aliasing logic really useless? *) + (* let (superblocks, code, pm, next_free_pc) = + ListLabels.fold_left superblocks + ~init:([], code, pm, next_free_pc) + ~f:(fun (superblocks, code, pm, next_free_pc) sb -> + let to_insert = add_aliasing_code sb code pm in + let (sb', code', pm', next_free_pc') = insert_code sb code pm to_insert ~next_free_pc in + (sb'::superblocks, code', pm', next_free_pc') ) + in + debug "After adding aliasing code.\n"; + print_code code; + print_path_map pm; + print_superblocks superblocks code; + debug "\n"; + flush_all (); *) + + let (superblocks, code, pm, next_free_pc) = + ListLabels.fold_left superblocks + ~init:([], code, pm, next_free_pc) + ~f:(fun (superblocks, code, pm, next_free_pc) sb -> + let to_insert = prepend_nops_before_iconds sb code in + let (sb', code', pm', next_free_pc', _) = insert_code sb code pm to_insert ~next_free_pc in + (sb'::superblocks, code', pm', next_free_pc') ) + in + debug "After adding nops before Iconds.\n"; + print_code code; + print_path_map pm; + print_superblocks superblocks code; + debug "\n"; + flush_all (); + + let (code, sb_renamings, next_free_reg) = + if not !Clflags.option_fpoormansssa then + (code, List.map (fun sb -> (sb, PTree.empty)) superblocks, next_free_reg) + else ( + debug "pmSSA path\n"; flush_all (); + ListLabels.fold_left superblocks + ~init:(code, [], next_free_reg) + ~f:(fun (code, sb_renamings, next_free_reg) sb -> + let first_pc = sb.instructions.(0) in + let pi = get_some @@ PTree.get first_pc pm in + let (code, live_renames, next_free_reg) = local_single_assignment sb code pi.input_regs ~next_free_reg in + (code, (sb, live_renames)::sb_renamings, next_free_reg) + ) ) + in + + debug "After renaming :\n"; + print_code code; + print_path_map pm; + print_superblocks (fst @@ List.split sb_renamings) code; + debug "\n"; + flush_all (); + + + let (sb_renamings, code, pm, next_free_pc) = ListLabels.fold_left sb_renamings + ~init:([], code, pm, next_free_pc) + ~f:(fun (sbs, code, pm, next_free_pc) (sb, live_renames) -> + let final_restoration = final_restoration_code sb code live_renames in + let (sb, code, pm, next_free_pc, fwmap) = insert_code sb code pm final_restoration ~next_free_pc in + let live_renames = + PTree.fold + (fun acc pc insts -> + let pc' = apply_map' fwmap pc in + (* Remove final renames, which were just inserted *) + let insts = if Camlcoq.P.eq pc sb.instructions.(Array.length sb.instructions - 1) then [] else insts in + PTree.set pc' insts acc) + live_renames + PTree.empty + in + ((sb, live_renames)::sbs, code, pm, next_free_pc) + ) + in + + debug "After inserting the final restoration code:\n"; + (* print_code code; *) + (* print_path_map pm; *) + print_superblocks (fst @@ List.split sb_renamings) code; + debug "\n"; + flush_all (); + + let sb_tocompensatepcs_liverenames = + ListLabels.map sb_renamings + ~f:(fun (sb, live_renames) -> + let (to_insert_compensation_pcs, live_renames) = + if !Clflags.option_prepass_past_side_exits then + (downschedule_compensation_code sb code pm live_renames ~next_free_pc ~next_free_reg + , live_renames ) + else + (InsertPositionMap.empty, live_renames) + in + (sb, to_insert_compensation_pcs, live_renames ) ) + in + + let (sb_tocompensate_liverenames, code, next_free_reg) = ListLabels.fold_left sb_tocompensatepcs_liverenames + ~init:([], code, next_free_reg) + ~f:(fun (sbs, code, next_free_reg) (sb, to_compensate_pcs, live_renames) -> + if !Clflags.option_fpoormansssa then ( + let to_compensate = InsertPositionMap.map (fun pcs -> + let insts = List.map (fun pc -> get_some @@ PTree.get pc code) pcs in + insts) + to_compensate_pcs + in + let code = InsertPositionMap.fold + (fun _pos pcs code-> + List.fold_left (fun code pc -> PTree.set pc (Inop Camlcoq.P.one) code) + code + pcs) + to_compensate_pcs + code + in + ((sb, to_compensate, live_renames)::sbs, code, next_free_reg) + ) else ( + assert (PTree.elements live_renames |> List.for_all (fun (_, l) -> l = [])); + let dup_count = InsertPositionMap.fold + (fun _pos (pcs : Camlcoq.P.t list) acc -> + let acc = ListLabels.fold_left pcs + ~init:acc + ~f:(fun acc pc -> + let old = ptree_get_or_default acc pc 0 in + PTree.set pc (old + 1) acc) + in + acc ) + to_compensate_pcs + PTree.empty + in + let pcs_dupd_twice_or_more = + PTree.filter1 (fun n -> n > 1) dup_count + |> PTree.elements + |> List.map fst + in + let arg_regs = ListLabels.fold_left pcs_dupd_twice_or_more + ~init:(Regset.empty) + ~f:(fun acc pc -> + let inst = get_some @@ PTree.get pc code in + RTL.instr_uses inst + |> List.fold_left + (fun acc reg -> Regset.add reg acc) + acc + ) + in + let pi = get_some @@ PTree.get sb.instructions.(0) pm in + let (code, live_renames, next_free_reg) = rename_regs ~only_rename:arg_regs sb code ~liveatentry:pi.input_regs ~next_free_reg in + let to_compensate = InsertPositionMap.map (fun pcs -> + let insts = List.map (fun pc -> get_some @@ PTree.get pc code) pcs in + insts) + to_compensate_pcs + in + let code = InsertPositionMap.fold + (fun _pos pcs code-> + List.fold_left (fun code pc -> PTree.set pc (Inop Camlcoq.P.one) code) + code + pcs) + to_compensate_pcs + code + in + ((sb, to_compensate, live_renames)::sbs, code, next_free_reg) )) + in + + (* Insert the compensation code (downward scheduling) and update the restoration code + * information to reflect the new pcs. *) + let (superblocks_liverenames, code, pm, next_free_pc) = ListLabels.fold_left sb_tocompensate_liverenames + ~init:([], code, pm, next_free_pc) + ~f:(fun (sbs, code, pm, next_free_pc) (sb, to_insert_compensation, live_renames) -> + let (sb, code, pm, next_free_pc, fwmap) = insert_code sb code pm to_insert_compensation ~next_free_pc in + let live_renames = + PTree.fold + (fun acc pc insts -> + let pc' = apply_map' fwmap pc in + PTree.set pc' insts acc) + live_renames + PTree.empty + in + ((sb, live_renames)::sbs, code, pm, next_free_pc) ) + in + + debug "After inserting the compensation code:\n"; + (* print_code code; *) + (* print_path_map pm; *) + print_superblocks (fst @@ List.split superblocks_liverenames) code; + debug "\n"; flush_all (); + + (* Insert the restoration code *) + let (superblocks, code, pm, next_free_pc, next_free_reg) = ListLabels.fold_left superblocks_liverenames + ~init:([], code, pm, next_free_pc, next_free_reg) + ~f:(fun (sbs, code, pm, next_free_pc, next_free_reg) (sb, live_renames) -> + let (sb, code, pm, next_free_pc, live_renames) = + if !Clflags.option_fpoormansssa then + (* Final restoration code was already inserted. *) + (sb, code, pm, next_free_pc, live_renames) + else + (* Ther combination of code motion below side exits WITHOUT register renaming may + * cause some restoration code to be necessary. Otherwise it is not safe to + * duplicate instructions. + * The final restoration code is special since it may insert below. *) + let final_restoration = final_restoration_code sb code live_renames in + let (sb, code, pm, next_free_pc, fwmap) = insert_code sb code pm final_restoration ~next_free_pc in + let live_renames = + PTree.fold + (fun acc pc insts -> + let pc' = apply_map' fwmap pc in + (* Remove final renames, which were just inserted *) + let insts = if Camlcoq.P.eq pc sb.instructions.(Array.length sb.instructions - 1) then [] else insts in + PTree.set pc' insts acc) + live_renames + PTree.empty + in + (sb, code, pm, next_free_pc, live_renames) + in + let pc_to_idx = Duplicateaux.generate_fwmap + (Array.to_list sb.instructions) + (List.init (Array.length sb.instructions) (fun i -> i)) + PTree.empty + in + let (to_insert_restoration, to_rename, next_free_reg) = restoration_instructions' sb code live_renames ~next_free_reg in + let side_exit_pcs = side_exit_pcs sb code in + let code = ListLabels.fold_left side_exit_pcs + ~init:code + ~f:(fun code side_exit_pc -> + match PTree.get side_exit_pc to_rename with + | None -> code + | Some aliases -> + let idx = apply_map' pc_to_idx side_exit_pc in + let code = apply_aliases sb code aliases ~offset:idx in + code) + in + let (sb, code, pm, next_free_pc, _fwmap) = insert_code sb code pm to_insert_restoration ~next_free_pc in + (sb::sbs, code, pm, next_free_pc, next_free_reg) ) + in + + debug "After inserting the restoration code:\n"; + (* print_code code; *) + (* print_path_map pm; *) + print_superblocks superblocks code; + debug "\n"; flush_all (); + + (* let (superblocks, code, pm, next_free_pc, next_free_reg) = + if not !Clflags.option_fpoormansssa then ( + (* In principle we need to do something like this because if we do not systematically + * rename registers, the downward scheduling might copy a series of instructions like + * this: i = i + 1, which when copied twice (to move below two side exits) is + * incorrect unless we rename the register i (or at least its second redefinition). + * However, at least the benchmarks do not seem to trigger this special case. + ListLabels.fold_left superblocks_torename + ~init:([], code, pm, next_free_pc, next_free_reg) + ~f:(fun (sbs, code, pm, next_free_pc, next_free_reg) (sb, to_rename) -> + let pi = get_some @@ PTree.get sb.instructions.(0) pm in + let liveatentry = pi.input_regs in + let (code, live_renames, next_free_reg) = rename_regs ~liveatentry ~only_rename:to_rename sb code ~next_free_reg in + let restoration_insts = restoration_instructions live_renames in + let (sb, code, pm, next_free_pc) = insert_code sb code pm restoration_insts ~next_free_pc in + (sb::sbs, code, pm, next_free_pc, next_free_reg) ) *) + (fst @@ List.split superblocks_torestore, code, pm, next_free_pc, next_free_reg) + ) else + (fst @@ List.split superblocks_torestore, code, pm, next_free_pc, next_free_reg) + in *) + + let (code, to_lift) = + if !Clflags.option_fliftif > 0 then + (* TODO: Use this flag to control for the amount of duplicated code. + * However, this is probably best controlled at the "downscheduling" level since only + * those instructions need to be actually duplicated, i.e. restoration code writing + * back the current value to renamed registers is not actually duplicated. *) + let (code, staged_dupcode, staged_revmap, next_free_pc) = ListLabels.fold_left superblocks + ~init:(code, PTree.empty, PTree.empty, (Camlcoq.P.to_int next_free_pc)) + ~f:(fun (code, staged_dupcode, staged_revmap, next_free_pc) sb -> + stage_duplication sb code staged_dupcode staged_revmap ~next_free_pc ) + in + (code, Some(staged_revmap, staged_dupcode)) + else + (code, None) + in + + debug "After staging the duplication (\"if-lifting\") :\n"; + print_code code; + print_path_map pm; + print_superblocks superblocks code; + debug "\n"; + flush_all (); + + debug_flag := old_debug_flag; + + ((((code, entry), pm), id_ptree), to_lift) |