diff options
author | David Monniaux <david.monniaux@univ-grenoble-alpes.fr> | 2020-07-07 20:46:40 +0200 |
---|---|---|
committer | David Monniaux <david.monniaux@univ-grenoble-alpes.fr> | 2020-07-07 20:46:40 +0200 |
commit | 2396cc336c7ffce0075073591c3243ace2320d18 (patch) | |
tree | 025083fa4a335147b2e2fb4d335282337a20e498 | |
parent | 962a35719c04c5325f6034fa68743c92613c3fd2 (diff) | |
download | compcert-kvx-2396cc336c7ffce0075073591c3243ace2320d18.tar.gz compcert-kvx-2396cc336c7ffce0075073591c3243ace2320d18.zip |
begin prepass scheduling oracle
-rw-r--r-- | kvx/OpWeights.ml | 16 | ||||
-rw-r--r-- | kvx/lib/PrepassSchedulingOracle.ml | 406 | ||||
-rw-r--r-- | kvx/lib/RTLpathScheduleraux.ml | 18 |
3 files changed, 426 insertions, 14 deletions
diff --git a/kvx/OpWeights.ml b/kvx/OpWeights.ml index 8396bde1..4c3c40d0 100644 --- a/kvx/OpWeights.ml +++ b/kvx/OpWeights.ml @@ -23,6 +23,17 @@ let insn_of_op op nargs = | [] -> failwith "OpWeights.insn_of_op" | h::_ -> h;; +let insns_of_cond (cond : condition) (nargs : int) = + match Asmblockgen.transl_cond_op cond + Asmvliw.GPR0 (bogus_inputs nargs) [] with + | Errors.Error msg -> failwith "OpWeights.insns_of_cond" + | Errors.OK insns -> insns;; + +let insn_of_cond cond nargs = + match insns_of_cond cond nargs with + | [] -> failwith "OpWeights.insn_of_cond" + | h::_ -> h;; + let insns_of_load trap chunk addressing (nargs : int) = match Asmblockgen.transl_load trap chunk addressing (bogus_inputs nargs) bogus_register [] with @@ -56,6 +67,11 @@ let resources_of_op (op : operation) (nargs : int) = let record = basic_rec insn in rec_to_usage record;; +let resources_of_cond (cond : condition) (nargs : int) = + let insn = insn_of_cond cond nargs in + let record = basic_rec insn in + rec_to_usage record;; + let latency_of_load trap chunk (addr : addressing) (nargs : int) = 3;; let latency_of_call _ _ = 6;; diff --git a/kvx/lib/PrepassSchedulingOracle.ml b/kvx/lib/PrepassSchedulingOracle.ml new file mode 100644 index 00000000..14239ed2 --- /dev/null +++ b/kvx/lib/PrepassSchedulingOracle.ml @@ -0,0 +1,406 @@ +open AST +open RTL +open Maps +open InstructionScheduler +open OpWeights +open Registers + +let use_alias_analysis () = false + +let length_of_chunk = function +| Mint8signed +| Mint8unsigned -> 1 +| Mint16signed +| Mint16unsigned -> 2 +| Mint32 +| Mfloat32 +| Many32 -> 4 +| Mint64 +| Mfloat64 +| Many64 -> 8;; + +let get_simple_dependencies (seqa : instruction array) = + let last_reg_reads : int list PTree.t ref = ref PTree.empty + and last_reg_write : (int*int) PTree.t ref = ref PTree.empty + and last_mem_reads : int list ref = ref [] + and last_mem_write : int option ref = ref None + and latency_constraints : latency_constraint list ref = ref [] in + let add_constraint instr_from instr_to latency = + assert (instr_from <= instr_to); + assert (latency >= 0); + if instr_from = instr_to + then (if latency = 0 + then () + else failwith "PrepassSchedulingOracle.get_dependencies: negative self-loop") + else + latency_constraints := + { instr_from = instr_from; + instr_to = instr_to; + latency = latency + }:: !latency_constraints + and get_last_reads reg = + match PTree.get reg !last_reg_reads + with Some l -> l + | None -> [] in + let add_input_mem i = + if not (use_alias_analysis ()) + then + begin + begin + (* Read after write *) + match !last_mem_write with + | None -> () + | Some j -> add_constraint j i 1 + end; + last_mem_reads := i :: !last_mem_reads + end + and add_output_mem i = + if not (use_alias_analysis ()) + then + begin + begin + (* Write after write *) + match !last_mem_write with + | None -> () + | Some j -> add_constraint j i 1 + end; + (* Write after read *) + List.iter (fun j -> add_constraint j i 0) !last_mem_reads; + last_mem_write := Some i; + last_mem_reads := [] + end + and add_input_reg i reg = + begin + (* Read after write *) + match PTree.get reg !last_reg_write with + | None -> () + | Some (j, latency) -> add_constraint j i latency + end; + last_reg_reads := PTree.set reg + (i :: get_last_reads reg) + !last_reg_reads + and add_output_reg i latency reg = + begin + (* Write after write *) + match PTree.get reg !last_reg_write with + | None -> () + | Some (j, _) -> add_constraint j i 1 + end; + begin + (* Write after read *) + List.iter (fun j -> add_constraint j i 0) (get_last_reads reg) + end; + last_reg_write := PTree.set reg (i, latency) !last_reg_write; + last_reg_reads := PTree.remove reg !last_reg_reads + in + let add_input_regs i regs = List.iter (add_input_reg i) regs in + let rec add_builtin_res i (res : reg builtin_res) = + match res with + | BR r -> add_output_reg i 10 r + | BR_none -> () + | BR_splitlong (hi, lo) -> add_builtin_res i hi; + add_builtin_res i lo in + let rec add_builtin_arg i (ba : reg builtin_arg) = + match ba with + | BA r -> add_input_reg i r + | BA_int _ | BA_long _ | BA_float _ | BA_single _ -> () + | BA_loadstack(_,_) -> add_input_mem i + | BA_addrstack _ -> () + | BA_loadglobal(_, _, _) -> add_input_mem i + | BA_addrglobal _ -> () + | BA_splitlong(hi, lo) -> add_builtin_arg i hi; + add_builtin_arg i lo + | BA_addptr(a1, a2) -> add_builtin_arg i a1; + add_builtin_arg i a2 + in + Array.iteri + begin + fun i insn -> + match insn with + | Inop _ -> () + | Iop(op, inputs, output, _) -> + add_input_regs i inputs; + add_output_reg i (latency_of_op op (List.length inputs)) output + | Iload(trap, chunk, addressing, addr_regs, output, _) -> + add_input_mem i; + add_input_regs i addr_regs; + add_output_reg i (latency_of_load trap chunk addressing (List.length addr_regs)) output + | Istore(chunk, addressing, addr_regs, input, _) -> + add_input_regs i addr_regs; + add_input_reg i input; + add_output_mem i + | Icall(signature, ef, inputs, output, _) -> + (match ef with + | Datatypes.Coq_inl r -> add_input_reg i r + | Datatypes.Coq_inr symbol -> () + ); + add_input_mem i; + add_input_regs i inputs; + add_output_reg i (latency_of_call signature ef) output; + add_output_mem i + | Itailcall(signature, ef, inputs) -> + (match ef with + | Datatypes.Coq_inl r -> add_input_reg i r + | Datatypes.Coq_inr symbol -> () + ); + add_input_mem i; + add_input_regs i inputs + | Ibuiltin(ef, builtin_inputs, builtin_output, _) -> + add_input_mem i; + List.iter (add_builtin_arg i) builtin_inputs; + add_builtin_res i builtin_output; + add_output_mem i + | Icond(cond, inputs, _, _, _) -> + add_input_regs i inputs + | Ijumptable(input, _) -> + add_input_reg i input + | Ireturn(Some input) -> + add_input_reg i input + | Ireturn(None) -> () + end seqa; + !latency_constraints;; + +let resources_of_instruction = function + | Inop _ -> Array.map (fun _ -> 0) resource_bounds + | Iop(op, inputs, output, _) -> resources_of_op op (List.length inputs) + | Iload(trap, chunk, addressing, addr_regs, output, _) -> + resources_of_load trap chunk addressing (List.length addr_regs) + | Istore(chunk, addressing, addr_regs, input, _) -> + resources_of_store chunk addressing (List.length addr_regs) + | Icall(signature, ef, inputs, output, _) -> + resources_of_call signature ef + | Ibuiltin(ef, builtin_inputs, builtin_output, _) -> + resources_of_builtin ef + | Icond(cond, args, _, _ , _) -> + resources_of_cond cond (List.length args) + | Itailcall _ | Ijumptable _ | Ireturn _ -> resource_bounds + +let print_sequence pp (seqa : instruction array) = + Array.iteri ( + fun i (insn : instruction) -> + PrintRTL.print_instruction pp (i, insn)) seqa;; + +type unique_id = int + +type 'a symbolic_term_node = + | STop of Op.operation * 'a list + | STinitial_reg of int + | STother of int;; + +type symbolic_term = { + hash_id : unique_id; + hash_ct : symbolic_term symbolic_term_node + };; + +let rec print_term channel term = + match term.hash_ct with + | STop(op, args) -> + PrintOp.print_operation print_term channel (op, args) + | STinitial_reg n -> Printf.fprintf channel "x%d" n + | STother n -> Printf.fprintf channel "y%d" n;; + +type symbolic_term_table = { + st_table : (unique_id symbolic_term_node, symbolic_term) Hashtbl.t; + mutable st_next_id : unique_id };; + +let hash_init () = { + st_table = Hashtbl.create 20; + st_next_id = 0 + };; + +let ground_to_id = function + | STop(op, l) -> STop(op, List.map (fun t -> t.hash_id) l) + | STinitial_reg r -> STinitial_reg r + | STother i -> STother i;; + +let hash_node (table : symbolic_term_table) (term : symbolic_term symbolic_term_node) : symbolic_term = + let grounded = ground_to_id term in + match Hashtbl.find_opt table.st_table grounded with + | Some x -> x + | None -> + let term' = { hash_id = table.st_next_id; + hash_ct = term } in + (if table.st_next_id = max_int then failwith "hash: max_int"); + table.st_next_id <- table.st_next_id + 1; + Hashtbl.add table.st_table grounded term'; + term';; + +type access = { + base : symbolic_term; + offset : int64; + length : int + };; + +let term_equal a b = (a.hash_id = b.hash_id);; + +let access_of_addressing get_reg chunk addressing args = + match addressing, args with + | (Op.Aindexed ofs), [reg] -> Some + { base = get_reg reg; + offset = Camlcoq.camlint64_of_ptrofs ofs; + length = length_of_chunk chunk + } + | _, _ -> None ;; +(* TODO: global *) + +let symbolic_execution (seqa : instruction array) = + let regs = ref PTree.empty + and table = hash_init() in + let assign reg term = regs := PTree.set reg term !regs + and hash term = hash_node table term in + let get_reg reg = + match PTree.get reg !regs with + | None -> hash (STinitial_reg (Camlcoq.P.to_int reg)) + | Some x -> x in + let targets = Array.make (Array.length seqa) None in + Array.iteri + begin + fun i insn -> + match insn with + | Iop(Op.Omove, [input], output, _) -> + assign output (get_reg input) + | Iop(op, inputs, output, _) -> + assign output (hash (STop(op, List.map get_reg inputs))) + + | Iload(trap, chunk, addressing, args, output, _) -> + let access = access_of_addressing get_reg chunk addressing args in + targets.(i) <- access; + assign output (hash (STother(i))) + + | Icall(_, _, _, output, _) + | Ibuiltin(_, _, BR output, _) -> + assign output (hash (STother(i))) + + | Istore(chunk, addressing, args, va, _) -> + let access = access_of_addressing get_reg chunk addressing args in + targets.(i) <- access + + | Inop _ -> () + | Ibuiltin(_, _, BR_none, _) -> () + | Ibuiltin(_, _, BR_splitlong _, _) -> failwith "BR_splitlong" + + | Itailcall (_, _, _) + |Icond (_, _, _, _, _) + |Ijumptable (_, _) + |Ireturn _ -> () + end seqa; + targets;; + +let print_access channel = function + | None -> Printf.fprintf channel "any" + | Some x -> Printf.fprintf channel "%a + %Ld" print_term x.base x.offset;; + +let print_targets channel seqa = + let targets = symbolic_execution seqa in + Array.iteri + (fun i insn -> + match insn with + | Iload _ -> Printf.fprintf channel "%d: load %a\n" + i print_access targets.(i) + | Istore _ -> Printf.fprintf channel "%d: store %a\n" + i print_access targets.(i) + | _ -> () + ) seqa;; + +let may_overlap a0 b0 = + match a0, b0 with + | (None, _) | (_ , None) -> true + | (Some a), (Some b) -> + if term_equal a.base b.base + then (max a.offset b.offset) < + (min (Int64.add (Int64.of_int a.length) a.offset) + (Int64.add (Int64.of_int b.length) b.offset)) + else match a.base.hash_ct, b.base.hash_ct with + | STop(Op.Oaddrsymbol(ida, ofsa),[]), + STop(Op.Oaddrsymbol(idb, ofsb),[]) -> + (ida=idb) && + let ao = Int64.add a.offset (Camlcoq.camlint64_of_ptrofs ofsa) + and bo = Int64.add b.offset (Camlcoq.camlint64_of_ptrofs ofsb) in + (max ao bo) < + (min (Int64.add (Int64.of_int a.length) ao) + (Int64.add (Int64.of_int b.length) bo)) + | STop(Op.Oaddrstack _, []), + STop(Op.Oaddrsymbol _, []) + | STop(Op.Oaddrsymbol _, []), + STop(Op.Oaddrstack _, []) -> false + | STop(Op.Oaddrstack(ofsa),[]), + STop(Op.Oaddrstack(ofsb),[]) -> + let ao = Int64.add a.offset (Camlcoq.camlint64_of_ptrofs ofsa) + and bo = Int64.add b.offset (Camlcoq.camlint64_of_ptrofs ofsb) in + (max ao bo) < + (min (Int64.add (Int64.of_int a.length) ao) + (Int64.add (Int64.of_int b.length) bo)) + | _ -> true;; + +(* +(* TODO suboptimal quadratic algorithm *) +let get_alias_dependencies seqa = + let targets = symbolic_execution seqa + and deps = ref [] in + let add_constraint instr_from instr_to latency = + deps := { instr_from = instr_from; + instr_to = instr_to; + latency = latency + }:: !deps in + for i=0 to (Array.length seqa)-1 + do + for j=0 to i-1 + do + match seqa.(j), seqa.(i) with + | (Istore _), ((Iload _) | (Istore _)) -> + if may_overlap targets.(j) targets.(i) + then add_constraint j i 1 + | (Iload _), (Istore _) -> + if may_overlap targets.(j) targets.(i) + then add_constraint j i 0 + | (Istore _ | Iload _), (Icall _ | Ibuiltin _) + | (Icall _ | Ibuiltin _), (Icall _ | Ibuiltin _ | Iload _ | Istore _) -> + add_constraint j i 1 + | (Inop _ | Iop _), _ + | _, (Inop _ | Iop _) + | (Iload _), (Iload _) -> () + done + done; + !deps;; + *) + +let define_problem seqa = + let simple_deps = get_simple_dependencies seqa in + { max_latency = -1; + resource_bounds = OpWeights.resource_bounds; + instruction_usages = Array.map resources_of_instruction seqa; + latency_constraints = + (* if (use_alias_analysis ()) + then (get_alias_dependencies seqa) @ simple_deps + else *) simple_deps };; + +let schedule_sequence (seqa : instruction array) = + try + if (Array.length seqa) <= 1 + then seqa + else + begin + let nr_instructions = Array.length seqa in + Printf.printf "prepass scheduling length = %d\n" (Array.length seqa); + let problem = define_problem seqa in + print_sequence stdout seqa; + print_problem stdout problem; + match reverse_list_scheduler problem + (* scheduler_by_name !Clflags.option_fprepass_sched problem *) with + | None -> Printf.printf "no solution in prepass scheduling\n"; + seqa + | Some solution -> + let positions = Array.init nr_instructions (fun i -> i) in + Array.sort (fun i j -> + let si = solution.(i) and sj = solution.(j) in + if si < sj then -1 + else if si > sj then 1 + else i - j) positions; + let reordered = Array.init nr_instructions + (fun i -> seqa.(positions.(i))) in + reordered + end + with (Failure s) -> + Printf.printf "failure in prepass scheduling: %s\n" s; + seqa;; + diff --git a/kvx/lib/RTLpathScheduleraux.ml b/kvx/lib/RTLpathScheduleraux.ml index 87d03fed..c304c6d3 100644 --- a/kvx/lib/RTLpathScheduleraux.ml +++ b/kvx/lib/RTLpathScheduleraux.ml @@ -94,24 +94,14 @@ let get_superblocks code entry pm = end (* TODO David *) -let compute_latency (insn : RTL.instruction) = - match insn with - | Inop(successor) -> 0 - | Iop(op, args, dst, successor) -> OpWeights.latency_of_op op (List.length args) - | Iload(trap, chunk, addr, args, dst, successor) -> OpWeights.latency_of_load trap chunk addr (List.length args) - | Istore(chunk, addr, args, src, successor) -> 0 - | Icond(cond, args, ifso, ifnot, suggestion) -> 0 - - | Ijumptable(arg, targets) -> 0 - | Itailcall(sign, ros, args) -> 0 - | Icall(sign, ros, args, dst, successor) -> 0 - | Ibuiltin(ef, args, res, successor) -> 0 - | Ireturn(arg) -> 0;; - let schedule_superblock sb code = let old_flag = !debug_flag in debug_flag := true; print_superblock sb code; + let _ = PrepassSchedulingOracle.schedule_sequence + (Array.map (fun i -> + match PTree.get i code with Some ii -> ii | None -> failwith "RTLpathScheduleraux.schedule_superblock") + sb.instructions) in debug_flag := old_flag; (* stub2: reverse function *) (* |