aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Monniaux <david.monniaux@univ-grenoble-alpes.fr>2020-07-07 20:46:40 +0200
committerDavid Monniaux <david.monniaux@univ-grenoble-alpes.fr>2020-07-07 20:46:40 +0200
commit2396cc336c7ffce0075073591c3243ace2320d18 (patch)
tree025083fa4a335147b2e2fb4d335282337a20e498
parent962a35719c04c5325f6034fa68743c92613c3fd2 (diff)
downloadcompcert-kvx-2396cc336c7ffce0075073591c3243ace2320d18.tar.gz
compcert-kvx-2396cc336c7ffce0075073591c3243ace2320d18.zip
begin prepass scheduling oracle
-rw-r--r--kvx/OpWeights.ml16
-rw-r--r--kvx/lib/PrepassSchedulingOracle.ml406
-rw-r--r--kvx/lib/RTLpathScheduleraux.ml18
3 files changed, 426 insertions, 14 deletions
diff --git a/kvx/OpWeights.ml b/kvx/OpWeights.ml
index 8396bde1..4c3c40d0 100644
--- a/kvx/OpWeights.ml
+++ b/kvx/OpWeights.ml
@@ -23,6 +23,17 @@ let insn_of_op op nargs =
| [] -> failwith "OpWeights.insn_of_op"
| h::_ -> h;;
+let insns_of_cond (cond : condition) (nargs : int) =
+ match Asmblockgen.transl_cond_op cond
+ Asmvliw.GPR0 (bogus_inputs nargs) [] with
+ | Errors.Error msg -> failwith "OpWeights.insns_of_cond"
+ | Errors.OK insns -> insns;;
+
+let insn_of_cond cond nargs =
+ match insns_of_cond cond nargs with
+ | [] -> failwith "OpWeights.insn_of_cond"
+ | h::_ -> h;;
+
let insns_of_load trap chunk addressing (nargs : int) =
match Asmblockgen.transl_load trap chunk addressing
(bogus_inputs nargs) bogus_register [] with
@@ -56,6 +67,11 @@ let resources_of_op (op : operation) (nargs : int) =
let record = basic_rec insn in
rec_to_usage record;;
+let resources_of_cond (cond : condition) (nargs : int) =
+ let insn = insn_of_cond cond nargs in
+ let record = basic_rec insn in
+ rec_to_usage record;;
+
let latency_of_load trap chunk (addr : addressing) (nargs : int) = 3;;
let latency_of_call _ _ = 6;;
diff --git a/kvx/lib/PrepassSchedulingOracle.ml b/kvx/lib/PrepassSchedulingOracle.ml
new file mode 100644
index 00000000..14239ed2
--- /dev/null
+++ b/kvx/lib/PrepassSchedulingOracle.ml
@@ -0,0 +1,406 @@
+open AST
+open RTL
+open Maps
+open InstructionScheduler
+open OpWeights
+open Registers
+
+let use_alias_analysis () = false
+
+let length_of_chunk = function
+| Mint8signed
+| Mint8unsigned -> 1
+| Mint16signed
+| Mint16unsigned -> 2
+| Mint32
+| Mfloat32
+| Many32 -> 4
+| Mint64
+| Mfloat64
+| Many64 -> 8;;
+
+let get_simple_dependencies (seqa : instruction array) =
+ let last_reg_reads : int list PTree.t ref = ref PTree.empty
+ and last_reg_write : (int*int) PTree.t ref = ref PTree.empty
+ and last_mem_reads : int list ref = ref []
+ and last_mem_write : int option ref = ref None
+ and latency_constraints : latency_constraint list ref = ref [] in
+ let add_constraint instr_from instr_to latency =
+ assert (instr_from <= instr_to);
+ assert (latency >= 0);
+ if instr_from = instr_to
+ then (if latency = 0
+ then ()
+ else failwith "PrepassSchedulingOracle.get_dependencies: negative self-loop")
+ else
+ latency_constraints :=
+ { instr_from = instr_from;
+ instr_to = instr_to;
+ latency = latency
+ }:: !latency_constraints
+ and get_last_reads reg =
+ match PTree.get reg !last_reg_reads
+ with Some l -> l
+ | None -> [] in
+ let add_input_mem i =
+ if not (use_alias_analysis ())
+ then
+ begin
+ begin
+ (* Read after write *)
+ match !last_mem_write with
+ | None -> ()
+ | Some j -> add_constraint j i 1
+ end;
+ last_mem_reads := i :: !last_mem_reads
+ end
+ and add_output_mem i =
+ if not (use_alias_analysis ())
+ then
+ begin
+ begin
+ (* Write after write *)
+ match !last_mem_write with
+ | None -> ()
+ | Some j -> add_constraint j i 1
+ end;
+ (* Write after read *)
+ List.iter (fun j -> add_constraint j i 0) !last_mem_reads;
+ last_mem_write := Some i;
+ last_mem_reads := []
+ end
+ and add_input_reg i reg =
+ begin
+ (* Read after write *)
+ match PTree.get reg !last_reg_write with
+ | None -> ()
+ | Some (j, latency) -> add_constraint j i latency
+ end;
+ last_reg_reads := PTree.set reg
+ (i :: get_last_reads reg)
+ !last_reg_reads
+ and add_output_reg i latency reg =
+ begin
+ (* Write after write *)
+ match PTree.get reg !last_reg_write with
+ | None -> ()
+ | Some (j, _) -> add_constraint j i 1
+ end;
+ begin
+ (* Write after read *)
+ List.iter (fun j -> add_constraint j i 0) (get_last_reads reg)
+ end;
+ last_reg_write := PTree.set reg (i, latency) !last_reg_write;
+ last_reg_reads := PTree.remove reg !last_reg_reads
+ in
+ let add_input_regs i regs = List.iter (add_input_reg i) regs in
+ let rec add_builtin_res i (res : reg builtin_res) =
+ match res with
+ | BR r -> add_output_reg i 10 r
+ | BR_none -> ()
+ | BR_splitlong (hi, lo) -> add_builtin_res i hi;
+ add_builtin_res i lo in
+ let rec add_builtin_arg i (ba : reg builtin_arg) =
+ match ba with
+ | BA r -> add_input_reg i r
+ | BA_int _ | BA_long _ | BA_float _ | BA_single _ -> ()
+ | BA_loadstack(_,_) -> add_input_mem i
+ | BA_addrstack _ -> ()
+ | BA_loadglobal(_, _, _) -> add_input_mem i
+ | BA_addrglobal _ -> ()
+ | BA_splitlong(hi, lo) -> add_builtin_arg i hi;
+ add_builtin_arg i lo
+ | BA_addptr(a1, a2) -> add_builtin_arg i a1;
+ add_builtin_arg i a2
+ in
+ Array.iteri
+ begin
+ fun i insn ->
+ match insn with
+ | Inop _ -> ()
+ | Iop(op, inputs, output, _) ->
+ add_input_regs i inputs;
+ add_output_reg i (latency_of_op op (List.length inputs)) output
+ | Iload(trap, chunk, addressing, addr_regs, output, _) ->
+ add_input_mem i;
+ add_input_regs i addr_regs;
+ add_output_reg i (latency_of_load trap chunk addressing (List.length addr_regs)) output
+ | Istore(chunk, addressing, addr_regs, input, _) ->
+ add_input_regs i addr_regs;
+ add_input_reg i input;
+ add_output_mem i
+ | Icall(signature, ef, inputs, output, _) ->
+ (match ef with
+ | Datatypes.Coq_inl r -> add_input_reg i r
+ | Datatypes.Coq_inr symbol -> ()
+ );
+ add_input_mem i;
+ add_input_regs i inputs;
+ add_output_reg i (latency_of_call signature ef) output;
+ add_output_mem i
+ | Itailcall(signature, ef, inputs) ->
+ (match ef with
+ | Datatypes.Coq_inl r -> add_input_reg i r
+ | Datatypes.Coq_inr symbol -> ()
+ );
+ add_input_mem i;
+ add_input_regs i inputs
+ | Ibuiltin(ef, builtin_inputs, builtin_output, _) ->
+ add_input_mem i;
+ List.iter (add_builtin_arg i) builtin_inputs;
+ add_builtin_res i builtin_output;
+ add_output_mem i
+ | Icond(cond, inputs, _, _, _) ->
+ add_input_regs i inputs
+ | Ijumptable(input, _) ->
+ add_input_reg i input
+ | Ireturn(Some input) ->
+ add_input_reg i input
+ | Ireturn(None) -> ()
+ end seqa;
+ !latency_constraints;;
+
+let resources_of_instruction = function
+ | Inop _ -> Array.map (fun _ -> 0) resource_bounds
+ | Iop(op, inputs, output, _) -> resources_of_op op (List.length inputs)
+ | Iload(trap, chunk, addressing, addr_regs, output, _) ->
+ resources_of_load trap chunk addressing (List.length addr_regs)
+ | Istore(chunk, addressing, addr_regs, input, _) ->
+ resources_of_store chunk addressing (List.length addr_regs)
+ | Icall(signature, ef, inputs, output, _) ->
+ resources_of_call signature ef
+ | Ibuiltin(ef, builtin_inputs, builtin_output, _) ->
+ resources_of_builtin ef
+ | Icond(cond, args, _, _ , _) ->
+ resources_of_cond cond (List.length args)
+ | Itailcall _ | Ijumptable _ | Ireturn _ -> resource_bounds
+
+let print_sequence pp (seqa : instruction array) =
+ Array.iteri (
+ fun i (insn : instruction) ->
+ PrintRTL.print_instruction pp (i, insn)) seqa;;
+
+type unique_id = int
+
+type 'a symbolic_term_node =
+ | STop of Op.operation * 'a list
+ | STinitial_reg of int
+ | STother of int;;
+
+type symbolic_term = {
+ hash_id : unique_id;
+ hash_ct : symbolic_term symbolic_term_node
+ };;
+
+let rec print_term channel term =
+ match term.hash_ct with
+ | STop(op, args) ->
+ PrintOp.print_operation print_term channel (op, args)
+ | STinitial_reg n -> Printf.fprintf channel "x%d" n
+ | STother n -> Printf.fprintf channel "y%d" n;;
+
+type symbolic_term_table = {
+ st_table : (unique_id symbolic_term_node, symbolic_term) Hashtbl.t;
+ mutable st_next_id : unique_id };;
+
+let hash_init () = {
+ st_table = Hashtbl.create 20;
+ st_next_id = 0
+ };;
+
+let ground_to_id = function
+ | STop(op, l) -> STop(op, List.map (fun t -> t.hash_id) l)
+ | STinitial_reg r -> STinitial_reg r
+ | STother i -> STother i;;
+
+let hash_node (table : symbolic_term_table) (term : symbolic_term symbolic_term_node) : symbolic_term =
+ let grounded = ground_to_id term in
+ match Hashtbl.find_opt table.st_table grounded with
+ | Some x -> x
+ | None ->
+ let term' = { hash_id = table.st_next_id;
+ hash_ct = term } in
+ (if table.st_next_id = max_int then failwith "hash: max_int");
+ table.st_next_id <- table.st_next_id + 1;
+ Hashtbl.add table.st_table grounded term';
+ term';;
+
+type access = {
+ base : symbolic_term;
+ offset : int64;
+ length : int
+ };;
+
+let term_equal a b = (a.hash_id = b.hash_id);;
+
+let access_of_addressing get_reg chunk addressing args =
+ match addressing, args with
+ | (Op.Aindexed ofs), [reg] -> Some
+ { base = get_reg reg;
+ offset = Camlcoq.camlint64_of_ptrofs ofs;
+ length = length_of_chunk chunk
+ }
+ | _, _ -> None ;;
+(* TODO: global *)
+
+let symbolic_execution (seqa : instruction array) =
+ let regs = ref PTree.empty
+ and table = hash_init() in
+ let assign reg term = regs := PTree.set reg term !regs
+ and hash term = hash_node table term in
+ let get_reg reg =
+ match PTree.get reg !regs with
+ | None -> hash (STinitial_reg (Camlcoq.P.to_int reg))
+ | Some x -> x in
+ let targets = Array.make (Array.length seqa) None in
+ Array.iteri
+ begin
+ fun i insn ->
+ match insn with
+ | Iop(Op.Omove, [input], output, _) ->
+ assign output (get_reg input)
+ | Iop(op, inputs, output, _) ->
+ assign output (hash (STop(op, List.map get_reg inputs)))
+
+ | Iload(trap, chunk, addressing, args, output, _) ->
+ let access = access_of_addressing get_reg chunk addressing args in
+ targets.(i) <- access;
+ assign output (hash (STother(i)))
+
+ | Icall(_, _, _, output, _)
+ | Ibuiltin(_, _, BR output, _) ->
+ assign output (hash (STother(i)))
+
+ | Istore(chunk, addressing, args, va, _) ->
+ let access = access_of_addressing get_reg chunk addressing args in
+ targets.(i) <- access
+
+ | Inop _ -> ()
+ | Ibuiltin(_, _, BR_none, _) -> ()
+ | Ibuiltin(_, _, BR_splitlong _, _) -> failwith "BR_splitlong"
+
+ | Itailcall (_, _, _)
+ |Icond (_, _, _, _, _)
+ |Ijumptable (_, _)
+ |Ireturn _ -> ()
+ end seqa;
+ targets;;
+
+let print_access channel = function
+ | None -> Printf.fprintf channel "any"
+ | Some x -> Printf.fprintf channel "%a + %Ld" print_term x.base x.offset;;
+
+let print_targets channel seqa =
+ let targets = symbolic_execution seqa in
+ Array.iteri
+ (fun i insn ->
+ match insn with
+ | Iload _ -> Printf.fprintf channel "%d: load %a\n"
+ i print_access targets.(i)
+ | Istore _ -> Printf.fprintf channel "%d: store %a\n"
+ i print_access targets.(i)
+ | _ -> ()
+ ) seqa;;
+
+let may_overlap a0 b0 =
+ match a0, b0 with
+ | (None, _) | (_ , None) -> true
+ | (Some a), (Some b) ->
+ if term_equal a.base b.base
+ then (max a.offset b.offset) <
+ (min (Int64.add (Int64.of_int a.length) a.offset)
+ (Int64.add (Int64.of_int b.length) b.offset))
+ else match a.base.hash_ct, b.base.hash_ct with
+ | STop(Op.Oaddrsymbol(ida, ofsa),[]),
+ STop(Op.Oaddrsymbol(idb, ofsb),[]) ->
+ (ida=idb) &&
+ let ao = Int64.add a.offset (Camlcoq.camlint64_of_ptrofs ofsa)
+ and bo = Int64.add b.offset (Camlcoq.camlint64_of_ptrofs ofsb) in
+ (max ao bo) <
+ (min (Int64.add (Int64.of_int a.length) ao)
+ (Int64.add (Int64.of_int b.length) bo))
+ | STop(Op.Oaddrstack _, []),
+ STop(Op.Oaddrsymbol _, [])
+ | STop(Op.Oaddrsymbol _, []),
+ STop(Op.Oaddrstack _, []) -> false
+ | STop(Op.Oaddrstack(ofsa),[]),
+ STop(Op.Oaddrstack(ofsb),[]) ->
+ let ao = Int64.add a.offset (Camlcoq.camlint64_of_ptrofs ofsa)
+ and bo = Int64.add b.offset (Camlcoq.camlint64_of_ptrofs ofsb) in
+ (max ao bo) <
+ (min (Int64.add (Int64.of_int a.length) ao)
+ (Int64.add (Int64.of_int b.length) bo))
+ | _ -> true;;
+
+(*
+(* TODO suboptimal quadratic algorithm *)
+let get_alias_dependencies seqa =
+ let targets = symbolic_execution seqa
+ and deps = ref [] in
+ let add_constraint instr_from instr_to latency =
+ deps := { instr_from = instr_from;
+ instr_to = instr_to;
+ latency = latency
+ }:: !deps in
+ for i=0 to (Array.length seqa)-1
+ do
+ for j=0 to i-1
+ do
+ match seqa.(j), seqa.(i) with
+ | (Istore _), ((Iload _) | (Istore _)) ->
+ if may_overlap targets.(j) targets.(i)
+ then add_constraint j i 1
+ | (Iload _), (Istore _) ->
+ if may_overlap targets.(j) targets.(i)
+ then add_constraint j i 0
+ | (Istore _ | Iload _), (Icall _ | Ibuiltin _)
+ | (Icall _ | Ibuiltin _), (Icall _ | Ibuiltin _ | Iload _ | Istore _) ->
+ add_constraint j i 1
+ | (Inop _ | Iop _), _
+ | _, (Inop _ | Iop _)
+ | (Iload _), (Iload _) -> ()
+ done
+ done;
+ !deps;;
+ *)
+
+let define_problem seqa =
+ let simple_deps = get_simple_dependencies seqa in
+ { max_latency = -1;
+ resource_bounds = OpWeights.resource_bounds;
+ instruction_usages = Array.map resources_of_instruction seqa;
+ latency_constraints =
+ (* if (use_alias_analysis ())
+ then (get_alias_dependencies seqa) @ simple_deps
+ else *) simple_deps };;
+
+let schedule_sequence (seqa : instruction array) =
+ try
+ if (Array.length seqa) <= 1
+ then seqa
+ else
+ begin
+ let nr_instructions = Array.length seqa in
+ Printf.printf "prepass scheduling length = %d\n" (Array.length seqa);
+ let problem = define_problem seqa in
+ print_sequence stdout seqa;
+ print_problem stdout problem;
+ match reverse_list_scheduler problem
+ (* scheduler_by_name !Clflags.option_fprepass_sched problem *) with
+ | None -> Printf.printf "no solution in prepass scheduling\n";
+ seqa
+ | Some solution ->
+ let positions = Array.init nr_instructions (fun i -> i) in
+ Array.sort (fun i j ->
+ let si = solution.(i) and sj = solution.(j) in
+ if si < sj then -1
+ else if si > sj then 1
+ else i - j) positions;
+ let reordered = Array.init nr_instructions
+ (fun i -> seqa.(positions.(i))) in
+ reordered
+ end
+ with (Failure s) ->
+ Printf.printf "failure in prepass scheduling: %s\n" s;
+ seqa;;
+
diff --git a/kvx/lib/RTLpathScheduleraux.ml b/kvx/lib/RTLpathScheduleraux.ml
index 87d03fed..c304c6d3 100644
--- a/kvx/lib/RTLpathScheduleraux.ml
+++ b/kvx/lib/RTLpathScheduleraux.ml
@@ -94,24 +94,14 @@ let get_superblocks code entry pm =
end
(* TODO David *)
-let compute_latency (insn : RTL.instruction) =
- match insn with
- | Inop(successor) -> 0
- | Iop(op, args, dst, successor) -> OpWeights.latency_of_op op (List.length args)
- | Iload(trap, chunk, addr, args, dst, successor) -> OpWeights.latency_of_load trap chunk addr (List.length args)
- | Istore(chunk, addr, args, src, successor) -> 0
- | Icond(cond, args, ifso, ifnot, suggestion) -> 0
-
- | Ijumptable(arg, targets) -> 0
- | Itailcall(sign, ros, args) -> 0
- | Icall(sign, ros, args, dst, successor) -> 0
- | Ibuiltin(ef, args, res, successor) -> 0
- | Ireturn(arg) -> 0;;
-
let schedule_superblock sb code =
let old_flag = !debug_flag in
debug_flag := true;
print_superblock sb code;
+ let _ = PrepassSchedulingOracle.schedule_sequence
+ (Array.map (fun i ->
+ match PTree.get i code with Some ii -> ii | None -> failwith "RTLpathScheduleraux.schedule_superblock")
+ sb.instructions) in
debug_flag := old_flag;
(* stub2: reverse function *)
(*