From 825b77fe8b4eb0919564e51cfaae69a6dfae24e3 Mon Sep 17 00:00:00 2001 From: David Monniaux Date: Fri, 2 Oct 2020 21:54:34 +0200 Subject: so that all architectures compile --- aarch64/PrepassSchedulingOracle.ml | 473 ++++++++++++++++++++++++++++++++++ arm/PrepassSchedulingOracle.ml | 1 + kvx/PrepassSchedulingOracle.ml | 1 + powerpc/PrepassSchedulingOracle.ml | 1 + riscV/OpWeights.ml | 23 +- riscV/PrepassSchedulingOracle.ml | 1 + scheduling/PrepassSchedulingOracle.ml | 473 ---------------------------------- x86/PrepassSchedulingOracle.ml | 5 + 8 files changed, 497 insertions(+), 481 deletions(-) create mode 100644 aarch64/PrepassSchedulingOracle.ml create mode 120000 arm/PrepassSchedulingOracle.ml create mode 120000 kvx/PrepassSchedulingOracle.ml create mode 120000 powerpc/PrepassSchedulingOracle.ml create mode 120000 riscV/PrepassSchedulingOracle.ml delete mode 100644 scheduling/PrepassSchedulingOracle.ml create mode 100644 x86/PrepassSchedulingOracle.ml diff --git a/aarch64/PrepassSchedulingOracle.ml b/aarch64/PrepassSchedulingOracle.ml new file mode 100644 index 00000000..25083bcd --- /dev/null +++ b/aarch64/PrepassSchedulingOracle.ml @@ -0,0 +1,473 @@ +open AST +open RTL +open Maps +open InstructionScheduler +open OpWeights +open Registers + +let use_alias_analysis () = false + +let length_of_chunk = function +| Mint8signed +| Mint8unsigned -> 1 +| Mint16signed +| Mint16unsigned -> 2 +| Mint32 +| Mfloat32 +| Many32 -> 4 +| Mint64 +| Mfloat64 +| Many64 -> 8;; + +let get_simple_dependencies (seqa : (instruction*Regset.t) array) = + let last_reg_reads : int list PTree.t ref = ref PTree.empty + and last_reg_write : (int*int) PTree.t ref = ref PTree.empty + and last_mem_reads : int list ref = ref [] + and last_mem_write : int option ref = ref None + and last_branch : int option ref = ref None + and last_non_pipelined_op : int array = Array.make + nr_non_pipelined_units ( -1 ) + and latency_constraints : latency_constraint list ref = ref [] in + let add_constraint instr_from instr_to latency = + assert (instr_from <= instr_to); + assert (latency >= 0); + if instr_from = instr_to + then (if latency = 0 + then () + else failwith "PrepassSchedulingOracle.get_dependencies: negative self-loop") + else + latency_constraints := + { instr_from = instr_from; + instr_to = instr_to; + latency = latency + }:: !latency_constraints + and get_last_reads reg = + match PTree.get reg !last_reg_reads + with Some l -> l + | None -> [] in + let add_input_mem i = + if not (use_alias_analysis ()) + then + begin + begin + (* Read after write *) + match !last_mem_write with + | None -> () + | Some j -> add_constraint j i 1 + end; + last_mem_reads := i :: !last_mem_reads + end + and add_output_mem i = + if not (use_alias_analysis ()) + then + begin + begin + (* Write after write *) + match !last_mem_write with + | None -> () + | Some j -> add_constraint j i 1 + end; + (* Write after read *) + List.iter (fun j -> add_constraint j i 0) !last_mem_reads; + last_mem_write := Some i; + last_mem_reads := [] + end + and add_input_reg i reg = + begin + (* Read after write *) + match PTree.get reg !last_reg_write with + | None -> () + | Some (j, latency) -> add_constraint j i latency + end; + last_reg_reads := PTree.set reg + (i :: get_last_reads reg) + !last_reg_reads + and add_output_reg i latency reg = + begin + (* Write after write *) + match PTree.get reg !last_reg_write with + | None -> () + | Some (j, _) -> add_constraint j i 1 + end; + begin + (* Write after read *) + List.iter (fun j -> add_constraint j i 0) (get_last_reads reg) + end; + last_reg_write := PTree.set reg (i, latency) !last_reg_write; + last_reg_reads := PTree.remove reg !last_reg_reads + in + let add_input_regs i regs = List.iter (add_input_reg i) regs in + let rec add_builtin_res i (res : reg builtin_res) = + match res with + | BR r -> add_output_reg i 10 r + | BR_none -> () + | BR_splitlong (hi, lo) -> add_builtin_res i hi; + add_builtin_res i lo in + let rec add_builtin_arg i (ba : reg builtin_arg) = + match ba with + | BA r -> add_input_reg i r + | BA_int _ | BA_long _ | BA_float _ | BA_single _ -> () + | BA_loadstack(_,_) -> add_input_mem i + | BA_addrstack _ -> () + | BA_loadglobal(_, _, _) -> add_input_mem i + | BA_addrglobal _ -> () + | BA_splitlong(hi, lo) -> add_builtin_arg i hi; + add_builtin_arg i lo + | BA_addptr(a1, a2) -> add_builtin_arg i a1; + add_builtin_arg i a2 in + let irreversible_action i = + match !last_branch with + | None -> () + | Some j -> add_constraint j i 1 in + let set_branch i = + irreversible_action i; + last_branch := Some i in + let add_non_pipelined_resources i resources = + Array.iter2 + (fun latency last -> + if latency >= 0 && last >= 0 then add_constraint last i latency) + resources last_non_pipelined_op; + Array.iteri (fun rsc latency -> + if latency >= 0 + then last_non_pipelined_op.(rsc) <- i) resources + in + Array.iteri + begin + fun i (insn, other_uses) -> + List.iter (fun use -> + add_input_reg i use) + (Regset.elements other_uses); + + match insn with + | Inop _ -> () + | Iop(op, inputs, output, _) -> + add_non_pipelined_resources i + (non_pipelined_resources_of_op op (List.length inputs)); + (if Op.is_trapping_op op then irreversible_action i); + add_input_regs i inputs; + add_output_reg i (latency_of_op op (List.length inputs)) output + | Iload(trap, chunk, addressing, addr_regs, output, _) -> + (if trap=TRAP then irreversible_action i); + add_input_mem i; + add_input_regs i addr_regs; + add_output_reg i (latency_of_load trap chunk addressing (List.length addr_regs)) output + | Istore(chunk, addressing, addr_regs, input, _) -> + irreversible_action i; + add_input_regs i addr_regs; + add_input_reg i input; + add_output_mem i + | Icall(signature, ef, inputs, output, _) -> + set_branch i; + (match ef with + | Datatypes.Coq_inl r -> add_input_reg i r + | Datatypes.Coq_inr symbol -> () + ); + add_input_mem i; + add_input_regs i inputs; + add_output_reg i (latency_of_call signature ef) output; + add_output_mem i; + failwith "Icall" + | Itailcall(signature, ef, inputs) -> + set_branch i; + (match ef with + | Datatypes.Coq_inl r -> add_input_reg i r + | Datatypes.Coq_inr symbol -> () + ); + add_input_mem i; + add_input_regs i inputs; + failwith "Itailcall" + | Ibuiltin(ef, builtin_inputs, builtin_output, _) -> + set_branch i; + add_input_mem i; + List.iter (add_builtin_arg i) builtin_inputs; + add_builtin_res i builtin_output; + add_output_mem i; + failwith "Ibuiltin" + | Icond(cond, inputs, _, _, _) -> + set_branch i; + add_input_mem i; + add_input_regs i inputs + | Ijumptable(input, _) -> + set_branch i; + add_input_reg i input; + failwith "Ijumptable" + | Ireturn(Some input) -> + set_branch i; + add_input_reg i input; + failwith "Ireturn" + | Ireturn(None) -> + set_branch i; + failwith "Ireturn none" + end seqa; + !latency_constraints;; + +let resources_of_instruction = function + | Inop _ -> Array.map (fun _ -> 0) resource_bounds + | Iop(op, inputs, output, _) -> resources_of_op op (List.length inputs) + | Iload(trap, chunk, addressing, addr_regs, output, _) -> + resources_of_load trap chunk addressing (List.length addr_regs) + | Istore(chunk, addressing, addr_regs, input, _) -> + resources_of_store chunk addressing (List.length addr_regs) + | Icall(signature, ef, inputs, output, _) -> + resources_of_call signature ef + | Ibuiltin(ef, builtin_inputs, builtin_output, _) -> + resources_of_builtin ef + | Icond(cond, args, _, _ , _) -> + resources_of_cond cond (List.length args) + | Itailcall _ | Ijumptable _ | Ireturn _ -> resource_bounds + +let print_sequence pp (seqa : instruction array) = + Array.iteri ( + fun i (insn : instruction) -> + PrintRTL.print_instruction pp (i, insn)) seqa;; + +type unique_id = int + +type 'a symbolic_term_node = + | STop of Op.operation * 'a list + | STinitial_reg of int + | STother of int;; + +type symbolic_term = { + hash_id : unique_id; + hash_ct : symbolic_term symbolic_term_node + };; + +let rec print_term channel term = + match term.hash_ct with + | STop(op, args) -> + PrintOp.print_operation print_term channel (op, args) + | STinitial_reg n -> Printf.fprintf channel "x%d" n + | STother n -> Printf.fprintf channel "y%d" n;; + +type symbolic_term_table = { + st_table : (unique_id symbolic_term_node, symbolic_term) Hashtbl.t; + mutable st_next_id : unique_id };; + +let hash_init () = { + st_table = Hashtbl.create 20; + st_next_id = 0 + };; + +let ground_to_id = function + | STop(op, l) -> STop(op, List.map (fun t -> t.hash_id) l) + | STinitial_reg r -> STinitial_reg r + | STother i -> STother i;; + +let hash_node (table : symbolic_term_table) (term : symbolic_term symbolic_term_node) : symbolic_term = + let grounded = ground_to_id term in + match Hashtbl.find_opt table.st_table grounded with + | Some x -> x + | None -> + let term' = { hash_id = table.st_next_id; + hash_ct = term } in + (if table.st_next_id = max_int then failwith "hash: max_int"); + table.st_next_id <- table.st_next_id + 1; + Hashtbl.add table.st_table grounded term'; + term';; + +type access = { + base : symbolic_term; + offset : int64; + length : int + };; + +let term_equal a b = (a.hash_id = b.hash_id);; + +let access_of_addressing get_reg chunk addressing args = + match addressing, args with + | (Op.Aindexed ofs), [reg] -> Some + { base = get_reg reg; + offset = Camlcoq.camlint64_of_ptrofs ofs; + length = length_of_chunk chunk + } + | _, _ -> None ;; +(* TODO: global *) + +let symbolic_execution (seqa : instruction array) = + let regs = ref PTree.empty + and table = hash_init() in + let assign reg term = regs := PTree.set reg term !regs + and hash term = hash_node table term in + let get_reg reg = + match PTree.get reg !regs with + | None -> hash (STinitial_reg (Camlcoq.P.to_int reg)) + | Some x -> x in + let targets = Array.make (Array.length seqa) None in + Array.iteri + begin + fun i insn -> + match insn with + | Iop(Op.Omove, [input], output, _) -> + assign output (get_reg input) + | Iop(op, inputs, output, _) -> + assign output (hash (STop(op, List.map get_reg inputs))) + + | Iload(trap, chunk, addressing, args, output, _) -> + let access = access_of_addressing get_reg chunk addressing args in + targets.(i) <- access; + assign output (hash (STother(i))) + + | Icall(_, _, _, output, _) + | Ibuiltin(_, _, BR output, _) -> + assign output (hash (STother(i))) + + | Istore(chunk, addressing, args, va, _) -> + let access = access_of_addressing get_reg chunk addressing args in + targets.(i) <- access + + | Inop _ -> () + | Ibuiltin(_, _, BR_none, _) -> () + | Ibuiltin(_, _, BR_splitlong _, _) -> failwith "BR_splitlong" + + | Itailcall (_, _, _) + |Icond (_, _, _, _, _) + |Ijumptable (_, _) + |Ireturn _ -> () + end seqa; + targets;; + +let print_access channel = function + | None -> Printf.fprintf channel "any" + | Some x -> Printf.fprintf channel "%a + %Ld" print_term x.base x.offset;; + +let print_targets channel seqa = + let targets = symbolic_execution seqa in + Array.iteri + (fun i insn -> + match insn with + | Iload _ -> Printf.fprintf channel "%d: load %a\n" + i print_access targets.(i) + | Istore _ -> Printf.fprintf channel "%d: store %a\n" + i print_access targets.(i) + | _ -> () + ) seqa;; + +let may_overlap a0 b0 = + match a0, b0 with + | (None, _) | (_ , None) -> true + | (Some a), (Some b) -> + if term_equal a.base b.base + then (max a.offset b.offset) < + (min (Int64.add (Int64.of_int a.length) a.offset) + (Int64.add (Int64.of_int b.length) b.offset)) + else match a.base.hash_ct, b.base.hash_ct with + | STop(Op.Oaddrsymbol(ida, ofsa),[]), + STop(Op.Oaddrsymbol(idb, ofsb),[]) -> + (ida=idb) && + let ao = Int64.add a.offset (Camlcoq.camlint64_of_ptrofs ofsa) + and bo = Int64.add b.offset (Camlcoq.camlint64_of_ptrofs ofsb) in + (max ao bo) < + (min (Int64.add (Int64.of_int a.length) ao) + (Int64.add (Int64.of_int b.length) bo)) + | STop(Op.Oaddrstack _, []), + STop(Op.Oaddrsymbol _, []) + | STop(Op.Oaddrsymbol _, []), + STop(Op.Oaddrstack _, []) -> false + | STop(Op.Oaddrstack(ofsa),[]), + STop(Op.Oaddrstack(ofsb),[]) -> + let ao = Int64.add a.offset (Camlcoq.camlint64_of_ptrofs ofsa) + and bo = Int64.add b.offset (Camlcoq.camlint64_of_ptrofs ofsb) in + (max ao bo) < + (min (Int64.add (Int64.of_int a.length) ao) + (Int64.add (Int64.of_int b.length) bo)) + | _ -> true;; + +(* +(* TODO suboptimal quadratic algorithm *) +let get_alias_dependencies seqa = + let targets = symbolic_execution seqa + and deps = ref [] in + let add_constraint instr_from instr_to latency = + deps := { instr_from = instr_from; + instr_to = instr_to; + latency = latency + }:: !deps in + for i=0 to (Array.length seqa)-1 + do + for j=0 to i-1 + do + match seqa.(j), seqa.(i) with + | (Istore _), ((Iload _) | (Istore _)) -> + if may_overlap targets.(j) targets.(i) + then add_constraint j i 1 + | (Iload _), (Istore _) -> + if may_overlap targets.(j) targets.(i) + then add_constraint j i 0 + | (Istore _ | Iload _), (Icall _ | Ibuiltin _) + | (Icall _ | Ibuiltin _), (Icall _ | Ibuiltin _ | Iload _ | Istore _) -> + add_constraint j i 1 + | (Inop _ | Iop _), _ + | _, (Inop _ | Iop _) + | (Iload _), (Iload _) -> () + done + done; + !deps;; + *) + +let define_problem seqa = + let simple_deps = get_simple_dependencies seqa in + { max_latency = -1; + resource_bounds = OpWeights.resource_bounds; + instruction_usages = Array.map resources_of_instruction (Array.map fst seqa); + latency_constraints = + (* if (use_alias_analysis ()) + then (get_alias_dependencies seqa) @ simple_deps + else *) simple_deps };; + +let zigzag_scheduler problem early_ones = + let nr_instructions = get_nr_instructions problem in + assert(nr_instructions = (Array.length early_ones)); + match list_scheduler problem with + | Some fwd_schedule -> + let fwd_makespan = fwd_schedule.((Array.length fwd_schedule) - 1) in + let constraints' = ref problem.latency_constraints in + Array.iteri (fun i is_early -> + if is_early then + constraints' := { + instr_from = i; + instr_to = nr_instructions ; + latency = fwd_makespan - fwd_schedule.(i) } ::!constraints' ) + early_ones; + validated_scheduler reverse_list_scheduler + { problem with latency_constraints = !constraints' } + | None -> None;; + +let prepass_scheduler_by_name name problem early_ones = + match name with + | "zigzag" -> zigzag_scheduler problem early_ones + | _ -> scheduler_by_name name problem + +let schedule_sequence (seqa : (instruction*Regset.t) array) = + try + if (Array.length seqa) <= 1 + then None + else + begin + let nr_instructions = Array.length seqa in + Printf.printf "prepass scheduling length = %d\n" (Array.length seqa); + let problem = define_problem seqa in + print_sequence stdout (Array.map fst seqa); + print_problem stdout problem; + match prepass_scheduler_by_name + (!Clflags.option_fprepass_sched) + problem + (Array.map (fun (ins, _) -> + match ins with + | Icond _ -> true + | _ -> false) seqa) with + | None -> Printf.printf "no solution in prepass scheduling\n"; + None + | Some solution -> + let positions = Array.init nr_instructions (fun i -> i) in + Array.sort (fun i j -> + let si = solution.(i) and sj = solution.(j) in + if si < sj then -1 + else if si > sj then 1 + else i - j) positions; + Some positions + end + with (Failure s) -> + Printf.printf "failure in prepass scheduling: %s\n" s; + None;; + diff --git a/arm/PrepassSchedulingOracle.ml b/arm/PrepassSchedulingOracle.ml new file mode 120000 index 00000000..9885fd52 --- /dev/null +++ b/arm/PrepassSchedulingOracle.ml @@ -0,0 +1 @@ +../x86/PrepassSchedulingOracle.ml \ No newline at end of file diff --git a/kvx/PrepassSchedulingOracle.ml b/kvx/PrepassSchedulingOracle.ml new file mode 120000 index 00000000..912e9ffa --- /dev/null +++ b/kvx/PrepassSchedulingOracle.ml @@ -0,0 +1 @@ +../aarch64/PrepassSchedulingOracle.ml \ No newline at end of file diff --git a/powerpc/PrepassSchedulingOracle.ml b/powerpc/PrepassSchedulingOracle.ml new file mode 120000 index 00000000..9885fd52 --- /dev/null +++ b/powerpc/PrepassSchedulingOracle.ml @@ -0,0 +1 @@ +../x86/PrepassSchedulingOracle.ml \ No newline at end of file diff --git a/riscV/OpWeights.ml b/riscV/OpWeights.ml index 88ef5216..143435c1 100644 --- a/riscV/OpWeights.ml +++ b/riscV/OpWeights.ml @@ -1,9 +1,9 @@ open Op;; (* Attempt at modeling SweRV EH1 -[| issues ; LSU ; multiplier ; divider |] *) -let resource_bounds = [| 2 ; 1; 1; 2 |];; - +[| issues ; LSU ; multiplier |] *) +let resource_bounds = [| 2 ; 1; 1 |];; +let nr_non_pipelined_units = 1;; (* divider *) let latency_of_op (op : operation) (nargs : int) = match op with @@ -15,18 +15,25 @@ let latency_of_op (op : operation) (nargs : int) = let resources_of_op (op : operation) (nargs : int) = match op with | Omul | Omulhs | Omulhu - | Omull | Omullhs | Omullhu -> [| 1 ; 0 ; 1; 0 |] - | Odiv | Odivu | Odivl | Odivlu -> [| 0 ; 0; 0; 1 |] + | Omull | Omullhs | Omullhu -> [| 1 ; 0 ; 1 |] + | Odiv | Odivu | Odivl | Odivlu -> [| 0 ; 0; 0 |] | _ -> [| 1; 0; 0; 0 |];; + +let non_pipelined_resources_of_op (op : operation) (nargs : int) = + match op with + | Odiv | Odivu -> [| 29 |] + | Odivfs -> [| 20 |] + | Odivl | Odivlu | Odivf -> [| 50 |] + | _ -> [| -1 |];; -let resources_of_cond (cond : condition) (nargs : int) = [| 1; 0; 0; 0 |];; +let resources_of_cond (cond : condition) (nargs : int) = [| 1; 0; 0 |];; let latency_of_load trap chunk (addr : addressing) (nargs : int) = 3;; let latency_of_call _ _ = 6;; -let resources_of_load trap chunk addressing nargs = [| 1; 1; 0; 0 |];; +let resources_of_load trap chunk addressing nargs = [| 1; 1; 0 |];; -let resources_of_store chunk addressing nargs = [| 1; 1; 0; 0 |];; +let resources_of_store chunk addressing nargs = [| 1; 1; 0 |];; let resources_of_call _ _ = resource_bounds;; let resources_of_builtin _ = resource_bounds;; diff --git a/riscV/PrepassSchedulingOracle.ml b/riscV/PrepassSchedulingOracle.ml new file mode 120000 index 00000000..912e9ffa --- /dev/null +++ b/riscV/PrepassSchedulingOracle.ml @@ -0,0 +1 @@ +../aarch64/PrepassSchedulingOracle.ml \ No newline at end of file diff --git a/scheduling/PrepassSchedulingOracle.ml b/scheduling/PrepassSchedulingOracle.ml deleted file mode 100644 index 25083bcd..00000000 --- a/scheduling/PrepassSchedulingOracle.ml +++ /dev/null @@ -1,473 +0,0 @@ -open AST -open RTL -open Maps -open InstructionScheduler -open OpWeights -open Registers - -let use_alias_analysis () = false - -let length_of_chunk = function -| Mint8signed -| Mint8unsigned -> 1 -| Mint16signed -| Mint16unsigned -> 2 -| Mint32 -| Mfloat32 -| Many32 -> 4 -| Mint64 -| Mfloat64 -| Many64 -> 8;; - -let get_simple_dependencies (seqa : (instruction*Regset.t) array) = - let last_reg_reads : int list PTree.t ref = ref PTree.empty - and last_reg_write : (int*int) PTree.t ref = ref PTree.empty - and last_mem_reads : int list ref = ref [] - and last_mem_write : int option ref = ref None - and last_branch : int option ref = ref None - and last_non_pipelined_op : int array = Array.make - nr_non_pipelined_units ( -1 ) - and latency_constraints : latency_constraint list ref = ref [] in - let add_constraint instr_from instr_to latency = - assert (instr_from <= instr_to); - assert (latency >= 0); - if instr_from = instr_to - then (if latency = 0 - then () - else failwith "PrepassSchedulingOracle.get_dependencies: negative self-loop") - else - latency_constraints := - { instr_from = instr_from; - instr_to = instr_to; - latency = latency - }:: !latency_constraints - and get_last_reads reg = - match PTree.get reg !last_reg_reads - with Some l -> l - | None -> [] in - let add_input_mem i = - if not (use_alias_analysis ()) - then - begin - begin - (* Read after write *) - match !last_mem_write with - | None -> () - | Some j -> add_constraint j i 1 - end; - last_mem_reads := i :: !last_mem_reads - end - and add_output_mem i = - if not (use_alias_analysis ()) - then - begin - begin - (* Write after write *) - match !last_mem_write with - | None -> () - | Some j -> add_constraint j i 1 - end; - (* Write after read *) - List.iter (fun j -> add_constraint j i 0) !last_mem_reads; - last_mem_write := Some i; - last_mem_reads := [] - end - and add_input_reg i reg = - begin - (* Read after write *) - match PTree.get reg !last_reg_write with - | None -> () - | Some (j, latency) -> add_constraint j i latency - end; - last_reg_reads := PTree.set reg - (i :: get_last_reads reg) - !last_reg_reads - and add_output_reg i latency reg = - begin - (* Write after write *) - match PTree.get reg !last_reg_write with - | None -> () - | Some (j, _) -> add_constraint j i 1 - end; - begin - (* Write after read *) - List.iter (fun j -> add_constraint j i 0) (get_last_reads reg) - end; - last_reg_write := PTree.set reg (i, latency) !last_reg_write; - last_reg_reads := PTree.remove reg !last_reg_reads - in - let add_input_regs i regs = List.iter (add_input_reg i) regs in - let rec add_builtin_res i (res : reg builtin_res) = - match res with - | BR r -> add_output_reg i 10 r - | BR_none -> () - | BR_splitlong (hi, lo) -> add_builtin_res i hi; - add_builtin_res i lo in - let rec add_builtin_arg i (ba : reg builtin_arg) = - match ba with - | BA r -> add_input_reg i r - | BA_int _ | BA_long _ | BA_float _ | BA_single _ -> () - | BA_loadstack(_,_) -> add_input_mem i - | BA_addrstack _ -> () - | BA_loadglobal(_, _, _) -> add_input_mem i - | BA_addrglobal _ -> () - | BA_splitlong(hi, lo) -> add_builtin_arg i hi; - add_builtin_arg i lo - | BA_addptr(a1, a2) -> add_builtin_arg i a1; - add_builtin_arg i a2 in - let irreversible_action i = - match !last_branch with - | None -> () - | Some j -> add_constraint j i 1 in - let set_branch i = - irreversible_action i; - last_branch := Some i in - let add_non_pipelined_resources i resources = - Array.iter2 - (fun latency last -> - if latency >= 0 && last >= 0 then add_constraint last i latency) - resources last_non_pipelined_op; - Array.iteri (fun rsc latency -> - if latency >= 0 - then last_non_pipelined_op.(rsc) <- i) resources - in - Array.iteri - begin - fun i (insn, other_uses) -> - List.iter (fun use -> - add_input_reg i use) - (Regset.elements other_uses); - - match insn with - | Inop _ -> () - | Iop(op, inputs, output, _) -> - add_non_pipelined_resources i - (non_pipelined_resources_of_op op (List.length inputs)); - (if Op.is_trapping_op op then irreversible_action i); - add_input_regs i inputs; - add_output_reg i (latency_of_op op (List.length inputs)) output - | Iload(trap, chunk, addressing, addr_regs, output, _) -> - (if trap=TRAP then irreversible_action i); - add_input_mem i; - add_input_regs i addr_regs; - add_output_reg i (latency_of_load trap chunk addressing (List.length addr_regs)) output - | Istore(chunk, addressing, addr_regs, input, _) -> - irreversible_action i; - add_input_regs i addr_regs; - add_input_reg i input; - add_output_mem i - | Icall(signature, ef, inputs, output, _) -> - set_branch i; - (match ef with - | Datatypes.Coq_inl r -> add_input_reg i r - | Datatypes.Coq_inr symbol -> () - ); - add_input_mem i; - add_input_regs i inputs; - add_output_reg i (latency_of_call signature ef) output; - add_output_mem i; - failwith "Icall" - | Itailcall(signature, ef, inputs) -> - set_branch i; - (match ef with - | Datatypes.Coq_inl r -> add_input_reg i r - | Datatypes.Coq_inr symbol -> () - ); - add_input_mem i; - add_input_regs i inputs; - failwith "Itailcall" - | Ibuiltin(ef, builtin_inputs, builtin_output, _) -> - set_branch i; - add_input_mem i; - List.iter (add_builtin_arg i) builtin_inputs; - add_builtin_res i builtin_output; - add_output_mem i; - failwith "Ibuiltin" - | Icond(cond, inputs, _, _, _) -> - set_branch i; - add_input_mem i; - add_input_regs i inputs - | Ijumptable(input, _) -> - set_branch i; - add_input_reg i input; - failwith "Ijumptable" - | Ireturn(Some input) -> - set_branch i; - add_input_reg i input; - failwith "Ireturn" - | Ireturn(None) -> - set_branch i; - failwith "Ireturn none" - end seqa; - !latency_constraints;; - -let resources_of_instruction = function - | Inop _ -> Array.map (fun _ -> 0) resource_bounds - | Iop(op, inputs, output, _) -> resources_of_op op (List.length inputs) - | Iload(trap, chunk, addressing, addr_regs, output, _) -> - resources_of_load trap chunk addressing (List.length addr_regs) - | Istore(chunk, addressing, addr_regs, input, _) -> - resources_of_store chunk addressing (List.length addr_regs) - | Icall(signature, ef, inputs, output, _) -> - resources_of_call signature ef - | Ibuiltin(ef, builtin_inputs, builtin_output, _) -> - resources_of_builtin ef - | Icond(cond, args, _, _ , _) -> - resources_of_cond cond (List.length args) - | Itailcall _ | Ijumptable _ | Ireturn _ -> resource_bounds - -let print_sequence pp (seqa : instruction array) = - Array.iteri ( - fun i (insn : instruction) -> - PrintRTL.print_instruction pp (i, insn)) seqa;; - -type unique_id = int - -type 'a symbolic_term_node = - | STop of Op.operation * 'a list - | STinitial_reg of int - | STother of int;; - -type symbolic_term = { - hash_id : unique_id; - hash_ct : symbolic_term symbolic_term_node - };; - -let rec print_term channel term = - match term.hash_ct with - | STop(op, args) -> - PrintOp.print_operation print_term channel (op, args) - | STinitial_reg n -> Printf.fprintf channel "x%d" n - | STother n -> Printf.fprintf channel "y%d" n;; - -type symbolic_term_table = { - st_table : (unique_id symbolic_term_node, symbolic_term) Hashtbl.t; - mutable st_next_id : unique_id };; - -let hash_init () = { - st_table = Hashtbl.create 20; - st_next_id = 0 - };; - -let ground_to_id = function - | STop(op, l) -> STop(op, List.map (fun t -> t.hash_id) l) - | STinitial_reg r -> STinitial_reg r - | STother i -> STother i;; - -let hash_node (table : symbolic_term_table) (term : symbolic_term symbolic_term_node) : symbolic_term = - let grounded = ground_to_id term in - match Hashtbl.find_opt table.st_table grounded with - | Some x -> x - | None -> - let term' = { hash_id = table.st_next_id; - hash_ct = term } in - (if table.st_next_id = max_int then failwith "hash: max_int"); - table.st_next_id <- table.st_next_id + 1; - Hashtbl.add table.st_table grounded term'; - term';; - -type access = { - base : symbolic_term; - offset : int64; - length : int - };; - -let term_equal a b = (a.hash_id = b.hash_id);; - -let access_of_addressing get_reg chunk addressing args = - match addressing, args with - | (Op.Aindexed ofs), [reg] -> Some - { base = get_reg reg; - offset = Camlcoq.camlint64_of_ptrofs ofs; - length = length_of_chunk chunk - } - | _, _ -> None ;; -(* TODO: global *) - -let symbolic_execution (seqa : instruction array) = - let regs = ref PTree.empty - and table = hash_init() in - let assign reg term = regs := PTree.set reg term !regs - and hash term = hash_node table term in - let get_reg reg = - match PTree.get reg !regs with - | None -> hash (STinitial_reg (Camlcoq.P.to_int reg)) - | Some x -> x in - let targets = Array.make (Array.length seqa) None in - Array.iteri - begin - fun i insn -> - match insn with - | Iop(Op.Omove, [input], output, _) -> - assign output (get_reg input) - | Iop(op, inputs, output, _) -> - assign output (hash (STop(op, List.map get_reg inputs))) - - | Iload(trap, chunk, addressing, args, output, _) -> - let access = access_of_addressing get_reg chunk addressing args in - targets.(i) <- access; - assign output (hash (STother(i))) - - | Icall(_, _, _, output, _) - | Ibuiltin(_, _, BR output, _) -> - assign output (hash (STother(i))) - - | Istore(chunk, addressing, args, va, _) -> - let access = access_of_addressing get_reg chunk addressing args in - targets.(i) <- access - - | Inop _ -> () - | Ibuiltin(_, _, BR_none, _) -> () - | Ibuiltin(_, _, BR_splitlong _, _) -> failwith "BR_splitlong" - - | Itailcall (_, _, _) - |Icond (_, _, _, _, _) - |Ijumptable (_, _) - |Ireturn _ -> () - end seqa; - targets;; - -let print_access channel = function - | None -> Printf.fprintf channel "any" - | Some x -> Printf.fprintf channel "%a + %Ld" print_term x.base x.offset;; - -let print_targets channel seqa = - let targets = symbolic_execution seqa in - Array.iteri - (fun i insn -> - match insn with - | Iload _ -> Printf.fprintf channel "%d: load %a\n" - i print_access targets.(i) - | Istore _ -> Printf.fprintf channel "%d: store %a\n" - i print_access targets.(i) - | _ -> () - ) seqa;; - -let may_overlap a0 b0 = - match a0, b0 with - | (None, _) | (_ , None) -> true - | (Some a), (Some b) -> - if term_equal a.base b.base - then (max a.offset b.offset) < - (min (Int64.add (Int64.of_int a.length) a.offset) - (Int64.add (Int64.of_int b.length) b.offset)) - else match a.base.hash_ct, b.base.hash_ct with - | STop(Op.Oaddrsymbol(ida, ofsa),[]), - STop(Op.Oaddrsymbol(idb, ofsb),[]) -> - (ida=idb) && - let ao = Int64.add a.offset (Camlcoq.camlint64_of_ptrofs ofsa) - and bo = Int64.add b.offset (Camlcoq.camlint64_of_ptrofs ofsb) in - (max ao bo) < - (min (Int64.add (Int64.of_int a.length) ao) - (Int64.add (Int64.of_int b.length) bo)) - | STop(Op.Oaddrstack _, []), - STop(Op.Oaddrsymbol _, []) - | STop(Op.Oaddrsymbol _, []), - STop(Op.Oaddrstack _, []) -> false - | STop(Op.Oaddrstack(ofsa),[]), - STop(Op.Oaddrstack(ofsb),[]) -> - let ao = Int64.add a.offset (Camlcoq.camlint64_of_ptrofs ofsa) - and bo = Int64.add b.offset (Camlcoq.camlint64_of_ptrofs ofsb) in - (max ao bo) < - (min (Int64.add (Int64.of_int a.length) ao) - (Int64.add (Int64.of_int b.length) bo)) - | _ -> true;; - -(* -(* TODO suboptimal quadratic algorithm *) -let get_alias_dependencies seqa = - let targets = symbolic_execution seqa - and deps = ref [] in - let add_constraint instr_from instr_to latency = - deps := { instr_from = instr_from; - instr_to = instr_to; - latency = latency - }:: !deps in - for i=0 to (Array.length seqa)-1 - do - for j=0 to i-1 - do - match seqa.(j), seqa.(i) with - | (Istore _), ((Iload _) | (Istore _)) -> - if may_overlap targets.(j) targets.(i) - then add_constraint j i 1 - | (Iload _), (Istore _) -> - if may_overlap targets.(j) targets.(i) - then add_constraint j i 0 - | (Istore _ | Iload _), (Icall _ | Ibuiltin _) - | (Icall _ | Ibuiltin _), (Icall _ | Ibuiltin _ | Iload _ | Istore _) -> - add_constraint j i 1 - | (Inop _ | Iop _), _ - | _, (Inop _ | Iop _) - | (Iload _), (Iload _) -> () - done - done; - !deps;; - *) - -let define_problem seqa = - let simple_deps = get_simple_dependencies seqa in - { max_latency = -1; - resource_bounds = OpWeights.resource_bounds; - instruction_usages = Array.map resources_of_instruction (Array.map fst seqa); - latency_constraints = - (* if (use_alias_analysis ()) - then (get_alias_dependencies seqa) @ simple_deps - else *) simple_deps };; - -let zigzag_scheduler problem early_ones = - let nr_instructions = get_nr_instructions problem in - assert(nr_instructions = (Array.length early_ones)); - match list_scheduler problem with - | Some fwd_schedule -> - let fwd_makespan = fwd_schedule.((Array.length fwd_schedule) - 1) in - let constraints' = ref problem.latency_constraints in - Array.iteri (fun i is_early -> - if is_early then - constraints' := { - instr_from = i; - instr_to = nr_instructions ; - latency = fwd_makespan - fwd_schedule.(i) } ::!constraints' ) - early_ones; - validated_scheduler reverse_list_scheduler - { problem with latency_constraints = !constraints' } - | None -> None;; - -let prepass_scheduler_by_name name problem early_ones = - match name with - | "zigzag" -> zigzag_scheduler problem early_ones - | _ -> scheduler_by_name name problem - -let schedule_sequence (seqa : (instruction*Regset.t) array) = - try - if (Array.length seqa) <= 1 - then None - else - begin - let nr_instructions = Array.length seqa in - Printf.printf "prepass scheduling length = %d\n" (Array.length seqa); - let problem = define_problem seqa in - print_sequence stdout (Array.map fst seqa); - print_problem stdout problem; - match prepass_scheduler_by_name - (!Clflags.option_fprepass_sched) - problem - (Array.map (fun (ins, _) -> - match ins with - | Icond _ -> true - | _ -> false) seqa) with - | None -> Printf.printf "no solution in prepass scheduling\n"; - None - | Some solution -> - let positions = Array.init nr_instructions (fun i -> i) in - Array.sort (fun i j -> - let si = solution.(i) and sj = solution.(j) in - if si < sj then -1 - else if si > sj then 1 - else i - j) positions; - Some positions - end - with (Failure s) -> - Printf.printf "failure in prepass scheduling: %s\n" s; - None;; - diff --git a/x86/PrepassSchedulingOracle.ml b/x86/PrepassSchedulingOracle.ml new file mode 100644 index 00000000..7b6a1b14 --- /dev/null +++ b/x86/PrepassSchedulingOracle.ml @@ -0,0 +1,5 @@ +open RTL +open Registers + +(* Do not do anything *) +let schedule_sequence (seqa : (instruction*Regset.t) array) = None -- cgit