From 51668ba258e7b79a1b2b129a404b1eb9981e8e3b Mon Sep 17 00:00:00 2001 From: David Monniaux Date: Fri, 16 Jul 2021 18:01:59 +0200 Subject: Make prepass scheduling sensitive to register pressure, by Nicolas Nardino. Squashed commit of the following: commit cf033ec29391d5358dea1d3b25da1738957478c4 Author: David Monniaux Date: Fri Jul 16 18:01:03 2021 +0200 comment for authors commit 2ff766a18432fd75739abab0b5741ded6b67a2a5 Author: David Monniaux Date: Fri Jul 16 15:29:25 2021 +0200 activate register pressure by default commit 67f4ae2b702cc95ed7cef67b726e15abbf18e768 Author: David Monniaux Date: Fri Jul 16 15:26:03 2021 +0200 use a more recognizable option name commit 6121be54b80a55fdadd8b64dfad53357148c9090 Author: David Monniaux Date: Fri Jul 16 14:13:50 2021 +0200 fix for KVX commit 43d4932e8ba9e00eb8c8788c86f56b6bddd46392 Author: David Monniaux Date: Fri Jul 16 13:28:26 2021 +0200 setup registers commit 169a221104c37737f12abe79711009fc0d88ce09 Author: David Monniaux Date: Fri Jul 16 13:00:56 2021 +0200 rm useless code commit d6a846b641787ea6a5ed113b1d7275ffb5028d9c Author: David Monniaux Date: Fri Jul 16 12:54:19 2021 +0200 rm "Admitted" commit fd4d085aa988a6044f89fc17e8422be23bc87f9d Merge: 70f5867e 56498b64 Author: David Monniaux Date: Fri Jul 16 12:30:25 2021 +0200 Merge remote-tracking branch 'origin/kvx-work' into kvx-sched-w-reg-press commit 70f5867e441e253869cb3b432af77636a186d1cb Author: David Monniaux Date: Fri Jul 16 12:26:27 2021 +0200 rm TODO commit f86f5df47b69053702661671340b0fcb31506aa3 Author: nicolas.nardino Date: Thu Jul 8 11:22:17 2021 +0200 add more debug info commit a4a0b36f56a94c19da301265a4e3acad1fbdf6c4 Author: nicolas.nardino Date: Thu Jul 8 11:20:49 2021 +0200 Deactivate sched validator (i think) commit af97fca0f1d824f3becf9c6895f44ad234e262f8 Author: nicolas.nardino Date: Tue Jul 6 15:32:35 2021 +0200 Add debug info commit b96a48de58e1969535865b7b345514a24f7178a6 Author: nicolas.nardino Date: Mon Jun 28 16:04:44 2021 +0200 Change temporary solution (see prev commits), and add option for it commit 9ac49c465f9c8969fba00e6242da0c188a6a3080 Author: nicolas.nardino Date: Fri Jun 25 09:42:41 2021 +0200 Changed printfs into debugs commit dfa09586ae40c70769eeda688a0e7f59f611749f Author: nicolas.nardino Date: Thu Jun 24 18:33:20 2021 +0200 Another scheduler commit c5e8595480604c78260017cc771b0e4195fdd182 Merge: 10cbe4b2 cf2aa686 Author: nicolas.nardino Date: Tue Jun 22 15:58:10 2021 +0200 Merge branch 'kvx-sched-w-reg-press' of gricad-gitlab.univ-grenoble-alpes.fr:sixcy/CompCert into kvx-sched-w-reg-press commit 10cbe4b28ef6dc5d02c9a5d4d369484e4943a18d Author: nicolas.nardino Date: Tue Jun 22 15:57:21 2021 +0200 Changed default threshold value following tests commit cf2aa686bcf9a823562fe977df6dd778d5467985 Merge: eddbce33 fe557bf6 Author: David Monniaux Date: Thu Jun 17 17:05:30 2021 +0200 Merge branch 'kvx-sched-w-reg-press' of gricad-gitlab.univ-grenoble-alpes.fr:sixcy/CompCert into kvx-sched-w-reg-press commit eddbce33e28c49bf7b9e83ebd5dbf6cb0d770090 Merge: 8f399dfa fae8d9b5 Author: David Monniaux Date: Thu Jun 17 17:05:20 2021 +0200 Merge branch 'kvx-sched-w-reg-press' of gricad-gitlab.univ-grenoble-alpes.fr:sixcy/CompCert into kvx-sched-w-reg-press commit 8f399dfa9d794f2f728f523ff1aa7788cc3599b2 Author: David Monniaux Date: Thu Jun 17 17:04:52 2021 +0200 fix for Risc-V commit fe557bf65ec738eaa078bc5e398ff690eb1f2b9e Author: nicolas.nardino Date: Thu Jun 17 17:03:53 2021 +0200 changed type of schedule_seq in x86 for compatibility commit fae8d9b5c5f93d5eda36f800eb0ca1837b237cba Author: nicolas.nardino Date: Thu Jun 17 17:00:57 2021 +0200 fix riscv/Machregsaux.mli commit 9759e94256fd09f4995418b67b7aedbcf84b4b10 Merge: 4413c27d 04b2489d Author: David Monniaux Date: Thu Jun 17 16:52:09 2021 +0200 Merge remote-tracking branch 'origin/kvx-work' into kvx-sched-w-reg-press commit 4413c27d6c6a3d69df34955d9d453c38b32174c7 Author: nicolas.nardino Date: Thu Jun 17 15:38:13 2021 +0200 Add option to set thresold and support for riscv commit 21278bd87e89210bcc287116f6e35fc1b52d0df2 Author: nicolas.nardino Date: Wed Jun 16 20:27:31 2021 +0200 Now working, tests show a decrease in spillage Should still find a proper way to treat the case mentioned in earlier commits commit 87c82b6fcf2bf825a8c60fc6a95498aac9f826d4 Author: nicolas.nardino Date: Tue Jun 15 14:44:56 2021 +0200 kinda fixed Spills are definitely reduced, but lots of arbitrary in there: See previous commit: need to determine what to do if pressure is too high but no schedulable instruction can reduce it. For now, advance time for at most 5 cycles, if still no suitable instruction, go back to CSP commit 19464b3992eadf7670acc7231896103ab54885e5 Author: nicolas.nardino Date: Tue Jun 15 12:07:43 2021 +0200 fixing Still need to find what to do when pressure is high but there are no instructions available that decrease it commit bff4e6ff0b782619b6fcc18751fa575cbb11de68 Author: nicolas.nardino Date: Mon Jun 14 17:39:58 2021 +0200 was very wrong, fixing commit 3eb3751f84348a20b7ce211fdbf1d01a9c4685a8 Author: nicolas.nardino Date: Mon Jun 14 14:46:01 2021 +0200 One fewer spill with new sched on `test/.../spille_forw.c` commit 66e15205c40de54639387a4c9b1cc78994525d55 Author: nicolas.nardino Date: Mon Jun 14 13:53:08 2021 +0200 scheduler written, need to test now commit 2b814b1f9bb30d9c8b59a713f69bced808bca7c7 Author: nicolas.nardino Date: Sat Jun 12 10:52:59 2021 +0200 work on the scheduler commit 1701e43316ee8e69e794a025a8c9979af6bb8c93 Author: nicolas.nardino Date: Thu Jun 10 16:31:51 2021 +0200 Work on new schedluer Renamed a test file, wrote function to compute pressure deltas, Still need to pass the info in some way; beginning of the actual scheduler function commit 386b9053177bb4ef2801cec00b717c400a828139 Author: nicolas.nardino Date: Tue Jun 8 16:53:19 2021 +0200 Fix RTLpathScheduleraux.get_live_regs_entry commit 9b6247b7996f3e0181d27ec0e20daffd28e0884f Author: nicolas.nardino Date: Tue Jun 8 16:06:36 2021 +0200 Another test : one spill when scheduled forward, none if not commit 52378f0600652a94edcc8c78e4b426243f717a89 Author: nicolas.nardino Date: Tue Jun 8 15:11:03 2021 +0200 Add some tests commit 2249f3c7771c285ccd25f6e94478be388a741da5 Author: nicolas.nardino Date: Sun Jun 6 20:49:34 2021 +0200 Adding debug info commit 9118878bd14e24cc04c2f36cab7aa7271a0f1852 Author: nicolas.nardino Date: Sun Jun 6 12:11:15 2021 +0200 Fixing scope error, and non-exhaustive pattern matching commit 599823a6410f1629f2b8704291839e0974bce83b Author: nicolas.nardino Date: Sat Jun 5 19:52:59 2021 +0200 function written, now needs testing commit 98a7a04258f2cf6caf9f18925cbeeae2f5b17be4 Author: nicolas.nardino Date: Fri Jun 4 16:56:32 2021 +0200 computing live regs at sb entry from its live output regs commit 7ae1fb0faea68ce5cfe04a232e49659247c244e9 Author: nicolas.nardino Date: Fri Jun 4 14:24:07 2021 +0200 Passing info of live regs to scheduler: beginning --- aarch64/Machregsaux.ml | 3 + aarch64/Machregsaux.mli | 3 + aarch64/PostpassSchedulingOracle.ml | 3 + aarch64/PrepassSchedulingOracle.ml | 29 +- arm/ExpansionOracle.ml | 18 +- arm/PrepassSchedulingOracle.ml | 7 +- common/DebugPrint.ml | 4 +- driver/Clflags.ml | 4 +- driver/Driver.ml | 14 +- kvx/ExpansionOracle.ml | 18 +- kvx/Machregsaux.ml | 2 + kvx/Machregsaux.mli | 3 + kvx/PostpassSchedulingOracle.ml | 10 +- kvx/PrepassSchedulingOracle.ml | 486 ++++++++++++++++++++++++++++++- kvx/PrepassSchedulingOracleDeps.ml | 18 +- powerpc/ExpansionOracle.ml | 18 +- powerpc/PrepassSchedulingOracle.ml | 7 +- riscV/Machregsaux.ml | 2 + riscV/Machregsaux.mli | 3 + riscV/PrepassSchedulingOracle.ml | 486 ++++++++++++++++++++++++++++++- riscV/PrepassSchedulingOracleDeps.ml | 18 +- scheduling/InstructionScheduler.ml | 503 ++++++++++++++++++++++++++++++++- scheduling/InstructionScheduler.mli | 16 ++ scheduling/RTLpathScheduleraux.ml | 187 +++++++++++- test/nardino/scheduling/entry_regs.c | 19 ++ test/nardino/scheduling/spille_backw.c | 114 ++++++++ test/nardino/scheduling/spille_forw.c | 166 +++++++++++ x86/ExpansionOracle.ml | 18 +- x86/PrepassSchedulingOracle.ml | 3 +- 29 files changed, 2146 insertions(+), 36 deletions(-) mode change 120000 => 100644 arm/ExpansionOracle.ml mode change 120000 => 100644 arm/PrepassSchedulingOracle.ml mode change 120000 => 100644 kvx/ExpansionOracle.ml mode change 120000 => 100644 kvx/PrepassSchedulingOracle.ml mode change 120000 => 100644 kvx/PrepassSchedulingOracleDeps.ml mode change 120000 => 100644 powerpc/ExpansionOracle.ml mode change 120000 => 100644 powerpc/PrepassSchedulingOracle.ml mode change 120000 => 100644 riscV/PrepassSchedulingOracle.ml mode change 120000 => 100644 riscV/PrepassSchedulingOracleDeps.ml create mode 100644 test/nardino/scheduling/entry_regs.c create mode 100644 test/nardino/scheduling/spille_backw.c create mode 100644 test/nardino/scheduling/spille_forw.c mode change 120000 => 100644 x86/ExpansionOracle.ml diff --git a/aarch64/Machregsaux.ml b/aarch64/Machregsaux.ml index 41db3bd4..98e461eb 100644 --- a/aarch64/Machregsaux.ml +++ b/aarch64/Machregsaux.ml @@ -19,3 +19,6 @@ let class_of_type = function | AST.Tint | AST.Tlong -> 0 | AST.Tfloat | AST.Tsingle -> 1 | AST.Tany32 | AST.Tany64 -> assert false + +(* number of available registers per class *) +let nr_regs = [| 29; 32 |] diff --git a/aarch64/Machregsaux.mli b/aarch64/Machregsaux.mli index 01b0f9fd..23ac1c9a 100644 --- a/aarch64/Machregsaux.mli +++ b/aarch64/Machregsaux.mli @@ -15,3 +15,6 @@ val is_scratch_register: string -> bool val class_of_type: AST.typ -> int + +(* Number of registers in each class *) +val nr_regs : int array diff --git a/aarch64/PostpassSchedulingOracle.ml b/aarch64/PostpassSchedulingOracle.ml index cde3e7a7..6f784238 100644 --- a/aarch64/PostpassSchedulingOracle.ml +++ b/aarch64/PostpassSchedulingOracle.ml @@ -507,6 +507,9 @@ let build_problem bb = { max_latency = -1; resource_bounds = opweights.pipelined_resource_bounds; + live_regs_entry = Registers.Regset.empty; (* unused here *) + typing = (fun x -> AST.Tint); (* unused here *) + reference_counting = None; instruction_usages = instruction_usages bb; latency_constraints = latency_constraints bb; } diff --git a/aarch64/PrepassSchedulingOracle.ml b/aarch64/PrepassSchedulingOracle.ml index 2c3eb14f..e09eea13 100644 --- a/aarch64/PrepassSchedulingOracle.ml +++ b/aarch64/PrepassSchedulingOracle.ml @@ -1,3 +1,16 @@ +(* *************************************************************) +(* *) +(* The Compcert verified compiler *) +(* *) +(* Sylvain Boulmé Grenoble-INP, VERIMAG *) +(* David Monniaux CNRS, VERIMAG *) +(* Cyril Six Kalray *) +(* Léo Gourdin UGA, VERIMAG *) +(* Nicolas Nardino ENS-Lyon, VERIMAG *) +(* *) +(* *) +(* *************************************************************) + open AST open RTL open Maps @@ -406,11 +419,15 @@ let get_alias_dependencies seqa = !deps;; *) -let define_problem (opweights : opweights) seqa = +let define_problem (opweights : opweights) (live_entry_regs : Regset.t) + (typing : RTLtyping.regenv) reference_counting seqa = let simple_deps = get_simple_dependencies opweights seqa in { max_latency = -1; resource_bounds = opweights.pipelined_resource_bounds; - instruction_usages = Array.map (resources_of_instruction opweights) (Array.map fst seqa); + live_regs_entry = live_entry_regs; + typing = typing; + reference_counting = Some reference_counting; + instruction_usages = Array.map (resources_of_instruction opweights) (Array.map fst seqa); latency_constraints = (* if (use_alias_analysis ()) then (get_alias_dependencies seqa) @ simple_deps @@ -439,7 +456,10 @@ let prepass_scheduler_by_name name problem early_ones = | "zigzag" -> zigzag_scheduler problem early_ones | _ -> scheduler_by_name name problem -let schedule_sequence (seqa : (instruction*Regset.t) array) = +let schedule_sequence (seqa : (instruction*Regset.t) array) + (live_regs_entry : Registers.Regset.t) + (typing : RTLtyping.regenv) + reference = let opweights = OpWeights.get_opweights () in try if (Array.length seqa) <= 1 @@ -449,7 +469,8 @@ let schedule_sequence (seqa : (instruction*Regset.t) array) = let nr_instructions = Array.length seqa in (if !Clflags.option_debug_compcert > 6 then Printf.printf "prepass scheduling length = %d\n" (Array.length seqa)); - let problem = define_problem opweights seqa in + let problem = define_problem opweights live_regs_entry + typing reference seqa in (if !Clflags.option_debug_compcert > 7 then (print_sequence stdout (Array.map fst seqa); print_problem stdout problem)); diff --git a/arm/ExpansionOracle.ml b/arm/ExpansionOracle.ml deleted file mode 120000 index ee2674bf..00000000 --- a/arm/ExpansionOracle.ml +++ /dev/null @@ -1 +0,0 @@ -../aarch64/ExpansionOracle.ml \ No newline at end of file diff --git a/arm/ExpansionOracle.ml b/arm/ExpansionOracle.ml new file mode 100644 index 00000000..3b63b80d --- /dev/null +++ b/arm/ExpansionOracle.ml @@ -0,0 +1,17 @@ +(* *************************************************************) +(* *) +(* The Compcert verified compiler *) +(* *) +(* Léo Gourdin UGA, VERIMAG *) +(* *) +(* Copyright VERIMAG. All rights reserved. *) +(* This file is distributed under the terms of the INRIA *) +(* Non-Commercial License Agreement. *) +(* *) +(* *************************************************************) + +open RTLpathCommon + +let expanse (sb : superblock) code pm = (code, pm) + +let find_last_node_reg c = () diff --git a/arm/PrepassSchedulingOracle.ml b/arm/PrepassSchedulingOracle.ml deleted file mode 120000 index 9885fd52..00000000 --- a/arm/PrepassSchedulingOracle.ml +++ /dev/null @@ -1 +0,0 @@ -../x86/PrepassSchedulingOracle.ml \ No newline at end of file diff --git a/arm/PrepassSchedulingOracle.ml b/arm/PrepassSchedulingOracle.ml new file mode 100644 index 00000000..42a3da23 --- /dev/null +++ b/arm/PrepassSchedulingOracle.ml @@ -0,0 +1,6 @@ +open RTL +open Registers + +(* Do not do anything *) +let schedule_sequence (seqa : (instruction*Regset.t) array) + live_regs_entry typing reference = None diff --git a/common/DebugPrint.ml b/common/DebugPrint.ml index 6f8449ee..275e6a71 100644 --- a/common/DebugPrint.ml +++ b/common/DebugPrint.ml @@ -132,10 +132,10 @@ let print_instructions insts code = | None -> failwith "Did not get some" | Some thing -> thing in if (!debug_flag) then begin - debug "[ "; + debug "[\n"; List.iter ( fun n -> (PrintRTL.print_instruction stdout (P.to_int n, get_some @@ PTree.get n code)) - ) insts; debug "]" + ) insts; debug " ]" end let print_arrayp arr = begin diff --git a/driver/Clflags.ml b/driver/Clflags.ml index fa17c2d9..25bd2c78 100644 --- a/driver/Clflags.ml +++ b/driver/Clflags.ml @@ -51,7 +51,7 @@ let option_flooprotate = ref 0 (* rotate the innermost loops to have the conditi let option_mtune = ref "" let option_fprepass = ref true -let option_fprepass_sched = ref "list" +let option_fprepass_sched = ref "regpres" let option_fpostpass = ref true let option_fpostpass_sched = ref "list" @@ -115,4 +115,6 @@ let option_inline_auto_threshold = ref 0 let option_profile_arcs = ref false let option_fbranch_probabilities = ref true let option_debug_compcert = ref 0 +let option_regpres_threshold = ref 2 +let option_regpres_wait_window = ref false let main_function_name = ref "main" diff --git a/driver/Driver.ml b/driver/Driver.ml index 7192ba4b..3f5a4bd9 100644 --- a/driver/Driver.ml +++ b/driver/Driver.ml @@ -210,7 +210,9 @@ Processing options: -mtune= Type of CPU (for scheduling on some architectures) -fprepass Perform prepass scheduling (only on some architectures) [on] -fprepass= Perform postpass scheduling with the specified optimization [list] - (=list: list scheduling, =revlist: reverse list scheduling, =zigzag: zigzag scheduling, =ilp: ILP, =greedy: just packing bundles) + (=list: list scheduling, =revlist: reverse list scheduling, =regpres: list scheduling aware of register pressure, =regpres_bis: variant of regpres, =zigzag: zigzag scheduling, =ilp: ILP, =greedy: just packing bundles) + -regpres-threshold n With `-fprepass= regpres`, set threshold value for number of free registers before trying to decrease register pressure + -fregpres-wait-window When register pressure is high, use a 5-cycle waiting window instead of scheduling short paths first (default no) -fpostpass Perform postpass scheduling (only for K1 architecture) [on] -fpostpass= Perform postpass scheduling with the specified optimization [list] (=list: list scheduling, =ilp: ILP, =greedy: just packing bundles) @@ -296,9 +298,9 @@ let num_input_files = ref 0 let cmdline_actions = let f_opt name ref = [Exact("-f" ^ name), Set ref; Exact("-fno-" ^ name), Unset ref] in - let f_opt_str name ref strref = + let f_opt_str name default ref strref = [Exact("-f" ^ name ^ "="), String - (fun s -> (strref := (if s == "" then "list" else s)); ref := true) + (fun s -> (strref := (if s == "" then default else s)); ref := true) ] in let f_str name strref default = [Exact("-f" ^ name ^ "="), String @@ -342,6 +344,7 @@ let cmdline_actions = Exact "-fprofile-use=", String (fun s -> Profilingaux.load_profiling_info s); Exact "-finline-auto-threshold", Integer (fun n -> option_inline_auto_threshold := n); Exact "-debug-compcert", Integer (fun n -> option_debug_compcert := n); + Exact "-regpres-threshold", Integer (fun n -> option_regpres_threshold := n); Exact "-fsmall-data", Integer(fun n -> option_small_data := n); Exact "-fsmall-const", Integer(fun n -> option_small_const := n); Exact "-ffloat-const-prop", Integer(fun n -> option_ffloatconstprop := n); @@ -424,6 +427,7 @@ let cmdline_actions = @ f_opt "redundancy" option_fredundancy @ [ Exact "-mtune", String (fun s -> option_mtune := s) ] @ f_opt "prepass" option_fprepass + @ f_opt "regpres-wait-window" option_regpres_wait_window @ f_opt "postpass" option_fpostpass @ [ Exact "-ftailduplicate", Integer (fun n -> option_ftailduplicate := n) ] @ f_opt "predict" option_fpredict @@ -431,8 +435,8 @@ let cmdline_actions = @ [ Exact "-funrollbody", Integer (fun n -> option_funrollbody := n) ] @ [ Exact "-flooprotate", Integer (fun n -> option_flooprotate := n) ] @ f_opt "tracelinearize" option_ftracelinearize - @ f_opt_str "prepass" option_fprepass option_fprepass_sched - @ f_opt_str "postpass" option_fpostpass option_fpostpass_sched + @ f_opt_str "prepass" "regpress" option_fprepass option_fprepass_sched + @ f_opt_str "postpass" "list" option_fpostpass option_fpostpass_sched @ f_opt "inline" option_finline @ f_opt "inline-functions-called-once" option_finline_functions_called_once @ f_opt "globaladdrtmp" option_fglobaladdrtmp diff --git a/kvx/ExpansionOracle.ml b/kvx/ExpansionOracle.ml deleted file mode 120000 index ee2674bf..00000000 --- a/kvx/ExpansionOracle.ml +++ /dev/null @@ -1 +0,0 @@ -../aarch64/ExpansionOracle.ml \ No newline at end of file diff --git a/kvx/ExpansionOracle.ml b/kvx/ExpansionOracle.ml new file mode 100644 index 00000000..3b63b80d --- /dev/null +++ b/kvx/ExpansionOracle.ml @@ -0,0 +1,17 @@ +(* *************************************************************) +(* *) +(* The Compcert verified compiler *) +(* *) +(* Léo Gourdin UGA, VERIMAG *) +(* *) +(* Copyright VERIMAG. All rights reserved. *) +(* This file is distributed under the terms of the INRIA *) +(* Non-Commercial License Agreement. *) +(* *) +(* *************************************************************) + +open RTLpathCommon + +let expanse (sb : superblock) code pm = (code, pm) + +let find_last_node_reg c = () diff --git a/kvx/Machregsaux.ml b/kvx/Machregsaux.ml index e3b18181..dbb89727 100644 --- a/kvx/Machregsaux.ml +++ b/kvx/Machregsaux.ml @@ -31,3 +31,5 @@ let class_of_type = function | AST.Tint | AST.Tlong | AST.Tfloat | AST.Tsingle -> 0 | AST.Tany32 | AST.Tany64 -> assert false + +let nr_regs = [| 59 |] diff --git a/kvx/Machregsaux.mli b/kvx/Machregsaux.mli index 01b0f9fd..23ac1c9a 100644 --- a/kvx/Machregsaux.mli +++ b/kvx/Machregsaux.mli @@ -15,3 +15,6 @@ val is_scratch_register: string -> bool val class_of_type: AST.typ -> int + +(* Number of registers in each class *) +val nr_regs : int array diff --git a/kvx/PostpassSchedulingOracle.ml b/kvx/PostpassSchedulingOracle.ml index 2107ce22..5ebad421 100644 --- a/kvx/PostpassSchedulingOracle.ml +++ b/kvx/PostpassSchedulingOracle.ml @@ -787,8 +787,14 @@ let latency_constraints bb = *) let build_problem bb = - { max_latency = -1; resource_bounds = resource_bounds; - instruction_usages = instruction_usages bb; latency_constraints = latency_constraints bb } +{ max_latency = -1; + resource_bounds = resource_bounds; + instruction_usages = instruction_usages bb; + latency_constraints = latency_constraints bb; + live_regs_entry = Registers.Regset.empty; (* unused here *) + typing = (fun x -> AST.Tint); (* unused here *) + reference_counting = None +} let rec find_min_opt (l: int option list) = match l with diff --git a/kvx/PrepassSchedulingOracle.ml b/kvx/PrepassSchedulingOracle.ml deleted file mode 120000 index 912e9ffa..00000000 --- a/kvx/PrepassSchedulingOracle.ml +++ /dev/null @@ -1 +0,0 @@ -../aarch64/PrepassSchedulingOracle.ml \ No newline at end of file diff --git a/kvx/PrepassSchedulingOracle.ml b/kvx/PrepassSchedulingOracle.ml new file mode 100644 index 00000000..53a81095 --- /dev/null +++ b/kvx/PrepassSchedulingOracle.ml @@ -0,0 +1,485 @@ +open AST +open RTL +open Maps +open InstructionScheduler +open Registers +open PrepassSchedulingOracleDeps + +let use_alias_analysis () = false + +let length_of_chunk = function +| Mint8signed +| Mint8unsigned -> 1 +| Mint16signed +| Mint16unsigned -> 2 +| Mint32 +| Mfloat32 +| Many32 -> 4 +| Mint64 +| Mfloat64 +| Many64 -> 8;; + +let get_simple_dependencies (opweights : opweights) (seqa : (instruction*Regset.t) array) = + let last_reg_reads : int list PTree.t ref = ref PTree.empty + and last_reg_write : (int*int) PTree.t ref = ref PTree.empty + and last_mem_reads : int list ref = ref [] + and last_mem_write : int option ref = ref None + and last_branch : int option ref = ref None + and last_non_pipelined_op : int array = Array.make + opweights.nr_non_pipelined_units ( -1 ) + and latency_constraints : latency_constraint list ref = ref [] in + let add_constraint instr_from instr_to latency = + assert (instr_from <= instr_to); + assert (latency >= 0); + if instr_from = instr_to + then (if latency = 0 + then () + else failwith "PrepassSchedulingOracle.get_dependencies: negative self-loop") + else + latency_constraints := + { instr_from = instr_from; + instr_to = instr_to; + latency = latency + }:: !latency_constraints + and get_last_reads reg = + match PTree.get reg !last_reg_reads + with Some l -> l + | None -> [] in + let add_input_mem i = + if not (use_alias_analysis ()) + then + begin + begin + (* Read after write *) + match !last_mem_write with + | None -> () + | Some j -> add_constraint j i 1 + end; + last_mem_reads := i :: !last_mem_reads + end + and add_output_mem i = + if not (use_alias_analysis ()) + then + begin + begin + (* Write after write *) + match !last_mem_write with + | None -> () + | Some j -> add_constraint j i 1 + end; + (* Write after read *) + List.iter (fun j -> add_constraint j i 0) !last_mem_reads; + last_mem_write := Some i; + last_mem_reads := [] + end + and add_input_reg i reg = + begin + (* Read after write *) + match PTree.get reg !last_reg_write with + | None -> () + | Some (j, latency) -> add_constraint j i latency + end; + last_reg_reads := PTree.set reg + (i :: get_last_reads reg) + !last_reg_reads + and add_output_reg i latency reg = + begin + (* Write after write *) + match PTree.get reg !last_reg_write with + | None -> () + | Some (j, _) -> add_constraint j i 1 + end; + begin + (* Write after read *) + List.iter (fun j -> add_constraint j i 0) (get_last_reads reg) + end; + last_reg_write := PTree.set reg (i, latency) !last_reg_write; + last_reg_reads := PTree.remove reg !last_reg_reads + in + let add_input_regs i regs = List.iter (add_input_reg i) regs in + let rec add_builtin_res i (res : reg builtin_res) = + match res with + | BR r -> add_output_reg i 10 r + | BR_none -> () + | BR_splitlong (hi, lo) -> add_builtin_res i hi; + add_builtin_res i lo in + let rec add_builtin_arg i (ba : reg builtin_arg) = + match ba with + | BA r -> add_input_reg i r + | BA_int _ | BA_long _ | BA_float _ | BA_single _ -> () + | BA_loadstack(_,_) -> add_input_mem i + | BA_addrstack _ -> () + | BA_loadglobal(_, _, _) -> add_input_mem i + | BA_addrglobal _ -> () + | BA_splitlong(hi, lo) -> add_builtin_arg i hi; + add_builtin_arg i lo + | BA_addptr(a1, a2) -> add_builtin_arg i a1; + add_builtin_arg i a2 in + let irreversible_action i = + match !last_branch with + | None -> () + | Some j -> add_constraint j i 1 in + let set_branch i = + irreversible_action i; + last_branch := Some i in + let add_non_pipelined_resources i resources = + Array.iter2 + (fun latency last -> + if latency >= 0 && last >= 0 then add_constraint last i latency) + resources last_non_pipelined_op; + Array.iteri (fun rsc latency -> + if latency >= 0 + then last_non_pipelined_op.(rsc) <- i) resources + in + Array.iteri + begin + fun i (insn, other_uses) -> + List.iter (fun use -> + add_input_reg i use) + (Regset.elements other_uses); + + match insn with + | Inop _ -> () + | Iop(op, inputs, output, _) -> + add_non_pipelined_resources i + (opweights.non_pipelined_resources_of_op op (List.length inputs)); + (if Op.is_trapping_op op then irreversible_action i); + add_input_regs i inputs; + add_output_reg i (opweights.latency_of_op op (List.length inputs)) output + | Iload(trap, chunk, addressing, addr_regs, output, _) -> + (if trap=TRAP then irreversible_action i); + add_input_mem i; + add_input_regs i addr_regs; + add_output_reg i (opweights.latency_of_load trap chunk addressing (List.length addr_regs)) output + | Istore(chunk, addressing, addr_regs, input, _) -> + irreversible_action i; + add_input_regs i addr_regs; + add_input_reg i input; + add_output_mem i + | Icall(signature, ef, inputs, output, _) -> + set_branch i; + (match ef with + | Datatypes.Coq_inl r -> add_input_reg i r + | Datatypes.Coq_inr symbol -> () + ); + add_input_mem i; + add_input_regs i inputs; + add_output_reg i (opweights.latency_of_call signature ef) output; + add_output_mem i; + failwith "Icall" + | Itailcall(signature, ef, inputs) -> + set_branch i; + (match ef with + | Datatypes.Coq_inl r -> add_input_reg i r + | Datatypes.Coq_inr symbol -> () + ); + add_input_mem i; + add_input_regs i inputs; + failwith "Itailcall" + | Ibuiltin(ef, builtin_inputs, builtin_output, _) -> + set_branch i; + add_input_mem i; + List.iter (add_builtin_arg i) builtin_inputs; + add_builtin_res i builtin_output; + add_output_mem i; + failwith "Ibuiltin" + | Icond(cond, inputs, _, _, _) -> + set_branch i; + add_input_mem i; + add_input_regs i inputs + | Ijumptable(input, _) -> + set_branch i; + add_input_reg i input; + failwith "Ijumptable" + | Ireturn(Some input) -> + set_branch i; + add_input_reg i input; + failwith "Ireturn" + | Ireturn(None) -> + set_branch i; + failwith "Ireturn none" + end seqa; + !latency_constraints;; + +let resources_of_instruction (opweights : opweights) = function + | Inop _ -> Array.map (fun _ -> 0) opweights.pipelined_resource_bounds + | Iop(op, inputs, output, _) -> + opweights.resources_of_op op (List.length inputs) + | Iload(trap, chunk, addressing, addr_regs, output, _) -> + opweights.resources_of_load trap chunk addressing (List.length addr_regs) + | Istore(chunk, addressing, addr_regs, input, _) -> + opweights.resources_of_store chunk addressing (List.length addr_regs) + | Icall(signature, ef, inputs, output, _) -> + opweights.resources_of_call signature ef + | Ibuiltin(ef, builtin_inputs, builtin_output, _) -> + opweights.resources_of_builtin ef + | Icond(cond, args, _, _ , _) -> + opweights.resources_of_cond cond (List.length args) + | Itailcall _ | Ijumptable _ | Ireturn _ -> opweights.pipelined_resource_bounds + +let print_sequence pp (seqa : instruction array) = + Array.iteri ( + fun i (insn : instruction) -> + PrintRTL.print_instruction pp (i, insn)) seqa;; + +type unique_id = int + +type 'a symbolic_term_node = + | STop of Op.operation * 'a list + | STinitial_reg of int + | STother of int;; + +type symbolic_term = { + hash_id : unique_id; + hash_ct : symbolic_term symbolic_term_node + };; + +let rec print_term channel term = + match term.hash_ct with + | STop(op, args) -> + PrintOp.print_operation print_term channel (op, args) + | STinitial_reg n -> Printf.fprintf channel "x%d" n + | STother n -> Printf.fprintf channel "y%d" n;; + +type symbolic_term_table = { + st_table : (unique_id symbolic_term_node, symbolic_term) Hashtbl.t; + mutable st_next_id : unique_id };; + +let hash_init () = { + st_table = Hashtbl.create 20; + st_next_id = 0 + };; + +let ground_to_id = function + | STop(op, l) -> STop(op, List.map (fun t -> t.hash_id) l) + | STinitial_reg r -> STinitial_reg r + | STother i -> STother i;; + +let hash_node (table : symbolic_term_table) (term : symbolic_term symbolic_term_node) : symbolic_term = + let grounded = ground_to_id term in + match Hashtbl.find_opt table.st_table grounded with + | Some x -> x + | None -> + let term' = { hash_id = table.st_next_id; + hash_ct = term } in + (if table.st_next_id = max_int then failwith "hash: max_int"); + table.st_next_id <- table.st_next_id + 1; + Hashtbl.add table.st_table grounded term'; + term';; + +type access = { + base : symbolic_term; + offset : int64; + length : int + };; + +let term_equal a b = (a.hash_id = b.hash_id);; + +let access_of_addressing get_reg chunk addressing args = + match addressing, args with + | (Op.Aindexed ofs), [reg] -> Some + { base = get_reg reg; + offset = Camlcoq.camlint64_of_ptrofs ofs; + length = length_of_chunk chunk + } + | _, _ -> None ;; +(* TODO: global *) + +let symbolic_execution (seqa : instruction array) = + let regs = ref PTree.empty + and table = hash_init() in + let assign reg term = regs := PTree.set reg term !regs + and hash term = hash_node table term in + let get_reg reg = + match PTree.get reg !regs with + | None -> hash (STinitial_reg (Camlcoq.P.to_int reg)) + | Some x -> x in + let targets = Array.make (Array.length seqa) None in + Array.iteri + begin + fun i insn -> + match insn with + | Iop(Op.Omove, [input], output, _) -> + assign output (get_reg input) + | Iop(op, inputs, output, _) -> + assign output (hash (STop(op, List.map get_reg inputs))) + + | Iload(trap, chunk, addressing, args, output, _) -> + let access = access_of_addressing get_reg chunk addressing args in + targets.(i) <- access; + assign output (hash (STother(i))) + + | Icall(_, _, _, output, _) + | Ibuiltin(_, _, BR output, _) -> + assign output (hash (STother(i))) + + | Istore(chunk, addressing, args, va, _) -> + let access = access_of_addressing get_reg chunk addressing args in + targets.(i) <- access + + | Inop _ -> () + | Ibuiltin(_, _, BR_none, _) -> () + | Ibuiltin(_, _, BR_splitlong _, _) -> failwith "BR_splitlong" + + | Itailcall (_, _, _) + |Icond (_, _, _, _, _) + |Ijumptable (_, _) + |Ireturn _ -> () + end seqa; + targets;; + +let print_access channel = function + | None -> Printf.fprintf channel "any" + | Some x -> Printf.fprintf channel "%a + %Ld" print_term x.base x.offset;; + +let print_targets channel seqa = + let targets = symbolic_execution seqa in + Array.iteri + (fun i insn -> + match insn with + | Iload _ -> Printf.fprintf channel "%d: load %a\n" + i print_access targets.(i) + | Istore _ -> Printf.fprintf channel "%d: store %a\n" + i print_access targets.(i) + | _ -> () + ) seqa;; + +let may_overlap a0 b0 = + match a0, b0 with + | (None, _) | (_ , None) -> true + | (Some a), (Some b) -> + if term_equal a.base b.base + then (max a.offset b.offset) < + (min (Int64.add (Int64.of_int a.length) a.offset) + (Int64.add (Int64.of_int b.length) b.offset)) + else match a.base.hash_ct, b.base.hash_ct with + | STop(Op.Oaddrsymbol(ida, ofsa),[]), + STop(Op.Oaddrsymbol(idb, ofsb),[]) -> + (ida=idb) && + let ao = Int64.add a.offset (Camlcoq.camlint64_of_ptrofs ofsa) + and bo = Int64.add b.offset (Camlcoq.camlint64_of_ptrofs ofsb) in + (max ao bo) < + (min (Int64.add (Int64.of_int a.length) ao) + (Int64.add (Int64.of_int b.length) bo)) + | STop(Op.Oaddrstack _, []), + STop(Op.Oaddrsymbol _, []) + | STop(Op.Oaddrsymbol _, []), + STop(Op.Oaddrstack _, []) -> false + | STop(Op.Oaddrstack(ofsa),[]), + STop(Op.Oaddrstack(ofsb),[]) -> + let ao = Int64.add a.offset (Camlcoq.camlint64_of_ptrofs ofsa) + and bo = Int64.add b.offset (Camlcoq.camlint64_of_ptrofs ofsb) in + (max ao bo) < + (min (Int64.add (Int64.of_int a.length) ao) + (Int64.add (Int64.of_int b.length) bo)) + | _ -> true;; + +(* +(* TODO suboptimal quadratic algorithm *) +let get_alias_dependencies seqa = + let targets = symbolic_execution seqa + and deps = ref [] in + let add_constraint instr_from instr_to latency = + deps := { instr_from = instr_from; + instr_to = instr_to; + latency = latency + }:: !deps in + for i=0 to (Array.length seqa)-1 + do + for j=0 to i-1 + do + match seqa.(j), seqa.(i) with + | (Istore _), ((Iload _) | (Istore _)) -> + if may_overlap targets.(j) targets.(i) + then add_constraint j i 1 + | (Iload _), (Istore _) -> + if may_overlap targets.(j) targets.(i) + then add_constraint j i 0 + | (Istore _ | Iload _), (Icall _ | Ibuiltin _) + | (Icall _ | Ibuiltin _), (Icall _ | Ibuiltin _ | Iload _ | Istore _) -> + add_constraint j i 1 + | (Inop _ | Iop _), _ + | _, (Inop _ | Iop _) + | (Iload _), (Iload _) -> () + done + done; + !deps;; + *) + +let define_problem (opweights : opweights) (live_entry_regs : Regset.t) + (typing : RTLtyping.regenv) reference_counting seqa = + let simple_deps = get_simple_dependencies opweights seqa in + { max_latency = -1; + resource_bounds = opweights.pipelined_resource_bounds; + live_regs_entry = live_entry_regs; + typing = typing; + reference_counting = Some reference_counting; + instruction_usages = Array.map (resources_of_instruction opweights) (Array.map fst seqa); + latency_constraints = + (* if (use_alias_analysis ()) + then (get_alias_dependencies seqa) @ simple_deps + else *) simple_deps };; + +let zigzag_scheduler problem early_ones = + let nr_instructions = get_nr_instructions problem in + assert(nr_instructions = (Array.length early_ones)); + match list_scheduler problem with + | Some fwd_schedule -> + let fwd_makespan = fwd_schedule.((Array.length fwd_schedule) - 1) in + let constraints' = ref problem.latency_constraints in + Array.iteri (fun i is_early -> + if is_early then + constraints' := { + instr_from = i; + instr_to = nr_instructions ; + latency = fwd_makespan - fwd_schedule.(i) } ::!constraints' ) + early_ones; + validated_scheduler reverse_list_scheduler + { problem with latency_constraints = !constraints' } + | None -> None;; + +let prepass_scheduler_by_name name problem early_ones = + match name with + | "zigzag" -> zigzag_scheduler problem early_ones + | _ -> scheduler_by_name name problem + +let schedule_sequence (seqa : (instruction*Regset.t) array) + (live_regs_entry : Registers.Regset.t) + (typing : RTLtyping.regenv) + reference = + let opweights = OpWeights.get_opweights () in + try + if (Array.length seqa) <= 1 + then None + else + begin + let nr_instructions = Array.length seqa in + (if !Clflags.option_debug_compcert > 6 + then Printf.printf "prepass scheduling length = %d\n" (Array.length seqa)); + let problem = define_problem opweights live_regs_entry + typing reference seqa in + (if !Clflags.option_debug_compcert > 7 + then (print_sequence stdout (Array.map fst seqa); + print_problem stdout problem)); + match prepass_scheduler_by_name + (!Clflags.option_fprepass_sched) + problem + (Array.map (fun (ins, _) -> + match ins with + | Icond _ -> true + | _ -> false) seqa) with + | None -> Printf.printf "no solution in prepass scheduling\n"; + None + | Some solution -> + let positions = Array.init nr_instructions (fun i -> i) in + Array.sort (fun i j -> + let si = solution.(i) and sj = solution.(j) in + if si < sj then -1 + else if si > sj then 1 + else i - j) positions; + Some positions + end + with (Failure s) -> + Printf.printf "failure in prepass scheduling: %s\n" s; + None;; + diff --git a/kvx/PrepassSchedulingOracleDeps.ml b/kvx/PrepassSchedulingOracleDeps.ml deleted file mode 120000 index 1e955b85..00000000 --- a/kvx/PrepassSchedulingOracleDeps.ml +++ /dev/null @@ -1 +0,0 @@ -../aarch64/PrepassSchedulingOracleDeps.ml \ No newline at end of file diff --git a/kvx/PrepassSchedulingOracleDeps.ml b/kvx/PrepassSchedulingOracleDeps.ml new file mode 100644 index 00000000..8d10d406 --- /dev/null +++ b/kvx/PrepassSchedulingOracleDeps.ml @@ -0,0 +1,17 @@ +type called_function = (Registers.reg, AST.ident) Datatypes.sum + +type opweights = + { + pipelined_resource_bounds : int array; + nr_non_pipelined_units : int; + latency_of_op : Op.operation -> int -> int; + resources_of_op : Op.operation -> int -> int array; + non_pipelined_resources_of_op : Op.operation -> int -> int array; + latency_of_load : AST.trapping_mode -> AST.memory_chunk -> Op.addressing -> int -> int; + resources_of_load : AST.trapping_mode -> AST.memory_chunk -> Op.addressing -> int -> int array; + resources_of_store : AST.memory_chunk -> Op.addressing -> int -> int array; + resources_of_cond : Op.condition -> int -> int array; + latency_of_call : AST.signature -> called_function -> int; + resources_of_call : AST.signature -> called_function -> int array; + resources_of_builtin : AST.external_function -> int array + };; diff --git a/powerpc/ExpansionOracle.ml b/powerpc/ExpansionOracle.ml deleted file mode 120000 index ee2674bf..00000000 --- a/powerpc/ExpansionOracle.ml +++ /dev/null @@ -1 +0,0 @@ -../aarch64/ExpansionOracle.ml \ No newline at end of file diff --git a/powerpc/ExpansionOracle.ml b/powerpc/ExpansionOracle.ml new file mode 100644 index 00000000..3b63b80d --- /dev/null +++ b/powerpc/ExpansionOracle.ml @@ -0,0 +1,17 @@ +(* *************************************************************) +(* *) +(* The Compcert verified compiler *) +(* *) +(* Léo Gourdin UGA, VERIMAG *) +(* *) +(* Copyright VERIMAG. All rights reserved. *) +(* This file is distributed under the terms of the INRIA *) +(* Non-Commercial License Agreement. *) +(* *) +(* *************************************************************) + +open RTLpathCommon + +let expanse (sb : superblock) code pm = (code, pm) + +let find_last_node_reg c = () diff --git a/powerpc/PrepassSchedulingOracle.ml b/powerpc/PrepassSchedulingOracle.ml deleted file mode 120000 index 9885fd52..00000000 --- a/powerpc/PrepassSchedulingOracle.ml +++ /dev/null @@ -1 +0,0 @@ -../x86/PrepassSchedulingOracle.ml \ No newline at end of file diff --git a/powerpc/PrepassSchedulingOracle.ml b/powerpc/PrepassSchedulingOracle.ml new file mode 100644 index 00000000..42a3da23 --- /dev/null +++ b/powerpc/PrepassSchedulingOracle.ml @@ -0,0 +1,6 @@ +open RTL +open Registers + +(* Do not do anything *) +let schedule_sequence (seqa : (instruction*Regset.t) array) + live_regs_entry typing reference = None diff --git a/riscV/Machregsaux.ml b/riscV/Machregsaux.ml index 840943e7..e3e47946 100644 --- a/riscV/Machregsaux.ml +++ b/riscV/Machregsaux.ml @@ -18,3 +18,5 @@ let class_of_type = function | AST.Tint | AST.Tlong -> 0 | AST.Tfloat | AST.Tsingle -> 1 | AST.Tany32 | AST.Tany64 -> assert false + +let nr_regs = [| 26; 32|] diff --git a/riscV/Machregsaux.mli b/riscV/Machregsaux.mli index 01b0f9fd..bb3777bf 100644 --- a/riscV/Machregsaux.mli +++ b/riscV/Machregsaux.mli @@ -15,3 +15,6 @@ val is_scratch_register: string -> bool val class_of_type: AST.typ -> int + +(* Number of registers in each class *) +val nr_regs : int array diff --git a/riscV/PrepassSchedulingOracle.ml b/riscV/PrepassSchedulingOracle.ml deleted file mode 120000 index 912e9ffa..00000000 --- a/riscV/PrepassSchedulingOracle.ml +++ /dev/null @@ -1 +0,0 @@ -../aarch64/PrepassSchedulingOracle.ml \ No newline at end of file diff --git a/riscV/PrepassSchedulingOracle.ml b/riscV/PrepassSchedulingOracle.ml new file mode 100644 index 00000000..53a81095 --- /dev/null +++ b/riscV/PrepassSchedulingOracle.ml @@ -0,0 +1,485 @@ +open AST +open RTL +open Maps +open InstructionScheduler +open Registers +open PrepassSchedulingOracleDeps + +let use_alias_analysis () = false + +let length_of_chunk = function +| Mint8signed +| Mint8unsigned -> 1 +| Mint16signed +| Mint16unsigned -> 2 +| Mint32 +| Mfloat32 +| Many32 -> 4 +| Mint64 +| Mfloat64 +| Many64 -> 8;; + +let get_simple_dependencies (opweights : opweights) (seqa : (instruction*Regset.t) array) = + let last_reg_reads : int list PTree.t ref = ref PTree.empty + and last_reg_write : (int*int) PTree.t ref = ref PTree.empty + and last_mem_reads : int list ref = ref [] + and last_mem_write : int option ref = ref None + and last_branch : int option ref = ref None + and last_non_pipelined_op : int array = Array.make + opweights.nr_non_pipelined_units ( -1 ) + and latency_constraints : latency_constraint list ref = ref [] in + let add_constraint instr_from instr_to latency = + assert (instr_from <= instr_to); + assert (latency >= 0); + if instr_from = instr_to + then (if latency = 0 + then () + else failwith "PrepassSchedulingOracle.get_dependencies: negative self-loop") + else + latency_constraints := + { instr_from = instr_from; + instr_to = instr_to; + latency = latency + }:: !latency_constraints + and get_last_reads reg = + match PTree.get reg !last_reg_reads + with Some l -> l + | None -> [] in + let add_input_mem i = + if not (use_alias_analysis ()) + then + begin + begin + (* Read after write *) + match !last_mem_write with + | None -> () + | Some j -> add_constraint j i 1 + end; + last_mem_reads := i :: !last_mem_reads + end + and add_output_mem i = + if not (use_alias_analysis ()) + then + begin + begin + (* Write after write *) + match !last_mem_write with + | None -> () + | Some j -> add_constraint j i 1 + end; + (* Write after read *) + List.iter (fun j -> add_constraint j i 0) !last_mem_reads; + last_mem_write := Some i; + last_mem_reads := [] + end + and add_input_reg i reg = + begin + (* Read after write *) + match PTree.get reg !last_reg_write with + | None -> () + | Some (j, latency) -> add_constraint j i latency + end; + last_reg_reads := PTree.set reg + (i :: get_last_reads reg) + !last_reg_reads + and add_output_reg i latency reg = + begin + (* Write after write *) + match PTree.get reg !last_reg_write with + | None -> () + | Some (j, _) -> add_constraint j i 1 + end; + begin + (* Write after read *) + List.iter (fun j -> add_constraint j i 0) (get_last_reads reg) + end; + last_reg_write := PTree.set reg (i, latency) !last_reg_write; + last_reg_reads := PTree.remove reg !last_reg_reads + in + let add_input_regs i regs = List.iter (add_input_reg i) regs in + let rec add_builtin_res i (res : reg builtin_res) = + match res with + | BR r -> add_output_reg i 10 r + | BR_none -> () + | BR_splitlong (hi, lo) -> add_builtin_res i hi; + add_builtin_res i lo in + let rec add_builtin_arg i (ba : reg builtin_arg) = + match ba with + | BA r -> add_input_reg i r + | BA_int _ | BA_long _ | BA_float _ | BA_single _ -> () + | BA_loadstack(_,_) -> add_input_mem i + | BA_addrstack _ -> () + | BA_loadglobal(_, _, _) -> add_input_mem i + | BA_addrglobal _ -> () + | BA_splitlong(hi, lo) -> add_builtin_arg i hi; + add_builtin_arg i lo + | BA_addptr(a1, a2) -> add_builtin_arg i a1; + add_builtin_arg i a2 in + let irreversible_action i = + match !last_branch with + | None -> () + | Some j -> add_constraint j i 1 in + let set_branch i = + irreversible_action i; + last_branch := Some i in + let add_non_pipelined_resources i resources = + Array.iter2 + (fun latency last -> + if latency >= 0 && last >= 0 then add_constraint last i latency) + resources last_non_pipelined_op; + Array.iteri (fun rsc latency -> + if latency >= 0 + then last_non_pipelined_op.(rsc) <- i) resources + in + Array.iteri + begin + fun i (insn, other_uses) -> + List.iter (fun use -> + add_input_reg i use) + (Regset.elements other_uses); + + match insn with + | Inop _ -> () + | Iop(op, inputs, output, _) -> + add_non_pipelined_resources i + (opweights.non_pipelined_resources_of_op op (List.length inputs)); + (if Op.is_trapping_op op then irreversible_action i); + add_input_regs i inputs; + add_output_reg i (opweights.latency_of_op op (List.length inputs)) output + | Iload(trap, chunk, addressing, addr_regs, output, _) -> + (if trap=TRAP then irreversible_action i); + add_input_mem i; + add_input_regs i addr_regs; + add_output_reg i (opweights.latency_of_load trap chunk addressing (List.length addr_regs)) output + | Istore(chunk, addressing, addr_regs, input, _) -> + irreversible_action i; + add_input_regs i addr_regs; + add_input_reg i input; + add_output_mem i + | Icall(signature, ef, inputs, output, _) -> + set_branch i; + (match ef with + | Datatypes.Coq_inl r -> add_input_reg i r + | Datatypes.Coq_inr symbol -> () + ); + add_input_mem i; + add_input_regs i inputs; + add_output_reg i (opweights.latency_of_call signature ef) output; + add_output_mem i; + failwith "Icall" + | Itailcall(signature, ef, inputs) -> + set_branch i; + (match ef with + | Datatypes.Coq_inl r -> add_input_reg i r + | Datatypes.Coq_inr symbol -> () + ); + add_input_mem i; + add_input_regs i inputs; + failwith "Itailcall" + | Ibuiltin(ef, builtin_inputs, builtin_output, _) -> + set_branch i; + add_input_mem i; + List.iter (add_builtin_arg i) builtin_inputs; + add_builtin_res i builtin_output; + add_output_mem i; + failwith "Ibuiltin" + | Icond(cond, inputs, _, _, _) -> + set_branch i; + add_input_mem i; + add_input_regs i inputs + | Ijumptable(input, _) -> + set_branch i; + add_input_reg i input; + failwith "Ijumptable" + | Ireturn(Some input) -> + set_branch i; + add_input_reg i input; + failwith "Ireturn" + | Ireturn(None) -> + set_branch i; + failwith "Ireturn none" + end seqa; + !latency_constraints;; + +let resources_of_instruction (opweights : opweights) = function + | Inop _ -> Array.map (fun _ -> 0) opweights.pipelined_resource_bounds + | Iop(op, inputs, output, _) -> + opweights.resources_of_op op (List.length inputs) + | Iload(trap, chunk, addressing, addr_regs, output, _) -> + opweights.resources_of_load trap chunk addressing (List.length addr_regs) + | Istore(chunk, addressing, addr_regs, input, _) -> + opweights.resources_of_store chunk addressing (List.length addr_regs) + | Icall(signature, ef, inputs, output, _) -> + opweights.resources_of_call signature ef + | Ibuiltin(ef, builtin_inputs, builtin_output, _) -> + opweights.resources_of_builtin ef + | Icond(cond, args, _, _ , _) -> + opweights.resources_of_cond cond (List.length args) + | Itailcall _ | Ijumptable _ | Ireturn _ -> opweights.pipelined_resource_bounds + +let print_sequence pp (seqa : instruction array) = + Array.iteri ( + fun i (insn : instruction) -> + PrintRTL.print_instruction pp (i, insn)) seqa;; + +type unique_id = int + +type 'a symbolic_term_node = + | STop of Op.operation * 'a list + | STinitial_reg of int + | STother of int;; + +type symbolic_term = { + hash_id : unique_id; + hash_ct : symbolic_term symbolic_term_node + };; + +let rec print_term channel term = + match term.hash_ct with + | STop(op, args) -> + PrintOp.print_operation print_term channel (op, args) + | STinitial_reg n -> Printf.fprintf channel "x%d" n + | STother n -> Printf.fprintf channel "y%d" n;; + +type symbolic_term_table = { + st_table : (unique_id symbolic_term_node, symbolic_term) Hashtbl.t; + mutable st_next_id : unique_id };; + +let hash_init () = { + st_table = Hashtbl.create 20; + st_next_id = 0 + };; + +let ground_to_id = function + | STop(op, l) -> STop(op, List.map (fun t -> t.hash_id) l) + | STinitial_reg r -> STinitial_reg r + | STother i -> STother i;; + +let hash_node (table : symbolic_term_table) (term : symbolic_term symbolic_term_node) : symbolic_term = + let grounded = ground_to_id term in + match Hashtbl.find_opt table.st_table grounded with + | Some x -> x + | None -> + let term' = { hash_id = table.st_next_id; + hash_ct = term } in + (if table.st_next_id = max_int then failwith "hash: max_int"); + table.st_next_id <- table.st_next_id + 1; + Hashtbl.add table.st_table grounded term'; + term';; + +type access = { + base : symbolic_term; + offset : int64; + length : int + };; + +let term_equal a b = (a.hash_id = b.hash_id);; + +let access_of_addressing get_reg chunk addressing args = + match addressing, args with + | (Op.Aindexed ofs), [reg] -> Some + { base = get_reg reg; + offset = Camlcoq.camlint64_of_ptrofs ofs; + length = length_of_chunk chunk + } + | _, _ -> None ;; +(* TODO: global *) + +let symbolic_execution (seqa : instruction array) = + let regs = ref PTree.empty + and table = hash_init() in + let assign reg term = regs := PTree.set reg term !regs + and hash term = hash_node table term in + let get_reg reg = + match PTree.get reg !regs with + | None -> hash (STinitial_reg (Camlcoq.P.to_int reg)) + | Some x -> x in + let targets = Array.make (Array.length seqa) None in + Array.iteri + begin + fun i insn -> + match insn with + | Iop(Op.Omove, [input], output, _) -> + assign output (get_reg input) + | Iop(op, inputs, output, _) -> + assign output (hash (STop(op, List.map get_reg inputs))) + + | Iload(trap, chunk, addressing, args, output, _) -> + let access = access_of_addressing get_reg chunk addressing args in + targets.(i) <- access; + assign output (hash (STother(i))) + + | Icall(_, _, _, output, _) + | Ibuiltin(_, _, BR output, _) -> + assign output (hash (STother(i))) + + | Istore(chunk, addressing, args, va, _) -> + let access = access_of_addressing get_reg chunk addressing args in + targets.(i) <- access + + | Inop _ -> () + | Ibuiltin(_, _, BR_none, _) -> () + | Ibuiltin(_, _, BR_splitlong _, _) -> failwith "BR_splitlong" + + | Itailcall (_, _, _) + |Icond (_, _, _, _, _) + |Ijumptable (_, _) + |Ireturn _ -> () + end seqa; + targets;; + +let print_access channel = function + | None -> Printf.fprintf channel "any" + | Some x -> Printf.fprintf channel "%a + %Ld" print_term x.base x.offset;; + +let print_targets channel seqa = + let targets = symbolic_execution seqa in + Array.iteri + (fun i insn -> + match insn with + | Iload _ -> Printf.fprintf channel "%d: load %a\n" + i print_access targets.(i) + | Istore _ -> Printf.fprintf channel "%d: store %a\n" + i print_access targets.(i) + | _ -> () + ) seqa;; + +let may_overlap a0 b0 = + match a0, b0 with + | (None, _) | (_ , None) -> true + | (Some a), (Some b) -> + if term_equal a.base b.base + then (max a.offset b.offset) < + (min (Int64.add (Int64.of_int a.length) a.offset) + (Int64.add (Int64.of_int b.length) b.offset)) + else match a.base.hash_ct, b.base.hash_ct with + | STop(Op.Oaddrsymbol(ida, ofsa),[]), + STop(Op.Oaddrsymbol(idb, ofsb),[]) -> + (ida=idb) && + let ao = Int64.add a.offset (Camlcoq.camlint64_of_ptrofs ofsa) + and bo = Int64.add b.offset (Camlcoq.camlint64_of_ptrofs ofsb) in + (max ao bo) < + (min (Int64.add (Int64.of_int a.length) ao) + (Int64.add (Int64.of_int b.length) bo)) + | STop(Op.Oaddrstack _, []), + STop(Op.Oaddrsymbol _, []) + | STop(Op.Oaddrsymbol _, []), + STop(Op.Oaddrstack _, []) -> false + | STop(Op.Oaddrstack(ofsa),[]), + STop(Op.Oaddrstack(ofsb),[]) -> + let ao = Int64.add a.offset (Camlcoq.camlint64_of_ptrofs ofsa) + and bo = Int64.add b.offset (Camlcoq.camlint64_of_ptrofs ofsb) in + (max ao bo) < + (min (Int64.add (Int64.of_int a.length) ao) + (Int64.add (Int64.of_int b.length) bo)) + | _ -> true;; + +(* +(* TODO suboptimal quadratic algorithm *) +let get_alias_dependencies seqa = + let targets = symbolic_execution seqa + and deps = ref [] in + let add_constraint instr_from instr_to latency = + deps := { instr_from = instr_from; + instr_to = instr_to; + latency = latency + }:: !deps in + for i=0 to (Array.length seqa)-1 + do + for j=0 to i-1 + do + match seqa.(j), seqa.(i) with + | (Istore _), ((Iload _) | (Istore _)) -> + if may_overlap targets.(j) targets.(i) + then add_constraint j i 1 + | (Iload _), (Istore _) -> + if may_overlap targets.(j) targets.(i) + then add_constraint j i 0 + | (Istore _ | Iload _), (Icall _ | Ibuiltin _) + | (Icall _ | Ibuiltin _), (Icall _ | Ibuiltin _ | Iload _ | Istore _) -> + add_constraint j i 1 + | (Inop _ | Iop _), _ + | _, (Inop _ | Iop _) + | (Iload _), (Iload _) -> () + done + done; + !deps;; + *) + +let define_problem (opweights : opweights) (live_entry_regs : Regset.t) + (typing : RTLtyping.regenv) reference_counting seqa = + let simple_deps = get_simple_dependencies opweights seqa in + { max_latency = -1; + resource_bounds = opweights.pipelined_resource_bounds; + live_regs_entry = live_entry_regs; + typing = typing; + reference_counting = Some reference_counting; + instruction_usages = Array.map (resources_of_instruction opweights) (Array.map fst seqa); + latency_constraints = + (* if (use_alias_analysis ()) + then (get_alias_dependencies seqa) @ simple_deps + else *) simple_deps };; + +let zigzag_scheduler problem early_ones = + let nr_instructions = get_nr_instructions problem in + assert(nr_instructions = (Array.length early_ones)); + match list_scheduler problem with + | Some fwd_schedule -> + let fwd_makespan = fwd_schedule.((Array.length fwd_schedule) - 1) in + let constraints' = ref problem.latency_constraints in + Array.iteri (fun i is_early -> + if is_early then + constraints' := { + instr_from = i; + instr_to = nr_instructions ; + latency = fwd_makespan - fwd_schedule.(i) } ::!constraints' ) + early_ones; + validated_scheduler reverse_list_scheduler + { problem with latency_constraints = !constraints' } + | None -> None;; + +let prepass_scheduler_by_name name problem early_ones = + match name with + | "zigzag" -> zigzag_scheduler problem early_ones + | _ -> scheduler_by_name name problem + +let schedule_sequence (seqa : (instruction*Regset.t) array) + (live_regs_entry : Registers.Regset.t) + (typing : RTLtyping.regenv) + reference = + let opweights = OpWeights.get_opweights () in + try + if (Array.length seqa) <= 1 + then None + else + begin + let nr_instructions = Array.length seqa in + (if !Clflags.option_debug_compcert > 6 + then Printf.printf "prepass scheduling length = %d\n" (Array.length seqa)); + let problem = define_problem opweights live_regs_entry + typing reference seqa in + (if !Clflags.option_debug_compcert > 7 + then (print_sequence stdout (Array.map fst seqa); + print_problem stdout problem)); + match prepass_scheduler_by_name + (!Clflags.option_fprepass_sched) + problem + (Array.map (fun (ins, _) -> + match ins with + | Icond _ -> true + | _ -> false) seqa) with + | None -> Printf.printf "no solution in prepass scheduling\n"; + None + | Some solution -> + let positions = Array.init nr_instructions (fun i -> i) in + Array.sort (fun i j -> + let si = solution.(i) and sj = solution.(j) in + if si < sj then -1 + else if si > sj then 1 + else i - j) positions; + Some positions + end + with (Failure s) -> + Printf.printf "failure in prepass scheduling: %s\n" s; + None;; + diff --git a/riscV/PrepassSchedulingOracleDeps.ml b/riscV/PrepassSchedulingOracleDeps.ml deleted file mode 120000 index 1e955b85..00000000 --- a/riscV/PrepassSchedulingOracleDeps.ml +++ /dev/null @@ -1 +0,0 @@ -../aarch64/PrepassSchedulingOracleDeps.ml \ No newline at end of file diff --git a/riscV/PrepassSchedulingOracleDeps.ml b/riscV/PrepassSchedulingOracleDeps.ml new file mode 100644 index 00000000..8d10d406 --- /dev/null +++ b/riscV/PrepassSchedulingOracleDeps.ml @@ -0,0 +1,17 @@ +type called_function = (Registers.reg, AST.ident) Datatypes.sum + +type opweights = + { + pipelined_resource_bounds : int array; + nr_non_pipelined_units : int; + latency_of_op : Op.operation -> int -> int; + resources_of_op : Op.operation -> int -> int array; + non_pipelined_resources_of_op : Op.operation -> int -> int array; + latency_of_load : AST.trapping_mode -> AST.memory_chunk -> Op.addressing -> int -> int; + resources_of_load : AST.trapping_mode -> AST.memory_chunk -> Op.addressing -> int -> int array; + resources_of_store : AST.memory_chunk -> Op.addressing -> int -> int array; + resources_of_cond : Op.condition -> int -> int array; + latency_of_call : AST.signature -> called_function -> int; + resources_of_call : AST.signature -> called_function -> int array; + resources_of_builtin : AST.external_function -> int array + };; diff --git a/scheduling/InstructionScheduler.ml b/scheduling/InstructionScheduler.ml index eab0b21a..0203d9c8 100644 --- a/scheduling/InstructionScheduler.ml +++ b/scheduling/InstructionScheduler.ml @@ -33,6 +33,10 @@ type latency_constraint = { type problem = { max_latency : int; resource_bounds : int array; + live_regs_entry : Registers.Regset.t; + typing : RTLtyping.regenv; + reference_counting : ((Registers.reg, int * int) Hashtbl.t + * ((Registers.reg * bool) list array)) option; instruction_usages : int array array; latency_constraints : latency_constraint list; };; @@ -118,6 +122,13 @@ let vector_less_equal a b = true with Exit -> false;; +(* let vector_add a b = + * assert ((Array.length a) = (Array.length b)); + * for i=0 to (Array.length a)-1 + * do + * b.(i) <- b.(i) + a.(i) + * done;; *) + let vector_subtract a b = assert ((Array.length a) = (Array.length b)); for i=0 to (Array.length a)-1 @@ -257,8 +268,8 @@ let priority_list_scheduler (order : list_scheduler_order) assert(!time >= 0); !time with Exit -> -1 - in + let advance_time() = begin (if !current_time < max_time-1 @@ -267,7 +278,8 @@ let priority_list_scheduler (order : list_scheduler_order) Array.blit problem.resource_bounds 0 current_resources 0 (Array.length current_resources); ready.(!current_time + 1) <- - InstrSet.union (ready.(!current_time)) (ready.(!current_time + 1)); + InstrSet.union (ready.(!current_time)) + (ready.(!current_time + 1)); ready.(!current_time) <- InstrSet.empty; end); incr current_time @@ -334,6 +346,485 @@ let list_scheduler = priority_list_scheduler CRITICAL_PATH_ORDER;; (* dummy code for placating ocaml's warnings *) let _ = fun x -> priority_list_scheduler INSTRUCTION_ORDER x;; + +(* A scheduler sensitive to register pressure *) +let reg_pres_scheduler (problem : problem) : solution option = + DebugPrint.debug_flag := true; + + let nr_instructions = get_nr_instructions problem in + + if !Clflags.option_debug_compcert > 6 then + DebugPrint.debug "SCHEDULING_SUPERBLOCK %d\n" nr_instructions; + + let successors = get_successors problem + and predecessors = get_predecessors problem + and times = Array.make (nr_instructions+1) (-1) in + let live_regs_entry = problem.live_regs_entry in + + let available_regs = Array.copy Machregsaux.nr_regs in + + let nr_types_regs = Array.length available_regs in + + let thres = Array.fold_left (min) + (max !(Clflags.option_regpres_threshold) 0) + Machregsaux.nr_regs + in + + + let regs_thresholds = Array.make nr_types_regs thres in + (* placeholder value *) + + let class_r r = + Machregsaux.class_of_type (problem.typing r) in + + let live_regs = Hashtbl.create 42 in + + List.iter (fun r -> let classe = Machregsaux.class_of_type + (problem.typing r) in + available_regs.(classe) + <- available_regs.(classe) - 1; + Hashtbl.add live_regs r classe) + (Registers.Regset.elements live_regs_entry); + + let csr_b = ref false in + + let counts, mentions = + match problem.reference_counting with + | Some (l, r) -> l, r + | None -> assert false + in + + let fold_delta i = (fun a (r, b) -> + a + + if class_r r <> i then 0 else + (if b then + if (Hashtbl.find counts r = (i, 1)) + then 1 else 0 + else + match Hashtbl.find_opt live_regs r with + | None -> -1 + | Some t -> 0 + )) in + + let priorities = critical_paths successors in + + let current_resources = Array.copy problem.resource_bounds in + + let module InstrSet = + struct + module MSet = + Set.Make (struct + type t=int + let compare x y = + match priorities.(y) - priorities.(x) with + | 0 -> x - y + | z -> z + end) + + let empty = MSet.empty + let is_empty = MSet.is_empty + let add = MSet.add + let remove = MSet.remove + let union = MSet.union + let iter = MSet.iter + + let compare_regs i x y = + let pyi = List.fold_left (fold_delta i) 0 mentions.(y) in + (* print_int y; + * print_string " "; + * print_int pyi; + * print_newline (); + * flush stdout; *) + let pxi = List.fold_left (fold_delta i) 0 mentions.(x) in + match pyi - pxi with + | 0 -> (match priorities.(y) - priorities.(x) with + | 0 -> x - y + | z -> z) + | z -> z + + (** t is the register class *) + let sched_CSR t ready usages = + (* print_string "looking for max delta"; + * print_newline (); + * flush stdout; *) + let result = ref (-1) in + iter (fun i -> + if vector_less_equal usages.(i) current_resources + then if !result = -1 || (compare_regs t !result i > 0) + then result := i) ready; + !result + end + in + + let max_time = bound_max_time problem + 5*nr_instructions in + let ready = Array.make max_time InstrSet.empty in + + Array.iteri (fun i preds -> + if i < nr_instructions && preds = [] + then ready.(0) <- InstrSet.add i ready.(0)) predecessors; + + let current_time = ref 0 + and earliest_time i = + try + let time = ref (-1) in + List.iter (fun (j, latency) -> + if times.(j) < 0 + then raise Exit + else let t = times.(j) + latency in + if t > !time + then time := t) predecessors.(i); + assert (!time >= 0); + !time + with Exit -> -1 + in + + let advance_time () = + (if !current_time < max_time-1 + then ( + Array.blit problem.resource_bounds 0 current_resources 0 + (Array.length current_resources); + ready.(!current_time + 1) <- + InstrSet.union (ready.(!current_time)) + (ready.(!current_time +1)); + ready.(!current_time) <- InstrSet.empty)); + incr current_time + in + + (* ALL MENTIONS TO cnt ARE PLACEHOLDERS *) + let cnt = ref 0 in + + let attempt_scheduling ready usages = + let result = ref (-1) in + try + Array.iteri (fun i avlregs -> + (* print_string "avlregs: "; + * print_int i; + * print_string " "; + * print_int avlregs; + * print_newline (); + * print_string "live regs: "; + * print_int (Hashtbl.length live_regs); + * print_newline (); + * flush stdout; *) + if avlregs <= regs_thresholds.(i) + then ( + csr_b := true; + let maybe = InstrSet.sched_CSR i ready usages in + (* print_string "maybe\n"; + * print_int maybe; + * print_newline (); + * flush stdout; *) + (if maybe > 0 && + let delta = + List.fold_left (fold_delta i) 0 mentions.(maybe) in + (* print_string "delta "; + * print_int delta; + * print_newline (); + * flush stdout; *) + delta > 0 + then + (vector_subtract usages.(maybe) current_resources; + result := maybe) + else + if not !Clflags.option_regpres_wait_window + then + (InstrSet.iter (fun ins -> + if vector_less_equal usages.(ins) current_resources && + List.fold_left (fold_delta i) 0 mentions.(maybe) >= 0 + then result := ins + ) ready; + if !result <> -1 then + vector_subtract usages.(!result) current_resources) + else + (incr cnt) + ); + raise Exit)) available_regs; + InstrSet.iter (fun i -> + if vector_less_equal usages.(i) current_resources + then ( + vector_subtract usages.(i) current_resources; + result := i; + raise Exit)) ready; + -1 + with Exit -> + !result in + + while !current_time < max_time + do + if (InstrSet.is_empty ready.(!current_time)) + then advance_time () + else + match attempt_scheduling ready.(!current_time) + problem.instruction_usages with + | -1 -> advance_time() + | i -> (assert(times.(i) < 0); + ((* print_string "INSTR ISSUED: "; + * print_int i; + * print_newline (); + * flush stdout; *) + if !csr_b && !Clflags.option_debug_compcert > 6 then + DebugPrint.debug "REGPRES: high pres class %d\n" i; + csr_b := false; + if !Clflags.option_regpres_wait_window then + cnt := 0; + List.iter (fun (r,b) -> + if b then + (match Hashtbl.find_opt counts r with + | None -> assert false + | Some (t, n) -> + Hashtbl.remove counts r; + if n = 1 then + (Hashtbl.remove live_regs r; + available_regs.(t) + <- available_regs.(t) + 1)) + else + let t = class_r r in + match Hashtbl.find_opt live_regs r with + | None -> (Hashtbl.add live_regs r t; + available_regs.(t) + <- available_regs.(t) - 1) + | Some i -> () + ) mentions.(i)); + times.(i) <- !current_time; + ready.(!current_time) + <- InstrSet.remove i (ready.(!current_time)); + List.iter (fun (instr_to, latency) -> + if instr_to < nr_instructions then + match earliest_time instr_to with + | -1 -> () + | to_time -> + ((* DebugPrint.debug "TO TIME %d : %d\n" to_time + * (Array.length ready); *) + ready.(to_time) + <- InstrSet.add instr_to ready.(to_time)) + ) successors.(i); + successors.(i) <- [] + ) + done; + + try + let final_time = ref (-1) in + for i = 0 to nr_instructions - 1 do + (* print_int i; + * flush stdout; *) + (if times.(i) < 0 then raise Exit); + (if !final_time < times.(i) + 1 then final_time := times.(i) + 1) + done; + List.iter (fun (i, latency) -> + let target_time = latency + times.(i) in + if target_time > !final_time then + final_time := target_time) predecessors.(nr_instructions); + times.(nr_instructions) <- !final_time; + DebugPrint.debug_flag := false; + Some times + with Exit -> + DebugPrint.debug "reg_pres_sched failed\n"; + DebugPrint.debug_flag := false; + None + +;; + + +(********************************************************************) + +let reg_pres_scheduler_bis (problem : problem) : solution option = + DebugPrint.debug_flag := true; + Printf.printf "\nNEW\n\n"; + let nr_instructions = get_nr_instructions problem in + let successors = get_successors problem + and predecessors = get_predecessors problem + and times = Array.make (nr_instructions+1) (-1) in + let live_regs_entry = problem.live_regs_entry in + + (* let available_regs = Array.copy Machregsaux.nr_regs in *) + + let class_r r = + Machregsaux.class_of_type (problem.typing r) in + + let live_regs = Hashtbl.create 42 in + + List.iter (fun r -> let classe = Machregsaux.class_of_type + (problem.typing r) in + (* available_regs.(classe) + * <- available_regs.(classe) - 1; *) + Hashtbl.add live_regs r classe) + (Registers.Regset.elements live_regs_entry); + + + let counts, mentions = + match problem.reference_counting with + | Some (l, r) -> l, r + | None -> assert false + in + + let fold_delta a (r, b) = + a + (if b then + match Hashtbl.find_opt counts r with + | Some (_, 1) -> 1 + | _ -> 0 + else + match Hashtbl.find_opt live_regs r with + | None -> -1 + | Some t -> 0 + ) in + + let priorities = critical_paths successors in + + let current_resources = Array.copy problem.resource_bounds in + + let compare_pres x y = + let pdy = List.fold_left (fold_delta) 0 mentions.(y) in + let pdx = List.fold_left (fold_delta) 0 mentions.(x) in + match pdy - pdx with + | 0 -> x - y + | z -> z + in + + let module InstrSet = + Set.Make (struct + type t = int + let compare x y = + match priorities.(y) - priorities.(x) with + | 0 -> x - y + | z -> z + end) in + + let max_time = bound_max_time problem (* + 5*nr_instructions *) in + let ready = Array.make max_time InstrSet.empty in + + Array.iteri (fun i preds -> + if i < nr_instructions && preds = [] + then ready.(0) <- InstrSet.add i ready.(0)) predecessors; + + let current_time = ref 0 + and earliest_time i = + try + let time = ref (-1) in + List.iter (fun (j, latency) -> + if times.(j) < 0 + then raise Exit + else let t = times.(j) + latency in + if t > !time + then time := t) predecessors.(i); + assert (!time >= 0); + !time + with Exit -> -1 + in + + let advance_time () = + (* Printf.printf "ADV\n"; + * flush stdout; *) + (if !current_time < max_time-1 + then ( + Array.blit problem.resource_bounds 0 current_resources 0 + (Array.length current_resources); + ready.(!current_time + 1) <- + InstrSet.union (ready.(!current_time)) + (ready.(!current_time +1)); + ready.(!current_time) <- InstrSet.empty)); + incr current_time + in + + + let attempt_scheduling ready usages = + let result = ref [] in + try + InstrSet.iter (fun i -> + if vector_less_equal usages.(i) current_resources + then + if !result = [] || priorities.(i) = priorities.(List.hd (!result)) + then + result := i::(!result) + else raise Exit + ) ready; + if !result <> [] then raise Exit; + -1 + with + Exit -> + let mini = List.fold_left (fun a b -> + if a = -1 || compare_pres a b > 0 + then b else a + ) (-1) !result in + vector_subtract usages.(mini) current_resources; + mini + in + + while !current_time < max_time + do + if (InstrSet.is_empty ready.(!current_time)) + then advance_time () + else + match attempt_scheduling ready.(!current_time) + problem.instruction_usages with + | -1 -> advance_time() + | i -> ( + DebugPrint.debug "ISSUED: %d\nREADY: " i; + InstrSet.iter (fun i -> DebugPrint.debug "%d " i) + ready.(!current_time); + DebugPrint.debug "\nSUCC: "; + List.iter (fun (i, l) -> DebugPrint.debug "%d " i) + successors.(i); + DebugPrint.debug "\n\n"; + assert(times.(i) < 0); + times.(i) <- !current_time; + ready.(!current_time) + <- InstrSet.remove i (ready.(!current_time)); + (List.iter (fun (r,b) -> + if b then + (match Hashtbl.find_opt counts r with + | None -> assert false + | Some (t, n) -> + Hashtbl.remove counts r; + if n = 1 then + (Hashtbl.remove live_regs r; + (* available_regs.(t) + * <- available_regs.(t) + 1 *))) + else + let t = class_r r in + match Hashtbl.find_opt live_regs r with + | None -> (Hashtbl.add live_regs r t; + (* available_regs.(t) + * <- available_regs.(t) - 1 *)) + | Some i -> () + ) mentions.(i)); + List.iter (fun (instr_to, latency) -> + if instr_to < nr_instructions then + match earliest_time instr_to with + | -1 -> () + | to_time -> + ((* DebugPrint.debug "TO TIME %d : %d\n" to_time + * (Array.length ready); *) + ready.(to_time) + <- InstrSet.add instr_to ready.(to_time)) + ) successors.(i); + successors.(i) <- [] + ) + done; + + try + let final_time = ref (-1) in + for i = 0 to nr_instructions - 1 do + (* print_int i; + * flush stdout; *) + (if times.(i) < 0 then raise Exit); + (if !final_time < times.(i) + 1 then final_time := times.(i) + 1) + done; + List.iter (fun (i, latency) -> + let target_time = latency + times.(i) in + if target_time > !final_time then + final_time := target_time) predecessors.(nr_instructions); + times.(nr_instructions) <- !final_time; + DebugPrint.debug_flag := false; + Some times + with Exit -> + DebugPrint.debug "reg_pres_sched failed\n"; + DebugPrint.debug_flag := false; + None + +;; + +(********************************************************************) + type bundle = int list;; let rec extract_deps_to index = function @@ -438,6 +929,12 @@ let reverse_problem problem = { max_latency = problem.max_latency; resource_bounds = problem.resource_bounds; + live_regs_entry = Registers.Regset.empty; (* PLACEHOLDER *) + (* Not needed for the revlist sched, and for now we wont bother + with creating a reverse scheduler aware of reg press *) + + typing = problem.typing; + reference_counting = problem.reference_counting; instruction_usages = Array.init (nr_instructions + 1) (fun i -> if i=0 @@ -1259,5 +1756,7 @@ let scheduler_by_name name = | "ilp" -> validated_scheduler cascaded_scheduler | "list" -> validated_scheduler list_scheduler | "revlist" -> validated_scheduler reverse_list_scheduler + | "regpres" -> validated_scheduler reg_pres_scheduler + | "regpres_bis" -> validated_scheduler reg_pres_scheduler_bis | "greedy" -> greedy_scheduler | s -> failwith ("unknown scheduler: " ^ s);; diff --git a/scheduling/InstructionScheduler.mli b/scheduling/InstructionScheduler.mli index fb7af3f6..48c7bc09 100644 --- a/scheduling/InstructionScheduler.mli +++ b/scheduling/InstructionScheduler.mli @@ -23,6 +23,16 @@ type problem = { resource_bounds : int array; (** An array of number of units available indexed by the kind of resources to be allocated. It can be empty, in which case the problem is scheduling without resource constraints. *) + live_regs_entry : Registers.Regset.t; + (** The set of live pseudo-registers at entry. *) + + typing : RTLtyping.regenv; + (** Register type map. *) + + reference_counting : ((Registers.reg, int * int) Hashtbl.t + * ((Registers.reg * bool) list array)) option; + (** See RTLpathScheduleraux.reference_counting. *) + instruction_usages: int array array; (** At index {i i} the vector of resources used by instruction number {i i}. It must be the same length as [resource_bounds] *) @@ -68,6 +78,12 @@ Once a clock tick is full go to the next. @return [Some solution] when a solution is found, [None] if not. *) val list_scheduler : problem -> solution option +(** WIP : Same as list_scheduler, but schedules instructions which decrease +register pressure when it gets too high. *) +val reg_pres_scheduler : problem -> solution option + +val reg_pres_scheduler_bis : problem -> solution option + (** Schedule the problem using the order of instructions without any reordering *) val greedy_scheduler : problem -> solution option diff --git a/scheduling/RTLpathScheduleraux.ml b/scheduling/RTLpathScheduleraux.ml index aeed39df..f3f09954 100644 --- a/scheduling/RTLpathScheduleraux.ml +++ b/scheduling/RTLpathScheduleraux.ml @@ -17,7 +17,7 @@ let print_superblock (sb: superblock) code = begin debug "{ instructions = "; print_instructions (Array.to_list insts) code; debug "\n"; debug " liveins = "; print_ptree_regset li; debug "\n"; - debug " output_regs = "; print_regset outs; debug "}" + debug " output_regs = "; print_regset outs; debug "\n}" end let print_superblocks lsb code = @@ -72,6 +72,168 @@ let get_superblocks code entry pm typing = lsb end +(** the useful one. Returns a hashtable with bindings of shape + ** [(r,(t, n)], where [r] is a pseudo-register (Registers.reg), + ** [t] is its class (according to [typing]), and [n] the number of + ** times it's referenced as an argument in instructions of [seqa] ; + ** and an arrray containg the list of regs referenced by each + ** instruction, with a boolean to know whether it's as arg or dest *) +let reference_counting (seqa : (instruction * Regset.t) array) + (out_regs : Registers.Regset.t) (typing : RTLtyping.regenv) : + (Registers.reg, int * int) Hashtbl.t * + (Registers.reg * bool) list array = + let retl = Hashtbl.create 42 in + let retr = Array.make (Array.length seqa) [] in + (* retr.(i) : (r, b) -> (r', b') -> ... + * where b = true if seen as arg, false if seen as dest + *) + List.iter (fun reg -> + Hashtbl.add retl + reg (Machregsaux.class_of_type (typing reg), 1) + ) (Registers.Regset.elements out_regs); + let add_reg reg = + match Hashtbl.find_opt retl reg with + | Some (t, n) -> Hashtbl.add retl reg (t, n+1) + | None -> Hashtbl.add retl reg (Machregsaux.class_of_type + (typing reg), 1) + in + let map_true = List.map (fun r -> r, true) in + Array.iteri (fun i (ins, _) -> + match ins with + | Iop(_,args,dest,_) | Iload(_,_,_,args,dest,_) -> + List.iter (add_reg) args; + retr.(i) <- (dest, false)::(map_true args) + | Icond(_,args,_,_,_) -> + List.iter (add_reg) args; + retr.(i) <- map_true args + | Istore(_,_,args,src,_) -> + List.iter (add_reg) args; + add_reg src; + retr.(i) <- (src, true)::(map_true args) + | Icall(_,fn,args,dest,_) -> + List.iter (add_reg) args; + retr.(i) <- (match fn with + | Datatypes.Coq_inl reg -> + add_reg reg; + (dest,false)::(reg, true)::(map_true args) + | _ -> (dest,false)::(map_true args)) + + | Itailcall(_,fn,args) -> + List.iter (add_reg) args; + retr.(i) <- (match fn with + | Datatypes.Coq_inl reg -> + add_reg reg; + (reg, true)::(map_true args) + | _ -> map_true args) + | Ibuiltin(_,args,dest,_) -> + let rec bar = function + | AST.BA r -> add_reg r; + retr.(i) <- (r, true)::retr.(i) + | AST.BA_splitlong (hi, lo) | AST.BA_addptr (hi, lo) -> + bar hi; bar lo + | _ -> () + in + List.iter (bar) args; + let rec bad = function + | AST.BR r -> retr.(i) <- (r, false)::retr.(i) + | AST.BR_splitlong (hi, lo) -> + bad hi; bad lo + | _ -> () + in + bad dest; + | Ijumptable (reg,_) | Ireturn (Some reg) -> + add_reg reg; + retr.(i) <- [reg, true] + | _ -> () + ) seqa; + (* print_string "mentions\n"; + * Array.iteri (fun i l -> + * print_int i; + * print_string ": ["; + * List.iter (fun (r, b) -> + * print_int (Camlcoq.P.to_int r); + * print_string ":"; + * print_string (if b then "a:" else "d"); + * if b then print_int (snd (Hashtbl.find retl r)); + * print_string ", " + * ) l; + * print_string "]\n"; + * flush stdout; + * ) retr; *) + retl, retr + + +let get_live_regs_entry (sb : superblock) code = + (if !Clflags.option_debug_compcert > 6 + then debug_flag := true); + debug "getting live regs for superblock:\n"; + print_superblock sb code; + debug "\n"; + let seqa = Array.map (fun i -> + (match PTree.get i code with + | Some ii -> ii + | None -> failwith "RTLpathScheduleraux.get_live_regs_entry" + ), + (match PTree.get i sb.liveins with + | Some s -> s + | None -> Regset.empty)) + sb.instructions in + let ret = + Array.fold_right (fun (ins, liveins) regset_i -> + let regset = Registers.Regset.union liveins regset_i in + match ins with + | Inop _ -> regset + | Iop (_, args, dest, _) + | Iload (_, _, _, args, dest, _) -> + List.fold_left (fun set reg -> Registers.Regset.add reg set) + (Registers.Regset.remove dest regset) args + | Istore (_, _, args, src, _) -> + List.fold_left (fun set reg -> Registers.Regset.add reg set) + (Registers.Regset.add src regset) args + | Icall (_, fn, args, dest, _) -> + List.fold_left (fun set reg -> Registers.Regset.add reg set) + ((match fn with + | Datatypes.Coq_inl reg -> (Registers.Regset.add reg) + | Datatypes.Coq_inr _ -> (fun x -> x)) + (Registers.Regset.remove dest regset)) + args + | Itailcall (_, fn, args) -> + List.fold_left (fun set reg -> Registers.Regset.add reg set) + (match fn with + | Datatypes.Coq_inl reg -> (Registers.Regset.add reg regset) + | Datatypes.Coq_inr _ -> regset) + args + | Ibuiltin (_, args, dest, _) -> + List.fold_left (fun set arg -> + let rec add reg set = + match reg with + | AST.BA r -> Registers.Regset.add r set + | AST.BA_splitlong (hi, lo) + | AST.BA_addptr (hi, lo) -> add hi (add lo set) + | _ -> set + in add arg set) + (let rec rem dest set = + match dest with + | AST.BR r -> Registers.Regset.remove r set + | AST.BR_splitlong (hi, lo) -> rem hi (rem lo set) + | _ -> set + in rem dest regset) + args + | Icond (_, args, _, _, _) -> + List.fold_left (fun set reg -> + Registers.Regset.add reg set) + regset args + | Ijumptable (reg, _) + | Ireturn (Some reg) -> + Registers.Regset.add reg regset + | _ -> regset + ) seqa sb.s_output_regs + in debug "live in regs: "; + print_regset ret; + debug "\n"; + debug_flag := false; + ret + (* TODO David *) let schedule_superblock sb code = if not !Clflags.option_fprepass @@ -90,15 +252,22 @@ let schedule_superblock sb code = match predicted_successor ii with | Some _ -> 0 | None -> 1 in + debug "hello\n"; + let live_regs_entry = get_live_regs_entry sb code in + let seqa = + Array.map (fun i -> + (match PTree.get i code with + | Some ii -> ii + | None -> failwith "RTLpathScheduleraux.schedule_superblock"), + (match PTree.get i sb.liveins with + | Some s -> s + | None -> Regset.empty)) + (Array.sub sb.instructions 0 (nr_instr-trailer_length)) in match PrepassSchedulingOracle.schedule_sequence - (Array.map (fun i -> - (match PTree.get i code with - | Some ii -> ii - | None -> failwith "RTLpathScheduleraux.schedule_superblock"), - (match PTree.get i sb.liveins with - | Some s -> s - | None -> Regset.empty)) - (Array.sub sb.instructions 0 (nr_instr-trailer_length))) with + seqa + live_regs_entry + sb.typing + (reference_counting seqa sb.s_output_regs sb.typing) with | None -> sb.instructions | Some order -> let ins' = diff --git a/test/nardino/scheduling/entry_regs.c b/test/nardino/scheduling/entry_regs.c new file mode 100644 index 00000000..9e6adacb --- /dev/null +++ b/test/nardino/scheduling/entry_regs.c @@ -0,0 +1,19 @@ +#include + +int f(int n) { + if (n > 0) + return 42; + else + return n; +} + + +int main(int argc, char *argv[]) { + int a=1; + float b=2.; + int c = f(a); + a = 3; + int d = f(a); + printf("%e, %d, %d, %d", b, a, c, d); + return 0; +} diff --git a/test/nardino/scheduling/spille_backw.c b/test/nardino/scheduling/spille_backw.c new file mode 100644 index 00000000..1c36ee86 --- /dev/null +++ b/test/nardino/scheduling/spille_backw.c @@ -0,0 +1,114 @@ +int f(int k) { + int a1 = k; + int b1 = 2*a1; + int c = a1; + int a2 = k+1; + int b2 = 2*a2; + c += a2; + int a3 = k+2; + int b3 = 2*a3; + c += a3; + int a4 = k+3; + int b4 = 2*a4; + c += a4; + int a5 = k+4; + int b5 = 2*a5; + c += a5; + int a6 = k+5; + int b6 = 2*a6; + c += a6; + int a7 = k+6; + int b7 = 2*a7; + c += a7; + int a8 = k+7; + int b8 = 2*a8; + c += a8; + int a9 = k+8; + int b9 = 2*a9; + c += a9; + int a10 = k+9; + int b10 = 2*a10; + c += a10; + int a11 = k+10; + int b11 = 2*a11; + c += a11; + int a12 = k+11; + int b12 = 2*a12; + c += a12; + int a13 = k+12; + int b13 = 2*a13; + c += a13; + int a14 = k+13; + int b14 = 2*a14; + c += a14; + int a15 = k+14; + int b15 = 2*a15; + c += a15; + int a16 = k+15; + int b16 = 2*a16; + c += a16; + int a17 = k+16; + int b17 = 2*a17; + c += a17; + int a18 = k+17; + int b18 = 2*a18; + c += a18; + int a19 = k+18; + int b19 = 2*a19; + c += a19; + int a20 = k+19; + int b20 = 2*a20; + c += a20; + int a21 = k+20; + int b21 = 2*a21; + c += a21; + int a22 = k+21; + int b22 = 2*a22; + c += a22; + int a23 = k+22; + int b23 = 2*a23; + c += a23; + int a24 = k+23; + int b24 = 2*a24; + c += a24; + int a25 = k+24; + int b25 = 2*a25; + c += a25; + int a26 = k+25; + int b26 = 2*a26; + c += a26; + return + b13+ + b12+ + b11+ + b10+ + b9+ + b8+ + b7+ + b6+ + b5+ + b4+ + b3+ + b2+ + b1+ + b14+ + b15+ + b16+ + b17+ + b18+ + b19+ + b20+ + b21+ + b22+ + b23+ + b23+ + b24+ + b25+ + b26+ + c; +} + +int main(int argc, char *argv[]) { + f(3); + return 0; +} diff --git a/test/nardino/scheduling/spille_forw.c b/test/nardino/scheduling/spille_forw.c new file mode 100644 index 00000000..db88588b --- /dev/null +++ b/test/nardino/scheduling/spille_forw.c @@ -0,0 +1,166 @@ +#include + +int f(int n, float * arr) { + float a1 = (float) n; + float b1 = 2.*a1; + float c = a1; + float a2 = (float) n+1; + float b2 = 2.*a2; + c += a2; + float a3 = (float) n+2; + float b3 = 2.*a3; + c += a3; + float a4 = (float) n+3; + float b4 = 2.*a4; + c += a4; + float a5 = (float) n+4; + float b5 = 2.*a5; + c += a5; + float a6 = (float) n+5; + float b6 = 2.*a6; + c += a6; + float a7 = (float) n+6; + float b7 = 2.*a7; + c += a7; + float a8 = (float) n+7; + float b8 = 2.*a8; + c += a8; + float a9 = (float) n+8; + float b9 = 2.*a9; + c += a9; + float a10 = (float) n+9; + float b10 = 2.*a10; + c += a10; + float a11 = (float) n+10; + float b11 = 2.*a11; + c += a11; + float a12 = (float) n+11; + float b12 = 2.*a12; + c += a12; + float a13 = (float) n+12; + float b13 = 2.*a13; + c += a13; + float a14 = (float) n+13; + float b14 = 2.*a14; + c += a14; + float a15 = (float) n+14; + float b15 = 2.*a15; + c += a15; + float a16 = (float) n+15; + float b16 = 2.*a16; + c += a16; + float a17 = (float) n+16; + float b17 = 2.*a17; + c += a17; + float a18 = (float) n+17; + float b18 = 2.*a18; + c += a18; + float a19 = (float) n+18; + float b19 = 2.*a19; + c += a19; + float a20 = (float) n+19; + float b20 = 2.*a20; + c += a20; + float a21 = (float) n+20; + float b21 = 2.*a21; + c += a21; + float a22 = (float) n+21; + float b22 = 2.*a22; + c += a22; + float a23 = (float) n+22; + float b23 = 2.*a23; + c += a23; + float a24 = (float) n+23; + float b24 = 2.*a24; + c += a24; + float a25 = (float) n+24; + float b25 = 2.*a25; + c += a25; + float a26 = (float) n+25; + float b26 = 2.*a26; + c += a26; + float a27 = (float) n+26; + float b27 = 2.*a27; + c += a27; + float a28 = (float) n+27; + float b28 = 2.*a28; + c += a28; + float a29 = (float) n+28; + float b29 = 2.*a29; + c += a29; + float a30 = (float) n+29; + float b30 = 2.*a30; + c += a30; + /* arr[0] = a1; */ + /* arr[1] = a2; */ + /* arr[2] = a3; */ + /* arr[3] = a4; */ + /* arr[4] = a5; */ + /* arr[5] = a6; */ + /* arr[6] = a7; */ + /* arr[7] = a8; */ + /* arr[8] = a9; */ + /* arr[9] = a10; */ + /* arr[10] = a11; */ + /* arr[11] = a12; */ + /* arr[12] = a13; */ + /* arr[13] = a14; */ + /* arr[14] = a15; */ + /* arr[15] = a16; */ + /* arr[16] = a17; */ + /* arr[17] = a18; */ + /* arr[18] = a19; */ + /* arr[19] = a20; */ + /* arr[20] = a21; */ + /* arr[21] = a22; */ + /* arr[22] = a23; */ + /* arr[23] = a24; */ + /* arr[24] = a25; */ + /* arr[25] = a26; */ + /* arr[26] = a27; */ + /* arr[27] = a28; */ + /* arr[28] = a29; */ + /* arr[29] = a30; */ + return c + + b1+ + b2+ + b3+ + b4+ + b5+ + b6+ + b7+ + b8+ + b9+ + b10+ + b11+ + b12+ + b13+ + b14+ + b15+ + b16+ + b17+ + b18+ + b19+ + b20+ + b21+ + b22+ + b23+ + b24+ + b25+ + b26+ + b27+ + b28+ + b29+ + b30; +} + + + + + + +int main(int argc, char *argv[]) { + float arr[30]; + f(5, arr); + return 0; +} diff --git a/x86/ExpansionOracle.ml b/x86/ExpansionOracle.ml deleted file mode 120000 index ee2674bf..00000000 --- a/x86/ExpansionOracle.ml +++ /dev/null @@ -1 +0,0 @@ -../aarch64/ExpansionOracle.ml \ No newline at end of file diff --git a/x86/ExpansionOracle.ml b/x86/ExpansionOracle.ml new file mode 100644 index 00000000..3b63b80d --- /dev/null +++ b/x86/ExpansionOracle.ml @@ -0,0 +1,17 @@ +(* *************************************************************) +(* *) +(* The Compcert verified compiler *) +(* *) +(* Léo Gourdin UGA, VERIMAG *) +(* *) +(* Copyright VERIMAG. All rights reserved. *) +(* This file is distributed under the terms of the INRIA *) +(* Non-Commercial License Agreement. *) +(* *) +(* *************************************************************) + +open RTLpathCommon + +let expanse (sb : superblock) code pm = (code, pm) + +let find_last_node_reg c = () diff --git a/x86/PrepassSchedulingOracle.ml b/x86/PrepassSchedulingOracle.ml index 7b6a1b14..42a3da23 100644 --- a/x86/PrepassSchedulingOracle.ml +++ b/x86/PrepassSchedulingOracle.ml @@ -2,4 +2,5 @@ open RTL open Registers (* Do not do anything *) -let schedule_sequence (seqa : (instruction*Regset.t) array) = None +let schedule_sequence (seqa : (instruction*Regset.t) array) + live_regs_entry typing reference = None -- cgit