Update on Overleaf.

author: John Wickerson <j.wickerson@imperial.ac.uk> 2022-01-18 09:53:56 +0000
committer: node <node@git-bridge-prod-0> 2022-01-18 11:36:32 +0000
commit: e80b44a4440d956ad954990cd77ff25556facc29 (patch)
tree: 3664b139093a06c26b9ef185d95dbdf884074cb5
parent: 80ca1a53c0a54e4965bf3d92fab4cb428d7a5c9c (diff)
download: fccm22_rsvhls-e80b44a4440d956ad954990cd77ff25556facc29.tar.gz
fccm22_rsvhls-e80b44a4440d956ad954990cd77ff25556facc29.zip
2 files changed, 118 insertions, 139 deletions
diff --git a/stash.tex b/stash.tex
index a452c1f..cb6528f 100644
--- a/stash.tex
+++ b/stash.tex
@@ -89,4 +89,83 @@ main () {
 \end{lstlisting}
 \caption{Example HTL code (part 2)}
 \label{fig:example_HTL2}
-\end{figure}
-\ No newline at end of file
+\end{figure}
+
+\appendix
+
+We provide the Verilog output that is generated from our running example (Figures~\ref{fig:example_C} to~\ref{fig:example_HTL}).
+
+\begin{lstlisting}
+module main (clk, rst, ret, fin)
+  input clk, rst;
+  output reg [31:0] ret;
+  output reg fin;
+  reg[31:0] state;
+  reg[31:0] reg_1, reg_2, reg_3, reg_4, reg_5, reg_6;
+  reg add_rst, add_fin; 
+  reg[31:0] add_ret, add_x2, add_x1, reg_7, add_state;
+  
+  always @(posedge clk) // control logic for "add"
+    if ({add_rst == 1}) begin 
+      add_state <= 2; add_fin <= 0; 
+    end else begin 
+      case (add_state)
+        |\HL5{2: add\_state <= 1;}|
+        |\HL5{1: add\_state <= 3;}|
+        3: ;
+        default: ;
+      endcase
+    end   
+    
+  always @(posedge clk) // datapath for "add"    
+    case (add_state)
+      |\HL5{2: reg\_7 <= {{add\_x2 + add\_x1} + 0};}|
+      |\HL5{1: add\_fin = 1;}|
+         |\HL5{add\_ret = reg\_7;}|
+      3: add_fin <= 0;
+    endcase
+
+  always @(posedge clk) // control logic for "main"
+    if ({rst == 1}) begin 
+      state <= 9; fin <= 0; 
+    end else begin
+      case (state)
+        |\HL1{9: state <= 8;}|
+        |\HL2{8: state <= 7;}|
+        |\HL2{7: state <= 12;}|
+       |\HL2{12: if ({add\_fin == 1}) state <= 6;}|
+        |\HL2{6: state <= 5;}|
+        |\HL3{5: state <= 4;}|
+        |\HL3{4: state <= 10;}|
+       |\HL3{10: if ({add\_fin == 1}) state <= 3;}|
+        |\HL3{3: state <= 2;}|
+        |\HL4{2: state <= 1;}|
+        |\HL4{1: state <= 11;}|
+       |\HL0{11: ;}|
+      endcase
+    end
+
+  always @(posedge clk) // datapath for "main"
+    case (state)
+      |\HL1{9: reg\_3 <= 0;}|
+      |\HL2{8: reg\_6 <= 1;}|
+      |\HL2{7: add\_rst <= 1;}|
+         |\HL2{add\_x2 <= reg\_3;}|
+         |\HL2{add\_x1 <= reg\_6;}|
+     |\HL2{12: add\_rst <= 0;}|
+         |\HL2{reg\_1 <= add\_ret;}|
+      |\HL2{6: reg\_3 <= reg\_1;}|
+      |\HL3{5: reg\_5 <= 2;}|
+      |\HL3{4: add\_rst <= 1;}|
+         |\HL3{add\_x2 <= reg\_3;}|
+         |\HL3{add\_x1 <= reg\_5;}|
+     |\HL3{10: add\_rst <= 0;}|
+         |\HL3{reg\_2 <= add\_ret;}|
+      |\HL3{3: reg\_3 <= reg\_2;}|
+      |\HL4{2: reg\_4 <= reg\_3;}|
+      |\HL4{1: fin = 1;}|
+         |\HL4{ret = reg\_4;}|
+     |\HL0{11: fin <= 0;}|
+    endcase
+endmodule
+\end{lstlisting}
+\ No newline at end of file
diff --git a/verified_resource_sharing.tex b/verified_resource_sharing.tex
index 14b2c29..5b9a0ef 100644
--- a/verified_resource_sharing.tex
+++ b/verified_resource_sharing.tex
@@ -90,46 +90,41 @@ escapeinside=||,
 \begin{abstract}
 High-level synthesis (HLS) is playing an ever-increasing role in hardware design, but concerns have been raised about its reliability. Seeking to address these, Herklotz et al. have developed an HLS compiler called \vericert{} that has been mechanically proven (using the Coq proof assistant) to output Verilog designs that are behaviourally equivalent to the input C program. Unfortunately, \vericert{} cannot compete performance-wise with established HLS tools, and a major reason for this is \vericert{}'s complete lack of support for \emph{resource sharing}. 
   
-This paper introduces \vericertfun: \vericert{} extended with function-level resource sharing. Where original \vericert{} creates one block of hardware per function \emph{call}, \vericertfun{} creates one block of hardware per function \emph{definition}. To enable this, we extend \vericert{}'s intermediate language HTL with the ability to represent \emph{multiple} state machines, and we implement function calls by transferring control between these state machines. We are working to extend \vericert{}'s formal correctness proof to cover the translation from C into this extended HTL and thence to Verilog. Benchmarking on the PolyBench/C suite indicates that \vericertfun{} generates hardware with about half the resource usage of \vericert{}'s on average, with minimal impact on execution time.
+This paper introduces \vericertfun: \vericert{} extended with function-level resource sharing. Where original \vericert{} creates one block of hardware per function \emph{call}, \vericertfun{} creates one block of hardware per function \emph{definition}. To enable this, we extend \vericert{}'s intermediate language HTL with the ability to represent \emph{multiple} state machines, and we implement function calls by transferring control between these state machines. We are working to extend \vericert{}'s formal correctness proof to cover the translation from C into this extended HTL and thence to Verilog. Benchmarking on the PolyBench/C suite indicates that \vericertfun{} generates hardware with about 0.8$\times$ the resource usage of \vericert{}'s on average, with minimal impact on execution time.
 \end{abstract}
 
 \section{Introduction}
 \label{sec:introduction}
 
 The drive for faster, more energy-efficient computation has led to a surge in demand for custom hardware accelerators. This, in turn, has led to a great deal of interest in \emph{high-level synthesis} (HLS) as a means to program these devices.
+However, doubts have been raised about the reliability of the current crop of HLS tools. For instance, \citet{Herklotz2021_empiricalstudy} found numerous miscompilation bugs in commercial HLS tools including Xilinx Vivado HLS~\cite{xilinx_vitis}, the Intel HLS Compiler~\cite{intel_hls}, and LegUp~\cite{legup_CanisAndrew2011}. This unreliability can be a significant hindrance for developers, and it undermines the usefulness of HLS in safety- or security-critical settings. %It is therefore essential to ensure that HLS tools are as reliable as possible.
 
-However, doubts have been raised about the reliability of the current crop of HLS tools. For instance, \citet{Herklotz2021_empiricalstudy} found numerous miscompilation bugs in commercial HLS tools including Xilinx Vivado HLS~\cite{xilinx_vitis}, the Intel HLS Compiler~\cite{intel_hls}, and LegUp~\cite{legup_CanisAndrew2011}. This unreliability can be a significant hindrance in the development process, especially given the long iteration times of hardware design compared to software, and it undermines the usefulness of HLS in safety- or security-critical settings. %It is therefore essential to ensure that HLS tools are as reliable as possible.
-
-Aiming to address this issue is \vericert{}~\cite{vericert}, a new HLS tool whose correctness has been checked to the highest possible standard: machine-checked proof. It provides a proof, checked using the Coq proof assistant~\cite{coq}, that every step of its translation from C to Verilog preserves the semantics of its input program. This means that we can always trust any Verilog design produced by \vericert{} to behave the same way as the C program given as input. %It is based on the CompCert~\cite{compcert_Leroy2009} verified C compiler.
-
+Aiming to address this issue is \vericert{}~\cite{vericert}, a new HLS tool whose correctness has been verified to the highest possible standard: a computer-checked proof that any Verilog design it produces will behave the same way as the C program given as input. %It is based on the CompCert~\cite{compcert_Leroy2009} verified C compiler.
 Clearly, however, it is not enough for an HLS tool simply to be \emph{correct}; the generated hardware must also enjoy high throughput, low latency, and good \emph{area efficiency} -- the last of which is the topic of this paper. A common optimisation employed by HLS tools to improve area efficiency is \emph{resource sharing}; that is, re-using hardware for more than one purpose. Accordingly, our work adds function-level resource sharing to \vericert{}, yielding a new prototype HLS tool called `\vericertfun'. In keeping with the aims of the \vericert{} project, work is ongoing to extend the correctness proof.
 
 The entire \vericertfun{} development is fully open-source~\cite{vericertfun-github}, and more details about the implementation and proofs are available in a technical report~\cite{pardalos_thesis}.
 
 \section{Background}
 
-\paragraph{The Coq proof assistant} \vericert{} is implemented using the Coq proof assistant~\cite{coq}. This means that it consists of a collection of functions that define the compilation process, together with the proof of a theorem stating that those definitions constitute a correct HLS tool. Coq mechanically checks this proof using a formal mathematical calculus, and then automatically translates the function definitions into OCaml code that can be compiled and executed. 
-
-Engineering a software system within a proof assistant like Coq is widely held to be the gold standard for correctness. Recent years have shown that it is feasible to design substantial pieces of software in this way, such as database management systems~\cite{malecha+10}, web servers~\cite{chlipala15}, and operating system kernels~\cite{gu+16}. Coq has also been successfully deployed in the hardware design process, both in academia~\cite{braibant+13, bourgeat+20} and in industry~\cite{silveroak}. It has even been applied specifically to HLS: Faissole et al.~\cite{faissole+19} used it to verify that HLS optimisations respect dependencies in the source code.
-
-\paragraph{The CompCert verified C compiler} Among the most celebrated applications of Coq is the CompCert project~\cite{compcert}. CompCert is a lightly optimising C compiler, with backend support for the Arm, x86, PowerPC, and Kalray VLIW architectures~\cite{six+20}, that has been implemented and proven correct in Coq. CompCert accepts most of the C99 language, and generally produces code of comparable performance to that produced by GCC at \texttt{-O1}. It transforms its input through a series of ten intermediate languages before generating the final output. This design ensures that each individual pass remains relatively simple and hence feasible to prove correct. The correctness proof of the entire compiler is formed by composing the correctness proofs of each of its passes. 
+\paragraph{The Coq proof assistant} \vericert{} is implemented using Coq~\cite{coq}, which means it consists of a collection of functions that define the compilation process, together with the proof of a theorem stating that those definitions constitute a correct HLS tool. Coq mechanically checks this proof using a formal mathematical calculus, and then automatically translates the function definitions into OCaml code that can be compiled and executed. 
+Engineering a software system within a proof assistant like Coq is widely held to be the gold standard for correctness. Recent years have shown that it is feasible to design substantial pieces of software in this way, such as database systems~\cite{malecha+10}, web servers~\cite{chlipala15}, and operating system kernels~\cite{gu+16}. Coq has also been successfully deployed in the hardware design process, both in academia~\cite{braibant+13, bourgeat+20} and in industry~\cite{silveroak}. It has even been applied specifically to HLS: Faissole et al.~\cite{faissole+19} used it to verify that HLS optimisations respect dependencies in the source code.
 
-That the Csmith compiler testing tool has found hundreds of bugs in GCC and LLVM but none in (the verified parts of) CompCert~\cite{csmith} is a testament to the reliability of this development approach. That said, we cannot guarantee that hardware produced via \vericertfun{} will never go wrong, because of fallibilities in components not covered by the theorem, including the computer checking the proofs, the synthesis toolchain~\cite{verismith}, and the FPGA device itself.
+\paragraph{The CompCert verified C compiler} Among the most celebrated applications of Coq is the CompCert project~\cite{compcert}. CompCert is a lightly optimising C compiler, with backend support for the Arm, x86, PowerPC, and Kalray VLIW architectures~\cite{six+20}, that has been implemented and proven correct in Coq. CompCert accepts most of the C99 language, and generally produces code of comparable performance to that of GCC at \texttt{-O1}. It transforms its input through a series of ten intermediate languages before generating the final output. This design ensures that each individual pass remains relatively simple and hence feasible to prove correct. The correctness proof of the entire compiler is formed by composing the correctness proofs of each of its passes. 
+That the Csmith compiler testing tool~\cite{csmith} has found hundreds of bugs in GCC and LLVM but none in (the verified parts of) CompCert is a testament to the reliability of this development approach. That said, we cannot guarantee that \vericertfun{} designs will never go wrong, because of fallibilities in components not covered by the theorem, including the computer checking the proofs, the synthesis toolchain~\cite{verismith}, and the FPGA itself.
  
 \paragraph{The \vericert{} verified HLS tool}
 
-Introduced by \citet{vericert}, \vericert{} is a verified C-to-Verilog HLS tool. It was built by extending CompCert with a new hardware-oriented intermediate language (called HTL) and a Verilog backend. \vericert{} currently performs no significant optimisations beyond those inherited from CompCert's frontend. This results in performance generally about one order of magnitude slower than that achieved by comparable, unverified HLS tools.
+Introduced by \citet{vericert}, \vericert{} is a verified C-to-Verilog HLS tool, built by extending CompCert with a new hardware-oriented intermediate language (called HTL) and a Verilog backend. 
+\vericert{} branches off from CompCert at the intermediate language called \emph{register-transfer language} (which we shall call `3AC', for `three-address code', to avoid confusion with `register-transfer level').  
+In 3AC, each function is represented as a numbered list of instructions with gotos -- i.e., a control-flow graph (CFG). \vericert{}'s compilation strategy is to treat this CFG as a state machine, with each instruction in the CFG being a state, and each edge in the CFG being a transition. The stack is implemented in a block of RAM, and program variables that do not have their address taken are mapped to hardware registers. More precisely, \vericert{} builds a \emph{finite state machine with datapath} (FSMD). This comprises two maps, both taking the current state as their input: the \emph{control logic} map for determining the next state, and a \emph{datapath} map for updating the RAM and registers. These state machines are captured in \vericert{}'s new intermediate language, HTL. When \vericert{} compiles from HTL to the final Verilog output, these maps are converted from mathematical functions into syntactic Verilog case-statements, and placed inside always-blocks.
 
-\vericert{} branches off from CompCert at the intermediate language called \emph{register-transfer language} (RTL). \JW{Got up to here; will finish word-chopping pass tomorrow morning.} 
-Since that abbreviation is usually used in the hardware community for `register-transfer level', we shall henceforth avoid possible confusion by referring to the CompCert intermediate language as `3AC' (for `three-address code'). 
-In the 3AC language, each function in the program is represented as a numbered list of instructions with gotos -- essentially, a control-flow graph (CFG). The essence of \vericert{}'s compilation strategy is to treat this CFG as a state machine, with each instruction in the CFG becoming a state, and each edge in the CFG becoming a transition. Moreover, program variables that do not have their address taken are mapped to hardware registers; other variables (including arrays and structs) are allocated in a block of RAM that represents the stack. More precisely, \vericert{} builds a \emph{finite state machine with datapath} (FSMD). This comprises two maps, both of which take the current state as their input: the \emph{control logic} map for determining the next state, and a \emph{datapath} map for updating the RAM and registers. These state machines are captured in \vericert{}'s new intermediate language, HTL. When \vericert{} compiles from HTL to the final Verilog output, these maps are converted from proper `mathematical' functions into syntactic Verilog case-statements, and each is placed inside an always-block.
-
-The overall \vericert{} flow is shown at the top of Figure~\ref{fig:flow}. The key point to note here is the `inlining' step, which folds all function definitions into their call sites. This allows \vericert{} to make the simplifying assumption that there is only a single CFG, but has the unwanted effect of duplicating hardware. In this work, we remove some of this inlining and hence some of the duplication.
+\vericert{} currently performs no significant optimisations beyond those inherited from CompCert's frontend. This results in performance generally about an order of magnitude slower than that achieved by comparable, unverified HLS tools.
+The overall \vericert{} flow is shown in Figure~\ref{fig:flow} (top). Note the `inlining' step, which folds all function definitions into their call sites. This allows \vericert{} to make the simplifying assumption that there is only a single CFG, but has the unwanted effect of duplicating hardware. \vericertfun{} removes some of this inlining and hence some of the duplication.
 
 \paragraph{Resource sharing in HLS}
 
-Resource sharing is a feature expected of most HLS compilers. In a typical architecture generated by HLS~\cite{coussy+09}, a number of `functional components' are selected from a library according to the needs of the specific design, and in the scheduling process, a clock cycle is chosen for each operation such that the components it requires are available. Given the need to mechanically verify the correctness of our implementation, \vericertfun{} follows a simpler approach: we share resources at the granularity of entire functions, rather than individual operations. Function-level resource sharing is implemented in commercial HLS compilers such as the Intel HLS compiler~\cite{intel_hls} or
-Xilinx Vitis~\cite{xilinx_vitis}, and is guided by the programmer through appropriate pragmas.
+Resource sharing is a feature expected of most HLS compilers. In a typical HLS-generated architecture~\cite{coussy+09}, several `functional components' are selected from a library according to the needs of the specific design, and in the scheduling process, each operation is allotted a clock cycle in which the required components are all available. Given the need to mechanically verify the correctness of our implementation, \vericertfun{} follows a simpler approach: we share resources at the granularity of entire functions, rather than individual operations. Function-level resource sharing is implemented in commercial HLS compilers such as the Intel HLS compiler~\cite{intel_hls} or
+Xilinx Vitis~\cite{xilinx_vitis}, and is guided by the programmer through pragmas.
 
 Perna et al.~\cite{perna+12} developed a verified HLS tool for the Handel-C language, but, like \vericert{}, they did not implement function-level resource sharing, instead arranging that ``all procedure and function calls are expanded in the
 front-end''.
@@ -197,7 +192,7 @@ front-end''.
 \draw[flowlines, -latex] (background1.south) to node [auto] {\bf this work} (background2.north -| background1.south);
 
 \end{tikzpicture}
-\caption{Key compilation passes and intermediate languages in \vericert{}~\cite{vericert} (top) and \vericertfun{} (bottom)}
+\caption{Key compilation passes and intermediate languages in \vericert{}~\cite{vericert} (top) and in \vericertfun{} (bottom)}
 \label{fig:flow}
 \end{figure}
 
@@ -212,7 +207,7 @@ tit/.style={anchor=north west, font=\tt\footnotesize},
 edlab/.style={auto, inner sep=2pt, align=left, font=\tt\footnotesize}
 }
 
-In this section, we shall explain the implementation of \vericertfun, using Figure~\ref{fig:example_C} as a worked example. The basic idea is shown at the bottom of Figure~\ref{fig:flow}: we avoid inlining the function calls at the 3AC level (except in certain circumstances described below), instead maintaining one state machine per function. All the state machines run simultaneously, and function calls are implemented by sending messages between the state machines. We combine all of these state machines into a single Verilog module, after renaming variables as necessary to avoid clashes.
+We now explain the implementation of \vericertfun{} using Figure~\ref{fig:example_C} as a worked example. The overall flow is shown in Figure~\ref{fig:flow} (bottom): we avoid inlining the function calls at the 3AC level (except in certain circumstances described below), instead maintaining one state machine per function. All the state machines run simultaneously, and function calls are implemented by sending messages between them. We then combine all of these state machines into a single Verilog module after renaming variables to avoid clashes.
 
 \begin{figure}
 \centering
@@ -291,11 +286,9 @@ int main()
 
 \end{figure}
 
-Figure~\ref{fig:example_3AC} shows the 3AC representation of that C program, as obtained by the CompCert frontend. It takes the form of two CFGs, one per function. The control flow in this example is straightforward, but in general, 3AC programs can contain unrestricted gotos. The nodes of the CFGs are numbered in reverse order, as are the parameters of the \lstinline{add} function; both are conventions inherited from CompCert.
-
-Figure~\ref{fig:example_HTL} depicts the result of converting those CFGs into two state machines. The conversion of 3AC instructions into Verilog instructions has been described already by \citet{vericert}; what is new here is the handling of function calls, so the following description concentrates on that aspect. Note that ``$*{\rightarrow}\langle\mathit{node}\rangle$'' is an abbreviation for edges from all nodes to $\langle\mathit{node}\rangle$.
-
-The solid edges within the two state machines indicate the transfer of control between states, while the dashed edges between the state machines are more `fictional'. The ground truth is that both state machines run continuously, but it is convenient to think of only one of the machines as doing useful work at any point in time. So, the dashed edges indicate the points at which the `active' machine changes.
+Figure~\ref{fig:example_3AC} shows the 3AC representation of that C program, as obtained by the CompCert frontend. We see two CFGs, one per function. The control flow in this example is straightforward, but in general, 3AC programs can contain unrestricted gotos. The nodes of the CFGs are numbered in reverse, as are the parameters of the \lstinline{add} function, following CompCert convention.
+Figure~\ref{fig:example_HTL} depicts the result of converting those CFGs into two state machines. The conversion of 3AC instructions into Verilog instructions has been described already by \citet{vericert}; what is new here is the handling of function calls, so the following concentrates on that aspect. Note that ``$*{\rightarrow}\langle\mathit{node}\rangle$'' stands for edges from all nodes to $\langle\mathit{node}\rangle$.
+The solid edges within the two state machines indicate the transfer of control between states, while the dashed edges between the state machines are more `fictional'. The ground truth is that both state machines run continuously, but it is convenient to think that only one machine does useful work at a time. So, the dashed edges indicate when the `active' machine changes.
 
 \definecolor{background}{HTML}{eeeeee}
 \begin{figure}
@@ -407,34 +400,30 @@ add} (-2.6,-12.5);
 
 \end{figure}
 
-In more detail: Execution begins in state 9 of the \lstinline{main} machine, and proceeds through successive states until it reaches state 7, in which the \lstinline{add} machine's \lstinline{rst} signal is raised. This causes the \lstinline{add} machine to advance to state 2. When the \lstinline{main} machine advances to state 12, that \lstinline{rst} signal is lowered; the \lstinline{add} machine then begins its computation while the \lstinline{main} machine spins in state 12. When the \lstinline{add} machine has finished its computation (state 1), it asserts its \lstinline{fin} signal. This allows the \lstinline{main} machine to leave state 12. Finally, the \lstinline{add} machine lowers its \lstinline{fin} and waits in state 3 until the next call.
-
-The same sequence of events can also be understood using a timing diagram, as given in Figure~\ref{fig:timingdiagram}. In that diagram, the red lines indicate unspecified values. %We see that each call of \lstinline{add} begins with a pulse on \lstinline{add.rst} (fifth waveform) and ends with a pulse on \lstinline{add.fin} (sixth waveform).
-We see that calls are initiated by triggering the \lstinline{rst} signal of the called module. Similarly, a function returns by setting its own \lstinline{fin} register. %There are no `call' or `return' instructions in HTL.
+In more detail: Execution begins in state 9 of the \lstinline{main} machine, and proceeds through successive states until it reaches state 7, in which the \lstinline{add} machine's \lstinline{rst} signal is set. This causes the \lstinline{add} machine to advance to state 2. When \lstinline{main} advances to state 12, that \lstinline{rst} signal is unset; \lstinline{add} then begins its computation while \lstinline{main} spins in state 12. When \lstinline{add} has finished (state 1), it sets its \lstinline{fin} signal, which allows \lstinline{main} to leave state 12. Finally, \lstinline{add} unsets its \lstinline{fin} and waits in state 3 for the next call.
+The same event sequence can also be understood using the timing diagram in Figure~\ref{fig:timingdiagram}, in which red lines indicate unspecified values. %We see that each call of \lstinline{add} begins with a pulse on \lstinline{add.rst} (fifth waveform) and ends with a pulse on \lstinline{add.fin} (sixth waveform).
+We see that calls are initiated by triggering the \lstinline{rst} signal of the called module and that a function returns by setting its own \lstinline{fin} register. %There are no `call' or `return' instructions in HTL.
 
-One technical challenge we encountered in the implementation of \vericertfun{} has to do with the fact that the called and callee state machines need to modify each other's variables. This is problematic because each function is translated into a state machine independently, and hence the register names used in the other state machines are not necessarily available. We work around this problem by introducing an additional component to our state machines: a mapping, called \lstinline{extern_ctrl}, that maps local register names to pairs of module identifiers and roles in those modules. For instance, the first entry in \lstinline{extern_ctrl} in Figure~\ref{fig:example_HTL} tells us that the \lstinline{add_0_rst} register used by the \lstinline{main} state machine should resolve to whichever register plays the role of `reset' in the \lstinline{add} state machine. Once all the state machines have been generated, we erase \lstinline{extern_ctrl}. We do this in two steps. First, we perform a renaming pass through
-the whole program, making all register names globally unique. This is necessary to avoid
-unintended conflicts between register names in different modules, as register names can only be assumed to be unique within their own module. We then do a second pass, renaming
-registers in \lstinline{extern_ctrl} to the name of the actual register they target. %\JW{I'm still struggling to understand externctrl properly. I don't see why we can't resolve externctrl straight away, when each HTL state machine is first generated. Why do we have to wait until they are linked together?}\YH{I think that there are multiple reasons: The first is that it's very difficult to access the function information from other functions, because this requires resolving their ID in the global AST state and then inspecting it.  If one ends up using this information, it can be very tricky to prove the induction, because one isn't local to the current function anymore.  All the proof structure in CompCert assumes one only deals within the current function.  This also complicates the definition of the semantics.  Secondly, I think it just helps as a bridge, because functions have their completely separate register files in 3AC.  It's therefore quite difficult to map registers from one register file to the other unless one has something that describes which ones to map over.}
+One technical challenge we encountered in the implementation of \vericertfun{} has to do with the fact that the called and callee state machines need to modify each other's variables. This is problematic because each function is translated independently, and hence the register names used in the other state machines may not be available. We work around this by introducing an additional component to our state machines: an `\lstinline{extern_ctrl}' mapping from local register names to pairs of module identifiers and roles in those modules. For instance, the first entry in \lstinline{extern_ctrl} in Figure~\ref{fig:example_HTL} tells us that the \lstinline{add_0_rst} register used by \lstinline{main} should resolve to whichever register plays the role of `reset' in \lstinline{add}. Once all the state machines have been generated, we erase \lstinline{extern_ctrl}. We do this in two steps. First, we rename all registers to be globally unique, which avoids unintended conflicts between registers in different modules (register names can only be assumed unique within their own module). We then rename all
+registers mentioned in \lstinline{extern_ctrl} to the name of the actual register they target. %\JW{I'm still struggling to understand externctrl properly. I don't see why we can't resolve externctrl straight away, when each HTL state machine is first generated. Why do we have to wait until they are linked together?}\YH{I think that there are multiple reasons: The first is that it's very difficult to access the function information from other functions, because this requires resolving their ID in the global AST state and then inspecting it.  If one ends up using this information, it can be very tricky to prove the induction, because one isn't local to the current function anymore.  All the proof structure in CompCert assumes one only deals within the current function.  This also complicates the definition of the semantics.  Secondly, I think it just helps as a bridge, because functions have their completely separate register files in 3AC.  It's therefore quite difficult to map registers from one register file to the other unless one has something that describes which ones to map over.}
 
-A second technical challenge we encountered in the implementation of \vericertfun{} has to do with an assumption made in \vericert{}'s correctness proof: that all pointers used by the input program are stored as offsets from the main function's stack frame. This assumption was reasonable for \vericert{} because after full inlining, the only function is the main function. This assumption is baked into several of \vericert{}'s most complicated lemmas, including the correctness proof for \lstinline{load} and \lstinline{store} instructions, and so we have not sought to lift it in the current incarnation of \vericertfun{}. Instead, we have made the compromise of only \emph{partially} eliminating the inlining pass. That is: \vericertfun{} inlines all functions that contain \lstinline{load}, \lstinline{store} or \lstinline{call} instructions. Thus, the benefits of resource sharing are only enjoyed by functions that do not include these instructions.
+A second technical challenge we encountered in the implementation of \vericertfun{} has to do with an assumption made in \vericert{}'s correctness proof: that all pointers are stored as offsets from the main function's stack frame. This assumption was reasonable for \vericert{} because after full inlining, the main function is the only function. This assumption is baked into several of \vericert{}'s most complicated lemmas, including the correctness proof for load and store instructions, and so we have not sought to lift it in our current prototype of \vericertfun{}. Instead, we have made the compromise of only \emph{partially} eliminating the inlining pass. That is: \vericertfun{} inlines all functions that contain \lstinline{load}, \lstinline{store} or \lstinline{call} instructions. Thus, the benefits of resource sharing are currently only enjoyed by functions that do not contain load or store instructions and do not call other functions. 
 
 \section{Proving \vericertfun{} correct}
 
-The CompCert correctness theorem~\cite{compcert} expresses that every behaviour that can be exhibited by the compiled program is also a behaviour of the source program. \vericert{}~\cite{vericert} adapted this theorem for HLS by replacing `compiled program' with `generated Verilog design'. In both cases, a formal semantics is required for the source and target languages. \vericertfun{} targets the same fragment of the Verilog language as \citeauthor{vericert} already mechanised in Coq, so no changes are required there. 
+The CompCert correctness theorem~\cite{compcert} states that every behaviour of the compiled program is also a behaviour of the source program. \citet{vericert} adapted this theorem for HLS by replacing `compiled program' with `generated Verilog design'. In both cases, a formal semantics is required for the source and target languages. \vericertfun{} targets the same fragment of the Verilog language as \citeauthor{vericert} already mechanised in Coq, so no changes are required there. 
 
 Where changes \emph{are} required is in the semantics of the intermediate language HTL, which sits between CompCert's 3AC and the final Verilog.
-When \citeauthor{vericert} designed HTL, they did not include a semantics for function calls because they assumed all function calls would already have been inlined. We have extended HTL so that its semantics is additionally parameterised by an environment that maps function names to state machines. We have added a semantics for function calls that looks up the named function in this environment, activates the corresponding state machine, and creates a new frame on the stack. We have also added a semantics for return statements that pops the current frame off the stack and reactivates the caller's state machine. % \MP{One aspect of this I didn't mention: This introduces non-determinism. When you set the reset of a module, the next state can be either a call or a normal executing state, ignoring the reset. This non-determinism does not affect the RTL-to-HTL proof, but could affect the HTL-to-Verilog one.}
+When \citeauthor{vericert} designed HTL, they did not include a semantics for function calls because they assumed all function calls would already have been inlined. We have extended HTL so that its semantics is additionally parameterised by an environment that maps function names to state machines. Our semantics for function calls looks up the named function in this environment, activates the corresponding state machine, and pushes a new stack frame, and our semantics for return statements pops the current stack frame and reactivates the caller's state machine. % \MP{One aspect of this I didn't mention: This introduces non-determinism. When you set the reset of a module, the next state can be either a call or a normal executing state, ignoring the reset. This non-determinism does not affect the RTL-to-HTL proof, but could affect the HTL-to-Verilog one.}
 
-At the point of writing, the correctness of \vericertfun{} from C to HTL has been mostly proven: basic instructions and function calls are proven correct, but proofs of load and store instructions are not finished as they lack some key invariants. The pass that renames
-variables in HTL is yet to be proven, as is the pass that generates the final Verilog. 
-
-To give a rough idea of the scale and complexity of the task: the implementation of \vericertfun{} involved adding or changing about 700 lines of Coq code in \vericert{} and took the first author 4 months. The correctness proof, so far, has required about 2300 lines of additions or changes to the Coq code and 8 person-months of work.
+At the point of writing, the correctness of \vericertfun{} from C to HTL has been mostly proven: basic instructions and function calls are proven correct, but proofs of load and store instructions still lack some key invariants. The pass that renames
+variables in HTL is yet to be proven, as is the Verilog-generation pass. 
+To give a rough idea of the scale and complexity of our work: the implementation of \vericertfun{} involved adding or changing about 700 lines of Coq code in \vericert{} and took the first author 4 months. The correctness proof has so far required about 2300 lines of additions or changes to the Coq code and 8 person-months of work.
 
 \section{Performance evaluation}
 
-We now compare the quality of the hardware generated by \vericertfun{} against that generated by \vericert{}. Following \citet{vericert}, we use the PolyBench/C benchmark suite~\cite{polybench} with division and modulo replaced with iterative software implementations because \vericert{} does not handle them inefficiently. That is, \lstinline{a/b} and \lstinline{c%d}
-are textually replaced with \lstinline{div(a,b)} and \lstinline{mod(c,d)}. These \lstinline{div} and \lstinline{mod} functions are the only \YH{Non-inlinable? There are always 4 functions defined per benchmark, but these are inlined by Vericert.} function calls in the benchmarks.
+We now compare the quality of the hardware generated by \vericertfun{} against that generated by \vericert{}. Following \citet{vericert}, we use the PolyBench/C benchmark suite~\cite{polybench} with division and modulo replaced with iterative software implementations because \vericert{} does not handle them efficiently. That is, \lstinline{a/b} and \lstinline{c%d}
+are textually replaced with \lstinline{div(a,b)} and \lstinline{mod(c,d)}. These \lstinline{div} and \lstinline{mod} functions are the only function calls that are not inlined.
 
 We used the Icarus Verilog simulator to determine the cycle counts of the generated designs. We used Xilinx Vivado 2017.1, targeting a Xilinx 7-series FPGA (XC7K70T) to determine area usage and fmax.
 
@@ -445,9 +434,8 @@ We used the Icarus Verilog simulator to determine the cycle counts of the genera
 %introduces an extra state per function call. The impact on fmax is similarly minimal, ranging
 %between a 1.5\% increase and a 3.1\% decrease (0.2\% decrease on average).
 
-Figure~\ref{fig:results} compares the hardware generated by \vericertfun{} with that generated by \vericert{}, using the open-source (but unverified) \bambu{} HLS tool~\cite{pilato13_bambu} as a baseline. The top graph compares the area usage. We observe a consistent and substantial reduction in area usage across the benchmark programs, with \vericert{} consistently using more area than Bambu (1.5$\times$ on average) and \vericertfun{} being much closer to Bambu (1.2$\times$ on average). As expected, the benchmarks with the most function calls, such as mvt, 2mm, 3mm and ludcmp enjoy the biggest area savings.  Benchmarks where only one function call is made, such as heat-3d and nussinov naturally see a slight increase in area usage because of the extra circuitry required.
-
-The bottom graph compares the execution time. We observe that \vericertfun{} generates hardware that is almost exactly as fast as \vericert{}'s, with \vericertfun{} hardware being only 4\% slower on average. This can be attributed to the latency overhead of performing a function call. Hardware from \vericert{} and \vericertfun{} is significantly slower than \bambu's, which can be attributed to \vericert{} employing far fewer optimisations than the unverified \bambu{} tool.
+Figure~\ref{fig:results} compares the hardware generated by \vericertfun{} with that of \vericert{}, using the open-source (but unverified) \bambu{} tool~\cite{pilato13_bambu} as a baseline. The top graph compares the area usage. We observe a consistent and substantial reduction in area usage across the benchmark programs, with \vericert{} consistently using more area than Bambu (1.5$\times$ on average) and \vericertfun{} requiring less area than \vericert{} (0.8$\times$ on average), but still more than \bambu{} (1.2$\times$ on average).  As expected, the benchmarks with several function calls (mvt, 2mm, 3mm, ludcmp) enjoy the biggest area savings, while those with only one function call (heat-3d, nussinov) require slightly more area because of the extra circuitry required.
+The bottom graph compares the execution time. We observe that \vericertfun{} generates hardware that is slightly (about 4\%) slower than \vericert{}'s, which can be attributed to the latency overhead of performing a function call. Hardware from \vericert{} and \vericertfun{} is significantly slower than \bambu's, which can be attributed to \vericert{} employing far fewer optimisations than the unverified \bambu{} tool.
 
 \pgfplotstableread[col sep=comma]{data/time-ratio.csv}{\divtimingtable}
 \pgfplotstableread[col sep=comma]{data/slice-ratio.csv}{\divslicetable}
@@ -456,7 +444,6 @@ The bottom graph compares the execution time. We observe that \vericertfun{} gen
 \definecolor{legupnooptnochaincol}{HTML}{8DA0CB}
 \newcommand\backgroundbar[2][5]{\draw[draw=none, fill=black!#1] (axis cs:#2*2+0.5,0.1) rectangle
   (axis cs:1+#2*2+0.5,300);}
-\newcommand\legup{Bambu}
 
 \begin{figure}
   \begin{tikzpicture}[scale=0.8]
@@ -485,7 +472,7 @@ The bottom graph compares the execution time. We observe that \vericertfun{} gen
       xtick style={draw=none},
       ]
       
-      \nextgroupplot[ymin=0.8,ymax=5,ylabel={Area relative to \legup{}}, ytick={0.25,0.5,1,2,4}]
+      \nextgroupplot[ymin=0.8,ymax=5,ylabel={Area vs. \bambu{}}, ytick={1,2,4}]
       \pgfplotsinvokeforeach{0,...,12}{%
         \backgroundbar{#1}}
       \backgroundbar[10]{13}
@@ -496,7 +483,7 @@ The bottom graph compares the execution time. We observe that \vericertfun{} gen
       \draw (axis cs:-0.5,0.8) rectangle (axis cs:27.5,5);
       \legend{\vericert{},\vericertfun{}};
 
-      \nextgroupplot[ymin=1,ymax=10,ylabel={Execution time relative to \legup{}}, ytick={1,2,4}]
+      \nextgroupplot[ymin=1,ymax=10,ylabel={Execution time vs. \bambu{}}, ytick={1,2,4,8}]
       \pgfplotsinvokeforeach{0,...,12}{%
         \backgroundbar{#1}}
       \backgroundbar[10]{13}
@@ -507,103 +494,16 @@ The bottom graph compares the execution time. We observe that \vericertfun{} gen
       \draw (axis cs:-0.5,1) rectangle (axis cs:27.5,10);
     \end{groupplot}
   \end{tikzpicture}
-  \caption{Performance of \vericert{} compared to \vericertfun{}, using \legup{} as a baseline.}
+  \caption{\vericertfun{} vs \vericertfun{}, with \bambu{} as a baseline.}
 \label{fig:results}
 \end{figure}
 
 \section{Future work}
 
-Our immediate priority is to complete \vericertfun's correctness proof. In the medium term, we would
-like to improve our implementation of resource sharing by dropping the requirement to inline
-functions that access pointers; we anticipate that this will lead to further reductions in area
-usage. We would also like to make \vericertfun generate designs with one Verilog module per C
-function, as this is more idiomatic than packing all the state machines into a single module; we did
-not do this yet because it would require extending the formal Verilog semantics to handle multiple
-modules. It would also be interesting to design heuristics for selectively inlining functions~\cite{huang+15}. In the longer
-term, we are considering how to implement resource sharing even more effectively in a verified HLS
-tool, perhaps by implementing it as part of a resource-constrained scheduling algorithm~\cite{sdc}.
+Our immediate priority is to complete \vericertfun's correctness proof. In the medium term, we intend to improve our implementation of resource sharing by dropping the requirement to inline functions that access pointers; we anticipate that this will lead to further area savings. We would also like \vericertfun{} to generate designs with one Verilog module per C function, as this is more idiomatic than cramming all the state machines into a single module; we did not do this yet because it requires extending the Verilog semantics to handle multiple modules. It would also be interesting to design heuristics for inlining functions selectively~\cite{huang+15}. In the longer term, we are considering how to implement resource sharing even more effectively in a verified HLS tool, perhaps as part of a resource-constrained scheduling algorithm~\cite{sdc}.
 
 
 \bibliographystyle{ACM-Reference-Format}
 \bibliography{references}
 
-\appendix
-
-We provide the Verilog output that is generated from our running example (Figures~\ref{fig:example_C} to~\ref{fig:example_HTL}).
-
-\begin{lstlisting}
-module main (clk, rst, ret, fin)
-  input clk, rst;
-  output reg [31:0] ret;
-  output reg fin;
-  reg[31:0] state;
-  reg[31:0] reg_1, reg_2, reg_3, reg_4, reg_5, reg_6;
-  reg add_rst, add_fin; 
-  reg[31:0] add_ret, add_x2, add_x1, reg_7, add_state;
-  
-  always @(posedge clk) // control logic for "add"
-    if ({add_rst == 1}) begin 
-      add_state <= 2; add_fin <= 0; 
-    end else begin 
-      case (add_state)
-        |\HL5{2: add\_state <= 1;}|
-        |\HL5{1: add\_state <= 3;}|
-        3: ;
-        default: ;
-      endcase
-    end   
-    
-  always @(posedge clk) // datapath for "add"    
-    case (add_state)
-      |\HL5{2: reg\_7 <= {{add\_x2 + add\_x1} + 0};}|
-      |\HL5{1: add\_fin = 1;}|
-         |\HL5{add\_ret = reg\_7;}|
-      3: add_fin <= 0;
-    endcase
-
-  always @(posedge clk) // control logic for "main"
-    if ({rst == 1}) begin 
-      state <= 9; fin <= 0; 
-    end else begin
-      case (state)
-        |\HL1{9: state <= 8;}|
-        |\HL2{8: state <= 7;}|
-        |\HL2{7: state <= 12;}|
-       |\HL2{12: if ({add\_fin == 1}) state <= 6;}|
-        |\HL2{6: state <= 5;}|
-        |\HL3{5: state <= 4;}|
-        |\HL3{4: state <= 10;}|
-       |\HL3{10: if ({add\_fin == 1}) state <= 3;}|
-        |\HL3{3: state <= 2;}|
-        |\HL4{2: state <= 1;}|
-        |\HL4{1: state <= 11;}|
-       |\HL0{11: ;}|
-      endcase
-    end
-
-  always @(posedge clk) // datapath for "main"
-    case (state)
-      |\HL1{9: reg\_3 <= 0;}|
-      |\HL2{8: reg\_6 <= 1;}|
-      |\HL2{7: add\_rst <= 1;}|
-         |\HL2{add\_x2 <= reg\_3;}|
-         |\HL2{add\_x1 <= reg\_6;}|
-     |\HL2{12: add\_rst <= 0;}|
-         |\HL2{reg\_1 <= add\_ret;}|
-      |\HL2{6: reg\_3 <= reg\_1;}|
-      |\HL3{5: reg\_5 <= 2;}|
-      |\HL3{4: add\_rst <= 1;}|
-         |\HL3{add\_x2 <= reg\_3;}|
-         |\HL3{add\_x1 <= reg\_5;}|
-     |\HL3{10: add\_rst <= 0;}|
-         |\HL3{reg\_2 <= add\_ret;}|
-      |\HL3{3: reg\_3 <= reg\_2;}|
-      |\HL4{2: reg\_4 <= reg\_3;}|
-      |\HL4{1: fin = 1;}|
-         |\HL4{ret = reg\_4;}|
-     |\HL0{11: fin <= 0;}|
-    endcase
-endmodule
-\end{lstlisting}
-
 \end{document}
author	John Wickerson <j.wickerson@imperial.ac.uk>	2022-01-18 09:53:56 +0000
committer	node <node@git-bridge-prod-0>	2022-01-18 11:36:32 +0000
commit	e80b44a4440d956ad954990cd77ff25556facc29 (patch)
tree	3664b139093a06c26b9ef185d95dbdf884074cb5
parent	80ca1a53c0a54e4965bf3d92fab4cb428d7a5c9c (diff)
download	fccm22_rsvhls-e80b44a4440d956ad954990cd77ff25556facc29.tar.gz fccm22_rsvhls-e80b44a4440d956ad954990cd77ff25556facc29.zip